library(pacman)
p_load(tidyverse, babynames, mdsr, Hmisc)
%+%
Historical baby names
See Section 3.3
This dataset was created in 2014.
<- make_babynames_dist()
BabynamesDist head(BabynamesDist)
# A tibble: 6 × 9
year sex name n prop alive_prob count_thousands age_today
<dbl> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 1900 F Mary 16706 0.0526 0 16.7 114
2 1900 F Helen 6343 0.0200 0 6.34 114
3 1900 F Anna 6114 0.0192 0 6.11 114
4 1900 F Margaret 5304 0.0167 0 5.30 114
5 1900 F Ruth 4765 0.0150 0 4.76 114
6 1900 F Elizabeth 4096 0.0129 0 4.10 114
# ℹ 1 more variable: est_alive_today <dbl>
|>
BabynamesDist filter(name == "Benjamin")
# A tibble: 205 × 9
year sex name n prop alive_prob count_thousands age_today
<dbl> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
1 1900 M Benjamin 450 0.00278 0 0.45 114
2 1901 M Benjamin 343 0.00297 0.000025 0.343 113
3 1902 M Benjamin 374 0.00282 0.00005 0.374 112
4 1903 M Benjamin 324 0.00251 0.000075 0.324 111
5 1904 M Benjamin 358 0.00258 0.0001 0.358 110
6 1905 M Benjamin 379 0.00265 0.000125 0.379 109
7 1906 M Benjamin 352 0.00244 0.00015 0.352 108
8 1907 M Benjamin 460 0.00290 0.000175 0.46 107
9 1908 M Benjamin 480 0.00289 0.0002 0.48 106
10 1909 M Benjamin 489 0.00276 0.000225 0.489 105
# ℹ 195 more rows
# ℹ 1 more variable: est_alive_today <dbl>
<- BabynamesDist |>
joseph filter(name == "Joseph" & sex == "M")
<- joseph |> ggplot(aes(x = year))
name_plot
name_plot
In the Second edition of the book the authors changed to using geom_col().
<- name_plot +
name_plot geom_col(
aes(y = count_thousands * alive_prob),
fill = "#b2d7e9",
color = "white",
size = 0.1
)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
name_plot
<- name_plot +
name_plot geom_line(aes(y = count_thousands), size = 2)
name_plot
<- name_plot +
name_plot ylab("Number of People (thousands)") +
xlab(NULL)
name_plot
summary(name_plot)
data: year, sex, name, n, prop, alive_prob, count_thousands, age_today,
est_alive_today [111x9]
mapping: x = ~year
faceting: <ggproto object: Class FacetNull, Facet, gg>
compute_layout: function
draw_back: function
draw_front: function
draw_labels: function
draw_panels: function
finish_data: function
init_scales: function
map_data: function
params: list
setup_data: function
setup_params: function
shrink: TRUE
train_scales: function
vars: function
super: <ggproto object: Class FacetNull, Facet, gg>
-----------------------------------
mapping: y = ~count_thousands * alive_prob
geom_col: just = 0.5, width = NULL, na.rm = FALSE
stat_identity: na.rm = FALSE
position_stack
mapping: y = ~count_thousands
geom_line: na.rm = FALSE, orientation = NA
stat_identity: na.rm = FALSE
position_identity
<- Hmisc::wtd.quantile
wtd_quantile
<- joseph |>
median_yob summarise(year = wtd_quantile(year, weights=est_alive_today, prob = 0.5)) |>
pull(year)
median_yob
50%
1975
Again, in the new edition the authors us geom_col()
<- name_plot +
name_plot geom_col(
color = "white", fill = "#008fd5",
aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0))
)
name_plot
<- tribble(
context ~year, ~num_people, ~label,
1935, 40, "Number of Josephs\nborn each year",
1915, 13, "Number of Josephs\nborn each year
\nestimated to be alive\non 1/1/2014",
2003, 40, "The median\nliving Joseph\nis 37 years old",
)
+
name_plot ggtitle("Age Distribution of American Boys Named Joseph") +
geom_text(
data = context,
aes(y = num_people, label = label, color = label)
+
) geom_curve(
x = 1990, xend = 1974, y = 40, yend = 24,
arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5
+
) scale_color_manual(
guide = "none",
values = c("black", "#b2d7e9", "darkgray")
+
) ylim(0, 42)
Now replace the data in the ggplot and then make the new plot.
%+% filter(
name_plot
BabynamesDist, == "Josephine" & sex == "F"
name )
Faceting
Try the name Jessie
<- name_plot +
names_plot facet_wrap(~sex)
%+% filter(BabynamesDist, name == "Jessie") names_plot
Try it with your own name.
<- name_plot +
name_plot facet_wrap(~sex)
%+% filter(BabynamesDist, name == "Eric") name_plot
<- name_plot +
many_names_plot facet_grid(name ~ sex)
<- many_names_plot %+% filter(
mnp
BabynamesDist, %in% c("Jessie", "Marion", "Jackie")
name
) mnp
+ facet_grid(sex ~ name) mnp
Most common women’s names
Thank you Hannah Kim!
library(babynames)
<- BabynamesDist %>%
com_fem filter(sex == "F") %>%
group_by(name) %>%
summarise(
N = n(),
est_num_alive = sum(est_alive_today)
%>%
) arrange(desc(est_num_alive)) %>%
head(25) %>%
select(name) %>%
left_join(., BabynamesDist, by = "name") %>%
group_by(name) %>%
summarise(
N = n(),
est_num_alive = sum(est_alive_today),
q1_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.25),
median_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.5),
q3_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.75)
)
<- ggplot(data = com_fem, aes(x = reorder(name, -median_age),
w_plot y = median_age)) + xlab(NULL) + ylab("Age (in years)") +
ggtitle("Median ages for females with the 25 most common names")
<- w_plot + geom_linerange(aes(ymin = q1_age, ymax = q3_age),
w_plot color = "#f3d478", size = 10, alpha = 0.8)
<- w_plot +
w_plot geom_point(fill = "#ed3324", colour = "white", size = 4, shape = 21)
+
w_plot geom_point(aes(y = 55, x = 24), fill = "#ed3324", colour = "white",
size = 4, shape = 21) +
geom_text(aes(y = 58, x = 24, label = " median")) +
geom_text(aes(y = 26, x = 16, label = " 25th")) +
geom_text(aes(y = 51, x = 16, label = "75th percentile ")) +
geom_point(aes(y = 24, x = 16), shape = 17) +
geom_point(aes(y = 56, x = 16), shape = 17) +
coord_flip()