%+%

Author

Prof. Eric A. Suess

Historical baby names

See Section 3.3

library(pacman)

p_load(tidyverse, babynames, mdsr, Hmisc)

This dataset was created in 2014.

BabynamesDist <- make_babynames_dist()
head(BabynamesDist)
# A tibble: 6 × 9
   year sex   name          n   prop alive_prob count_thousands age_today
  <dbl> <chr> <chr>     <int>  <dbl>      <dbl>           <dbl>     <dbl>
1  1900 F     Mary      16706 0.0526          0           16.7        114
2  1900 F     Helen      6343 0.0200          0            6.34       114
3  1900 F     Anna       6114 0.0192          0            6.11       114
4  1900 F     Margaret   5304 0.0167          0            5.30       114
5  1900 F     Ruth       4765 0.0150          0            4.76       114
6  1900 F     Elizabeth  4096 0.0129          0            4.10       114
# ℹ 1 more variable: est_alive_today <dbl>
BabynamesDist |>  
  filter(name == "Benjamin")
# A tibble: 205 × 9
    year sex   name         n    prop alive_prob count_thousands age_today
   <dbl> <chr> <chr>    <int>   <dbl>      <dbl>           <dbl>     <dbl>
 1  1900 M     Benjamin   450 0.00278   0                  0.45        114
 2  1901 M     Benjamin   343 0.00297   0.000025           0.343       113
 3  1902 M     Benjamin   374 0.00282   0.00005            0.374       112
 4  1903 M     Benjamin   324 0.00251   0.000075           0.324       111
 5  1904 M     Benjamin   358 0.00258   0.0001             0.358       110
 6  1905 M     Benjamin   379 0.00265   0.000125           0.379       109
 7  1906 M     Benjamin   352 0.00244   0.00015            0.352       108
 8  1907 M     Benjamin   460 0.00290   0.000175           0.46        107
 9  1908 M     Benjamin   480 0.00289   0.0002             0.48        106
10  1909 M     Benjamin   489 0.00276   0.000225           0.489       105
# ℹ 195 more rows
# ℹ 1 more variable: est_alive_today <dbl>
joseph <- BabynamesDist  |> 
  filter(name == "Joseph" & sex == "M")

name_plot <- joseph  |>  ggplot(aes(x = year))

name_plot

In the Second edition of the book the authors changed to using geom_col().

name_plot <- name_plot +
  geom_col(
    aes(y = count_thousands * alive_prob), 
    fill = "#b2d7e9", 
    color = "white",
    size = 0.1
  )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
name_plot

name_plot <- name_plot + 
  geom_line(aes(y = count_thousands), size = 2)
name_plot

name_plot <- name_plot + 
  ylab("Number of People (thousands)") +
  xlab(NULL)

name_plot

summary(name_plot)
data: year, sex, name, n, prop, alive_prob, count_thousands, age_today,
  est_alive_today [111x9]
mapping:  x = ~year
faceting: <ggproto object: Class FacetNull, Facet, gg>
    compute_layout: function
    draw_back: function
    draw_front: function
    draw_labels: function
    draw_panels: function
    finish_data: function
    init_scales: function
    map_data: function
    params: list
    setup_data: function
    setup_params: function
    shrink: TRUE
    train_scales: function
    vars: function
    super:  <ggproto object: Class FacetNull, Facet, gg>
-----------------------------------
mapping: y = ~count_thousands * alive_prob 
geom_col: just = 0.5, width = NULL, na.rm = FALSE
stat_identity: na.rm = FALSE
position_stack 

mapping: y = ~count_thousands 
geom_line: na.rm = FALSE, orientation = NA
stat_identity: na.rm = FALSE
position_identity 
wtd_quantile <- Hmisc::wtd.quantile

median_yob <- joseph  |>  
  summarise(year = wtd_quantile(year, weights=est_alive_today, prob = 0.5))  |>  
  pull(year)

median_yob    
 50% 
1975 

Again, in the new edition the authors us geom_col()

name_plot <- name_plot +
  geom_col(
    color = "white", fill = "#008fd5", 
    aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0))
  )

name_plot

context <- tribble(
  ~year, ~num_people, ~label,
  1935, 40, "Number of Josephs\nborn each year",
  1915, 13, "Number of Josephs\nborn each year
  \nestimated to be alive\non 1/1/2014", 
  2003, 40, "The median\nliving Joseph\nis 37 years old", 
)

name_plot +
  ggtitle("Age Distribution of American Boys Named Joseph") + 
  geom_text(
    data = context, 
    aes(y = num_people, label = label, color = label)
  ) + 
  geom_curve(
    x = 1990, xend = 1974, y = 40, yend = 24, 
    arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5
  ) + 
  scale_color_manual(
    guide = "none", 
    values = c("black", "#b2d7e9", "darkgray")
  ) + 
  ylim(0, 42)

Now replace the data in the ggplot and then make the new plot.

name_plot %+% filter(
  BabynamesDist, 
  name == "Josephine" & sex == "F"
)

Faceting

Try the name Jessie

names_plot <- name_plot + 
  facet_wrap(~sex)
names_plot %+% filter(BabynamesDist, name == "Jessie")

Try it with your own name.

name_plot <- name_plot + 
  facet_wrap(~sex)
name_plot %+% filter(BabynamesDist, name == "Eric")

many_names_plot <- name_plot + 
  facet_grid(name ~ sex)
mnp <- many_names_plot %+% filter(
  BabynamesDist, 
  name %in% c("Jessie", "Marion", "Jackie")
)
mnp

mnp + facet_grid(sex ~ name)

Most common women’s names

Thank you Hannah Kim!

library(babynames)

com_fem <- BabynamesDist %>% 
  filter(sex == "F") %>%
  group_by(name) %>% 
  summarise(
    N = n(),
    est_num_alive = sum(est_alive_today)
  ) %>% 
  arrange(desc(est_num_alive)) %>%  
  head(25) %>%  
  select(name) %>%
  left_join(., BabynamesDist, by = "name") %>%  
  group_by(name) %>%  
  summarise(
    N = n(),
    est_num_alive = sum(est_alive_today),
    q1_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.25),
    median_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.5),
    q3_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.75) 
  )

w_plot <- ggplot(data = com_fem, aes(x = reorder(name, -median_age),
  y = median_age)) + xlab(NULL) + ylab("Age (in years)") +
  ggtitle("Median ages for females with the 25 most common names")

w_plot <- w_plot + geom_linerange(aes(ymin = q1_age, ymax = q3_age),
  color = "#f3d478", size = 10, alpha = 0.8)

w_plot <- w_plot +
  geom_point(fill = "#ed3324", colour = "white", size = 4, shape = 21)

w_plot +
  geom_point(aes(y = 55, x = 24), fill = "#ed3324", colour = "white",
             size = 4, shape = 21) +
  geom_text(aes(y = 58, x = 24, label = "   median")) +
  geom_text(aes(y = 26, x = 16, label = "   25th")) +
  geom_text(aes(y = 51, x = 16, label = "75th percentile      ")) +
  geom_point(aes(y = 24, x = 16), shape = 17) +
  geom_point(aes(y = 56, x = 16), shape = 17) +
  coord_flip()