--- title: "%+%" author: "Prof. Eric A. Suess" format: html: self-contained: true --- **Historical baby names** See Section 3.3 ```{r message = FALSE} library(pacman) p_load(tidyverse, babynames, mdsr, Hmisc) ``` This dataset was created in 2014. ```{r} BabynamesDist <- make_babynames_dist() head(BabynamesDist) ``` ```{r} BabynamesDist |> filter(name == "Benjamin") ``` ```{r} joseph <- BabynamesDist |> filter(name == "Joseph" & sex == "M") name_plot <- joseph |> ggplot(aes(x = year)) name_plot ``` In the Second edition of the book the authors changed to using geom_col(). ```{r} name_plot <- name_plot + geom_col( aes(y = count_thousands * alive_prob), fill = "#b2d7e9", color = "white", size = 0.1 ) name_plot ``` ```{r} name_plot <- name_plot + geom_line(aes(y = count_thousands), size = 2) name_plot ``` ```{r} name_plot <- name_plot + ylab("Number of People (thousands)") + xlab(NULL) name_plot ``` ```{r} summary(name_plot) ``` ```{r} wtd_quantile <- Hmisc::wtd.quantile median_yob <- joseph |> summarise(year = wtd_quantile(year, weights=est_alive_today, prob = 0.5)) |> pull(year) median_yob ``` Again, in the new edition the authors us geom_col() ```{r} name_plot <- name_plot + geom_col( color = "white", fill = "#008fd5", aes(y = ifelse(year == median_yob, est_alive_today / 1000, 0)) ) name_plot ``` ```{r} context <- tribble( ~year, ~num_people, ~label, 1935, 40, "Number of Josephs\nborn each year", 1915, 13, "Number of Josephs\nborn each year \nestimated to be alive\non 1/1/2014", 2003, 40, "The median\nliving Joseph\nis 37 years old", ) name_plot + ggtitle("Age Distribution of American Boys Named Joseph") + geom_text( data = context, aes(y = num_people, label = label, color = label) ) + geom_curve( x = 1990, xend = 1974, y = 40, yend = 24, arrow = arrow(length = unit(0.3, "cm")), curvature = 0.5 ) + scale_color_manual( guide = "none", values = c("black", "#b2d7e9", "darkgray") ) + ylim(0, 42) ``` Now replace the data in the ggplot and then make the new plot. ```{r} name_plot %+% filter( BabynamesDist, name == "Josephine" & sex == "F" ) ``` Faceting Try the name Jessie ```{r} names_plot <- name_plot + facet_wrap(~sex) names_plot %+% filter(BabynamesDist, name == "Jessie") ``` Try it with your own name. ```{r} name_plot <- name_plot + facet_wrap(~sex) name_plot %+% filter(BabynamesDist, name == "Eric") ``` ```{r} many_names_plot <- name_plot + facet_grid(name ~ sex) mnp <- many_names_plot %+% filter( BabynamesDist, name %in% c("Jessie", "Marion", "Jackie") ) mnp ``` ```{r} mnp + facet_grid(sex ~ name) ``` **Most common women's names** Thank you Hannah Kim! ```{r} library(babynames) com_fem <- BabynamesDist %>% filter(sex == "F") %>% group_by(name) %>% summarise( N = n(), est_num_alive = sum(est_alive_today) ) %>% arrange(desc(est_num_alive)) %>% head(25) %>% select(name) %>% left_join(., BabynamesDist, by = "name") %>% group_by(name) %>% summarise( N = n(), est_num_alive = sum(est_alive_today), q1_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.25), median_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.5), q3_age = wtd.quantile(age_today, weight = est_alive_today, probs = 0.75) ) w_plot <- ggplot(data = com_fem, aes(x = reorder(name, -median_age), y = median_age)) + xlab(NULL) + ylab("Age (in years)") + ggtitle("Median ages for females with the 25 most common names") w_plot <- w_plot + geom_linerange(aes(ymin = q1_age, ymax = q3_age), color = "#f3d478", size = 10, alpha = 0.8) w_plot <- w_plot + geom_point(fill = "#ed3324", colour = "white", size = 4, shape = 21) w_plot + geom_point(aes(y = 55, x = 24), fill = "#ed3324", colour = "white", size = 4, shape = 21) + geom_text(aes(y = 58, x = 24, label = " median")) + geom_text(aes(y = 26, x = 16, label = " 25th")) + geom_text(aes(y = 51, x = 16, label = "75th percentile ")) + geom_point(aes(y = 24, x = 16), shape = 17) + geom_point(aes(y = 56, x = 16), shape = 17) + coord_flip() ```