library(pacman)
p_load(mdsr, nycflights13, tidyverse, skimr, moderndive, infer)Chapter 9 Statistical Foundations
Samples and Populations
The population considered in the book is contained in the nycflights13 dataset. In the nycflights13 dataset contains all of the departing flights from airports in the NYC Area.
Recall the flights dataframe.
flights# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Take a sample of flights from NYC to SF. Get all of the flights to SF and take a sample of 25 of them.
Note that I did not set a seed for the sample, so your answers will differ from what is in the book.
SF <- flights |>
filter(dest == "SFO", !is.na(arr_delay))
SF# A tibble: 13,173 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 558 600 -2 923 937
2 2013 1 1 611 600 11 945 931
3 2013 1 1 655 700 -5 1037 1045
4 2013 1 1 729 730 -1 1049 1115
5 2013 1 1 734 737 -3 1047 1113
6 2013 1 1 745 745 0 1135 1125
7 2013 1 1 746 746 0 1119 1129
8 2013 1 1 803 800 3 1132 1144
9 2013 1 1 826 817 9 1145 1158
10 2013 1 1 1029 1030 -1 1427 1355
# ℹ 13,163 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
sf_25 <- SF |>
dplyr::slice_sample(n = 25)
sf_25# A tibble: 25 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 5 27 2042 2043 -1 2346 12
2 2013 1 8 1630 1630 0 2000 2020
3 2013 5 11 2012 2000 12 2302 2341
4 2013 7 7 745 740 5 1041 1055
5 2013 4 1 1101 1106 -5 1422 1431
6 2013 12 14 1354 1350 4 1711 1720
7 2013 7 14 1919 1800 79 2206 2130
8 2013 4 23 1010 1015 -5 1352 1336
9 2013 8 15 1930 1930 0 2251 2304
10 2013 11 10 1025 1025 0 1405 1400
# ℹ 15 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
Get the summary statistics for the sample taken.
sf_25 |>
skim(arr_delay)| Name | sf_25 |
| Number of rows | 25 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| arr_delay | 0 | 1 | 1.52 | 30.91 | -41 | -17 | -9 | 16 | 94 | ▆▇▃▁▂ |
Since the SF dataset contains all flights from NYC to SF in 2013, the statistics computed from the SF dataset are the calculated population paramaters.
SF |>
skim(arr_delay)| Name | SF |
| Number of rows | 13173 |
| Number of columns | 19 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| arr_delay | 0 | 1 | 2.67 | 47.67 | -86 | -23 | -8 | 12 | 1007 | ▇▁▁▁▁ |
Get the 98th percentile of the sample.
sf_25 |>
summarize(q98 = quantile(arr_delay, p = 0.98))# A tibble: 1 × 1
q98
<dbl>
1 81.5
Get the estimated proportion of flights to SF with a delay less than 90 minutes.
SF |>
group_by(arr_delay < 90) |>
count() |>
mutate(pct = n / nrow(SF))# A tibble: 2 × 3
# Groups: arr_delay < 90 [2]
`arr_delay < 90` n pct
<lgl> <int> <dbl>
1 FALSE 640 0.0486
2 TRUE 12533 0.951
Compare with the 98th percentile. 90 minutes is the 95th percentile.
SF |>
summarize(q98 = quantile(arr_delay, p = 0.98))# A tibble: 1 × 1
q98
<dbl>
1 153
Sample Statistics
The sampling distribution.
Usually sampling is done without replacement.
n <- 25
SF |>
dplyr::slice_sample(n = n) |>
summarize(mean_arr_delay = mean(arr_delay))# A tibble: 1 × 1
mean_arr_delay
<dbl>
1 16.1
n <- 25
SF |>
infer::rep_slice_sample(n = n, reps = 1) |>
summarize(mean_arr_delay = mean(arr_delay))# A tibble: 1 × 2
replicate mean_arr_delay
<int> <dbl>
1 1 12.8
Note that a different random sample produces a different value for the sample statistic.
SF |>
infer::rep_slice_sample(n = n, reps = 1) |>
summarize(mean_arr_delay = mean(arr_delay))# A tibble: 1 × 2
replicate mean_arr_delay
<int> <dbl>
1 1 0.32
Using simulation we can approximate the sampling distribution of the sample statistics. Here we are computing the mean, but we could do this for any statistic we are interested in, the median, sample variance, sample standard deviation, etc.
num_trials <- 500
n <- 25
sf_25_means <- SF |>
infer::rep_slice_sample(n = n, reps = num_trials) |>
summarize(mean_arr_delay = mean(arr_delay)) |>
mutate(n = n)
head(sf_25_means)# A tibble: 6 × 3
replicate mean_arr_delay n
<int> <dbl> <dbl>
1 1 -0.28 25
2 2 -11.3 25
3 3 19.8 25
4 4 7.36 25
5 5 -8.16 25
6 6 10.5 25
Summarize the sample means.
sf_25_means |>
skim(mean_arr_delay)| Name | sf_25_means |
| Number of rows | 500 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| mean_arr_delay | 0 | 1 | 3.77 | 9.86 | -17.36 | -3.48 | 2.38 | 10.32 | 46.64 | ▃▇▃▁▁ |
Confidence intervals
sf_25_means |>
summarize(
x_bar = mean(mean_arr_delay),
se = sd(mean_arr_delay)
) |>
mutate(
ci_lower = x_bar - 2 * se, # approximately 95% of observations
ci_upper = x_bar + 2 * se # are within two standard errors
) # A tibble: 1 × 4
x_bar se ci_lower ci_upper
<dbl> <dbl> <dbl> <dbl>
1 3.77 9.86 -16.0 23.5
Alternatively, it can be calculated directly using a t-test.
sf_25_means |>
infer::t_test(response = mean_arr_delay)# A tibble: 1 × 7
statistic t_df p_value alternative estimate lower_ci upper_ci
<dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
1 8.55 499 1.48e-16 two.sided 3.77 2.91 4.64
Using a larger sample size give a smaller Standard Error.
n <- 100
sf_100_means <- SF |> infer::rep_slice_sample(n = n, reps = num_trials) |>
group_by(replicate) |>
summarize(mean_arr_delay = mean(arr_delay)) |>
mutate(n = n)
head(sf_100_means)# A tibble: 6 × 3
replicate mean_arr_delay n
<int> <dbl> <dbl>
1 1 2.38 100
2 2 -3.71 100
3 3 4.03 100
4 4 -1.21 100
5 5 6.86 100
6 6 0.44 100
Plots to compare the sampling distributions with different sample sizes.
sf_25_means |>
bind_rows(sf_100_means) |>
ggplot(aes(x = mean_arr_delay)) +
geom_histogram(bins = 30) +
facet_grid( ~ n) +
xlab("Sample mean")Boostrap
Usually for random sampling we sample without replacement.
three_flights <- SF |>
slice_sample(n = 3, replace = FALSE) |>
select(year, month, day, dep_time)
three_flights# A tibble: 3 × 4
year month day dep_time
<int> <int> <int> <int>
1 2013 1 14 952
2 2013 10 3 1037
3 2013 9 25 816
With the Bootstrap we use sampling with replacement
three_flights |> slice_sample(n = 3, replace = TRUE)# A tibble: 3 × 4
year month day dep_time
<int> <int> <int> <int>
1 2013 9 25 816
2 2013 10 3 1037
3 2013 1 14 952
n <- 200
orig_sample <- SF |>
slice_sample(n = n, replace = FALSE)
orig_sample# A tibble: 200 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 9 12 1340 1343 -3 1716 1701
2 2013 8 8 740 740 0 1046 1055
3 2013 6 9 1400 1400 0 1712 1730
4 2013 11 21 754 800 -6 1112 1136
5 2013 6 15 1737 1705 32 2148 2030
6 2013 12 5 1855 1838 17 2239 2214
7 2013 5 23 1047 1045 2 1403 1420
8 2013 7 18 1836 1830 6 2231 2155
9 2013 4 4 1858 1830 28 2223 2152
10 2013 5 23 801 800 1 1120 1135
# ℹ 190 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
The reason for sampling with replacement is that we only have the Original Sample of size \(n\). We do not have the population. So if we sampled with out replacement we would only have one sample to look at. Sampling the Original Sample with replace allow us to general lots of samples and to investigate the variability of these samples.
orig_sample |>
slice_sample(n = n, replace = TRUE) |>
summarize(mean_arr_delay = mean(arr_delay))# A tibble: 1 × 1
mean_arr_delay
<dbl>
1 4.21
sf_200_bs <- orig_sample |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |>
summarize(mean_arr_delay = mean(arr_delay)) |>
mutate(n = n)
sf_200_bs |>
skim(mean_arr_delay)| Name | sf_200_bs |
| Number of rows | 500 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| mean_arr_delay | 0 | 1 | 4.1 | 3.04 | -3.4 | 1.97 | 4.14 | 6.06 | 13.82 | ▂▆▇▃▁ |
sf_200_bs |> ggplot(aes(x = mean_arr_delay)) +
geom_histogram()`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Compare with the entire population.
sf_200_pop <- SF |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |>
summarize(mean_arr_delay = mean(arr_delay)) |>
mutate(n = n)
sf_200_pop |>
skim(mean_arr_delay)| Name | sf_200_pop |
| Number of rows | 500 |
| Number of columns | 3 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| mean_arr_delay | 0 | 1 | 2.88 | 3.6 | -6.58 | 0.44 | 2.5 | 5.43 | 14.21 | ▁▇▇▅▁ |
sf_200_pop |> ggplot(aes(x = mean_arr_delay)) +
geom_histogram()`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Question: Do the histograms look somewhat similar?
Using the code from the Modern Dive book.
To learn about the infer package this is a good link to start with.
orig_sample |>
specify(response = arr_delay) |>
calculate(stat = "mean")Response: arr_delay (numeric)
# A tibble: 1 × 1
stat
<dbl>
1 4.05
bootstrap_distribution <- orig_sample |>
specify(formula = arr_delay ~ NULL) |>
generate(reps = num_trials, type = "bootstrap") |>
calculate(stat = "mean")
bootstrap_distributionResponse: arr_delay (numeric)
# A tibble: 500 × 2
replicate stat
<int> <dbl>
1 1 3.22
2 2 4.33
3 3 9.98
4 4 0.925
5 5 12.3
6 6 2.80
7 7 9.59
8 8 4.62
9 9 6.16
10 10 6.54
# ℹ 490 more rows
bootstrap_distribution |> visualize()See the Modern Dive book for the infer process.
percentile_ci <- bootstrap_distribution |>
get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci# A tibble: 1 × 2
lower_ci upper_ci
<dbl> <dbl>
1 -2.30 11.6
bootstrap_distribution |> visualize() +
shade_confidence_interval(endpoints = percentile_ci)Example: Setting travel policy
orig_sample |>
summarize(q98 = quantile(arr_delay, p = 0.98))# A tibble: 1 × 1
q98
<dbl>
1 141.
n <- nrow(orig_sample)
sf_200_bs <- orig_sample |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |>
summarize(q98 = quantile(arr_delay, p = 0.98))
sf_200_bs |>
skim(q98)| Name | sf_200_bs |
| Number of rows | 500 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| q98 | 0 | 1 | 137.6 | 28.68 | 69.32 | 112.06 | 141 | 164.14 | 238 | ▂▇▇▃▁ |
n_large <- 10000
sf_10000_bs <- SF |> slice_sample(n = n_large, replace = FALSE)
sf_200_bs <- sf_10000_bs |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |>
summarize(q98 = quantile(arr_delay, p = 0.98))
sf_200_bs |>
skim(q98)| Name | sf_200_bs |
| Number of rows | 500 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| q98 | 0 | 1 | 144.77 | 32.49 | 67.32 | 121.03 | 146.12 | 170.08 | 251.02 | ▂▇▇▂▁ |
outliers
SF |>
filter(arr_delay >= 420) |>
select(month, day, dep_delay, arr_delay, carrier)# A tibble: 7 × 5
month day dep_delay arr_delay carrier
<int> <int> <dbl> <dbl> <chr>
1 12 7 374 422 UA
2 7 6 589 561 DL
3 7 7 629 676 VX
4 7 7 653 632 VX
5 7 10 453 445 B6
6 7 10 432 433 VX
7 9 20 1014 1007 AA
SF |>
filter(arr_delay < 420) |>
ggplot(aes(arr_delay)) +
geom_histogram(binwidth = 15) +
labs(x = "Arrival delay (in minutes)")SF |>
group_by(hour) |>
count() |>
pivot_wider(names_from = hour, values_from = n) |>
data.frame() X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21
1 55 663 1696 987 429 1744 413 504 476 528 946 897 1491 1091 731 465 57
SF |>
ggplot(aes(x = hour, y = arr_delay)) +
geom_boxplot(alpha = 0.1, aes(group = hour)) +
geom_smooth(method = "lm") +
xlab("Scheduled hour of departure") +
ylab("Arrival delay (minutes)") +
coord_cartesian(ylim = c(-30, 120)) `geom_smooth()` using formula = 'y ~ x'
mod1 <- lm(arr_delay ~ hour, data = SF)
broom::tidy(mod1)# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -22.9 1.23 -18.6 2.88e- 76
2 hour 2.01 0.0915 22.0 1.78e-105