Chapter 9 Statistical Foundations

Samples and Populations

The population considered in the book is contained in the nycflights13 dataset. In the nycflights13 dataset contains all of the departing flights from airports in the NYC Area.

library(pacman)

p_load(mdsr, nycflights13, tidyverse, skimr, moderndive, infer)

Recall the flights dataframe.

flights

# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Take a sample of flights from NYC to SF. Get all of the flights to SF and take a sample of 25 of them.

Note that I did not set a seed for the sample, so your answers will differ from what is in the book.

SF <- flights |> 
  filter(dest == "SFO", !is.na(arr_delay))
SF

# A tibble: 13,173 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      558            600        -2      923            937
 2  2013     1     1      611            600        11      945            931
 3  2013     1     1      655            700        -5     1037           1045
 4  2013     1     1      729            730        -1     1049           1115
 5  2013     1     1      734            737        -3     1047           1113
 6  2013     1     1      745            745         0     1135           1125
 7  2013     1     1      746            746         0     1119           1129
 8  2013     1     1      803            800         3     1132           1144
 9  2013     1     1      826            817         9     1145           1158
10  2013     1     1     1029           1030        -1     1427           1355
# ℹ 13,163 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

sf_25 <- SF |>
  dplyr::slice_sample(n = 25)
sf_25

# A tibble: 25 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     5    27     2042           2043        -1     2346             12
 2  2013     1     8     1630           1630         0     2000           2020
 3  2013     5    11     2012           2000        12     2302           2341
 4  2013     7     7      745            740         5     1041           1055
 5  2013     4     1     1101           1106        -5     1422           1431
 6  2013    12    14     1354           1350         4     1711           1720
 7  2013     7    14     1919           1800        79     2206           2130
 8  2013     4    23     1010           1015        -5     1352           1336
 9  2013     8    15     1930           1930         0     2251           2304
10  2013    11    10     1025           1025         0     1405           1400
# ℹ 15 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Get the summary statistics for the sample taken.

sf_25 |>
  skim(arr_delay)

Data summary
Name	sf_25
Number of rows	25
Number of columns	19
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
arr_delay	0	1	1.52	30.91	-41	-17	-9	16	94	▆▇▃▁▂

Since the SF dataset contains all flights from NYC to SF in 2013, the statistics computed from the SF dataset are the calculated population paramaters.

SF |>
  skim(arr_delay)

Data summary
Name	SF
Number of rows	13173
Number of columns	19
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
arr_delay	0	1	2.67	47.67	-86	-23	-8	12	1007	▇▁▁▁▁

Get the 98th percentile of the sample.

sf_25 |>
  summarize(q98 = quantile(arr_delay, p = 0.98))

# A tibble: 1 × 1
    q98
  <dbl>
1  81.5

Get the estimated proportion of flights to SF with a delay less than 90 minutes.

SF |>
  group_by(arr_delay < 90) |>
  count() |>
  mutate(pct = n / nrow(SF))

# A tibble: 2 × 3
# Groups:   arr_delay < 90 [2]
  `arr_delay < 90`     n    pct
  <lgl>            <int>  <dbl>
1 FALSE              640 0.0486
2 TRUE             12533 0.951

Compare with the 98th percentile. 90 minutes is the 95th percentile.

SF |>
  summarize(q98 = quantile(arr_delay, p = 0.98))

# A tibble: 1 × 1
    q98
  <dbl>
1   153

Sample Statistics

The sampling distribution.

Usually sampling is done without replacement.

n <- 25
SF |>
  dplyr::slice_sample(n = n) |>
  summarize(mean_arr_delay = mean(arr_delay))

# A tibble: 1 × 1
  mean_arr_delay
           <dbl>
1           16.1

n <- 25
SF |>
  infer::rep_slice_sample(n = n, reps = 1) |>
  summarize(mean_arr_delay = mean(arr_delay))

# A tibble: 1 × 2
  replicate mean_arr_delay
      <int>          <dbl>
1         1           12.8

Note that a different random sample produces a different value for the sample statistic.

SF |>
  infer::rep_slice_sample(n = n, reps = 1) |>
  summarize(mean_arr_delay = mean(arr_delay))

# A tibble: 1 × 2
  replicate mean_arr_delay
      <int>          <dbl>
1         1           0.32

Using simulation we can approximate the sampling distribution of the sample statistics. Here we are computing the mean, but we could do this for any statistic we are interested in, the median, sample variance, sample standard deviation, etc.

num_trials <- 500

n <- 25

sf_25_means <- SF |>
  infer::rep_slice_sample(n = n, reps = num_trials) |>
  summarize(mean_arr_delay = mean(arr_delay)) |>
  mutate(n = n)

head(sf_25_means)

# A tibble: 6 × 3
  replicate mean_arr_delay     n
      <int>          <dbl> <dbl>
1         1          -0.28    25
2         2         -11.3     25
3         3          19.8     25
4         4           7.36    25
5         5          -8.16    25
6         6          10.5     25

Summarize the sample means.

sf_25_means |>
  skim(mean_arr_delay)

Data summary
Name	sf_25_means
Number of rows	500
Number of columns	3
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mean_arr_delay	0	1	3.77	9.86	-17.36	-3.48	2.38	10.32	46.64	▃▇▃▁▁

Confidence intervals

sf_25_means |>
  summarize(
    x_bar = mean(mean_arr_delay),
    se = sd(mean_arr_delay)
  ) |>
  mutate(
    ci_lower = x_bar - 2 * se, # approximately 95% of observations 
    ci_upper = x_bar + 2 * se  # are within two standard errors
  )

# A tibble: 1 × 4
  x_bar    se ci_lower ci_upper
  <dbl> <dbl>    <dbl>    <dbl>
1  3.77  9.86    -16.0     23.5

Alternatively, it can be calculated directly using a t-test.

sf_25_means |>
  infer::t_test(response = mean_arr_delay)

# A tibble: 1 × 7
  statistic  t_df  p_value alternative estimate lower_ci upper_ci
      <dbl> <dbl>    <dbl> <chr>          <dbl>    <dbl>    <dbl>
1      8.55   499 1.48e-16 two.sided       3.77     2.91     4.64

Using a larger sample size give a smaller Standard Error.

n <- 100

sf_100_means <- SF |> infer::rep_slice_sample(n = n, reps = num_trials) |> 
  group_by(replicate) |> 
  summarize(mean_arr_delay = mean(arr_delay)) |>
  mutate(n = n)

head(sf_100_means)

# A tibble: 6 × 3
  replicate mean_arr_delay     n
      <int>          <dbl> <dbl>
1         1           2.38   100
2         2          -3.71   100
3         3           4.03   100
4         4          -1.21   100
5         5           6.86   100
6         6           0.44   100

Plots to compare the sampling distributions with different sample sizes.

sf_25_means |>
  bind_rows(sf_100_means) |>
  ggplot(aes(x = mean_arr_delay)) + 
  geom_histogram(bins = 30) + 
  facet_grid( ~ n) + 
  xlab("Sample mean")

Boostrap

Usually for random sampling we sample without replacement.

three_flights <- SF |>
  slice_sample(n = 3, replace = FALSE) |>
  select(year, month, day, dep_time)
three_flights

# A tibble: 3 × 4
   year month   day dep_time
  <int> <int> <int>    <int>
1  2013     1    14      952
2  2013    10     3     1037
3  2013     9    25      816

With the Bootstrap we use sampling with replacement

three_flights |> slice_sample(n = 3, replace = TRUE)

# A tibble: 3 × 4
   year month   day dep_time
  <int> <int> <int>    <int>
1  2013     9    25      816
2  2013    10     3     1037
3  2013     1    14      952

n <- 200

orig_sample <- SF |> 
  slice_sample(n = n, replace = FALSE)
orig_sample

# A tibble: 200 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     9    12     1340           1343        -3     1716           1701
 2  2013     8     8      740            740         0     1046           1055
 3  2013     6     9     1400           1400         0     1712           1730
 4  2013    11    21      754            800        -6     1112           1136
 5  2013     6    15     1737           1705        32     2148           2030
 6  2013    12     5     1855           1838        17     2239           2214
 7  2013     5    23     1047           1045         2     1403           1420
 8  2013     7    18     1836           1830         6     2231           2155
 9  2013     4     4     1858           1830        28     2223           2152
10  2013     5    23      801            800         1     1120           1135
# ℹ 190 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

The reason for sampling with replacement is that we only have the Original Sample of size \(n\). We do not have the population. So if we sampled with out replacement we would only have one sample to look at. Sampling the Original Sample with replace allow us to general lots of samples and to investigate the variability of these samples.

orig_sample |>
  slice_sample(n = n, replace = TRUE) |>
  summarize(mean_arr_delay = mean(arr_delay))

# A tibble: 1 × 1
  mean_arr_delay
           <dbl>
1           4.21

sf_200_bs <- orig_sample |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |>
  summarize(mean_arr_delay = mean(arr_delay)) |>
  mutate(n = n)

sf_200_bs |>
  skim(mean_arr_delay)

Data summary
Name	sf_200_bs
Number of rows	500
Number of columns	3
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mean_arr_delay	0	1	4.1	3.04	-3.4	1.97	4.14	6.06	13.82	▂▆▇▃▁

sf_200_bs |> ggplot(aes(x = mean_arr_delay)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare with the entire population.

sf_200_pop <- SF |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials)  |>
  summarize(mean_arr_delay = mean(arr_delay)) |>
  mutate(n = n)

sf_200_pop |>
  skim(mean_arr_delay)

Data summary
Name	sf_200_pop
Number of rows	500
Number of columns	3
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mean_arr_delay	0	1	2.88	3.6	-6.58	0.44	2.5	5.43	14.21	▁▇▇▅▁

sf_200_pop |> ggplot(aes(x = mean_arr_delay)) +
  geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Question: Do the histograms look somewhat similar?

Using the code from the Modern Dive book.

To learn about the infer package this is a good link to start with.

orig_sample |> 
  specify(response = arr_delay) |> 
  calculate(stat = "mean")

Response: arr_delay (numeric)
# A tibble: 1 × 1
   stat
  <dbl>
1  4.05

bootstrap_distribution <- orig_sample |> 
  specify(formula = arr_delay ~ NULL) |> 
  generate(reps = num_trials, type = "bootstrap") |> 
  calculate(stat = "mean")
bootstrap_distribution

Response: arr_delay (numeric)
# A tibble: 500 × 2
   replicate   stat
       <int>  <dbl>
 1         1  3.22 
 2         2  4.33 
 3         3  9.98 
 4         4  0.925
 5         5 12.3  
 6         6  2.80 
 7         7  9.59 
 8         8  4.62 
 9         9  6.16 
10        10  6.54 
# ℹ 490 more rows

bootstrap_distribution |> visualize()

See the Modern Dive book for the infer process.

percentile_ci <- bootstrap_distribution |> 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    -2.30     11.6

bootstrap_distribution |> visualize() + 
  shade_confidence_interval(endpoints = percentile_ci)

Example: Setting travel policy

orig_sample |>
  summarize(q98 = quantile(arr_delay, p = 0.98))

# A tibble: 1 × 1
    q98
  <dbl>
1  141.

n <- nrow(orig_sample)
sf_200_bs <- orig_sample |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |> 
  summarize(q98 = quantile(arr_delay, p = 0.98))

sf_200_bs |>
  skim(q98)

Data summary
Name	sf_200_bs
Number of rows	500
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
q98	0	1	137.6	28.68	69.32	112.06	141	164.14	238	▂▇▇▃▁

n_large <- 10000
sf_10000_bs <- SF |> slice_sample(n = n_large, replace = FALSE)

sf_200_bs <- sf_10000_bs |> infer::rep_slice_sample(n = n, replace = TRUE, reps = num_trials) |> 
  summarize(q98 = quantile(arr_delay, p = 0.98))
  

sf_200_bs |>
  skim(q98)

Data summary
Name	sf_200_bs
Number of rows	500
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
q98	0	1	144.77	32.49	67.32	121.03	146.12	170.08	251.02	▂▇▇▂▁

outliers

SF |>
  filter(arr_delay >= 420) |> 
  select(month, day, dep_delay, arr_delay, carrier)

# A tibble: 7 × 5
  month   day dep_delay arr_delay carrier
  <int> <int>     <dbl>     <dbl> <chr>  
1    12     7       374       422 UA     
2     7     6       589       561 DL     
3     7     7       629       676 VX     
4     7     7       653       632 VX     
5     7    10       453       445 B6     
6     7    10       432       433 VX     
7     9    20      1014      1007 AA

SF |> 
  filter(arr_delay < 420) |>
  ggplot(aes(arr_delay)) + 
  geom_histogram(binwidth = 15) + 
  labs(x = "Arrival delay (in minutes)")

SF |>
  group_by(hour) |>
  count() |>
  pivot_wider(names_from = hour, values_from = n) |>
  data.frame()

  X5  X6   X7  X8  X9  X10 X11 X12 X13 X14 X15 X16  X17  X18 X19 X20 X21
1 55 663 1696 987 429 1744 413 504 476 528 946 897 1491 1091 731 465  57

SF |>
  ggplot(aes(x = hour, y = arr_delay)) +
  geom_boxplot(alpha = 0.1, aes(group = hour)) +
  geom_smooth(method = "lm") + 
  xlab("Scheduled hour of departure") + 
  ylab("Arrival delay (minutes)") + 
  coord_cartesian(ylim = c(-30, 120))

`geom_smooth()` using formula = 'y ~ x'

mod1 <- lm(arr_delay ~ hour, data = SF)
broom::tidy(mod1)

# A tibble: 2 × 5
  term        estimate std.error statistic   p.value
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>
1 (Intercept)   -22.9     1.23       -18.6 2.88e- 76
2 hour            2.01    0.0915      22.0 1.78e-105