sampling distribution examples

Author

Prof. Eric A. Suess

Examples of Sampling Distributions

This is R code that simulates the sampling distribution of

a. The Median
b. The Range
c. The IQR
d. The Mean
e. The SD
f. The 97.5 percentile

for a sample of size n = 30 from the N(mu, sigmasq).
And the nonparametric bootstrap is run for comparison.

Try the calculations for a simulated sample

mu <- 25
sigma <- 5

n <- 30

x <- rnorm(n, mu, sigma)

x.orig <- x # save original sample

hist(x, probability = TRUE)
points(density(x), type = "l", col = "red")

x.med <- median(x)      # median
print(paste("The median is: ", x.med))
[1] "The median is:  24.6443918969694"
x.range <- max(x) - min(x)  # range
print(paste("The range is: ", x.range))
[1] "The range is:  20.6255103976359"
x.iqr <- quantile(x, 0.75) - quantile(x, 0.25)      # IQR
print(paste("The IQR is: ", x.iqr))
[1] "The IQR is:  6.83380156341347"
x.midr <- (max(x) - min(x))/2   # midrange
print(paste("The midrange is: ", x.midr))
[1] "The midrange is:  10.312755198818"
x.975 <- quantile(x, 0.975) # 97.5 percentile 
print(paste("The 97.5 percentile is: ", x.975))
[1] "The 97.5 percentile is:  36.8403148714931"
x.mean <- mean(x)
print(paste("The sample mean is: ", x.mean))
[1] "The sample mean is:  25.5271331275685"
x.sd <- sd(x)
print(paste("The sample standard deviation is: ", x.sd))
[1] "The sample standard deviation is:  5.17643068785037"

Run the simulations many times and plot the simulated sampling distributions of each statistic.

B <- 10000

x.med <- replicate(B, median(rnorm(n, mu, sigma)))

hist(x.med, probability = TRUE)
points(density(x.med), type = "l", col = "red")

B <- 10000

x.med <- replicate(B, median(rnorm(n, mu, sigma)))
x.range <- replicate(B, diff(range(rnorm(n, mu, sigma))))
x.iqr <- replicate(B, diff(quantile(rnorm(n, mu, sigma), c(0.25,0.75)) ))
x.mean <- replicate(B, mean(rnorm(n, mu, sigma)))
x.sd <- replicate(B, sd(rnorm(n, mu, sigma)))
x.975 <- replicate(B, quantile(rnorm(n, mu, sigma), 0.975))

hist(x.med, probability = TRUE)
points(density(x.med), type = "l", col = "red")

hist(x.range, probability = TRUE)
points(density(x.range), type = "l", col = "red")

hist(x.iqr, probability = TRUE)
points(density(x.iqr), type = "l", col = "red")

hist(x.mean, probability = TRUE)
points(density(x.mean), type = "l", col = "red")

hist(x.sd, probability = TRUE)
points(density(x.sd), type = "l", col = "red")

hist(x.975, probability = TRUE)
points(density(x.975), type = "l", col = "red")

So which is better the median or mean? Consider the properties of the sampling distributions.

mean(x.med)
[1] 24.9906
mean(x.mean)
[1] 25.00594
var(x.med)
[1] 1.240971
var(x.mean)
[1] 0.8316636
sd(x.med)
[1] 1.113989
sd(x.mean)
[1] 0.9119559

probability intervals

quantile(x.med,c(0.025,0.975))
    2.5%    97.5% 
22.82376 27.15950 
quantile(x.mean,c(0.025,0.975))
    2.5%    97.5% 
23.21673 26.78768 

Nonparmetric Bootstrap

Resample the original data using the simpleboot library.

library(simpleboot)
Simple Bootstrap Routines (1.1-8)
library(boot)
x <- x.orig
par(mfrow=c(2,3))

median.boot <- one.boot(x, median,  R = B)
boot.ci(median.boot)
Warning in boot.ci(median.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = median.boot)

Intervals : 
Level      Normal              Basic         
95%   (22.13, 27.08 )   (21.98, 26.78 )  

Level     Percentile            BCa          
95%   (22.51, 27.31 )   (22.49, 27.27 )  
Calculations and Intervals on Original Scale
hist(median.boot, main = "median")

my.range = function(x){
        r = max(x) - min(x)
        return(r)
}

range.boot = one.boot(x, my.range, R = B)
boot.ci(range.boot)
Warning in boot.ci(range.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = range.boot)

Intervals : 
Level      Normal              Basic         
95%   (17.79, 26.44 )   (20.63, 28.87 )  

Level     Percentile            BCa          
95%   (12.38, 20.63 )   (15.61, 20.63 )  
Calculations and Intervals on Original Scale
hist(range.boot, main="range")

iqr = function(x){
        i = quantile(x, 0.75) - quantile(x, 0.25)
        return(i)
}

iqr.boot = one.boot(x, iqr, R = B)
boot.ci(iqr.boot)
Warning in boot.ci(iqr.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = iqr.boot)

Intervals : 
Level      Normal              Basic         
95%   ( 4.485,  9.627 )   ( 4.777,  9.823 )  

Level     Percentile            BCa          
95%   ( 3.844,  8.891 )   ( 4.306,  9.416 )  
Calculations and Intervals on Original Scale
hist(iqr.boot, main = "iqr")

mean.boot = one.boot(x, mean, R = B)
boot.ci(mean.boot)
Warning in boot.ci(mean.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = mean.boot)

Intervals : 
Level      Normal              Basic         
95%   (23.69, 27.36 )   (23.62, 27.30 )  

Level     Percentile            BCa          
95%   (23.75, 27.43 )   (23.71, 27.38 )  
Calculations and Intervals on Original Scale
hist(mean.boot, main="mean")

sd.boot = one.boot(x, sd, R = B)
boot.ci(sd.boot)
Warning in boot.ci(sd.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = sd.boot)

Intervals : 
Level      Normal              Basic         
95%   ( 3.968,  6.637 )   ( 4.002,  6.640 )  

Level     Percentile            BCa          
95%   ( 3.713,  6.351 )   ( 3.965,  6.582 )  
Calculations and Intervals on Original Scale
hist(sd.boot, main="sd")

p975 = function(x){
    x.975 = quantile(x, 0.975)
    return(x.975)
}

p975.boot = one.boot(x, p975, R = B)
boot.ci(p975.boot)
Warning in boot.ci(p975.boot): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates

CALL : 
boot.ci(boot.out = p975.boot)

Intervals : 
Level      Normal              Basic         
95%   (32.93, 42.79 )   (34.99, 43.24 )  

Level     Percentile            BCa          
95%   (30.44, 38.69 )   (30.44, 38.69 )  
Calculations and Intervals on Original Scale
hist(p975.boot, main="97.5 percentile")

Using the infer R package for bootstrap

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(infer)

B <- 10000

x <- tibble(x)

meadian

boot_infer <- x |>
  specify(response = x) |>
  generate(reps = B, type = "bootstrap") |>
  calculate(stat = "median")

visualize(boot_infer) 

percentile_ci <- boot_infer |> 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     22.5     27.3
visualize(boot_infer) + 
  shade_confidence_interval(endpoints = percentile_ci)

mean

boot_infer <- x |>
  specify(response = x) |>
  generate(reps = B, type = "bootstrap") |>
  calculate(stat = "mean")

visualize(boot_infer) 

percentile_ci <- boot_infer |> 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     23.7     27.4
visualize(boot_infer) + 
  shade_confidence_interval(endpoints = percentile_ci)

sd

boot_infer <- x |>
  specify(response = x) |>
  generate(reps = B, type = "bootstrap") |>
  calculate(stat = "sd")

visualize(boot_infer) 

percentile_ci <- boot_infer |> 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     3.68     6.33
visualize(boot_infer) + 
  shade_confidence_interval(endpoints = percentile_ci)

t

set.seed(123)
boot_infer <- x |>
  specify(response = x) |>
  hypothesize(null = "point", mu = 25) |>
  generate(reps = B, type = "bootstrap") |>
  calculate(stat = "t")

visualize(boot_infer) 

percentile_ci <- boot_infer |> 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    -2.30     1.86
visualize(boot_infer) + 
  shade_confidence_interval(endpoints = percentile_ci)