### Nonparametric Bootstrap Example rm(list=ls()) # set.seed(1222) ### Heats of Sublimation of Platinum (kcal/mol), p.393 # number of bootstrap samples B = 200000 ncor = 2 # number of corse used, 2 for dual core Bp = B/ncor ######################################################################################## library(Hmisc) # The Data x = c(136.3, 136.6, 135.8, 135.4, 134.7, 135.0, 134.1, 143.3, 147.8, 148.8, 134.8, 135.2, 134.9, 146.5, 141.2, 135.4, 134.8, 135.8, 135.0, 133.7, 134.4, 134.9, 134.8, 134.5, 134.3, 135.2) x.orig = x n = length(x) # First, we will see if the data looks normal. # Plot the ecdf of the data and compare it to the normal cdf. # Standardize the values of x, get z, and then compare to the standard normal distribution. # Does the data look normal? For the data to be normal the ecdf should look like the # normal cdf. x.bar = mean(x) x.sd = sqrt(var(x)) z = (x-x.bar)/x.sd X11() plot(ecdf(z)) # Plot a histogram of the data. Do the data look normal? X11() hist(x, col="wheat", main="Heats of Sublimation of Platinum", xlab="kcal/mol") # Make a stemplot. stem(x) # Make a boxplot. Are there some outliers? X11() boxplot(x,main="Heats of Sublimation of Platinum", ylab="kcal/mol") # Make a desity plot. Does it look normal? X11() plot(density(x), type="l",main="Heats of Sublimation of Platinum (kcal/mol)" ) # Make a time plot. Note that over time there appear to be some outlies. X11() plot(x, type="l", main="Time Plot", ylab="kcal/mol") # Get some decriptive statistics on x. summary(x) ######################################################################################## ######################################################################################## # Since the data don't look normal we can't use the normal distribution and # the parametric bootstrap. We could try to find a better distibution or we could # use the nonparametric bootstrap. # Find the sampling distribution of the mean of x using the nonparametric bootstrap. x.boot = numeric(B) for(i in 1:B){ x.boot[i] = mean(sample(x,n,replace=T)) } # A bootstrap estiamte of the population mean. mean(x.boot) # A bootstrap estimate of the standard error of the sampling distribution on the # sample mean. sd(x.boot) # A plot of the sampling distribution of the sample mean. What does is its shape? # What do you expect to see here according to the CLT. X11() hist(x.boot, col="dodgerblue2",main="Sampling Distribtuion of the Mean", xlab=expression(paste(hat(theta),"*"))) # A nonparametric bootstrap 95% CI for the population mean - percentile method CI.mean = quantile(x.boot, c(0.025, 0.975)) CI.mean # A nonbootstrap bootstrap 95% CI - basic - presented in the Rice book alpha = 0.05 x.delta = x.boot - x.bar delta.up = quantile(x.delta,(1-alpha/2)) delta.low = quantile(x.delta,(alpha/2)) boot.LB = x.bar - delta.up boot.LB boot.UB = x.bar - delta.low boot.UB CI.mean.rice = c(boot.LB,boot.UB) CI.mean.rice ######################################################################### # use the boot() function in R library(boot) ### mean mean.fn = function(x, d){ return(mean(x[d])) } system.time(mu.boot <- boot(x.orig, mean.fn, R=B)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci.boot.mean = boot.ci(mu.boot, conf=conf1, type="all"); ci.boot.mean hist(mu.boot$t) # install the snow library, one library used for parallel processing in R library(snow) system.time(mu.boot <- boot(x.orig, mean.fn, R = Bp, parallel = c("snow"), ncpus = ncor)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci = boot.ci(mu.boot, conf=conf1, type="all"); ci hist(mu.boot$t) ######################################################################################## ######################################################################################## # Find the sampling distribution of the median of x using the nonparametric bootstrap. x.boot = numeric(B) for(i in 1:B){ x.boot[i] = median(sample(x,n,replace=T)) } # A bootstrap estiamte of the population median. mean(x.boot) # A bootstrap estimate of the standard error of the sampling distribution on the # sample median. sd(x.boot) # A plot of the sampling distribution of the sample median. What does is its shape? X11() hist(x.boot, col="limegreen",main="Sampling Distribution of the Median", xlab=expression(paste(hat(theta),"*"))) # A nonparametric bootstrap 95% CI for the population median - percentile method CI.median = quantile(x.boot, c(0.025, 0.975)) CI.median # A nonbootstrap bootstrap 95% CI - basic - presented in the Rice book alpha = 0.05 x.med = median(x) x.delta = x.boot - x.med delta.up = quantile(x.delta,(1-alpha/2)) delta.low = quantile(x.delta,(alpha/2)) boot.LB = x.med - delta.up boot.LB boot.UB = x.med - delta.low boot.UB CI.median.rice = c(boot.LB,boot.UB) CI.median.rice ######################################################################### # use the boot() function in R ### median median.fn = function(x, d){ return(median(x[d])) } system.time(mu.boot <- boot(x.orig, median.fn, R=B)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci.boot.median = boot.ci(mu.boot, conf=conf1, type="all"); ci.boot.median hist(mu.boot$t) # install the snow library, one library used for parallel processing in R system.time(mu.boot <- boot(x.orig, median.fn, R = Bp, parallel = c("snow"), ncpus = ncor)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci = boot.ci(mu.boot, conf=conf1, type="all"); ci hist(mu.boot$t) ######################################################################################## ######################################################################################## # Find the sampling distribution of the 10% trimmed mean of x using the nonparametric bootstrap. x.boot = numeric(B) for(i in 1:B){ x.boot[i] = mean(sample(x,n,replace=T),trim=0.10) } # A bootstrap estiamte of the population mean using the 10% trimmed mean. mean(x.boot) # A bootstrap estimate of the standard error of the sampling distribution on the # 10% trimmed mean. sd(x.boot) # A plot of the sampling distribution of the 10% trimmed mean. What does is its shape? X11() hist(x.boot, col="tomato", main="Sampling Distribution of the 10% Trimmed Mean", xlab=expression(paste(hat(theta),"*"))) # A nonparametric bootstrap 95% CI for the population mean. CI.trmean = quantile(x.boot, c(0.025, 0.975)) CI.trmean # A nonbootstrap bootstrap 95% CI - basic - presented in the Rice book alpha = 0.05 x.trmean = mean(x, trim=0.10) x.delta = x.boot - x.trmean delta.up = quantile(x.delta,(1-alpha/2)) delta.low = quantile(x.delta,(alpha/2)) boot.LB = x.trmean - delta.up boot.LB boot.UB = x.trmean - delta.low boot.UB CI.trmean.rice = c(boot.LB,boot.UB) CI.trmean.rice ######################################################################### # use the boot() function in R ### trmean trmean.fn = function(x, d){ return(mean(x[d], trim=0.10)) } system.time(mu.boot <- boot(x.orig, trmean.fn, R=B)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci.boot.trmean = boot.ci(mu.boot, conf=conf1, type="all"); ci.boot.trmean hist(mu.boot$t) # install the snow library, one library used for parallel processing in R system.time(mu.boot <- boot(x.orig, trmean.fn, R = Bp, parallel = c("snow"), ncpus = ncor)) plot(mu.boot) mu.boot length(mu.boot$t) mu.boot$t0 summary(mu.boot$data) conf1 = c(0.90,0.95) ci = boot.ci(mu.boot, conf=conf1, type="all"); ci hist(mu.boot$t) ######################################################################################## ######################################################################################## CI.mean; CI.mean.rice; ci.boot.mean CI.median; CI.median.rice; ci.boot.median CI.trmean; CI.trmean.rice; ci.boot.trmean