If you want to learn about Statistics using base R a nice website is the Quick-R website, see Statistics > t-tests
These are some example of basic statistics and hypothesis testing in R. Most of the code here is from base R.
We will use the mtcars data set.
library(tidyverse)
mtcars
mtcars %>% summarize(mpg_mean = mean(mpg), mpg_sd = sd(mpg))
mtcars %>% group_by(vs) %>%
summarize(mpg_mean = mean(mpg), mpg_sd = sd(mpg))
Note that the t.test function does not work well with the tidyverse. There is a new package called infer that works with the tidyverse. And if you are interested chech out the broom package.
I like using the formula interface when doing hypothesis testing.
?t.test
with(mtcars, boxplot(mpg ~ vs))
output1 <- with(mtcars, t.test(mpg ~vs))
output1
summary(output1)
output1$statistic
output1$p.value
?aov
with(mtcars, boxplot(mpg ~cyl))
output2 <- with(mtcars, aov(mpg ~ cyl))
output2
summary(output2)
?lm
attach(mtcars)
plot(mpg ~ wt)
output3 <-lm(mpg ~ wt)
output3
summary(output3)
plot(mpg ~ wt)
abline(lm(mpg ~ wt))
detach(mtcars)
mtcars %>% ggplot(aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method=lm) +
geom_smooth()
If you want to learn Hypothesis Testing using modern R code check out the book moderndive. See Chapter 10. The authors of this book are working on a new package called infer R package.
library(infer)
The two sample t test example from the website.
library(nycflights13)
library(dplyr)
library(stringr)
library(infer)
set.seed(2017)
fli_small <- flights %>%
sample_n(size = 500) %>%
mutate(half_year = case_when(
between(month, 1, 6) ~ "h1",
between(month, 7, 12) ~ "h2"
)) %>%
mutate(day_hour = case_when(
between(hour, 1, 12) ~ "morning",
between(hour, 13, 24) ~ "not morning"
)) %>%
select(arr_delay, dep_delay, half_year,
day_hour, origin, carrier)
obs_t <- fli_small %>%
specify(arr_delay ~ half_year) %>%
calculate(stat = "t", order = c("h1", "h2"))
obs_t <- fli_small %>%
t_stat(formula = arr_delay ~ half_year, order = c("h1", "h2"))
t_null_perm <- fli_small %>%
# alt: response = arr_delay, explanatory = half_year
specify(arr_delay ~ half_year) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "t", order = c("h1", "h2"))
visualize(t_null_perm) +
shade_p_value(obs_stat = obs_t, direction = "two_sided")
Randomized p-value
t_null_perm %>%
get_p_value(obs_stat = obs_t, direction = "two_sided")
Theoretical p-value
t_null_theor <- fli_small %>%
# alt: response = arr_delay, explanatory = half_year
specify(arr_delay ~ half_year) %>%
hypothesize(null = "independence") %>%
# generate() ## Not used for theoretical
calculate(stat = "t", order = c("h1", "h2"))
visualize(t_null_theor, method = "theoretical") +
shade_p_value(obs_stat = obs_t, direction = "two_sided")
Overlay
visualize(t_null_perm, method = "both") +
shade_p_value(obs_stat = obs_t, direction = "two_sided")
Compute the Theoretical p-value
fli_small %>%
t_test(formula = arr_delay ~ half_year,
alternative = "two_sided",
order = c("h1", "h2")) %>%
dplyr::pull(p_value)