---
title: "R Notebook"
output: html_notebook
---
```{r}
library(pacman)
p_load(NHANES, tidyverse, tidymodels)
```
### Step 1: Access the data.
```{r}
# Load the NHANES data set into the workspace and display it.
data("NHANES")
NHANES
```
### Step 2: Clean the data. Prepare the data and make a recipe for applying the steps needed to preprocess the data.
First drop the ID and SleepTrouble columns, then drop all of the rows where the y-variable SleepHrsNight is missing.
```{r}
# Modelling table: keep only rows with an observed outcome, discard the ID
# and SleepTrouble columns, and put the outcome SleepHrsNight first.
NHANES_SleepHrsNight <- NHANES %>%
  drop_na(SleepHrsNight) %>%
  select(SleepHrsNight, everything(), -ID, -SleepTrouble)
NHANES_SleepHrsNight
```
Summarize the y-variable.
```{r}
# Frequency table of the outcome: number of rows (n) per SleepHrsNight value
# and each value's share of all rows (freq).
NHANES_SleepHrsNight %>%
  count(SleepHrsNight) %>%
  mutate(freq = n / sum(n))
```
Make the first split, with 80% of the data going into the training data set.
```{r}
# Fix the RNG seed so the random split, and every result derived from it,
# is the same each time the notebook is run.
set.seed(2021)
# Randomly assign 80% of the rows to training and the remainder to testing.
NHANES_SleepHrsNight_split <- initial_split(NHANES_SleepHrsNight, prop = 0.8)
NHANES_SleepHrsNight_split
```
Training data.
```{r}
# Display the training portion of the split.
training(NHANES_SleepHrsNight_split)
```
```{r}
# Plot the missing-value pattern of the training rows.
# Uses NHANES_SleepHrsNight_split, the only split this notebook creates
# (the chunk previously referenced an undefined NHANES_SleepTrouble_split).
# vis_miss() is called with its namespace because the setup chunk does not
# attach the package that provides it.
NHANES_SleepHrsNight_split %>%
  training() %>%
  visdat::vis_miss()
```
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, the use of the step_knnimpute() function, which fills in the cells that are missing using the values from the k nearest neighbors (the most similar rows), and the use of the step_corr() function, which removes highly correlated input features.
```{r}
# Define the preprocessing recipe on the training rows and estimate it with
# prep(). SleepHrsNight is the outcome; every other column is a predictor.
NHANES_SleepHrsNight_recipe <- training(NHANES_SleepHrsNight_split) %>%
  recipe(SleepHrsNight ~ .) %>%
  # Remove predictors with zero or near-zero variance.
  step_nzv(all_predictors()) %>%
  # Impute missing predictor values from the k nearest neighbours.
  # NOTE(review): newer recipes releases name this step step_impute_knn();
  # confirm against the installed version before renaming.
  step_knnimpute(all_predictors()) %>%
  # Filter highly correlated numeric columns. The outcome is excluded so the
  # filter can only ever remove predictors, never SleepHrsNight itself.
  step_corr(all_numeric(), -all_outcomes()) %>%
  prep()
# Inspect the prepped recipe: variable roles, then the list of steps.
# (These calls previously referenced an undefined NHANES_SleepTrouble_recipe.)
summary(NHANES_SleepHrsNight_recipe)
tidy(NHANES_SleepHrsNight_recipe)
```
```{r}
# Apply the prepped recipe to the held-out testing rows.
NHANES_SleepHrsNight_testing <- bake(
  NHANES_SleepHrsNight_recipe,
  testing(NHANES_SleepHrsNight_split)
)
NHANES_SleepHrsNight_testing
```
```{r}
# Extract the preprocessed training data stored in the prepped recipe.
NHANES_SleepHrsNight_training <- NHANES_SleepHrsNight_recipe %>%
  juice()
NHANES_SleepHrsNight_training
```
### Step 3: Training a model on the data
Setup the models.
```{r}
# Linear regression (engine "lm") of SleepHrsNight on all other columns of
# the preprocessed training data.
NHANES_SleepHrsNight_lm <- fit(
  linear_reg(mode = "regression") %>% set_engine("lm"),
  SleepHrsNight ~ .,
  data = NHANES_SleepHrsNight_training
)
```
```{r}
# Predictions of the linear model for the preprocessed testing rows.
NHANES_SleepHrsNight_lm %>%
  predict(NHANES_SleepHrsNight_testing)
```
```{r}
# Linear-model predictions placed next to the testing data they belong to
# (prediction column first).
bind_cols(
  predict(NHANES_SleepHrsNight_lm, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
)
```
### Step 4: Evaluate the models.
```{r}
# Summary performance metrics of the linear model on the testing data,
# comparing the observed SleepHrsNight with the predicted .pred.
bind_cols(
  predict(NHANES_SleepHrsNight_lm, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
) %>%
  metrics(truth = SleepHrsNight, estimate = .pred)
```
```{r}
# Root mean squared error of the linear model on the testing data.
rmse(
  bind_cols(
    predict(NHANES_SleepHrsNight_lm, NHANES_SleepHrsNight_testing),
    NHANES_SleepHrsNight_testing
  ),
  truth = SleepHrsNight,
  estimate = .pred
)
```
### Step 5: Improve the model.
GLM model using regularization.
Setup the models.
```{r}
# Regularized linear regression (engine "glmnet", penalty 0.001, mixture 0.5)
# of SleepHrsNight on all other columns of the preprocessed training data.
NHANES_SleepHrsNight_glm <- fit(
  linear_reg(mode = "regression", penalty = 0.001, mixture = 0.5) %>%
    set_engine("glmnet"),
  SleepHrsNight ~ .,
  data = NHANES_SleepHrsNight_training
)
```
```{r}
# Predictions of the regularized model for the preprocessed testing rows.
NHANES_SleepHrsNight_glm %>%
  predict(NHANES_SleepHrsNight_testing)
```
```{r}
# Regularized-model predictions placed next to the testing data they belong
# to (prediction column first).
bind_cols(
  predict(NHANES_SleepHrsNight_glm, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
)
```
```{r}
# Summary performance metrics of the regularized model on the testing data,
# comparing the observed SleepHrsNight with the predicted .pred.
bind_cols(
  predict(NHANES_SleepHrsNight_glm, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
) %>%
  metrics(truth = SleepHrsNight, estimate = .pred)
```
```{r}
# Root mean squared error of the regularized model on the testing data.
rmse(
  bind_cols(
    predict(NHANES_SleepHrsNight_glm, NHANES_SleepHrsNight_testing),
    NHANES_SleepHrsNight_testing
  ),
  truth = SleepHrsNight,
  estimate = .pred
)
```
Random forest model.
Set up the model.
```{r}
# Random forest regression (engine "ranger", 1000 trees) of SleepHrsNight on
# all other columns of the preprocessed training data.
NHANES_SleepHrsNight_ranger <- fit(
  rand_forest(mode = "regression", trees = 1000) %>%
    set_engine("ranger"),
  SleepHrsNight ~ .,
  data = NHANES_SleepHrsNight_training
)
```
```{r}
# Predictions of the random forest for the preprocessed testing rows.
NHANES_SleepHrsNight_ranger %>%
  predict(NHANES_SleepHrsNight_testing)
```
```{r}
# Random-forest predictions placed next to the testing data they belong to
# (prediction column first).
bind_cols(
  predict(NHANES_SleepHrsNight_ranger, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
)
```
```{r}
# Summary performance metrics of the random forest on the testing data,
# comparing the observed SleepHrsNight with the predicted .pred.
bind_cols(
  predict(NHANES_SleepHrsNight_ranger, NHANES_SleepHrsNight_testing),
  NHANES_SleepHrsNight_testing
) %>%
  metrics(truth = SleepHrsNight, estimate = .pred)
```
```{r}
# Root mean squared error of the random forest on the testing data.
rmse(
  bind_cols(
    predict(NHANES_SleepHrsNight_ranger, NHANES_SleepHrsNight_testing),
    NHANES_SleepHrsNight_testing
  ),
  truth = SleepHrsNight,
  estimate = .pred
)
```