---
title: "Stat. 652 - NHANES kNN"
output: html_notebook
---
Here we use tidymodels and the recipes package to fit classifiers: a kNN model and, for comparison, two random forests.
Note the need to impute values because of the high percentage of missing values.
For the kNN we use only numeric features.
```{r}
library(pacman)
p_load(NHANES, tidyverse, tidymodels, naniar)
```
### Step 1: Access the data.
```{r}
# Load the NHANES table from the NHANES package into the workspace,
# then display it.
data("NHANES", package = "NHANES")
NHANES
```
### Step 2: Clean the data. Prepare the data and make a recipe for applying the steps needed to preprocess the data.
First drop all of the rows where the y-variable SleepTrouble is missing.
Second, select all of the numeric variables, because kNN computes distances and therefore needs numeric variables. We collect the names of the numeric columns in a character vector and then use a tidyselect helper (one_of(), or its newer replacement all_of()) to pick out the columns with these names.
```{r}
# Drop ID and SleepHrsNight, move the outcome SleepTrouble to the first
# column, and keep only the rows where SleepTrouble is observed.
# NOTE(review): if the NHANES table repeats individuals (the package
# describes it as a resample -- confirm), dropping ID lets copies of the same
# person land in both the training and the testing set.
NHANES_SleepTrouble <- NHANES %>%
  select(-ID, -SleepHrsNight) %>%
  select(SleepTrouble, everything()) %>%
  drop_na(SleepTrouble)
NHANES_SleepTrouble

# Character vector holding the names of the numeric columns.
# select(where(is.numeric)) replaces the superseded select_if().
# (The "NHAMES" spelling is kept so any other reference to this object
# still resolves.)
NHAMES_SleepTrouble_num <- NHANES_SleepTrouble %>%
  select(where(is.numeric)) %>%
  names()
NHAMES_SleepTrouble_num

# Keep the outcome plus the numeric columns. all_of() replaces the
# superseded one_of() and errors, rather than warns, on an unknown name.
NHANES_SleepTrouble <- NHANES_SleepTrouble %>%
  select(SleepTrouble, all_of(NHAMES_SleepTrouble_num))
NHANES_SleepTrouble
```
Summarize the y-variable.
```{r}
# Class balance of the outcome: number of rows and share of the total
# for each level of SleepTrouble.
NHANES_SleepTrouble %>%
  count(SleepTrouble) %>%
  mutate(freq = n / sum(n))
```
Make the first split, with 80% of the data going into the training data set.
```{r}
# Seed the RNG so the random partition below -- and therefore every result
# computed from it -- is the same each time the notebook is run.
set.seed(652)

# Random split with 80% of the rows assigned to training.
NHANES_SleepTrouble_split <- initial_split(NHANES_SleepTrouble, prop = 0.8)
NHANES_SleepTrouble_split
```
Training data.
```{r}
# Display the training portion of the split.
training(NHANES_SleepTrouble_split)
```
```{r}
# Plot the missing-value pattern of the training portion.
vis_miss(training(NHANES_SleepTrouble_split))
```
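Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and of the median-imputation step (step_medianimpute(), renamed step_impute_median() in recipes 0.1.16 and later), which fills in the cells that are missing with the median of the column. The numeric predictors are then centered and scaled, which matters for kNN because it is distance-based.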
```{r}
# Preprocessing recipe, estimated (prep) on the training portion only:
#   1. step_nzv()           - drop predictors with near-zero variance
#   2. step_impute_median() - fill missing numeric cells with the column median
#   3. step_center()/step_scale() - centre and scale the numeric predictors
# step_impute_median() is the current name of step_medianimpute(), which
# recipes deprecated in version 0.1.16.
NHANES_SleepTrouble_recipe <- recipe(
  SleepTrouble ~ .,
  data = training(NHANES_SleepTrouble_split)
) %>%
  step_nzv(all_predictors()) %>%
  step_impute_median(all_numeric()) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  prep()

# Variables and roles after preprocessing, then the list of recipe steps.
summary(NHANES_SleepTrouble_recipe)
tidy(NHANES_SleepTrouble_recipe)
```
```{r}
# Apply the prepped recipe to the testing portion of the split.
NHANES_SleepTrouble_testing <- bake(
  NHANES_SleepTrouble_recipe,
  testing(NHANES_SleepTrouble_split)
)
NHANES_SleepTrouble_testing
```
```{r}
# Pull the preprocessed training data back out of the prepped recipe.
NHANES_SleepTrouble_training <- NHANES_SleepTrouble_recipe %>%
  juice()
NHANES_SleepTrouble_training
```
### Step 3: Training a model on the data
Set up the models: a kNN classifier, plus two random forests (ranger and randomForest engines) for comparison.
```{r}
# Fit three classifiers of SleepTrouble on the preprocessed training data.
# The random forests draw random numbers while fitting, so seed the RNG
# first to make the fitted models repeatable from run to run.
set.seed(652)

# k-nearest neighbours with 21 neighbours, kknn engine.
NHANES_SleepTrouble_kknn <- nearest_neighbor(neighbors = 21) %>%
  set_engine("kknn") %>%
  set_mode("classification") %>%
  fit(SleepTrouble ~ ., data = NHANES_SleepTrouble_training)

# Random forest with 100 trees, ranger engine.
NHANES_SleepTrouble_ranger <- rand_forest(trees = 100) %>%
  set_engine("ranger") %>%
  set_mode("classification") %>%
  fit(SleepTrouble ~ ., data = NHANES_SleepTrouble_training)

# Random forest with 100 trees, randomForest engine.
NHANES_SleepTrouble_rf <- rand_forest(trees = 100) %>%
  set_engine("randomForest") %>%
  set_mode("classification") %>%
  fit(SleepTrouble ~ ., data = NHANES_SleepTrouble_training)
```
```{r}
# Predictions on the testing data, one table per fitted model.
NHANES_SleepTrouble_kknn %>%
  predict(NHANES_SleepTrouble_testing)

NHANES_SleepTrouble_ranger %>%
  predict(NHANES_SleepTrouble_testing)

NHANES_SleepTrouble_rf %>%
  predict(NHANES_SleepTrouble_testing)
```
```{r}
# Predictions placed side by side with the testing data, for each model.

# kNN
bind_cols(
  predict(NHANES_SleepTrouble_kknn, NHANES_SleepTrouble_testing),
  NHANES_SleepTrouble_testing
)

# Random forest (ranger)
bind_cols(
  predict(NHANES_SleepTrouble_ranger, NHANES_SleepTrouble_testing),
  NHANES_SleepTrouble_testing
)

# Random forest (randomForest)
bind_cols(
  predict(NHANES_SleepTrouble_rf, NHANES_SleepTrouble_testing),
  NHANES_SleepTrouble_testing
)
```
### Step 4: Evaluate the models.
```{r}
# Performance metrics on the testing data for each model, comparing the
# predicted class (.pred_class) with the observed SleepTrouble.

# kNN
metrics(
  bind_cols(
    predict(NHANES_SleepTrouble_kknn, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)

# Random forest (ranger)
metrics(
  bind_cols(
    predict(NHANES_SleepTrouble_ranger, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)

# Random forest (randomForest)
metrics(
  bind_cols(
    predict(NHANES_SleepTrouble_rf, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)
```
```{r}
# Confusion matrix on the testing data for each model: observed
# SleepTrouble against the predicted class (.pred_class).

# kNN
conf_mat(
  bind_cols(
    predict(NHANES_SleepTrouble_kknn, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)

# Random forest (ranger)
conf_mat(
  bind_cols(
    predict(NHANES_SleepTrouble_ranger, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)

# Random forest (randomForest)
conf_mat(
  bind_cols(
    predict(NHANES_SleepTrouble_rf, NHANES_SleepTrouble_testing),
    NHANES_SleepTrouble_testing
  ),
  truth = SleepTrouble,
  estimate = .pred_class
)
```