library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)
Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived, and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
I like to clean names so the variables all have names with lowercase letters and underscores.
titanic_train <- titanic_train %>% clean_names()
head(titanic_train)
It is always a good idea to check for duplicate records/examples/rows in your dataset.
get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.
titanic_test <- titanic_test %>% clean_names()
head(titanic_test)
It is always a good idea to check for duplicate records/examples/rows in your dataset.
get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
titanic_test2 <- titanic_test %>% select(-passenger_id, -name, -ticket, -cabin) %>%
mutate(
pclass = as_factor(pclass),
sex = as_factor(sex),
embarked = as_factor(embarked)
)
head(titanic_test2)
NA
Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.
vis_miss(titanic_train2)

gg_miss_var(titanic_train2)

gg_miss_var(titanic_train2, show_pct = TRUE)

create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
Now try the ML algorithms.
Model 0:
Summarize the y-variable. Null Model.
titanic_train2 %>% group_by(survived) %>%
summarize(n = n()) %>%
mutate(freq = n / sum(n))
Make the first split with 80% of the data being in the training data set.
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Analysis/Assess/Total>
<713/178/891>
Training data.
titanic_train2_split %>%
training()
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_meanimpute() function, which fills in the cells that are missing with the mean of the column.
titanic_train2_recipe <- training(titanic_train2_split) %>%
recipe(survived ~ .) %>%
step_rm(pclass, sex, embarked) %>%
step_nzv(all_predictors()) %>%
step_meanimpute(age) %>%
prep()
summary(titanic_train2_recipe)
tidy(titanic_train2_recipe)
Apply the recipe, so the age variable should be complete after the imputation.
titanic_train2_testing <- titanic_train2_recipe %>%
bake(testing(titanic_train2_split))
titanic_train2_testing
titanic_train2_training <- juice(titanic_train2_recipe)
titanic_train2_training
Model 0: null
titanic_train2_null <- null_model() %>%
set_mode("classification") %>%
fit(survived ~ ., data = titanic_train2_training)
Engine set to `parsnip`.
predict(titanic_train2_null, titanic_train2_training)
titanic_train2_null %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing)
titanic_train2_null %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
metrics(truth = survived, estimate = .pred_class)
titanic_train2_null %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 109 69
1 0 0

Model 2: C5.0
Setup the model.
titanic_train2_C50 <- boost_tree(trees = 20) %>%
set_engine("C5.0") %>%
set_mode("classification") %>%
fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_C50, titanic_train2_training)
titanic_train2_C50 %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing)
titanic_train2_C50 %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
metrics(truth = survived, estimate = .pred_class)
titanic_train2_C50 %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 92 39
1 17 30
titanic_train2_C50 %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_curve(survived, .pred_0) %>%
autoplot()

titanic_train2_C50 %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_auc(survived, .pred_0)
titanic_train2_C50 %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
ggplot() +
geom_density(aes(x = .pred_1, fill = survived),
alpha = 0.5)

Model 2: XGBoost
Setup the model.
titanic_train2_xgb <- boost_tree(trees = 20) %>%
set_engine("xgboost") %>%
set_mode("classification") %>%
fit(survived ~ ., data = titanic_train2_training)
[09:57:43] WARNING: amalgamation/../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
predict(titanic_train2_xgb, titanic_train2_training)
titanic_train2_xgb %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing)
titanic_train2_xgb %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
metrics(truth = survived, estimate = .pred_class)
titanic_train2_xgb %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 90 38
1 19 31
titanic_train2_xgb %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_curve(survived, .pred_0) %>%
autoplot()

titanic_train2_xgb %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_auc(survived, .pred_0)
titanic_train2_xgb %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
ggplot() +
geom_density(aes(x = .pred_1, fill = survived),
alpha = 0.5)

Model 3: Random Forest
Setup the model.
titanic_train2_ranger <- rand_forest(trees = 100) %>%
set_engine("ranger") %>%
set_mode("classification") %>%
fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_ranger, titanic_train2_training)
titanic_train2_ranger %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing)
titanic_train2_ranger %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
metrics(truth = survived, estimate = .pred_class)
titanic_train2_ranger %>%
predict(titanic_train2_testing) %>%
bind_cols(titanic_train2_testing) %>%
conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 89 33
1 20 36
titanic_train2_ranger %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_curve(survived, .pred_1) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path() +
geom_abline(lty = 3) +
coord_equal()

titanic_train2_ranger %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_curve(survived, event_level = "first") %>%
autoplot()
Error: No valid variables provided to `...`.
Run `rlang::last_error()` to see where the error occurred.
titanic_train2_ranger %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
roc_auc(survived, .pred_0)
titanic_train2_ranger %>%
predict(titanic_train2_testing, type = "prob") %>%
bind_cols(titanic_train2_testing) %>%
ggplot() +
geom_density(aes(x = .pred_1, fill = survived),
alpha = 0.5)

---
title: "Stat 652: Midterm: Model 2, Model 3"
author: "Prof. Eric A. Suess"
output:
  html_notebook: default
  word_document: default
  pdf_document: default
---



```{r}
library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)
```

Load the data from the titanic R package.  Note that the *titanic_train* dataset contains the labels for Survived, and the *titanic_test* dataset does not contain the labels.  So we will build our machine learning model using the *titanic_train* dataset and then make a final classification for the *titanic_test* dataset.  This is how Kaggle competitions are done.

I like to clean names so the variables all have names with lowercase letters and underscores.

```{r}
# Normalise the training-set column names with clean_names() (overwriting
# titanic_train), then preview the first rows.
titanic_train <- clean_names(titanic_train)

head(titanic_train)
```

It is always a good idea to check for duplicate records/examples/rows in your dataset.

```{r}
# No columns are named, so get_dupes() checks for rows duplicated across
# every column of the training data.
get_dupes(dat = titanic_train)
```

Drop the unique identifiers: *passenger_id*, *name*, and *ticket*.  Also drop *cabin* because it has a high missing rate.

```{r}
# Modelling copy of the training data: drop the identifier columns and cabin,
# then convert the outcome and the categorical predictors to factors.
titanic_train2 <- titanic_train %>%
  select(-c(passenger_id, name, ticket, cabin)) %>%
  mutate(across(c(survived, pclass, sex, embarked), as_factor))

head(titanic_train2)
```

```{r}
# Normalise the test-set column names with clean_names() (overwriting
# titanic_test), then preview the first rows.
titanic_test <- clean_names(titanic_test)

head(titanic_test)
```

It is always a good idea to check for duplicate records/examples/rows in your dataset.

```{r}
# No columns are named, so get_dupes() checks for rows duplicated across
# every column of the test data.
get_dupes(dat = titanic_test)
```

```{r}
# Modelling copy of the unlabeled test data: drop the identifier columns and
# cabin, then convert the categorical predictors to factors.
titanic_test2 <- titanic_test %>%
  select(-c(passenger_id, name, ticket, cabin)) %>%
  mutate(across(c(pclass, sex, embarked), as_factor))

head(titanic_test2)

```


Start by investigating the missing values and completeness of the features in the data.  Note that the *age* variable contains some missing values.

```{r}
# Three views of missingness in the modelling data: a cell-level map, then
# per-variable missing counts, then per-variable missing percentages.
titanic_train2 %>% vis_miss()
titanic_train2 %>% gg_miss_var()
titanic_train2 %>% gg_miss_var(show_pct = TRUE)
```

```{r eval = FALSE}
# Write a DataExplorer report for the modelling data, with survived as the
# response, to report.html in the current working directory.
create_report(
  data = titanic_train2,
  y = "survived",
  output_file = "report.html",
  output_dir = getwd()
)
```

Now try the ML algorithms.


## Model 0:

Summarize the y-variable.  Null Model.

```{r}
# Count and relative frequency of each outcome class; the larger share is
# the accuracy a majority-class (null) model would achieve.
titanic_train2 %>%
  group_by(survived) %>%
  summarise(n = n()) %>%
  mutate(freq = prop.table(n))
```

Make the first split with 80% of the data being in the training data set.

```{r}
# Fix the RNG seed so the random 80/20 partition, and every model fit and
# metric that follows, is reproducible from one knit to the next.
set.seed(652)

# Hold out 20% of the labeled data for evaluating the models.
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
```

Training data.

```{r}
# Show the rows assigned to the training partition of the split.
training(titanic_train2_split)
```

Create the recipe for applying the preprocessing.  Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_meanimpute() function, which fills in the cells that are missing with the mean of the column.

```{r}
# Define and estimate the preprocessing recipe on the training partition only,
# so the imputation value is not informed by the held-out rows.
#   step_rm()          drops the factor predictors pclass, sex and embarked
#   step_nzv()         drops predictors with near-zero variance
#   step_impute_mean() fills missing age values with the column mean
#                      (current name of the deprecated step_meanimpute())
titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

# Variables and roles remaining after preprocessing.
summary(titanic_train2_recipe)

# One row per recipe step.
tidy(titanic_train2_recipe)
```

Apply the recipe, so the *age* variable should be complete after the imputation.

```{r}
# Apply the prepped recipe to the held-out partition of the split.
titanic_train2_testing <- bake(
  titanic_train2_recipe,
  new_data = testing(titanic_train2_split)
)

titanic_train2_testing
```

```{r}
# Extract the preprocessed training data from the prepped recipe.
titanic_train2_training <- titanic_train2_recipe %>%
  juice()

titanic_train2_training
```

### Model 0: null


```{r}

# Baseline: fit a null classification model to the preprocessed training data.
titanic_train2_null <- fit(
  null_model() %>% set_mode("classification"),
  formula = survived ~ .,
  data = titanic_train2_training
)

```

```{r}
# Class predictions of the null model on the training data.
titanic_train2_null %>%
  predict(new_data = titanic_train2_training)
```

```{r}
# Null-model class predictions placed alongside the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
)
```


```{r}
# Class-prediction metrics of the null model on the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)
```



```{r}
# Confusion matrix of the null model on the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
```

```{r}
# ROC curve of the null model on the held-out rows. yardstick treats the first
# factor level of survived ("0") as the event by default, so the curve is
# scored with .pred_0, matching the ROC/AUC chunks of the other models.
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
```

### Model 2: C5.0

Setup the model.

```{r}
# Boosted-tree classifier (20 trees) using the C5.0 engine, fitted to the
# preprocessed training data.
titanic_train2_C50 <- fit(
  boost_tree(trees = 20) %>%
    set_engine("C5.0") %>%
    set_mode("classification"),
  formula = survived ~ .,
  data = titanic_train2_training
)

```



```{r}
# Class predictions of the C5.0 model on the training data.
titanic_train2_C50 %>%
  predict(new_data = titanic_train2_training)
```

```{r}
# C5.0 class predictions placed alongside the held-out rows.
bind_cols(
  predict(titanic_train2_C50, new_data = titanic_train2_testing),
  titanic_train2_testing
)
```


```{r}
# Class-prediction metrics of the C5.0 model on the held-out rows.
bind_cols(
  predict(titanic_train2_C50, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)
```



```{r}
# Confusion matrix of the C5.0 model on the held-out rows.
bind_cols(
  predict(titanic_train2_C50, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
```


```{r}
# ROC curve of the C5.0 model on the held-out rows, scored with the predicted
# probability of class 0 (.pred_0).
bind_cols(
  predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
```

```{r}
# Area under the ROC curve of the C5.0 model on the held-out rows, scored
# with .pred_0.
bind_cols(
  predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_auc(survived, .pred_0)
```


```{r}
# Density of the C5.0 predicted probability of class 1 on the held-out rows,
# one filled curve per true survived class.
bind_cols(
  predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  ggplot(aes(x = .pred_1, fill = survived)) +
  geom_density(alpha = 0.5)
```



### Model 2: XGBoost

Setup the model.

```{r}
# Boosted-tree classifier (20 trees) using the xgboost engine, fitted to the
# preprocessed training data.
titanic_train2_xgb <- fit(
  boost_tree(trees = 20) %>%
    set_engine("xgboost") %>%
    set_mode("classification"),
  formula = survived ~ .,
  data = titanic_train2_training
)

```



```{r}
# Class predictions of the xgboost model on the training data.
titanic_train2_xgb %>%
  predict(new_data = titanic_train2_training)
```

```{r}
# xgboost class predictions placed alongside the held-out rows.
bind_cols(
  predict(titanic_train2_xgb, new_data = titanic_train2_testing),
  titanic_train2_testing
)
```


```{r}
# Class-prediction metrics of the xgboost model on the held-out rows.
bind_cols(
  predict(titanic_train2_xgb, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)
```



```{r}
# Confusion matrix of the xgboost model on the held-out rows.
bind_cols(
  predict(titanic_train2_xgb, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
```

```{r}
# ROC curve of the xgboost model on the held-out rows, scored with the
# predicted probability of class 0 (.pred_0).
bind_cols(
  predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
```

```{r}
# Area under the ROC curve of the xgboost model on the held-out rows, scored
# with .pred_0.
bind_cols(
  predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_auc(survived, .pred_0)
```

```{r}
# Density of the xgboost predicted probability of class 1 on the held-out
# rows, one filled curve per true survived class.
bind_cols(
  predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  ggplot(aes(x = .pred_1, fill = survived)) +
  geom_density(alpha = 0.5)
```

### Model 3: Random Forest

Setup the model.

```{r}
# Random-forest classifier (100 trees) using the ranger engine, fitted to the
# preprocessed training data.
titanic_train2_ranger <- fit(
  rand_forest(trees = 100) %>%
    set_engine("ranger") %>%
    set_mode("classification"),
  formula = survived ~ .,
  data = titanic_train2_training
)

```



```{r}
# Class predictions of the random forest on the training data.
titanic_train2_ranger %>%
  predict(new_data = titanic_train2_training)
```

```{r}
# Random-forest class predictions placed alongside the held-out rows.
bind_cols(
  predict(titanic_train2_ranger, new_data = titanic_train2_testing),
  titanic_train2_testing
)
```


```{r}
# Class-prediction metrics of the random forest on the held-out rows.
bind_cols(
  predict(titanic_train2_ranger, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)
```



```{r}
# Confusion matrix of the random forest on the held-out rows.
bind_cols(
  predict(titanic_train2_ranger, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
```

```{r}
# ROC curve of the random forest on the held-out rows, drawn by hand from the
# roc_curve() output. yardstick treats the first factor level of survived
# ("0") as the event by default, so the curve has to be scored with .pred_0;
# scoring it with .pred_1 mirrors the curve below the diagonal and contradicts
# the roc_auc() value computed from .pred_0.
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal()

# The same curve drawn with autoplot() on the roc_curve() output.
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()



```

```{r}
# Area under the ROC curve of the random forest on the held-out rows, scored
# with .pred_0.
bind_cols(
  predict(titanic_train2_ranger, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_auc(survived, .pred_0)
```


```{r}
# Density of the random-forest predicted probability of class 1 on the
# held-out rows, one filled curve per true survived class.
bind_cols(
  predict(titanic_train2_ranger, titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  ggplot(aes(x = .pred_1, fill = survived)) +
  geom_density(alpha = 0.5)
```