Stat 652: Midterm: Model 2, Model 3

Author

Prof. Eric A. Suess

library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)

Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
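As a quick check that the test set really lacks the label, a one-line sketch (run before any renaming):

# The only column present in titanic_train but not in titanic_test should be Survived.
setdiff(names(titanic_train), names(titanic_test))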

I like to clean the variable names so they all use lowercase letters and underscores.

titanic_train <- titanic_train %>% clean_names()

head(titanic_train)
  passenger_id survived pclass
1            1        0      3
2            2        1      1
3            3        1      3
4            4        1      1
5            5        0      3
6            6        0      3
                                                 name    sex age sib_sp parch
1                             Braund, Mr. Owen Harris   male  22      1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38      1     0
3                              Heikkinen, Miss. Laina female  26      0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35      1     0
5                            Allen, Mr. William Henry   male  35      0     0
6                                    Moran, Mr. James   male  NA      0     0
            ticket    fare cabin embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q

It is always a good idea to check for duplicate records/examples/rows in your dataset.

get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
 [1] passenger_id survived     pclass       name         sex         
 [6] age          sib_sp       parch        ticket       fare        
[11] cabin        embarked     dupe_count  
<0 rows> (or 0-length row.names)

Drop the unique identifiers (passenger_id, name, and ticket), and drop cabin because it has a high missing rate. Also convert survived, pclass, sex, and embarked to factors.

titanic_train2 <- titanic_train %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
  survived pclass    sex age sib_sp parch    fare embarked
1        0      3   male  22      1     0  7.2500        S
2        1      1 female  38      1     0 71.2833        C
3        1      3 female  26      0     0  7.9250        S
4        1      1 female  35      1     0 53.1000        S
5        0      3   male  35      0     0  8.0500        S
6        0      3   male  NA      0     0  8.4583        Q
Apply the same name cleaning to the test set.

titanic_test <- titanic_test %>% clean_names()

head(titanic_test)
  passenger_id pclass                                         name    sex  age
1          892      3                             Kelly, Mr. James   male 34.5
2          893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
3          894      2                    Myles, Mr. Thomas Francis   male 62.0
4          895      3                             Wirz, Mr. Albert   male 27.0
5          896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6          897      3                   Svensson, Mr. Johan Cervin   male 14.0
  sib_sp parch  ticket    fare cabin embarked
1      0     0  330911  7.8292              Q
2      1     0  363272  7.0000              S
3      0     0  240276  9.6875              Q
4      0     0  315154  8.6625              S
5      1     1 3101298 12.2875              S
6      0     0    7538  9.2250              S

Check the test set for duplicate records as well.

get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
 [1] passenger_id pclass       name         sex          age         
 [6] sib_sp       parch        ticket       fare         cabin       
[11] embarked     dupe_count  
<0 rows> (or 0-length row.names)
Apply the same variable selection and factor conversion to the test set (which has no survived column to convert).

titanic_test2 <- titanic_test %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)
  pclass    sex  age sib_sp parch    fare embarked
1      3   male 34.5      0     0  7.8292        Q
2      3 female 47.0      1     0  7.0000        S
3      2   male 62.0      0     0  9.6875        Q
4      3   male 27.0      0     0  8.6625        S
5      3 female 22.0      1     1 12.2875        S
6      3   male 14.0      0     0  9.2250        S

Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.

vis_miss(titanic_train2)

gg_miss_var(titanic_train2)

gg_miss_var(titanic_train2, show_pct = TRUE)

create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
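A numeric companion to the plots: miss_var_summary() from naniar reports the count and percentage of missing values for each variable (a quick sketch).

# Missing-value counts and percentages by variable.
miss_var_summary(titanic_train2)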

Now try the ML algorithms.

Model 0: Null Model

Summarize the y-variable, survived. The null model always predicts the majority class, so its accuracy should be close to the majority-class proportion computed here.

titanic_train2 %>% group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
  survived     n  freq
  <fct>    <int> <dbl>
1 0          549 0.616
2 1          342 0.384

Make the initial split, with 80% of the data going into the training set.

titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>
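The split is random, so the counts shown above will vary from run to run. For reproducibility one would typically set a seed, and optionally stratify on the outcome; a sketch (the seed value is illustrative, and this is not the split used above):

# Hypothetical reproducible, stratified version of the split.
set.seed(652)
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8, strata = survived)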

Create the recipe for applying the preprocessing. The step_rm() step drops the factor predictors (pclass, sex, and embarked), step_nzv() removes any predictors with near-zero variance, and step_impute_mean() fills in the missing age values with the mean of the column.

titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>% 
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

summary(titanic_train2_recipe)
# A tibble: 5 × 4
  variable type      role      source  
  <chr>    <list>    <chr>     <chr>   
1 age      <chr [2]> predictor original
2 sib_sp   <chr [2]> predictor original
3 parch    <chr [2]> predictor original
4 fare     <chr [2]> predictor original
5 survived <chr [3]> outcome   original
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
  number operation type        trained skip  id               
   <int> <chr>     <chr>       <lgl>   <lgl> <chr>            
1      1 step      rm          TRUE    FALSE rm_ZdCs5         
2      2 step      nzv         TRUE    FALSE nzv_EDkvH        
3      3 step      impute_mean TRUE    FALSE impute_mean_6j9ks
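Not used here, but one alternative would be to keep the factor predictors and dummy-encode them instead of removing them. A sketch of such a recipe (a hypothetical variation, not the one fit below):

# Hypothetical alternative recipe: keep pclass, sex, and embarked as dummy variables.
titanic_train2_recipe_alt <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_impute_mean(age) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_nzv(all_predictors()) %>%
  prep()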

Apply the recipe to the testing portion with bake(); after imputation the age variable should be complete. juice() then returns the preprocessed training data (it is equivalent to bake(new_data = NULL)).

titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split)) 

titanic_train2_testing
# A tibble: 179 × 5
     age sib_sp parch  fare survived
   <dbl>  <int> <int> <dbl> <fct>   
 1  27        0     2 11.1  1       
 2  58        0     0 26.6  1       
 3  14        0     0  7.85 0       
 4  38        1     5 31.4  1       
 5  40        0     0 27.7  0       
 6  28        1     0 82.2  0       
 7  42        1     0 52    0       
 8  29.7      0     0  7.75 1       
 9  65        0     1 62.0  0       
10  26        2     0  8.66 0       
# ℹ 169 more rows
titanic_train2_training <- juice(titanic_train2_recipe)

titanic_train2_training
# A tibble: 712 × 5
     age sib_sp parch   fare survived
   <dbl>  <int> <int>  <dbl> <fct>   
 1   9        1     1  15.9  1       
 2  20        1     0   9.82 0       
 3  29.7      0     0   8.05 0       
 4  21        0     0   7.73 0       
 5  28        1     0  15.8  0       
 6  29.7      0     0   7.90 0       
 7  27        0     2 212.   0       
 8  29.7      1     0  24.2  0       
 9  28        0     0   9.5  0       
10  34        0     0  13    0       
# ℹ 702 more rows

Model 0: Null Model

titanic_train2_null <- null_model()  |>  
  set_engine("parsnip")  |>  
  set_mode("classification") |> 
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            27        0     2 11.1  1       
 2 0            58        0     0 26.6  1       
 3 0            14        0     0  7.85 0       
 4 0            38        1     5 31.4  1       
 5 0            40        0     0 27.7  0       
 6 0            28        1     0 82.2  0       
 7 0            42        1     0 52    0       
 8 0            29.7      0     0  7.75 1       
 9 0            65        0     1 62.0  0       
10 0            26        2     0  8.66 0       
# ℹ 169 more rows
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.620
2 kap      binary         0    
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 111  68
         1   0   0
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_1) %>%
  autoplot()

Model 2: C5.0

Set up the model.

titanic_train2_C50 <- boost_tree(trees = 20) %>% 
  set_engine("C5.0") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_C50, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 1          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 1            27        0     2 11.1  1       
 2 0            58        0     0 26.6  1       
 3 1            14        0     0  7.85 0       
 4 1            38        1     5 31.4  1       
 5 0            40        0     0 27.7  0       
 6 1            28        1     0 82.2  0       
 7 1            42        1     0 52    0       
 8 0            29.7      0     0  7.75 1       
 9 1            65        0     1 62.0  0       
10 0            26        2     0  8.66 0       
# ℹ 169 more rows
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.732
2 kap      binary         0.421
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 90 27
         1 21 41
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()

titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.793
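A note on the choice of .pred_0: yardstick treats the first level of the factor (here 0, did not survive) as the event by default, so the class-0 probabilities are passed to roc_auc(). Treating survival as the event gives the same AUC; a sketch of the equivalent call:

# Equivalent AUC with survival (level 1) treated as the event.
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_1, event_level = "second")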
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)

Model 2 (alternative engine): XGBoost

Set up the model.

titanic_train2_xgb <- boost_tree(trees = 20) %>% 
  set_engine("xgboost") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_xgb, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            27        0     2 11.1  1       
 2 0            58        0     0 26.6  1       
 3 1            14        0     0  7.85 0       
 4 0            38        1     5 31.4  1       
 5 0            40        0     0 27.7  0       
 6 1            28        1     0 82.2  0       
 7 1            42        1     0 52    0       
 8 0            29.7      0     0  7.75 1       
 9 0            65        0     1 62.0  0       
10 0            26        2     0  8.66 0       
# ℹ 169 more rows
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.709
2 kap      binary         0.350
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 94 35
         1 17 33
titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot() 

titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.760
titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
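The number of trees is fixed at 20 in these boosted models; a natural next step (not run in this document) would be to choose it by cross-validation. A sketch using tune_grid() with illustrative fold and grid choices:

# Hypothetical tuning sketch: cross-validate the number of boosting rounds.
xgb_tune_spec <- boost_tree(trees = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

folds <- vfold_cv(titanic_train2_training, v = 5, strata = survived)

xgb_tuned <- tune_grid(
  xgb_tune_spec,
  survived ~ .,
  resamples = folds,
  grid = tibble(trees = c(20, 50, 100, 200))
)

show_best(xgb_tuned, metric = "roc_auc")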

Model 3: Random Forest

Set up the model.

titanic_train2_ranger <- rand_forest(trees = 100) %>% 
  set_engine("ranger") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_ranger, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 1          
# ℹ 702 more rows
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            27        0     2 11.1  1       
 2 0            58        0     0 26.6  1       
 3 1            14        0     0  7.85 0       
 4 0            38        1     5 31.4  1       
 5 1            40        0     0 27.7  0       
 6 1            28        1     0 82.2  0       
 7 1            42        1     0 52    0       
 8 0            29.7      0     0  7.75 1       
 9 0            65        0     1 62.0  0       
10 0            26        2     0  8.66 0       
# ℹ 169 more rows
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.732
2 kap      binary         0.418
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 91 28
         1 20 40
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() 

titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()

titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.779
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
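Finally, to produce the classifications for the unlabeled Kaggle test set described at the top, the fitted recipe can be applied to titanic_test2 and the chosen model used to predict. A sketch, assuming the C5.0 model is the one selected; the outcome column is not needed by any recipe step, so baking the unlabeled data should work (note that fare may also contain missing values in the Kaggle test set, which this recipe does not impute):

# Preprocess the unlabeled test set with the recipe fit on the training data.
titanic_test2_baked <- bake(titanic_train2_recipe, new_data = titanic_test2)

# Predict with the chosen model and assemble a Kaggle-style submission file.
titanic_submission <- predict(titanic_train2_C50, titanic_test2_baked) %>%
  bind_cols(titanic_test %>% select(passenger_id)) %>%
  transmute(PassengerId = passenger_id, Survived = .pred_class)

write_csv(titanic_submission, "titanic_submission.csv")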