Stat 652: Midterm: Model 2, Model 3

Author

Prof. Eric A. Suess

library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)

Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
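
As a quick sanity check (not part of the original workflow), comparing the column names of the two datasets confirms that the Survived label is the only column missing from the test set:

# columns present in the training data but not in the test data;
# this should return only "Survived"
setdiff(names(titanic_train), names(titanic_test))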

I like to clean names so the variables all have names with lowercase letters and underscores.

titanic_train <- titanic_train |> clean_names()

head(titanic_train)
  passenger_id survived pclass
1            1        0      3
2            2        1      1
3            3        1      3
4            4        1      1
5            5        0      3
6            6        0      3
                                                 name    sex age sib_sp parch
1                             Braund, Mr. Owen Harris   male  22      1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38      1     0
3                              Heikkinen, Miss. Laina female  26      0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35      1     0
5                            Allen, Mr. William Henry   male  35      0     0
6                                    Moran, Mr. James   male  NA      0     0
            ticket    fare cabin embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q

It is always a good idea to check for duplicate records/examples/rows in your dataset.

get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
 [1] passenger_id survived     pclass       name         sex         
 [6] age          sib_sp       parch        ticket       fare        
[11] cabin        embarked     dupe_count  
<0 rows> (or 0-length row.names)

Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.

titanic_train2 <- titanic_train |> select(-passenger_id, -name, -ticket, -cabin) |>
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
  survived pclass    sex age sib_sp parch    fare embarked
1        0      3   male  22      1     0  7.2500        S
2        1      1 female  38      1     0 71.2833        C
3        1      3 female  26      0     0  7.9250        S
4        1      1 female  35      1     0 53.1000        S
5        0      3   male  35      0     0  8.0500        S
6        0      3   male  NA      0     0  8.4583        Q
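
Before moving on to the test set, a side check (not in the original code) confirms why cabin was dropped. In this dataset missing cabins are recorded as empty strings rather than NA, so the check below counts both:

# proportion of rows with no recorded cabin (empty string or NA)
titanic_train |> summarise(pct_cabin_missing = mean(cabin == "" | is.na(cabin)))
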
titanic_test <- titanic_test |> clean_names()

head(titanic_test)
  passenger_id pclass                                         name    sex  age
1          892      3                             Kelly, Mr. James   male 34.5
2          893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
3          894      2                    Myles, Mr. Thomas Francis   male 62.0
4          895      3                             Wirz, Mr. Albert   male 27.0
5          896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6          897      3                   Svensson, Mr. Johan Cervin   male 14.0
  sib_sp parch  ticket    fare cabin embarked
1      0     0  330911  7.8292              Q
2      1     0  363272  7.0000              S
3      0     0  240276  9.6875              Q
4      0     0  315154  8.6625              S
5      1     1 3101298 12.2875              S
6      0     0    7538  9.2250              S

Again, check for duplicate records/examples/rows, this time in the test dataset.

get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
 [1] passenger_id pclass       name         sex          age         
 [6] sib_sp       parch        ticket       fare         cabin       
[11] embarked     dupe_count  
<0 rows> (or 0-length row.names)
titanic_test2 <- titanic_test |> select(-passenger_id, -name, -ticket, -cabin) |>
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)
  pclass    sex  age sib_sp parch    fare embarked
1      3   male 34.5      0     0  7.8292        Q
2      3 female 47.0      1     0  7.0000        S
3      2   male 62.0      0     0  9.6875        Q
4      3   male 27.0      0     0  8.6625        S
5      3 female 22.0      1     1 12.2875        S
6      3   male 14.0      0     0  9.2250        S

Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.

vis_miss(titanic_train2)

gg_miss_var(titanic_train2)

gg_miss_var(titanic_train2, show_pct = TRUE)

create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
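
A tabular companion to these plots (an extra step, not in the original) is naniar's miss_var_summary(), which lists the number and percentage of missing values for each variable:

# per-variable missing counts and percentages
miss_var_summary(titanic_train2)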

Now try the ML algorithms.

Model 0: Null Model

Summarize the y-variable to get the baseline class frequencies that the null model will predict.

titanic_train2 |> group_by(survived) |>
  summarize(n = n()) |>
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
  survived     n  freq
  <fct>    <int> <dbl>
1 0          549 0.616
2 1          342 0.384
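
Since janitor is already loaded, the same baseline frequencies can also be obtained in one line with tabyl() (shown here only as an alternative to the group_by()/summarize() above):

# one-line alternative for the class frequencies
titanic_train2 |> tabyl(survived)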

Make the initial split, with 80% of the data going into the training set.

titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>
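
The split above is not seeded, so the exact rows in each partition (and every number reported below) will change from run to run. A common refinement, shown here only as a sketch and not used in the rest of the document, is to set a seed and stratify on the outcome so both partitions keep roughly the 62/38 class balance:

# reproducible, stratified 80/20 split (an alternative; the document keeps the split above)
set.seed(652)
initial_split(titanic_train2, prop = 0.8, strata = survived)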

Create the recipe for applying the preprocessing. Note the use of step_rm(), which drops the factor predictors pclass, sex, and embarked (so these models use only the numeric predictors), step_nzv(), which removes any columns that have very low variability, and step_impute_mean(), which fills in the missing cells of age with the mean of the column.

titanic_train2_recipe <- training(titanic_train2_split) |>
  recipe(survived ~ .) |>
  step_rm(pclass, sex, embarked) |> 
  step_nzv(all_predictors()) |>
  step_impute_mean(age) |>
  prep()

summary(titanic_train2_recipe)
# A tibble: 5 × 4
  variable type      role      source  
  <chr>    <list>    <chr>     <chr>   
1 age      <chr [2]> predictor original
2 sib_sp   <chr [2]> predictor original
3 parch    <chr [2]> predictor original
4 fare     <chr [2]> predictor original
5 survived <chr [3]> outcome   original
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
  number operation type        trained skip  id               
   <int> <chr>     <chr>       <lgl>   <lgl> <chr>            
1      1 step      rm          TRUE    FALSE rm_yNBw5         
2      2 step      nzv         TRUE    FALSE nzv_uVgMS        
3      3 step      impute_mean TRUE    FALSE impute_mean_wpwl6
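
This recipe keeps only the numeric predictors. A sketch of a richer recipe (not fit anywhere in this document; the name titanic_train2_recipe_full is just illustrative) would impute and dummy-encode the factor predictors instead of removing them:

# alternative recipe that keeps pclass, sex, and embarked (not used below)
titanic_train2_recipe_full <- training(titanic_train2_split) |>
  recipe(survived ~ .) |>
  step_impute_mean(age) |>
  # note: blank embarked values are stored as the level "", not NA,
  # so step_impute_mode() would not change them without extra handling
  step_impute_mode(embarked) |>
  step_dummy(all_nominal_predictors()) |>
  step_nzv(all_predictors()) |>
  prep()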

Apply (bake) the recipe to both partitions; after the imputation the age variable should be complete.

titanic_train2_training <- titanic_train2_recipe |>
  bake(training(titanic_train2_split)) 

titanic_train2_training |> arrange(age)
# A tibble: 712 × 5
     age sib_sp parch   fare survived
   <dbl>  <int> <int>  <dbl> <fct>   
 1  0.42      0     1   8.52 1       
 2  0.67      1     1  14.5  1       
 3  0.75      2     1  19.3  1       
 4  0.75      2     1  19.3  1       
 5  0.83      1     1  18.8  1       
 6  0.92      1     2 152.   1       
 7  1         2     1  39    1       
 8  1         1     2  20.6  1       
 9  1         1     1  11.1  1       
10  1         5     2  46.9  0       
# ℹ 702 more rows
titanic_train2_testing <- titanic_train2_recipe |>
  bake(testing(titanic_train2_split)) 

titanic_train2_testing
# A tibble: 179 × 5
     age sib_sp parch  fare survived
   <dbl>  <int> <int> <dbl> <fct>   
 1  14        1     0 30.1  1       
 2  39        1     5 31.3  0       
 3  29.5      0     0 13    1       
 4  40        0     0 27.7  0       
 5  29.5      0     0  7.75 1       
 6  66        0     0 10.5  0       
 7  18        2     0 18    0       
 8  65        0     1 62.0  0       
 9   5        1     2 27.8  1       
10  22        0     0  7.23 0       
# ℹ 169 more rows
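
To confirm the imputation worked (a check added here, not in the original), count the remaining missing ages in each baked set; both counts should be zero:

# remaining missing values in age after baking; expected to be 0 in both sets
sum(is.na(titanic_train2_training$age))
sum(is.na(titanic_train2_testing$age))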

Model 0: Null Model

Fit the null model, which simply predicts the most frequent class in the training data.

titanic_train2_null <- null_model()  |>  
  set_engine("parsnip")  |>  
  set_mode("classification") |> 
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_null |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            14        1     0 30.1  1       
 2 0            39        1     5 31.3  0       
 3 0            29.5      0     0 13    1       
 4 0            40        0     0 27.7  0       
 5 0            29.5      0     0  7.75 1       
 6 0            66        0     0 10.5  0       
 7 0            18        2     0 18    0       
 8 0            65        0     1 62.0  0       
 9 0             5        1     2 27.8  1       
10 0            22        0     0  7.23 0       
# ℹ 169 more rows
titanic_train2_null |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.620
2 kap      binary         0    
titanic_train2_null |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 111  68
         1   0   0
titanic_train2_null |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_curve(survived, .pred_1) |>
  autoplot()
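
Beyond accuracy and kappa, the full set of class metrics can be pulled from the confusion matrix with summary() (an extra step, not in the original). For the null model this makes the degenerate behavior explicit, with sensitivity 1 and specificity 0 for the majority class:

# per-class metrics derived from the confusion matrix
titanic_train2_null |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  conf_mat(truth = survived, estimate = .pred_class) |>
  summary()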

Model 2: C5.0

Set up the model.

titanic_train2_C50 <- boost_tree(trees = 20) |> 
  set_engine("C5.0") |>
  set_mode("classification") |>
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_C50, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 1          
 5 1          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_C50 |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 1            14        1     0 30.1  1       
 2 1            39        1     5 31.3  0       
 3 0            29.5      0     0 13    1       
 4 0            40        0     0 27.7  0       
 5 0            29.5      0     0  7.75 1       
 6 0            66        0     0 10.5  0       
 7 0            18        2     0 18    0       
 8 1            65        0     1 62.0  0       
 9 1             5        1     2 27.8  1       
10 0            22        0     0  7.23 0       
# ℹ 169 more rows
titanic_train2_C50 |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.732
2 kap      binary         0.400
titanic_train2_C50 |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 96 33
         1 15 35
titanic_train2_C50 |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_curve(survived, .pred_0) |>
  autoplot()

titanic_train2_C50 |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.765
titanic_train2_C50 |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
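
To report the class metrics and the AUC in a single table (a convenience not used in the original), yardstick's metric_set() can combine them; it needs both the hard class predictions and the class probabilities in one data frame. The name c50_metrics below is just illustrative:

# accuracy, kappa, and AUC for the C5.0 model in one call
c50_metrics <- metric_set(accuracy, kap, roc_auc)

titanic_train2_C50 |>
  predict(titanic_train2_testing) |>
  bind_cols(predict(titanic_train2_C50, titanic_train2_testing, type = "prob")) |>
  bind_cols(titanic_train2_testing) |>
  c50_metrics(truth = survived, estimate = .pred_class, .pred_0)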

Model 2: XGBoost

Set up the same boosted tree model, this time using the xgboost engine.

titanic_train2_xgb <- boost_tree(trees = 20) |> 
  set_engine("xgboost") |>
  set_mode("classification") |>
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_xgb, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 1          
 5 0          
 6 0          
 7 1          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_xgb |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 1            14        1     0 30.1  1       
 2 0            39        1     5 31.3  0       
 3 0            29.5      0     0 13    1       
 4 0            40        0     0 27.7  0       
 5 0            29.5      0     0  7.75 1       
 6 0            66        0     0 10.5  0       
 7 0            18        2     0 18    0       
 8 0            65        0     1 62.0  0       
 9 1             5        1     2 27.8  1       
10 0            22        0     0  7.23 0       
# ℹ 169 more rows
titanic_train2_xgb |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.715
2 kap      binary         0.379
titanic_train2_xgb |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 90 30
         1 21 38
titanic_train2_xgb |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_curve(survived, .pred_0) |>
  autoplot() 

titanic_train2_xgb |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.795
titanic_train2_xgb |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
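
One nice feature of the xgboost engine is easy access to variable importance. Assuming the vip package is installed (it is not loaded above) and a recent version of parsnip, a quick importance plot might look like this:

# variable importance from the underlying xgboost fit
# (requires the vip package, which is not loaded in this document)
library(vip)

titanic_train2_xgb |>
  extract_fit_engine() |>
  vip()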

Model 3: Random Forest

Set up the model.

titanic_train2_ranger <- rand_forest(trees = 100) |> 
  set_engine("ranger") |>
  set_mode("classification") |>
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_ranger, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 1          
 5 0          
 6 0          
 7 1          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_ranger |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 1            14        1     0 30.1  1       
 2 0            39        1     5 31.3  0       
 3 0            29.5      0     0 13    1       
 4 1            40        0     0 27.7  0       
 5 0            29.5      0     0  7.75 1       
 6 0            66        0     0 10.5  0       
 7 0            18        2     0 18    0       
 8 0            65        0     1 62.0  0       
 9 1             5        1     2 27.8  1       
10 0            22        0     0  7.23 0       
# ℹ 169 more rows
titanic_train2_ranger |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.754
2 kap      binary         0.457
titanic_train2_ranger |>
  predict(titanic_train2_testing) |>
  bind_cols(titanic_train2_testing) |>
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 96 29
         1 15 39
titanic_train2_ranger |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_curve(survived, .pred_0) |>
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() 

titanic_train2_ranger |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_curve(survived, .pred_0) |>
  autoplot()

titanic_train2_ranger |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.792
titanic_train2_ranger |>
  predict(titanic_train2_testing, type = "prob") |>
  bind_cols(titanic_train2_testing) |>
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
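
Finally, to produce the Kaggle-style submission described at the top of the document, the chosen model can be applied to titanic_test2. This is only a sketch, not run here: depending on the recipes version, bake() may expect a survived column, so a placeholder is added, and the test set has its own missing values (age, and at least one fare) that the recipe only partially handles, so fare is patched as well. The passenger_id column from the original test file labels the predictions.

# sketch of a final submission using the random forest model (not run above)
titanic_test2_baked <- titanic_train2_recipe |>
  bake(new_data = titanic_test2 |>
         mutate(survived = factor(NA, levels = c("0", "1")),   # placeholder outcome for bake()
                fare = coalesce(fare, median(fare, na.rm = TRUE))))  # patch the missing test fare

submission <- titanic_train2_ranger |>
  predict(titanic_test2_baked) |>
  transmute(PassengerId = titanic_test$passenger_id,
            Survived = .pred_class)

write_csv(submission, "titanic_submission.csv")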