Stat 652: Midterm: Model 4, Model 5

Author

Prof. Eric A. Suess

library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels, discrim)

Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.

I like to clean names so the variables all have names with lowercase letters and underscores.

# Standardize column names to lowercase snake_case (e.g. PassengerId -> passenger_id).
titanic_train <- titanic_train %>% clean_names()

# Inspect the first few rows of the cleaned training data.
head(titanic_train)
  passenger_id survived pclass
1            1        0      3
2            2        1      1
3            3        1      3
4            4        1      1
5            5        0      3
6            6        0      3
                                                 name    sex age sib_sp parch
1                             Braund, Mr. Owen Harris   male  22      1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38      1     0
3                              Heikkinen, Miss. Laina female  26      0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35      1     0
5                            Allen, Mr. William Henry   male  35      0     0
6                                    Moran, Mr. James   male  NA      0     0
            ticket    fare cabin embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q

It is always a good idea to check for duplicate records/examples/rows in your dataset.

# Check for fully duplicated rows across all columns (janitor::get_dupes).
get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
 [1] passenger_id survived     pclass       name         sex         
 [6] age          sib_sp       parch        ticket       fare        
[11] cabin        embarked     dupe_count  
<0 rows> (or 0-length row.names)

Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.

# Drop identifier-like columns (passenger_id, name, ticket) and cabin (mostly
# missing), then convert the categorical columns to factors for modeling.
titanic_train2 <- titanic_train %>% select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),  # outcome to classify
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
  survived pclass    sex age sib_sp parch    fare embarked
1        0      3   male  22      1     0  7.2500        S
2        1      1 female  38      1     0 71.2833        C
3        1      3 female  26      0     0  7.9250        S
4        1      1 female  35      1     0 53.1000        S
5        0      3   male  35      0     0  8.0500        S
6        0      3   male  NA      0     0  8.4583        Q
# Apply the same column-name cleaning to the test set (it has no survived column).
titanic_test <- titanic_test %>% clean_names()

head(titanic_test)
  passenger_id pclass                                         name    sex  age
1          892      3                             Kelly, Mr. James   male 34.5
2          893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
3          894      2                    Myles, Mr. Thomas Francis   male 62.0
4          895      3                             Wirz, Mr. Albert   male 27.0
5          896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6          897      3                   Svensson, Mr. Johan Cervin   male 14.0
  sib_sp parch  ticket    fare cabin embarked
1      0     0  330911  7.8292              Q
2      1     0  363272  7.0000              S
3      0     0  240276  9.6875              Q
4      0     0  315154  8.6625              S
5      1     1 3101298 12.2875              S
6      0     0    7538  9.2250              S

It is always a good idea to check for duplicate records/examples/rows in your dataset.

# Same duplicate-row check on the test set.
get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
 [1] passenger_id pclass       name         sex          age         
 [6] sib_sp       parch        ticket       fare         cabin       
[11] embarked     dupe_count  
<0 rows> (or 0-length row.names)
# Mirror the training-set preprocessing on the test set (there is no survived
# column to convert here).
titanic_test2 <- titanic_test %>% select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)
  pclass    sex  age sib_sp parch    fare embarked
1      3   male 34.5      0     0  7.8292        Q
2      3 female 47.0      1     0  7.0000        S
3      2   male 62.0      0     0  9.6875        Q
4      3   male 27.0      0     0  8.6625        S
5      3 female 22.0      1     1 12.2875        S
6      3   male 14.0      0     0  9.2250        S

Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.

# Visualize overall missingness (naniar); age is the main incomplete variable.
vis_miss(titanic_train2)

# Missing-value counts per variable.
gg_miss_var(titanic_train2)

# Same plot, shown as percentages.
gg_miss_var(titanic_train2, show_pct = TRUE)

# Automated EDA report (DataExplorer) with survived as the response variable.
create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())

Now try the ML algorithms.

Model 0:

Summarize the y-variable. Null Model.

# Class balance of the outcome: the majority class (survived = 0, ~62%) sets
# the accuracy baseline that any real model must beat.
titanic_train2 %>% group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
  survived     n  freq
  <fct>    <int> <dbl>
1 0          549 0.616
2 1          342 0.384

Make the first split with 80% of the data being in the training data set.

# Set the RNG seed so the 80/20 train/test split is reproducible when the
# document is re-run; without a seed, initial_split() draws a different
# random partition on every run.
set.seed(123)

# 80% of the rows go to training, 20% to testing (712/179 of 891).
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>

Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_impute_mean() function, which fills in the cells that are missing with the mean of the column.

# Preprocessing recipe, estimated on the training portion only:
#  - step_rm() drops the factor predictors (pclass, sex, embarked), so these
#    models use only the numeric predictors age, sib_sp, parch, and fare
#  - step_nzv() removes near-zero-variance predictors, if any
#  - step_impute_mean() fills missing age values with the training-set mean
# prep() trains the steps (e.g. computes the imputation mean) on this data.
titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>% 
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

summary(titanic_train2_recipe)
# A tibble: 5 × 4
  variable type      role      source  
  <chr>    <list>    <chr>     <chr>   
1 age      <chr [2]> predictor original
2 sib_sp   <chr [2]> predictor original
3 parch    <chr [2]> predictor original
4 fare     <chr [2]> predictor original
5 survived <chr [3]> outcome   original
# List the trained steps contained in the recipe.
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
  number operation type        trained skip  id               
   <int> <chr>     <chr>       <lgl>   <lgl> <chr>            
1      1 step      rm          TRUE    FALSE rm_RGORn         
2      2 step      nzv         TRUE    FALSE nzv_1bg14        
3      3 step      impute_mean TRUE    FALSE impute_mean_a6NWH

Apply the recipe, so the age variable should be complete after the imputation.

# Apply the trained recipe to the held-out testing rows; after baking, age
# has no missing values (imputed with the mean learned from training data).
titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split)) 

titanic_train2_testing
# A tibble: 179 × 5
     age sib_sp parch  fare survived
   <dbl>  <int> <int> <dbl> <fct>   
 1  35        1     0 53.1  1       
 2  27        0     2 11.1  1       
 3  14        1     0 30.1  1       
 4  20        0     0  8.05 0       
 5  39        1     5 31.3  0       
 6  31        1     0 18    0       
 7  28        1     0 82.2  0       
 8  18        2     0 18    0       
 9  19        0     0  7.88 1       
10  29.6      1     0 15.5  0       
# ℹ 169 more rows
# Extract the preprocessed training data. bake(new_data = NULL) returns the
# training set exactly as processed by the prepped recipe; it replaces
# juice(), which is superseded in current versions of the recipes package.
titanic_train2_training <- bake(titanic_train2_recipe, new_data = NULL)

titanic_train2_training
# A tibble: 712 × 5
     age sib_sp parch   fare survived
   <dbl>  <int> <int>  <dbl> <fct>   
 1  31        0     2 165.   1       
 2  29.6      1     2  23.4  0       
 3  16        0     0   9.22 0       
 4  29.6      3     1  25.5  0       
 5  24        1     0  26    1       
 6  29.6      2     0  21.7  0       
 7   7        4     1  29.1  0       
 8  38        0     1 153.   0       
 9  29.6      8     2  69.6  0       
10  28        0     0  13.5  0       
# ℹ 702 more rows

Model 0: null

# Model 0: the null model always predicts the majority class ("0"); it gives
# the baseline that the other models are compared against.
titanic_train2_null <- null_model()  |>  
  set_engine("parsnip")  |>  
  set_mode("classification") |> 
  fit(survived ~ ., data = titanic_train2_training)
# Sanity check: every training prediction is the majority class.
predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
# Attach the null-model class predictions to the test rows for inspection.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            35        1     0 53.1  1       
 2 0            27        0     2 11.1  1       
 3 0            14        1     0 30.1  1       
 4 0            20        0     0  8.05 0       
 5 0            39        1     5 31.3  0       
 6 0            31        1     0 18    0       
 7 0            28        1     0 82.2  0       
 8 0            18        2     0 18    0       
 9 0            19        0     0  7.88 1       
10 0            29.6      1     0 15.5  0       
# ℹ 169 more rows
# Test-set accuracy and kappa for the null model; kappa is 0 because the
# predictions never vary.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.603
2 kap      binary         0    
# Confusion matrix: everything is predicted as class 0, so all survivors in
# the test set are misclassified.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 108  71
         1   0   0
# ROC curve for the null model; its class probabilities are constant, so the
# curve is essentially the chance diagonal.
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot() 

Model 4: GLM

Setup the model.

# Model 4: penalized logistic regression (elastic net via glmnet) with a small
# penalty and mixture = 0.5 (halfway between ridge and lasso).
titanic_train2_glm <- logistic_reg(penalty = 0.001, mixture = 0.5) %>% 
  set_engine("glmnet") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
# In-sample class predictions.
predict(titanic_train2_glm, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 1          
 9 0          
10 0          
# ℹ 702 more rows
# Attach the GLM class predictions to the test rows for inspection.
titanic_train2_glm %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 0            35        1     0 53.1  1       
 2 0            27        0     2 11.1  1       
 3 0            14        1     0 30.1  1       
 4 0            20        0     0  8.05 0       
 5 0            39        1     5 31.3  0       
 6 0            31        1     0 18    0       
 7 1            28        1     0 82.2  0       
 8 0            18        2     0 18    0       
 9 0            19        0     0  7.88 1       
10 0            29.6      1     0 15.5  0       
# ℹ 169 more rows
# Test-set accuracy and kappa for the GLM (beats the null-model baseline).
titanic_train2_glm %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.704
2 kap      binary         0.310
# Confusion matrix for the GLM on the test set.
titanic_train2_glm %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 103  48
         1   5  23
# ROC curve for the GLM, using the probability of the first factor level
# (.pred_0), which yardstick treats as the event by default.
titanic_train2_glm %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()

# Area under the ROC curve on the test set.
titanic_train2_glm %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0) 
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.713
# Density of the predicted survival probability split by the true class; the
# more separated the two densities, the better the model discriminates.
titanic_train2_glm %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)

Model 5: Naive Bayes

Setup the model.

# Model 5: naive Bayes (klaR engine) with Laplace smoothing of 1.
# NOTE(review): the "Numerical 0 probability" warnings printed below come from
# klaR when an observation gets (near-)zero density under every class.
titanic_train2_nb <- naive_Bayes(Laplace = 1) %>% 
  set_engine("klaR") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
# In-sample class predictions.
predict(titanic_train2_nb, titanic_train2_training)
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 198
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 1          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 1          
 9 0          
10 0          
# ℹ 702 more rows
# Attach the naive Bayes class predictions to the test rows for inspection.
titanic_train2_nb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137
# A tibble: 179 × 6
   .pred_class   age sib_sp parch  fare survived
   <fct>       <dbl>  <int> <int> <dbl> <fct>   
 1 1            35        1     0 53.1  1       
 2 0            27        0     2 11.1  1       
 3 1            14        1     0 30.1  1       
 4 0            20        0     0  8.05 0       
 5 1            39        1     5 31.3  0       
 6 0            31        1     0 18    0       
 7 1            28        1     0 82.2  0       
 8 0            18        2     0 18    0       
 9 0            19        0     0  7.88 1       
10 0            29.6      1     0 15.5  0       
# ℹ 169 more rows
# Test-set accuracy and kappa for naive Bayes (same accuracy as the GLM here,
# slightly higher kappa).
titanic_train2_nb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.704
2 kap      binary         0.342
# Confusion matrix for naive Bayes on the test set.
titanic_train2_nb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137
          Truth
Prediction  0  1
         0 94 39
         1 14 32
# ROC curve for naive Bayes on the test set.
titanic_train2_nb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot() 
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137

# Area under the ROC curve for naive Bayes on the test set.
titanic_train2_nb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0) 
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.720
# Density of the naive Bayes predicted survival probability, split by the
# true class, to visualize how well the classes are separated.
titanic_train2_nb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived), 
               alpha = 0.5)
Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
observation 137