Stat 652: Midterm: Model 0, Model 1

Author

Prof. Eric A. Suess

library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)

Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived, while the titanic_test dataset does not. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are run.

I like to clean the variable names so they all use lowercase letters and underscores.

library(titanic)
data(titanic_train)
data(titanic_test)
titanic_train <- titanic_train %>% clean_names()

head(titanic_train)
  passenger_id survived pclass
1            1        0      3
2            2        1      1
3            3        1      3
4            4        1      1
5            5        0      3
6            6        0      3
                                                 name    sex age sib_sp parch
1                             Braund, Mr. Owen Harris   male  22      1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38      1     0
3                              Heikkinen, Miss. Laina female  26      0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35      1     0
5                            Allen, Mr. William Henry   male  35      0     0
6                                    Moran, Mr. James   male  NA      0     0
            ticket    fare cabin embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q

It is always a good idea to check for duplicate records/examples/rows in your dataset.

get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
 [1] passenger_id survived     pclass       name         sex         
 [6] age          sib_sp       parch        ticket       fare        
[11] cabin        embarked     dupe_count  
<0 rows> (or 0-length row.names)
titanic_train2 <- titanic_train %>% dplyr::select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
  survived pclass    sex age sib_sp parch    fare embarked
1        0      3   male  22      1     0  7.2500        S
2        1      1 female  38      1     0 71.2833        C
3        1      3 female  26      0     0  7.9250        S
4        1      1 female  35      1     0 53.1000        S
5        0      3   male  35      0     0  8.0500        S
6        0      3   male  NA      0     0  8.4583        Q
titanic_test <- titanic_test %>% clean_names()

head(titanic_test)
  passenger_id pclass                                         name    sex  age
1          892      3                             Kelly, Mr. James   male 34.5
2          893      3             Wilkes, Mrs. James (Ellen Needs) female 47.0
3          894      2                    Myles, Mr. Thomas Francis   male 62.0
4          895      3                             Wirz, Mr. Albert   male 27.0
5          896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6          897      3                   Svensson, Mr. Johan Cervin   male 14.0
  sib_sp parch  ticket    fare cabin embarked
1      0     0  330911  7.8292              Q
2      1     0  363272  7.0000              S
3      0     0  240276  9.6875              Q
4      0     0  315154  8.6625              S
5      1     1 3101298 12.2875              S
6      0     0    7538  9.2250              S

Check the test dataset for duplicate records/examples/rows as well.

get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
 [1] passenger_id pclass       name         sex          age         
 [6] sib_sp       parch        ticket       fare         cabin       
[11] embarked     dupe_count  
<0 rows> (or 0-length row.names)
titanic_test2 <- titanic_test %>% select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)
  pclass    sex  age sib_sp parch    fare embarked
1      3   male 34.5      0     0  7.8292        Q
2      3 female 47.0      1     0  7.0000        S
3      2   male 62.0      0     0  9.6875        Q
4      3   male 27.0      0     0  8.6625        S
5      3 female 22.0      1     1 12.2875        S
6      3   male 14.0      0     0  9.2250        S

Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.

vis_miss(titanic_train2)

gg_miss_var(titanic_train2)

gg_miss_var(titanic_train2, show_pct = TRUE)

create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
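
The plots above show the pattern of missingness; a quick numeric summary is also useful. This is an optional addition, not part of the original output, using the naniar package loaded above:

# Count and percentage of missing values for each variable
miss_var_summary(titanic_train2)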

Now try the ML algorithms.

Model 0:

Summarize the y-variable. This gives the class proportions used by the null model.

titanic_train2 %>% group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
  survived     n  freq
  <fct>    <int> <dbl>
1 0          549 0.616
2 1          342 0.384

Make the initial split, with 80% of the data going into the training set.

titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>
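
A side note, not part of the original analysis: initial_split() draws a random sample, so for a reproducible split a seed would typically be set first, and the split can also be stratified on the outcome. A minimal sketch (the seed value is arbitrary):

# Reproducible, stratified version of the same 80/20 split (illustrative only)
set.seed(652)
initial_split(titanic_train2, prop = 0.8, strata = survived)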

Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and step_impute_mean(), which fills in the missing cells with the mean of the column.

titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>% 
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

summary(titanic_train2_recipe)
# A tibble: 5 × 4
  variable type      role      source  
  <chr>    <list>    <chr>     <chr>   
1 age      <chr [2]> predictor original
2 sib_sp   <chr [2]> predictor original
3 parch    <chr [2]> predictor original
4 fare     <chr [2]> predictor original
5 survived <chr [3]> outcome   original
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
  number operation type        trained skip  id               
   <int> <chr>     <chr>       <lgl>   <lgl> <chr>            
1      1 step      rm          TRUE    FALSE rm_lhAJl         
2      2 step      nzv         TRUE    FALSE nzv_bTDvf        
3      3 step      impute_mean TRUE    FALSE impute_mean_fToYn
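
Because the recipe has been prepped, the value learned by the imputation step can be inspected. This is an optional check, not part of the original output; the mean imputation is step number 3 in the tidy() listing above:

# Show the training-set mean that step_impute_mean() will substitute for missing age values
tidy(titanic_train2_recipe, number = 3)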

Apply the recipe: bake() applies it to the testing split, and juice() returns the preprocessed training split. After imputation, the age variable should be complete.

titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split)) 

titanic_train2_testing
# A tibble: 179 × 5
     age sib_sp parch   fare survived
   <dbl>  <int> <int>  <dbl> <fct>   
 1  35        0     0   8.05 0       
 2  20        0     0   8.05 0       
 3  14        0     0   7.85 0       
 4  38        1     5  31.4  1       
 5  29.9      0     0   7.22 0       
 6  19        3     2 263    0       
 7  29.9      0     0   7.90 0       
 8  29.9      0     0   7.23 1       
 9   3        1     2  41.6  1       
10   7        4     1  39.7  0       
# ℹ 169 more rows
titanic_train2_training <- juice(titanic_train2_recipe)

titanic_train2_training
# A tibble: 712 × 5
     age sib_sp parch   fare survived
   <dbl>  <int> <int>  <dbl> <fct>   
 1  49        1     0  76.7  1       
 2  36        0     0   7.90 0       
 3  26        1     0  14.5  0       
 4  13        0     1  19.5  1       
 5  55.5      0     0   8.05 0       
 6  30        0     0   7.90 0       
 7  25        0     0   7.78 0       
 8  29.9      0     0   7.88 1       
 9  45        0     0   6.98 0       
10  50        1     0 106.   0       
# ℹ 702 more rows

Model 0: null

titanic_train2_null <- null_model()  |>  
  set_engine("parsnip")  |>  
  set_mode("classification") |> 
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 0          
 5 0          
 6 0          
 7 0          
 8 0          
 9 0          
10 0          
# ℹ 702 more rows
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch   fare survived
   <fct>       <dbl>  <int> <int>  <dbl> <fct>   
 1 0            35        0     0   8.05 0       
 2 0            20        0     0   8.05 0       
 3 0            14        0     0   7.85 0       
 4 0            38        1     5  31.4  1       
 5 0            29.9      0     0   7.22 0       
 6 0            19        3     2 263    0       
 7 0            29.9      0     0   7.90 0       
 8 0            29.9      0     0   7.23 1       
 9 0             3        1     2  41.6  1       
10 0             7        4     1  39.7  0       
# ℹ 169 more rows
# Overall proportion of survivors in the full titanic_train data
mean(as.numeric(titanic_train$survived), na.rm = TRUE)
[1] 0.3838384
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.637
2 kap      binary         0    
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 114  65
         1   0   0
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot() 

Model 1: kNN

Set up the model.

titanic_train2_knn <- nearest_neighbor(neighbors = 11) %>% 
  set_engine("kknn") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)

What is missing here? Scaling or normalization of the numeric predictors has not been applied.
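
One way to address this, sketched here but not applied in the original analysis, is to add a normalization step to the recipe before fitting the kNN model. The object name titanic_train2_recipe_norm is illustrative:

# Same recipe as above, with centering and scaling of the numeric predictors for kNN
titanic_train2_recipe_norm <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  step_normalize(all_numeric_predictors()) %>%
  prep()

The kNN model could then be refit on juice(titanic_train2_recipe_norm), and the testing split rebaked with this recipe before recomputing the metrics.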

predict(titanic_train2_knn, titanic_train2_training)
# A tibble: 712 × 1
   .pred_class
   <fct>      
 1 1          
 2 0          
 3 0          
 4 1          
 5 0          
 6 0          
 7 0          
 8 1          
 9 0          
10 1          
# ℹ 702 more rows
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
# A tibble: 179 × 6
   .pred_class   age sib_sp parch   fare survived
   <fct>       <dbl>  <int> <int>  <dbl> <fct>   
 1 0            35        0     0   8.05 0       
 2 0            20        0     0   8.05 0       
 3 1            14        0     0   7.85 0       
 4 0            38        1     5  31.4  1       
 5 0            29.9      0     0   7.22 0       
 6 1            19        3     2 263    0       
 7 0            29.9      0     0   7.90 0       
 8 0            29.9      0     0   7.23 1       
 9 1             3        1     2  41.6  1       
10 0             7        4     1  39.7  0       
# ℹ 169 more rows
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.732
2 kap      binary         0.396
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 96 30
         1 18 35
titanic_train2_knn %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
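
Finally, to complete the Kaggle-style workflow described at the beginning, the chosen model would be used to classify the passengers in titanic_test. A minimal sketch, assuming the kNN fit and the prepped recipe above are reused; the submission file name is illustrative, and this relies on bake() not requiring the outcome column in new data (the behavior of current recipes versions):

# Apply the training recipe to the test predictors, classify, and write a submission file
# (any remaining missing predictor values in the test set would need to be handled first)
titanic_test2_baked <- bake(titanic_train2_recipe, new_data = titanic_test2)

predict(titanic_train2_knn, titanic_test2_baked) %>%
  bind_cols(titanic_test %>% select(passenger_id)) %>%
  transmute(PassengerId = passenger_id, Survived = .pred_class) %>%
  write_csv("titanic_kknn_submission.csv")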