library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)

Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived, while the titanic_test dataset does not. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.

I like to clean names so the variables all have names with lowercase letters and underscores.
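titanic_train <- titanic_train %>% clean_names()

head(titanic_train)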

It is always a good idea to check for duplicate records/examples/rows in your dataset.

get_dupes(titanic_train)
No variable names specified - using all columns.

No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
titanic_train2 <- titanic_train %>% select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
titanic_test <- titanic_test %>% clean_names()

head(titanic_test)

It is always a good idea to check for duplicate records/examples/rows in your dataset.

get_dupes(titanic_test)
No variable names specified - using all columns.

No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
titanic_test2 <- titanic_test %>% select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)

Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.

vis_miss(titanic_train2)

gg_miss_var(titanic_train2)

gg_miss_var(titanic_train2, show_pct = TRUE)

create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())

Now try the ML algorithms.

Model 0:

Summarize the y-variable, survived. This is the baseline for the null model.

titanic_train2 %>% group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))

Make the first split with 80% of the data being in the training data set.

titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Analysis/Assess/Total>
<713/178/891>
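Note that initial_split() draws a random sample, so the split changes from run to run. A minimal sketch of making it reproducible and stratified on the outcome (the seed value is arbitrary):

set.seed(2020)  # any fixed seed makes the split reproducible
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8, strata = survived)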

Training data.

titanic_train2_split %>%
  training() 

Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of step_meanimpute() (renamed step_impute_mean() in recent versions of recipes), which fills in the missing cells with the mean of the column.

titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>% 
  step_nzv(all_predictors()) %>%
  step_meanimpute(age) %>%
  prep()

summary(titanic_train2_recipe)

tidy(titanic_train2_recipe)

Apply the recipe, so the age variable should be complete after the imputation.

titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split)) 

titanic_train2_testing
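As a quick sanity check, one could count the remaining missing values in age after baking; the sum should be zero:

titanic_train2_testing %>%
  summarise(missing_age = sum(is.na(age)))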
titanic_train2_training <- juice(titanic_train2_recipe)

titanic_train2_training

Model 0: null

null_model(mode = "classification")
Model Specification (classification)
titanic_train2_null <- null_model() %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
Engine set to `parsnip`.
predict(titanic_train2_null, titanic_train2_training)
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction   0   1
         0 117  61
         1   0   0
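The class probabilities from the null model can also be used to draw an ROC curve:

titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()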

Model 1: kNN

Set up the model.

titanic_train2_knn <- nearest_neighbor(neighbors = 11) %>% 
  set_engine("kknn") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)

What is missing here? The scaling or normalization has not been applied.
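A minimal sketch of adding that missing step: rebuild the recipe with step_normalize(), which centers and scales the numeric predictors, and refit the kNN model on the re-juiced training data. The object names titanic_train2_recipe_norm and titanic_train2_knn_norm are just illustrative.

titanic_train2_recipe_norm <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_meanimpute(age) %>%
  step_normalize(all_predictors()) %>%  # center and scale the remaining numeric predictors
  prep()

titanic_train2_knn_norm <- nearest_neighbor(neighbors = 11) %>%
  set_engine("kknn") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = juice(titanic_train2_recipe_norm))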

predict(titanic_train2_knn, titanic_train2_training)
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) 
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
          Truth
Prediction  0  1
         0 95 29
         1 22 32
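From the confusion matrix, the kNN model classifies (95 + 32) / 178 ≈ 0.71 of the held-out rows correctly, compared with 117 / 178 ≈ 0.66 for the null model, so even without normalization kNN improves on the baseline.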