library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)
Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
I like to clean names so the variables all have names with lowercase letters and underscores.
# Normalise the column names of the labelled data (janitor::clean_names),
# then preview the first rows.
titanic_train <- clean_names(titanic_train)
titanic_train %>% head()
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# No columns are named, so duplicates are searched across all columns.
titanic_train %>% get_dupes()
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.
# Apply the same column-name normalisation to the unlabelled data and
# preview the first rows.
titanic_test <- clean_names(titanic_test)
titanic_test %>% head()
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# No columns are named, so duplicates are searched across all columns.
titanic_test %>% get_dupes()
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
# Build the modelling tables. The identifier columns (passenger_id, name,
# ticket) and the cabin column are dropped, and the categorical columns are
# converted to factors.
#
# titanic_train2 is created here because every later step (missingness plots,
# the train/test split, the recipe) reads it; without this assignment those
# steps fail with "object 'titanic_train2' not found".
# survived is converted to a factor as well: the models below are fitted in
# classification mode and their probability columns are .pred_0 / .pred_1.
titanic_train2 <- titanic_train %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )
head(titanic_train2)

# Same cleanup for the unlabelled data (it has no survived column).
titanic_test2 <- titanic_test %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )
head(titanic_test2)
NA
Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.
# Missing-value overview of the modelling table.
titanic_train2 %>% vis_miss()

# Missing values per variable, as counts ...
titanic_train2 %>% gg_miss_var()

# ... and as percentages.
titanic_train2 %>% gg_miss_var(show_pct = TRUE)

# Write a DataExplorer report, with survived as the response, into the
# current working directory.
create_report(
  titanic_train2,
  y = "survived",
  output_dir = getwd(),
  output_file = "report.html"
)
Now try the ML algorithms.
Model 0:
Summarize the y-variable. Null Model.
# Count and share of each outcome class; the largest share is the accuracy
# a majority-class (null) model would reach.
titanic_train2 %>%
  group_by(survived) %>%
  tally() %>%
  mutate(freq = n / sum(n))
Make the first split with 80% of the data being in the training data set.
# initial_split() samples rows at random. Fix the RNG seed first so the
# 80/20 partition, and every metric computed from it below, is the same on
# each run.
set.seed(2021)
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Analysis/Assess/Total>
<713/178/891>
Training data.
# Show the training portion of the split.
training(titanic_train2_split)
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_meanimpute() function, which fills in the cells that are missing with the mean of the column.
# Preprocessing recipe, estimated (prep) on the training portion only:
#   * step_rm          - drop pclass, sex and embarked
#   * step_nzv         - drop near-zero-variance predictors
#   * step_impute_mean - fill missing age values with the training mean
# step_impute_mean() is the current name of step_meanimpute(); the old name
# is deprecated in recipes.
titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

# Variable roles after preprocessing, and the list of steps.
summary(titanic_train2_recipe)
tidy(titanic_train2_recipe)
Apply the recipe, so the age variable should be complete after the imputation.
# Apply the prepped recipe to the held-out portion of the split.
titanic_train2_testing <- bake(
  titanic_train2_recipe,
  testing(titanic_train2_split)
)
titanic_train2_testing

# Extract the preprocessed training data from the prepped recipe.
titanic_train2_training <- titanic_train2_recipe %>% juice()
titanic_train2_training
Model 0: null
# Baseline: parsnip null model in classification mode, fitted on the
# preprocessed training data.
titanic_train2_null <- fit(
  set_mode(null_model(), "classification"),
  survived ~ .,
  data = titanic_train2_training
)
Engine set to `parsnip`.
# Class predictions on the training data.
titanic_train2_null %>% predict(titanic_train2_training)

# Class predictions on the held-out data, alongside the original columns.
bind_cols(
  predict(titanic_train2_null, titanic_train2_testing),
  titanic_train2_testing
)

# Default yardstick metrics on the held-out data.
metrics(
  bind_cols(
    predict(titanic_train2_null, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)

# Confusion matrix on the held-out data.
conf_mat(
  bind_cols(
    predict(titanic_train2_null, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)
Truth
Prediction 0 1
0 109 69
1 0 0

Model 1: C5.0
Setup the model.
# Boosted trees (20 trees) with the C5.0 engine, classification mode,
# fitted on the preprocessed training data.
titanic_train2_C50 <- fit(
  set_mode(
    set_engine(boost_tree(trees = 20), "C5.0"),
    "classification"
  ),
  survived ~ .,
  data = titanic_train2_training
)
# Class predictions on the training data.
titanic_train2_C50 %>% predict(titanic_train2_training)

# Class predictions on the held-out data, alongside the original columns.
bind_cols(
  predict(titanic_train2_C50, titanic_train2_testing),
  titanic_train2_testing
)

# Default yardstick metrics on the held-out data.
metrics(
  bind_cols(
    predict(titanic_train2_C50, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)

# Confusion matrix on the held-out data.
conf_mat(
  bind_cols(
    predict(titanic_train2_C50, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)
Truth
Prediction 0 1
0 92 39
1 17 30
# ROC curve on the held-out data, using the predicted probability of
# class 0.
autoplot(
  roc_curve(
    bind_cols(
      predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
      titanic_train2_testing
    ),
    survived,
    .pred_0
  )
)

# Area under that ROC curve.
roc_auc(
  bind_cols(
    predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  ),
  survived,
  .pred_0
)

# Density of the predicted probability of class 1, split by true outcome.
ggplot(
  bind_cols(
    predict(titanic_train2_C50, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  )
) +
  geom_density(
    aes(x = .pred_1, fill = survived),
    alpha = 0.5
  )

Model 2: XGBoost
Setup the model.
# Boosted trees (20 trees) with the xgboost engine, classification mode,
# fitted on the preprocessed training data.
titanic_train2_xgb <- fit(
  set_mode(
    set_engine(boost_tree(trees = 20), "xgboost"),
    "classification"
  ),
  survived ~ .,
  data = titanic_train2_training
)
[09:57:43] WARNING: amalgamation/../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# Class predictions on the training data.
titanic_train2_xgb %>% predict(titanic_train2_training)

# Class predictions on the held-out data, alongside the original columns.
bind_cols(
  predict(titanic_train2_xgb, titanic_train2_testing),
  titanic_train2_testing
)

# Default yardstick metrics on the held-out data.
metrics(
  bind_cols(
    predict(titanic_train2_xgb, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)

# Confusion matrix on the held-out data.
conf_mat(
  bind_cols(
    predict(titanic_train2_xgb, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)
Truth
Prediction 0 1
0 90 38
1 19 31
# ROC curve on the held-out data, using the predicted probability of
# class 0.
autoplot(
  roc_curve(
    bind_cols(
      predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
      titanic_train2_testing
    ),
    survived,
    .pred_0
  )
)

# Area under that ROC curve.
roc_auc(
  bind_cols(
    predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  ),
  survived,
  .pred_0
)

# Density of the predicted probability of class 1, split by true outcome.
ggplot(
  bind_cols(
    predict(titanic_train2_xgb, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  )
) +
  geom_density(
    aes(x = .pred_1, fill = survived),
    alpha = 0.5
  )

Model 3: Random Forest
Setup the model.
# Random forest (100 trees) with the ranger engine, classification mode,
# fitted on the preprocessed training data.
titanic_train2_ranger <- fit(
  set_mode(
    set_engine(rand_forest(trees = 100), "ranger"),
    "classification"
  ),
  survived ~ .,
  data = titanic_train2_training
)
# Class predictions on the training data.
titanic_train2_ranger %>% predict(titanic_train2_training)

# Class predictions on the held-out data, alongside the original columns.
bind_cols(
  predict(titanic_train2_ranger, titanic_train2_testing),
  titanic_train2_testing
)

# Default yardstick metrics on the held-out data.
metrics(
  bind_cols(
    predict(titanic_train2_ranger, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)

# Confusion matrix on the held-out data.
conf_mat(
  bind_cols(
    predict(titanic_train2_ranger, titanic_train2_testing),
    titanic_train2_testing
  ),
  truth = survived,
  estimate = .pred_class
)
Truth
Prediction 0 1
0 89 33
1 20 36
# Hand-drawn ROC curve for the random forest on the held-out data.
# yardstick treats the first factor level ("0" here) as the event by default,
# so the probability column must be .pred_0. Passing .pred_1 mirrors the curve
# below the diagonal and disagrees with the roc_auc() call for this model.
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  # Dotted diagonal: the no-skill reference line.
  geom_abline(lty = 3) +
  coord_equal()

# ROC curve for the random forest via autoplot().
# roc_curve() needs the class-probability column as well as the truth column;
# calling it with only `survived` fails with
# "No valid variables provided to `...`". With event_level = "first" the event
# is class 0, so the matching column is .pred_0.
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0, event_level = "first") %>%
  autoplot()
Error: No valid variables provided to `...`.
Run `rlang::last_error()` to see where the error occurred.
# Area under the ROC curve on the held-out data, using the predicted
# probability of class 0.
roc_auc(
  bind_cols(
    predict(titanic_train2_ranger, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  ),
  survived,
  .pred_0
)

# Density of the predicted probability of class 1, split by true outcome.
ggplot(
  bind_cols(
    predict(titanic_train2_ranger, titanic_train2_testing, type = "prob"),
    titanic_train2_testing
  )
) +
  geom_density(
    aes(x = .pred_1, fill = survived),
    alpha = 0.5
  )

