library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels, discrim)
Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
I like to clean names so the variables all have names with lowercase letters and underscores.
# Normalise the column names of the training data (lowercase letters and
# underscores), then preview the first rows.
titanic_train <- clean_names(titanic_train)
titanic_train %>% head()
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# Look for fully duplicated rows; no columns are named, so every column is
# used for the comparison.
get_dupes(dat = titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.
# Modelling table for the training data: remove the identifier columns and
# cabin, then turn the outcome and the categorical predictors into factors.
titanic_train2 <- titanic_train %>%
  select(-c(passenger_id, name, ticket, cabin)) %>%
  mutate(across(c(survived, pclass, sex, embarked), as_factor))

# Preview the result.
titanic_train2 %>% head()
# Apply the same column-name normalisation to the unlabelled test data and
# preview the first rows.
titanic_test <- clean_names(titanic_test)
titanic_test %>% head()
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# Look for fully duplicated rows in the test data; no columns are named, so
# every column is used for the comparison.
get_dupes(dat = titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
# Modelling table for the test data: remove the identifier columns and cabin,
# then encode the categorical predictors with exactly the factor levels of
# titanic_train2.
#
# as_factor() orders the levels of a character column by first appearance, so
# calling it independently on each dataset can give the same column a
# different level set and order in train and test. Reusing the training
# levels keeps the two tables compatible for prediction. A value that never
# occurs in the training data becomes NA here instead of a new level.
titanic_test2 <- titanic_test %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = factor(pclass, levels = levels(titanic_train2$pclass)),
    sex = factor(sex, levels = levels(titanic_train2$sex)),
    embarked = factor(embarked, levels = levels(titanic_train2$embarked))
  )

# Preview the result.
head(titanic_test2)
NA
Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.
# Cell-level map of missing values in the training table.
titanic_train2 %>% vis_miss()

# Number of missing values per variable.
titanic_train2 %>% gg_miss_var()

# The same per-variable summary, expressed as a percentage.
titanic_train2 %>% gg_miss_var(show_pct = TRUE)

# Write an exploratory report for the training table, with survived as the
# response, to report.html in the current working directory.
create_report(
  data = titanic_train2,
  output_dir = getwd(),
  output_file = "report.html",
  y = "survived"
)
Now try the ML algorithms.
Model 0:
Summarize the y-variable. Null Model.
# Count the rows in each outcome class and add each class's share of the
# total.
titanic_train2 %>%
  group_by(survived) %>%
  tally() %>%
  mutate(freq = n / sum(n))
Make the first split with 80% of the data being in the training data set.
# initial_split() assigns rows at random. Fix the RNG seed first so the same
# rows land in each partition on every run and the metrics reported further
# down can be reproduced.
set.seed(123)

# Keep 80% of the labelled rows for fitting; the rest is held out.
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)

# Print the <Analysis/Assess/Total> row counts.
titanic_train2_split
<Analysis/Assess/Total>
<713/178/891>
Training data.
# Show the rows assigned to the training partition of the split.
training(titanic_train2_split)
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of step_impute_mean() (called step_meanimpute() in older versions of the recipes package), which fills in the cells that are missing with the mean of the column.
Apply the recipe, so the age variable should be complete after the imputation.
# NOTE(review): no visible code creates `titanic_train2_recipe`, although the
# accompanying text describes it. The fallback below is reconstructed from
# that description (near-zero-variance filter plus mean imputation) and only
# runs when the recipe has not already been defined. Confirm that it matches
# the intended preprocessing.
if (!exists("titanic_train2_recipe")) {
  titanic_train2_recipe <- training(titanic_train2_split) %>%
    recipe(survived ~ .) %>%
    step_nzv(all_predictors()) %>%
    step_impute_mean(all_numeric_predictors()) %>%
    prep()
}

# Apply the prepped recipe to the held-out partition of the split.
titanic_train2_testing <- bake(
  titanic_train2_recipe,
  new_data = testing(titanic_train2_split)
)
titanic_train2_testing

# Retrieve the processed training data stored in the prepped recipe.
# bake(new_data = NULL) is the current replacement for the superseded juice().
titanic_train2_training <- bake(titanic_train2_recipe, new_data = NULL)
titanic_train2_training
Model 0: null
# Print the specification of a null model in classification mode.
null_model() %>% set_mode("classification")
Model Specification (classification)
# Fit the null classification model on the processed training data. No engine
# is set explicitly.
titanic_train2_null <- fit(
  null_model(mode = "classification"),
  survived ~ .,
  data = titanic_train2_training
)
Engine set to `parsnip`.
# Class predictions of the null model on the training rows.
titanic_train2_null %>% predict(new_data = titanic_train2_training)

# Held-out class predictions placed next to the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
)

# Default classification metrics on the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)

# Confusion matrix on the held-out rows.
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 109 69
1 0 0
# ROC curve of the null model on the held-out rows, built from the predicted
# probability of class "0".
bind_cols(
  predict(titanic_train2_null, new_data = titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_curve(truth = survived, .pred_0) %>%
  autoplot()

Model 4: GLM
Set up the model.
# Penalised logistic regression fitted with the glmnet engine on the
# processed training data (penalty = 0.001, mixture = 0.5).
titanic_train2_glm <- fit(
  logistic_reg(mode = "classification", penalty = 0.001, mixture = 0.5) %>%
    set_engine("glmnet"),
  survived ~ .,
  data = titanic_train2_training
)
# Class predictions of the logistic regression on the training rows.
titanic_train2_glm %>% predict(new_data = titanic_train2_training)

# Held-out class predictions placed next to the held-out rows.
bind_cols(
  predict(titanic_train2_glm, new_data = titanic_train2_testing),
  titanic_train2_testing
)

# Default classification metrics on the held-out rows.
bind_cols(
  predict(titanic_train2_glm, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)

# Confusion matrix on the held-out rows.
bind_cols(
  predict(titanic_train2_glm, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 104 48
1 5 21
# ROC curve of the logistic regression on the held-out rows, built from the
# predicted probability of class "0".
bind_cols(
  predict(titanic_train2_glm, new_data = titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_curve(truth = survived, .pred_0) %>%
  autoplot()

# Area under that ROC curve.
bind_cols(
  predict(titanic_train2_glm, new_data = titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_auc(truth = survived, .pred_0)

Model 5: Naive Bayes
Set up the model.
# Naive Bayes classifier fitted with the klaR engine on the processed
# training data, using Laplace = 1.
titanic_train2_nb <- fit(
  naive_Bayes(mode = "classification", Laplace = 1) %>%
    set_engine("klaR"),
  survived ~ .,
  data = titanic_train2_training
)
# Class predictions of the naive Bayes model on the training rows.
titanic_train2_nb %>% predict(new_data = titanic_train2_training)
Numerical 0 probability for all classes with observation 348
# Held-out class predictions placed next to the held-out rows.
bind_cols(
  predict(titanic_train2_nb, new_data = titanic_train2_testing),
  titanic_train2_testing
)

# Default classification metrics on the held-out rows.
bind_cols(
  predict(titanic_train2_nb, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  metrics(truth = survived, estimate = .pred_class)

# Confusion matrix on the held-out rows.
bind_cols(
  predict(titanic_train2_nb, new_data = titanic_train2_testing),
  titanic_train2_testing
) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 81 40
1 28 29

# Area under the ROC curve on the held-out rows, from the predicted
# probability of class "0".
bind_cols(
  predict(titanic_train2_nb, new_data = titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  roc_auc(truth = survived, .pred_0)

# Density of the predicted probability of class "1" on the held-out rows,
# one semi-transparent curve per true outcome.
bind_cols(
  predict(titanic_train2_nb, new_data = titanic_train2_testing, type = "prob"),
  titanic_train2_testing
) %>%
  ggplot() +
  geom_density(
    mapping = aes(x = .pred_1, fill = survived),
    alpha = 0.5
  )
