library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)
Stat 652: Midterm: Model 2, Model 3
Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
I like to clean names so the variables all have names with lowercase letters and underscores.
# Standardize column names to lowercase snake_case (e.g. SibSp -> sib_sp).
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train <- titanic_train %>% clean_names()
head(titanic_train)
passenger_id survived pclass
1 1 0 3
2 2 1 1
3 3 1 3
4 4 1 1
5 5 0 3
6 6 0 3
name sex age sib_sp parch
1 Braund, Mr. Owen Harris male 22 1 0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
3 Heikkinen, Miss. Laina female 26 0 0
4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
5 Allen, Mr. William Henry male 35 0 0
6 Moran, Mr. James male NA 0 0
ticket fare cabin embarked
1 A/5 21171 7.2500 S
2 PC 17599 71.2833 C85 C
3 STON/O2. 3101282 7.9250 S
4 113803 53.1000 C123 S
5 373450 8.0500 S
6 330877 8.4583 Q
It is always a good idea to check for duplicate records/examples/rows in your dataset.
get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
[1] passenger_id survived pclass name sex
[6] age sib_sp parch ticket fare
[11] cabin embarked dupe_count
<0 rows> (or 0-length row.names)
Drop the unique identifiers: passenger_id, name, and ticket. Also drop cabin because it has a high missing rate.
# Drop the identifier columns (and mostly-missing cabin), then convert the
# categorical variables to factors for modeling.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2 <- titanic_train %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )
head(titanic_train2)
survived pclass sex age sib_sp parch fare embarked
1 0 3 male 22 1 0 7.2500 S
2 1 1 female 38 1 0 71.2833 C
3 1 3 female 26 0 0 7.9250 S
4 1 1 female 35 1 0 53.1000 S
5 0 3 male 35 0 0 8.0500 S
6 0 3 male NA 0 0 8.4583 Q
# Same name cleaning for the unlabeled Kaggle test set.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_test <- titanic_test %>% clean_names()
head(titanic_test)
passenger_id pclass name sex age
1 892 3 Kelly, Mr. James male 34.5
2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
3 894 2 Myles, Mr. Thomas Francis male 62.0
4 895 3 Wirz, Mr. Albert male 27.0
5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6 897 3 Svensson, Mr. Johan Cervin male 14.0
sib_sp parch ticket fare cabin embarked
1 0 0 330911 7.8292 Q
2 1 0 363272 7.0000 S
3 0 0 240276 9.6875 Q
4 0 0 315154 8.6625 S
5 1 1 3101298 12.2875 S
6 0 0 7538 9.2250 S
It is always a good idea to check for duplicate records/examples/rows in your dataset.
get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
[1] passenger_id pclass name sex age
[6] sib_sp parch ticket fare cabin
[11] embarked dupe_count
<0 rows> (or 0-length row.names)
# Same preprocessing as the training data; titanic_test has no survived
# column, so only the predictors are converted to factors.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_test2 <- titanic_test %>%
  select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )
head(titanic_test2)
pclass sex age sib_sp parch fare embarked
1 3 male 34.5 0 0 7.8292 Q
2 3 female 47.0 1 0 7.0000 S
3 2 male 62.0 0 0 9.6875 Q
4 3 male 27.0 0 0 8.6625 S
5 3 female 22.0 1 1 12.2875 S
6 3 male 14.0 0 0 9.2250 S
Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.
# Visualize the overall missingness pattern (naniar); age is the main offender.
vis_miss(titanic_train2)
# Missing-value counts per variable.
gg_miss_var(titanic_train2)
# Same plot, shown as a percentage of rows instead of counts.
gg_miss_var(titanic_train2, show_pct = TRUE)
# Automated EDA report (DataExplorer) written to report.html in the working directory.
create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
Now try the ML algorithms.
Model 0:
Summarize the y-variable. Null Model.
# Class balance of the outcome: ~62% died (0), ~38% survived (1), so the
# null model's expected accuracy is ~0.62.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2 %>%
  group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
survived n freq
<fct> <int> <dbl>
1 0 549 0.616
2 1 342 0.384
Make the first split with 80% of the data being in the training data set.
# 80/20 train/test split of the labeled data.
# NOTE(review): no set.seed() precedes this call, so the split (and all
# downstream metrics) is not reproducible — consider adding one.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_impute_mean() function, which fills in the cells that are missing with the mean of the column.
# Preprocessing recipe, fit (prep) on the training partition only:
#   - step_rm() drops the factor predictors (this run uses numeric features only)
#   - step_nzv() removes near-zero-variance predictors
#   - step_impute_mean() fills missing age with the training-set mean
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()
summary(titanic_train2_recipe)
# A tibble: 5 × 4
variable type role source
<chr> <list> <chr> <chr>
1 age <chr [2]> predictor original
2 sib_sp <chr [2]> predictor original
3 parch <chr [2]> predictor original
4 fare <chr [2]> predictor original
5 survived <chr [3]> outcome original
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
number operation type trained skip id
<int> <chr> <chr> <lgl> <lgl> <chr>
1 1 step rm TRUE FALSE rm_ZdCs5
2 2 step nzv TRUE FALSE nzv_EDkvH
3 3 step impute_mean TRUE FALSE impute_mean_6j9ks
Apply the recipe, so the age variable should be complete after the imputation.
# Apply the trained recipe to the held-out partition (age is now imputed).
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split))
titanic_train2_testing
# A tibble: 179 × 5
age sib_sp parch fare survived
<dbl> <int> <int> <dbl> <fct>
1 27 0 2 11.1 1
2 58 0 0 26.6 1
3 14 0 0 7.85 0
4 38 1 5 31.4 1
5 40 0 0 27.7 0
6 28 1 0 82.2 0
7 42 1 0 52 0
8 29.7 0 0 7.75 1
9 65 0 1 62.0 0
10 26 2 0 8.66 0
# ℹ 169 more rows
# juice() extracts the already-preprocessed training partition from the
# prepped recipe (no re-baking needed).
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_training <- juice(titanic_train2_recipe)
titanic_train2_training
# A tibble: 712 × 5
age sib_sp parch fare survived
<dbl> <int> <int> <dbl> <fct>
1 9 1 1 15.9 1
2 20 1 0 9.82 0
3 29.7 0 0 8.05 0
4 21 0 0 7.73 0
5 28 1 0 15.8 0
6 29.7 0 0 7.90 0
7 27 0 2 212. 0
8 29.7 1 0 24.2 0
9 28 0 0 9.5 0
10 34 0 0 13 0
# ℹ 702 more rows
Model 0: null
# Model 0: null model — always predicts the majority class (survived = 0).
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_null <- null_model() |>
  set_engine("parsnip") |>
  set_mode("classification") |>
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
# ℹ 702 more rows
# Null-model class predictions on the held-out data, next to the truth.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 0 27 0 2 11.1 1
2 0 58 0 0 26.6 1
3 0 14 0 0 7.85 0
4 0 38 1 5 31.4 1
5 0 40 0 0 27.7 0
6 0 28 1 0 82.2 0
7 0 42 1 0 52 0
8 0 29.7 0 0 7.75 1
9 0 65 0 1 62.0 0
10 0 26 2 0 8.66 0
# ℹ 169 more rows
# Baseline accuracy/kappa for the null model (kappa is 0 by construction).
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.620
2 kap binary 0
# Confusion matrix: the null model predicts 0 for every example.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 111 68
1 0 0
# ROC curve for the null model (a diagonal — no discrimination).
# Fixed: use .pred_0 to match every other model's ROC call; yardstick's
# default event_level = "first" treats the first factor level (0) as the
# event, so passing .pred_1 here would draw an inverted curve.
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
Model 2: C5.0
Setup the model.
# Model 2: boosted classification trees via the C5.0 engine, 20 iterations.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_C50 <- boost_tree(trees = 20) %>%
  set_engine("C5.0") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_C50, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 1
2 0
3 0
4 0
5 0
6 0
7 1
8 0
9 0
10 0
# ℹ 702 more rows
# C5.0 class predictions on the held-out data, next to the truth.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 1 27 0 2 11.1 1
2 0 58 0 0 26.6 1
3 1 14 0 0 7.85 0
4 1 38 1 5 31.4 1
5 0 40 0 0 27.7 0
6 1 28 1 0 82.2 0
7 1 42 1 0 52 0
8 0 29.7 0 0 7.75 1
9 1 65 0 1 62.0 0
10 0 26 2 0 8.66 0
# ℹ 169 more rows
# Held-out accuracy and kappa for the C5.0 model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.732
2 kap binary 0.421
# Confusion matrix for the C5.0 model on the held-out data.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 90 27
1 21 41
# ROC curve for the C5.0 model (.pred_0 because level 0 is the event
# under yardstick's default event_level = "first").
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
# Area under the ROC curve for the C5.0 model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.793
# Density of the predicted survival probability split by true class —
# less overlap between the two humps means better separation.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_C50 %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived),
    alpha = 0.5)
Model 2 (alternative boosting engine): XGBoost
Setup the model.
# Same boost_tree(trees = 20) specification as the C5.0 model, but fit
# with the xgboost engine for comparison.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_xgb <- boost_tree(trees = 20) %>%
  set_engine("xgboost") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_xgb, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 1
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
# ℹ 702 more rows
# XGBoost class predictions on the held-out data, next to the truth.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 0 27 0 2 11.1 1
2 0 58 0 0 26.6 1
3 1 14 0 0 7.85 0
4 0 38 1 5 31.4 1
5 0 40 0 0 27.7 0
6 1 28 1 0 82.2 0
7 1 42 1 0 52 0
8 0 29.7 0 0 7.75 1
9 0 65 0 1 62.0 0
10 0 26 2 0 8.66 0
# ℹ 169 more rows
# Held-out accuracy and kappa for the XGBoost model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.709
2 kap binary 0.350
# Confusion matrix for the XGBoost model on the held-out data.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 94 35
1 17 33
# ROC curve for the XGBoost model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
# Area under the ROC curve for the XGBoost model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.760
# Predicted-probability density by true class for the XGBoost model.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_xgb %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived),
    alpha = 0.5)
Model 3: Random Forest
Setup the model.
# Model 3: random forest via the ranger engine, 100 trees.
# (Fixed: the assignment target had been displaced onto the next line.)
titanic_train2_ranger <- rand_forest(trees = 100) %>%
  set_engine("ranger") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
predict(titanic_train2_ranger, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 1
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 1
# ℹ 702 more rows
# Random-forest class predictions on the held-out data, next to the truth.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 0 27 0 2 11.1 1
2 0 58 0 0 26.6 1
3 1 14 0 0 7.85 0
4 0 38 1 5 31.4 1
5 1 40 0 0 27.7 0
6 1 28 1 0 82.2 0
7 1 42 1 0 52 0
8 0 29.7 0 0 7.75 1
9 0 65 0 1 62.0 0
10 0 26 2 0 8.66 0
# ℹ 169 more rows
# Held-out accuracy and kappa for the random forest.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.732
2 kap binary 0.418
# Confusion matrix for the random forest on the held-out data.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 91 28
1 20 40
# Hand-built ROC plot from the roc_curve() tibble (equivalent to autoplot()).
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal()
# ROC curve for the random forest via autoplot().
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
# Area under the ROC curve for the random forest.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_auc(survived, .pred_0)
# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 roc_auc binary 0.779
# Predicted-probability density by true class for the random forest.
# (Fixed: the pipe's subject had been displaced onto the next line.)
titanic_train2_ranger %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  ggplot() +
  geom_density(aes(x = .pred_1, fill = survived),
    alpha = 0.5)