# Load (and install if missing) all packages used in this analysis via pacman.
library(pacman)
p_load(titanic, tidyverse, janitor, naniar, DataExplorer, tidymodels)
Stat 652: Midterm: Model 0, Model 1
Load the data from the titanic R package. Note that the titanic_train dataset contains the labels for Survived and the titanic_test dataset does not contain the labels. So we will build our machine learning model using the titanic_train dataset and then make a final classification for the titanic_test dataset. This is how Kaggle competitions are done.
I like to clean names so the variables all have names with lowercase letters and underscores.
# Load the Titanic datasets from the titanic package.
library(titanic)
data(titanic_train)
data(titanic_test)

# clean_names() standardizes variable names to lowercase snake_case
# (e.g. PassengerId -> passenger_id).
titanic_train <- titanic_train %>% clean_names()

head(titanic_train)
passenger_id survived pclass
1 1 0 3
2 2 1 1
3 3 1 3
4 4 1 1
5 5 0 3
6 6 0 3
name sex age sib_sp parch
1 Braund, Mr. Owen Harris male 22 1 0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
3 Heikkinen, Miss. Laina female 26 0 0
4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
5 Allen, Mr. William Henry male 35 0 0
6 Moran, Mr. James male NA 0 0
ticket fare cabin embarked
1 A/5 21171 7.2500 S
2 PC 17599 71.2833 C85 C
3 STON/O2. 3101282 7.9250 S
4 113803 53.1000 C123 S
5 373450 8.0500 S
6 330877 8.4583 Q
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# janitor::get_dupes() with no columns specified checks every column;
# an empty result means there are no fully duplicated rows.
get_dupes(titanic_train)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, survived, pclass, name, sex, age, sib_sp, parch, ticket, ... and 3 other variables
[1] passenger_id survived pclass name sex
[6] age sib_sp parch ticket fare
[11] cabin embarked dupe_count
<0 rows> (or 0-length row.names)
# Drop identifier-like columns (id, name, ticket, cabin) that should not be
# used as predictors, and convert the categorical variables to factors so
# the modeling functions treat them correctly.
titanic_train2 <- titanic_train %>%
  dplyr::select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    survived = as_factor(survived),
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_train2)
survived pclass sex age sib_sp parch fare embarked
1 0 3 male 22 1 0 7.2500 S
2 1 1 female 38 1 0 71.2833 C
3 1 3 female 26 0 0 7.9250 S
4 1 1 female 35 1 0 53.1000 S
5 0 3 male 35 0 0 8.0500 S
6 0 3 male NA 0 0 8.4583 Q
# Standardize the test-set column names to snake_case as well, so the
# train and test data share the same variable names.
titanic_test <- titanic_test %>% clean_names()

head(titanic_test)
passenger_id pclass name sex age
1 892 3 Kelly, Mr. James male 34.5
2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
3 894 2 Myles, Mr. Thomas Francis male 62.0
4 895 3 Wirz, Mr. Albert male 27.0
5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
6 897 3 Svensson, Mr. Johan Cervin male 14.0
sib_sp parch ticket fare cabin embarked
1 0 0 330911 7.8292 Q
2 1 0 363272 7.0000 S
3 0 0 240276 9.6875 Q
4 0 0 315154 8.6625 S
5 1 1 3101298 12.2875 S
6 0 0 7538 9.2250 S
It is always a good idea to check for duplicate records/examples/rows in your dataset.
# Same duplicate-row check for the test set; an empty result means none found.
get_dupes(titanic_test)
No variable names specified - using all columns.
No duplicate combinations found of: passenger_id, pclass, name, sex, age, sib_sp, parch, ticket, fare, ... and 2 other variables
[1] passenger_id pclass name sex age
[6] sib_sp parch ticket fare cabin
[11] embarked dupe_count
<0 rows> (or 0-length row.names)
# Mirror the training-set preparation for the test set. Note there is no
# `survived` column here (the labels are withheld, Kaggle-style).
# dplyr::select() is namespaced for consistency with the training pipeline
# and to avoid masking by other attached packages.
titanic_test2 <- titanic_test %>%
  dplyr::select(-passenger_id, -name, -ticket, -cabin) %>%
  mutate(
    pclass = as_factor(pclass),
    sex = as_factor(sex),
    embarked = as_factor(embarked)
  )

head(titanic_test2)
pclass sex age sib_sp parch fare embarked
1 3 male 34.5 0 0 7.8292 Q
2 3 female 47.0 1 0 7.0000 S
3 2 male 62.0 0 0 9.6875 Q
4 3 male 27.0 0 0 8.6625 S
5 3 female 22.0 1 1 12.2875 S
6 3 male 14.0 0 0 9.2250 S
Start by investigating the missing values and completeness of the features in the data. Note that the age variable contains some missing values.
# Visualize missingness (naniar): age has missing values in the training data.
vis_miss(titanic_train2)
gg_miss_var(titanic_train2)
gg_miss_var(titanic_train2, show_pct = TRUE)
# DataExplorer full EDA report written to report.html in the working directory.
create_report(titanic_train2, y = "survived", output_file = "report.html", output_dir = getwd())
Now try the ML algorithms.
Model 0:
Summarize the y-variable. Null Model.
# Class distribution of the outcome: ~62% died (0), ~38% survived (1).
# A null model that always predicts the majority class should therefore
# reach roughly 62% accuracy.
titanic_train2 %>%
  group_by(survived) %>%
  summarize(n = n()) %>%
  mutate(freq = n / sum(n))
# A tibble: 2 × 3
survived n freq
<fct> <int> <dbl>
1 0 549 0.616
2 1 342 0.384
Make the first split with 80% of the data being in the training data set.
# 80/20 train/test split of the labeled data.
# NOTE(review): call set.seed() before initial_split() so the split (and all
# downstream results) is reproducible; the printed outputs in this transcript
# came from an unseeded run.
titanic_train2_split <- initial_split(titanic_train2, prop = 0.8)
titanic_train2_split
<Training/Testing/Total>
<712/179/891>
Create the recipe for applying the preprocessing. Note the use of step_nzv(), which removes any columns that have very low variability, and the use of the step_impute_mean() function, which fills in the cells that are missing with the mean of the column.
# Preprocessing recipe for the baseline models (numeric predictors only):
#  - step_rm() drops the factor columns, leaving age, sib_sp, parch, fare
#  - step_nzv() removes near-zero-variance predictors
#  - step_impute_mean() fills missing age values with the column mean
# prep() estimates the recipe parameters from the training split.
titanic_train2_recipe <- training(titanic_train2_split) %>%
  recipe(survived ~ .) %>%
  step_rm(pclass, sex, embarked) %>%
  step_nzv(all_predictors()) %>%
  step_impute_mean(age) %>%
  prep()

summary(titanic_train2_recipe)
# A tibble: 5 × 4
variable type role source
<chr> <list> <chr> <chr>
1 age <chr [2]> predictor original
2 sib_sp <chr [2]> predictor original
3 parch <chr [2]> predictor original
4 fare <chr [2]> predictor original
5 survived <chr [3]> outcome original
# List the recipe's steps (rm, nzv, impute_mean) and whether each is trained.
tidy(titanic_train2_recipe)
# A tibble: 3 × 6
number operation type trained skip id
<int> <chr> <chr> <lgl> <lgl> <chr>
1 1 step rm TRUE FALSE rm_lhAJl
2 2 step nzv TRUE FALSE nzv_bTDvf
3 3 step impute_mean TRUE FALSE impute_mean_fToYn
Apply the recipe, so the age variable should be complete after the imputation.
# Apply the trained recipe to the held-out split; after bake() the age
# column is complete (missing values imputed with the training mean).
titanic_train2_testing <- titanic_train2_recipe %>%
  bake(testing(titanic_train2_split))
titanic_train2_testing
# A tibble: 179 × 5
age sib_sp parch fare survived
<dbl> <int> <int> <dbl> <fct>
1 35 0 0 8.05 0
2 20 0 0 8.05 0
3 14 0 0 7.85 0
4 38 1 5 31.4 1
5 29.9 0 0 7.22 0
6 19 3 2 263 0
7 29.9 0 0 7.90 0
8 29.9 0 0 7.23 1
9 3 1 2 41.6 1
10 7 4 1 39.7 0
# ℹ 169 more rows
# juice() extracts the already-preprocessed training data from the prepped
# recipe (equivalent to baking the training split).
titanic_train2_training <- juice(titanic_train2_recipe)
titanic_train2_training
# A tibble: 712 × 5
age sib_sp parch fare survived
<dbl> <int> <int> <dbl> <fct>
1 49 1 0 76.7 1
2 36 0 0 7.90 0
3 26 1 0 14.5 0
4 13 0 1 19.5 1
5 55.5 0 0 8.05 0
6 30 0 0 7.90 0
7 25 0 0 7.78 0
8 29.9 0 0 7.88 1
9 45 0 0 6.98 0
10 50 1 0 106. 0
# ℹ 702 more rows
Model 0: null
# Model 0: the null model, which always predicts the majority class ("0").
# It serves as the baseline every real model must beat.
titanic_train2_null <- null_model() |>
  set_engine("parsnip") |>
  set_mode("classification") |>
  fit(survived ~ ., data = titanic_train2_training)

predict(titanic_train2_null, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
# ℹ 702 more rows
# Attach the null-model class predictions to the held-out split for inspection.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 0 35 0 0 8.05 0
2 0 20 0 0 8.05 0
3 0 14 0 0 7.85 0
4 0 38 1 5 31.4 1
5 0 29.9 0 0 7.22 0
6 0 19 3 2 263 0
7 0 29.9 0 0 7.90 0
8 0 29.9 0 0 7.23 1
9 0 3 1 2 41.6 1
10 0 7 4 1 39.7 0
# ℹ 169 more rows
# Overall survival rate in the raw training data (survived is 0/1 numeric
# here, so the mean is the proportion who survived: ~0.384).
mean(as.numeric(titanic_train$survived), na.rm = TRUE)
[1] 0.3838384
# Accuracy and kappa for the null model on the held-out split. Accuracy
# equals the majority-class proportion and kappa is 0, as expected.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.637
2 kap binary 0
# Confusion matrix for the null model: everything is predicted as class 0.
titanic_train2_null %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 114 65
1 0 0
# ROC curve for the null model; with constant probabilities it is the
# diagonal (AUC = 0.5).
titanic_train2_null %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()
Model 1: kNN
Setup the models.
# Model 1: k-nearest neighbours with k = 11 (kknn engine).
# NOTE(review): the predictors are not scaled/normalized by the recipe, and
# kNN is distance-based, so adding step_normalize(all_numeric_predictors())
# would likely improve this model.
titanic_train2_knn <- nearest_neighbor(neighbors = 11) %>%
  set_engine("kknn") %>%
  set_mode("classification") %>%
  fit(survived ~ ., data = titanic_train2_training)
What is missing here? The scaling or normalization has not been applied.
# kNN class predictions on the training data (in-sample; optimistic).
predict(titanic_train2_knn, titanic_train2_training)
# A tibble: 712 × 1
.pred_class
<fct>
1 1
2 0
3 0
4 1
5 0
6 0
7 0
8 1
9 0
10 1
# ℹ 702 more rows
# Attach the kNN class predictions to the held-out split for inspection.
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing)
# A tibble: 179 × 6
.pred_class age sib_sp parch fare survived
<fct> <dbl> <int> <int> <dbl> <fct>
1 0 35 0 0 8.05 0
2 0 20 0 0 8.05 0
3 1 14 0 0 7.85 0
4 0 38 1 5 31.4 1
5 0 29.9 0 0 7.22 0
6 1 19 3 2 263 0
7 0 29.9 0 0 7.90 0
8 0 29.9 0 0 7.23 1
9 1 3 1 2 41.6 1
10 0 7 4 1 39.7 0
# ℹ 169 more rows
# Accuracy and kappa for kNN on the held-out split; it beats the null
# model's baseline accuracy.
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  metrics(truth = survived, estimate = .pred_class)
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.732
2 kap binary 0.396
# Confusion matrix for the kNN model on the held-out split.
titanic_train2_knn %>%
  predict(titanic_train2_testing) %>%
  bind_cols(titanic_train2_testing) %>%
  conf_mat(truth = survived, estimate = .pred_class)
Truth
Prediction 0 1
0 96 30
1 18 35
# ROC curve for the kNN model using the predicted probability of class 0.
titanic_train2_knn %>%
  predict(titanic_train2_testing, type = "prob") %>%
  bind_cols(titanic_train2_testing) %>%
  roc_curve(survived, .pred_0) %>%
  autoplot()