---
title: "Code for The tidyverse for Machine Learning"
output: html_notebook
---

Bruna Wundervald's SatRday talk.
[satRday-sp-talk](https://github.com/brunaw/satRday-sp-talk)

```{r}
library(tidyverse)
library(ranger)

# NOTE(review): this expects the installed `dials` to still export `Chicago`;
# confirm against the package version in use.
data <- dials::Chicago
dim(data)
```

```{r}
data
```

```{r}
# Density of the response variable (`ridership`).
data %>%
  ggplot(aes(x = ridership)) +
  geom_density(fill = "#919c4c", alpha = 0.8) +
  labs(x = "Response Variable", y = "Density") +
  theme_classic()
```

Make many copies of the same dataset.

```{r}
# One row per copy: `index` identifies the copy, `data` is a list-column
# holding the full data frame.
data_tibble <- rep(list(data), 10) %>%
  enframe(name = "index", value = "data")

data_tibble
```

```{r}
data_tibble$data
```

```{r}
# Randomly split a data frame into train and test parts.
#
# Each row is labelled "test" when a uniform draw exceeds 0.75 and "train"
# otherwise; the helper label column is dropped from both parts.
#
# data: a data frame.
# Returns a named list of data frames, one element per label produced
# ("test" and/or "train").
train_test <- function(data) {
  data %>%
    mutate(base = ifelse(runif(n()) > 0.75, "test", "train")) %>%
    split(.$base) %>%
    # Plain column name instead of the `.data` pronoun, which is deprecated
    # inside tidyselect contexts such as select().
    purrr::map(~ select(.x, -base))
}
train_test

# A separate random split for every copy of the data.
data_tibble <- data_tibble %>%
  mutate(train_test = purrr::map(data, train_test))

data_tibble
print(data_tibble, n = 3)
```

```{r}
# Fit a ranger model with impurity-based variable importance.
#
# train:     training data frame.
# mtry:      passed to ranger's `mtry`.
# num.trees: passed to ranger's `num.trees`.
# coef.reg:  accepted so that parameter lists carrying it can be spliced into
#            the call; it is not forwarded to ranger here.
# formula:   model formula; defaults to `ridership` against all other columns.
# Returns the fitted ranger object.
modelling <- function(train, mtry = NULL, num.trees = NULL, coef.reg = 1,
                      formula = ridership ~ .) {
  ranger::ranger(
    formula,
    data = train,
    num.trees = num.trees,
    mtry = mtry,
    importance = "impurity"
  )
}
```

```{r}
# Model configurations, one row per model: `model` is the name and
# `parameters` is a list-column of arguments for modelling().
# `ncol(data) - 1` is the number of columns other than the response.
models <- list(
  tree = list(mtry = ncol(data) - 1, num.trees = 1, coef.reg = 1),
  bagging = list(mtry = ncol(data) - 1, num.trees = 100, coef.reg = 1),
  forest = list(mtry = sqrt(ncol(data) - 1), num.trees = 100, coef.reg = 1),
  forest75 = list(mtry = sqrt(ncol(data) - 1), num.trees = 75, coef.reg = 1),
  forest50 = list(mtry = sqrt(ncol(data) - 1), num.trees = 50, coef.reg = 1)
) %>%
  enframe(name = "model", value = "parameters")

models
```

```{r}
# Pair every data copy with every model configuration. This step supplies the
# `model` and `parameters` columns that arrange() and the training chunk below
# rely on.
data_tibble <- data_tibble %>%
  crossing(models) %>%
  arrange(model)

data_tibble
```

```{r}
training_models <- data_tibble %>%
  mutate(
    # Add the training split to each parameter list so that the list holds
    # every argument modelling() needs.
    full_parameters = map2(
      parameters,
      map(train_test, "train"),
      ~ list_modify(.x, train = .y)
    ),
    # Call modelling() once per row with that row's argument list
    # (replaces the deprecated purrr::invoke_map()).
    train_model = map(full_parameters, ~ do.call(modelling, .x))
  )

print(training_models, n = 5)
```

```{r}
# Root mean squared error of `model` predictions against `test$ridership`.
rmse <- function(model, test) {
  pp <- predict(model, test)
  sqrt(mean((pp$predictions - test$ridership)^2))
}

# Number of variables whose importance in `model` is greater than zero.
number_variables <- function(model) {
  sum(model$variable.importance > 0)
}
```

Results

```{r}
results <- training_models %>%
  mutate(
    # Test-set RMSE, using the test split that belongs to the same row.
    rmse = map2_dbl(
      .x = train_model,
      .y = map(train_test, "test"),
      ~ rmse(model = .x, test = .y)
    ),
    number_variables = map_int(train_model, number_variables),
    # `r.squared` element stored on each fitted ranger object.
    rsquared = map_dbl(train_model, "r.squared")
  )

results
```