---
title: "Code for The tidyverse for Machine Learning"
output: html_notebook
---

Code from Bruna Wundervald's satRday talk: [satRday-sp-talk](https://github.com/brunaw/satRday-sp-talk)

```{r}
library(tidyverse)
library(ranger)
# Load the Chicago dataset through the dials namespace and show its
# dimensions (rows, columns).
# NOTE(review): newer dials releases may no longer ship `Chicago` (it appears
# to have moved to the modeldata package) -- confirm against the installed
# version.
data <- dials::Chicago
dim(data)
```

```{r}
# Display the dataset.
data
```

```{r}
# Density plot of `ridership`, the column used as the response by the models
# fitted further down.
ggplot(data, aes(x = ridership)) +
  geom_density(fill = "#919c4c", alpha = 0.8) +
  xlab("Response Variable") +
  ylab("Density") +
  theme_classic()
```

Make many copies of the same dataset.

```{r}
# Nest ten identical copies of the dataset in a list-column, one copy per row,
# with `index` numbering the copies.
data_tibble <- enframe(
  rep(list(data), times = 10),
  name = "index",
  value = "data"
)
data_tibble
```

```{r}
# Show the list-column holding the dataset copies.
data_tibble$data
```

```{r}
#' Randomly split a data frame into train and test partitions
#'
#' Every row is assigned independently: it goes to `"test"` with probability
#' `test_prop` and to `"train"` otherwise, so partition sizes vary from call
#' to call. The split is done on a separate label vector, so no helper column
#' is added to (or has to be removed from) `data`.
#'
#' @param data A data frame or tibble.
#' @param test_prop Probability, in `[0, 1]`, that a row is assigned to the
#'   test partition. Defaults to `0.25`.
#' @return A named list of data frames with the same columns as `data`, one
#'   element per label that was actually drawn (`"test"` and/or `"train"`).
train_test <- function(data, test_prop = 0.25) {
  stopifnot(
    is.data.frame(data),
    is.numeric(test_prop),
    length(test_prop) == 1L,
    !is.na(test_prop),
    test_prop >= 0,
    test_prop <= 1
  )
  # One uniform draw per row; draws above the cut-off mark test rows.
  label <- ifelse(runif(nrow(data)) > 1 - test_prop, "test", "train")
  split(data, label)
}
# Print the splitter's definition in the notebook output.
train_test

# Attach one random train/test split to every dataset copy.
# `train_test()` is called inside a lambda on purpose: once the `train_test`
# column exists (e.g. when this chunk is re-run), a bare `train_test` inside
# mutate() would resolve to that list-column instead of the function.
data_tibble <- data_tibble %>%
  mutate(train_test = purrr::map(data, ~ train_test(.x)))
data_tibble
print(data_tibble, n = 3)
```

```{r}
#' Fit a ranger forest with impurity-based variable importance
#'
#' @param train Training data passed to `ranger::ranger()` as `data`.
#' @param mtry Number of candidate variables per split; `NULL` leaves the
#'   choice to ranger.
#' @param num.trees Number of trees. `NULL` falls back to 500, ranger's
#'   documented default, because `ranger()` rejects a non-numeric `num.trees`.
#' @param coef.reg Accepted so that the parameter lists built in this notebook
#'   can be spliced into the call, but not used by the fit.
#'   NOTE(review): possibly meant to drive a regularisation setting -- confirm
#'   the intent before wiring it in.
#' @param formula Model formula; by default `ridership` against all other
#'   columns.
#' @return The fitted object returned by `ranger::ranger()`.
modelling <- function(train,
                      mtry = NULL,
                      num.trees = NULL,
                      coef.reg = 1,
                      formula = ridership ~ .) {
  if (is.null(num.trees)) {
    num.trees <- 500
  }
  ranger::ranger(
    formula,
    data = train,
    num.trees = num.trees,
    mtry = mtry,
    importance = "impurity"
  )
}
```

```{r}
# Model specifications, one row per model: `model` holds the label and
# `parameters` the list of arguments for `modelling()`.
# `ncol(data) - 1` is the number of columns other than the response.
models <- enframe(
  list(
    tree = list(
      mtry = ncol(data) - 1, num.trees = 1, coef.reg = 1
    ),
    bagging = list(
      mtry = ncol(data) - 1, num.trees = 100, coef.reg = 1
    ),
    forest = list(
      mtry = sqrt(ncol(data) - 1), num.trees = 100, coef.reg = 1
    ),
    forest75 = list(
      mtry = sqrt(ncol(data) - 1), num.trees = 75, coef.reg = 1
    ),
    forest50 = list(
      mtry = sqrt(ncol(data) - 1), num.trees = 50, coef.reg = 1
    )
  ),
  name = "model",
  value = "parameters"
)
models
```


```{r}
# Pair every dataset copy (and its train/test split) with every model
# specification. Without this cross join there is no `model` column to sort
# by and no `parameters` column for the fitting step.
# expand_grid() is used because it neither sorts nor de-duplicates its
# inputs, which matters for the large list-columns held here.
data_tibble <- data_tibble %>%
  expand_grid(models) %>%
  arrange(model)
data_tibble
```

```{r}
# Fit one model per row.
# `full_parameters` is the row's parameter list with its training partition
# added as `train`; `train_model` is the result of calling `modelling()` with
# that list as its arguments. map() + do.call() replaces the deprecated
# purrr::invoke_map().
training_models <- data_tibble %>%
  mutate(
    full_parameters = map2(
      parameters,
      map(train_test, "train"),
      ~ list_modify(.x, train = .y)
    ),
    train_model = map(full_parameters, ~ do.call(modelling, .x))
  )
print(training_models, n = 5)
```


```{r}
#' Root mean squared error of a fitted model on held-out data
#'
#' @param model A fitted model; `predict(model, test)` must return an object
#'   with a `predictions` element.
#' @param test Data frame passed to `predict()`; its `ridership` column holds
#'   the observed values.
#' @return A single numeric value, the RMSE.
rmse <- function(model, test) {
  predicted <- predict(model, test)$predictions
  squared_errors <- (predicted - test$ridership)^2
  sqrt(mean(squared_errors))
}
#' Count the variables with strictly positive importance
#'
#' @param model An object with a numeric `variable.importance` element.
#' @return The number of entries of `variable.importance` greater than zero.
number_variables <- function(model) {
  importance <- model$variable.importance
  sum(importance > 0)
}
```

Results

```{r}
# Score every fitted model:
#   rmse             - error on the row's own test partition
#   number_variables - variables with positive importance
#   rsquared         - the `r.squared` element stored in the fitted object
results <- training_models %>%
  mutate(
    rmse = map2_dbl(
      train_model,
      map(train_test, "test"),
      function(fit, holdout) rmse(model = fit, test = holdout)
    ),
    number_variables = map_int(train_model, ~ number_variables(.x)),
    rsquared = map_dbl(train_model, ~ .x[["r.squared"]])
  )

results
```