##### Chapter 11: Improving Model Performance -------------------

# load the credit dataset
# stringsAsFactors = TRUE is needed in R >= 4.0 so that default (and the other
# character columns) are read as factors, which train() and C5.0() require
credit <- read.csv("http://cox.csueastbay.edu/~esuess/classes/Statistics_452/Presentations/ml10a/credit.csv",
                   stringsAsFactors = TRUE)

# Note: no test data are used in these examples, so out-of-sample accuracy is
# not computed. (A hold-out sketch appears after the default random forest
# example below.)

# Here is the link to the caret documentation:
# http://topepo.github.io/caret/index.html

library(caret)
library(tictoc)   # a nice package for measuring run times in R

## Creating a simple tuned model ----

# automated parameter tuning of a C5.0 decision tree
set.seed(300)
m <- train(default ~ ., data = credit, method = "C5.0")

# summary of tuning results
m

# apply the best C5.0 candidate model to make predictions
p <- predict(m, credit)
confusionMatrix(data = p, credit$default)

# obtain predicted classes
head(predict(m, credit, type = "raw"))

# obtain predicted probabilities
head(predict(m, credit, type = "prob"))

## Customizing the tuning process ----

# use trainControl() to alter the resampling strategy; "oneSE" selects the
# simplest candidate within one standard error of the best performance
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                     selectionFunction = "oneSE", returnResamp = "all")

# use expand.grid() to create a grid of tuning parameters
grid <- expand.grid(.model = "tree",
                    .trials = c(1, 5, 10, 15, 20, 25, 30, 35),
                    .winnow = c(TRUE, FALSE))

# look at the result of expand.grid()
grid

# customize train() with the control list and grid of parameters
tic()
set.seed(300)
m <- train(default ~ ., data = credit, method = "C5.0",
           metric = "Kappa",
           trControl = ctrl,
           tuneGrid = grid,
           verbose = FALSE)
toc()
m

# visualize the resample distributions
xyplot(m, type = c("g", "p", "smooth"))

# run in parallel; the doMC package runs on Mac and Linux
library(doMC)
registerDoMC(cores = 4)

# on Windows, use the doParallel package instead
library(doParallel)
registerDoParallel(cores = 4)

## All subsequent models are then run in parallel
tic()
set.seed(300)
m <- train(default ~ ., data = credit, method = "C5.0",
           metric = "Kappa",
           trControl = ctrl,
           tuneGrid = grid,
           verbose = FALSE)
toc()
m

# visualize the resample distributions
xyplot(m, type = c("g", "p", "smooth"))

## Bagging ----

# using ipred bagged decision trees
library(ipred)

tic()
set.seed(300)
mybag <- bagging(default ~ ., data = credit, nbagg = 25)
toc()

# predict() on a classbagg object returns a factor of predicted classes,
# so tabulate it against the actual values
credit_pred <- predict(mybag, credit)
table(credit_pred, credit$default)

# estimate performance of ipred bagged trees
tic()
set.seed(300)
ctrl <- trainControl(method = "cv", number = 10)
mybag_treebag <- train(default ~ ., data = credit, method = "treebag",
                       trControl = ctrl)
toc()

credit_pred <- predict(mybag_treebag, credit)
credit_pred
confusionMatrix(data = credit_pred, credit$default)

## Boosting ----

## Using C5.0 decision trees (not shown in book)
library(C50)

tic()
m_c50_bst <- C5.0(default ~ ., data = credit, trials = 100)
toc()

## Using AdaBoost.M1
library(adabag)

# create an AdaBoost.M1 model
tic()
set.seed(300)
m_adaboost <- boosting(default ~ ., data = credit)
toc()

p_adaboost <- predict(m_adaboost, credit)
head(p_adaboost$class)
p_adaboost$confusion

# create and evaluate an AdaBoost.M1 model using 10-fold CV
tic()
set.seed(300)
adaboost_cv <- boosting.cv(default ~ ., data = credit)
toc()

adaboost_cv$confusion

# calculate kappa
library(vcd)
Kappa(adaboost_cv$confusion)

## Random Forests ----

# random forest with default settings
library(randomForest)

tic()
set.seed(300)
rf <- randomForest(default ~ ., data = credit)
toc()
rf

credit_pred <- predict(rf, credit)
confusionMatrix(data = credit_pred, credit$default)
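
## A minimal hold-out sketch (not part of the original examples; the 75/25
## split is an arbitrary choice). The confusion matrices above are computed on
## the training data and are therefore optimistic; scoring a held-out test set
## with caret's createDataPartition() gives the out-of-sample accuracy
## mentioned at the top of this script.
set.seed(300)
in_train <- createDataPartition(credit$default, p = 0.75, list = FALSE)
rf_holdout <- randomForest(default ~ ., data = credit[in_train, ])
p_test <- predict(rf_holdout, credit[-in_train, ])
confusionMatrix(data = p_test, credit$default[-in_train])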
library(caret)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# auto-tune a random forest
grid_rf <- expand.grid(.mtry = c(2, 4, 8, 16))

tic()
set.seed(300)
m_rf <- train(default ~ ., data = credit, method = "rf",
              metric = "Kappa",
              trControl = ctrl,
              tuneGrid = grid_rf)
toc()
m_rf

credit_pred <- predict(m_rf, credit)
confusionMatrix(data = credit_pred, credit$default)

# auto-tune a boosted C5.0 decision tree
grid_c50 <- expand.grid(.model = "tree",
                        .trials = c(10, 20, 30, 40),
                        .winnow = FALSE)

tic()
set.seed(300)
m_c50 <- train(default ~ ., data = credit, method = "C5.0",
               metric = "Kappa",
               trControl = ctrl,
               tuneGrid = grid_c50)
toc()
m_c50

credit_pred <- predict(m_c50, credit)
confusionMatrix(data = credit_pred, credit$default)

###########################################################################
## Random Forests with ranger ----

library(ranger)

tic()
set.seed(300)
m_rf_ranger <- ranger(default ~ ., data = credit, num.threads = 8)
toc()
m_rf_ranger

# out-of-bag confusion matrix
m_rf_ranger$confusion.matrix

###########################################################################
# Example of running a random forest on a larger dataset.
# The ranger package runs in parallel.
# The dataset used is the Bank Marketing Data Set from the UCI ML Repository.
# https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

tic()
bank <- read.csv("bank-additional-full-2.csv", header = TRUE,
                 stringsAsFactors = TRUE)
toc()

tic()
set.seed(300)
m_rf_ranger <- ranger(y ~ ., data = bank, num.threads = 8)
toc()
m_rf_ranger

m_rf_ranger$predictions        # out-of-bag class predictions
m_rf_ranger$prediction.error   # out-of-bag misclassification rate
m_rf_ranger$confusion.matrix

###########################################################################
# Example of running a random forest on a larger dataset with a numeric
# target variable, so there is no confusion matrix.
# The ranger package runs in parallel.
# The dataset used is the Online News Popularity dataset from the UCI ML Repository.
# https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity

tic()
popularity <- read.csv("OnlineNewsPopularity.csv", header = TRUE)
toc()

tic()
set.seed(300)
# drop the first two columns (url and timedelta), which are not predictors
m_rf_ranger <- ranger(shares ~ ., data = popularity[-c(1, 2)], num.threads = 8)
toc()
m_rf_ranger

m_rf_ranger$predictions        # out-of-bag predictions
cor(m_rf_ranger$predictions, popularity$shares)
plot(m_rf_ranger$predictions, popularity$shares)
m_rf_ranger$prediction.error   # out-of-bag MSE; very large for this target
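
## A minimal out-of-sample sketch (not part of the original examples; the
## 70/30 split is an arbitrary choice). predict() on a fitted ranger object
## scores new rows, so the OOB-based results above can be cross-checked on
## held-out data.
set.seed(300)
in_train <- sample(nrow(popularity), size = floor(0.7 * nrow(popularity)))
m_holdout <- ranger(shares ~ ., data = popularity[in_train, -c(1, 2)],
                    num.threads = 8)
p_holdout <- predict(m_holdout, data = popularity[-in_train, -c(1, 2)])

# correlation between predicted and actual shares on unseen rows
cor(p_holdout$predictions, popularity$shares[-in_train])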