Chapter 5: Classification using Decision Trees and Rules

Part 1: Decision Trees

Understanding Decision Trees

Calculate entropy of a two-class segment

-0.60 * log2(0.60) - 0.40 * log2(0.40)

curve(-x * log2(x) - (1 - x) * log2(1 - x),
      col = "red", xlab = "x", ylab = "Entropy", lwd = 4)

Example: Identifying Risky Bank Loans

Step 1: Download the data

# URL <- "http://cox.csueastbay.edu/~esuess/classes/Statistics_652/Presentations/05_DecisionTrees/credit.csv"
# download.file(URL, destfile = "credit.csv", method="curl")

Step 2: Exploring and preparing the data —-

credit <- read.csv("credit.csv", stringsAsFactors = TRUE)
str(credit)

Look at two characteristics of the applicant

table(credit$checking_balance)
table(credit$savings_balance)

Look at two characteristics of the loan

summary(credit$months_loan_duration)
summary(credit$amount)

Look at the class variable

table(credit$default)

Create a random sample for training and test data Use set.seed to use the same random number sequence as the tutorial

set.seed(123)
train_sample <- sample(1000, 900)

str(train_sample)

Split the data frames

credit_train <- credit[train_sample, ]
credit_test  <- credit[-train_sample, ]

Check the proportion of class variable

prop.table(table(credit_train$default))
prop.table(table(credit_test$default))

Step 3: Training a model on the data

Build the simplest decision tree

library(C50)
credit_model <- C5.0(credit_train[-17], credit_train$default)

Display simple facts about the tree

credit_model

Display detailed information about the tree

summary(credit_model)

Step 4: Evaluating model performance

Create a factor vector of predictions on test data

credit_pred <- predict(credit_model, credit_test)

Cross tabulation of predicted versus actual classes

library(gmodels)
CrossTable(credit_test$default, credit_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

Step 5: Improving model performance

Boosting the accuracy of decision trees

Boosted decision tree with 10 trials

credit_boost10 <- C5.0(credit_train[-17], credit_train$default,
                       trials = 10)
credit_boost10
summary(credit_boost10)
credit_boost_pred10 <- predict(credit_boost10, credit_test)
CrossTable(credit_test$default, credit_boost_pred10,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

Making some mistakes more costly than others

Create dimensions for a cost matrix

matrix_dimensions <- list(c("no", "yes"), c("no", "yes"))
names(matrix_dimensions) <- c("predicted", "actual")
matrix_dimensions

Build the matrix

error_cost <- matrix(c(0, 1, 4, 0), nrow = 2, dimnames = matrix_dimensions)
error_cost

Apply the cost matrix to the tree


credit_cost <- C5.0(credit_train[-17], credit_train$default,
                    costs = error_cost)
credit_cost_pred <- predict(credit_cost, credit_test)

CrossTable(credit_test$default, credit_cost_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

Part 2: Rule Learners

Example: Identifying Poisonous Mushrooms

Step 1: Download the data

# URL <- "http://cox.csueastbay.edu/~esuess/classes/Statistics_652/Presentations/05_DecisionTrees/mushrooms.csv"
# download.file(URL, destfile = "./mushrooms.csv", method="curl")

Step 2: Exploring and preparing the data

mushrooms <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)

Examine the structure of the data frame

str(mushrooms)

drop the veil_type feature

mushrooms$veil_type <- NULL

examine the class distribution

table(mushrooms$type)

Randomize the Train and Test data

set.seed(123)
train_sample <- sample(8124, 7000)

str(train_sample)

Split the data frames

mushrooms_train <- mushrooms[train_sample, ]
mushrooms_test  <- mushrooms[-train_sample, ]

Step 3: Training a model on the data

library(RWeka)

train OneR() on the data

mushroom_1R <- OneR(type ~ ., data = mushrooms_train)

Step 4: Evaluating model performance

mushroom_1R
summary(mushroom_1R)

Make predictions

mushroom_pred <- predict(mushroom_1R, mushrooms_test)

Cross tabulation of predicted versus actual classes

library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

Step 5: Improving model performance

mushroom_JRip <- JRip(type ~ ., data = mushrooms_train)
mushroom_JRip
summary(mushroom_JRip)

Make predictions

mushroom_pred <- predict(mushroom_JRip, mushrooms_test)

Cross tabulation of predicted versus actual classes

library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

Rule Learner Using C5.0 Decision Trees (not in text)

library(C50)
mushroom_c5rules <- C5.0(type ~ odor + gill_size, data = mushrooms_train, rules = TRUE)
mushroom_c5rules
summary(mushroom_c5rules)
mushroom_pred <- predict(mushroom_c5rules, mushrooms_test)

Cross tabulation of predicted versus actual classes

library(gmodels)
CrossTable(mushrooms_test$type, mushroom_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))
LS0tCnRpdGxlOiAiQ2hhcHRlciA1IC0gUiBOb3RlYm9vayIKb3V0cHV0OgogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKICBwZGZfZG9jdW1lbnQ6IGRlZmF1bHQKICBodG1sX25vdGVib29rOiBkZWZhdWx0Ci0tLQoKIyBDaGFwdGVyIDU6IENsYXNzaWZpY2F0aW9uIHVzaW5nIERlY2lzaW9uIFRyZWVzIGFuZCBSdWxlcyAKCiMjIFBhcnQgMTogRGVjaXNpb24gVHJlZXMgCgojIyBVbmRlcnN0YW5kaW5nIERlY2lzaW9uIFRyZWVzIAoKQ2FsY3VsYXRlIGVudHJvcHkgb2YgYSB0d28tY2xhc3Mgc2VnbWVudAoKYGBge3J9Ci0wLjYwICogbG9nMigwLjYwKSAtIDAuNDAgKiBsb2cyKDAuNDApCgpjdXJ2ZSgteCAqIGxvZzIoeCkgLSAoMSAtIHgpICogbG9nMigxIC0geCksCiAgICAgIGNvbCA9ICJyZWQiLCB4bGFiID0gIngiLCB5bGFiID0gIkVudHJvcHkiLCBsd2QgPSA0KQpgYGAKCiMjIEV4YW1wbGU6IElkZW50aWZ5aW5nIFJpc2t5IEJhbmsgTG9hbnMgCiMjIFN0ZXAgMTogRG93bmxvYWQgdGhlIGRhdGEgCgpgYGB7cn0KIyBVUkwgPC0gImh0dHA6Ly9jb3guY3N1ZWFzdGJheS5lZHUvfmVzdWVzcy9jbGFzc2VzL1N0YXRpc3RpY3NfNjUyL1ByZXNlbnRhdGlvbnMvMDVfRGVjaXNpb25UcmVlcy9jcmVkaXQuY3N2IgojIGRvd25sb2FkLmZpbGUoVVJMLCBkZXN0ZmlsZSA9ICJjcmVkaXQuY3N2IiwgbWV0aG9kPSJjdXJsIikKYGBgCgojIyBTdGVwIDI6IEV4cGxvcmluZyBhbmQgcHJlcGFyaW5nIHRoZSBkYXRhIC0tLS0KCmBgYHtyfQpjcmVkaXQgPC0gcmVhZC5jc3YoImNyZWRpdC5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gVFJVRSkKc3RyKGNyZWRpdCkKYGBgCgpMb29rIGF0IHR3byBjaGFyYWN0ZXJpc3RpY3Mgb2YgdGhlIGFwcGxpY2FudAoKYGBge3J9CnRhYmxlKGNyZWRpdCRjaGVja2luZ19iYWxhbmNlKQp0YWJsZShjcmVkaXQkc2F2aW5nc19iYWxhbmNlKQoKYGBgCgpMb29rIGF0IHR3byBjaGFyYWN0ZXJpc3RpY3Mgb2YgdGhlIGxvYW4KCmBgYHtyfQpzdW1tYXJ5KGNyZWRpdCRtb250aHNfbG9hbl9kdXJhdGlvbikKc3VtbWFyeShjcmVkaXQkYW1vdW50KQpgYGAKCkxvb2sgYXQgdGhlIGNsYXNzIHZhcmlhYmxlCgpgYGB7cn0KdGFibGUoY3JlZGl0JGRlZmF1bHQpCmBgYAoKQ3JlYXRlIGEgcmFuZG9tIHNhbXBsZSBmb3IgdHJhaW5pbmcgYW5kIHRlc3QgZGF0YQpVc2Ugc2V0LnNlZWQgdG8gdXNlIHRoZSBzYW1lIHJhbmRvbSBudW1iZXIgc2VxdWVuY2UgYXMgdGhlIHR1dG9yaWFsCgpgYGB7cn0Kc2V0LnNlZWQoMTIzKQp0cmFpbl9zYW1wbGUgPC0gc2FtcGxlKDEwMDAsIDkwMCkKCnN0cih0cmFpbl9zYW1wbGUpCmBgYAoKU3BsaXQgdGhlIGRhdGEgZnJhbWVzCgpgYGB7cn0KY3JlZGl0X3RyYWluIDwtIGNyZWRpdFt0cmFpbl9zYW1wbGUsIF0KY3JlZGl0X3Rlc3QgIDwtIGNyZWRpdFstdHJhaW5fc2FtcGxlLCBdCmBgYAoKCkNoZWNrIHRoZSBwcm9wb3J0aW9uIG9mIGNsYXNzIHZhcmlhYmxlCgpgYGB7cn0KcHJvcC50YWJsZSh0YWJsZShjcmVkaXRfdHJhaW4kZGVmYXVsdCkpCnByb3AudGFibGUodGFibGUoY3JlZGl0X3Rlc3QkZGVmYXVsdCkpCmBgYAoKCiMjIFN0ZXAgMzogVHJhaW5pbmcgYSBtb2RlbCBvbiB0aGUgZGF0YSAKCkJ1aWxkIHRoZSBzaW1wbGVzdCBkZWNpc2lvbiB0cmVlCgpgYGB7cn0KbGlicmFyeShDNTApCmNyZWRpdF9tb2RlbCA8LSBDNS4wKGNyZWRpdF90cmFpblstMTddLCBjcmVkaXRfdHJhaW4kZGVmYXVsdCkKYGBgCgpEaXNwbGF5IHNpbXBsZSBmYWN0cyBhYm91dCB0aGUgdHJlZQoKYGBge3J9CmNyZWRpdF9tb2RlbAoKYGBgCgpEaXNwbGF5IGRldGFpbGVkIGluZm9ybWF0aW9uIGFib3V0IHRoZSB0cmVlCgpgYGB7cn0Kc3VtbWFyeShjcmVkaXRfbW9kZWwpCmBgYAoKIyMgU3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlIAoKQ3JlYXRlIGEgZmFjdG9yIHZlY3RvciBvZiBwcmVkaWN0aW9ucyBvbiB0ZXN0IGRhdGEKCmBgYHtyfQpjcmVkaXRfcHJlZCA8LSBwcmVkaWN0KGNyZWRpdF9tb2RlbCwgY3JlZGl0X3Rlc3QpCmBgYAoKQ3Jvc3MgdGFidWxhdGlvbiBvZiBwcmVkaWN0ZWQgdmVyc3VzIGFjdHVhbCBjbGFzc2VzCgpgYGB7cn0KbGlicmFyeShnbW9kZWxzKQpDcm9zc1RhYmxlKGNyZWRpdF90ZXN0JGRlZmF1bHQsIGNyZWRpdF9wcmVkLAogICAgICAgICAgIHByb3AuY2hpc3EgPSBGQUxTRSwgcHJvcC5jID0gRkFMU0UsIHByb3AuciA9IEZBTFNFLAogICAgICAgICAgIGRubiA9IGMoJ2FjdHVhbCBkZWZhdWx0JywgJ3ByZWRpY3RlZCBkZWZhdWx0JykpCmBgYAoKIyMgU3RlcCA1OiBJbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgojIyBCb29zdGluZyB0aGUgYWNjdXJhY3kgb2YgZGVjaXNpb24gdHJlZXMKCkJvb3N0ZWQgZGVjaXNpb24gdHJlZSB3aXRoIDEwIHRyaWFscwoKYGBge3J9CmNyZWRpdF9ib29zdDEwIDwtIEM1LjAoY3JlZGl0X3RyYWluWy0xN10sIGNyZWRpdF90cmFpbiRkZWZhdWx0LAogICAgICAgICAgICAgICAgICAgICAgIHRyaWFscyA9IDEwKQpjcmVkaXRfYm9vc3QxMApzdW1tYXJ5KGNyZWRpdF9ib29zdDEwKQpgYGAKCmBgYHtyfQpjcmVkaXRfYm9vc3RfcHJlZDEwIDwtIHByZWRpY3QoY3JlZGl0X2Jvb3N0MTAsIGNyZWRpdF90ZXN0KQpDcm9zc1RhYmxlKGNyZWRpdF90ZXN0JGRlZmF1bHQsIGNyZWRpdF9ib29zdF9wcmVkMTAsCiAgICAgICAgICAgcHJvcC5jaGlzcSA9IEZBTFNFLCBwcm9wLmMgPSBGQUxTRSwgcHJvcC5yID0gRkFMU0UsCiAgICAgICAgICAgZG5uID0gYygnYWN0dWFsIGRlZmF1bHQnLCAncHJlZGljdGVkIGRlZmF1bHQnKSkKYGBgCgojIyBNYWtpbmcgc29tZSBtaXN0YWtlcyBtb3JlIGNvc3RseSB0aGFuIG90aGVycwoKQ3JlYXRlIGRpbWVuc2lvbnMgZm9yIGEgY29zdCBtYXRyaXgKCmBgYHtyfQptYXRyaXhfZGltZW5zaW9ucyA8LSBsaXN0KGMoIm5vIiwgInllcyIpLCBjKCJubyIsICJ5ZXMiKSkKbmFtZXMobWF0cml4X2RpbWVuc2lvbnMpIDwtIGMoInByZWRpY3RlZCIsICJhY3R1YWwiKQptYXRyaXhfZGltZW5zaW9ucwpgYGAKCkJ1aWxkIHRoZSBtYXRyaXgKCmBgYHtyfQplcnJvcl9jb3N0IDwtIG1hdHJpeChjKDAsIDEsIDQsIDApLCBucm93ID0gMiwgZGltbmFtZXMgPSBtYXRyaXhfZGltZW5zaW9ucykKZXJyb3JfY29zdApgYGAKCkFwcGx5IHRoZSBjb3N0IG1hdHJpeCB0byB0aGUgdHJlZQoKYGBge3J9CgpjcmVkaXRfY29zdCA8LSBDNS4wKGNyZWRpdF90cmFpblstMTddLCBjcmVkaXRfdHJhaW4kZGVmYXVsdCwKICAgICAgICAgICAgICAgICAgICBjb3N0cyA9IGVycm9yX2Nvc3QpCmNyZWRpdF9jb3N0X3ByZWQgPC0gcHJlZGljdChjcmVkaXRfY29zdCwgY3JlZGl0X3Rlc3QpCgpDcm9zc1RhYmxlKGNyZWRpdF90ZXN0JGRlZmF1bHQsIGNyZWRpdF9jb3N0X3ByZWQsCiAgICAgICAgICAgcHJvcC5jaGlzcSA9IEZBTFNFLCBwcm9wLmMgPSBGQUxTRSwgcHJvcC5yID0gRkFMU0UsCiAgICAgICAgICAgZG5uID0gYygnYWN0dWFsIGRlZmF1bHQnLCAncHJlZGljdGVkIGRlZmF1bHQnKSkKYGBgCgoKIyBQYXJ0IDI6IFJ1bGUgTGVhcm5lcnMgCgojIyBFeGFtcGxlOiBJZGVudGlmeWluZyBQb2lzb25vdXMgTXVzaHJvb21zIAojIyBTdGVwIDE6IERvd25sb2FkIHRoZSBkYXRhIAoKYGBge3J9CiMgVVJMIDwtICJodHRwOi8vY294LmNzdWVhc3RiYXkuZWR1L35lc3Vlc3MvY2xhc3Nlcy9TdGF0aXN0aWNzXzY1Mi9QcmVzZW50YXRpb25zLzA1X0RlY2lzaW9uVHJlZXMvbXVzaHJvb21zLmNzdiIKIyBkb3dubG9hZC5maWxlKFVSTCwgZGVzdGZpbGUgPSAiLi9tdXNocm9vbXMuY3N2IiwgbWV0aG9kPSJjdXJsIikKYGBgCgojIyBTdGVwIDI6IEV4cGxvcmluZyBhbmQgcHJlcGFyaW5nIHRoZSBkYXRhIAoKYGBge3J9Cm11c2hyb29tcyA8LSByZWFkLmNzdigibXVzaHJvb21zLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBUUlVFKQpgYGAKCkV4YW1pbmUgdGhlIHN0cnVjdHVyZSBvZiB0aGUgZGF0YSBmcmFtZQoKYGBge3J9CnN0cihtdXNocm9vbXMpCmBgYAoKCiMgZHJvcCB0aGUgdmVpbF90eXBlIGZlYXR1cmUKCmBgYHtyfQptdXNocm9vbXMkdmVpbF90eXBlIDwtIE5VTEwKYGBgCgojIGV4YW1pbmUgdGhlIGNsYXNzIGRpc3RyaWJ1dGlvbgoKYGBge3J9CnRhYmxlKG11c2hyb29tcyR0eXBlKQpgYGAKClJhbmRvbWl6ZSB0aGUgVHJhaW4gYW5kIFRlc3QgZGF0YQoKYGBge3J9CnNldC5zZWVkKDEyMykKdHJhaW5fc2FtcGxlIDwtIHNhbXBsZSg4MTI0LCA3MDAwKQoKc3RyKHRyYWluX3NhbXBsZSkKYGBgCgpTcGxpdCB0aGUgZGF0YSBmcmFtZXMKCmBgYHtyfQptdXNocm9vbXNfdHJhaW4gPC0gbXVzaHJvb21zW3RyYWluX3NhbXBsZSwgXQptdXNocm9vbXNfdGVzdCAgPC0gbXVzaHJvb21zWy10cmFpbl9zYW1wbGUsIF0KYGBgCgojIyBTdGVwIDM6IFRyYWluaW5nIGEgbW9kZWwgb24gdGhlIGRhdGEgCgpgYGB7cn0KbGlicmFyeShSV2VrYSkKYGBgCgojIHRyYWluIE9uZVIoKSBvbiB0aGUgZGF0YQoKYGBge3J9Cm11c2hyb29tXzFSIDwtIE9uZVIodHlwZSB+IC4sIGRhdGEgPSBtdXNocm9vbXNfdHJhaW4pCmBgYAoKIyMgU3RlcCA0OiBFdmFsdWF0aW5nIG1vZGVsIHBlcmZvcm1hbmNlIAoKYGBge3J9Cm11c2hyb29tXzFSCnN1bW1hcnkobXVzaHJvb21fMVIpCmBgYAoKTWFrZSBwcmVkaWN0aW9ucwoKYGBge3J9Cm11c2hyb29tX3ByZWQgPC0gcHJlZGljdChtdXNocm9vbV8xUiwgbXVzaHJvb21zX3Rlc3QpCgpgYGAKCkNyb3NzIHRhYnVsYXRpb24gb2YgcHJlZGljdGVkIHZlcnN1cyBhY3R1YWwgY2xhc3NlcwoKYGBge3J9CmxpYnJhcnkoZ21vZGVscykKQ3Jvc3NUYWJsZShtdXNocm9vbXNfdGVzdCR0eXBlLCBtdXNocm9vbV9wcmVkLAogICAgICAgICAgIHByb3AuY2hpc3EgPSBGQUxTRSwgcHJvcC5jID0gRkFMU0UsIHByb3AuciA9IEZBTFNFLAogICAgICAgICAgIGRubiA9IGMoJ2FjdHVhbCBkZWZhdWx0JywgJ3ByZWRpY3RlZCBkZWZhdWx0JykpCmBgYAoKCgoKIyMgU3RlcCA1OiBJbXByb3ZpbmcgbW9kZWwgcGVyZm9ybWFuY2UgCgpgYGB7cn0KbXVzaHJvb21fSlJpcCA8LSBKUmlwKHR5cGUgfiAuLCBkYXRhID0gbXVzaHJvb21zX3RyYWluKQptdXNocm9vbV9KUmlwCnN1bW1hcnkobXVzaHJvb21fSlJpcCkKYGBgCgpNYWtlIHByZWRpY3Rpb25zCgpgYGB7cn0KbXVzaHJvb21fcHJlZCA8LSBwcmVkaWN0KG11c2hyb29tX0pSaXAsIG11c2hyb29tc190ZXN0KQpgYGAKCgpDcm9zcyB0YWJ1bGF0aW9uIG9mIHByZWRpY3RlZCB2ZXJzdXMgYWN0dWFsIGNsYXNzZXMKCmBgYHtyfQpsaWJyYXJ5KGdtb2RlbHMpCkNyb3NzVGFibGUobXVzaHJvb21zX3Rlc3QkdHlwZSwgbXVzaHJvb21fcHJlZCwKICAgICAgICAgICBwcm9wLmNoaXNxID0gRkFMU0UsIHByb3AuYyA9IEZBTFNFLCBwcm9wLnIgPSBGQUxTRSwKICAgICAgICAgICBkbm4gPSBjKCdhY3R1YWwgZGVmYXVsdCcsICdwcmVkaWN0ZWQgZGVmYXVsdCcpKQpgYGAKCiMgUnVsZSBMZWFybmVyIFVzaW5nIEM1LjAgRGVjaXNpb24gVHJlZXMgKG5vdCBpbiB0ZXh0KQoKYGBge3J9CmxpYnJhcnkoQzUwKQptdXNocm9vbV9jNXJ1bGVzIDwtIEM1LjAodHlwZSB+IG9kb3IgKyBnaWxsX3NpemUsIGRhdGEgPSBtdXNocm9vbXNfdHJhaW4sIHJ1bGVzID0gVFJVRSkKbXVzaHJvb21fYzVydWxlcwpzdW1tYXJ5KG11c2hyb29tX2M1cnVsZXMpCmBgYAoKCmBgYHtyfQptdXNocm9vbV9wcmVkIDwtIHByZWRpY3QobXVzaHJvb21fYzVydWxlcywgbXVzaHJvb21zX3Rlc3QpCmBgYAoKQ3Jvc3MgdGFidWxhdGlvbiBvZiBwcmVkaWN0ZWQgdmVyc3VzIGFjdHVhbCBjbGFzc2VzCgpgYGB7cn0KbGlicmFyeShnbW9kZWxzKQpDcm9zc1RhYmxlKG11c2hyb29tc190ZXN0JHR5cGUsIG11c2hyb29tX3ByZWQsCiAgICAgICAgICAgcHJvcC5jaGlzcSA9IEZBTFNFLCBwcm9wLmMgPSBGQUxTRSwgcHJvcC5yID0gRkFMU0UsCiAgICAgICAgICAgZG5uID0gYygnYWN0dWFsIGRlZmF1bHQnLCAncHJlZGljdGVkIGRlZmF1bHQnKSkKYGBgCgoKCgoKCgoK