##### Chapter 6: Regression Methods ------------------- #### Part 3: Logistic Regression ------------------- ## A nice reference for fitting logistic regression in R ## http://www.r-bloggers.com/how-to-perform-a-logistic-regression-in-r/ ## A nice reference for generalized linear models in R ## http://www.statmethods.net/advstats/glm.html ## Understanding regression ---- ## Example: Space Shuttle Launch Data ---- launch <- read.csv("challenger.csv") # examine the launch data str(launch) # logisitic regression # First recode the distress_ct variable into 0 and 1, making 1 to represent at least # one failure during a launch. launch$distress_ct <- ifelse(launch$distress_ct<1,0,1) launch$distress_ct # set up trainning and test data sets indx <- sample(1:nrow(launch), as.integer(0.9*nrow(launch))) indx launch_train <- launch[indx,] launch_test <- launch[-indx,] # Check if there are any missing values library(Amelia) missmap(launch, main = "Missing values vs observed") # number of missing values in each column sapply(launch,function(x) sum(is.na(x))) # number of unique values in each column sapply(launch, function(x) length(unique(x))) # fit the logistic regression model, with all predictor variables model <- glm(distress_ct ~.,family=binomial(link='logit'),data=launch_train) model summary(model) anova(model, test="Chisq") # drop the insignificant predictors, alpha = 0.10 model <- glm(distress_ct ~ temperature,family=binomial(link='logit'),data=launch_train) model summary(model) anova(model, test="Chisq") # Show the fitted model on the plot # nice package to plot the fitted logistic regression library(popbio) with(launch_train,logi.hist.plot(temperature,distress_ct,boxp=FALSE,col="gray", ylabel = "Probability", xlabel = "Tempurature")) # adding boxplots with(launch_train,logi.hist.plot(temperature,distress_ct,boxp=TRUE,col="gray",notch=FALSE, ylabel = "Probability", xlabel = "Tempurature")) # check Accuracy fitted.results <- predict(model,newdata=launch_test,type='response') fitted.results <- ifelse(fitted.results > 0.5,1,0) misClasificError <- mean(fitted.results != launch_test$distress_ct) print(paste('Accuracy',1-misClasificError)) # ROC # Because this data set is so small, it is possible that the test data set # does not contain both 0 and 1 values. If this happens the code will not # run. And since the test data set is so small the ROC is not useful here # but the code is provided. library(ROCR) p <- predict(model, newdata=launch_test, type="response") pr <- prediction(p, launch_test$distress_ct) prf <- performance(pr, measure = "tpr", x.measure = "fpr") plot(prf) auc <- performance(pr, measure = "auc") auc <- auc@y.values[[1]] auc ## Example: Credit Data ---- credit <- read.csv("credit.csv") ## Fix the default variable to be 0 or 1 credit$default <- as.numeric(credit$default) credit$default <- credit$default - 1 # examine the launch data str(credit) # the distribution of defaults plot(credit$default) # summary statistics of the credit data summary(credit) # logisitic regression # set up trainning and test data sets indx <- sample(1:nrow(credit), as.integer(0.9*nrow(credit))) indx credit_train <- credit[indx,] credit_test <- credit[-indx,] # Check if there are any missing values library(Amelia) missmap(credit, main = "Missing values vs observed") # number of missing values in each column sapply(credit,function(x) sum(is.na(x))) # number of unique values in each column sapply(credit, function(x) length(unique(x))) # fit the logistic regression model, with all predictor variables model <- glm(default ~.,family=binomial(link='logit'),data=credit_train) model summary(model) anova(model, test="Chisq") # drop the insignificant predictors, alpha = 0.10 model <- glm(default ~ checking_balance + months_loan_duration + credit_history + percent_of_income + age,family=binomial(link='logit'),data=credit_train) model summary(model) anova(model, test="Chisq") # check Accuracy fitted.results <- predict(model,newdata=credit_test,type='response') fitted.results <- ifelse(fitted.results > 0.5,1,0) misClasificError <- mean(fitted.results != credit_test$default) print(paste('Accuracy',1-misClasificError)) # ROC library(ROCR) p <- predict(model, newdata=credit_test, type="response") pr <- prediction(p, credit_test$default) prf.glm <- performance(pr, measure = "tpr", x.measure = "fpr") plot(prf.glm) auc <- performance(pr, measure = "auc") auc.glm <- auc@y.values[[1]] auc.glm #### Part 4: Classification Trees ------------------ # Now try building a Classification Tree to compare with Logistic Regression # examine the credit data str(credit) # summary statistics of the credit data summary(credit) ## Step 3: Training a model on the data ---- # regression tree using rpart library(rpart) m.rpart <- rpart(default ~ ., data = credit_train) # get basic information about the tree m.rpart # get more detailed information about the tree summary(m.rpart) # use the rpart.plot package to create a visualization library(rpart.plot) # a basic decision tree diagram rpart.plot(m.rpart, digits = 3) # a few adjustments to the diagram rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101) ## Step 4: Evaluate model performance ---- # generate predictions for the testing dataset p.rpart <- predict(m.rpart, credit_test) # p.rpart # compare the distribution of predicted values vs. actual values summary(p.rpart) summary(credit_test$default) # ROC library(ROCR) p <- predict(m.rpart, newdata=credit_test) pr <- prediction(p, credit_test$default) prf.c50 <- performance(pr, measure = "tpr", x.measure = "fpr") plot(prf.c50) auc <- performance(pr, measure = "auc") auc.c50 <- auc@y.values[[1]] auc.c50 # Compare ROC curves plot(prf.glm, col="purple") par(new=TRUE) plot(prf.c50, col="orange") segments(x0=0, # Value from x (initial) x1=1, # Value to x (final) y0=0, # Value from y (initial) y1=1, # Value to y (final) col='red') # GLM seems to be a bit more accurate, most of the time auc.glm auc.c50