##### Chapter 3: Classification using Nearest Neighbors --------------------

## Example: Classifying Cancer Samples ----

## Step 2: Exploring and preparing the data ----

# import the CSV file
wbcd <- read.csv("wisc_bc_data.csv", stringsAsFactors = FALSE)

# examine the structure of the wbcd data frame
str(wbcd)

# drop the id feature
wbcd <- wbcd[-1]

# table of diagnosis
table(wbcd$diagnosis)

# recode diagnosis as a factor
wbcd$diagnosis <- factor(wbcd$diagnosis, levels = c("B", "M"),
                         labels = c("Benign", "Malignant"))

# table of proportions with more informative labels
round(prop.table(table(wbcd$diagnosis)) * 100, digits = 1)

# summarize three numeric features
summary(wbcd[c("radius_mean", "area_mean", "smoothness_mean")])

# create normalization function
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

# test normalization function - results should be identical
normalize(c(1, 2, 3, 4, 5))
normalize(c(10, 20, 30, 40, 50))

# normalize the wbcd data
wbcd_n <- as.data.frame(lapply(wbcd[2:31], normalize))

# confirm that normalization worked
summary(wbcd_n$area_mean)

# create training and test data
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]

# create labels for training and test data
wbcd_train_labels <- wbcd[1:469, 1]
wbcd_test_labels <- wbcd[470:569, 1]

# visualize the data using labels
plot(wbcd$radius_mean, wbcd$texture_mean, main = "Scatterplot",
     xlab = "radius mean", ylab = "texture mean")

pairs(~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean,
      data = wbcd, main = "Scatterplot of many variables")

library(car)
scatterplot(texture_mean ~ radius_mean | diagnosis, data = wbcd,
            main = "Scatterplot", xlab = "radius mean", ylab = "texture mean")

scatterplotMatrix(~ radius_mean + texture_mean + perimeter_mean + area_mean + smoothness_mean | diagnosis,
                  data = wbcd)

## Step 3: Training a model on the data ----

# load the "class" library
library(class)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
                      cl = wbcd_train_labels, k = 21)

head(wbcd_test)
head(wbcd_test_pred)

## Step 4: Evaluating model performance ----

# load the "gmodels" library
library(gmodels)

# create the cross tabulation of predicted vs. actual
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)
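# Alongside the full cross table, the headline numbers can be pulled out
# directly. A minimal sketch, not part of the original walkthrough; it
# assumes wbcd_test_pred and wbcd_test_labels as defined above.
mean(wbcd_test_pred == wbcd_test_labels)   # proportion of correct predictions
sum(wbcd_test_pred != wbcd_test_labels)    # count of misclassified test cases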
## Step 5: Improving model performance ----

# use the scale() function to z-score standardize a data frame
wbcd_z <- as.data.frame(scale(wbcd[-1]))

# confirm that the transformation was applied correctly
summary(wbcd_z$area_mean)

# create training and test datasets
wbcd_train <- wbcd_z[1:469, ]
wbcd_test <- wbcd_z[470:569, ]

# re-classify test cases
wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test,
                      cl = wbcd_train_labels, k = 21)

# create the cross tabulation of predicted vs. actual
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

# try several different values of k
wbcd_train <- wbcd_n[1:469, ]
wbcd_test <- wbcd_n[470:569, ]

# start time
strt <- Sys.time()

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 1)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 5)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 11)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 15)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 21)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

wbcd_test_pred <- knn(train = wbcd_train, test = wbcd_test, cl = wbcd_train_labels, k = 27)
CrossTable(x = wbcd_test_labels, y = wbcd_test_pred, prop.chisq = FALSE)

# end time
print(Sys.time() - strt)
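# The six blocks above repeat the same fit-and-tabulate pattern for each k.
# A minimal sketch of the same sweep as a loop, not from the original
# script; it assumes the normalized splits and label vectors defined above,
# and summarizes each k by its misclassification count rather than printing
# a full cross table.
k_values <- c(1, 5, 11, 15, 21, 27)
errors <- sapply(k_values, function(k) {
  pred <- knn(train = wbcd_train, test = wbcd_test,
              cl = wbcd_train_labels, k = k)
  sum(pred != wbcd_test_labels)   # misclassified test cases for this k
})
data.frame(k = k_values, errors = errors)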