##### Chapter 2: Managing and Understanding Data ------------------- ##### R data structures -------------------- ## Vectors ----- # create vectors of data for three medical patients subject_name <- c("John Doe", "Jane Doe", "Steve Graves") temperature <- c(98.1, 98.6, 101.4) flu_status <- c(FALSE, FALSE, TRUE) # access the second element in body temperature vector temperature[2] ## examples of accessing items in vector # include items in the range 2 to 3 temperature[2:3] # exclude item 2 using the minus sign temperature[-2] # use a vector to indicate whether to include item temperature[c(TRUE, TRUE, FALSE)] ## Factors ----- # add gender factor gender <- factor(c("MALE", "FEMALE", "MALE")) gender # add blood type factor blood <- factor(c("O", "AB", "A"), levels = c("A", "B", "AB", "O")) blood # add ordered factor symptoms <- factor(c("SEVERE", "MILD", "MODERATE"), levels = c("MILD", "MODERATE", "SEVERE"), ordered = TRUE) symptoms # check for symptoms greater than moderate symptoms > "MODERATE" ## Lists ----- # display information for a patient subject_name[1] temperature[1] flu_status[1] gender[1] blood[1] symptoms[1] # create list for a patient subject1 <- list(fullname = subject_name[1], temperature = temperature[1], flu_status = flu_status[1], gender = gender[1], blood = blood[1], symptoms = symptoms[1]) # display the patient subject1 ## methods for accessing a list # get a single list value by position (returns a sub-list) subject1[2] # get a single list value by position (returns a numeric vector) subject1[[2]] # get a single list value by name subject1$temperature # get several list items by specifying a vector of names subject1[c("temperature", "flu_status")] ## access a list like a vector # get values 2 and 3 subject1[2:3] ## Data frames ----- # create a data frame from medical patient data pt_data <- data.frame(subject_name, temperature, flu_status, gender, blood, symptoms, stringsAsFactors = FALSE) # display the data frame pt_data ## accessing a data frame # get a single column pt_data$subject_name # get several columns by specifying a vector of names pt_data[c("temperature", "flu_status")] # this is the same as above, extracting temperature and flu_status pt_data[2:3] # accessing by row and column pt_data[1, 2] # accessing several rows and several columns using vectors pt_data[c(1, 3), c(2, 4)] ## Leave a row or column blank to extract all rows or columns # column 1, all rows pt_data[, 1] # row 1, all columns pt_data[1, ] # all rows and all columns pt_data[ , ] # the following are equivalent pt_data[c(1, 3), c("temperature", "gender")] pt_data[-2, c(-1, -3, -5, -6)] ## Matrixes ----- # create a 2x2 matrix m <- matrix(c(1, 2, 3, 4), nrow = 2) m # equivalent to the above m <- matrix(c(1, 2, 3, 4), ncol = 2) m # create a 2x3 matrix m <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2) m # create a 3x2 matrix m <- matrix(c(1, 2, 3, 4, 5, 6), ncol = 2) m # extract values from matrixes m[1, 1] m[3, 2] # extract rows m[1, ] # extract columns m[, 1] ##### Managing data with R ------------ ## saving, loading, and removing R data structures # show all data structures in memory ls() # remove the m and subject1 objects rm(m, subject1) ls() rm(list=ls()) ##### Exploring and understanding data -------------------- ## data exploration example using used car data usedcars <- read.csv("usedcars.csv", stringsAsFactors = FALSE) # get structure of used car data str(usedcars) ## Exploring numeric variables ----- # summarize numeric variables summary(usedcars$year) summary(usedcars[c("price", "mileage")]) # calculate the mean income (36000 + 44000 + 56000) / 3 mean(c(36000, 44000, 56000)) # the median income median(c(36000, 44000, 56000)) # the min/max of used car prices range(usedcars$price) # the difference of the range diff(range(usedcars$price)) # IQR for used car prices IQR(usedcars$price) # use quantile to calculate five-number summary quantile(usedcars$price) # the 99th percentile quantile(usedcars$price, probs = c(0.01, 0.99)) # quintiles quantile(usedcars$price, seq(from = 0, to = 1, by = 0.20)) # boxplot of used car prices and mileage boxplot(usedcars$price, main="Boxplot of Used Car Prices", ylab="Price ($)") boxplot(usedcars$mileage, main="Boxplot of Used Car Mileage", ylab="Odometer (mi.)") # histograms of used car prices and mileage hist(usedcars$price, main = "Histogram of Used Car Prices", xlab = "Price ($)") hist(usedcars$mileage, main = "Histogram of Used Car Mileage", xlab = "Odometer (mi.)") # variance and standard deviation of the used car data var(usedcars$price) sd(usedcars$price) var(usedcars$mileage) sd(usedcars$mileage) ## Exploring numeric variables ----- # one-way tables for the used car data table(usedcars$year) table(usedcars$model) table(usedcars$color) # compute table proportions model_table <- table(usedcars$model) prop.table(model_table) # round the data color_table <- table(usedcars$color) color_pct <- prop.table(color_table) * 100 round(color_pct, digits = 1) ## Exploring relationships between variables ----- # scatterplot of price vs. mileage plot(x = usedcars$mileage, y = usedcars$price, main = "Scatterplot of Price vs. Mileage", xlab = "Used Car Odometer (mi.)", ylab = "Used Car Price ($)") # new variable indicating conservative colors usedcars$conservative <- usedcars$color %in% c("Black", "Gray", "Silver", "White") # checking our variable table(usedcars$conservative) # Crosstab of conservative by model library(gmodels) CrossTable(x = usedcars$model, y = usedcars$conservative)