##### Chapter 8: Association Rules -------------------

## Example: Identifying Frequently-Purchased Groceries ----
#
# Walk-through script: reads grocery transactions from "groceries.csv",
# explores them, mines association rules with apriori(), inspects and
# exports the rules, then repeats the mining with eclat() + ruleInduction().
# Statements are meant to be run top to bottom; later steps reuse objects
# (groceries, groceryrules, ...) created by earlier ones.

## Step 2: Exploring and preparing the data ----

# load the grocery data into a sparse matrix
# (arules is attached once here; every later step relies on it)
library(arules)
groceries <- read.transactions("groceries.csv", sep = ",")
summary(groceries)

# examine the long format (without decoding) to see the raw item IDs
head(toLongFormat(groceries, decode = FALSE), n = 7)

# look at the first five transactions
inspect(groceries[1:5])

# examine the frequency of items (restricted to the first three item columns)
itemFrequency(groceries[, 1:3])

# plot the frequency of items:
# first only items with at least 10% support, then the 20 most frequent items
itemFrequencyPlot(groceries, support = 0.1)
itemFrequencyPlot(groceries, topN = 20)

# a visualization of the sparse matrix for the first five transactions
image(groceries[1:5])

# visualization of a random sample of 100 transactions
# (no seed is set, so the sampled rows differ from run to run)
image(sample(groceries, 100))

## Step 3: Training a model on the data ----

# default settings result in zero rules learned
apriori(groceries)

# set better support and confidence levels to learn more rules
# (minlen = 2 requires every rule to contain at least two items)
groceryrules <- apriori(
  groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
groceryrules

## Step 4: Evaluating model performance ----

# summary of grocery association rules
summary(groceryrules)

# look at the first three rules
inspect(groceryrules[1:3])

## Step 5: Improving model performance ----

# sorting grocery rules by lift and showing the first five of the sorted set
inspect(sort(groceryrules, by = "lift")[1:5])

# finding subsets of rules containing any berry items
berryrules <- subset(groceryrules, items %in% "berries")
inspect(berryrules)

# writing the rules to a CSV file
write(
  groceryrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# converting the rule set to a data frame
groceryrules_df <- as(groceryrules, "data.frame")
str(groceryrules_df)

# using the eclat algorithm for greater performance
# first, generate the frequent itemsets
groceryitemsets_eclat <- eclat(groceries, support = 0.006)
inspect(groceryitemsets_eclat[1:5])

# second, generate the rules
groceryrules_eclat <- ruleInduction(groceryitemsets_eclat, confidence = 0.25)
groceryrules_eclat
inspect(groceryrules_eclat[1:5])