# UC-r Business Analytics R Programming Guide
# https://uc-r.github.io/sentiment_analysis

# if (packageVersion("devtools") < 1.6) {
#   install.packages("devtools")
# }
# devtools::install_github("bradleyboehmke/harrypotter")

library(tidyverse)    # data manipulation & plotting
library(stringr)      # text cleaning and regular expressions
library(tidytext)     # provides additional text mining functions
library(harrypotter)  # provides the first seven novels of the Harry Potter series

# Each novel is a character vector with one element per chapter;
# peek at the first chapter of each book.
philosophers_stone[1]
chamber_of_secrets[1]
prisoner_of_azkaban[1]
goblet_of_fire[1]
order_of_the_phoenix[1]
half_blood_prince[1]
deathly_hallows[1]

# One row per chapter for a single book...
text_tb <- tibble(chapter = seq_along(philosophers_stone),
                  text = philosophers_stone)
text_tb

# ...then tokenize into one row per word
text_tb %>%
  unnest_tokens(word, text)

# Jane Austen example: add line numbers and cumulative chapter counts per book
library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup()

original_books

# all books
titles <- c("Philosopher's Stone", "Chamber of Secrets",
            "Prisoner of Azkaban", "Goblet of Fire",
            "Order of the Phoenix", "Half-Blood Prince",
            "Deathly Hallows")

books <- list(philosophers_stone, chamber_of_secrets, prisoner_of_azkaban,
              goblet_of_fire, order_of_the_phoenix, half_blood_prince,
              deathly_hallows)

# Tokenize each book and stack into one tibble. map2_df() binds the
# per-book results in a single pass, replacing the original
# rbind()-inside-a-for-loop growth pattern (O(n^2) copying).
series <- map2_df(books, titles, function(book_text, title) {
  tibble(chapter = seq_along(book_text), text = book_text) %>%
    unnest_tokens(word, text) %>%
    mutate(book = title) %>%
    select(book, everything())
})

# set factor to keep books in order of publication
series$book <- factor(series$book, levels = rev(titles))
series

# most common words across the whole series
series %>%
  count(word, sort = TRUE)

# ...and again with stop words removed
series %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)

# top 10 most common words in each book
series %>%
  anti_join(stop_words) %>%
  group_by(book) %>%
  count(word, sort = TRUE) %>%
  top_n(10)

# top 10 most common words in each book, plotted
# (the %>% continuing each line must be at the END of the line --
# the original split this chain with a leading %>%, a syntax error)
series %>%
  anti_join(stop_words) %>%
  group_by(book) %>%
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(book = factor(book, levels = titles),
         text_order = nrow(.):1) %>%
  ggplot(aes(reorder(word, text_order), n, fill = book)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ book, scales = "free_y") +
  labs(x = NULL, y = "Frequency") +  # x = NULL (was the literal string "NULL")
  coord_flip() +
  theme(legend.position = "none")

# calculate percent of word use across all novels
potter_pct <- series %>%
  anti_join(stop_words) %>%
  count(word) %>%
  transmute(word, all_words = n / sum(n))

# calculate percent of word use within each novel
# NOTE: group_by(book) is required before the mutate -- without it
# sum(n) spans the entire series and book_words is not a within-book
# percentage (the trailing ungroup() shows grouping was intended).
frequency <- series %>%
  anti_join(stop_words) %>%
  count(book, word) %>%
  group_by(book) %>%
  mutate(book_words = n / sum(n)) %>%
  left_join(potter_pct, by = "word") %>%
  arrange(desc(book_words)) %>%
  ungroup()

# frequency

# Note there is a problem with the 45 degree line not going through (0,0):
# geom_jitter adds width/height 0.3 of noise on log10 axes, shifting points
# relative to the abline.
ggplot(frequency, aes(x = book_words, y = all_words,
                      color = abs(all_words - book_words))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = scales::percent_format()) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75") +
  facet_wrap(~ book, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Harry Potter Series", x = NULL)

# correlations: within-book vs. series-wide word frequencies, per book
frequency %>%
  group_by(book) %>%
  summarize(correlation = cor(book_words, all_words),
            p_value = cor.test(book_words, all_words)$p.value)