- r4ds Chapter 14
- Use of . ^ $ [ ] | ( )
- Use of stringr::str_view()
- Use of stringr::str_detect()
October 30, 2019
The . matches any single character (except a newline).
library(pacman) p_load(tidyverse, stringr)
Find any words with the letter n and any character before it.
# Highlight an "n" together with whatever single character precedes it.
x <- c("apple", "banana", "pear")
x %>% str_view(".n")
Find any words with the letter a and any character before it. Because . must match a character, an a at the very start of a word cannot be matched.
# Highlight an "a" together with whatever single character precedes it.
x <- c("apple", "banana", "pear")
x %>% str_view(".a")
Find words that start with a.
# ^ anchors the pattern to the start of each string.
x <- c("apple", "banana", "pear")
str_view(x, pattern = "^a")
Find words that end with a.
# $ anchors the pattern to the end of each string.
x <- c("apple", "banana", "pear")
str_view(x, pattern = "a$")
Find words that start with a vowel and have a character after the vowel. Note that [ ] defines a character class: it matches any single one of the characters listed inside the brackets.
# A leading vowel (character class) followed by any one character.
x <- c("apple", "banana", "pear")
x %>% str_view("^[aeiou].")
The str_view() R function can be thought of as a "search engine" for text-based data.
Find the words that contain "an" two times in a row.
# (an){2} matches the group "an" repeated twice back to back, i.e. "anan".
x <- c("apple", "banana", "pear")
str_view(x, pattern = "(an){2}")
# Preview the first six elements of the `fruit` character vector.
head(fruit)
## [1] "apple" "apricot" "avocado" "banana" "bell pepper" ## [6] "bilberry"
# (..)\\1 captures any two characters and then requires the same two
# characters again immediately afterwards; match = TRUE restricts the display
# to the strings that actually match.
fruit %>% str_view("(..)\\1", match = TRUE)
Now, instead of viewing matches in text, we will determine whether or not a match has occurred.
# One TRUE/FALSE per string: does it contain an "e"?
x <- c("apple", "banana", "pear")
x %>% str_detect("e")
## [1] TRUE FALSE TRUE
# Preview the first six elements of the `words` character vector.
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"
# Summing the logical vector counts the words that start with "t".
words %>%
  str_detect("^t") %>%
  sum()
## [1] 65
# Averaging the logical vector gives the proportion of words starting with "t".
words %>%
  str_detect("^t") %>%
  mean()
## [1] 0.06632653
What do these lines of code do?
# Preview the first six elements of the `words` character vector.
head(words)
## [1] "a" "able" "about" "absolute" "accept" "account"
# Number of words whose last character is a lowercase vowel.
words %>%
  str_detect("[aeiou]$") %>%
  sum()
## [1] 271
# Proportion of words whose last character is a lowercase vowel.
words %>%
  str_detect("[aeiou]$") %>%
  mean()
## [1] 0.2765306
Detect matches in a column of a tibble.
# Pair every word with its position in `words`.
words_df <- tibble(
  word = words,
  i = seq_along(word)
)
words_df %>% head()
## # A tibble: 6 x 2 ## word i ## <chr> <int> ## 1 a 1 ## 2 able 2 ## 3 about 3 ## 4 absolute 4 ## 5 accept 5 ## 6 account 6
# Keep only the rows whose word ends in "x".
filter(words_df, str_detect(word, "x$"))
## # A tibble: 4 x 2 ## word i ## <chr> <int> ## 1 box 108 ## 2 sex 747 ## 3 six 772 ## 4 tax 841
# Count how many times "a" occurs in each string.
x <- c("apple", "banana", "pear")
x %>% str_count("a")
## [1] 1 3 1
# Count the lowercase vowels in each string.
x %>% str_count("[aeiou]")
## [1] 2 3 2
# Average number of lowercase vowels per string.
x %>%
  str_count("[aeiou]") %>%
  mean()
## [1] 2.333333
# Add per-word counts. "consonants" counts every character that is not one of
# a, e, i, o, u, so any non-letter character would be included as well.
mutate(
  words_df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 x 4 ## word i vowels consonants ## <chr> <int> <int> <int> ## 1 a 1 1 0 ## 2 able 2 2 2 ## 3 about 3 3 2 ## 4 absolute 4 4 4 ## 5 accept 5 2 4 ## 6 account 6 3 4 ## 7 achieve 7 4 3 ## 8 across 8 2 4 ## 9 act 9 1 2 ## 10 active 10 3 3 ## # … with 970 more rows
# Preview the first six elements of the `sentences` character vector.
head(sentences)
## [1] "The birch canoe slid on the smooth planks." ## [2] "Glue the sheet to the dark blue background." ## [3] "It's easy to tell the depth of a well." ## [4] "These days a chicken leg is a rare dish." ## [5] "Rice is often served in round bowls." ## [6] "The juice of lemons makes fine punch."
# Number of sentences available.
length(sentences)
## [1] 720
What does | do?
# Build a single alternation pattern: | means "match any one of these".
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- colors %>% str_c(collapse = "|")
color_match
## [1] "red|orange|yellow|green|blue|purple"
First, find all sentences that contain a color. Second, extract the first color word from each of those sentences.
# Sentences that contain at least one match of the color pattern.
has_color <- sentences %>% str_subset(color_match)

# str_extract() returns only the first match found in each sentence.
matches <- has_color %>% str_extract(color_match)
matches %>% head()
## [1] "blue" "blue" "red" "red" "red" "blue"
# Sentences in which the color pattern matches more than once.
more <- sentences[str_count(sentences, pattern = color_match) > 1]

# Display every match, not just the first, in each of those sentences.
more %>% str_view_all(color_match)
# Split the first five sentences on single spaces; the result is a list with
# one character vector per sentence.
str_split(head(sentences, 5), " ")
## [[1]] ## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth" ## [8] "planks." ## ## [[2]] ## [1] "Glue" "the" "sheet" "to" "the" ## [6] "dark" "blue" "background." ## ## [[3]] ## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well." ## ## [[4]] ## [1] "These" "days" "a" "chicken" "leg" "is" "a" ## [8] "rare" "dish." ## ## [[5]] ## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
Simplified.
# Same split, but simplify = TRUE returns a character matrix, padding the
# shorter sentences with "".
str_split(head(sentences, 5), " ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] ## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" ## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" ## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" ## [4,] "These" "days" "a" "chicken" "leg" "is" "a" ## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ## [,8] [,9] ## [1,] "planks." "" ## [2,] "background." "" ## [3,] "a" "well." ## [4,] "rare" "dish." ## [5,] "" ""
library(babynames)

# Preview the first rows of the babynames data.
babynames %>% head()
## # A tibble: 6 x 5 ## year sex name n prop ## <dbl> <chr> <chr> <int> <dbl> ## 1 1880 F Mary 7065 0.0724 ## 2 1880 F Anna 2604 0.0267 ## 3 1880 F Emma 2003 0.0205 ## 4 1880 F Elizabeth 1939 0.0199 ## 5 1880 F Minnie 1746 0.0179 ## 6 1880 F Margaret 1578 0.0162
# Keep the rows whose name begins with "Kr" (case-sensitive).
filter(babynames, str_detect(name, "^Kr"))
## # A tibble: 6,212 x 5 ## year sex name n prop ## <dbl> <chr> <chr> <int> <dbl> ## 1 1915 F Kristina 5 0.00000488 ## 2 1916 F Kristine 6 0.00000553 ## 3 1916 M Kramer 5 0.00000542 ## 4 1917 F Kristine 8 0.00000712 ## 5 1918 F Kristine 9 0.00000749 ## 6 1919 F Kristine 6 0.00000511 ## 7 1920 F Kristine 9 0.00000723 ## 8 1920 F Kristina 5 0.00000402 ## 9 1921 F Kristine 10 0.00000781 ## 10 1921 F Kristina 5 0.00000391 ## # … with 6,202 more rows
There is a harrypotter package on CRAN that contains color palettes.
There is another harrypotter package that can be installed from the author's GitHub using devtools.
The references for this code are UC-R tidy_text and the book tidytext.
library(devtools) install_github("bradleyboehmke/harrypotter")
library(harrypotter) library(tidytext)
The books are character vectors with each element a chapter.
# One row per chapter: the chapter number alongside the chapter's full text.
phil_stone_tb <- tibble(
  chapter = seq_along(philosophers_stone),
  text = philosophers_stone
)
phil_stone_tb
## # A tibble: 17 x 2 ## chapter text ## <int> <chr> ## 1 1 "THE BOY WHO LIVED Mr. and Mrs. Dursley, of number four, Prive… ## 2 2 "THE VANISHING GLASS Nearly ten years had passed since the Dur… ## 3 3 "THE LETTERS FROM NO ONE The escape of the Brazilian boa const… ## 4 4 "THE KEEPER OF THE KEYS BOOM. They knocked again. Dudley jerke… ## 5 5 "DIAGON ALLEY Harry woke early the next morning. Although he c… ## 6 6 "THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS Harry's las… ## 7 7 "THE SORTING HAT The door swung open at once. A tall, black-ha… ## 8 8 "THE POTIONS MASTER There, look.\" \"Where?\" \"Next to the … ## 9 9 "THE MIDNIGHT DUEL Harry had never believed he would meet a bo… ## 10 10 "HALLOWEEN Malfoy couldn't believe his eyes when he saw that H… ## 11 11 "QUIDDITCH As they entered November, the weather turned very c… ## 12 12 "THE MIRROR OF ERISED Christmas was coming. One morning in mid… ## 13 13 "NICOLAS FLAMEL Dumbledore had convinced Harry not to go looki… ## 14 14 "NORBERT THE NORWEGIAN RIDGEBACK Quirrell, however, must have … ## 15 15 "THE FORIBIDDEN FOREST Things couldn't have been worse. Filch… ## 16 16 "THROUGH THE TRAPDOOR In years to come, Harry would never quit… ## 17 17 "THE MAN WITH TWO FACES It was Quirrell. \"You!\" gasped Harr…
# Break each chapter's text into one row per word, stored in a new `word`
# column.
unnest_tokens(phil_stone_tb, word, text)
## # A tibble: 77,875 x 2 ## chapter word ## <int> <chr> ## 1 1 the ## 2 1 boy ## 3 1 who ## 4 1 lived ## 5 1 mr ## 6 1 and ## 7 1 mrs ## 8 1 dursley ## 9 1 of ## 10 1 number ## # … with 77,865 more rows
# Titles of the seven books, listed in the same order as `books` below.
titles <- c(
  "Philosopher's Stone", "Chamber of Secrets", "Prisoner of Azkaban",
  "Goblet of Fire", "Order of the Phoenix", "Half-Blood Prince",
  "Deathly Hallows"
)

# The book texts; each is a character vector with one element per chapter.
books <- list(
  philosophers_stone, chamber_of_secrets, prisoner_of_azkaban,
  goblet_of_fire, order_of_the_phoenix, half_blood_prince,
  deathly_hallows
)

# Tokenise every book into one row per word, tagged with its title and
# chapter number. Each book's tibble is built independently and the pieces
# are combined once with bind_rows(); growing `series` with rbind() inside a
# loop would re-copy all accumulated rows on every iteration.
series <- map2(books, titles, function(chapters, title) {
  tibble(chapter = seq_along(chapters), text = chapters) %>%
    unnest_tokens(word, text) %>%
    mutate(book = title) %>%
    select(book, everything())
}) %>%
  bind_rows()
# Turn `book` into a factor with a fixed level order. Note that rev(titles)
# makes the levels run in the REVERSE of the order in `titles`.
# NOTE(review): presumably reversed for axis-flipped plots -- confirm intent.
series <- series %>%
  mutate(book = factor(book, levels = rev(titles)))
series
## # A tibble: 1,089,386 x 3 ## book chapter word ## <fct> <int> <chr> ## 1 Philosopher's Stone 1 the ## 2 Philosopher's Stone 1 boy ## 3 Philosopher's Stone 1 who ## 4 Philosopher's Stone 1 lived ## 5 Philosopher's Stone 1 mr ## 6 Philosopher's Stone 1 and ## 7 Philosopher's Stone 1 mrs ## 8 Philosopher's Stone 1 dursley ## 9 Philosopher's Stone 1 of ## 10 Philosopher's Stone 1 number ## # … with 1,089,376 more rows
# Word frequencies across the whole series, most frequent first.
count(series, word, sort = TRUE)
## # A tibble: 24,475 x 2 ## word n ## <chr> <int> ## 1 the 51593 ## 2 and 27430 ## 3 to 26985 ## 4 of 21802 ## 5 a 20966 ## 6 he 20322 ## 7 harry 16557 ## 8 was 15631 ## 9 said 14398 ## 10 his 14264 ## # … with 24,465 more rows
# Drop every row whose word appears in `stop_words`, then count what is left.
# No join key is given, so anti_join() joins on the shared column(s) and
# reports which ones it used.
series %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 23,795 x 2 ## word n ## <chr> <int> ## 1 harry 16557 ## 2 ron 5750 ## 3 hermione 4912 ## 4 dumbledore 2873 ## 5 looked 2344 ## 6 professor 2006 ## 7 hagrid 1732 ## 8 time 1713 ## 9 wand 1639 ## 10 eyes 1604 ## # … with 23,785 more rows
# Ten most frequent non-stop words within each book (ties are kept, so a book
# may contribute more than ten rows). The result is still grouped by `book`.
series %>%
  anti_join(stop_words) %>%
  group_by(book) %>%
  count(word, sort = TRUE) %>%
  top_n(10)
## Joining, by = "word"
## Selecting by n
## # A tibble: 70 x 3 ## # Groups: book [7] ## book word n ## <fct> <chr> <int> ## 1 Order of the Phoenix harry 3730 ## 2 Goblet of Fire harry 2936 ## 3 Deathly Hallows harry 2770 ## 4 Half-Blood Prince harry 2581 ## 5 Prisoner of Azkaban harry 1824 ## 6 Chamber of Secrets harry 1503 ## 7 Order of the Phoenix hermione 1220 ## 8 Philosopher's Stone harry 1213 ## 9 Order of the Phoenix ron 1189 ## 10 Deathly Hallows hermione 1077 ## # … with 60 more rows
# Plot the ten most frequent non-stop words in each book, one facet per book.
series %>%
  anti_join(stop_words) %>%
  group_by(book) %>%
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(
    # Re-level so the facets follow the order of `titles`.
    book = factor(book, levels = titles),
    # count(sort = TRUE) orders rows by descending n, so counting down from
    # nrow() gives the most frequent rows the largest ordering value
    # (assuming top_n() keeps that row order).
    text_order = nrow(.):1
  ) %>%
  ggplot(aes(reorder(word, text_order), n, fill = book)) +
  # geom_col() draws bars whose heights are the values of n.
  geom_col() +
  facet_wrap(~ book, scales = "free_y") +
  # x = NULL removes the axis title; the string "NULL" would be drawn
  # literally as the title.
  labs(x = NULL, y = "Frequency") +
  coord_flip() +
  theme(legend.position = "none")
## Joining, by = "word"
## Selecting by n
## Joining, by = "word"
## Selecting by n