October 30, 2019

Regular Expressions

  • r4ds Chapter 14
  • Use of . ^ $ [ ] | ( )
  • Use of stringr::str_view()
  • Use of stringr::str_detect()

Datasets with text

  • The stringr R package ships with simple data sets we can practice on
  • fruit
  • words
  • sentences
  • harrypotter (from the separate harrypotter package, introduced in the Harry Potter text slides)

The .

The . is used to specify any character.

library(pacman)
p_load(tidyverse, stringr)

Find any words with the letter n and any character before it.

# "." matches any single character, so ".n" matches an "n" together with
# the one character in front of it.
x <- c("apple", "banana", "pear")
str_view(x, ".n")

The .

Find any words with the letter a and any character before it. So a cannot start the word.

# ".a" needs some character before the "a", so the word-initial "a" in
# "apple" cannot match.
x <- c("apple", "banana", "pear")
str_view(x, ".a")

Anchors ^ $

Find words that start with a.

# "^" anchors the pattern to the start of the string.
x <- c("apple", "banana", "pear")
str_view(x, "^a")

Anchors ^ $

Find words that end with a.

# "$" anchors the pattern to the end of the string.
x <- c("apple", "banana", "pear")
str_view(x, "a$")

Literal characters

Find words that start with a vowel and have a character after the vowel. Note that the [ ] specify a set of literal characters; any one character from the set counts as a match.

# "^[aeiou]." reads: start of string, then one character from the set
# a/e/i/o/u, then any one character.
x <- c("apple", "banana", "pear")
str_view(x, "^[aeiou].")

The str_view() function

The str_view() R function can be thought of as a "search engine" for text-based data.

Repetition

Find the words that contain "an" two times in a row.

# "(an){2}" repeats the group "an" exactly twice in a row, i.e. it
# matches "anan".
x <- c("apple", "banana", "pear")
str_view(x, "(an){2}")

Repetition

# `fruit` is a character vector of fruit names; peek at the first few.
head(fruit)
## [1] "apple"       "apricot"     "avocado"     "banana"      "bell pepper"
## [6] "bilberry"
# "(..)\\1" captures any two characters and then requires the same two
# characters again (a backreference to group 1). match = TRUE displays only
# the elements that contain a match.
str_view(fruit, "(..)\\1", match = TRUE)

Detect matches

Now, instead of viewing matches in text, we will determine whether a match has occurred or not.

# str_detect() returns one logical value per element: TRUE where the
# pattern is found.
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE

Detect matches

head(words)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"
# Summing a logical vector counts its TRUEs: the number of words that
# start with "t".
sum(str_detect(words, "^t"))
## [1] 65
# The mean of a logical vector is the proportion of TRUEs.
mean(str_detect(words, "^t"))
## [1] 0.06632653

Detect matches

What do these lines of code do?

# Exercise: same sum()/mean() pattern as the previous slide, applied to a
# different regular expression.
head(words)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"
sum(str_detect(words, "[aeiou]$"))
## [1] 271
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306

Detect matches

Detect matches in a column of a tibble.

# Put the words in a tibble alongside their position in the vector, so
# regex matches can be used to filter rows.
words_df <- tibble(
  word = words, 
  i = seq_along(word)
)

head(words_df)
## # A tibble: 6 x 2
##   word         i
##   <chr>    <int>
## 1 a            1
## 2 able         2
## 3 about        3
## 4 absolute     4
## 5 accept       5
## 6 account      6

Detect matches

# Keep only the rows whose word ends in "x".
filter(words_df, str_detect(word, "x$"))
## # A tibble: 4 x 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841

Count matches

x <- c("apple", "banana", "pear")
# str_count() returns the number of matches in each string rather than
# just whether there was one.
str_count(x, "a")
## [1] 1 3 1
str_count(x, "[aeiou]")
## [1] 2 3 2
# Average number of vowels per word.
mean(str_count(x, "[aeiou]"))
## [1] 2.333333

Mutate

# Add per-word counts: vowels via the set [aeiou], and every other
# character via the negated set [^aeiou].
mutate(
  words_df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 x 4
##    word         i vowels consonants
##    <chr>    <int>  <int>      <int>
##  1 a            1      1          0
##  2 able         2      2          2
##  3 about        3      3          2
##  4 absolute     4      4          4
##  5 accept       5      2          4
##  6 account      6      3          4
##  7 achieve      7      4          3
##  8 across       8      2          4
##  9 act          9      1          2
## 10 active      10      3          3
## # … with 970 more rows

Exact matches

# `sentences` is a character vector of example sentences; show the first
# few and then how many there are in total.
head(sentences)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."
length(sentences)
## [1] 720

Find sentences that contain a color word.

What does | do?

# Collapse the vector of color words into a single pattern string.
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match
## [1] "red|orange|yellow|green|blue|purple"

Find sentences that contain a color word.

First, find all sentences with colors in them. Second, extract the first color word from each of those sentences.

# str_subset() keeps only the sentences that match the pattern.
has_color <- str_subset(sentences, color_match)
# str_extract() returns the first match in each string, so a sentence with
# several color words contributes only its first one.
matches <- str_extract(has_color, color_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"

Find sentences that contain more than one color word.

# Keep the sentences with more than one color match, then highlight every
# match in them (str_view_all) rather than only the first.
more <- sentences[str_count(sentences, color_match) > 1]
str_view_all(more, color_match)

Splitting

# Split the first five sentences on single spaces. str_split() returns a
# list with one character vector of pieces per input string.
sentences %>%
  head(5) %>% 
  str_split(" ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

Splitting

Simplified.

# simplify = TRUE returns a character matrix instead of a list; rows with
# fewer pieces are padded with "".
sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
##      [,8]          [,9]   
## [1,] "planks."     ""     
## [2,] "background." ""     
## [3,] "a"           "well."
## [4,] "rare"        "dish."
## [5,] ""            ""

Search for rows of data with a regular expression

library(babynames)

# Preview the babynames tibble (columns: year, sex, name, n, prop).
head(babynames)
## # A tibble: 6 x 5
##    year sex   name          n   prop
##   <dbl> <chr> <chr>     <int>  <dbl>
## 1  1880 F     Mary       7065 0.0724
## 2  1880 F     Anna       2604 0.0267
## 3  1880 F     Emma       2003 0.0205
## 4  1880 F     Elizabeth  1939 0.0199
## 5  1880 F     Minnie     1746 0.0179
## 6  1880 F     Margaret   1578 0.0162

Search for rows of data with a regular expression

# Keep only the rows whose name starts with "Kr".
filter(babynames, str_detect(name, "^Kr"))
## # A tibble: 6,212 x 5
##     year sex   name         n       prop
##    <dbl> <chr> <chr>    <int>      <dbl>
##  1  1915 F     Kristina     5 0.00000488
##  2  1916 F     Kristine     6 0.00000553
##  3  1916 M     Kramer       5 0.00000542
##  4  1917 F     Kristine     8 0.00000712
##  5  1918 F     Kristine     9 0.00000749
##  6  1919 F     Kristine     6 0.00000511
##  7  1920 F     Kristine     9 0.00000723
##  8  1920 F     Kristina     5 0.00000402
##  9  1921 F     Kristine    10 0.00000781
## 10  1921 F     Kristina     5 0.00000391
## # … with 6,202 more rows

Harry Potter text

There is a harrypotter package on CRAN that contains color palettes.

There is another harrypotter package that can be installed from the author's GitHub using devtools.

The references for this code are UC-R tidy_text and the book tidytext.

library(devtools)
install_github("bradleyboehmke/harrypotter")
library(harrypotter)
library(tidytext)

Harry Potter text

  1. philosophers_stone: Harry Potter and the Philosopher's Stone, published in 1997
  2. chamber_of_secrets: Harry Potter and the Chamber of Secrets, published in 1998
  3. prisoner_of_azkaban: Harry Potter and the Prisoner of Azkaban, published in 1999
  4. goblet_of_fire: Harry Potter and the Goblet of Fire, published in 2000
  5. order_of_the_phoenix: Harry Potter and the Order of the Phoenix, published in 2003
  6. half_blood_prince: Harry Potter and the Half-Blood Prince, published in 2005
  7. deathly_hallows: Harry Potter and the Deathly Hallows, published in 2007

Harry Potter text

The books are character vectors with each element a chapter.

# One row per chapter: the chapter number and the full text of that chapter.
phil_stone_tb <- tibble(chapter = seq_along(philosophers_stone),
                  text = philosophers_stone)
phil_stone_tb
## # A tibble: 17 x 2
##    chapter text                                                            
##      <int> <chr>                                                           
##  1       1 "THE BOY WHO LIVED  Mr. and Mrs. Dursley, of number four, Prive…
##  2       2 "THE VANISHING GLASS  Nearly ten years had passed since the Dur…
##  3       3 "THE LETTERS FROM NO ONE  The escape of the Brazilian boa const…
##  4       4 "THE KEEPER OF THE KEYS  BOOM. They knocked again. Dudley jerke…
##  5       5 "DIAGON ALLEY  Harry woke early the next morning. Although he c…
##  6       6 "THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS  Harry's las…
##  7       7 "THE SORTING HAT  The door swung open at once. A tall, black-ha…
##  8       8 "THE POTIONS MASTER  There, look.\"  \"Where?\"  \"Next to the …
##  9       9 "THE MIDNIGHT DUEL  Harry had never believed he would meet a bo…
## 10      10 "HALLOWEEN  Malfoy couldn't believe his eyes when he saw that H…
## 11      11 "QUIDDITCH  As they entered November, the weather turned very c…
## 12      12 "THE MIRROR OF ERISED  Christmas was coming. One morning in mid…
## 13      13 "NICOLAS FLAMEL  Dumbledore had convinced Harry not to go looki…
## 14      14 "NORBERT THE NORWEGIAN RIDGEBACK  Quirrell, however, must have …
## 15      15 "THE FORIBIDDEN FOREST  Things couldn't have been worse.  Filch…
## 16      16 "THROUGH THE TRAPDOOR  In years to come, Harry would never quit…
## 17      17 "THE MAN WITH TWO FACES  It was Quirrell.  \"You!\" gasped Harr…

Harry Potter text

# Split each chapter's `text` into one row per word, stored in a new
# `word` column.
unnest_tokens(phil_stone_tb, word, text)
## # A tibble: 77,875 x 2
##    chapter word   
##      <int> <chr>  
##  1       1 the    
##  2       1 boy    
##  3       1 who    
##  4       1 lived  
##  5       1 mr     
##  6       1 and    
##  7       1 mrs    
##  8       1 dursley
##  9       1 of     
## 10       1 number 
## # … with 77,865 more rows

Harry Potter text

# Display titles of the seven books, in publication order.
titles <- c("Philosopher's Stone", "Chamber of Secrets",
            "Prisoner of Azkaban", "Goblet of Fire",
            "Order of the Phoenix", "Half-Blood Prince",
            "Deathly Hallows")
# Chapter vectors from the harrypotter package, in the same order as `titles`.
books <- list(philosophers_stone, chamber_of_secrets,
              prisoner_of_azkaban, goblet_of_fire,
              order_of_the_phoenix, half_blood_prince,
              deathly_hallows)
# Titles and books are paired by position, so they must line up one-to-one.
stopifnot(length(titles) == length(books))

# Tokenize every book into a tibble with columns book, chapter, word, and
# bind the pieces together once at the end. Growing `series` with rbind()
# inside a loop would re-copy all accumulated rows on every iteration.
series <- map2(books, titles, function(chapters, title) {
  tibble(chapter = seq_along(chapters), text = chapters) %>%
    unnest_tokens(word, text) %>%
    mutate(book = title) %>%
    select(book, everything())
}) %>%
  bind_rows()

Harry Potter text

# Convert `book` to a factor with an explicit level order.
# NOTE(review): the levels are rev(titles), i.e. reverse publication order,
# although this comment originally said "keep books in order of publication".
# The plotting code re-levels with `levels = titles`; confirm which order is
# intended here.
series$book <- factor(series$book, levels = rev(titles))
series
## # A tibble: 1,089,386 x 3
##    book                chapter word   
##    <fct>                 <int> <chr>  
##  1 Philosopher's Stone       1 the    
##  2 Philosopher's Stone       1 boy    
##  3 Philosopher's Stone       1 who    
##  4 Philosopher's Stone       1 lived  
##  5 Philosopher's Stone       1 mr     
##  6 Philosopher's Stone       1 and    
##  7 Philosopher's Stone       1 mrs    
##  8 Philosopher's Stone       1 dursley
##  9 Philosopher's Stone       1 of     
## 10 Philosopher's Stone       1 number 
## # … with 1,089,376 more rows

Word Frequency

# Count how often each word occurs across all books, most frequent first.
count(series, word, sort = TRUE)
## # A tibble: 24,475 x 2
##    word      n
##    <chr> <int>
##  1 the   51593
##  2 and   27430
##  3 to    26985
##  4 of    21802
##  5 a     20966
##  6 he    20322
##  7 harry 16557
##  8 was   15631
##  9 said  14398
## 10 his   14264
## # … with 24,465 more rows

Word Frequency

# Drop the rows of `series` whose word appears in `stop_words`, then count
# the remaining words. No `by` is given, so the join column is chosen
# automatically and reported in the "Joining, by" message.
series %>%
        anti_join(stop_words) %>%
        count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 23,795 x 2
##    word           n
##    <chr>      <int>
##  1 harry      16557
##  2 ron         5750
##  3 hermione    4912
##  4 dumbledore  2873
##  5 looked      2344
##  6 professor   2006
##  7 hagrid      1732
##  8 time        1713
##  9 wand        1639
## 10 eyes        1604
## # … with 23,785 more rows

Word Frequency

# Top 10 most common non-stop-words in each book.
# top_n(10) is called without a weight column, so it ranks by `n` (hence
# the "Selecting by n" message); the result is still grouped by book.
series %>%
        anti_join(stop_words) %>%
        group_by(book) %>%
        count(word, sort = TRUE) %>%
        top_n(10)
## Joining, by = "word"
## Selecting by n
## # A tibble: 70 x 3
## # Groups:   book [7]
##    book                 word         n
##    <fct>                <chr>    <int>
##  1 Order of the Phoenix harry     3730
##  2 Goblet of Fire       harry     2936
##  3 Deathly Hallows      harry     2770
##  4 Half-Blood Prince    harry     2581
##  5 Prisoner of Azkaban  harry     1824
##  6 Chamber of Secrets   harry     1503
##  7 Order of the Phoenix hermione  1220
##  8 Philosopher's Stone  harry     1213
##  9 Order of the Phoenix ron       1189
## 10 Deathly Hallows      hermione  1077
## # … with 60 more rows

Visualize

# Bar chart of the 10 most common non-stop-words in each book, one facet
# per book.
series %>%
  anti_join(stop_words) %>%
  group_by(book) %>%
  count(word, sort = TRUE) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(
    # Facet the books in the order given by `titles`.
    book = factor(book, levels = titles),
    # Rows are sorted by descending n, so a descending index gives the most
    # frequent words the largest ordering value for reorder().
    # seq_len() stays well-defined even for an empty table.
    text_order = rev(seq_len(n()))
  ) %>%
  ggplot(aes(reorder(word, text_order), n, fill = book)) +
  geom_col() +
  facet_wrap(~ book, scales = "free_y") +
  # x = NULL removes the axis title; the string "NULL" would be printed
  # literally as the title.
  labs(x = NULL, y = "Frequency") +
  coord_flip() +
  theme(legend.position = "none")
## Joining, by = "word"
## Selecting by n

Visualize

## Joining, by = "word"
## Selecting by n