October 21, 2019

Strings

These notes follow Chapter 14 ("Strings") of R for Data Science (r4ds).

This is an introduction to regular expressions for working with text strings.

According to the authors, "When you first look at a regexp, you’ll think a cat walked across your keyboard, but as your understanding improves they will soon start to make sense."

stringr package

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringr)

# Double quotes are the default way to write a string literal.
string1 <- "This is a string."
# Single quotes delimit this literal so the double quotes inside need no escaping.
string2 <- 'If I want to include a "quote" inside a string, I use single quotes.'

# str_length() reports the number of characters in each string.
str_length(string1)
## [1] 17
str_length(string2)
## [1] 68

Combining strings

# Join the two strings into one, separated by a single space.
str_c(string1, string2, sep = " ")
## [1] "This is a string. If I want to include a \"quote\" inside a string, I use single quotes."

Write a sentence with values

# Values to interpolate into the greeting.
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

# str_c() drops NULL arguments, so the birthday clause only appears when
# `birthday` is TRUE.
str_c(
  "Good ",
  time_of_day,
  " ",
  name,
  if (birthday) " and HAPPY BIRTHDAY" else NULL,
  "."
)
## [1] "Good morning Hadley."

substrings

x <- c("Apple", "Banana", "Pear")
# Take characters 1 through 3 (inclusive) of every element.
str_sub(x, start = 1, end = 3)
## [1] "App" "Ban" "Pea"

sort

x <- c("apple", "eggplant", "banana")

# Sort alphabetically using the English ("en") locale's ordering rules.
str_sort(x, locale = "en")
## [1] "apple"    "banana"   "eggplant"

Pattern matching

x <- c("apple", "banana", "pear")
# Show where the literal text "an" matches in each string.
str_view(x, "an")

`.` matches any single character except a newline.

# "." stands for any single character, so ".a." is an "a" with one character on each side.
str_view(x, ".a.")

x <- c("apple pie", "apple", "apple cake")
# Without anchors, "apple" matches anywhere, so all three strings match.
str_view(x, "apple")

  • ^ to match the start of the string.
  • $ to match the end of the string.
# Anchoring both ends requires the whole string to be exactly "apple".
str_view(x, "^apple$")

# "[.]" is a character class holding a literal dot, so only "a.c" matches.
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
# "(e|a)" is an alternation: it accepts either spelling, "grey" or "gray".
str_view(c("grey", "gray"), "gr(e|a)y")

Matches

x <- c("apple", "banana", "pear")
# str_detect() returns one logical per element: TRUE where the pattern is found.
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE

How many common words start with t?

# TRUE counts as 1, so summing the matches counts the words starting with "t".
words %>%
  str_detect("^t") %>%
  sum()
## [1] 65

What proportion of common words end with a vowel?

# The mean of a logical vector is the share of TRUE values.
words %>%
  str_detect("[aeiou]$") %>%
  mean()
## [1] 0.2765306

Find words that end with an x.

# Pair every common word with its position in `words`.
df <- tibble(word = words, i = seq_along(word))

# Keep only the rows whose word ends in "x".
filter(df, str_detect(word, "x$"))
## # A tibble: 4 x 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841

Replace matches

x <- c("1 house", "2 cars", "3 people")
# A named vector maps each pattern (the name) to its replacement (the value).
str_replace_all(
  x,
  pattern = c("1" = "one", "2" = "two", "3" = "three")
)
## [1] "one house"    "two cars"     "three people"

Splitting

# Split each of the first five sentences into words at every space.
str_split(head(sentences, n = 5), pattern = " ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."