Chapter 4 Data Transformation

The 5 verbs of data wrangling

library(nycflights13)
library(tidyverse)

We will continue to work with the flights dataset that is in the nycflights13 package.

# Auto-print the flights data set to inspect it.
flights

Change the code from the Transformation presentation to use the pipe %>%.

filter()

# Keep only the rows where month is 1 and day is 1.
flights %>%
  filter(month == 1, day == 1)

arrange()

# Sort the rows by year, then month, then day (ascending).
flights %>%
  arrange(year, month, day)

arrange()

# Sort the rows by departure delay, largest first.
flights %>%
  arrange(desc(dep_delay))

select()

# Keep only the year, month, and day columns.
flights %>%
  select(year, month, day)

select()

# Move time_hour and air_time to the front; everything() keeps the
# remaining columns after them.
flights %>%
  select(time_hour, air_time, everything())

mutate()

# Narrow the table to year through day, the columns whose names end in
# "delay", distance, and air_time.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# Add two derived columns: gain (dep_delay minus arr_delay) and
# speed (distance divided by air_time, times 60).
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )

summarize()

# Mean departure delay over all flights, ignoring missing values.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))

# Group by year, month, and day, then take the mean departure delay per group.
by_day <- flights %>%
  group_by(year, month, day)
by_day %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))

Combining multiple operations using functions and assignment <-

# Step 1: group the flights by destination.
by_dest <- group_by(.data = flights, dest)

# Step 2: per destination, count the flights and take the mean distance and
# mean arrival delay (missing values removed).
delay <- summarise(
  .data = by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Step 3: keep destinations with more than 20 flights, excluding "HNL".
delay <- filter(.data = delay, count > 20 & dest != "HNL")

Combining multiple operations using functions and assignment <-, note the ggplot “piping” using the +

# Plot delay against dist from the `delay` summary; point size follows count,
# and a smoother is drawn without its standard-error band.
ggplot(delay, aes(dist, delay)) +
  geom_point(mapping = aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)

It looks like delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there’s more ability to make up delays in the air?

geom_smooth() using method = ‘loess’ and formula ‘y ~ x’

Combining multiple operations with the pipe %>%

Does this code read better?

# Same computation as one pipeline: group by destination, summarise each
# group, then keep destinations with more than 20 flights other than "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")

# Show the first rows of the result.
delays %>%
  head()
LS0tCnRpdGxlOiAiVHJhbnNmb3JtYXRpb24iCmF1dGhvcjogIlByb2YuIEVyaWMgQS4gU3Vlc3MiCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgojIENoYXB0ZXIgNCBEYXRhIFRyYW5zZm9ybWF0aW9uCgpUaGUgNSB2ZXJicyBvZiBkYXRhIHdyYW5nbGluZwoKLSBQaWNrIG9ic2VydmF0aW9ucyBieSB0aGVpciB2YWx1ZXMgKCoqZmlsdGVyKCkqKikuCi0gUmVvcmRlciB0aGUgcm93cyAoKiphcnJhbmdlKCkqKikuCi0gUGljayB2YXJpYWJsZXMgYnkgdGhlaXIgbmFtZXMgKCoqc2VsZWN0KCkqKikuCi0gQ3JlYXRlIG5ldyB2YXJpYWJsZXMgd2l0aCBmdW5jdGlvbnMgb2YgZXhpc3RpbmcgdmFyaWFibGVzICgqKm11dGF0ZSgpKiopLgotIENvbGxhcHNlIG1hbnkgdmFsdWVzIGRvd24gdG8gYSBzaW5nbGUgc3VtbWFyeSAoKipzdW1tYXJpc2UoKSoqKS4KLSAoKipncm91cF9ieSgpKiopCgoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeShueWNmbGlnaHRzMTMpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCldlIHdpbGwgY29udGludWUgdG8gd29yayB3aXRoIHRoZSAqZmxpZ2h0cyogZGF0YXNldCB0aGF0IGlzIGluIHRoZSBnZ3Bsb3QyIHBhY2thZ2UuCgpgYGB7cn0KZmxpZ2h0cwpgYGAKCkNoYW5nZSB0aGUgY29kZSBmcm9tIHRoZSBUcmFuc2Zvcm1hdGlvbiBwcmVzZW50YXRpb24gdG8gdXNpbmcgdGhlIHBpcGUgJT4lLgoKCiMjIGZpbHRlcigpCgpgYGB7ciBlY2hvPVRSVUV9CmZpbHRlcihmbGlnaHRzLCBtb250aCA9PSAxLCBkYXkgPT0gMSkKYGBgCgojIyBhcnJhbmdlKCkKCmBgYHtyIGVjaG89VFJVRX0KYXJyYW5nZShmbGlnaHRzLCB5ZWFyLCBtb250aCwgZGF5KQpgYGAKCiMjIGFycmFuZ2UoKQoKYGBge3IgZWNobz1UUlVFfQphcnJhbmdlKGZsaWdodHMsIGRlc2MoZGVwX2RlbGF5KSkKYGBgCgojIyBzZWxlY3QoKQoKYGBge3IgZWNobz1UUlVFfQpzZWxlY3QoZmxpZ2h0cywgeWVhciwgbW9udGgsIGRheSkKYGBgCgojIyBzZWxlY3QoKQoKYGBge3IgZWNobz1UUlVFfQpzZWxlY3QoZmxpZ2h0cywgdGltZV9ob3VyLCBhaXJfdGltZSwgZXZlcnl0aGluZygpKQpgYGAKCiMjIG11dGF0ZSgpCgpgYGB7ciBlY2hvPVRSVUV9CmZsaWdodHNfc21sIDwtIHNlbGVjdChmbGlnaHRzLCAKICB5ZWFyOmRheSwgCiAgZW5kc193aXRoKCJkZWxheSIpLCAKICBkaXN0YW5jZSwgCiAgYWlyX3RpbWUKKQptdXRhdGUoZmxpZ2h0c19zbWwsCiAgZ2FpbiA9IGRlcF9kZWxheSAtIGFycl9kZWxheSwKICBzcGVlZCA9IGRpc3RhbmNlIC8gYWlyX3RpbWUgKiA2MAopCmBgYAoKIyMgc3VtbWFyaXplKCkKCmBgYHtyIGVjaG89VFJVRX0Kc3VtbWFyaXNlKGZsaWdodHMsIGRlbGF5ID0gbWVhbihkZXBfZGVsYXksIG5hLnJtID0gVFJVRSkpCgpieV9kYXkgPC0gZ3JvdXBfYnkoZmxpZ2h0
cywgeWVhciwgbW9udGgsIGRheSkKc3VtbWFyaXNlKGJ5X2RheSwgZGVsYXkgPSBtZWFuKGRlcF9kZWxheSwgbmEucm0gPSBUUlVFKSkKYGBgCgojIyBDb21iaW5pbmcgbXVsdGlwbGUgb3BlcmF0aW9ucyB1c2luZyBmdW5jdGlvbnMgYW5kIGFzc2lnbm1lbnQgPC0gCgpgYGB7ciBlY2hvPVRSVUV9CmJ5X2Rlc3QgPC0gZ3JvdXBfYnkoZmxpZ2h0cywgZGVzdCkKZGVsYXkgPC0gc3VtbWFyaXNlKGJ5X2Rlc3QsCiAgY291bnQgPSBuKCksCiAgZGlzdCA9IG1lYW4oZGlzdGFuY2UsIG5hLnJtID0gVFJVRSksCiAgZGVsYXkgPSBtZWFuKGFycl9kZWxheSwgbmEucm0gPSBUUlVFKQopCmRlbGF5IDwtIGZpbHRlcihkZWxheSwgY291bnQgPiAyMCwgZGVzdCAhPSAiSE5MIikKCmBgYAoKIyMgQ29tYmluaW5nIG11bHRpcGxlIG9wZXJhdGlvbnMgdXNpbmcgZnVuY3Rpb25zIGFuZCBhc3NpZ25tZW50IDwtLCBub3RlIHRoZSBnZ3Bsb3QgInBpcGluZyIgdXNpbmcgdGhlICsKCmBgYHtyIGVjaG89VFJVRSwgZXZhbD1GQUxTRX0KZ2dwbG90KGRhdGEgPSBkZWxheSwgbWFwcGluZyA9IGFlcyh4ID0gZGlzdCwgeSA9IGRlbGF5KSkgKwogIGdlb21fcG9pbnQoYWVzKHNpemUgPSBjb3VudCksIGFscGhhID0gMS8zKSArCiAgZ2VvbV9zbW9vdGgoc2UgPSBGQUxTRSkKYGBgCgoKSXQgbG9va3MgbGlrZSBkZWxheXMgaW5jcmVhc2Ugd2l0aCBkaXN0YW5jZSB1cCB0byB+NzUwIG1pbGVzIAphbmQgdGhlbiBkZWNyZWFzZS4gTWF5YmUgYXMgZmxpZ2h0cyBnZXQgbG9uZ2VyIHRoZXJlJ3MgbW9yZSAKYWJpbGl0eSB0byBtYWtlIHVwIGRlbGF5cyBpbiB0aGUgYWlyPwoKPiBgZ2VvbV9zbW9vdGgoKWAgdXNpbmcgbWV0aG9kID0gJ2xvZXNzJyBhbmQgZm9ybXVsYSAneSB+IHgnCgojIyBDb21iaW5pbmcgbXVsdGlwbGUgb3BlcmF0aW9ucyB3aXRoIHRoZSBwaXBlICU+JQoKRG9lcyB0aGlzIGNvZGUgcmVhZCBiZXR0ZXI/CgpgYGB7ciBlY2hvPVRSVUV9CmRlbGF5cyA8LSBmbGlnaHRzICU+JSAKICBncm91cF9ieShkZXN0KSAlPiUgCiAgc3VtbWFyaXNlKAogICAgY291bnQgPSBuKCksCiAgICBkaXN0ID0gbWVhbihkaXN0YW5jZSwgbmEucm0gPSBUUlVFKSwKICAgIGRlbGF5ID0gbWVhbihhcnJfZGVsYXksIG5hLnJtID0gVFJVRSkKICApICU+JSAKICBmaWx0ZXIoY291bnQgPiAyMCwgZGVzdCAhPSAiSE5MIikKCmhlYWQoZGVsYXlzKQpgYGAKCgoK