Chapter 4 Data Transformation
The 5 verbs of data wrangling
- Pick observations by their values (filter()).
- Reorder the rows (arrange()).
- Pick variables by their names (select()).
- Create new variables with functions of existing variables (mutate()).
- Collapse many values down to a single summary (summarise()).
- Change the scope of the verbs above so they operate group-by-group (group_by()).
library(nycflights13)
library(tidyverse)
We will continue to work with the flights dataset that is in the nycflights13 package.
# Evaluate the bare name so the flights data is auto-printed for inspection.
flights
Change the code from the Transformation presentation to using the pipe %>%.
filter()
# Keep only the rows where month is 1 and day is 1 (January 1st).
flights %>%
  filter(month == 1, day == 1)
arrange()
# Sort the rows by year, then month, then day, in ascending order.
flights %>%
  arrange(year, month, day)
arrange()
# Sort the rows from the largest dep_delay to the smallest.
flights %>%
  arrange(desc(dep_delay))
select()
# Keep just the year, month and day columns.
flights %>%
  select(year, month, day)
select()
# Move time_hour and air_time to the front; everything() keeps the
# remaining columns after them.
flights %>%
  select(time_hour, air_time, everything())
mutate()
# Narrow the data to the columns from year through day, every column whose
# name ends in "delay", plus distance and air_time.
flights_sml <- flights %>%
  select(year:day, ends_with("delay"), distance, air_time)

# Add two derived columns to the narrowed table:
#   gain  - dep_delay minus arr_delay
#   speed - distance divided by air_time, scaled by 60
flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
summarize()
# Mean dep_delay over the whole table, ignoring missing values.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))

# The same summary, computed separately for each (year, month, day) group.
by_day <- flights %>%
  group_by(year, month, day)

by_day %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
Combining multiple operations using functions and assignment <-
# Step 1: group the flights by destination.
by_dest <- group_by(flights, dest)

# Step 2: per destination, compute the number of rows, the mean distance
# and the mean arr_delay (missing values ignored).
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Step 3: keep destinations with more than 20 rows, excluding "HNL".
delay <- filter(delay, count > 20 & dest != "HNL")
Combining multiple operations using functions and assignment <-, note the ggplot “piping” using the +
# Plot the per-destination delay against dist: points are sized by count
# and drawn semi-transparent, with a smoothed trend line (no error band).
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
It looks like delays increase with distance up to ~750 miles and then decrease. Maybe as flights get longer there’s more ability to make up delays in the air?
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Combining multiple operations with the pipe %>%
Does this code read better?
# One pipeline: group by destination, summarise each group, then keep
# destinations with more than 20 rows, excluding "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")

# Show the first rows of the result.
delays %>%
  head()
LS0tCnRpdGxlOiAiVHJhbnNmb3JtYXRpb24iCmF1dGhvcjogIlByb2YuIEVyaWMgQS4gU3Vlc3MiCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgZGZfcHJpbnQ6IHBhZ2VkCiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAogIHdvcmRfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgojIENoYXB0ZXIgNCBEYXRhIFRyYW5zZm9ybWF0aW9uCgpUaGUgNSB2ZXJicyBvZiBkYXRhIHdyYW5nbGluZwoKLSBQaWNrIG9ic2VydmF0aW9ucyBieSB0aGVpciB2YWx1ZXMgKCoqZmlsdGVyKCkqKikuCi0gUmVvcmRlciB0aGUgcm93cyAoKiphcnJhbmdlKCkqKikuCi0gUGljayB2YXJpYWJsZXMgYnkgdGhlaXIgbmFtZXMgKCoqc2VsZWN0KCkqKikuCi0gQ3JlYXRlIG5ldyB2YXJpYWJsZXMgd2l0aCBmdW5jdGlvbnMgb2YgZXhpc3RpbmcgdmFyaWFibGVzICgqKm11dGF0ZSgpKiopLgotIENvbGxhcHNlIG1hbnkgdmFsdWVzIGRvd24gdG8gYSBzaW5nbGUgc3VtbWFyeSAoKipzdW1tYXJpc2UoKSoqKS4KLSAoKipncm91cF9ieSgpKiopCgoKYGBge3IgbWVzc2FnZT1GQUxTRX0KbGlicmFyeShueWNmbGlnaHRzMTMpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCldlIHdpbGwgY29udGludWUgdG8gd29yayB3aXRoIHRoZSAqZmxpZ2h0cyogZGF0YXNldCB0aGF0IGlzIGluIHRoZSBnZ3Bsb3QyIHBhY2thZ2UuCgpgYGB7cn0KZmxpZ2h0cwpgYGAKCkNoYW5nZSB0aGUgY29kZSBmcm9tIHRoZSBUcmFuc2Zvcm1hdGlvbiBwcmVzZW50YXRpb24gdG8gdXNpbmcgdGhlIHBpcGUgJT4lLgoKCiMjIGZpbHRlcigpCgpgYGB7ciBlY2hvPVRSVUV9CmZpbHRlcihmbGlnaHRzLCBtb250aCA9PSAxLCBkYXkgPT0gMSkKYGBgCgojIyBhcnJhbmdlKCkKCmBgYHtyIGVjaG89VFJVRX0KYXJyYW5nZShmbGlnaHRzLCB5ZWFyLCBtb250aCwgZGF5KQpgYGAKCiMjIGFycmFuZ2UoKQoKYGBge3IgZWNobz1UUlVFfQphcnJhbmdlKGZsaWdodHMsIGRlc2MoZGVwX2RlbGF5KSkKYGBgCgojIyBzZWxlY3QoKQoKYGBge3IgZWNobz1UUlVFfQpzZWxlY3QoZmxpZ2h0cywgeWVhciwgbW9udGgsIGRheSkKYGBgCgojIyBzZWxlY3QoKQoKYGBge3IgZWNobz1UUlVFfQpzZWxlY3QoZmxpZ2h0cywgdGltZV9ob3VyLCBhaXJfdGltZSwgZXZlcnl0aGluZygpKQpgYGAKCiMjIG11dGF0ZSgpCgpgYGB7ciBlY2hvPVRSVUV9CmZsaWdodHNfc21sIDwtIHNlbGVjdChmbGlnaHRzLCAKICB5ZWFyOmRheSwgCiAgZW5kc193aXRoKCJkZWxheSIpLCAKICBkaXN0YW5jZSwgCiAgYWlyX3RpbWUKKQptdXRhdGUoZmxpZ2h0c19zbWwsCiAgZ2FpbiA9IGRlcF9kZWxheSAtIGFycl9kZWxheSwKICBzcGVlZCA9IGRpc3RhbmNlIC8gYWlyX3RpbWUgKiA2MAopCmBgYAoKIyMgc3VtbWFyaXplKCkKCmBgYHtyIGVjaG89VFJVRX0Kc3VtbWFyaXNlKGZsaWdodHMsIGRlbGF5ID0gbWVhbihkZXBfZGVsYXksIG5hLnJtID0gVFJVRSkpCgpieV9kYXkgPC0gZ3JvdXBfYnkoZmxpZ2h0
cywgeWVhciwgbW9udGgsIGRheSkKc3VtbWFyaXNlKGJ5X2RheSwgZGVsYXkgPSBtZWFuKGRlcF9kZWxheSwgbmEucm0gPSBUUlVFKSkKYGBgCgojIyBDb21iaW5pbmcgbXVsdGlwbGUgb3BlcmF0aW9ucyB1c2luZyBmdW5jdGlvbnMgYW5kIGFzc2lnbm1lbnQgPC0gCgpgYGB7ciBlY2hvPVRSVUV9CmJ5X2Rlc3QgPC0gZ3JvdXBfYnkoZmxpZ2h0cywgZGVzdCkKZGVsYXkgPC0gc3VtbWFyaXNlKGJ5X2Rlc3QsCiAgY291bnQgPSBuKCksCiAgZGlzdCA9IG1lYW4oZGlzdGFuY2UsIG5hLnJtID0gVFJVRSksCiAgZGVsYXkgPSBtZWFuKGFycl9kZWxheSwgbmEucm0gPSBUUlVFKQopCmRlbGF5IDwtIGZpbHRlcihkZWxheSwgY291bnQgPiAyMCwgZGVzdCAhPSAiSE5MIikKCmBgYAoKIyMgQ29tYmluaW5nIG11bHRpcGxlIG9wZXJhdGlvbnMgdXNpbmcgZnVuY3Rpb25zIGFuZCBhc3NpZ25tZW50IDwtLCBub3RlIHRoZSBnZ3Bsb3QgInBpcGluZyIgdXNpbmcgdGhlICsKCmBgYHtyIGVjaG89VFJVRSwgZXZhbD1GQUxTRX0KZ2dwbG90KGRhdGEgPSBkZWxheSwgbWFwcGluZyA9IGFlcyh4ID0gZGlzdCwgeSA9IGRlbGF5KSkgKwogIGdlb21fcG9pbnQoYWVzKHNpemUgPSBjb3VudCksIGFscGhhID0gMS8zKSArCiAgZ2VvbV9zbW9vdGgoc2UgPSBGQUxTRSkKYGBgCgoKSXQgbG9va3MgbGlrZSBkZWxheXMgaW5jcmVhc2Ugd2l0aCBkaXN0YW5jZSB1cCB0byB+NzUwIG1pbGVzIAphbmQgdGhlbiBkZWNyZWFzZS4gTWF5YmUgYXMgZmxpZ2h0cyBnZXQgbG9uZ2VyIHRoZXJlJ3MgbW9yZSAKYWJpbGl0eSB0byBtYWtlIHVwIGRlbGF5cyBpbiB0aGUgYWlyPwoKPiBgZ2VvbV9zbW9vdGgoKWAgdXNpbmcgbWV0aG9kID0gJ2xvZXNzJyBhbmQgZm9ybXVsYSAneSB+IHgnCgojIyBDb21iaW5pbmcgbXVsdGlwbGUgb3BlcmF0aW9ucyB3aXRoIHRoZSBwaXBlICU+JQoKRG9lcyB0aGlzIGNvZGUgcmVhZCBiZXR0ZXI/CgpgYGB7ciBlY2hvPVRSVUV9CmRlbGF5cyA8LSBmbGlnaHRzICU+JSAKICBncm91cF9ieShkZXN0KSAlPiUgCiAgc3VtbWFyaXNlKAogICAgY291bnQgPSBuKCksCiAgICBkaXN0ID0gbWVhbihkaXN0YW5jZSwgbmEucm0gPSBUUlVFKSwKICAgIGRlbGF5ID0gbWVhbihhcnJfZGVsYXksIG5hLnJtID0gVFJVRSkKICApICU+JSAKICBmaWx0ZXIoY291bnQgPiAyMCwgZGVzdCAhPSAiSE5MIikKCmhlYWQoZGVsYXlzKQpgYGAKCgoK