--- title: "Ford Go Bike" output: html_notebook: default pdf_document: default --- In this notebook I download and unzip the [Ford Go Bike](https://www.baywheels.com/) [data](https://www.baywheels.com/system-data). ```{r} library(tidyverse) library(tictoc) library(skimr) library(lubridate) library(forcats) ``` Create a directory data in your directory, as a subdirectory, within your working directory. Of use a Project and delete the previous code chunk. Download the files into the data directory. First one is not zipped, the remaining are zipped. ```{r} URL <- "https://s3.amazonaws.com/baywheels-data/201905-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201905-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201906-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201906-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201907-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201907-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201908-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201908-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201909-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201909-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201910-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201910-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201911-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201911-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/201912-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/201912-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202001-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202001-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202002-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202002-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202003-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202003-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202004-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202004-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202005-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202005-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202006-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202006-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202007-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202007-baywheels-tripdata.csv.zip", method="curl") URL <- "https://s3.amazonaws.com/baywheels-data/202008-baywheels-tripdata.csv.zip" download.file(URL, destfile = "./data/202008-baywheels-tripdata.csv.zip", method="curl") ``` Loop over the one value in the url and filename that changes. ```{r, eval=FALSE} for (i in 5:9) { URL <- paste0("https://s3.amazonaws.com/baywheels-data/20190",i,"-baywheels-tripdata.csv.zip") download.file(URL, destfile = paste0("./data/20190",i,"-baywheels-tripdata.csv.zip"), method="curl") } for (i in 10:12) { URL <- paste0("https://s3.amazonaws.com/baywheels-data/2019",i,"-baywheels-tripdata.csv.zip") download.file(URL, destfile = paste0("./data/2019",i,"-baywheels-tripdata.csv.zip"), method="curl") } for (i in 1:8) { URL <- paste0("https://s3.amazonaws.com/baywheels-data/20200",i,"-baywheels-tripdata.csv.zip") download.file(URL, destfile = paste0("./data/20200",i,"-baywheels-tripdata.csv.zip"), method="curl") } ``` Unzip downloaded files. ```{r} unzip("./data/201905-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201906-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201907-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201908-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201909-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201910-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201911-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/201912-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202001-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202002-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202003-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202004-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202005-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202006-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202007-baywheels-tripdata.csv.zip",exdir="./data") unzip("./data/202008-baywheels-tripdata.csv.zip",exdir="./data") ``` Clean up data directory. ```{r} fn <- "./data/201905-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201906-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201907-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201908-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201909-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201910-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201911-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/201912-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202001-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202002-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202003-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202004-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202005-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202006-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202007-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) fn <- "./data/202008-baywheels-tripdata.csv.zip" if (file.exists(fn)) file.remove(fn) ``` Read the.csv files ```{r message=FALSE} baywheels201905 <- read_csv(file="./data/201905-baywheels-tripdata.csv") baywheels201906 <- read_csv(file="./data/201906-baywheels-tripdata.csv") baywheels201907 <- read_csv(file="./data/201907-baywheels-tripdata.csv") baywheels201908 <- read_csv(file="./data/201908-baywheels-tripdata.csv") baywheels201909 <- read_csv(file="./data/201909-baywheels-tripdata.csv") baywheels201910 <- read_csv(file="./data/201910-baywheels-tripdata.csv") baywheels201911 <- read_csv(file="./data/201911-baywheels-tripdata.csv") baywheels201912 <- read_csv(file="./data/201912-baywheels-tripdata.csv") baywheels202001 <- read_csv(file="./data/202001-baywheels-tripdata.csv") baywheels202002 <- read_csv(file="./data/202002-baywheels-tripdata.csv") baywheels202003 <- read_csv(file="./data/202003-baywheels-tripdata.csv") baywheels202004 <- read_csv(file="./data/202004-baywheels-tripdata.csv") baywheels202005 <- read_csv(file="./data/202005-baywheels-tripdata.csv") baywheels202006 <- read_csv(file="./data/202006-baywheels-tripdata.csv") baywheels202007 <- read_csv(file="./data/202007-baywheels-tripdata.csv") baywheels202008 <- read_csv(file="./data/202008-baywheels-tripdata.csv") ``` Check the head() and tail() of the data.frames that are loaded. ```{r} head(baywheels201905) head(baywheels201906) head(baywheels201907) head(baywheels201908) head(baywheels201909) head(baywheels201910) head(baywheels201911) head(baywheels201912) head(baywheels202001) head(baywheels202002) head(baywheels202003) head(baywheels202004) head(baywheels202005) head(baywheels202006) head(baywheels202007) head(baywheels202008) ``` ```{r} tail(baywheels201905) tail(baywheels201906) tail(baywheels201907) tail(baywheels201908) tail(baywheels201909) tail(baywheels201910) tail(baywheels201911) tail(baywheels201912) tail(baywheels202001) tail(baywheels202002) tail(baywheels202003) tail(baywheels202004) tail(baywheels202005) tail(baywheels202006) tail(baywheels202007) tail(baywheels202008) ``` ```{r} x <- as.numeric(c( baywheels201905 %>% count(), baywheels201906 %>% count(), baywheels201907 %>% count(), baywheels201908 %>% count(), baywheels201909 %>% count(), baywheels201910 %>% count(), baywheels201911 %>% count(), baywheels201912 %>% count(), baywheels202001 %>% count(), baywheels202002 %>% count(), baywheels202003 %>% count(), baywheels202004 %>% count(), baywheels202005 %>% count(), baywheels202006 %>% count(), baywheels202007 %>% count(), baywheels202008 %>% count() )) sum(x) ``` The end_station_id have been updated. ```{r} glimpse(baywheels201905) glimpse(baywheels201906) glimpse(baywheels201907) baywheels2019 <- bind_rows(baywheels201905, baywheels201906, baywheels201907, baywheels201908, baywheels201909, baywheels201910, baywheels201911, baywheels201912) dim(baywheels2019) baywheels2020 <- bind_rows(baywheels202001, baywheels202002, baywheels202003, baywheels202004, baywheels202005, baywheels202006, baywheels202007, baywheels202008) dim(baywheels2020) ``` ```{r} baywheels2019 %>% select(start_station_id,start_station_name, start_station_latitude,start_station_longitude) %>% arrange(start_station_id) %>% distinct() %>% head() ``` ```{r} dim(baywheels2019) baywheels2019 %>% count() dim(baywheels2020) baywheels2020 %>% count() baywheels <- bind_rows(baywheels2019, baywheels2020) baywheels %>% count() dim(baywheels) baywheels <- baywheels %>% mutate(year=year(start_time), month=month(start_time), day=day(start_time) ) baywheels %>% count() dim(baywheels) baywheels <- baywheels %>% mutate(week_day = wday(start_time) ) levels <- c("M","T","W","TH","F","SAT","SUN") baywheels <- baywheels %>% mutate( week_day = factor(week_day, levels = levels) ) baywheels %>% count() dim(baywheels) ``` ```{r} today() now() ``` Year and day of week. ```{r} baywheels %>% ggplot(aes(x=year)) + geom_bar() baywheels %>% ggplot(aes(x=month)) + geom_bar() + facet_grid(year ~ .) baywheels %>% ggplot(aes(x=day)) + geom_bar() + facet_grid(year ~ .) ``` ```{r} baywheels <- baywheels %>% filter(start_station_latitude < 38 & start_station_longitude < 120 ) baywheels_subset <- baywheels %>% select(start_station_longitude,start_station_latitude) baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) + geom_point() ``` ### There seem to be too many columns. We should check to make sure all of the data files have the same columns. Check the head() and tail() of the data.frames that are loaded. ```{r} head(baywheels201905) head(baywheels201906) head(baywheels201907) head(baywheels201908) head(baywheels201909) head(baywheels201910) head(baywheels201911) head(baywheels201912) head(baywheels202001) head(baywheels202002) head(baywheels202003) head(baywheels202004) head(baywheels202005) head(baywheels202006) head(baywheels202007) head(baywheels202008) ``` ```{r} tail(baywheels201905) tail(baywheels201906) tail(baywheels201907) tail(baywheels201908) tail(baywheels201909) tail(baywheels201910) tail(baywheels201911) tail(baywheels201912) tail(baywheels202001) tail(baywheels202002) tail(baywheels202003) tail(baywheels202004) tail(baywheels202005) tail(baywheels202006) tail(baywheels202007) tail(baywheels202008) ``` Make a list of the variable names. Do the data files contain the same columns? If not, what are the difference? ```{r} bwn <- list( baywheels201905 %>% names(), baywheels201906 %>% names(), baywheels201907 %>% names(), baywheels201908 %>% names(), baywheels201909 %>% names(), baywheels201910 %>% names(), baywheels201911 %>% names(), baywheels201912 %>% names(), baywheels202001 %>% names(), baywheels202002 %>% names(), baywheels202003 %>% names(), baywheels202004 %>% names(), baywheels202005 %>% names(), baywheels202006 %>% names(), baywheels202007 %>% names(), baywheels202008 %>% names() ) bwn ``` ```{r} x <- as.numeric(c( baywheels201905 %>% count(), baywheels201906 %>% count(), baywheels201907 %>% count(), baywheels201908 %>% count(), baywheels201909 %>% count(), baywheels201910 %>% count(), baywheels201911 %>% count(), baywheels201912 %>% count(), baywheels202001 %>% count(), baywheels202002 %>% count(), baywheels202003 %>% count(), baywheels202004 %>% count(), baywheels202005 %>% count(), baywheels202006 %>% count(), baywheels202007 %>% count(), baywheels202008 %>% count() )) sum(x) ``` There was a change in the variables in April 2020. ```{r} glimpse(baywheels201905) dim(baywheels201905) glimpse(baywheels201906) dim(baywheels201906) glimpse(baywheels201907) dim(baywheels201907) baywheels2019 <- bind_rows(baywheels201905, baywheels201906) dim(baywheels2019) baywheels2019 <- bind_rows(baywheels201905, baywheels201906, baywheels201907, baywheels201908, baywheels201909, baywheels201910, baywheels201911, baywheels201912) dim(baywheels2019) baywheels2020 <- bind_rows(baywheels202001, baywheels202002, baywheels202003) dim(baywheels2020) baywheels <- bind_rows(baywheels2019, baywheels2020) dim(baywheels) baywheels2020a <- bind_rows( baywheels202004, baywheels202005, baywheels202006, baywheels202007, baywheels202008) dim(baywheels2020a) ``` ```{r} bwna <- list( baywheels %>% names(), baywheels2020a %>% names() ) bwna identical(bwna[[1]], bwna[[2]]) intersect(bwna[[1]], bwna[[2]]) setdiff(bwna[[1]], bwna[[2]]) setdiff(bwna[[2]], bwna[[1]]) ``` ```{r} baywheels2020b <- baywheels2020a %>% rename( "start_time" = "started_at", "end_time" = "ended_at", "start_station_latitude" = "start_lat", "start_station_longitude" = "start_lng", "end_station_latitude" = "end_lat", "end_station_longitude" = "end_lng" ) baywheels2020b %>% names() ``` ```{r} baywheels %>% select(user_type) %>% table() baywheels2020b %>% select(member_casual) %>% table() ``` ```{r} baywheels2020b <- baywheels2020b %>% rename( "user_type" = "member_casual" ) baywheels2020b %>% names() baywheels2020c <- baywheels2020b %>% mutate( user_type = as_factor(user_type) ) baywheels2020c %>% select(user_type) %>% table() baywheels2020d <- baywheels2020c %>% mutate(user_type = fct_recode( user_type, Customer = "casual", Subscriber = "member" ) ) baywheels2020d %>% select(user_type) %>% table() baywheels2020e <- baywheels2020d %>% mutate(user_type = as.character(user_type)) ``` ```{r} setdiff(names(baywheels), names(baywheels2020e)) setdiff(names(baywheels2020e), names(baywheels)) baywheels2020f <- baywheels2020e %>% mutate(duration_sec = as.numeric(difftime(end_time, start_time, units = "secs"))) baywheels <- bind_rows(baywheels, baywheels2020f) dim(baywheels) baywheels %>% names() ``` ```{r} baywheels2 <- baywheels %>% mutate(year=year(start_time), month=month(start_time), day=day(start_time) ) baywheels %>% count() dim(baywheels2) baywheels3 <- baywheels2 %>% mutate(week_day = wday(start_time) ) levels <- c("M","T","W","TH","F","SAT","SUN") baywheels3 <- baywheels3 %>% mutate( week_day = factor(week_day, levels = levels) ) baywheels3 %>% count() dim(baywheels3) ``` ```{r} baywheels3 %>% select(start_station_id, start_station_name, start_station_latitude, start_station_longitude) %>% arrange(start_station_id) %>% distinct() %>% head() ``` ```{r} today() now() ``` Year and day of week. ```{r} baywheels3 %>% ggplot(aes(x=year)) + geom_bar() baywheels3 %>% ggplot(aes(x=month)) + geom_bar() + facet_grid(year ~ .) baywheels3 %>% ggplot(aes(x=day)) + geom_bar() + facet_grid(year ~ .) ``` ```{r} baywheels3 <- baywheels3 %>% filter(start_station_latitude < 38 & start_station_longitude < 120 ) baywheels_subset <- baywheels3 %>% select(start_station_longitude,start_station_latitude) baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) + geom_point() ``` ```{r} library(biganalytics) # run in parallel, the doMC package runs on Windows library(doParallel) registerDoParallel(cores = 8) head(baywheels3) baywheels_subset2 <- as.matrix(baywheels_subset) set.seed <- 123454321 tic() cl <- bigkmeans(baywheels_subset2, 3, nstart=8) toc() head(cl$cluster) cl$centers baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude, color=cl$cluster)) + geom_point() baywheels3 <- baywheels3 %>% mutate(clust = cl$cluster) ``` ```{r} # City of Oakland c(-122.2711, 37.8044) ) # https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering cl$centers closest.cluster <- function(x) { cluster.dist <- apply(cl$centers, 1, function(y) sqrt(sum((x-y)^2))) return(which.min(cluster.dist)[1]) } oak <- closest.cluster(c(-122.2711, 37.8044)) oak oakland <- baywheels3 %>% filter(clust == oak) oakland %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) + geom_point() ``` ```{r} tic() cl.km <- kmeans(baywheels_subset, 3) toc() cl.km$centers # City of Oakland c(-122.2711, 37.8044) ) baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude, color=cl.km$cluster)) + geom_point() ``` Duration of rides in the Bay Area ```{r} baywheels %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) baywheels %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) ``` Durations of rides in Oakland ```{r} oakland %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) ``` Duration by City ```{r} baywheels3 %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) + facet_grid(clust ~ .) baywheels3 %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) + facet_grid(clust ~ .) baywheels3 %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) + facet_grid(clust ~ .) baywheels3 %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) + facet_grid(clust ~ .) ``` Duration in Oakland ```{r} oakland %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) oakland %>% ggplot(aes(x=duration_sec, y=..density..)) + scale_x_continuous(limits = c(0, 10000)) + geom_histogram() + geom_density(aes(y=..density..)) oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) + geom_histogram() + geom_density(aes(y=..density..)) ``` ```{r} baywheels3 %>% filter(clust == 1) %>% summarize(dur_mean = mean(duration_sec, na.rm = TRUE), dur_sd = sd(duration_sec, na.rm = TRUE)) baywheels3 %>% filter(clust == 2) %>% summarize(dur_mean = mean(duration_sec, na.rm = TRUE), dur_sd = sd(duration_sec, na.rm = TRUE)) baywheels3 %>% filter(clust == 3) %>% summarize(dur_mean = mean(duration_sec, na.rm = TRUE), dur_sd = sd(duration_sec, na.rm = TRUE)) ``` ```{r} oakland %>% summarize(dur_mean = mean(duration_sec, na.rm = TRUE), dur_sd = sd(duration_sec, na.rm = TRUE)) ``````