---
title: "Ford Go Bike"
output:
  html_notebook: default
  pdf_document: default
---

In this notebook I download and unzip the [Ford Go Bike](https://www.fordgobike.com/) [data](https://www.fordgobike.com/system-data).

```{r}
library(tidyverse)
library(tictoc)
library(ggmap)
library(skimr)
library(lubridate)
library(forcats)
```

Set working directory.

```{r setup}
# setwd() inside a chunk only lasts for that chunk when the document is
# knitted; root.dir sets the working directory for all following chunks.
knitr::opts_knit$set(root.dir = "~/GitHub/Stat6620/fordgobike")
```

The files are stored in a `data` subdirectory of your working directory (it is created below if it does not exist yet). Alternatively, use an RStudio Project and delete the previous code chunk.

Download the files into the data directory. The first one is not zipped; the remaining ones are zipped.

```{r}
# download.file() does not create missing directories, so make sure the
# destination exists first.
dir.create("./data", showWarnings = FALSE)

URL <- "https://s3.amazonaws.com/fordgobike-data/2017-fordgobike-tripdata.csv"
download.file(URL, destfile = "./data/2017-fordgobike-tripdata.csv", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201801-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201801-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201802-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201802-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201803-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201803-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201804-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201804-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201805-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201805-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201806-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201806-fordgobike-tripdata.csv.zip", method = "curl")

URL <- "https://s3.amazonaws.com/fordgobike-data/201807-fordgobike-tripdata.csv.zip"
download.file(URL, destfile = "./data/201807-fordgobike-tripdata.csv.zip", method = "curl")
```

Loop over the one value in the URL and filename that changes.

```{r, eval=FALSE}
# Same downloads as the chunk above, written as a loop (not evaluated).
URL <- "https://s3.amazonaws.com/fordgobike-data/2017-fordgobike-tripdata.csv"
download.file(URL, destfile = "./data/2017-fordgobike-tripdata.csv", method = "curl")

for (i in 1:7) {
  URL <- paste0("https://s3.amazonaws.com/fordgobike-data/20180", i, "-fordgobike-tripdata.csv.zip")
  download.file(URL,
                destfile = paste0("./data/20180", i, "-fordgobike-tripdata.csv.zip"),
                method = "curl")
}
```

Unzip downloaded files.

```{r}
# Extract each monthly 2018 archive next to the zip file.
for (i in 1:7) {
  unzip(paste0("./data/20180", i, "-fordgobike-tripdata.csv.zip"), exdir = "./data")
}
```

Clean up data directory.

```{r}
# Delete each monthly zip archive that is still present; print() keeps the
# TRUE/FALSE result of file.remove() visible in the chunk output.
for (i in 1:7) {
  zip_path <- paste0("./data/20180", i, "-fordgobike-tripdata.csv.zip")
  if (file.exists(zip_path)) print(file.remove(zip_path))
}
```

Read the .csv files

```{r message=FALSE}
# One data frame for all of 2017, then one per month of 2018.
fordgobike2017   <- read_csv("./data/2017-fordgobike-tripdata.csv")

fordgobike201801 <- read_csv("./data/201801-fordgobike-tripdata.csv")
fordgobike201802 <- read_csv("./data/201802-fordgobike-tripdata.csv")
fordgobike201803 <- read_csv("./data/201803-fordgobike-tripdata.csv")
fordgobike201804 <- read_csv("./data/201804-fordgobike-tripdata.csv")
fordgobike201805 <- read_csv("./data/201805-fordgobike-tripdata.csv")
fordgobike201806 <- read_csv("./data/201806-fordgobike-tripdata.csv")
fordgobike201807 <- read_csv("./data/201807-fordgobike-tripdata.csv")
```

Check the head() and tail() of the data.frames that are loaded.

```{r}
head(fordgobike2017)
head(fordgobike201801)
head(fordgobike201802)
head(fordgobike201803)
head(fordgobike201804)
head(fordgobike201805)
head(fordgobike201806)
head(fordgobike201807)
```

```{r}
tail(fordgobike2017)
tail(fordgobike201801)
tail(fordgobike201802)
tail(fordgobike201803)
tail(fordgobike201804)
tail(fordgobike201805)
tail(fordgobike201806)
tail(fordgobike201807)
```

```{r}
dim(fordgobike2017)
fordgobike2017 %>% count()
```

```{r}
fordgobike201801 %>% count()
fordgobike201802 %>% count()
fordgobike201803 %>% count()
fordgobike201804 %>% count()
fordgobike201805 %>% count()
fordgobike201806 %>% count()
fordgobike201807 %>% count()
```

The station id columns have been updated in the later files.

```{r}
glimpse(fordgobike201805)
glimpse(fordgobike201806)
glimpse(fordgobike201807)

# Coerce the station ids of the June and July files to integer before
# stacking the months -- presumably they are read with a different type than
# in the earlier months (compare the glimpse() output above).
fordgobike201806 <- fordgobike201806 %>%
  mutate(
    start_station_id = as.integer(start_station_id),
    end_station_id = as.integer(end_station_id)
  )

fordgobike201807 <- fordgobike201807 %>%
  mutate(
    start_station_id = as.integer(start_station_id),
    end_station_id = as.integer(end_station_id)
  )

# All seven 2018 months in one data frame.
fordgobike2018 <- bind_rows(
  fordgobike201801, fordgobike201802, fordgobike201803, fordgobike201804,
  fordgobike201805, fordgobike201806, fordgobike201807
)
glimpse(fordgobike2018)
```

```{r}
fordgobike2018 %>%
  select(start_station_id, start_station_name,
         start_station_latitude, start_station_longitude) %>%
  arrange(start_station_id) %>%
  distinct() %>%
  head()
```

```{r}
dim(fordgobike2017)
fordgobike2017 %>% count()

# Sanity check: the monthly row counts must add up to the rows of the
# combined 2018 data, so all seven months are summed.
nrow(fordgobike201801) + nrow(fordgobike201802) + nrow(fordgobike201803) +
  nrow(fordgobike201804) + nrow(fordgobike201805) + nrow(fordgobike201806) +
  nrow(fordgobike201807)
dim(fordgobike2018)
fordgobike2018 %>% count()

# Stack 2017 and 2018; the row count is re-checked after every step below.
fordgobike <- bind_rows(fordgobike2017, fordgobike2018)
dim(fordgobike)
fordgobike %>% count()
dim(fordgobike)

# Age is computed relative to 2018 for every trip, including the 2017 ones.
fordgobike <- fordgobike %>% mutate(age = 2018 - member_birth_year)
fordgobike %>% count()
dim(fordgobike)

fordgobike <- fordgobike %>%
  mutate(
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  )
fordgobike %>% count()
dim(fordgobike)

# wday() returns the day of the week as a number 1-7; with week_start = 1,
# 1 is Monday. Map the numbers to labels via levels/labels -- passing the
# labels as `levels` would match none of the numbers and yield only NA.
week_day_labels <- c("M", "T", "W", "TH", "F", "SAT", "SUN")
fordgobike <- fordgobike %>%
  mutate(
    week_day = factor(wday(start_time, week_start = 1),
                      levels = 1:7, labels = week_day_labels)
  )
fordgobike %>% count()
dim(fordgobike)
```

```{r}
today()
now()
```

Age

```{r warning=FALSE}
fordgobike %>% group_by(age) %>% count()
# summary() ignores grouping, so it is applied to the data frame directly.
fordgobike %>% summary()
skim(fordgobike)

fordgobike %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age <= 80) %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age <= 100) %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age > 100) %>% ggplot(aes(x = age)) + geom_histogram()
```

```{r}
fordgobike %>% group_by(member_gender, age) %>% count()

# `fill` splits the bars by gender (`class` is not a ggplot2 aesthetic).
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram()
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram(aes(y = ..density..))

fordgobike %>% filter(age <= 80) %>% ggplot(aes(x = age)) + geom_histogram()
```

```{r}
# One panel per gender: counts first, then densities.
fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age, color = member_gender)) +
  geom_histogram(position = "identity") +
  facet_grid(member_gender ~ .)

fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age, color = member_gender)) +
  geom_histogram(aes(y = ..density..), position = "identity") +
  facet_grid(member_gender ~ .)
```

Year and day of week.

```{r}
fordgobike %>% ggplot(aes(x = year)) + geom_bar()
fordgobike %>% ggplot(aes(x = month)) + geom_bar() + facet_grid(year ~ .)
fordgobike %>% ggplot(aes(x = day)) + geom_bar() + facet_grid(year ~ .)
```

```{r}
# Keep trips whose start station is south of latitude 38 and west of
# longitude -120. Longitudes in this data are negative (Oakland is at
# -122.2711), so the bound has to be negative to have any effect.
fordgobike2018 <- fordgobike2018 %>%
  filter(start_station_latitude < 38 & start_station_longitude < -120)

# Coordinates only: this is the input to the clustering below.
fordgobike_subset <- fordgobike2018 %>%
  select(start_station_longitude, start_station_latitude)

fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
```

```{r}
library(biganalytics)
# Run in parallel. doParallel works on all platforms (the doMC package is
# not available on Windows).
library(doParallel)
registerDoParallel(cores = 8)

head(fordgobike2018)
fordgobike_subset2 <- as.matrix(fordgobike_subset)

# Seed the random number generator (set.seed must be called, not assigned to).
# NOTE(review): whether the seed reaches the parallel workers used for the
# nstart runs is not visible here -- confirm if exact reproducibility matters.
set.seed(123454321)

tic()
cl <- bigkmeans(fordgobike_subset2, 3, nstart = 8)
toc()

head(cl$cluster)
cl$centers

fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude,
             color = cl$cluster)) +
  geom_point()

# Attach the cluster id to every trip (rows are in the same order as
# fordgobike_subset, which was selected from fordgobike2018 above).
fordgobike2018 <- fordgobike2018 %>% mutate(clust = cl$cluster)
```

```{r}
# City of Oakland: c(-122.2711, 37.8044)
# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering
cl$centers

# Return the index of the cluster centre closest (Euclidean distance) to x.
#   x       numeric vector with one value per column of `centers`
#   centers matrix with one centre per row; defaults to the bigkmeans centres
closest.cluster <- function(x, centers = cl$centers) {
  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
  which.min(cluster.dist)[1]
}

oak <- closest.cluster(c(-122.2711, 37.8044))
oak

# Trips that start in the cluster closest to Oakland.
oakland <- fordgobike2018 %>% filter(clust == oak)

oakland %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
```

```{r}
# Same clustering with stats::kmeans() for comparison.
tic()
cl.km <- kmeans(fordgobike_subset, 3)
toc()

cl.km$centers
# City of Oakland: c(-122.2711, 37.8044)

fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude,
             color = cl.km$cluster)) +
  geom_point()
```

```{r}
dim(cl$centers)

# Map centred on the centre of the Oakland cluster.
bayarea <- get_map(location = c(lon = cl$centers[oak, 1], lat = cl$centers[oak, 2]),
                   zoom = 12, maptype = "roadmap")
ggmap(bayarea)

# Version without axes, saved to disk.
ggmap(bayarea) +
  geom_point(data = oakland,
             aes(x = start_station_longitude, y = start_station_latitude),
             size = 0.2, shape = 19) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

ggsave(filename = "oakland.jpg", width = 4, height = 4, units = "cm", plot = last_plot())

ggmap(bayarea) +
  geom_point(data = oakland,
             aes(x = start_station_longitude, y = start_station_latitude),
             size = 1, shape = 19) +
  ggtitle("Oakland Ford Go Bike stations")
```

```{r}
bayarea <- get_map(location = "hayward")
ggmap(bayarea)

# alpha is a fixed setting, so it goes outside aes(); inside aes() it would be
# treated as a mapped variable instead of 10% opacity.
ggmap(bayarea) +
  geom_point(data = fordgobike2018,
             aes(x = start_station_longitude, y = start_station_latitude,
                 color = clust),
             alpha = 0.1, size = 0.2, shape = 19) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "none")

ggsave(filename = "bayarea.jpg", width = 4, height = 4, units = "cm", plot = last_plot())

ggmap(bayarea) +
  geom_point(data = fordgobike2018,
             aes(x = start_station_longitude, y = start_station_latitude,
                 color = clust),
             alpha = 0.1, size = 1, shape = 19) +
  ggtitle("Bay Area Ford Go Bike stations")
```

Gender of users

```{r}
# Bar height is the summed ride duration per gender.
fordgobike2018 %>%
  ggplot(aes(x = member_gender, y = duration_sec)) +
  geom_col() +
  ggtitle("Bay Area")

oakland %>%
  ggplot(aes(x = member_gender, y = duration_sec)) +
  geom_col() +
  ggtitle("Oakland")
```

Duration of rides in the Bay Area

```{r}
# Raw durations (x axis limited to 0-10000 seconds), then log durations.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))

fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))

# The same two plots, one panel per gender.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)

fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
```

Durations of rides in Oakland

```{r}
# Raw durations (x axis limited to 0-10000 seconds), then log durations.
oakland %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))

oakland %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))

# The same two plots, one panel per gender.
oakland %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)

oakland %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
```

Duration by City

```{r}
# One panel per cluster (`clust`); each plot is drawn once.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)

fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
```

Duration in Oakland

```{r}
# Oakland subset, panel labelled with its cluster id; each plot is drawn once.
oakland %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)

oakland %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
```

```{r}
# Mean and standard deviation of ride duration for every cluster/gender
# combination, in one table instead of one filter per cluster id.
fordgobike2018 %>%
  group_by(clust, member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
```

```{r}
# The same summary for the Oakland subset only.
oakland %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
```