---
title: "Ford Go Bike"
output:
  html_notebook: default
  pdf_document: default
---

In this notebook I download and unzip the
[Ford Go Bike](https://www.fordgobike.com/)
[data](https://www.fordgobike.com/system-data).

```{r}
library(tidyverse)
library(tictoc)
library(ggmap)
library(skimr)
library(lubridate)
library(forcats)
```

The files are stored in a `data` subdirectory of your working directory; the
chunk below creates it if it does not exist yet. Working inside an RStudio
Project is the easiest way to keep the working directory stable.

Download the files into the data directory. The first one is not zipped, the
remaining ones are zipped.

```{r}
data_dir <- "./data"
# download.file() does not create missing directories, so make sure the
# destination exists before the first download.
dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)

base_url <- "https://s3.amazonaws.com"

# Monthly archives: January 2018 to April 2019 are published under the
# "fordgobike" name, May to July 2019 under the "baywheels" name.
fordgobike_months <- c(
  paste0("2018", sprintf("%02d", 1:12)),
  paste0("2019", sprintf("%02d", 1:4))
)
baywheels_months <- paste0("2019", sprintf("%02d", 5:7))

# Paths relative to base_url; the 2017 file is a plain (unzipped) csv.
remote_files <- c(
  "fordgobike-data/2017-fordgobike-tripdata.csv",
  paste0("fordgobike-data/", fordgobike_months, "-fordgobike-tripdata.csv.zip"),
  paste0("baywheels-data/", baywheels_months, "-baywheels-tripdata.csv.zip")
)

for (remote_file in remote_files) {
  url <- paste(base_url, remote_file, sep = "/")
  status <- download.file(
    url,
    destfile = file.path(data_dir, basename(remote_file)),
    method = "curl"
  )
  # download.file() reports failure through a non-zero return value.
  if (status != 0) {
    stop("Download failed: ", url, call. = FALSE)
  }
}
```

Unzip downloaded files.

```{r}
local_files <- file.path(data_dir, basename(remote_files))
zip_files <- local_files[endsWith(local_files, ".zip")]

for (zip_file in zip_files) {
  unzip(zip_file, exdir = data_dir)
}
```

Clean up the data directory by removing the zip archives.
```{r}
fordgobike_months <- c(
  paste0("2018", sprintf("%02d", 1:12)),
  paste0("2019", sprintf("%02d", 1:4))
)
baywheels_months <- paste0("2019", sprintf("%02d", 5:7))

zip_files <- c(
  paste0("./data/", fordgobike_months, "-fordgobike-tripdata.csv.zip"),
  paste0("./data/", baywheels_months, "-baywheels-tripdata.csv.zip")
)

# file.remove() is vectorised; only archives that are still present are
# passed to it, so re-running the chunk is harmless.
file.remove(zip_files[file.exists(zip_files)])
```

Rename the BayWheels files to fordgobike

```{r}
baywheels_csv <- paste0("./data/", baywheels_months, "-baywheels-tripdata.csv")
fordgobike_csv <- paste0("./data/", baywheels_months, "-fordgobike-tripdata.csv")

# Skip files that were already renamed by a previous run; file.rename()
# would otherwise warn and return FALSE for them.
to_rename <- file.exists(baywheels_csv)
file.rename(baywheels_csv[to_rename], fordgobike_csv[to_rename])
```

Read the .csv files

```{r message=FALSE}
# Read the trip file for one period ("2017" or "YYYYMM") from ./data.
read_trips <- function(period) {
  read_csv(file = paste0("./data/", period, "-fordgobike-tripdata.csv"))
}

fordgobike2017 <- read_trips("2017")

fordgobike201801 <- read_trips("201801")
fordgobike201802 <- read_trips("201802")
fordgobike201803 <- read_trips("201803")
fordgobike201804 <- read_trips("201804")
fordgobike201805 <- read_trips("201805")
fordgobike201806 <- read_trips("201806")
fordgobike201807 <- read_trips("201807")
fordgobike201808 <- read_trips("201808")
fordgobike201809 <- read_trips("201809")
fordgobike201810 <- read_trips("201810")
fordgobike201811 <- read_trips("201811")
fordgobike201812 <- read_trips("201812")

fordgobike201901 <- read_trips("201901")
fordgobike201902 <- read_trips("201902")
fordgobike201903 <- read_trips("201903")
fordgobike201904 <- read_trips("201904")
fordgobike201905 <- read_trips("201905")
fordgobike201906 <- read_trips("201906")
fordgobike201907 <- read_trips("201907")
```

Check the `head()` and `tail()` of the data.frames that are loaded.
```{r}
# Collect the loaded tables in one named list so that every check below is
# written once instead of once per file.
periods <- c(
  "2017",
  paste0("2018", sprintf("%02d", 1:12)),
  paste0("2019", sprintf("%02d", 1:7))
)
trip_tables <- mget(paste0("fordgobike", periods))

lapply(trip_tables, head)
```

```{r}
lapply(trip_tables, tail)
```

```{r}
# Number of rides (rows) in every loaded table.
ride_counts <- tibble(
  table = names(trip_tables),
  n = unname(map_int(trip_tables, nrow))
)
```

Year 2017

```{r}
dim(fordgobike2017)
ride_counts %>%
  filter(startsWith(table, "fordgobike2017"))
```

Year 2018

```{r}
ride_counts %>%
  filter(startsWith(table, "fordgobike2018"))
```

Year 2019

```{r}
ride_counts %>%
  filter(startsWith(table, "fordgobike2019"))
```

The `start_station_id` and `end_station_id` columns of several monthly files
are converted to integer before the months are combined.
```{r}
glimpse(fordgobike201804)
glimpse(fordgobike201805)
glimpse(fordgobike201806)
glimpse(fordgobike201807)

# Coerce both station id columns of a trip table to integer.
# NOTE(review): presumably needed so that bind_rows() sees compatible column
# types across months -- confirm against the glimpse() output above.
station_ids_as_integer <- function(trips) {
  trips %>%
    mutate(
      start_station_id = as.integer(start_station_id),
      end_station_id = as.integer(end_station_id)
    )
}

# Before/after check on one representative month; the remaining months are
# converted in exactly the same way.
fordgobike201806 %>% select(start_station_id)
fordgobike201806 <- station_ids_as_integer(fordgobike201806)
fordgobike201806 %>% select(start_station_id)

fordgobike201807 <- station_ids_as_integer(fordgobike201807)
fordgobike201807 %>%
  select(start_time) %>%
  mutate(
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  )

fordgobike201808 <- station_ids_as_integer(fordgobike201808)
fordgobike201809 <- station_ids_as_integer(fordgobike201809)
fordgobike201810 <- station_ids_as_integer(fordgobike201810)
fordgobike201811 <- station_ids_as_integer(fordgobike201811)
fordgobike201812 <- station_ids_as_integer(fordgobike201812)

fordgobike2018 <- bind_rows(
  fordgobike201801, fordgobike201802, fordgobike201803, fordgobike201804,
  fordgobike201805, fordgobike201806, fordgobike201807, fordgobike201808,
  fordgobike201809, fordgobike201810, fordgobike201811, fordgobike201812
)
glimpse(fordgobike2018)

# month() returns a number, so compare against a number rather than "7".
fordgobike2018 %>%
  mutate(
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  ) %>%
  select(month) %>%
  filter(month == 7)
```

```{r}
glimpse(fordgobike201902)
glimpse(fordgobike201903)
glimpse(fordgobike201906)

fordgobike201902 %>% select(start_station_id)
fordgobike201902 <- station_ids_as_integer(fordgobike201902)
fordgobike201902 %>% select(start_station_id)

fordgobike201903 <- station_ids_as_integer(fordgobike201903)

fordgobike2019 <- bind_rows(
  fordgobike201901, fordgobike201902, fordgobike201903, fordgobike201904,
  fordgobike201905, fordgobike201906, fordgobike201907
)
glimpse(fordgobike2019)
```

```{r}
# First few distinct start stations of a trip table, ordered by station id.
first_stations <- function(trips) {
  trips %>%
    select(
      start_station_id, start_station_name,
      start_station_latitude, start_station_longitude
    ) %>%
    arrange(start_station_id) %>%
    distinct() %>%
    head()
}

first_stations(fordgobike2017)
first_stations(fordgobike2018)
first_stations(fordgobike2019)
```

```{r}
dim(fordgobike2017)
dim(fordgobike2018)
dim(fordgobike2019)

fordgobike <- bind_rows(fordgobike2017, fordgobike2018, fordgobike2019)
dim(fordgobike)

fordgobike %>% select(start_time)

# Derived columns: age is measured relative to 2019, the last year in the
# data; the date parts and the weekday label come from the ride start time.
fordgobike <- fordgobike %>%
  mutate(
    age = 2019 - member_birth_year,
    year = year(start_time),
    month = month(start_time),
    day = day(start_time),
    week_day = wday(start_time, label = TRUE, abbr = TRUE)
  )

# Adding columns must not change the number of rows.
dim(fordgobike)
```

```{r}
today()
now()
```

Age

```{r warning=FALSE}
fordgobike %>% count(age)

fordgobike %>% summary()

skim(fordgobike)

# Histogram of rider age for whatever subset of trips is passed in.
age_histogram <- function(trips) {
  ggplot(trips, aes(x = age)) +
    geom_histogram()
}

age_histogram(fordgobike)

fordgobike %>%
  filter(age <= 80) %>%
  age_histogram()

fordgobike %>%
  filter(age <= 100) %>%
  age_histogram()

fordgobike %>%
  filter(age > 100) %>%
  age_histogram()
```

```{r}
fordgobike %>% count(member_gender, age)

# `class` is not a ggplot2 aesthetic and was silently ignored; `fill` is what
# actually splits the bars by gender.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram()

fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram(aes(y = ..density..))

fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age)) +
  geom_histogram()
```

```{r}
# Age histogram (riders aged 80 or less), one facet row per gender.
# `as_density = TRUE` plots densities instead of counts.
age_by_gender_histogram <- function(trips, as_density = FALSE) {
  histogram <- if (as_density) {
    geom_histogram(aes(y = ..density..), position = "identity")
  } else {
    geom_histogram(position = "identity")
  }

  trips %>%
    filter(age <= 80) %>%
    ggplot(aes(x = age, color = member_gender)) +
    histogram +
    facet_grid(member_gender ~ .)
}

# Turn the raw gender column into a clean factor: "?" becomes a real missing
# value and the short codes M/F/O are folded into Male/Female/Other. Working
# on the character values avoids errors or warnings when "?" or a short code
# does not occur in the data, and avoids creating a literal "NA" level.
normalise_gender <- function(gender) {
  gender <- as.character(gender)
  gender[gender %in% "?"] <- NA_character_

  short_codes <- c(M = "Male", F = "Female", O = "Other")
  is_short <- gender %in% names(short_codes)
  gender[is_short] <- unname(short_codes[gender[is_short]])

  factor(gender)
}

age_by_gender_histogram(fordgobike)
age_by_gender_histogram(fordgobike, as_density = TRUE)

fordgobike %>% select(member_gender)

# Raw values before cleaning.
fct_count(factor(fordgobike$member_gender))

fordgobike <- fordgobike %>%
  mutate(member_gender = normalise_gender(member_gender))

# Values after cleaning.
fct_count(fordgobike$member_gender)

age_by_gender_histogram(fordgobike)
age_by_gender_histogram(fordgobike, as_density = TRUE)
```

Year and day of week.

```{r}
fordgobike %>% select(year, month, day)

fordgobike <- fordgobike %>%
  mutate(
    year = as.integer(year),
    month = as.integer(month)
  )

fordgobike %>% select(year, month, day)

# year and month are integers now, so compare against integers.
fordgobike %>%
  select(year, month, day) %>%
  filter(year == 2018L, month == 7L)

fordgobike %>%
  ggplot(aes(x = year)) +
  geom_bar()

fordgobike %>%
  ggplot(aes(x = month)) +
  geom_bar() +
  facet_grid(year ~ .)

# NOTE(review): `day` is the day of the month; the `week_day` column is the
# one to plot if a day-of-week breakdown is wanted here.
fordgobike %>%
  ggplot(aes(x = day)) +
  geom_bar() +
  facet_grid(year ~ .)
```

```{r}
# Keep rides whose start station lies inside a rough Bay Area bounding box.
# Bay Area longitudes are around -122, so the western bound must be negative:
# the previous `< 120` was true for every station and filtered nothing.
restrict_to_bay_area <- function(trips, min_latitude = -Inf) {
  trips %>%
    filter(
      start_station_latitude > min_latitude,
      start_station_latitude < 38,
      start_station_longitude < -120
    )
}

# Start-station coordinates plus rider gender.
start_coordinates <- function(trips) {
  trips %>%
    select(start_station_longitude, start_station_latitude, member_gender)
}

# Scatter plot of start-station positions.
plot_start_stations <- function(coordinates) {
  ggplot(
    coordinates,
    aes(x = start_station_longitude, y = start_station_latitude)
  ) +
    geom_point()
}

fordgobike_restricted <- restrict_to_bay_area(fordgobike2017)
fordgobike_subset_2017 <- start_coordinates(fordgobike_restricted)
plot_start_stations(fordgobike_subset_2017)

fordgobike_restricted <- restrict_to_bay_area(fordgobike2018)
fordgobike_subset_2018 <- start_coordinates(fordgobike_restricted)
plot_start_stations(fordgobike_subset_2018)

# 2019 additionally needs a lower latitude bound.
fordgobike_restricted <- restrict_to_bay_area(fordgobike2019, min_latitude = 37)
fordgobike_subset_2019 <- start_coordinates(fordgobike_restricted)
plot_start_stations(fordgobike_subset_2019)
```

```{r}
library(biganalytics)
# Run in parallel. doParallel is used rather than doMC, which is not
# supported on Windows.
library(doParallel)
registerDoParallel(cores = 8)

head(fordgobike2018)

# Cluster on the two coordinate columns only. They are selected by name so
# that member_gender (and `clust`, when this chunk is re-run) are left out.
fordgobike_subset2 <- fordgobike_subset_2018 %>%
  select(start_station_longitude, start_station_latitude) %>%
  as.matrix()

# Seed the random number generator. The previous `set.seed <- 123454321`
# only created a variable called `set.seed` and seeded nothing.
# NOTE(review): with a parallel backend the random starts may still differ
# between runs -- confirm if exact reproducibility matters.
set.seed(123454321)

tic()
cl <- bigkmeans(fordgobike_subset2, 3, nstart = 8)
toc()

head(cl$cluster)
cl$centers

fordgobike_subset_2018 <- fordgobike_subset_2018 %>%
  mutate(clust = cl$cluster)

# Cluster ids are labels, not quantities: map them to a discrete colour scale.
fordgobike_subset_2018 %>%
  ggplot(aes(
    x = start_station_longitude,
    y = start_station_latitude,
    color = factor(clust)
  )) +
  geom_point()
```

```{r}
# City of Oakland c(-122.2711, 37.8044) )
# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering
cl$centers

# Index of the cluster centre closest (Euclidean distance) to point `x`.
#
# x        numeric vector with one value per column of `centers`
#          (here: longitude, latitude).
# centers  matrix with one cluster centre per row; defaults to the centres
#          of the `cl` fit above, so existing calls keep working.
closest.cluster <- function(x, centers = cl$centers) {
  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
  which.min(cluster.dist)[1]
}

oak <- closest.cluster(c(-122.2711, 37.8044))
oak

oakland <- fordgobike_subset_2018 %>%
  filter(clust == oak)

oakland %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
```

```{r}
# `fordgobike_subset_2018[-3]` would also feed the `clust` labels added above
# into kmeans(); cluster on the coordinates only.
coordinates_2018 <- fordgobike_subset_2018 %>%
  select(start_station_longitude, start_station_latitude)

set.seed(123454321)

tic()
cl.km <- kmeans(coordinates_2018, 3)
toc()

cl.km$centers
# City of Oakland c(-122.2711, 37.8044) )

fordgobike_subset_2018 %>%
  ggplot(aes(
    x = start_station_longitude,
    y = start_station_latitude,
    color = factor(cl.km$cluster)
  )) +
  geom_point()
```

Gender of users

```{r}
# Bar height is the total ride duration per gender. Summing first draws one
# bar per gender instead of stacking one rectangle per ride.
fordgobike %>%
  group_by(member_gender) %>%
  summarise(duration_sec = sum(duration_sec, na.rm = TRUE)) %>%
  ggplot(aes(x = member_gender, y = duration_sec)) +
  geom_col() +
  ggtitle("Bay Area")
```

Duration of rides in the Bay Area

```{r}
# Histogram with a density curve of ride duration, either on the raw scale
# (x axis limited to 0-10000 seconds) or on the log scale.
duration_density <- function(trips, log_scale = FALSE) {
  base_plot <- if (log_scale) {
    ggplot(trips, aes(x = log(duration_sec), y = ..density..))
  } else {
    ggplot(trips, aes(x = duration_sec, y = ..density..)) +
      scale_x_continuous(limits = c(0, 10000))
  }

  base_plot +
    geom_histogram() +
    geom_density()
}

duration_density(fordgobike)
duration_density(fordgobike, log_scale = TRUE)

duration_density(fordgobike) +
  facet_grid(member_gender ~ .)

duration_density(fordgobike, log_scale = TRUE) +
  facet_grid(member_gender ~ .)
```

Duration by City

```{r}
# NOTE(review): this chunk used to draw the same two plots twice, and nothing
# in it varies by city. A by-city version needs a city (or cluster) column on
# fordgobike2018 to facet on -- TODO confirm the intended grouping.

# Ride duration in 2018, raw scale (x axis limited to 0-10000 seconds).
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density()

# Ride duration in 2018, log scale.
fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density()
```