---
title: "Ford Go Bike"
output:
html_notebook: default
pdf_document: default
---
In this notebook I download and unzip the [Ford Go Bike](https://www.baywheels.com/) [data](https://www.baywheels.com/system-data).
```{r}
library(tidyverse)
library(tictoc)
library(skimr)
library(lubridate)
library(forcats)
```
Create a directory data in your directory, as a subdirectory, within your working directory. Of use a Project and delete the previous code chunk. Download the files into the data directory. First one is not zipped, the remaining are zipped.
```{r}
URL <- "https://s3.amazonaws.com/baywheels-data/201905-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201905-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201906-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201906-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201907-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201907-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201908-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201908-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201909-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201909-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201910-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201910-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201911-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201911-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/201912-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/201912-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202001-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202001-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202002-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202002-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202003-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202003-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202004-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202004-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202005-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202005-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202006-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202006-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202007-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202007-baywheels-tripdata.csv.zip", method="curl")
URL <- "https://s3.amazonaws.com/baywheels-data/202008-baywheels-tripdata.csv.zip"
download.file(URL, destfile = "./data/202008-baywheels-tripdata.csv.zip", method="curl")
```
Loop over the one value in the url and filename that changes.
```{r, eval=FALSE}
for (i in 5:9) {
URL <- paste0("https://s3.amazonaws.com/baywheels-data/20190",i,"-baywheels-tripdata.csv.zip")
download.file(URL, destfile = paste0("./data/20190",i,"-baywheels-tripdata.csv.zip"), method="curl")
}
for (i in 10:12) {
URL <- paste0("https://s3.amazonaws.com/baywheels-data/2019",i,"-baywheels-tripdata.csv.zip")
download.file(URL, destfile = paste0("./data/2019",i,"-baywheels-tripdata.csv.zip"), method="curl")
}
for (i in 1:8) {
URL <- paste0("https://s3.amazonaws.com/baywheels-data/20200",i,"-baywheels-tripdata.csv.zip")
download.file(URL, destfile = paste0("./data/20200",i,"-baywheels-tripdata.csv.zip"), method="curl")
}
```
Unzip downloaded files.
```{r}
unzip("./data/201905-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201906-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201907-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201908-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201909-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201910-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201911-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/201912-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202001-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202002-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202003-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202004-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202005-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202006-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202007-baywheels-tripdata.csv.zip",exdir="./data")
unzip("./data/202008-baywheels-tripdata.csv.zip",exdir="./data")
```
Clean up data directory.
```{r}
fn <- "./data/201905-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201906-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201907-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201908-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201909-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201910-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201911-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/201912-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202001-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202002-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202003-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202004-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202005-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202006-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202007-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
fn <- "./data/202008-baywheels-tripdata.csv.zip"
if (file.exists(fn)) file.remove(fn)
```
Read the.csv files
```{r message=FALSE}
baywheels201905 <- read_csv(file="./data/201905-baywheels-tripdata.csv")
baywheels201906 <- read_csv(file="./data/201906-baywheels-tripdata.csv")
baywheels201907 <- read_csv(file="./data/201907-baywheels-tripdata.csv")
baywheels201908 <- read_csv(file="./data/201908-baywheels-tripdata.csv")
baywheels201909 <- read_csv(file="./data/201909-baywheels-tripdata.csv")
baywheels201910 <- read_csv(file="./data/201910-baywheels-tripdata.csv")
baywheels201911 <- read_csv(file="./data/201911-baywheels-tripdata.csv")
baywheels201912 <- read_csv(file="./data/201912-baywheels-tripdata.csv")
baywheels202001 <- read_csv(file="./data/202001-baywheels-tripdata.csv")
baywheels202002 <- read_csv(file="./data/202002-baywheels-tripdata.csv")
baywheels202003 <- read_csv(file="./data/202003-baywheels-tripdata.csv")
baywheels202004 <- read_csv(file="./data/202004-baywheels-tripdata.csv")
baywheels202005 <- read_csv(file="./data/202005-baywheels-tripdata.csv")
baywheels202006 <- read_csv(file="./data/202006-baywheels-tripdata.csv")
baywheels202007 <- read_csv(file="./data/202007-baywheels-tripdata.csv")
baywheels202008 <- read_csv(file="./data/202008-baywheels-tripdata.csv")
```
Check the head() and tail() of the data.frames that are loaded.
```{r}
head(baywheels201905)
head(baywheels201906)
head(baywheels201907)
head(baywheels201908)
head(baywheels201909)
head(baywheels201910)
head(baywheels201911)
head(baywheels201912)
head(baywheels202001)
head(baywheels202002)
head(baywheels202003)
head(baywheels202004)
head(baywheels202005)
head(baywheels202006)
head(baywheels202007)
head(baywheels202008)
```
```{r}
tail(baywheels201905)
tail(baywheels201906)
tail(baywheels201907)
tail(baywheels201908)
tail(baywheels201909)
tail(baywheels201910)
tail(baywheels201911)
tail(baywheels201912)
tail(baywheels202001)
tail(baywheels202002)
tail(baywheels202003)
tail(baywheels202004)
tail(baywheels202005)
tail(baywheels202006)
tail(baywheels202007)
tail(baywheels202008)
```
```{r}
x <- as.numeric(c(
baywheels201905 %>% count(),
baywheels201906 %>% count(),
baywheels201907 %>% count(),
baywheels201908 %>% count(),
baywheels201909 %>% count(),
baywheels201910 %>% count(),
baywheels201911 %>% count(),
baywheels201912 %>% count(),
baywheels202001 %>% count(),
baywheels202002 %>% count(),
baywheels202003 %>% count(),
baywheels202004 %>% count(),
baywheels202005 %>% count(),
baywheels202006 %>% count(),
baywheels202007 %>% count(),
baywheels202008 %>% count()
))
sum(x)
```
The end_station_id have been updated.
```{r}
glimpse(baywheels201905)
glimpse(baywheels201906)
glimpse(baywheels201907)
baywheels2019 <- bind_rows(baywheels201905,
baywheels201906,
baywheels201907,
baywheels201908,
baywheels201909,
baywheels201910,
baywheels201911,
baywheels201912)
dim(baywheels2019)
baywheels2020 <- bind_rows(baywheels202001,
baywheels202002,
baywheels202003,
baywheels202004,
baywheels202005,
baywheels202006,
baywheels202007,
baywheels202008)
dim(baywheels2020)
```
```{r}
baywheels2019 %>% select(start_station_id,start_station_name, start_station_latitude,start_station_longitude) %>%
arrange(start_station_id) %>%
distinct() %>%
head()
```
```{r}
dim(baywheels2019)
baywheels2019 %>% count()
dim(baywheels2020)
baywheels2020 %>% count()
baywheels <- bind_rows(baywheels2019, baywheels2020)
baywheels %>% count()
dim(baywheels)
baywheels <- baywheels %>% mutate(year=year(start_time), month=month(start_time), day=day(start_time) )
baywheels %>% count()
dim(baywheels)
baywheels <- baywheels %>% mutate(week_day = wday(start_time) )
levels <- c("M","T","W","TH","F","SAT","SUN")
baywheels <- baywheels %>% mutate( week_day = factor(week_day, levels = levels) )
baywheels %>% count()
dim(baywheels)
```
```{r}
today()
now()
```
Year and day of week.
```{r}
baywheels %>% ggplot(aes(x=year)) +
geom_bar()
baywheels %>% ggplot(aes(x=month)) +
geom_bar() +
facet_grid(year ~ .)
baywheels %>% ggplot(aes(x=day)) +
geom_bar() +
facet_grid(year ~ .)
```
```{r}
baywheels <- baywheels %>% filter(start_station_latitude < 38 & start_station_longitude < 120 )
baywheels_subset <- baywheels %>% select(start_station_longitude,start_station_latitude)
baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) +
geom_point()
```
### There seem to be too many columns. We should check to make sure all of the data files have the same columns.
Check the head() and tail() of the data.frames that are loaded.
```{r}
head(baywheels201905)
head(baywheels201906)
head(baywheels201907)
head(baywheels201908)
head(baywheels201909)
head(baywheels201910)
head(baywheels201911)
head(baywheels201912)
head(baywheels202001)
head(baywheels202002)
head(baywheels202003)
head(baywheels202004)
head(baywheels202005)
head(baywheels202006)
head(baywheels202007)
head(baywheels202008)
```
```{r}
tail(baywheels201905)
tail(baywheels201906)
tail(baywheels201907)
tail(baywheels201908)
tail(baywheels201909)
tail(baywheels201910)
tail(baywheels201911)
tail(baywheels201912)
tail(baywheels202001)
tail(baywheels202002)
tail(baywheels202003)
tail(baywheels202004)
tail(baywheels202005)
tail(baywheels202006)
tail(baywheels202007)
tail(baywheels202008)
```
Make a list of the variable names. Do the data files contain the same columns? If not, what are the difference?
```{r}
bwn <- list(
baywheels201905 %>% names(),
baywheels201906 %>% names(),
baywheels201907 %>% names(),
baywheels201908 %>% names(),
baywheels201909 %>% names(),
baywheels201910 %>% names(),
baywheels201911 %>% names(),
baywheels201912 %>% names(),
baywheels202001 %>% names(),
baywheels202002 %>% names(),
baywheels202003 %>% names(),
baywheels202004 %>% names(),
baywheels202005 %>% names(),
baywheels202006 %>% names(),
baywheels202007 %>% names(),
baywheels202008 %>% names()
)
bwn
```
```{r}
x <- as.numeric(c(
baywheels201905 %>% count(),
baywheels201906 %>% count(),
baywheels201907 %>% count(),
baywheels201908 %>% count(),
baywheels201909 %>% count(),
baywheels201910 %>% count(),
baywheels201911 %>% count(),
baywheels201912 %>% count(),
baywheels202001 %>% count(),
baywheels202002 %>% count(),
baywheels202003 %>% count(),
baywheels202004 %>% count(),
baywheels202005 %>% count(),
baywheels202006 %>% count(),
baywheels202007 %>% count(),
baywheels202008 %>% count()
))
sum(x)
```
There was a change in the variables in April 2020.
```{r}
glimpse(baywheels201905)
dim(baywheels201905)
glimpse(baywheels201906)
dim(baywheels201906)
glimpse(baywheels201907)
dim(baywheels201907)
baywheels2019 <- bind_rows(baywheels201905,
baywheels201906)
dim(baywheels2019)
baywheels2019 <- bind_rows(baywheels201905,
baywheels201906,
baywheels201907,
baywheels201908,
baywheels201909,
baywheels201910,
baywheels201911,
baywheels201912)
dim(baywheels2019)
baywheels2020 <- bind_rows(baywheels202001,
baywheels202002,
baywheels202003)
dim(baywheels2020)
baywheels <- bind_rows(baywheels2019, baywheels2020)
dim(baywheels)
baywheels2020a <- bind_rows(
baywheels202004,
baywheels202005,
baywheels202006,
baywheels202007,
baywheels202008)
dim(baywheels2020a)
```
```{r}
bwna <- list(
baywheels %>% names(),
baywheels2020a %>% names()
)
bwna
identical(bwna[[1]], bwna[[2]])
intersect(bwna[[1]], bwna[[2]])
setdiff(bwna[[1]], bwna[[2]])
setdiff(bwna[[2]], bwna[[1]])
```
```{r}
baywheels2020b <- baywheels2020a %>% rename(
"start_time" = "started_at",
"end_time" = "ended_at",
"start_station_latitude" = "start_lat",
"start_station_longitude" = "start_lng",
"end_station_latitude" = "end_lat",
"end_station_longitude" = "end_lng"
)
baywheels2020b %>% names()
```
```{r}
baywheels %>% select(user_type) %>%
table()
baywheels2020b %>% select(member_casual) %>%
table()
```
```{r}
baywheels2020b <- baywheels2020b %>% rename(
"user_type" = "member_casual"
)
baywheels2020b %>% names()
baywheels2020c <- baywheels2020b %>%
mutate( user_type = as_factor(user_type) )
baywheels2020c %>% select(user_type) %>%
table()
baywheels2020d <- baywheels2020c %>% mutate(user_type = fct_recode( user_type,
Customer = "casual",
Subscriber = "member"
)
)
baywheels2020d %>% select(user_type) %>%
table()
baywheels2020e <- baywheels2020d %>%
mutate(user_type = as.character(user_type))
```
```{r}
setdiff(names(baywheels), names(baywheels2020e))
setdiff(names(baywheels2020e), names(baywheels))
baywheels2020f <- baywheels2020e %>% mutate(duration_sec =
as.numeric(difftime(end_time, start_time, units = "secs")))
baywheels <- bind_rows(baywheels, baywheels2020f)
dim(baywheels)
baywheels %>% names()
```
```{r}
baywheels2 <- baywheels %>% mutate(year=year(start_time),
month=month(start_time),
day=day(start_time) )
baywheels %>% count()
dim(baywheels2)
baywheels3 <- baywheels2 %>% mutate(week_day = wday(start_time) )
levels <- c("M","T","W","TH","F","SAT","SUN")
baywheels3 <- baywheels3 %>% mutate( week_day = factor(week_day, levels = levels) )
baywheels3 %>% count()
dim(baywheels3)
```
```{r}
baywheels3 %>% select(start_station_id,
start_station_name,
start_station_latitude,
start_station_longitude) %>%
arrange(start_station_id) %>%
distinct() %>%
head()
```
```{r}
today()
now()
```
Year and day of week.
```{r}
baywheels3 %>% ggplot(aes(x=year)) + geom_bar()
baywheels3 %>% ggplot(aes(x=month)) +
geom_bar() +
facet_grid(year ~ .)
baywheels3 %>% ggplot(aes(x=day)) +
geom_bar() +
facet_grid(year ~ .)
```
```{r}
baywheels3 <- baywheels3 %>% filter(start_station_latitude < 38 & start_station_longitude < 120 )
baywheels_subset <- baywheels3 %>% select(start_station_longitude,start_station_latitude)
baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) +
geom_point()
```
```{r}
library(biganalytics)
# run in parallel, the doMC package runs on Windows
library(doParallel)
registerDoParallel(cores = 8)
head(baywheels3)
baywheels_subset2 <- as.matrix(baywheels_subset)
set.seed <- 123454321
tic()
cl <- bigkmeans(baywheels_subset2, 3, nstart=8)
toc()
head(cl$cluster)
cl$centers
baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude, color=cl$cluster)) +
geom_point()
baywheels3 <- baywheels3 %>% mutate(clust = cl$cluster)
```
```{r}
# City of Oakland c(-122.2711, 37.8044) )
# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering
cl$centers
closest.cluster <- function(x) {
cluster.dist <- apply(cl$centers, 1, function(y) sqrt(sum((x-y)^2)))
return(which.min(cluster.dist)[1])
}
oak <- closest.cluster(c(-122.2711, 37.8044))
oak
oakland <- baywheels3 %>% filter(clust == oak)
oakland %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude)) +
geom_point()
```
```{r}
tic()
cl.km <- kmeans(baywheels_subset, 3)
toc()
cl.km$centers
# City of Oakland c(-122.2711, 37.8044) )
baywheels_subset %>% ggplot(aes(x=start_station_longitude, y=start_station_latitude, color=cl.km$cluster)) +
geom_point()
```
Duration of rides in the Bay Area
```{r}
baywheels %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..))
baywheels %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..))
```
Durations of rides in Oakland
```{r}
oakland %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..))
oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..))
```
Duration by City
```{r}
baywheels3 %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..)) +
facet_grid(clust ~ .)
baywheels3 %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..)) +
facet_grid(clust ~ .)
baywheels3 %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..)) +
facet_grid(clust ~ .)
baywheels3 %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..)) +
facet_grid(clust ~ .)
```
Duration in Oakland
```{r}
oakland %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..))
oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..))
oakland %>% ggplot(aes(x=duration_sec, y=..density..)) +
scale_x_continuous(limits = c(0, 10000)) +
geom_histogram() +
geom_density(aes(y=..density..))
oakland %>% ggplot(aes(log(x=duration_sec), y=..density..)) +
geom_histogram() +
geom_density(aes(y=..density..))
```
```{r}
baywheels3 %>% filter(clust == 1) %>%
summarize(dur_mean = mean(duration_sec, na.rm = TRUE),
dur_sd = sd(duration_sec, na.rm = TRUE))
baywheels3 %>% filter(clust == 2) %>%
summarize(dur_mean = mean(duration_sec, na.rm = TRUE),
dur_sd = sd(duration_sec, na.rm = TRUE))
baywheels3 %>% filter(clust == 3) %>%
summarize(dur_mean = mean(duration_sec, na.rm = TRUE),
dur_sd = sd(duration_sec, na.rm = TRUE))
```
```{r}
oakland %>%
summarize(dur_mean = mean(duration_sec, na.rm = TRUE),
dur_sd = sd(duration_sec, na.rm = TRUE))
``````