In this notebook I download and unzip the Ford Go Bike data.
library(tidyverse)
library(tictoc)
library(ggmap)
library(skimr)
library(lubridate)
library(forcats)
Set working directory.
# NOTE(review): hard-coded, machine-specific path. Every "./data/..." path used
# below is resolved relative to this working directory.
setwd("~/GitHub/Stat6620/fordgobike")
Create a directory named data as a subdirectory of your working directory. Or use a Project and delete the previous code chunk. Download the files into the data directory. The first file is not zipped; the remaining files are zipped.
# 2017 trip data: published as a plain (unzipped) CSV.
URL <- "https://s3.amazonaws.com/fordgobike-data/2017-fordgobike-tripdata.csv"
download.file(
  url = URL,
  destfile = "./data/2017-fordgobike-tripdata.csv",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 112M 0 69269 0 0 69269 0 0:28:22 --:--:-- 0:28:22 82072
0 112M 0 271k 0 0 271k 0 0:07:04 0:00:01 0:07:03 152k
0 112M 0 560k 0 0 280k 0 0:06:50 0:00:02 0:06:48 204k
0 112M 0 883k 0 0 294k 0 0:06:31 0:00:03 0:06:28 236k
1 112M 1 1206k 0 0 301k 0 0:06:21 0:00:04 0:06:17 254k
1 112M 1 1563k 0 0 312k 0 0:06:08 0:00:05 0:06:03 305k
1 112M 1 2022k 0 0 337k 0 0:05:41 0:00:06 0:05:35 353k
2 112M 2 2685k 0 0 383k 0 0:05:00 0:00:07 0:04:53 422k
3 112M 3 3739k 0 0 467k 0 0:04:06 0:00:08 0:03:58 567k
4 112M 4 5286k 0 0 587k 0 0:03:16 0:00:09 0:03:07 816k
6 112M 6 7751k 0 0 775k 0 0:02:28 0:00:10 0:02:18 1237k
9 112M 9 10.7M 0 0 1001k 0 0:01:55 0:00:11 0:01:44 1798k
13 112M 13 15.1M 0 0 1290k 0 0:01:29 0:00:12 0:01:17 2576k
18 112M 18 21.2M 0 0 1676k 0 0:01:08 0:00:13 0:00:55 3633k
25 112M 25 28.9M 0 0 2114k 0 0:00:54 0:00:14 0:00:40 4863k
34 112M 34 38.4M 0 0 2622k 0 0:00:43 0:00:15 0:00:28 6316k
43 112M 43 49.3M 0 0 3160k 0 0:00:36 0:00:16 0:00:20 7909k
55 112M 55 62.0M 0 0 3739k 0 0:00:30 0:00:17 0:00:13 9617k
67 112M 67 76.1M 0 0 4332k 0 0:00:26 0:00:18 0:00:08 10.9M
81 112M 81 91.8M 0 0 4948k 0 0:00:23 0:00:19 0:00:04 12.5M
96 112M 96 108M 0 0 5565k 0 0:00:20 0:00:20 --:--:-- 14.0M
100 112M 100 112M 0 0 5759k 0 0:00:20 0:00:20 --:--:-- 15.0M
# January 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201801-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201801-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 3251k 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
17 3251k 17 560k 0 0 560k 0 0:00:05 0:00:01 0:00:04 421k
98 3251k 98 3195k 0 0 1597k 0 0:00:02 0:00:02 --:--:-- 1372k
100 3251k 100 3251k 0 0 1625k 0 0:00:02 0:00:02 --:--:-- 1396k
# February 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201802-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201802-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
1 3698k 1 69266 0 0 69266 0 0:00:54 --:--:-- 0:00:54 98529
36 3698k 36 1342k 0 0 1342k 0 0:00:02 0:00:01 0:00:01 803k
100 3698k 100 3698k 0 0 1849k 0 0:00:02 0:00:02 --:--:-- 1578k
# March 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201803-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201803-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
7 3901k 7 288k 0 0 288k 0 0:00:13 0:00:01 0:00:12 260k
59 3901k 59 2328k 0 0 1164k 0 0:00:03 0:00:02 0:00:01 1103k
100 3901k 100 3901k 0 0 1950k 0 0:00:02 0:00:02 --:--:-- 1590k
# April 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201804-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201804-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 4613k 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
12 4613k 12 560k 0 0 560k 0 0:00:08 0:00:01 0:00:07 403k
54 4613k 54 2532k 0 0 1266k 0 0:00:03 0:00:02 0:00:01 1052k
100 4613k 100 4613k 0 0 1537k 0 0:00:03 0:00:03 --:--:-- 1514k
# May 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201805-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201805-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
4 6269k 4 305k 0 0 305k 0 0:00:20 0:00:01 0:00:19 260k
28 6269k 28 1801k 0 0 900k 0 0:00:06 0:00:02 0:00:04 823k
67 6269k 67 4232k 0 0 1410k 0 0:00:04 0:00:03 0:00:01 1334k
100 6269k 100 6269k 0 0 2089k 0 0:00:03 0:00:03 --:--:-- 1665k
# June 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201806-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201806-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
7 6901k 7 509k 0 0 509k 0 0:00:13 0:00:01 0:00:12 418k
41 6901k 41 2855k 0 0 1427k 0 0:00:04 0:00:02 0:00:02 1278k
100 6901k 100 6901k 0 0 2300k 0 0:00:03 0:00:03 --:--:-- 2265k
# July 2018 trip data (zipped CSV).
URL <- "https://s3.amazonaws.com/fordgobike-data/201807-fordgobike-tripdata.csv.zip"
download.file(
  url = URL,
  destfile = "./data/201807-fordgobike-tripdata.csv.zip",
  method = "curl"
)
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
3 7057k 3 271k 0 0 271k 0 0:00:25 0:00:01 0:00:24 267k
26 7057k 26 1885k 0 0 942k 0 0:00:07 0:00:02 0:00:05 935k
65 7057k 65 4623k 0 0 1541k 0 0:00:04 0:00:03 0:00:01 1533k
100 7057k 100 7057k 0 0 2352k 0 0:00:03 0:00:03 --:--:-- 1963k
Loop over the one value in the url and filename that changes.
# Download every trip file into ./data: the single 2017 CSV plus the zipped
# monthly files for January-July 2018.
# The target directory is created when it is missing, and a non-zero status
# from download.file() stops the run instead of being silently ignored.
data_dir <- "./data"
if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}

base_url <- "https://s3.amazonaws.com/fordgobike-data/"
file_names <- c(
  "2017-fordgobike-tripdata.csv",
  sprintf("2018%02d-fordgobike-tripdata.csv.zip", 1:7)
)

for (file_name in file_names) {
  URL <- paste0(base_url, file_name)
  status <- download.file(
    URL,
    destfile = file.path(data_dir, file_name),
    method = "curl"
  )
  if (status != 0) {
    stop("Download failed for ", URL, " (status ", status, ")", call. = FALSE)
  }
}
Unzip downloaded files.
# Extract each monthly 2018 archive (January-July) into ./data.
invisible(lapply(
  sprintf("./data/2018%02d-fordgobike-tripdata.csv.zip", 1:7),
  unzip,
  exdir = "./data"
))
Clean up data directory.
# Delete the January 2018 archive if it is still present.
fn <- "./data/201801-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the February 2018 archive if it is still present.
fn <- "./data/201802-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the March 2018 archive if it is still present.
fn <- "./data/201803-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the April 2018 archive if it is still present.
fn <- "./data/201804-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the May 2018 archive if it is still present.
fn <- "./data/201805-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the June 2018 archive if it is still present.
fn <- "./data/201806-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
# Delete the July 2018 archive if it is still present.
fn <- "./data/201807-fordgobike-tripdata.csv.zip"
if (file.exists(fn)) {
  file.remove(fn)
}
[1] TRUE
Read the .csv files.
# Load the 2017 file and each 2018 monthly file into its own data frame.
fordgobike2017 <- read_csv("./data/2017-fordgobike-tripdata.csv")
fordgobike201801 <- read_csv("./data/201801-fordgobike-tripdata.csv")
fordgobike201802 <- read_csv("./data/201802-fordgobike-tripdata.csv")
fordgobike201803 <- read_csv("./data/201803-fordgobike-tripdata.csv")
fordgobike201804 <- read_csv("./data/201804-fordgobike-tripdata.csv")
fordgobike201805 <- read_csv("./data/201805-fordgobike-tripdata.csv")
fordgobike201806 <- read_csv("./data/201806-fordgobike-tripdata.csv")
fordgobike201807 <- read_csv("./data/201807-fordgobike-tripdata.csv")
Check the head() and tail() of the data.frames that are loaded.
# First rows of every loaded table.
fordgobike2017 %>% head()
fordgobike201801 %>% head()
fordgobike201802 %>% head()
fordgobike201803 %>% head()
fordgobike201804 %>% head()
fordgobike201805 %>% head()
fordgobike201806 %>% head()
fordgobike201807 %>% head()
# Last rows of every loaded table.
fordgobike2017 %>% tail()
fordgobike201801 %>% tail()
fordgobike201802 %>% tail()
fordgobike201803 %>% tail()
fordgobike201804 %>% tail()
fordgobike201805 %>% tail()
fordgobike201806 %>% tail()
fordgobike201807 %>% tail()
# Rows and columns of the 2017 table.
fordgobike2017 %>% dim()
[1] 519700 15
# Row count of each loaded table.
count(fordgobike2017)
count(fordgobike201801)
count(fordgobike201802)
count(fordgobike201803)
count(fordgobike201804)
count(fordgobike201805)
count(fordgobike201806)
count(fordgobike201807)
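The station id columns (start_station_id and end_station_id) change type between files: they are read as integers in the May 2018 file but as character strings in the June and July 2018 files (the July file contains the literal string "NULL"). They are therefore converted back to integers before the monthly files are combined.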
# Column types for May 2018; the station id columns are parsed as <int> here.
glimpse(fordgobike201805)
Observations: 179,125
Variables: 16
$ duration_sec <int> 56791, 52797, 43204, 67102, 58883, 22858, 2863, 3189, 3149, 3136, 3166, 2619, 8847, 2658, 1790, 142...
$ start_time <dttm> 2018-05-31 21:41:51, 2018-05-31 18:39:53, 2018-05-31 21:09:48, 2018-05-31 14:09:54, 2018-05-31 16:...
$ end_time <dttm> 2018-06-01 13:28:22, 2018-06-01 09:19:51, 2018-06-01 09:09:52, 2018-06-01 08:48:17, 2018-06-01 08:...
$ start_station_id <int> 44, 186, 17, 106, 16, 163, 197, 61, 61, 61, 61, 61, 211, 66, 19, 10, 175, 258, 108, 60, 14, 107, 31...
$ start_station_name <chr> "Civic Center/UN Plaza BART Station (Market St at McAllister St)", "Lakeside Dr at 14th St", "Embar...
$ start_station_latitude <dbl> 37.78107, 37.80132, 37.79225, 37.76324, 37.79413, 37.79732, 37.80885, 37.77651, 37.77651, 37.77651,...
$ start_station_longitude <dbl> -122.4117, -122.2626, -122.3971, -122.4307, -122.3944, -122.2653, -122.2497, -122.4113, -122.4113, ...
$ end_station_id <int> 78, 338, 93, 47, 30, 212, 197, 8, 8, 8, 8, 8, 7, 323, 34, 6, 190, 268, 58, 60, 36, 119, 278, 186, 2...
$ end_station_name <chr> "Folsom St at 9th St", "13th St at Franklin St", "4th St at Mission Bay Blvd S", "4th St at Harriso...
$ end_station_latitude <dbl> 37.77372, 37.80319, 37.77041, 37.78095, 37.77660, 37.82493, 37.80885, 37.79995, 37.79995, 37.79995,...
$ end_station_longitude <dbl> -122.4116, -122.2706, -122.3912, -122.3997, -122.3953, -122.2605, -122.2497, -122.3985, -122.3985, ...
$ bike_id <int> 1230, 3414, 2677, 4224, 3392, 1235, 152, 1109, 2143, 3374, 3493, 2190, 2927, 3789, 2019, 2070, 1351...
$ user_type <chr> "Customer", "Subscriber", "Customer", "Subscriber", "Subscriber", "Customer", "Subscriber", "Custom...
$ member_birth_year <int> NA, 1983, NA, 1979, 1986, 1992, 1985, NA, NA, NA, NA, NA, NA, 1985, 1991, NA, 1966, 1993, 1992, 199...
$ member_gender <chr> NA, "Male", NA, "Male", "Male", "Male", "Male", NA, NA, NA, NA, NA, NA, "Male", "Male", NA, "Male",...
$ bike_share_for_all_trip <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", "No", "...
# Column types for June 2018; the station id columns are parsed as <chr> here.
glimpse(fordgobike201806)
Observations: 195,968
Variables: 16
$ duration_sec <int> 59088, 60358, 63654, 50508, 51697, 36708, 46380, 7224, 4294, 2209, 8975, 7256, 1043, 922, 1241, 174...
$ start_time <dttm> 2018-06-30 23:32:44, 2018-06-30 21:48:19, 2018-06-30 20:26:53, 2018-06-30 20:29:59, 2018-06-30 18:...
$ end_time <dttm> 2018-07-01 15:57:33, 2018-07-01 14:34:18, 2018-07-01 14:07:47, 2018-07-01 10:31:48, 2018-07-01 08:...
$ start_station_id <chr> "76", "248", "23", "58", "196", "8", "237", "284", "240", "133", "75", "6", "282", "166", "193", "1...
$ start_station_name <chr> "McCoppin St at Valencia St", "Telegraph Ave at Ashby Ave", "The Embarcadero at Steuart St", "Marke...
$ start_station_latitude <dbl> 37.77166, 37.85596, 37.79146, 37.77662, 37.80889, 37.79995, 37.77523, 37.78487, 37.86604, 37.75521,...
$ start_station_longitude <dbl> -122.4224, -122.2598, -122.3910, -122.4174, -122.2565, -122.3985, -122.2245, -122.4009, -122.2588, ...
$ end_station_id <chr> "95", "239", "50", "88", "272", "4", "237", "284", "240", "55", "75", "15", "313", "240", "186", "7...
$ end_station_name <chr> "Sanchez St at 15th St", "Bancroft Way at Telegraph Ave", "2nd St at Townsend St", "11th St at Brya...
$ end_station_latitude <dbl> 37.76622, 37.86881, 37.78053, 37.77003, 37.85058, 37.78588, 37.77523, 37.78487, 37.86604, 37.77705,...
$ end_station_longitude <dbl> -122.4311, -122.2588, -122.3903, -122.4117, -122.2782, -122.4089, -122.2245, -122.4009, -122.2588, ...
$ bike_id <int> 2100, 653, 3235, 3675, 3232, 577, 1764, 779, 2491, 4225, 3972, 3369, 1882, 1622, 2323, 4009, 3953, ...
$ user_type <chr> "Subscriber", "Customer", "Subscriber", "Subscriber", "Customer", "Customer", "Customer", "Subscrib...
$ member_birth_year <int> 1975, NA, 1962, 1992, 1989, NA, NA, 1989, 1996, 1963, 1981, 1990, NA, 1992, 1990, 1986, 1986, 1985,...
$ member_gender <chr> "Male", NA, "Female", "Male", "Female", NA, NA, "Male", "Female", "Male", "Male", "Male", NA, "Male...
$ bike_share_for_all_trip <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes", "No", "No", "No", "No", "No", "No", ...
# Column types for July 2018; the station id columns are <chr> and contain the
# literal string "NULL".
glimpse(fordgobike201807)
Observations: 199,222
Variables: 16
$ duration_sec <int> 59989, 60232, 43864, 51522, 83380, 49546, 42799, 50603, 54830, 22051, 30404, 1397, 1238, 903, 5166,...
$ start_time <dttm> 2018-07-31 18:20:32, 2018-07-31 17:24:26, 2018-07-31 21:03:26, 2018-07-31 18:54:23, 2018-07-31 09:...
$ end_time <dttm> 2018-08-01 11:00:22, 2018-08-01 10:08:18, 2018-08-01 09:14:30, 2018-08-01 09:13:06, 2018-08-01 08:...
$ start_station_id <chr> "197", "77", "NULL", "114", "213", "139", "337", "19", "247", "13", "20", "NULL", "74", "180", "186...
$ start_station_name <chr> "El Embarcadero at Grand Ave", "11th St at Natoma St", "NULL", "Rhode Island St at 17th St", "32nd ...
$ start_station_latitude <dbl> 37.80885, 37.77351, 37.41000, 37.76448, 37.82385, 37.75102, 37.80697, 37.78898, 37.86779, 37.79423,...
$ start_station_longitude <dbl> -122.2497, -122.4160, -121.9400, -122.4026, -122.2812, -122.4119, -122.2666, -122.4035, -122.2659, ...
$ end_station_id <chr> "181", "356", "NULL", "345", "198", "356", "196", "16", "266", "16", "43", "NULL", "144", "213", "1...
$ end_station_name <chr> "Grand Ave at Webster St", "Valencia St at Clinton Park", "NULL", "Hubbell St at 16th St", "Snow Pa...
$ end_station_latitude <dbl> 37.81138, 37.76919, 37.41000, 37.76647, 37.80781, 37.76919, 37.80889, 37.79413, 37.86246, 37.79413,...
$ end_station_longitude <dbl> -122.2652, -122.4223, -121.9400, -122.3983, -122.2645, -122.4223, -122.2565, -122.3944, -122.2648, ...
$ bike_id <int> 1953, 3010, 4273, 1043, 1336, 697, 605, 800, 2432, 3839, 3492, 4128, 2407, 3489, 3543, 3314, 3364, ...
$ user_type <chr> "Customer", "Subscriber", "Subscriber", "Subscriber", "Subscriber", "Customer", "Subscriber", "Subs...
$ member_birth_year <int> 1995, 1994, 1998, 1990, 1982, 1991, 1976, 1972, 1997, 1978, 1999, 1982, 1975, 1981, 1996, 1983, 199...
$ member_gender <chr> "Male", "Female", "Male", "Female", "Male", "Female", "Female", "Male", "Male", "Male", "Male", "Fe...
$ bike_share_for_all_trip <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "Yes", "No", "No", "No", "N...
# The June 2018 station ids were read as character, unlike the earlier months.
# Turn the "NULL" placeholder into NA explicitly (presumably the same
# placeholder that is visible in the July file - verify), then convert to
# integer so this month can be row-bound with the others. Any other
# non-numeric value still becomes NA with a coercion warning.
fordgobike201806 <- fordgobike201806 %>%
  mutate(
    start_station_id = as.integer(na_if(start_station_id, "NULL")),
    end_station_id = as.integer(na_if(end_station_id, "NULL"))
  )
NAs introduced by coercionNAs introduced by coercion
# The July 2018 station ids were read as character and use the literal string
# "NULL" for missing stations. Turn that placeholder into NA explicitly, then
# convert to integer so this month can be row-bound with the others. Any other
# non-numeric value still becomes NA with a coercion warning.
fordgobike201807 <- fordgobike201807 %>%
  mutate(
    start_station_id = as.integer(na_if(start_station_id, "NULL")),
    end_station_id = as.integer(na_if(end_station_id, "NULL"))
  )
NAs introduced by coercionNAs introduced by coercion
# Stack the seven monthly 2018 tables (January-July) into a single table.
fordgobike2018 <- list(
  fordgobike201801, fordgobike201802, fordgobike201803, fordgobike201804,
  fordgobike201805, fordgobike201806, fordgobike201807
) %>%
  bind_rows()
fordgobike2018 %>% glimpse()
Observations: 1,018,386
Variables: 16
$ duration_sec <int> 75284, 85422, 71576, 61076, 39966, 6477, 453, 180, 996, 825, 1316, 432, 601, 887, 210, 188, 808, 37...
$ start_time <dttm> 2018-01-31 22:52:35, 2018-01-31 16:13:34, 2018-01-31 14:23:55, 2018-01-31 14:53:23, 2018-01-31 19:...
$ end_time <dttm> 2018-02-01 19:47:19, 2018-02-01 15:57:17, 2018-02-01 10:16:52, 2018-02-01 07:51:20, 2018-02-01 06:...
$ start_station_id <int> 120, 15, 304, 75, 74, 236, 110, 81, 134, 305, 98, 89, 223, 308, 7, 98, 67, 80, 247, 312, 241, 239, ...
$ start_station_name <chr> "Mission Dolores Park", "San Francisco Ferry Building (Harry Bridges Plaza)", "Jackson St at 5th St...
$ start_station_latitude <dbl> 37.76142, 37.79539, 37.34876, 37.77379, 37.77643, 37.80369, 37.76371, 37.77588, 37.75243, 37.34273,...
$ start_station_longitude <dbl> -122.4264, -122.3942, -121.8948, -122.4212, -122.4262, -122.2825, -122.4152, -122.3932, -122.4206, ...
$ end_station_id <int> 285, 15, 296, 47, 19, 160, 134, 93, 4, 317, 4, 43, 86, 297, 186, 76, 98, 78, 274, 317, 157, 244, 18...
$ end_station_name <chr> "Webster St at O'Farrell St", "San Francisco Ferry Building (Harry Bridges Plaza)", "5th St at Virg...
$ end_station_latitude <dbl> 37.78352, 37.79539, 37.32600, 37.78095, 37.78898, 37.80532, 37.75243, 37.77041, 37.78588, 37.33396,...
$ end_station_longitude <dbl> -122.4312, -122.3942, -121.8771, -122.3997, -122.4035, -122.2948, -122.4206, -122.3912, -122.4089, ...
$ bike_id <int> 2765, 2815, 3039, 321, 617, 1306, 3571, 1403, 3675, 1453, 1278, 2928, 3016, 55, 2602, 2556, 3041, 5...
$ user_type <chr> "Subscriber", "Customer", "Customer", "Customer", "Subscriber", "Customer", "Subscriber", "Subscrib...
$ member_birth_year <int> 1986, NA, 1996, NA, 1991, NA, 1988, 1980, 1987, 1994, NA, 1993, 1957, 1976, 1976, 1964, 1976, 1995,...
$ member_gender <chr> "Male", NA, "Male", NA, "Male", NA, "Male", "Male", "Male", "Female", NA, "Male", "Male", "Female",...
$ bike_share_for_all_trip <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", "Yes", "No", "No", "No", "Yes", "No", "No", ...
# Peek at the distinct start stations (id, name, coordinates), ordered by id.
fordgobike2018 %>%
  select(
    start_station_id, start_station_name,
    start_station_latitude, start_station_longitude
  ) %>%
  arrange(start_station_id) %>%
  distinct() %>%
  head()
fordgobike2017 %>% dim()
[1] 519700 15
count(fordgobike2017)
# Combined row count of the January-April 2018 tables.
sum(
  nrow(fordgobike201801),
  nrow(fordgobike201802),
  nrow(fordgobike201803),
  nrow(fordgobike201804)
)
[1] 444071
# Rows and columns of the combined 2018 table.
dim(fordgobike2018)
[1] 1018386 16
count(fordgobike2018)
# Append the 2018 trips to the 2017 trips to form the full data set.
fordgobike <- list(fordgobike2017, fordgobike2018) %>%
  bind_rows()
fordgobike %>% dim()
[1] 1538086 16
# Size of the combined table, as a row count and as rows x columns.
count(fordgobike)
fordgobike %>% dim()
[1] 1538086 16
# Rider age measured against the fixed year 2018 (NA when birth year is NA).
# NOTE(review): trips taken in 2017 also use 2018 as the reference year -
# confirm this is intended.
fordgobike <- mutate(fordgobike, age = 2018 - member_birth_year)
count(fordgobike)
fordgobike %>% dim()
[1] 1538086 17
# Split the trip start time into year, month and day-of-month columns.
fordgobike <- fordgobike %>%
  mutate(
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  )
count(fordgobike)
fordgobike %>% dim()
[1] 1538086 20
# Day of the week of the trip start, stored as a labelled factor.
# wday() returns integers, so they must be mapped to the labels through
# `labels =`. The original code passed the labels as `levels =`, which matched
# none of the integer values and turned the whole column into NA.
# week_start = 1 makes 1 = Monday ... 7 = Sunday, matching the label order.
fordgobike <- fordgobike %>% mutate(week_day = wday(start_time, week_start = 1))
levels <- c("M", "T", "W", "TH", "F", "SAT", "SUN")
fordgobike$week_day <- factor(fordgobike$week_day, levels = 1:7, labels = levels)
fordgobike %>% count()
dim(fordgobike)
[1] 1538086 21
# Record the date on which the notebook was run.
today()
[1] "2018-09-14"
# Record the date-time on which the notebook was run.
now()
[1] "2018-09-14 20:18:18 PDT"
Age
# Number of trips for each rider age.
fordgobike %>%
  group_by(age) %>%
  count()
# Column-by-column summary of the table.
fordgobike %>%
  group_by(age) %>%
  summary()
duration_sec start_time end_time start_station_id start_station_name
Min. : 61.0 Min. :2017-06-28 09:47:36 Min. :2017-06-28 09:52:55 Min. : 3.0 Length:1538086
1st Qu.: 361.0 1st Qu.:2017-11-14 10:08:31 1st Qu.:2017-11-14 10:21:12 1st Qu.: 28.0 Class :character
Median : 569.0 Median :2018-03-15 07:10:23 Median :2018-03-15 07:24:04 Median : 79.0 Mode :character
Mean : 957.4 Mean :2018-02-22 12:28:46 Mean :2018-02-22 12:44:43 Mean :107.7
3rd Qu.: 897.0 3rd Qu.:2018-06-02 17:56:46 3rd Qu.:2018-06-02 18:19:06 3rd Qu.:173.0
Max. :86369.0 Max. :2018-07-31 23:57:19 Max. :2018-08-01 11:00:22 Max. :357.0
NA's :5245
start_station_latitude start_station_longitude end_station_id end_station_name end_station_latitude end_station_longitude
Min. :37.31 Min. :-122.44 Min. : 3.0 Length:1538086 Min. :37.28 Min. :-122.44
1st Qu.:37.77 1st Qu.:-122.41 1st Qu.: 27.0 Class :character 1st Qu.:37.77 1st Qu.:-122.41
Median :37.78 Median :-122.40 Median : 77.0 Mode :character Median :37.78 Median :-122.40
Mean :37.77 Mean :-122.36 Mean :105.6 Mean :37.77 Mean :-122.35
3rd Qu.:37.80 3rd Qu.:-122.39 3rd Qu.:171.0 3rd Qu.:37.80 3rd Qu.:-122.39
Max. :45.51 Max. : -73.57 Max. :357.0 Max. :45.51 Max. : -73.57
NA's :5245
bike_id user_type member_birth_year member_gender bike_share_for_all_trip age year
Min. : 10 Length:1538086 Min. :1881 Length:1538086 Length:1538086 Min. : 18.0 Min. :2017
1st Qu.:1045 Class :character 1st Qu.:1976 Class :character Class :character 1st Qu.: 29.0 1st Qu.:2017
Median :2072 Mode :character Median :1984 Mode :character Mode :character Median : 34.0 Median :2018
Mean :2021 Mean :1982 Mean : 36.2 Mean :2018
3rd Qu.:2952 3rd Qu.:1989 3rd Qu.: 42.0 3rd Qu.:2018
Max. :4307 Max. :2000 Max. :137.0 Max. :2018
NA's :137667 NA's :137667
month day week_day
Min. : 1.0 Min. : 1.00 M : 0
1st Qu.: 4.0 1st Qu.: 8.00 T : 0
Median : 6.0 Median :16.00 W : 0
Mean : 6.3 Mean :15.98 TH : 0
3rd Qu.: 9.0 3rd Qu.:24.00 F : 0
Max. :12.0 Max. :31.00 (Other): 0
NA's :1538086
# Per-column summary statistics (missing counts, quantiles, histograms) by type.
skim(fordgobike)
Skim summary statistics
n obs: 1538086
n variables: 21
Variable type: character
variable missing complete n min max empty n_unique
bike_share_for_all_trip 519700 1018386 1538086 2 3 0 2
end_station_name 0 1538086 1538086 4 63 0 316
member_gender 137326 1400760 1538086 4 6 0 3
start_station_name 0 1538086 1538086 4 63 0 316
user_type 0 1538086 1538086 8 10 0 2
Variable type: factor
variable missing complete n n_unique top_counts ordered
week_day 1538086 0 1538086 0 NA: 1538086, M: 0, T: 0, W: 0 FALSE
Variable type: integer
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
bike_id 0 1538086 1538086 2020.6 1152.29 10 1045 2072 2952 4307 ▇▆▆▇▇▇▅▃
day 0 1538086 1538086 15.98 8.78 1 8 16 24 31 ▇▇▇▇▆▇▇▇
duration_sec 0 1538086 1538086 957.38 2891.83 61 361 569 897 86369 ▇▁▁▁▁▁▁▁
end_station_id 5245 1532841 1538086 105.63 92.6 3 27 77 171 357 ▇▅▃▂▂▁▁▁
member_birth_year 137667 1400419 1538086 1981.8 10.56 1881 1976 1984 1989 2000 ▁▁▁▁▁▂▇▇
start_station_id 5245 1532841 1538086 107.7 92.97 3 28 79 173 357 ▇▅▃▂▂▁▁▁
Variable type: numeric
variable missing complete n mean sd p0 p25 p50 p75 p100 hist
age 137667 1400419 1538086 36.2 10.56 18 29 34 42 137 ▇▇▂▁▁▁▁▁
end_station_latitude 0 1538086 1538086 37.77 0.098 37.28 37.77 37.78 37.8 45.51 ▇▁▁▁▁▁▁▁
end_station_longitude 0 1538086 1538086 -122.35 0.15 -122.44 -122.41 -122.4 -122.39 -73.57 ▇▁▁▁▁▁▁▁
month 0 1538086 1538086 6.3 3.06 1 4 6 9 12 ▅▃▇▅▆▅▃▅
start_station_latitude 0 1538086 1538086 37.77 0.098 37.31 37.77 37.78 37.8 45.51 ▇▁▁▁▁▁▁▁
start_station_longitude 0 1538086 1538086 -122.36 0.15 -122.44 -122.41 -122.4 -122.39 -73.57 ▇▁▁▁▁▁▁▁
year 0 1538086 1538086 2017.66 0.47 2017 2017 2018 2018 2018 ▅▁▁▁▁▁▁▇
Variable type: POSIXct
variable missing complete n min max median n_unique
end_time 0 1538086 1538086 2017-06-28 2018-08-01 2018-03-15 1538010
start_time 0 1538086 1538086 2017-06-28 2018-07-31 2018-03-15 1538011
# Age distribution: all riders, then restricted to plausible ages, then the
# implausible tail (age > 100).
fordgobike %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age <= 80) %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age <= 100) %>% ggplot(aes(x = age)) + geom_histogram()
fordgobike %>% filter(age > 100) %>% ggplot(aes(x = age)) + geom_histogram()
# Trip counts by gender and age.
fordgobike %>% group_by(member_gender, age) %>% count()
# Age histograms split by gender. `class` is not a ggplot2 aesthetic, so the
# original mapping had no visible effect; `fill` colours the bars by gender.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram()
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram(aes(y = ..density..))
fordgobike %>% filter(age <= 80) %>% ggplot(aes(x = age)) + geom_histogram()
# One panel per gender: counts first, then densities.
fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age, color = member_gender)) +
  geom_histogram(position = "identity") +
  facet_grid(member_gender ~ .)
fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age, color = member_gender)) +
  geom_histogram(aes(y = ..density..), position = "identity") +
  facet_grid(member_gender ~ .)
Year and day of week.
# Trip counts per year, then per month and per day-of-month within each year.
ggplot(fordgobike, aes(x = year)) +
  geom_bar()
ggplot(fordgobike, aes(x = month)) +
  geom_bar() +
  facet_grid(year ~ .)
ggplot(fordgobike, aes(x = day)) +
  geom_bar() +
  facet_grid(year ~ .)
# Drop trips whose start station lies outside the Bay Area bounding box.
# Longitudes here are negative (about -122), so the bound must be -120; the
# original `< 120` was true for every row and filtered nothing.
fordgobike2018 <- fordgobike2018 %>%
  filter(start_station_latitude < 38 & start_station_longitude < -120)
# Keep only the start coordinates (longitude, latitude) for clustering.
fordgobike_subset <- fordgobike2018 %>%
  select(start_station_longitude, start_station_latitude)
fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
library(biganalytics)
Loading required package: bigmemory
Loading required package: foreach
Attaching package: 'foreach'
The following objects are masked from 'package:purrr':
accumulate, when
Loading required package: biglm
Loading required package: DBI
# Run in parallel using doParallel.
# NOTE(review): the original note said doMC runs on Windows; doMC is fork-based
# and generally unavailable on Windows, which is presumably why doParallel is
# loaded here - confirm.
library(doParallel)
Loading required package: iterators
Loading required package: parallel
# Register 8 cores as the parallel backend.
registerDoParallel(cores = 8)
head(fordgobike2018)
# Convert the (longitude, latitude) columns to a plain matrix for bigkmeans().
fordgobike_subset2 <- as.matrix(fordgobike_subset)
# Seed the RNG before clustering. The original `set.seed <- 123454321` only
# created a variable named set.seed and never seeded anything.
# NOTE(review): with a parallel backend the random starts may run on workers
# with their own RNG streams - confirm the fit is actually reproducible.
set.seed(123454321)
# Time a 3-cluster k-means fit with 8 random starts.
tic()
cl <- bigkmeans(fordgobike_subset2, 3, nstart = 8)
toc()
3.31 sec elapsed
# Cluster ids assigned to the first few trips.
head(cl$cluster)
[1] 1 1 3 1 1 2
# Fitted cluster centres: one row per cluster, columns are longitude, latitude.
cl$centers
[,1] [,2]
[1,] -122.4072 37.77809
[2,] -122.2660 37.83117
[3,] -121.8953 37.34168
# Scatter of start stations coloured by their bigkmeans cluster id.
ggplot(
  fordgobike_subset,
  aes(x = start_station_longitude, y = start_station_latitude, color = cl$cluster)
) +
  geom_point()
# Store each trip's cluster id alongside the trip data.
fordgobike2018 <- mutate(fordgobike2018, clust = cl$cluster)
# City of Oakland c(-122.2711, 37.8044) )
# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering
cl$centers
[,1] [,2]
[1,] -122.4072 37.77809
[2,] -122.2660 37.83117
[3,] -121.8953 37.34168
# Return the index of the cluster centre nearest to a point.
#
# x:       numeric vector of coordinates, in the same column order as
#          `centers`.
# centers: matrix with one centre per row. Defaults to the global bigkmeans
#          fit `cl$centers`, so existing one-argument calls behave as before.
#
# Returns the row index of the centre with the smallest Euclidean distance to
# x (the first one on ties). Stops if x does not have one value per column of
# `centers`, instead of silently recycling.
closest.cluster <- function(x, centers = cl$centers) {
  stopifnot(is.numeric(x), length(x) == ncol(centers))
  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
  which.min(cluster.dist)[1]
}
# Index of the cluster whose centre is nearest to the Oakland coordinates
# (longitude, latitude).
oak <- closest.cluster(x = c(-122.2711, 37.8044))
print(oak)
[1] 2
# Trips whose start station falls in the Oakland cluster.
oakland <- filter(fordgobike2018, clust == oak)
ggplot(oakland, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()
# Time the same 3-cluster fit with base kmeans() for comparison.
tic()
cl.km <- kmeans(fordgobike_subset, centers = 3)
toc()
0.42 sec elapsed
# Cluster centres from the base kmeans() fit.
cl.km$centers
start_station_longitude start_station_latitude
1 -121.8953 37.34168
2 -122.2660 37.83117
3 -122.4072 37.77809
# City of Oakland c(-122.2711, 37.8044) )
# Scatter of start stations coloured by their base kmeans() cluster id.
ggplot(
  fordgobike_subset,
  aes(x = start_station_longitude, y = start_station_latitude, color = cl.km$cluster)
) +
  geom_point()
NA
# Shape of the centre matrix (clusters x coordinates).
dim(cl$centers)
[1] 3 2
# Fetch a road map centred on the Oakland cluster centre (row `oak` of
# cl$centers: column 1 is longitude, column 2 is latitude).
bayarea <- get_map(location = c(lon=cl$centers[oak,1], lat=cl$centers[oak,2]), zoom = 12, maptype = "roadmap")
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.831171,-122.26603&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&sensor=false
# Bare base map.
ggmap(bayarea)
# Oakland start stations on the map with all axis decoration removed; this
# version is the one saved to oakland.jpg.
ggmap(bayarea) +
  geom_point(
    data = oakland,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 0.2,
    shape = 19
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
ggsave(
  filename = "oakland.jpg",
  plot = last_plot(),
  width = 4,
  height = 4,
  units = "cm"
)
# Same stations with larger points and a title, for on-screen viewing.
ggmap(bayarea) +
  geom_point(
    data = oakland,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1,
    shape = 19
  ) +
  ggtitle("Oakland Ford Go Bike stations")
# Replace the base map with one located by the place name "hayward".
bayarea <- get_map(location = "hayward")
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=hayward&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=hayward&sensor=false
# Bare base map.
ggmap(bayarea)
# All 2018 start stations coloured by cluster, axis decoration and legend
# removed; this version is the one saved to bayarea.jpg.
# alpha is a constant, so it is set outside aes(). Inside aes() it was mapped
# as data (adding an "alpha" legend) instead of making the points 10% opaque.
ggmap(bayarea) +
  geom_point(
    data = fordgobike2018,
    aes(x = start_station_longitude, y = start_station_latitude, color = clust),
    alpha = 0.1, size = 0.2, shape = 19
  ) +
  theme(
    axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(),
    axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(),
    legend.position = "none"
  )
ggsave(filename = "bayarea.jpg", width = 4, height = 4, units = "cm", plot = last_plot())
# Same stations with larger points and a title, for on-screen viewing.
ggmap(bayarea) +
  geom_point(
    data = fordgobike2018,
    aes(x = start_station_longitude, y = start_station_latitude, color = clust),
    alpha = 0.1, size = 1, shape = 19
  ) +
  ggtitle("Bay Area Ford Go Bike stations")
Gender of users
# Stacked bars of trip duration (seconds) per gender: all 2018 trips, then
# the Oakland subset.
ggplot(fordgobike2018, aes(x = member_gender, y = duration_sec)) +
  geom_col() +
  ggtitle("Bay Area")
ggplot(oakland, aes(x = member_gender, y = duration_sec)) +
  geom_col() +
  ggtitle("Oakland")
Duration of rides in the Bay Area
# Density histogram of 2018 trip durations, x axis limited to 0-10000 seconds.
ggplot(fordgobike2018, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000))
# The same distribution on a log scale.
ggplot(fordgobike2018, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))
# Both views again, one panel per gender.
ggplot(fordgobike2018, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(member_gender ~ .)
ggplot(fordgobike2018, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
Durations of rides in Oakland
# Density histogram of Oakland trip durations, x axis limited to 0-10000 seconds.
ggplot(oakland, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000))
# The same distribution on a log scale.
ggplot(oakland, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))
# Both views again, one panel per gender.
ggplot(oakland, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(member_gender ~ .)
ggplot(oakland, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
Duration by City
# Trip duration per cluster (one panel per `clust`): raw seconds limited to
# 0-10000, then log scale. The pair of plots is drawn twice, as in the
# original notebook.
ggplot(fordgobike2018, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(clust ~ .)
ggplot(fordgobike2018, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
ggplot(fordgobike2018, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(clust ~ .)
ggplot(fordgobike2018, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
Duration in Oakland
# Oakland trip duration faceted by `clust`: raw seconds limited to 0-10000,
# then log scale. The pair of plots is drawn twice, as in the original
# notebook.
ggplot(oakland, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(clust ~ .)
ggplot(oakland, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
ggplot(oakland, aes(x = duration_sec, y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  facet_grid(clust ~ .)
ggplot(oakland, aes(log(x = duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(clust ~ .)
# Mean and standard deviation of trip duration by gender, for each of the
# three clusters in turn and then for the Oakland subset.
fordgobike2018 %>%
  filter(clust == 1) %>%
  group_by(member_gender) %>%
  summarise(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
fordgobike2018 %>%
  filter(clust == 2) %>%
  group_by(member_gender) %>%
  summarise(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
fordgobike2018 %>%
  filter(clust == 3) %>%
  group_by(member_gender) %>%
  summarise(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
oakland %>%
  group_by(member_gender) %>%
  summarise(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))