-
Notifications
You must be signed in to change notification settings - Fork 0
/
code
311 lines (221 loc) · 13.2 KB
/
code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
###################### IMPORT AND AGGREGATION ########################
# For this project, I loaded data into RStudio in order to clean and analyze it using R.
# First, I installed the tidyverse package and loaded relevant libraries:
install.packages("tidyverse")
library(tidyverse)
# Then, I loaded and aggregated the 12 separate .csv files so that these data were accessible and organized.
# First, uploading the files to RStudio using readr library...
trips_2021_04 <- read_csv("202104-divvy-tripdata.csv")
trips_2021_05 <- read_csv("202105-divvy-tripdata.csv")
trips_2021_06 <- read_csv("202106-divvy-tripdata.csv")
trips_2021_07 <- read_csv("202107-divvy-tripdata.csv")
trips_2021_08 <- read_csv("202108-divvy-tripdata.csv")
trips_2021_09 <- read_csv("202109-divvy-tripdata.csv")
trips_2021_10 <- read_csv("202110-divvy-tripdata.csv")
trips_2021_11 <- read_csv("202111-divvy-tripdata.csv")
trips_2021_12 <- read_csv("202112-divvy-tripdata.csv")
trips_2022_01 <- read_csv("202201-divvy-tripdata.csv")
trips_2022_02 <- read_csv("202202-divvy-tripdata.csv")
trips_2022_03 <- read_csv("202203-divvy-tripdata.csv")
# Time to wrangle this data into a single file!
# First, I compare column names to make sure everything will harmonize...
colnames(trips_2021_04)
colnames(trips_2021_05)
colnames(trips_2021_06)
colnames(trips_2021_07)
colnames(trips_2021_08)
colnames(trips_2021_09)
colnames(trips_2021_10)
colnames(trips_2021_11)
colnames(trips_2021_12)
colnames(trips_2022_01)
colnames(trips_2022_02)
colnames(trips_2022_03)
# Now to inspect the dataframes and look for inconsistencies...
str(trips_2021_04)
str(trips_2021_05)
str(trips_2021_06)
str(trips_2021_07)
str(trips_2021_08)
str(trips_2021_09)
str(trips_2021_10)
str(trips_2021_11)
str(trips_2021_12)
str(trips_2022_01)
str(trips_2022_02)
str(trips_2022_03)
# These files are formatted consistently already. No need to convert any data types, etc.
# Now, I can stack all these dataframes into one big dataframe
all_trips <- bind_rows(trips_2021_04, trips_2021_05, trips_2021_06, trips_2021_07, trips_2021_08,
trips_2021_09, trips_2021_10, trips_2021_11, trips_2021_12, trips_2022_01, trips_2022_02, trips_2022_03)
# Let's remove fields we don't need. We won't be using locational data since it's not useful to our analysis, so we'll remove longitude and latitude.
all_trips <- all_trips %>%
select(-c(start_lat, start_lng, end_lat, end_lng))
# Now we'll look at the new table and make sure it looks the way we expect it to...
# Inspect column names:
colnames(all_trips)
# Columns remaining are: ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, member_casual, ride_length, and day_of_week
# How many rows are in this big data frame?
nrow(all_trips)
# Wow! This dataframe has 5,723,532 rows.
# Quick check of the dataframe's dimensions?
dim(all_trips)
# As expected, we have 5,723,532 rows and 11 columns
# Let's make sure the data types came out the way we expected them to:
str(all_trips)
summary(all_trips)
####################### DATA CLEANING #######################
# First we'll inspect and clean columns to make sure they only have specific values.
# Let's look at the member_casual column:
table(all_trips$member_casual)
# member_casual has 2,546,542 casual riders and 3,176,990 Cyclistic members, suggesting that more than half of Cyclistic users are already members. There are no unexpected values here.
# Need to change the data types of dates, the started_at and ended_at columns are currently stored as characters.
# First let's duplicate the table. We can double check for duplicates here using the distinct function in dplyr.
library(dplyr)
all_trips_clean <- distinct(all_trips)
all_trips_clean
# Great, the data is definitely free of duplicate rows and now we have a copy
# Let's convert strings to datetime using lubridate in the format mdy_hm, since times are formatted as HH:MM.
library(lubridate)
all_trips_clean$started_at <- mdy_hm(all_trips_clean$started_at)
all_trips_clean$ended_at <- mdy_hm(all_trips_clean$ended_at)
all_trips_clean
# Just checking that the data type was successfully changed in the last line, looks good. FYI lubridate assumes UTC timezone
# Now we'll make new columns so that we can see patterns in ride frequency across days, months, and years.
all_trips_clean$date <- as.Date(all_trips_clean$started_at)
all_trips_clean$month <- format(as.Date(all_trips_clean$date), "%m")
all_trips_clean$day <- format(as.Date(all_trips_clean$date), "%d")
all_trips_clean$year <- format(as.Date(all_trips_clean$date), "%Y")
# Let's get a look at that table. We should have 4 new columns.
str(all_trips_clean)
# Great!
# Clean ride_length column for rides negative time in length. R already recognizes it as a numeric data type in hms format
# We'll make another new version of the dataframe since data is being removed here
all_trips_clean_v2 <- all_trips_clean[!(all_trips_clean$start_station_name == "HQ QR" | all_trips_clean$ride_length<0),]
all_trips_clean_v2
# This did not drop any rows. Hmm. We'll try again and see if we can drop by 00:03. We don't need to remove the HQ station name again.
all_trips_clean_v3 <- all_trips_clean_v2[!(all_trips_clean_v2$ride_length<00:03),]
all_trips_clean_v3
# Okay, we dropped 1689 rows, or 0.029% of the dataset and no rides should be less than 3 minutes in length.
# The data should be good to go for analysis!
# Let's start with a descriptive analysis and see what R returns. Times are currently in HH:MM
mean(all_trips_clean_v3$ride_length) #straight average (total ride length / rides)
median(all_trips_clean_v3$ride_length) #midpoint number in the ascending array of ride lengths
max(all_trips_clean_v3$ride_length) #longest ride
min(all_trips_clean_v3$ride_length) #shortest ride
# Shortest ride is coming up as 0 seconds, which means something went wrong in our ride_length cleaning
# I decided to make a new column for ride duration with a calculation using start_time and end_time
all_trips_clean_v3$duration_seconds <- difftime(all_trips_clean_v3$ended_at,all_trips_clean_v3$started_at)
# Make this numeric so the calculations work
is.factor(all_trips_clean_v3$duration_seconds)
all_trips_clean_v3$duration_seconds <- as.numeric(as.character(all_trips_clean_v3$duration_seconds))
is.numeric(all_trips_clean_v3$duration_seconds)
str(all_trips_clean_v3)
# Great now we have trip duration in seconds.
# On to descriptive stats...
mean(all_trips_clean_v3$duration_seconds) #straight average (total ride length / rides)
median(all_trips_clean_v3$duration_seconds) #midpoint number in the ascending array of ride lengths
max(all_trips_clean_v3$duration_seconds) #longest ride
min(all_trips_clean_v3$duration_seconds) #shortest ride
View(all_trips_clean_v3)
# Okay! So the duration column blew off the seconds part of the HHMMSS data AND we still can't do calculations with it
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = mean)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = median)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = max)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = min)
# Very strange that min being returned is zero seconds.
# The data shouldn't have any trips less than 3 minutes
# Let's try the duration column
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = mean)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = median)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = max)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = min)
# Ok let's sort the dataset by the duration_seconds column and see what's up
sorted_by_duration <- all_trips_clean_v3[order(all_trips_clean_v3$duration_seconds),]
View(sorted_by_duration)
# Okay we have a bunch of unwanted trip duration lengths. Let's try to drop by 00:03:00
all_trips_clean_v4 <- all_trips_clean_v3[!(all_trips_clean_v3$duration_seconds<180),]
all_trips_clean_v4
# Okay let's try again. This removed 3.39% of the dataset.
rm(sorted_by_duration)
sorted_by_duration <- all_trips_clean_v4[order(all_trips_clean_v4$duration_seconds),]
View(sorted_by_duration)
# Aggregations again
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = min)
# Let's try the duration column
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = min)
# Somewhere along the way, the seconds portion of the datetime column started_at and ended_at disappeared
# Let's look at the original data and see if it's there
View(trips_2021_04)
# No seconds in these columns! Not sure what happened in Excel that the ride_length column came out in HH:MM:SS with anything besides 00 in seconds
# We'll continue without this column and use duration_seconds for calculations instead
###################### ANALYSIS ######################
# Comparing members and casual riders using the new duration_seconds column for aggregations
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = min)
# Looks like even though we know there are more Cyslistic members in the user pool than casual riders,
# Casual riders are using the bikes for longer periods of time
# How does this translate to frequency of bike use?
# Various errors occured from using the other Excel column, day_of_week, so another version of the data was made and day_of_week and ride_length columns removed, day_of_week added back in using R.
# Here is that code:
# Okay let's try starting a better table from a previous iteration
all_trips_clean_v5 <- all_trips_clean_v3[!(all_trips_clean_v3$duration_seconds<180),]
# Remove ride_length
all_trips_clean_v5 <- all_trips_clean_v5 %>%
select(-c(ride_length))
#Remove day_of_week
all_trips_clean_v5 <- all_trips_clean_v5 %>%
select(-c(day_of_week))
#Add day_of_week back using R
all_trips_clean_v5$day_of_week <- format(as.Date(all_trips_clean_v5$date), "%A")
View(all_trips_clean_v5)
# Run average tide time by member status per day
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual + all_trips_clean_v5$day_of_week, FUN = mean)
# Ok... now it works
# Tried a plot and had an unexpected column of NAs, cleaning now with dplyr
library(dplyr)
all_trips_clean_v5 <- all_trips_clean_v5 %>% filter_all(all_vars(complete.cases(.)))
# This cleaned out about 1.06 million rows of data with any NA field
# Visualize number of rides by rider type
all_trips_clean_v5 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(duration_seconds)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title="Average Daily Rides", subtitle="Members vs Casual Riders", fill = "Rider Type") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
#Aggregations again with reduced data
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = mean)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = median)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = max)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = min)
# Count members vs casual
all_trips_clean_v5 %>%
count(member_casual)
# Average ride length plot
all_trips_clean_v5 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(duration_seconds)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge") +
labs(title="Average Ride Length", subtitle="Members vs Casual Riders", fill = "Rider Type") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
#Average rides by month plot
ggplot(data = all_trips_clean_v5) +
geom_bar(mapping = aes(x = month, fill = member_casual), position = "dodge") +
labs(title="Monthly Ridership Trends", subtitle="Members vs Casual Riders", fill = "Rider Type") +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE))