code

###################### IMPORT AND AGGREGATION ########################

# For this project, I loaded data into RStudio in order to clean and analyze it using R.
# First, I installed the tidyverse package and loaded relevant libraries:

install.packages("tidyverse")
library(tidyverse)

# Then, I loaded and aggregated the 12 separate .csv files so that these data were accessible and organized. 
# First, uploading the files to RStudio using readr library...

trips_2021_04 <- read_csv("202104-divvy-tripdata.csv")
trips_2021_05 <- read_csv("202105-divvy-tripdata.csv")
trips_2021_06 <- read_csv("202106-divvy-tripdata.csv")
trips_2021_07 <- read_csv("202107-divvy-tripdata.csv")
trips_2021_08 <- read_csv("202108-divvy-tripdata.csv")
trips_2021_09 <- read_csv("202109-divvy-tripdata.csv")
trips_2021_10 <- read_csv("202110-divvy-tripdata.csv")
trips_2021_11 <- read_csv("202111-divvy-tripdata.csv")
trips_2021_12 <- read_csv("202112-divvy-tripdata.csv")
trips_2022_01 <- read_csv("202201-divvy-tripdata.csv")
trips_2022_02 <- read_csv("202202-divvy-tripdata.csv")
trips_2022_03 <- read_csv("202203-divvy-tripdata.csv")

# Time to wrangle this data into a single file!
# First, I compare column names to make sure everything will harmonize...

colnames(trips_2021_04) 
colnames(trips_2021_05)
colnames(trips_2021_06)
colnames(trips_2021_07)
colnames(trips_2021_08)
colnames(trips_2021_09)
colnames(trips_2021_10)
colnames(trips_2021_11)
colnames(trips_2021_12)
colnames(trips_2022_01)
colnames(trips_2022_02)
colnames(trips_2022_03)

# Now to inspect the dataframes and look for inconsistencies...

str(trips_2021_04) 
str(trips_2021_05)
str(trips_2021_06)
str(trips_2021_07)
str(trips_2021_08)
str(trips_2021_09)
str(trips_2021_10)
str(trips_2021_11)
str(trips_2021_12)
str(trips_2022_01)
str(trips_2022_02)
str(trips_2022_03)

# These files are formatted consistently already. No need to convert any data types, etc.
# Now, I can stack all these dataframes into one big dataframe

all_trips <- bind_rows(trips_2021_04, trips_2021_05, trips_2021_06, trips_2021_07, trips_2021_08, 
trips_2021_09, trips_2021_10, trips_2021_11, trips_2021_12, trips_2022_01, trips_2022_02, trips_2022_03)

# Let's remove fields we don't need. We won't be using locational data since it's not useful to our analysis, so we'll remove longitude and latitude.

all_trips <- all_trips %>%  
  select(-c(start_lat, start_lng, end_lat, end_lng))

# Now we'll look at the new table and make sure it looks the way we expect it to...
# Inspect column names:

colnames(all_trips)  

# Columns remaining are: ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, member_casual, ride_length, and day_of_week
# How many rows are in this big data frame?

nrow(all_trips)  

# Wow! This dataframe has 5,723,532 rows. 
# Quick check of the dataframe's dimensions? 

dim(all_trips)  

# As expected, we have 5,723,532 rows and 11 columns
# Let's make sure the data types came out the way we expected them to: 

str(all_trips) 
summary(all_trips)

####################### DATA CLEANING #######################

# First we'll inspect and clean columns to make sure they only have specific values. 
# Let's look at the member_casual column:

table(all_trips$member_casual)

# member_casual has 2,546,542 casual riders and 3,176,990 Cyclistic members, suggesting that more than half of Cyclistic users are already members. There are no unexpected values here.
# Need to change the data types of dates, the started_at and ended_at columns are currently stored as characters. 
# First let's duplicate the table. We can double check for duplicates here using the distinct function in dplyr.

library(dplyr)
all_trips_clean <- distinct(all_trips)
all_trips_clean

# Great, the data is definitely free of duplicate rows and now we have a copy
# Let's convert strings to datetime using lubridate in the format mdy_hm, since times are formatted as HH:MM.

library(lubridate)
all_trips_clean$started_at <- mdy_hm(all_trips_clean$started_at)
all_trips_clean$ended_at <- mdy_hm(all_trips_clean$ended_at)
all_trips_clean

# Just checking that the data type was successfully changed in the last line, looks good. FYI lubridate assumes UTC timezone
# Now we'll make new columns so that we can see patterns in ride frequency across days, months, and years.

all_trips_clean$date <- as.Date(all_trips_clean$started_at) 
all_trips_clean$month <- format(as.Date(all_trips_clean$date), "%m")
all_trips_clean$day <- format(as.Date(all_trips_clean$date), "%d")
all_trips_clean$year <- format(as.Date(all_trips_clean$date), "%Y")

# Let's get a look at that table. We should have 4 new columns.

str(all_trips_clean)

# Great!
# Clean ride_length column for rides negative time in length. R already recognizes it as a numeric data type in hms format
# We'll make another new version of the dataframe since data is being removed here

all_trips_clean_v2 <- all_trips_clean[!(all_trips_clean$start_station_name == "HQ QR" | all_trips_clean$ride_length<0),]
all_trips_clean_v2

# This did not drop any rows. Hmm. We'll try again and see if we can drop by 00:03. We don't need to remove the HQ station name again. 

all_trips_clean_v3 <- all_trips_clean_v2[!(all_trips_clean_v2$ride_length<00:03),]
all_trips_clean_v3

# Okay, we dropped 1689 rows, or 0.029% of the dataset and no rides should be less than 3 minutes in length.
# The data should be good to go for analysis!

# Let's start with a descriptive analysis and see what R returns. Times are currently in HH:MM

mean(all_trips_clean_v3$ride_length) #straight average (total ride length / rides)
median(all_trips_clean_v3$ride_length) #midpoint number in the ascending array of ride lengths
max(all_trips_clean_v3$ride_length) #longest ride
min(all_trips_clean_v3$ride_length) #shortest ride

# Shortest ride is coming up as 0 seconds, which means something went wrong in our ride_length cleaning
# I decided to make a new column for ride duration with a calculation using start_time and end_time

all_trips_clean_v3$duration_seconds <- difftime(all_trips_clean_v3$ended_at,all_trips_clean_v3$started_at)

# Make this numeric so the calculations work

is.factor(all_trips_clean_v3$duration_seconds)
all_trips_clean_v3$duration_seconds <- as.numeric(as.character(all_trips_clean_v3$duration_seconds))
is.numeric(all_trips_clean_v3$duration_seconds)

str(all_trips_clean_v3)

# Great now we have trip duration in seconds.
# On to descriptive stats...

mean(all_trips_clean_v3$duration_seconds) #straight average (total ride length / rides)
median(all_trips_clean_v3$duration_seconds) #midpoint number in the ascending array of ride lengths
max(all_trips_clean_v3$duration_seconds) #longest ride
min(all_trips_clean_v3$duration_seconds) #shortest ride

View(all_trips_clean_v3)

# Okay! So the duration column blew off the seconds part of the HHMMSS data AND we still can't do calculations with it

aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = mean)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = median)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = max)
aggregate(all_trips_clean_v3$ride_length ~ all_trips_clean_v3$member_casual, FUN = min)

# Very strange that min being returned is zero seconds. 
# The data shouldn't have any trips less than 3 minutes
# Let's try the duration column

aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = mean)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = median)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = max)
aggregate(all_trips_clean_v3$duration_seconds ~ all_trips_clean_v3$member_casual, FUN = min)

# Ok let's sort the dataset by the duration_seconds column and see what's up

sorted_by_duration <- all_trips_clean_v3[order(all_trips_clean_v3$duration_seconds),]
View(sorted_by_duration)

# Okay we have a bunch of unwanted trip duration lengths. Let's try to drop by 00:03:00

all_trips_clean_v4 <- all_trips_clean_v3[!(all_trips_clean_v3$duration_seconds<180),]
all_trips_clean_v4

# Okay let's try again. This removed 3.39% of the dataset. 

rm(sorted_by_duration)

sorted_by_duration <- all_trips_clean_v4[order(all_trips_clean_v4$duration_seconds),]
View(sorted_by_duration)

# Aggregations again

aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$ride_length ~ all_trips_clean_v4$member_casual, FUN = min)

# Let's try the duration column

aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = min)

# Somewhere along the way, the seconds portion of the datetime column started_at and ended_at disappeared
# Let's look at the original data and see if it's there

View(trips_2021_04)

# No seconds in these columns! Not sure what happened in Excel that the ride_length column came out in HH:MM:SS with anything besides 00 in seconds
# We'll continue without this column and use duration_seconds for calculations instead

###################### ANALYSIS ######################

# Comparing members and casual riders using the new duration_seconds column for aggregations
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = mean)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = median)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = max)
aggregate(all_trips_clean_v4$duration_seconds ~ all_trips_clean_v4$member_casual, FUN = min)

# Looks like even though we know there are more Cyslistic members in the user pool than casual riders,
# Casual riders are using the bikes for longer periods of time
# How does this translate to frequency of bike use?

# Various errors occured from using the other Excel column, day_of_week, so another version of the data was made and day_of_week and ride_length columns removed, day_of_week added back in using R.
# Here is that code:

# Okay let's try starting a better table from a previous iteration
all_trips_clean_v5 <- all_trips_clean_v3[!(all_trips_clean_v3$duration_seconds<180),]

# Remove ride_length
all_trips_clean_v5 <- all_trips_clean_v5 %>%  
  select(-c(ride_length))

#Remove day_of_week
all_trips_clean_v5 <- all_trips_clean_v5 %>%  
  select(-c(day_of_week))

#Add day_of_week back using R
all_trips_clean_v5$day_of_week <- format(as.Date(all_trips_clean_v5$date), "%A")
View(all_trips_clean_v5)

# Run average tide time by member status per day

aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual + all_trips_clean_v5$day_of_week, FUN = mean)

# Ok... now it works
# Tried a plot and had an unexpected column of NAs, cleaning now with dplyr

library(dplyr)
all_trips_clean_v5 <- all_trips_clean_v5 %>% filter_all(all_vars(complete.cases(.)))  

# This cleaned out about 1.06 million rows of data with any NA field

# Visualize number of rides by rider type 
all_trips_clean_v5 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(duration_seconds)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title="Average Daily Rides", subtitle="Members vs Casual Riders", fill = "Rider Type") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
  
#Aggregations again with reduced data
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = mean)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = median)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = max)
aggregate(all_trips_clean_v5$duration_seconds ~ all_trips_clean_v5$member_casual, FUN = min)

# Count members vs casual
all_trips_clean_v5 %>% 
  count(member_casual)

# Average ride length plot
all_trips_clean_v5 %>% 
  mutate(weekday = wday(started_at, label = TRUE)) %>% 
  group_by(member_casual, weekday) %>% 
  summarise(number_of_rides = n()
            ,average_duration = mean(duration_seconds)) %>% 
  arrange(member_casual, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(title="Average Ride Length", subtitle="Members vs Casual Riders", fill = "Rider Type") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

#Average rides by month plot
ggplot(data = all_trips_clean_v5) +
  geom_bar(mapping = aes(x = month, fill = member_casual), position = "dodge") +
  labs(title="Monthly Ridership Trends", subtitle="Members vs Casual Riders", fill = "Rider Type") +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))