Here is the R script I used to load, review, merge, add additional variables, filter and export to a new csv file.
#here are the R script I used to import the 12 months worth of CSV files
#review their structures to ensure all files have the same structures to merge together
#note - need to review each file's variables individually, which is not done in this script
install.packages("tidyverse")
install.packages("readr")
install.packages("janitor")
install.packages("lubridate")
library(tidyverse)
library(readr)
library(janitor)
library(lubridate)
df1 <- read_csv("202106-divvy-tripdata.csv")
df2 <- read_csv("202107-divvy-tripdata.csv")
df3 <- read_csv("202108-divvy-tripdata.csv")
df4 <- read_csv("202109-divvy-tripdata.csv")
df5 <- read_csv("202110-divvy-tripdata.csv")
df6 <- read_csv("202111-divvy-tripdata.csv")
df7 <- read_csv("202112-divvy-tripdata.csv")
df8 <- read_csv("202201-divvy-tripdata.csv")
df9 <- read_csv("202202-divvy-tripdata.csv")
df10 <- read_csv("202203-divvy-tripdata.csv")
df11 <- read_csv("202204-divvy-tripdata.csv")
df12 <- read_csv("202205-divvy-tripdata.csv")
df <- rbind(df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12)
#I ran below scripts for df1 to df12 to ensure all variables match before combining them.
#there should be an easier / more automatic way to compare them instead of manual comparison.
str(df)
glimpse(df)
colnames(df)
summary(df)
table(df$rideable_type)
#convert variable started_at and ended_at to POSIXlt type
#calculate weekdays to see daily pattern
#calculate month of year to see monthly pattern
#calculate duration (ended_at - started_at) and convert the result to numeric
df$started_at <- as.POSIXlt(df$started_at)
df$ended_at <- as.POSIXlt(df$ended_at)
df$start_weekday <- weekdays(df$started_at)
df$start_month <- format(as.Date(df$started_at), "%m")
df$duration_min <- as.numeric((df$ended_at - df$started_at)/60)
summary(df)
#filter out any data that has duration less than or equal to 0
df_temp <- filter(df, duration_min > 0)
#select a smaller size of data
#after review, decided to keep only ride_id, rideable_type, started_at, ended_at, member_casual,
# start_weekday, duration_min and start_month
df1 <- select(df_temp, ride_id, rideable_type, started_at, ended_at, member_casual, start_weekday,
duration_min, start_month)
#save the file into file name "202106-202205-updatedbikedata.csv"
write.csv(df1,"202106-202205-updatedbikedata.csv",row.names=FALSE)
No comments:
Post a Comment