-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_data import via csv.R
108 lines (94 loc) · 5.24 KB
/
01_data import via csv.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
## from strava archive request. zip file is in ~/data. the unzipped files needed for analysis are in the data folder of this project.
# gpx files are still in the zip archive. will import a few in another script to analyse
# data dictionary https://stravametro.zendesk.com/hc/en-us/articles/1500001573281-Glossary-Data-Dictionary
# api https://developers.strava.com/docs/reference/
# data set produced here will be used to augment data produced in "01_data import via API"
library(tidyverse) # to do tidyverse things
library(tidylog) # to get a log of what's happening to the data
library(janitor) # tools for data cleaning
# EDA tools
library(DataExplorer)
library(explore)
library(skimr)
# some custom functions
source("~/Data/r/basic functions.R")
# sets theme_light() as the default theme for all plots
# (theme_set() takes a theme object, so theme_light has to be called here,
# not passed as a bare function)
theme_set(theme_light())
## ggplot helpers - load if necessary
library(patchwork) # to stitch together plots
library(ggtext) # helper functions for ggplot text
library(ggrepel) # helper functions for ggplot text
### load data
# Read the activity export, convert the headers to snake_case and give the
# numerically suffixed columns readable names. Presumably the suffixes come
# from headers that occur more than once in the export: the lower-numbered
# column keeps the plain name, the higher-numbered one gets a trailing "2".
strava_activities1 <-
  readr::read_csv("data/activities.csv") %>%
  clean_names() %>%
  as_tibble() %>%
  rename(
    elapsed_time     = elapsed_time_6,
    distance         = distance_7,
    max_heart_rate   = max_heart_rate_8,
    relative_effort  = relative_effort_9,
    commute          = commute_10,
    elapsed_time2    = elapsed_time_16,
    distance2        = distance_18,
    relative_effort2 = relative_effort_38,
    commute2         = commute_51
  )

# quick structural check of the imported table
glimpse(strava_activities1)
## skimr summary
# Open the skim() overview of all columns in the viewer.
# To restrict the columns, insert a select() step before skim().
strava_activities1 %>% skim() %>% view()

# Put the three distance columns side by side in the viewer to compare them.
strava_activities1 %>%
  select(
    dirt_distance,
    distance,
    distance2
  ) %>%
  view()
## list the time zone names available to R (tzdir left at its default, NULL)
OlsonNames()
### clean data, redo as necessary after running basic EDA
# Builds the analysis table from the raw import:
#   * fills gaps in the commute flag and the gear name
#   * parses the activity timestamp once and converts it to local time
#   * derives year / month / day / weekday / hour / minute from that local time
#   * keeps only the columns listed in the final select()
#
# Changes against the previous version of this step:
#   * the timestamp is parsed directly with mdy_hms() instead of expanding
#     month abbreviations and splitting the string by hand
#   * year, month and day of month now come from the same local timestamp as
#     weekday and hour (before, they came from the unconverted timestamp, so
#     rows close to midnight could disagree with their own weekday)
#   * activity_hmt is zero padded ("07:05" instead of "7:5")
#   * helper columns that select() threw away anyway are no longer computed
#   * lubridate is called with its namespace, so the step does not rely on
#     library(tidyverse) attaching lubridate
#
# NOTE(review): like the original, this treats the exported timestamps as UTC
# (the default tz of the lubridate parsers) - confirm against the export.
strava_activities <- strava_activities1 %>%
  # clean up NA in commute flag
  mutate(commute2 = ifelse(is.na(commute2) & commute == "FALSE", 0, commute2)) %>%
  # clean up missing gear type
  mutate(activity_gear = ifelse(activity_type == "Ride" & is.na(activity_gear),
                                "Commute bike", activity_gear)) %>%
  # parse the timestamp string (month name, day, year, 12-hour clock with
  # AM/PM - the layout implied by the original parsing code)
  mutate(activity_ymdhms_dt = lubridate::mdy_hms(activity_date)) %>%
  mutate(activity_ymd = lubridate::as_date(activity_ymdhms_dt)) %>%
  # activities from the cut-over date onwards are treated as Copenhagen time,
  # everything else (including unparsed dates) as US/Pacific
  mutate(activity_tz = case_when(
    activity_ymd >= as.Date("2022-06-28") ~ "Europe/Copenhagen",
    TRUE ~ "US/Pacific")) %>%
  mutate(activity_ymdhms_cet = lubridate::as_datetime(activity_ymdhms_dt,
                                                      tz = "Europe/Copenhagen")) %>%
  mutate(activity_ymdhms_pst = lubridate::as_datetime(activity_ymdhms_dt,
                                                      tz = "US/Pacific")) %>%
  # local calendar and clock components, picked per row by activity_tz
  mutate(activity_wday = case_when(
    activity_tz == "US/Pacific" ~
      lubridate::wday(activity_ymdhms_pst, label = TRUE, abbr = FALSE),
    activity_tz == "Europe/Copenhagen" ~
      lubridate::wday(activity_ymdhms_cet, label = TRUE, abbr = FALSE))) %>%
  mutate(activity_year = ifelse(activity_tz == "US/Pacific",
                                lubridate::year(activity_ymdhms_pst),
                                lubridate::year(activity_ymdhms_cet))) %>%
  # month.name keeps the month as a full English name, as before
  mutate(activity_month_name = month.name[
    ifelse(activity_tz == "US/Pacific",
           lubridate::month(activity_ymdhms_pst),
           lubridate::month(activity_ymdhms_cet))]) %>%
  # day of month stays a character column, as before
  mutate(activity_mday = as.character(
    ifelse(activity_tz == "US/Pacific",
           lubridate::day(activity_ymdhms_pst),
           lubridate::day(activity_ymdhms_cet)))) %>%
  mutate(activity_hour = ifelse(activity_tz == "US/Pacific",
                                lubridate::hour(activity_ymdhms_pst),
                                lubridate::hour(activity_ymdhms_cet))) %>%
  # both zones are whole hours away from UTC, so the minute is the same in
  # every representation of the timestamp
  mutate(activity_min = lubridate::minute(activity_ymdhms_dt)) %>%
  mutate(activity_hmt = sprintf("%02d:%02d", activity_hour, activity_min)) %>%
  mutate(activity_hm = lubridate::hm(activity_hmt)) %>%
  select(activity_id, activity_ymdhms_cet, activity_ymdhms_pst, activity_tz,
         activity_year, activity_month = activity_month_name,
         activity_date = activity_mday, activity_wday,
         activity_hour, activity_min, activity_hmt, activity_hm,
         activity_name:activity_type, activity_gear, commute_txt = commute,
         commute_n = commute2, distance_km = distance, distance_m = distance2,
         elapsed_time, moving_time,
         average_speed, average_elapsed_speed, max_speed,
         elevation_gain:elevation_high, average_grade, max_grade,
         average_watts, prefer_perceived_exertion, calories, filename)
# quick structural check of the cleaned table
glimpse(strava_activities)

# Save the cleaned table and reload it from the same file. The reload used to
# point at "data/strava_activities_from_csv.rds", which this script never
# writes, so it either failed or loaded an unrelated/stale file.
# NOTE(review): if "_from_csv" is the intended file name, change the path
# below (and any script that reads the saved file) instead.
strava_rds_path <- "data/strava_activities.rds"
saveRDS(strava_activities, file = strava_rds_path)
strava_activities <- readRDS(file = strava_rds_path)
### continue with deeper analysis here or start new r script