title | author | output | ||||||
---|---|---|---|---|---|---|---|---|
Fitness Activity Monitoring |
Mohammad Ridhwan Tamjis |
|
Dataset: Activity monitoring data
library(ggplot2)
library(dplyr)
library(data.table)
library(kableExtra)
library(lubridate)
# Fetch and unpack the activity dataset.
# The check is on the extracted CSV rather than on the directory, so an
# existing but empty/incomplete ./data folder no longer skips the download.
baseDir <- "."
dataDir <- file.path(baseDir, "data")
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
fileName <- file.path(dataDir, "activity.zip")
if (!file.exists(file.path(dataDir, "activity.csv"))) {
  dir.create(dataDir, showWarnings = FALSE, recursive = TRUE)
  if (!file.exists(fileName)) {
    # mode = "wb" keeps the zip archive intact on Windows; the default
    # download method is used so no external curl binary is required.
    download.file(url, destfile = fileName, mode = "wb")
    # base:: is explicit because lubridate masks date().
    dateDownloaded <- base::date()
  }
  unzip(fileName, exdir = dataDir)
}
# Load the raw activity data and derive time-of-day columns.
# `interval` encodes the clock time as HHMM (e.g. 835 -> 08:35), so integer
# division and modulo by 100 split it into hour and minutes.
datapath <- "./data/activity.csv"
data <- read.csv(datapath)
data <- mutate(
  data,
  hour = interval %/% 100,
  minutes = interval %% 100,
  date = factor(as.Date(date))
)
# Minutes elapsed since midnight. The column is referenced by its full name
# (`minutes`); the previous `data$minute` only worked via partial matching.
data$elapsed <- data$hour * 60 + data$minutes
# Zero-padded "HH:MM" label for each interval.
data$timeInterval <- as.factor(sprintf("%02d:%02d", data$hour, data$minutes))
# Total steps per day, with a histogram and the mean/median of daily totals.
# Rows with missing steps are dropped by aggregate()'s default
# na.action = na.omit. No extra argument is passed to sum(): aggregate()
# forwards `...` to FUN, so the former `na.exclude=TRUE` was summed as an
# additional value of 1 and inflated every daily total by one step.
num_steps <- aggregate(steps ~ date, data, sum)
step_hist <- ggplot(num_steps, aes(steps)) +
  geom_histogram(bins = 20, col = "black", fill = "cornflowerblue") +
  ggtitle("Total number of steps taken each day")
print(step_hist)
steps_mean <- round(mean(num_steps$steps, na.rm = TRUE), 2)
steps_median <- round(median(num_steps$steps, na.rm = TRUE), 2)
The mean of the total number of steps taken per day is $1.076719 \times 10^{4}$.
The median of the total number of steps taken per day is $1.0766 \times 10^{4}$.
# Average steps per 5-minute interval across all days, plotted as a line,
# plus the interval with the highest average.
# Missing steps are dropped by aggregate()'s default na.action = na.omit;
# the former `na.exclude=TRUE` was silently ignored by mean() and is removed.
avg_steps <- aggregate(steps ~ interval, data, mean)
interval_hist <- ggplot(avg_steps, aes(interval, steps)) +
  geom_line(col = "darkmagenta") +
  ggtitle("Interval vs Averaged Steps")
print(interval_hist)
# Interval identifier (HHMM) at which the average step count peaks.
max_steps <- avg_steps$interval[which.max(avg_steps$steps)]
The 5-minute interval which contains the maximum number of steps is 835
# Count the missing values in each of the three original columns.
na_counts <- vapply(
  data[c("steps", "date", "interval")],
  function(column) sum(is.na(column)),
  integer(1)
)
sum_na_steps <- na_counts[["steps"]]
sum_na_date <- na_counts[["date"]]
sum_na_interval <- na_counts[["interval"]]
• Total number of missing values in data$steps: 2304
• Total number of missing values in data$date: 0
• Total number of missing values in data$interval: 0
Missing values exist only in the steps column, so the filling operation focuses on that column.
For this operation, each missing step value is replaced with the (rounded) mean number of steps for its 5-minute interval.
# Impute missing step counts with the rounded mean of the same 5-minute
# interval (taken from avg_steps), leaving the original `data` untouched.
# match() looks up each missing row's interval in avg_steps in one
# vectorized pass instead of scanning avg_steps once per row.
data_new <- data
missing_rows <- is.na(data_new$steps)
interval_idx <- match(data_new$interval[missing_rows], avg_steps$interval)
data_new$steps[missing_rows] <- round(avg_steps$steps[interval_idx])
# Daily step totals after imputation, with histogram and summary statistics.
new_num_steps <- aggregate(steps ~ date, data = data_new, FUN = sum)
new_steps_hist <- ggplot(data = new_num_steps, mapping = aes(x = steps)) +
  geom_histogram(bins = 20, colour = "black", fill = "darkmagenta") +
  ggtitle("Total number of steps taken each day (NA's filled)")
print(new_steps_hist)
filled_totals <- new_num_steps$steps
new_steps_mean <- round(mean(filled_totals, na.rm = TRUE), digits = 2)
new_steps_median <- round(median(filled_totals, na.rm = TRUE), digits = 2)
# Side-by-side mean/median of daily totals for the original and filled data.
comparison_dt <- data.table(
  data = c("original", "filled"),
  mean = c(steps_mean, new_steps_mean),
  median = c(steps_median, new_steps_median)
)
comparison_table <- kbl(
  comparison_dt,
  align = "c",
  caption = "Table1: Statistical comparison of NA's and filled data for total number of steps taken per day",
  booktabs = TRUE
)
kable_styling(comparison_table, latex_options = c("striped", "hold_position"))
data | mean | median |
---|---|---|
original | 10767.19 | 10766 |
filled | 10765.64 | 10762 |
# Classify abbreviated English day labels as "weekend" or "weekday".
#
# x: character vector or factor of day labels such as "Sat" or "Mon".
# Returns a character vector the same length as x: "weekend" for "Sat" and
#   "Sun", "weekday" for any other label, and NA for missing input.
# Vectorized, so it accepts a whole column as well as a single value (the
# previous scalar `||` test errors on longer inputs in recent R versions).
dtype <- function(x) {
  day_label <- as.character(x)
  res <- rep("weekday", length(day_label))
  res[day_label %in% c("Sat", "Sun")] <- "weekend"
  res[is.na(day_label)] <- NA_character_
  res
}
# Label every observation with its day of the week and weekday/weekend type.
# `date` is stored as a factor of ISO date strings, so it is converted via
# character to Date (a plain Date is enough for wday(); no POSIXlt needed).
# locale = "C" pins the labels to English abbreviations so that dtype()'s
# "Sat"/"Sun" comparison does not depend on the session's LC_TIME setting.
data_new <- mutate(
  data_new,
  day = wday(as.Date(as.character(date)), label = TRUE, locale = "C")
)
# vapply() guarantees one character value per row, unlike sapply().
data_new <- mutate(
  data_new,
  day.type = factor(vapply(day, dtype, character(1)))
)
Panel plot containing a time series of the 5-minute interval (x-axis) against the average number of steps taken (y-axis), averaged across all weekday days or all weekend days.
# Mean steps per interval, computed separately for weekdays and weekends,
# drawn as one stacked panel per day type.
steps_week <- aggregate(steps ~ interval + day.type, data = data_new, FUN = mean)
pplot <- ggplot(data = steps_week, mapping = aes(x = interval, y = steps)) +
  geom_line(colour = "darkmagenta") +
  facet_wrap(~ day.type, ncol = 1) +
  ggtitle("Interval vs Average Steps (NA's filled)")
print(pplot)
Based on the above figure, there are slight differences in the activity pattern between weekdays and weekends. The graph suggests that the user took more steps on weekends than on weekdays.