diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..aa06f51 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,7 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^README\.Rmd$ +^cran-comments\.md$ +^_pkgdown\.yml$ +^docs$ +^data-raw$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..4bf582a --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,38 @@ +Package: anomalize +Type: Package +Title: Tidy anomaly detection +Version: 0.1.0 +Author: c( + person("Matt", "Dancho", email = "mdancho@business-science.io", role = c("aut", "cre")), + person("Davis", "Vaughan", email = "dvaughan@business-science.io", role = c("aut")) + ) +Maintainer: Matt Dancho +Description: + The `anomalize` package enables a "tidy" workflow for detecting anomalies in data. + The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`. + When combined, it's quite simple to decompose time series, detect anomalies, + and create bands separating the "normal" data from the anomalous data. +URL: https://github.com/business-science/anomalize +BugReports: https://github.com/business-science/anomalize/issues +License: GPL (>= 3) +Encoding: UTF-8 +LazyData: true +Depends: + R (>= 3.0.0) +Imports: + dplyr, + glue, + tidyquant, + timetk, + sweep, + tibbletime, + purrr, + rlang, + tibble, + stringr, + tidyr, + ggplot2 +RoxygenNote: 6.0.1 +Roxygen: list(markdown = TRUE) +Suggests: + testthat diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..38afec3 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,45 @@ +# Generated by roxygen2: do not edit by hand + +S3method(anomalize,default) +S3method(anomalize,grouped_df) +S3method(anomalize,tbl_df) +S3method(plot_anomaly_decomposition,default) +S3method(plot_anomaly_decomposition,grouped_tbl_time) +S3method(plot_anomaly_decomposition,tbl_time) +S3method(prep_tbl_time,data.frame) +S3method(prep_tbl_time,tbl_time) +S3method(time_apply,data.frame) +S3method(time_decompose,default) +S3method(time_decompose,grouped_df) +S3method(time_decompose,grouped_tbl_time) +S3method(time_decompose,tbl_df) +S3method(time_decompose,tbl_time) +S3method(time_recompose,default) +S3method(time_recompose,grouped_df) +S3method(time_recompose,grouped_tbl_time) +S3method(time_recompose,tbl_df) +S3method(time_recompose,tbl_time) +export(anomalize) +export(decompose_multiplicative) +export(decompose_stl) +export(decompose_twitter) +export(gesd) +export(iqr) +export(plot_anomaly_decomposition) +export(prep_tbl_time) +export(time_apply) +export(time_decompose) +export(time_frequency) +export(time_recompose) +export(time_scale_template) +export(time_trend) +importFrom(dplyr,"%>%") +importFrom(dplyr,contains) +importFrom(dplyr,n) +importFrom(dplyr,row_number) +importFrom(rlang,"!!!") +importFrom(rlang,"!!") +importFrom(rlang,":=") +importFrom(stats,mad) +importFrom(stats,median) +importFrom(stats,qt) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..575762b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,3 @@ +# anomalize 0.1.0 + +* Added a `NEWS.md` file to track changes to the package. diff --git a/R/anomalize-package.R b/R/anomalize-package.R new file mode 100644 index 0000000..3c3129b --- /dev/null +++ b/R/anomalize-package.R @@ -0,0 +1,19 @@ +#' anomalize: Tidy anomaly detection +#' +#' @details +#' The `anomalize` package enables a tidy workflow for detecting anomalies in data. 
+#' The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`.
+#' When combined, it's quite simple to decompose time series, detect anomalies,
+#' and create bands separating the "normal" data from the anomalous data.
+#'
+#' To learn more about `anomalize`, start with the vignettes:
+#' `browseVignettes(package = "anomalize")`
+#'
+#' @docType package
+#' @name anomalize_package
+#'
+#' @importFrom rlang := !! !!!
+#' @importFrom dplyr %>% n row_number contains
+#' @importFrom stats median mad qt
+
+NULL
diff --git a/R/anomalize.R b/R/anomalize.R
new file mode 100644
index 0000000..520638a
--- /dev/null
+++ b/R/anomalize.R
@@ -0,0 +1,173 @@
+#' Detect anomalies using the tidyverse
+#'
+#' @inheritParams time_apply
+#' @param data A `tibble` or `tbl_time` object.
+#' @param method The anomaly detection method. One of `"iqr"` or `"gesd"`.
+#' The IQR method is faster at the expense of possibly not being quite as accurate.
+#' The GESD method has the best properties for outlier detection, but is loop-based
+#' and therefore a bit slower.
+#' @param alpha Controls the width of the "normal" range.
+#' Lower values are more conservative while higher values are less prone
+#' to incorrectly classifying "normal" observations.
+#' @param max_anoms The maximum percent of anomalies permitted to be identified.
+#' @param verbose A boolean. If `TRUE`, will return a list containing useful information
+#' about the anomalies. If `FALSE`, just returns the data expanded with the anomalies and
+#' the lower (l1) and upper (l2) bounds.
+#'
+#' @return Returns a `tibble` / `tbl_time` object or list depending on the value of `verbose`.
+#'
+#' @details
+#' The `anomalize()` function is used to detect outliers in a distribution
+#' with no trend or seasonality present. The return has three columns:
+#' "remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for
+#' anomalies), and "anomaly" (Yes/No).
+#'
+#' Use [time_decompose()] to decompose a time series prior to performing
+#' anomaly detection with `anomalize()`. Typically, `anomalize()` is
+#' performed on the "remainder" of the time series decomposition.
+#'
+#' For non-time series data (data without trend), the `anomalize()` function can
+#' be used without time series decomposition.
+#'
+#' The `anomalize()` function implements two methods for outlier detection,
+#' each with its own benefits.
+#'
+#' __IQR__:
+#'
+#' The IQR method uses the interquartile range between the 25% and 75% quantiles to establish a baseline distribution around
+#' the median. With the default `alpha = 0.05`, the limits are established by expanding
+#' the 25/75 baseline by an IQR Factor of 3 (3X). The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05).
+#' To increase the IQR Factor controlling the limits, decrease the alpha, which makes
+#' it more difficult to be an outlier. Increase alpha to make it easier to be an outlier.
+#'
+#' __GESD__:
+#'
+#' The GESD method (Generalized Extreme Studentized Deviate Test) progressively
+#' eliminates outliers using a Student's t-test comparing the test statistic to a critical value.
+#' Each time an outlier is removed, the test statistic is updated. Once the test statistic
+#' drops below the critical value, all outliers are considered removed. Because this method
+#' involves continuous updating via a loop, it is slower than the IQR method. However, it
+#' tends to be the best performing method for outlier removal.
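+#'
+#' As a rough sketch of the IQR limit logic (illustrative only, not the
+#' exact internal implementation):
+#'
+#' ```
+#' x <- rnorm(100)
+#' q <- stats::quantile(x, probs = c(0.25, 0.75))
+#' iqr_factor <- 0.15 / 0.05                    # 3X with the default alpha = 0.05
+#' limits <- q + iqr_factor * (q[[2]] - q[[1]]) * c(-1, 1)
+#' # values outside `limits` would be flagged as anomalies
+#' ```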
+#' +#' @seealso +#' Anomaly Detection Methods (Powers `anomalize`) +#' - [iqr()] +#' - [gesd()] +#' +#' Time Series Anomaly Detection Functions (anomaly detection workflow): +#' - [time_decompose()] +#' - [time_recompose()] +#' +#' @examples +#' +#' library(dplyr) +#' +#' data(tidyverse_cran_downloads) +#' +#' tidyverse_cran_downloads %>% +#' time_decompose(count, method = "stl") %>% +#' anomalize(remainder, method = "iqr") +#' +#' @references +#' - The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast/blob/master/R/clean.R) +#' - The GESD method is used in Twitter's [`AnomalyDetection`](https://github.com/twitter/AnomalyDetection) package and is also available as a function in [@raunakms's GESD method](https://github.com/raunakms/GESD/blob/master/runGESD.R) +#' +#' @export +anomalize <- function(data, target, method = c("iqr", "gesd"), + alpha = 0.05, max_anoms = 0.20, verbose = FALSE) { + UseMethod("anomalize", data) +} + +#' @export +anomalize.default <- function(data, target, method = c("iqr", "gesd"), + alpha = 0.05, max_anoms = 0.20, verbose = FALSE) { + stop("Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) +} + +#' @export +anomalize.tbl_df <- function(data, target, method = c("iqr", "gesd"), + alpha = 0.05, max_anoms = 0.20, verbose = FALSE) { + + # Checks + if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE) + + # Setup + target_expr <- rlang::enquo(target) + + method <- tolower(method[[1]]) + x <- data %>% dplyr::pull(!! target_expr) + + # Detect Anomalies + # method <- tolower(method[[1]]) + # args <- list(x = data %>% dplyr::pull(!! target_expr), + # alpha = alpha, + # max_anoms = max_anoms, + # verbose = TRUE) + # + # outlier_list <- do.call(method, args) + + # Explicitly call functions + if (method == "iqr") { + outlier_list <- anomalize::iqr(x = x, + alpha = alpha, + max_anoms = max_anoms, + verbose = TRUE) + } else if (method == "gesd") { + outlier_list <- anomalize::gesd(x = x, + alpha = alpha, + max_anoms = max_anoms, + verbose = TRUE) + + } else { + stop("The `method` selected is invalid.", call. = FALSE) + } + + outlier <- outlier_list$outlier + limit_lower <- outlier_list$critical_limits[[1]] + limit_upper <- outlier_list$critical_limits[[2]] + + # Returns + ret <- data %>% + dplyr::mutate(!! paste0(dplyr::quo_name(target_expr), "_l1") := limit_lower, + !! paste0(dplyr::quo_name(target_expr), "_l2") := limit_upper) %>% + tibble::add_column(anomaly = outlier) + + if (verbose) { + ret <- list( + anomalized_tbl = ret, + anomaly_details = outlier_list + ) + + return(ret) + + } else { + return(ret) + } + +} + +#' @export +anomalize.grouped_df <- function(data, target, method = c("iqr", "gesd"), + alpha = 0.05, max_anoms = 0.20, verbose = FALSE, ...) { + + # Checks + if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE) + if (verbose) warning(glue::glue("Cannot use 'verbose = TRUE' with grouped data.")) + + # Setup + target_expr <- dplyr::enquo(target) + + ret <- data %>% + grouped_mapper( + .f = anomalize, + target = !! target_expr, + method = method[[1]], + alpha = alpha, + max_anoms = max_anoms, + verbose = F, + ...) 
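+
+    # Note: each group is anomalized independently, so the anomaly limits
+    # (_l1/_l2) are computed separately for each group.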
+ + return(ret) + +} + diff --git a/R/anomalize_methods.R b/R/anomalize_methods.R new file mode 100644 index 0000000..39731f0 --- /dev/null +++ b/R/anomalize_methods.R @@ -0,0 +1,228 @@ +#' Methods that power anomalize() +#' +#' @inheritParams anomalize +#' @param x A vector of numeric data. +#' @param verbose A boolean. If `TRUE`, will return a list containing useful information +#' about the anomalies. If `FALSE`, just returns a vector of "Yes" / "No" values. +#' +#' @return Returns character vector or list depending on the value of `verbose`. +#' +#' +#' @seealso [anomalize()] +#' +#' @examples +#' +#' set.seed(100) +#' x <- rnorm(100) +#' idx_outliers <- sample(100, size = 5) +#' x[idx_outliers] <- x[idx_outliers] + 10 +#' +#' iqr(x, alpha = 0.05, max_anoms = 0.2) +#' iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) +#' +#' gesd(x, alpha = 0.05, max_anoms = 0.2) +#' gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) +#' +#' +#' @references +#' - The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast/blob/master/R/clean.R) +#' - The GESD method is used in Twitter's [`AnomalyDetection`](https://github.com/twitter/AnomalyDetection) package and is also available as a function in [@raunakms's GESD method](https://github.com/raunakms/GESD/blob/master/runGESD.R) +#' +#' @name anomalize_methods + +# 1A. IQR Method ---- + +#' @export +#' @rdname anomalize_methods +iqr <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) { + + quantile_x <- stats::quantile(x, prob = c(0.25, 0.75), na.rm = TRUE) + iq_range <- quantile_x[[2]] - quantile_x[[1]] + limits <- quantile_x + (0.15/alpha) * iq_range * c(-1, 1) + + outlier_idx <- ((x < limits[1]) | (x > limits[2])) + outlier_vals <- x[outlier_idx] + outlier_response <- ifelse(outlier_idx == TRUE, "Yes", "No") + + vals_tbl <- tibble::tibble(value = x) %>% + tibble::rownames_to_column(var = "index") %>% + # Establish limits and assess if outside of limits + dplyr::mutate( + limit_lower = limits[1], + limit_upper = limits[2], + abs_diff_lower = ifelse(value <= limit_lower, abs(value - limit_lower), 0), + abs_diff_upper = ifelse(value >= limit_upper, abs(value - limit_upper), 0), + max_abs_diff = ifelse(abs_diff_lower > abs_diff_upper, abs_diff_lower, abs_diff_upper) + ) %>% + dplyr::select(index, dplyr::everything()) %>% + dplyr::select(-c(abs_diff_lower, abs_diff_upper)) %>% + # Sort by absolute distance from centerline of limits + dplyr::mutate( + centerline = (limit_upper + limit_lower) / 2, + sorting = abs(value - centerline) + ) %>% + dplyr::arrange(dplyr::desc(sorting)) %>% + dplyr::select(-c(centerline, sorting)) %>% + tibble::rownames_to_column(var = "rank") %>% + dplyr::mutate( + rank = as.numeric(rank), + index = as.numeric(index) + ) %>% + # Identify outliers + dplyr::arrange(dplyr::desc(max_abs_diff)) %>% + dplyr::mutate( + outlier = ifelse(max_abs_diff > 0, "Yes", "No") , + below_max_anoms = ifelse(row_number() / n() > max_anoms, + "No", "Yes"), + outlier_reported = ifelse(outlier == "Yes" & below_max_anoms == "Yes", + "Yes", "No"), + direction = dplyr::case_when( + (outlier_reported == "Yes") & (value > limit_upper) ~ "Up", + (outlier_reported == "Yes") & (value < limit_lower) ~ "Down", + TRUE ~ "NA"), + direction = ifelse(direction == "NA", NA, direction) + ) + + vals_tbl_filtered <- vals_tbl %>% + dplyr::filter(below_max_anoms == "Yes") %>% + dplyr::select(-c(max_abs_diff:below_max_anoms)) %>% + dplyr::rename(outlier = outlier_reported) + + # Critical Limits + if (any(vals_tbl$outlier == 
"No")) { + # Non outliers identified, pick first limit + limit_tbl <- vals_tbl %>% + dplyr::filter(outlier == "No") %>% + dplyr::slice(1) + limits_vec <- c(limit_lower = limit_tbl$limit_lower, + limit_upper = limit_tbl$limit_upper) + } else { + # All outliers, pick last limits + limit_tbl <- vals_tbl %>% + dplyr::slice(n()) + limits_vec <- c(limit_lower = limit_tbl$limit_lower, + limit_upper = limit_tbl$limit_upper) + } + + # Return results + if (verbose) { + outlier_list <- list( + outlier = vals_tbl %>% dplyr::arrange(index) %>% dplyr::pull(outlier_reported), + outlier_idx = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(index), + outlier_vals = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(value), + outlier_direction = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(direction), + critical_limits = limits_vec, + outlier_report = vals_tbl_filtered) + return(outlier_list) + } else { + return(vals_tbl %>% dplyr::arrange(index) %>% dplyr::pull(outlier_reported)) + } +} + + + +# 1B. GESD: Generlized Extreme Studentized Deviate Test ---- + +#' @export +#' @rdname anomalize_methods +gesd <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) { + + # Variables + n <- length(x) + # r <- floor(n/2) # by default, set upper bound on number of outliers 'r' to 1/2 sample size + r <- trunc(n * max_anoms) # use max anoms to limit loop + R <- numeric(length=r) # test statistics for 'r' outliers + + lambda <- numeric(length = r) # critical values for 'r' outliers + outlier_ind <- numeric(length = r) # removed outlier observation values + outlier_val <- numeric(length = r) # removed outlier observation values + m <- 0 # number of outliers + x_new <- x # temporary observation values + median_new <- numeric(length = r) + mad_new <- numeric(length = r) + + # Outlier detection + for(i in 1:r){ + + # Compute test statistic + # z <- abs(x_new - mean(x_new))/sd(x_new) # Z-scores + median_new[i] <- median(x_new) + mad_new[i] <- mad(x_new) + + z <- abs(x_new - median(x_new))/mad(x_new) # Z-scores + + max_ind <- which(z == max(z),arr.ind=T)[1] # in case of ties, return first one + R[i] <- z[max_ind] # max Z-score + outlier_val[i] <- x_new[max_ind] # removed outlier observation values + outlier_ind[i] <- which(x_new[max_ind] == x, arr.ind=T)[1] # index of removed outlier observation values + x_new <- x_new[-max_ind] # remove observation that maximizes |x_i - x_mean| + + # Compute critical values + p <- 1 - alpha/(2*(n-i+1)) # probability + t_pv <- qt(p, df = (n-i-1)) # Critical value from Student's t distribution + lambda[i] <- ((n-i)*t_pv) / (sqrt((n-i-1+t_pv^2)*(n-i+1))) + + # Find exact number of outliers + # largest 'i' such that R_i > lambda_i + if(!is.na(R[i]) & !is.na(lambda[i])) { # qt can produce NaNs + if (R[i] > lambda[i]) { + m <- i + } + } + } + + vals_tbl <- tibble::tibble( + rank = as.numeric(1:r), + index = outlier_ind, + value = outlier_val, + test_statistic = R, + critical_value = lambda, + median = median_new, + mad = mad_new, + limit_lower = median - critical_value * mad, + limit_upper = critical_value * mad - median + ) %>% + dplyr::mutate( + outlier = ifelse(test_statistic > critical_value, "Yes", "No"), + direction = dplyr::case_when( + (outlier == "Yes") & (value > limit_upper) ~ "Up", + (outlier == "Yes") & (value < limit_lower) ~ "Down", + TRUE ~ "NA"), + direction = ifelse(direction == "NA", NA, direction) + ) %>% + dplyr::select(-c(test_statistic:mad)) + + outlier_index <- vals_tbl %>% dplyr::filter(outlier == 
"Yes") %>% dplyr::pull(index) + outlier_idx <- seq_along(x) %in% outlier_index + outlier_response <- ifelse(outlier_idx == TRUE, "Yes", "No") + + # Critical Limits + if (any(vals_tbl$outlier == "No")) { + # Non outliers identified, pick first limit + limit_tbl <- vals_tbl %>% + dplyr::filter(outlier == "No") %>% + dplyr::slice(1) + limits_vec <- c(limit_lower = limit_tbl$limit_lower, + limit_upper = limit_tbl$limit_upper) + } else { + # All outliers, pick last limits + limit_tbl <- vals_tbl %>% + dplyr::slice(n()) + limits_vec <- c(limit_lower = limit_tbl$limit_lower, + limit_upper = limit_tbl$limit_upper) + } + + # Return results + if (verbose) { + outlier_list <- list( + outlier = outlier_response, + outlier_idx = outlier_index, + outlier_vals = vals_tbl %>% dplyr::filter(outlier == "Yes") %>% dplyr::pull(value), + outlier_direction = vals_tbl %>% dplyr::filter(outlier == "Yes") %>% dplyr::pull(direction), + critical_limits = limits_vec, + outlier_report = vals_tbl) + return(outlier_list) + } else { + return(outlier_response) + } +} diff --git a/R/global_vars.R b/R/global_vars.R new file mode 100644 index 0000000..4e74892 --- /dev/null +++ b/R/global_vars.R @@ -0,0 +1,30 @@ +globalVariables(c( + "n", + ".", + ".period_groups", + "data", + "abs_diff_lower", + "abs_diff_upper", + "below_max_anoms", + "centerline", + "critical_value", + "direction", + "index", + "limit_lower", + "limit_upper", + "max_abs_diff", + "outlier", + "outlier_reported", + "sorting", + "test_statistic", + "value", + "observed", + "random", + "remainder", + "seasadj", + "season", + "trend", + "target", + "anomaly", + "key" + )) diff --git a/R/plot_anomaly_decomposition.R b/R/plot_anomaly_decomposition.R new file mode 100644 index 0000000..32b3087 --- /dev/null +++ b/R/plot_anomaly_decomposition.R @@ -0,0 +1,99 @@ +#' Visualize the time series decomposition with anomalies shown +#' +#' @param data A `tibble` or `tbl_time` object. +#' @param ncol Number of columns to display. Set to 1 for single column by default. +#' @param color_no Color for non-anomalous data. +#' @param color_yes Color for anomalous data. +#' @param alpha_dots Controls the transparency of the dots. Reduce when too many dots on the screen. +#' @param alpha_circles Controls the transparency of the circles that identify anomalies. +#' @param size_dots Controls the size of the dots. +#' @param size_circles Controls the size of the circles that identify anomalies. +#' @param strip.position Controls the placement of the strip that identifies the time series decomposition components. +#' +#' @return Returns a `ggplot` object. +#' +#' @details +#' The first step in reviewing the anomaly detection process is to evaluate +#' a single times series to observe how the algorithm is selecting anomalies. +#' The `plot_anomaly_decomposition()` function is used to gain +#' an understanding as to whether or not the method is detecting anomalies correctly and +#' whether or not parameters such as decomposition method, anomalize method, +#' alpha, frequency, and so on should be adjusted. 
+#' +#' @seealso [plot_anomalies()] +#' +#' @examples +#' +#' library(dplyr) +#' library(ggplot2) +#' +#' data(tidyverse_cran_downloads) +#' +#' tidyverse_cran_downloads %>% +#' filter(package == "tidyquant") %>% +#' ungroup() %>% +#' time_decompose(count, method = "stl") %>% +#' anomalize(remainder, method = "iqr") %>% +#' plot_anomaly_decomposition() +#' +#' @export +plot_anomaly_decomposition <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", + alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, + strip.position = "right") { + UseMethod("plot_anomaly_decomposition", data) + +} + +#' @export +plot_anomaly_decomposition.default <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", + alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, + strip.position = "right") { + stop("Object is not of class `tbl_time`.", call. = FALSE) + + +} + +#' @export +plot_anomaly_decomposition.grouped_tbl_time <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", + alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, + strip.position = "right") { + stop("Object cannot be grouped. Select a single time series for evaluation, and use `dplyr::ungroup()`.", call. = FALSE) + + +} + +#' @export +plot_anomaly_decomposition.tbl_time <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", + alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, + strip.position = "right") { + + # Checks + column_names <- names(data) + check_names <- c("observed", "remainder", "anomaly", "remainder_l1", "remainder_l2") %in% column_names + if (!all(check_names)) stop('Error in plot_anomaly_decomposition(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. = FALSE) + + + # Setup + date_expr <- tibbletime::get_index_quo(data) + date_col <- tibbletime::get_index_char(data) + + data_anomaly_tbl <- data %>% + dplyr::select(!! date_expr, observed:remainder, anomaly) %>% + tidyr::gather(key = key, value = value, -c(!! date_col, anomaly), factor_key = T) + + g <- data_anomaly_tbl %>% + ggplot2::ggplot(ggplot2::aes_string(x = date_col, y = "value", color = "anomaly")) + + # Points + ggplot2::geom_point(size = size_dots, alpha = alpha_dots) + + # Circles + ggplot2::geom_point(size = size_circles, shape = 1, alpha = alpha_circles, + data = data_anomaly_tbl %>% dplyr::filter(anomaly == "Yes")) + + # Horizontal Line at Y = 0 + ggplot2::geom_hline(yintercept = 0, color = tidyquant::palette_light()[[1]]) + + tidyquant::theme_tq() + + ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y", strip.position = strip.position) + + ggplot2::scale_color_manual(values = c("No" = color_no, "Yes" = color_yes)) + + return(g) + +} diff --git a/R/prep_tbl_time.R b/R/prep_tbl_time.R new file mode 100644 index 0000000..5cc21eb --- /dev/null +++ b/R/prep_tbl_time.R @@ -0,0 +1,56 @@ +#' Automatically create tibbletime objects from tibbles +#' +#' @param data A `tibble`. +#' @param message A boolean. If `TRUE`, returns a message indicating any +#' conversion details important to know during the conversion to `tbl_time` class. +#' +#' @return Returns a `tibbletime` object of class `tbl_time`. 
+#'
+#' @details
+#' Detects a date or datetime index column and automatically converts the
+#' data frame to a `tbl_time` object using the detected column as the index.
+#'
+#' @seealso [tibbletime::as_tbl_time()]
+#'
+#' @examples
+#'
+#' library(dplyr)
+#' library(tibbletime)
+#'
+#' data_tbl <- tibble(
+#'     date  = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10),
+#'     value = rnorm(10)
+#' )
+#'
+#' prep_tbl_time(data_tbl)
+#'
+#' @export
+prep_tbl_time <- function(data, message = FALSE) {
+    UseMethod("prep_tbl_time", data)
+}
+
+
+prep_tbl_time.default <- function(data, message = FALSE) {
+    stop("Object is not of class `data.frame`.", call. = FALSE)
+}
+
+
+#' @export
+prep_tbl_time.data.frame <- function(data, message = FALSE) {
+
+    cl <- class(data)[[1]]
+    idx <- timetk::tk_get_timeseries_variables(data)[[1]]
+
+    data <- data %>%
+        tibbletime::as_tbl_time(index = !! rlang::sym(idx))
+
+    if (message) message(glue::glue("Converting from {cl} to {class(data)[[1]]}.
+                                    Auto-index message: index = {idx}"))
+
+    return(data)
+}
+
+#' @export
+prep_tbl_time.tbl_time <- function(data, message = FALSE) {
+    return(data)
+}
+
diff --git a/R/tidyverse_cran_downloads.R b/R/tidyverse_cran_downloads.R
new file mode 100644
index 0000000..0bff1f9
--- /dev/null
+++ b/R/tidyverse_cran_downloads.R
@@ -0,0 +1,30 @@
+#' Downloads of various "tidyverse" packages from CRAN
+#'
+#' A dataset containing the daily download counts from 2017-01-01 to 2018-03-01
+#' for the following tidyverse packages:
+#' - `tidyr`
+#' - `lubridate`
+#' - `dplyr`
+#' - `broom`
+#' - `tidyquant`
+#' - `tidytext`
+#' - `ggplot2`
+#' - `purrr`
+#' - `stringr`
+#' - `forcats`
+#' - `knitr`
+#' - `readr`
+#' - `tibble`
+#' - `tidyverse`
+#'
+#'
+#' @format A grouped `tbl_time` object with 6,375 rows and 3 variables:
+#' \describe{
+#'   \item{date}{Date of the daily observation}
+#'   \item{count}{Number of downloads that day}
+#'   \item{package}{The package corresponding to the daily download number}
+#' }
+#'
+#' @source
+#' The package downloads come from CRAN by way of the `cranlogs` package.
+"tidyverse_cran_downloads"
diff --git a/R/time_apply.R b/R/time_apply.R
new file mode 100644
index 0000000..a032575
--- /dev/null
+++ b/R/time_apply.R
@@ -0,0 +1,155 @@
+#' Apply a function to a time series by period
+#'
+#' @inheritParams tibbletime::collapse_by
+#' @param data A `tibble` with a date or datetime index.
+#' @param target A column to apply the function to
+#' @param period A time-based definition (e.g. "2 weeks"),
+#' or a numeric number of observations per frequency (e.g. 10).
+#' See [tibbletime::collapse_by()] for period notation.
+#' @param .fun A function to apply (e.g. `median`)
+#' @param ... Additional parameters passed to the function, `.fun`
+#' @param message A boolean. If `message = TRUE`, the frequency used is output
+#' along with the units in the scale of the data.
+#'
+#' @return Returns a `tibbletime` object of class `tbl_time`.
+#'
+#' @details
+#' Uses a time-based period to apply functions to. This is useful in circumstances where you want to
+#' compare the observation values to aggregated values such as `mean()` or `median()`
+#' during a set time-based period. The returned output matches the
+#' length of the data frame so the differences can easily be computed.
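+#'
+#' For instance, because the output keeps one row per observation, the
+#' aggregated column (named `time_apply`) can be compared directly against
+#' the observed values. A quick sketch (the `diff_from_median` column name
+#' is hypothetical, chosen for illustration):
+#'
+#' ```
+#' tidyverse_cran_downloads %>%
+#'     time_apply(count, period = "1 week", .fun = median, na.rm = TRUE) %>%
+#'     dplyr::mutate(diff_from_median = count - time_apply)
+#' ```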
+#' +#' +#' @examples +#' +#' library(dplyr) +#' +#' data(tidyverse_cran_downloads) +#' +#' # Basic Usage +#' tidyverse_cran_downloads %>% +#' time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) +#' +#' @export +time_apply <- function(data, target, period, .fun, ..., + start_date = NULL, side = "end", clean = FALSE, message = TRUE) { + + UseMethod("time_apply", data) + +} + + +#' @export +time_apply.data.frame <- function(data, target, period, .fun, ..., + start_date = NULL, side = "end", clean = FALSE, message = TRUE) { + + # Checks + if (missing(target)) stop('Error in time_apply(): argument "target" is missing, with no default', call. = FALSE) + if (missing(period)) stop('Error in time_apply(): argument "period" is missing, with no default', call. = FALSE) + if (missing(.fun)) stop('Error in time_apply(): argument ".fun" is missing, with no default', call. = FALSE) + + + # Setup inputs + data <- prep_tbl_time(data, message = F) + + date_col_expr <- tibbletime::get_index_quo(data) + date_col_name <- dplyr::quo_name(date_col_expr) + + target_expr <- dplyr::enquo(target) + + # Function apply logic + if (is.character(period)) { + # See collapse_by for valid character sequences (e.g. "1 Y") + ret <- data %>% + tibbletime::collapse_by(period = period, clean = clean, start_date = start_date, side = side) %>% + dplyr::group_by(!! tibbletime::get_index_quo(.)) %>% + dplyr::mutate(time_apply = .fun(!! target_expr, ...)) %>% + dplyr::ungroup() %>% + dplyr::mutate(!! date_col_name := data %>% dplyr::pull(!! date_col_expr)) + + } else { + # Numeric (e.g. 15 data points) + ret <- data %>% + dplyr::mutate( + .period_groups = rep(1:period, length.out = nrow(.)) %>% sort() + ) %>% + dplyr::group_by(.period_groups) %>% + dplyr::mutate( + time_apply = .fun(!! target_expr, ...) + ) %>% + dplyr::ungroup() %>% + dplyr::select(-.period_groups) + } + + return(ret) + +} + +time_apply.grouped_df <- function(data, target, period, .fun, ..., + start_date = NULL, side = "end", clean = FALSE, message = TRUE) { + + # Checks + if (missing(target)) stop('Error in time_apply(): argument "target" is missing, with no default', call. = FALSE) + if (missing(period)) stop('Error in time_apply(): argument "period" is missing, with no default', call. = FALSE) + if (missing(.fun)) stop('Error in time_apply(): argument ".fun" is missing, with no default', call. = FALSE) + + + # Setup + data <- prep_tbl_time(data, message = F) + + target_expr <- dplyr::enquo(target) + + # Map time_apply.data.frame + ret <- data %>% + grouped_mapper( + .f = time_apply, + target = !! target_expr, + period = period, + .fun = .fun, + ... = ..., + start_date = start_date, + side = side, + clean = clean, + message = message) + + return(ret) + +} + +# Helper function for decompose_twitter +time_median <- function(data, target, period = "auto", template = time_scale_template(), message = TRUE) { + + # Setup inputs + data <- prep_tbl_time(data, message = F) + + date_col_expr <- tibbletime::get_index_quo(data) + date_col_name <- dplyr::quo_name(date_col_expr) + + target_expr <- dplyr::enquo(target) + + # For median_span = "auto" use template + if (period == "auto") { + + # Get timeseries summary attributes + ts_summary <- data %>% + tibbletime::get_index_col() %>% + timetk::tk_get_timeseries_summary() + + ts_scale <- ts_summary$scale + + period <- template %>% + target_time_decomposition_scale(ts_scale, "trend", index_shift = 0) + + } + + # Use time_apply() + ret <- data %>% + time_apply(!! 
target_expr, period = period,
+                   .fun = median, na.rm = T, clean = F, message = message) %>%
+        dplyr::rename(median_spans = time_apply)
+
+    if (message) message(glue::glue("median_span = {period}"))
+
+    return(ret)
+
+}
diff --git a/R/time_decompose.R b/R/time_decompose.R
new file mode 100644
index 0000000..f91ba88
--- /dev/null
+++ b/R/time_decompose.R
@@ -0,0 +1,207 @@
+#' Decompose a time series in preparation for anomaly detection
+#'
+#' @inheritParams anomalize
+#' @param data A `tibble` or `tbl_time` object.
+#' @param method The time series decomposition method. One of `"stl"`, `"twitter"`, or
+#' `"multiplicative"`. The STL method uses seasonal decomposition (see [decompose_stl()]).
+#' The Twitter method uses `median_spans` to remove the trend (see [decompose_twitter()]).
+#' The Multiplicative method uses multiplicative decomposition (see [decompose_multiplicative()]).
+#' @param frequency Controls the seasonal adjustment (removal of seasonality).
+#' Input can be either "auto", a time-based definition (e.g. "2 weeks"),
+#' or a numeric number of observations per frequency (e.g. 10).
+#' Refer to [time_frequency()].
+#' @param ... Additional parameters passed to the underlying method functions.
+#' @param merge A boolean. `FALSE` by default. If `TRUE`, will append results to the original data.
+#' @param message A boolean. If `TRUE`, will output information related to `tbl_time` conversions, frequencies,
+#' and median spans (if applicable).
+#'
+#' @return Returns a `tbl_time` object.
+#'
+#' @details
+#' The `time_decompose()` function generates a time series decomposition on
+#' `tbl_time` objects. The function is "tidy" in the sense that it works
+#' on data frames. It is designed to work with time-based data, and as such
+#' must have a column that contains date or datetime information. The function
+#' also works with grouped data. The function implements several methods
+#' of time series decomposition, each with benefits.
+#'
+#' __STL__:
+#'
+#' The STL method (`method = "stl"`) implements time series decomposition using
+#' the underlying [decompose_stl()] function. If you are familiar with [stats::stl()],
+#' the function is a "tidy" version that is designed to work with `tbl_time` objects.
+#' The decomposition separates the "season" and "trend" components from
+#' the "observed" values, leaving the "remainder" for anomaly detection.
+#' The main parameter that controls the seasonal adjustment is `frequency`.
+#' Setting `frequency = "auto"` lets the [time_frequency()] function
+#' automatically determine the frequency based on the scale of the time series.
+#'
+#' __Twitter__:
+#'
+#' The Twitter method (`method = "twitter"`) implements time series decomposition using
+#' the methodology from the Twitter [AnomalyDetection](https://github.com/twitter/AnomalyDetection) package.
+#' The decomposition separates the "seasonal" component and then removes
+#' the median data, which is a different approach than the STL method for removing
+#' the trend. This approach works very well for low-growth + high seasonality data.
+#' STL may be a better approach when trend is a large factor.
+#' The user can control two parameters: `frequency` and `median_spans`.
+#' The `frequency` parameter adjusts the "season" component that is removed
+#' from the "observed" values. The `median_spans` parameter adjusts the
+#' number of median spans that are used. The user may supply both `frequency`
+#' and `median_spans` as time-based durations (e.g. "6 weeks") or numeric values
+#' (e.g. 180) or "auto", which predetermines the frequency and/or median spans
+#' based on the scale of the time series.
+#'
+#' __Multiplicative__:
+#'
+#' The Multiplicative method (`method = "multiplicative"`) time series decomposition
+#' uses the [stats::decompose()] function with `type = "multiplicative"`. This
+#' method is useful in circumstances where variance is non-constant and typically
+#' growing in a multiplicative fashion. The parameters are the same as the STL method.
+#' Alternatively, users may wish to try a transformation (e.g. `log()` or `sqrt()`) in combination
+#' with the STL method to get near-constant variance.
+#'
+#' @seealso
+#' Decomposition Methods (Powers `time_decompose`)
+#' - [decompose_stl()]
+#' - [decompose_twitter()]
+#' - [decompose_multiplicative()]
+#'
+#' Time Series Anomaly Detection Functions (anomaly detection workflow):
+#' - [anomalize()]
+#' - [time_recompose()]
+#'
+#' @examples
+#'
+#' library(dplyr)
+#'
+#' data(tidyverse_cran_downloads)
+#'
+#' # Basic Usage
+#' tidyverse_cran_downloads %>%
+#'     time_decompose(count, method = "stl")
+#'
+#' # twitter + median_spans
+#' tidyverse_cran_downloads %>%
+#'     time_decompose(count,
+#'                    method = "twitter",
+#'                    frequency = "1 week",
+#'                    median_spans = "3 months",
+#'                    merge = TRUE,
+#'                    message = FALSE)
+#'
+#' @export
+time_decompose <- function(data, target, method = c("stl", "twitter", "multiplicative"),
+                           frequency = "auto", ..., merge = FALSE, message = TRUE) {
+    UseMethod("time_decompose", data)
+}
+
+#' @export
+time_decompose.default <- function(data, target, method = c("stl", "twitter", "multiplicative"),
+                                   frequency = "auto", ..., merge = FALSE, message = TRUE) {
+    stop("Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE)
+}
+
+#' @export
+time_decompose.tbl_time <- function(data, target, method = c("stl", "twitter", "multiplicative"),
+                                    frequency = "auto", ..., merge = FALSE, message = TRUE) {
+
+    # Checks
+    if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE)
+
+    # Setup
+    target_expr <- dplyr::enquo(target)
+    method <- tolower(method[[1]])
+
+    # Set method
+    if (method == "twitter") {
+        decomp_tbl <- data %>%
+            decompose_twitter(!! target_expr, frequency = frequency, message = message, ...)
+    } else if (method == "stl") {
+        decomp_tbl <- data %>%
+            decompose_stl(!! target_expr, frequency = frequency, message = message, ...)
+    } else if (method == "multiplicative") {
+        decomp_tbl <- data %>%
+            decompose_multiplicative(!! target_expr, frequency = frequency, message = message, ...)
+    } else {
+        stop(paste0("method = '", method[[1]], "' is not a valid option."))
+    }
+
+    # Merge if desired
+    if (merge) {
+        ret <- merge_two_tibbles(data, decomp_tbl, .f = time_decompose)
+    } else {
+        ret <- decomp_tbl
+    }
+
+    return(ret)
+
+}
+
+#' @export
+time_decompose.tbl_df <- function(data, target, method = c("stl", "twitter", "multiplicative"),
+                                  frequency = "auto", ..., merge = FALSE, message = TRUE) {
+
+    # Checks
+    if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE)
+
+    # Prep
+    data <- prep_tbl_time(data, message = message)
+
+    # Send to time_decompose.tbl_time
+    time_decompose(data = data,
+                   target = !! dplyr::enquo(target),
+                   method = method[[1]],
+                   frequency = frequency,
+                   ...
= ..., + merge = merge, + message = message) + +} + + + + +#' @export +time_decompose.grouped_tbl_time <- function(data, target, method = c("stl", "twitter", "multiplicative"), + frequency = "auto", merge = FALSE, message = FALSE, ...) { + + # Checks + if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE) + + # Setup + target_expr <- dplyr::enquo(target) + + # Mapping + ret <- data %>% + grouped_mapper( + .f = time_decompose, + target = !! target_expr, + method = method[[1]], + frequency = frequency, + merge = merge, + message = message, + ...) + + return(ret) + +} + +#' @export +time_decompose.grouped_df <- function(data, target, method = c("stl", "twitter", "multiplicative"), + frequency = "auto", merge = FALSE, message = FALSE, ...) { + + data <- prep_tbl_time(data, message = message) + + # Send to grouped_tbl_time + time_decompose(data = data, + target = !! dplyr::enquo(target), + method = method[[1]], + frequency = frequency, + ... = ..., + merge = merge, + message = message) + +} + + diff --git a/R/time_decompose_methods.R b/R/time_decompose_methods.R new file mode 100644 index 0000000..c73a7e1 --- /dev/null +++ b/R/time_decompose_methods.R @@ -0,0 +1,153 @@ +#' Methods that power time_decompose() +#' +#' @inheritParams time_decompose +#' +#' @return A `tbl_time` object containing the time series decomposition. +#' +#' @seealso [time_decompose()] +#' +#' @examples +#' +#' library(dplyr) +#' +#' tidyverse_cran_downloads %>% +#' ungroup() %>% +#' filter(package == "tidyquant") %>% +#' decompose_stl(count) +#' +#' +#' @references +#' - The "twitter" method is used in Twitter's [`AnomalyDetection` package](https://github.com/twitter/AnomalyDetection) +#' +#' @name decompose_methods + +# 2A. Twitter ---- + +#' @export +#' @rdname decompose_methods +#' @param median_spans Applies to the "twitter" method only. +#' The median spans are used to remove the trend and center the remainder. +decompose_twitter <- function(data, target, frequency = "auto", median_spans = "auto", message = TRUE) { + + # Checks + if (missing(target)) stop('Error in decompose_twitter(): argument "target" is missing, with no default', call. = FALSE) + + + data <- prep_tbl_time(data) + date_col_vals <- tibbletime::get_index_col(data) + + target_expr <- dplyr::enquo(target) + + date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] + date_col_expr <- rlang::sym(date_col_name) + + # Time Series Decomposition + decomp_tbl <- data %>% + dplyr::pull(!! target_expr) %>% + stats::ts(frequency = time_frequency(data, period = frequency, message = message)) %>% + stats::stl(s.window = "periodic", robust = TRUE) %>% + sweep::sw_tidy_decomp() %>% + dplyr::select(-c(index, seasadj)) %>% + # forecast::mstl() %>% + # as.tibble() %>% + tibble::add_column(!! date_col_name := date_col_vals, .after = 0) %>% + purrr::set_names(c(date_col_name, "observed", "season", "trend", "remainder")) %>% + dplyr::mutate(seasadj = observed - season) %>% + dplyr::select(!! date_col_expr, observed, season, seasadj, trend, remainder) %>% + + # Median Groups + time_median(observed, period = median_spans, message = message) %>% + + # Observed transformations + dplyr::mutate( + remainder = observed - season - median_spans + ) %>% + dplyr::select(!! date_col_expr, observed, season, median_spans, remainder) + + decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) + + return(decomp_tbl) + +} + + + +# 2B. 
STL ---- + +#' @export +#' @rdname decompose_methods +decompose_stl <- function(data, target, frequency = "auto", message = TRUE) { + + # Checks + if (missing(target)) stop('Error in decompose_stl(): argument "target" is missing, with no default', call. = FALSE) + + + data <- prep_tbl_time(data) + date_col_vals <- tibbletime::get_index_col(data) + + target_expr <- dplyr::enquo(target) + + date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] + date_col_expr <- rlang::sym(date_col_name) + + # Time Series Decomposition + decomp_tbl <- data %>% + dplyr::pull(!! target_expr) %>% + stats::ts(frequency = time_frequency(data, period = frequency, message = message)) %>% + stats::stl(s.window = "periodic", robust = TRUE) %>% + sweep::sw_tidy_decomp() %>% + # forecast::mstl() %>% + # as.tibble() %>% + tibble::add_column(!! date_col_name := date_col_vals, .after = 0) %>% + dplyr::select(!! date_col_expr, observed, season, trend, remainder) + + decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) + + return(decomp_tbl) + +} + + + + +# 2C. Multiplicative ---- + +#' @export +#' @rdname decompose_methods +decompose_multiplicative <- function(data, target, frequency = "auto", message = TRUE) { + + # Checks + if (missing(target)) stop('Error in decompose_multiplicative(): argument "target" is missing, with no default', call. = FALSE) + + # Setup inputs + data <- prep_tbl_time(data) + date_col_vals <- tibbletime::get_index_col(data) + + target_expr <- dplyr::enquo(target) + + date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] + date_col_expr <- rlang::sym(date_col_name) + + frequency <- anomalize::time_frequency(data, period = frequency, message = message) + + # Time Series Decomposition + decomp_tbl <- data %>% + dplyr::pull(!! target_expr) %>% + stats::ts(frequency = frequency) %>% + stats::decompose(type = "multiplicative") %>% + sweep::sw_tidy_decomp() %>% + dplyr::select(-index) %>% + dplyr::rename(remainder = random) %>% + dplyr::select(observed, season, seasadj, trend, remainder) %>% + tibble::add_column(!! date_col_name := date_col_vals, .after = 0) %>% + # Fix trend and remainder + dplyr::mutate( + trend = stats::supsmu(seq_along(observed), seasadj)$y, + remainder = observed / (trend * season) + ) + + decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) + + return(decomp_tbl) + +} diff --git a/R/time_frequency.R b/R/time_frequency.R new file mode 100644 index 0000000..ab87589 --- /dev/null +++ b/R/time_frequency.R @@ -0,0 +1,271 @@ +#' Generate a time series frequency from a periodicity +#' +#' @param data A `tibble` with a date or datetime index. +#' @param period Either "auto", a time-based definition (e.g. "2 weeks"), +#' or a numeric number of observations per frequency (e.g. 10). +#' See [tibbletime::collapse_by()] for period notation. +#' @param template A template that converts the scale of the data to +#' the expected frequency and trend spans. +#' @param message A boolean. If `message = TRUE`, the frequency used is output +#' along with the units in the scale of the data. +#' +#' @return Returns a scalar numeric value indicating the number of observations in the frequency or trend span. +#' +#' @details +#' A frequency is loosely defined as the number of observations that comprise a cycle +#' in a data set. The trend is loosely defined as time span that can +#' be aggregated across to visualize the central tendency of the data. +#' It's often easiest to think of frequency and trend in terms of the time-based units +#' that the data is already in. 
__This is what `time_frequency()`
+#' and `time_trend()` enable: using time-based periods to define the frequency or trend.__
+#'
+#' __Frequency__:
+#'
+#' As an example, a weekly cycle is often 5-days (for working
+#' days) or 7-days (for calendar days). Rather than specify a frequency of 5 or 7,
+#' the user can specify `period = "1 week"`, and
+#' `time_frequency()` will detect the scale of the time series and return 5 or 7
+#' based on the actual data.
+#'
+#' The `period` argument has three basic options for returning a frequency.
+#' Options include:
+#' - `"auto"`: A target frequency is determined using a pre-defined template (see `template` below).
+#' - `time-based duration`: (e.g. "1 week" or "2 quarters" per cycle)
+#' - `numeric number of observations`: (e.g. 5 for 5 observations per cycle)
+#'
+#' The `template` argument is only used when `period = "auto"`. The template is a tibble
+#' of three features: `time_scale`, `frequency`, and `trend`. The algorithm will inspect
+#' the scale of the time series and select the frequency that best matches the scale and
+#' number of observations per target frequency. A frequency is then chosen to be the
+#' best match. The predefined template is stored in a function `time_scale_template()`.
+#' However, the user can supply his or her own template by changing the values
+#' for frequency in the data frame.
+#'
+#' __Trend__:
+#'
+#' As an example, the trend of daily data is often best aggregated by evaluating
+#' the moving average over a quarter or a month span. Rather than specify the number
+#' of days in a quarter or month, the user can specify "1 quarter" or "1 month",
+#' and the `time_trend()` function will return the correct number of observations
+#' per trend cycle. In addition, there is an option, `period = "auto"`, to
+#' auto-detect an appropriate trend span depending on the data. The `template`
+#' is used to define the appropriate trend span.
+#'
+#' @examples
+#'
+#' library(dplyr)
+#'
+#' data(tidyverse_cran_downloads)
+#'
+#' #### FREQUENCY DETECTION ####
+#'
+#' # period = "auto"
+#' tidyverse_cran_downloads %>%
+#'     filter(package == "tidyquant") %>%
+#'     ungroup() %>%
+#'     time_frequency(period = "auto", template = time_scale_template())
+#'
+#' time_scale_template()
+#'
+#' # period = "1 month"
+#' tidyverse_cran_downloads %>%
+#'     filter(package == "tidyquant") %>%
+#'     ungroup() %>%
+#'     time_frequency(period = "1 month")
+#'
+#' #### TREND DETECTION ####
+#'
+#' tidyverse_cran_downloads %>%
+#'     filter(package == "tidyquant") %>%
+#'     ungroup() %>%
+#'     time_trend(period = "auto")
+
+
+#' @export
+#' @rdname time_frequency
time_frequency <- function(data, period = "auto", template = time_scale_template(), message = TRUE) {
+
+    # Checks
+    if (dplyr::is.grouped_df(data))
+        stop(glue::glue("Cannot use on a grouped data frame.
+                        Frequency should be performed on a single time series."))
+
+    # Setup inputs
+    data <- prep_tbl_time(data, message = F)
+
+    index_expr <- data %>% tibbletime::get_index_quo()
+    index_name <- dplyr::quo_name(index_expr)
+
+    # Get timeseries summary attributes
+    ts_summary <- data %>%
+        tibbletime::get_index_col() %>%
+        timetk::tk_get_timeseries_summary()
+
+    ts_nobs <- ts_summary$n.obs
+    ts_scale <- ts_summary$scale
+
+
+    if (is.numeric(period)) {
+        # 1. Numeric Periods
+        freq <- period
+
+    } else if (period != "auto") {
+        # 2. Text (e.g. period = "2 Weeks")
+        freq <- data %>%
+            tibbletime::collapse_by(period = period) %>%
+            dplyr::count(!!
index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + + } else { + # 3. period = "auto" + + periodicity_target <- time_scale_template() %>% + target_time_decomposition_scale(time_scale = ts_scale, target = "frequency", index_shift = 0) + + freq <- data %>% + tibbletime::collapse_by(period = periodicity_target) %>% + dplyr::count(!! index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + + # Insufficient observations: nobs-to-freq should be at least 3-1 + if (ts_nobs < 3*freq) { + periodicity_target <- time_scale_template() %>% + target_time_decomposition_scale(time_scale = ts_scale, target = "frequency", index_shift = 1) + + freq <- data %>% + tibbletime::collapse_by(period = periodicity_target) %>% + dplyr::count(!! index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + } + + if (ts_nobs < 3*freq) { + freq <- 1 + } + } + + if (message) { + freq_string <- glue::glue("frequency = {freq} {ts_scale}s") + message(freq_string) + } + + return(freq) +} + +#' @export +#' @rdname time_frequency +time_trend <- function(data, period = "auto", template = time_scale_template(), message = TRUE) { + + # Checks + if (dplyr::is.grouped_df(data)) + stop(glue::glue("Cannot use on a grouped data frame. + Frequency should be performed on a single time series.")) + + # Setup inputs + data <- prep_tbl_time(data, message = F) + + index_expr <- data %>% tibbletime::get_index_quo() + index_name <- dplyr::quo_name(index_expr) + + # Get timeseries summary attributes + ts_summary <- data %>% + tibbletime::get_index_col() %>% + timetk::tk_get_timeseries_summary() + + ts_nobs <- ts_summary$n.obs + ts_scale <- ts_summary$scale + + + if (is.numeric(period)) { + # 1. Numeric Periods + trend <- period + + } else if (period != "auto") { + # 2. Text (e.g. period = "2 Weeks") + trend <- data %>% + tibbletime::collapse_by(period = period) %>% + dplyr::count(!! index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + + } else { + # 3. period = "auto" + + periodicity_target <- time_scale_template() %>% + target_time_decomposition_scale(time_scale = ts_scale, target = "trend", index_shift = 0) + + trend <- data %>% + tibbletime::collapse_by(period = periodicity_target) %>% + dplyr::count(!! index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + + # Insufficient observations: nobs-to-trend should be at least 2-1 + if (ts_nobs / trend < 2) { + periodicity_target <- time_scale_template() %>% + target_time_decomposition_scale(time_scale = ts_scale, target = "trend", index_shift = 1) + + trend <- data %>% + tibbletime::collapse_by(period = periodicity_target) %>% + dplyr::count(!! 
index_expr) %>% + dplyr::pull(n) %>% + stats::median(na.rm = T) + + trend <- ceiling(trend) + + } + + if (ts_nobs / trend < 2) { + trend <- ts_nobs + } + } + + if (message) { + trend_string <- glue::glue("trend = {trend} {ts_scale}s") + message(trend_string) + } + + return(trend) +} + +#' @export +#' @rdname time_frequency +time_scale_template <- function() { + + tibble::tribble( + ~ "time_scale", ~ "frequency", ~ "trend", + "second", "1 hour", "12 hours", + "minute", "1 day", "14 days", + "hour", "1 day", "1 month", + "day", "1 week", "3 months", + "week", "1 quarter", "1 year", + "month", "1 year", "5 years", + "quarter", "1 year", "10 years", + "year", "5 years", "30 years" + ) + +} + +# Helper function to get the time decomposition scale +target_time_decomposition_scale <- function(template, time_scale, target = c("frequency", "trend"), index_shift = 0) { + + target_expr <- rlang::sym(target[[1]]) + + idx <- which(template$time_scale == time_scale) - index_shift + key_value <- template$time_scale[idx] + + template %>% + dplyr::filter(time_scale == key_value) %>% + dplyr::pull(!! target_expr) +} + +# time_scale_template() %>% +# target_time_decomposition_scale(time_scale = "minute", target = "frequency") +# +# time_scale_template() %>% +# target_time_decomposition_scale(time_scale = "minute", target = "trend") + + + diff --git a/R/time_recompose.R b/R/time_recompose.R new file mode 100644 index 0000000..8adb3e7 --- /dev/null +++ b/R/time_recompose.R @@ -0,0 +1,142 @@ +#' Recompose bands separating anomalies from "normal" observations +#' +#' @param data A `tibble` or `tbl_time` object that has been +#' processed with `time_decompose()` and `anomalize()`. +#' +#' @return Returns a `tbl_time` object. +#' +#' @details +#' The `time_recompose()` function is used to generate bands around the +#' "normal" levels of observed values. The function uses the remainder_l1 +#' and remainder_l2 levels produced during the [anomalize()] step +#' and the season and trend/median_spans values from the [time_decompose()] +#' step to reconstruct bands around the normal values. +#' +#' The following key names are required: observed:remainder from the +#' `time_decompose()` step and remainder_l1 and remainder_l2 from the +#' `anomalize()` step. +#' +#' +#' @seealso +#' Time Series Anomaly Detection Functions (anomaly detection workflow): +#' - [time_decompose()] +#' - [anomalize()] +#' +#' @examples +#' +#' library(dplyr) +#' +#' data(tidyverse_cran_downloads) +#' +#' # Basic Usage +#' tidyverse_cran_downloads %>% +#' time_decompose(count, method = "stl") %>% +#' anomalize(remainder, method = "iqr") %>% +#' time_recompose() +#' +#' +#' @export +time_recompose <- function(data) { + UseMethod("time_recompose", data) +} + +#' @export +time_recompose.default <- function(data) { + stop("Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) +} + +#' @export +time_recompose.tbl_time <- function(data) { + + # Checks + column_names <- names(data) + check_names <- c("observed", "remainder", "remainder_l1", "remainder_l2") %in% column_names + if (!all(check_names)) stop('Error in time_recompose(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. 
= FALSE) + + # Setup + target_expr <- dplyr::enquo(target) + method <- tolower(method[[1]]) + + l1 <- data %>% + dplyr::select(observed:remainder, contains("_l1")) %>% + dplyr::select(-c(observed, remainder)) %>% + apply(MARGIN = 1, FUN = sum) + + l2 <- data %>% + dplyr::select(observed:remainder, contains("_l2")) %>% + dplyr::select(-c(observed, remainder)) %>% + apply(MARGIN = 1, FUN = sum) + + ret <- data %>% + # add_column(!! paste0(quo_name(target_expr), "_l1") := l1) + tibble::add_column( + recomposed_l1 = l1, + recomposed_l2 = l2 + ) + + return(ret) + +} + +#' @export +time_recompose.tbl_df <- function(data) { + + # Prep + data <- prep_tbl_time(data, message = FALSE) + + # Send to time_recompose.tbl_time + time_recompose(data = data) + +} + + +#' @export +time_recompose.grouped_tbl_time <- function(data) { + + # Checks + column_names <- names(data) + check_names <- c("observed", "remainder", "remainder_l1", "remainder_l2") %in% column_names + if (!all(check_names)) stop('Error in time_recompose(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. = FALSE) + + # Setup + group_names <- dplyr::groups(data) + group_vars_expr <- rlang::syms(group_names) + + # Recompose l1 and l2 bands + l1 <- data %>% + dplyr::ungroup() %>% + dplyr::select(observed:remainder, contains("_l1")) %>% + dplyr::select(-c(observed, remainder)) %>% + apply(MARGIN = 1, FUN = sum) + + l2 <- data %>% + dplyr::ungroup() %>% + dplyr::select(observed:remainder, contains("_l2")) %>% + dplyr::select(-c(observed, remainder)) %>% + apply(MARGIN = 1, FUN = sum) + + ret <- data %>% + dplyr::ungroup() %>% + tibble::add_column( + recomposed_l1 = l1, + recomposed_l2 = l2 + ) %>% + dplyr::group_by(!!! group_vars_expr) + + return(ret) + +} + +#' @export +time_recompose.grouped_df <- function(data) { + + data <- prep_tbl_time(data, message = message) + + # Send to grouped_tbl_time + time_recompose(data = data) + +} + + + + diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..562951f --- /dev/null +++ b/R/utils.R @@ -0,0 +1,178 @@ +# UTILITY FUNCTIONS ---- + +# 1. Mapping Functions ----- + +grouped_mapper <- function(data, target, .f, ...) { + + data <- prep_tbl_time(data, message = F) + + target_expr <- dplyr::enquo(target) + + group_names <- dplyr::groups(data) + group_vars_expr <- rlang::syms(group_names) + + ret <- data %>% + tidyr::nest() %>% + dplyr::mutate(nested.col = purrr::map( + .x = data, + .f = .f, + target = !! target_expr, + ...) + ) %>% + dplyr::select(-data) %>% + tidyr::unnest() %>% + dplyr::group_by(!!! group_vars_expr) + + # if (merge) { + # ret <- merge_two_tibbles(tib1 = data, tib2 = ret, .f = .f) + # } + + return(ret) + +} + +# 2. Merging Time-Based Tibbles ----- + +merge_two_tibbles <- function(tib1, tib2, .f) { + + # Merge results + if (identical(nrow(tib1), nrow(tib2))) { + + # Arrange dates - Possibility of issue if dates not decending in tib1 + tib1 <- arrange_by_date(tib1) + + # Drop date column and groups + tib2 <- drop_date_and_group_cols(tib2) + + # Replace bad names + tib2 <- replace_bad_names(tib2, .f) + + # Replace duplicate names + tib2 <- replace_duplicate_colnames(tib1, tib2) + + ret <- dplyr::bind_cols(tib1, tib2) + + } else { + + stop("Could not join. 
+    }
+
+    return(ret)
+}
+
+replace_duplicate_colnames <- function(tib1, tib2) {
+
+    # Collect column names
+    name_list_tib1 <- colnames(tib1)
+    name_list_tib2 <- colnames(tib2)
+    name_list      <- c(name_list_tib1, name_list_tib2)
+
+    duplicates_exist <- detect_duplicates(name_list)
+
+    # Iteratively add .1, .2, .3 ... onto end of column names
+    if (duplicates_exist) {
+
+        i <- 1
+
+        while (duplicates_exist) {
+
+            dup_names_stripped <-
+                stringr::str_split(name_list[duplicated(name_list)],
+                                   pattern  = "\\.\\.",
+                                   simplify = FALSE) %>%
+                sapply(function(x) x[[1]])
+
+            name_list[duplicated(name_list)] <-
+                stringr::str_c(dup_names_stripped, "..", i)
+
+            i <- i + 1
+
+            duplicates_exist <- detect_duplicates(name_list)
+
+        }
+
+        name_list_tib2 <- name_list[(ncol(tib1) + 1):length(name_list)]
+
+        colnames(tib2) <- name_list_tib2
+    }
+
+    return(tib2)
+}
+
+detect_duplicates <- function(name_list) {
+
+    name_list %>%
+        duplicated() %>%
+        any()
+}
+
+# bad / restricted names are names that get selected unintentionally by OHLC functions
+replace_bad_names <- function(tib, fun_name) {
+
+    bad_names_regex <- "open|high|low|close|volume|adjusted|price"
+
+    name_list_tib       <- colnames(tib)
+    name_list_tib_lower <- stringr::str_to_lower(name_list_tib)
+
+    detect_bad_names <- stringr::str_detect(name_list_tib_lower, bad_names_regex)
+
+    if (any(detect_bad_names)) {
+
+        len <- length(name_list_tib_lower[detect_bad_names])
+        name_list_tib[detect_bad_names] <- rep(fun_name, length.out = len)
+
+    }
+
+    colnames(tib) <- name_list_tib
+
+    return(tib)
+}
+
+arrange_by_date <- function(tib) {
+
+    if (dplyr::is.grouped_df(tib)) {
+
+        group_names <- dplyr::groups(tib)
+
+        arrange_date <- function(tib) {
+            date_col <- timetk::tk_get_timeseries_variables(tib)[[1]]
+            tib %>%
+                dplyr::arrange_(date_col)
+        }
+
+        tib <- tib %>%
+            tidyr::nest() %>%
+            dplyr::mutate(nested.col =
+                              purrr::map(data, arrange_date)
+            ) %>%
+            dplyr::select(-data) %>%
+            tidyr::unnest() %>%
+            dplyr::group_by_(.dots = group_names)
+
+
+    } else {
+
+        tib <- tib %>%
+            dplyr::arrange_(timetk::tk_get_timeseries_variables(tib)[[1]])
+
+    }
+
+    return(tib)
+}
+
+drop_date_and_group_cols <- function(tib) {
+
+    date_col   <- timetk::tk_get_timeseries_variables(tib)[[1]]
+    group_cols <- dplyr::groups(tib) %>%
+        as.character()
+    cols_to_remove         <- c(date_col, group_cols)
+    tib_names              <- colnames(tib)
+    cols_to_remove_logical <- tib_names %in% cols_to_remove
+    tib_names_without_date_or_group <- tib_names[!cols_to_remove_logical]
+
+    tib <- tib %>%
+        dplyr::ungroup() %>%
+        dplyr::select_(.dots = as.list(tib_names_without_date_or_group))
+
+    return(tib)
+}
diff --git a/README.Rmd b/README.Rmd
new file mode 100644
index 0000000..556db55
--- /dev/null
+++ b/README.Rmd
@@ -0,0 +1,49 @@
+---
+output: github_document
+---
+
+
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "man/figures/README-",
+  out.width = "100%"
+)
+```
+# anomalize
+
+The goal of anomalize is to ...
+
+## Installation
+
+You can install the released version of anomalize from [CRAN](https://CRAN.R-project.org) with:
+
+``` r
+install.packages("anomalize")
+```
+
+## Example
+
+This is a basic example which shows you how to solve a common problem:
+
+```{r example}
+## basic example code
+```
+
+What is special about using `README.Rmd` instead of just `README.md`? You can include R chunks like so:
+
+```{r cars}
+summary(cars)
+```
+
+You'll still need to render `README.Rmd` regularly, to keep `README.md` up-to-date.
+ +You can also embed plots, for example: + +```{r pressure, echo = FALSE} +plot(pressure) +``` + +In that case, don't forget to commit and push the resulting figure files, so they display on GitHub! diff --git a/README.md b/README.md new file mode 100644 index 0000000..7d798fe --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ + + + +# anomalize + +The goal of anomalize is to … + +## Installation + +You can install the released version of anomalize from +[CRAN](https://CRAN.R-project.org) with: + +``` r +install.packages("anomalize") +``` + +## Example + +This is a basic example which shows you how to solve a common problem: + +``` r +## basic example code +``` + +What is special about using `README.Rmd` instead of just `README.md`? +You can include R chunks like so: + +``` r +summary(cars) +#> speed dist +#> Min. : 4.0 Min. : 2.00 +#> 1st Qu.:12.0 1st Qu.: 26.00 +#> Median :15.0 Median : 36.00 +#> Mean :15.4 Mean : 42.98 +#> 3rd Qu.:19.0 3rd Qu.: 56.00 +#> Max. :25.0 Max. :120.00 +``` + +You’ll still need to render `README.Rmd` regularly, to keep `README.md` +up-to-date. + +You can also embed plots, for example: + + + +In that case, don’t forget to commit and push the resulting figure +files, so they display on GitHub\! diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..e69de29 diff --git a/anomalize.Rproj b/anomalize.Rproj new file mode 100644 index 0000000..a648ce1 --- /dev/null +++ b/anomalize.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/cran-comments.md b/cran-comments.md new file mode 100644 index 0000000..a95d65e --- /dev/null +++ b/cran-comments.md @@ -0,0 +1,10 @@ +## Test environments +* local OS X install, R 3.4.4 +* ubuntu 14.04 (on travis-ci), R 3.4.4 +* win-builder (devel and release) + +## R CMD check results + +0 errors | 0 warnings | 1 note + +* This is a new release. 
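The `README.Rmd` example chunk above is still the template placeholder. As a sketch of what it could eventually show — assuming only the API documented in this commit (`time_decompose()`, `anomalize()`, `time_recompose()`) and the bundled `tidyverse_cran_downloads` dataset — a minimal workflow example might look like:

```r
# Hypothetical README example (not part of this commit): flag anomalous
# daily download counts for a single package using the three-step workflow.
library(dplyr)
library(anomalize)

tidyverse_cran_downloads %>%
  filter(package == "lubridate") %>%
  ungroup() %>%
  time_decompose(count, method = "stl") %>%  # observed = season + trend + remainder
  anomalize(remainder, method = "iqr") %>%   # flag outliers in the remainder
  time_recompose() %>%                       # build recomposed_l1 / recomposed_l2 bands
  filter(anomaly == "Yes")                   # keep only the anomalous days
```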
diff --git a/data-raw/tidyverse_cran_downloads.R b/data-raw/tidyverse_cran_downloads.R
new file mode 100644
index 0000000..e22f1a5
--- /dev/null
+++ b/data-raw/tidyverse_cran_downloads.R
@@ -0,0 +1,17 @@
+library(dplyr)
+library(tibbletime)
+library(cranlogs)
+
+pkgs <- c(
+    "tidyr", "lubridate", "dplyr",
+    "broom", "tidyquant", "tidytext",
+    "ggplot2", "purrr", "glue",
+    "stringr", "forcats", "knitr",
+    "readr", "tibble", "tidyverse"
+)
+
+tidyverse_cran_downloads <- cran_downloads(pkgs, from = "2017-01-01", to = "2018-03-01") %>%
+    group_by(package) %>%
+    as_tbl_time(date)
+
+tidyverse_cran_downloads
diff --git a/data/tidyverse_cran_downloads.rda b/data/tidyverse_cran_downloads.rda
new file mode 100644
index 0000000..2bc683f
Binary files /dev/null and b/data/tidyverse_cran_downloads.rda differ
diff --git a/man/anomalize.Rd b/man/anomalize.Rd
new file mode 100644
index 0000000..e8de682
--- /dev/null
+++ b/man/anomalize.Rd
@@ -0,0 +1,98 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/anomalize.R
+\name{anomalize}
+\alias{anomalize}
+\title{Detect anomalies using the tidyverse}
+\usage{
+anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05,
+  max_anoms = 0.2, verbose = FALSE)
+}
+\arguments{
+\item{data}{A \code{tibble} or \code{tbl_time} object.}
+
+\item{target}{A column to apply the function to}
+
+\item{method}{The anomaly detection method. One of \code{"iqr"} or \code{"gesd"}.
+The IQR method is faster at the expense of possibly not being quite as accurate.
+The GESD method has the best properties for outlier detection, but is loop-based
+and therefore a bit slower.}
+
+\item{alpha}{Controls the width of the "normal" range.
+Lower values are more conservative while higher values are less prone
+to incorrectly classifying "normal" observations.}
+
+\item{max_anoms}{The maximum percent of anomalies permitted to be identified.}
+
+\item{verbose}{A boolean. If \code{TRUE}, will return a list containing useful information
+about the anomalies. If \code{FALSE}, just returns the data expanded with the anomalies and
+the lower (l1) and upper (l2) bounds.}
+}
+\value{
+Returns a \code{tibble} / \code{tbl_time} object or list depending on the value of \code{verbose}.
+}
+\description{
+Detect anomalies using the tidyverse
+}
+\details{
+The \code{anomalize()} function is used to detect outliers in a distribution
+with no trend or seasonality present. The return has three columns:
+"remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for
+anomalies), and "anomaly" (Yes/No).
+
+Use \code{\link[=time_decompose]{time_decompose()}} to decompose a time series prior to performing
+anomaly detection with \code{anomalize()}. Typically, \code{anomalize()} is
+performed on the "remainder" of the time series decomposition.
+
+For non-time series data (data without trend), the \code{anomalize()} function can
+be used without time series decomposition.
+
+The \code{anomalize()} function uses two methods for outlier detection,
+each with benefits.
+
+\strong{IQR}:
+
+The IQR Method uses an interquartile range of 25\% and 75\% to establish a baseline distribution around
+the median. With the default \code{alpha = 0.05}, the limits are established by expanding
+the 25/75 baseline by an IQR Factor of 3 (3X). The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05).
+To increase the IQR Factor controlling the limits, decrease the alpha, which makes
+it more difficult to be an outlier. Increase alpha to make it easier to be an outlier.
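To make the IQR Factor arithmetic above concrete, here is a standalone sketch of the documented limit math; it is not the package's exported `iqr()` implementation (which also ranks observations and enforces `max_anoms`):

```r
# Bands at the 25/75 quartiles expanded by an IQR Factor of 0.15 / alpha
# (3X when alpha = 0.05), per the documentation above.
iqr_limits <- function(x, alpha = 0.05) {
  q    <- stats::quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  fctr <- 0.15 / alpha
  c(limit_lower = q[[1]] - fctr * (q[[2]] - q[[1]]),
    limit_upper = q[[2]] + fctr * (q[[2]] - q[[1]]))
}

set.seed(100)
iqr_limits(rnorm(100))  # lowering alpha widens the bands
```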
+
+\strong{GESD}:
+
+The GESD Method (Generalized Extreme Studentized Deviate Test) progressively
+eliminates outliers using a Student's t-test comparing the test statistic to a critical value.
+Each time an outlier is removed, the test statistic is updated. Once the test statistic
+drops below the critical value, all outliers are considered removed. Because this method
+involves continuous updating via a loop, it is slower than the IQR method. However, it
+tends to be the best performing method for outlier removal.
+}
+\examples{
+
+library(dplyr)
+
+data(tidyverse_cran_downloads)
+
+tidyverse_cran_downloads \%>\%
+    time_decompose(count, method = "stl") \%>\%
+    anomalize(remainder, method = "iqr")
+
+}
+\references{
+\itemize{
+\item The IQR method is used in \href{https://github.com/robjhyndman/forecast/blob/master/R/clean.R}{forecast::tsoutliers()}
+\item The GESD method is used in Twitter's \href{https://github.com/twitter/AnomalyDetection}{AnomalyDetection} package and is also available as a function in \href{https://github.com/raunakms/GESD/blob/master/runGESD.R}{@raunakms's GESD method}
+}
+}
+\seealso{
+Anomaly Detection Methods (Powers \code{anomalize})
+\itemize{
+\item \code{\link[=iqr]{iqr()}}
+\item \code{\link[=gesd]{gesd()}}
+}
+
+Time Series Anomaly Detection Functions (anomaly detection workflow):
+\itemize{
+\item \code{\link[=time_decompose]{time_decompose()}}
+\item \code{\link[=time_recompose]{time_recompose()}}
+}
+}
diff --git a/man/anomalize_methods.Rd b/man/anomalize_methods.Rd
new file mode 100644
index 0000000..75752ab
--- /dev/null
+++ b/man/anomalize_methods.Rd
@@ -0,0 +1,54 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/anomalize_methods.R
+\name{anomalize_methods}
+\alias{anomalize_methods}
+\alias{iqr}
+\alias{gesd}
+\title{Methods that power anomalize()}
+\usage{
+iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
+
+gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
+}
+\arguments{
+\item{x}{A vector of numeric data.}
+
+\item{alpha}{Controls the width of the "normal" range.
+Lower values are more conservative while higher values are less prone
+to incorrectly classifying "normal" observations.}
+
+\item{max_anoms}{The maximum percent of anomalies permitted to be identified.}
+
+\item{verbose}{A boolean. If \code{TRUE}, will return a list containing useful information
+about the anomalies. If \code{FALSE}, just returns a vector of "Yes" / "No" values.}
+}
+\value{
+Returns a character vector or list depending on the value of \code{verbose}.
+} +\description{ +Methods that power anomalize() +} +\examples{ + +set.seed(100) +x <- rnorm(100) +idx_outliers <- sample(100, size = 5) +x[idx_outliers] <- x[idx_outliers] + 10 + +iqr(x, alpha = 0.05, max_anoms = 0.2) +iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) + +gesd(x, alpha = 0.05, max_anoms = 0.2) +gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) + + +} +\references{ +\itemize{ +\item The IQR method is used in \href{https://github.com/robjhyndman/forecast/blob/master/R/clean.R}{forecast::tsoutliers()} +\item The GESD method is used in Twitter's \href{https://github.com/twitter/AnomalyDetection}{AnomalyDetection} package and is also available as a function in \href{https://github.com/raunakms/GESD/blob/master/runGESD.R}{@raunakms's GESD method} +} +} +\seealso{ +\code{\link[=anomalize]{anomalize()}} +} diff --git a/man/anomalize_package.Rd b/man/anomalize_package.Rd new file mode 100644 index 0000000..e39c015 --- /dev/null +++ b/man/anomalize_package.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/anomalize-package.R +\docType{package} +\name{anomalize_package} +\alias{anomalize_package} +\alias{anomalize_package-package} +\title{anomalize: Tidy anomaly detection} +\description{ +anomalize: Tidy anomaly detection +} +\details{ +The \code{anomalize} package enables a tidy workflow for detecting anomalies in data. +The main functions are \code{time_decompose()}, \code{anomalize()}, and \code{time_recompose()}. +When combined, it's quite simple to decompose time series, detect anomalies, +and create bands separating the "normal" data from the anomalous data. + +To learn more about \code{anomalize}, start with the vignettes: +\code{browseVignettes(package = "anomalize")} +} diff --git a/man/decompose_methods.Rd b/man/decompose_methods.Rd new file mode 100644 index 0000000..b0ca5f0 --- /dev/null +++ b/man/decompose_methods.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time_decompose_methods.R +\name{decompose_methods} +\alias{decompose_methods} +\alias{decompose_twitter} +\alias{decompose_stl} +\alias{decompose_multiplicative} +\title{Methods that power time_decompose()} +\usage{ +decompose_twitter(data, target, frequency = "auto", median_spans = "auto", + message = TRUE) + +decompose_stl(data, target, frequency = "auto", message = TRUE) + +decompose_multiplicative(data, target, frequency = "auto", message = TRUE) +} +\arguments{ +\item{data}{A \code{tibble} or \code{tbl_time} object.} + +\item{target}{A column to apply the function to} + +\item{frequency}{Controls the seasonal adjustment (removal of seasonality). +Input can be either "auto", a time-based definition (e.g. "2 weeks"), +or a numeric number of observations per frequency (e.g. 10). +Refer to \code{\link[=time_frequency]{time_frequency()}}.} + +\item{median_spans}{Applies to the "twitter" method only. +The median spans are used to remove the trend and center the remainder.} + +\item{message}{A boolean. If \code{TRUE}, will output information related to \code{tbl_time} conversions, frequencies, +and median spans (if applicable).} +} +\value{ +A \code{tbl_time} object containing the time series decomposition. 
+}
+\description{
+Methods that power time_decompose()
+}
+\examples{
+
+library(dplyr)
+
+tidyverse_cran_downloads \%>\%
+    ungroup() \%>\%
+    filter(package == "tidyquant") \%>\%
+    decompose_stl(count)
+
+
+}
+\references{
+\itemize{
+\item The "twitter" method is used in Twitter's \href{https://github.com/twitter/AnomalyDetection}{AnomalyDetection package}
+}
+}
+\seealso{
+\code{\link[=time_decompose]{time_decompose()}}
+}
diff --git a/man/figures/README-pressure-1.png b/man/figures/README-pressure-1.png
new file mode 100644
index 0000000..c092055
Binary files /dev/null and b/man/figures/README-pressure-1.png differ
diff --git a/man/plot_anomaly_decomposition.Rd b/man/plot_anomaly_decomposition.Rd
new file mode 100644
index 0000000..3aae88d
--- /dev/null
+++ b/man/plot_anomaly_decomposition.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_anomaly_decomposition.R
+\name{plot_anomaly_decomposition}
+\alias{plot_anomaly_decomposition}
+\title{Visualize the time series decomposition with anomalies shown}
+\usage{
+plot_anomaly_decomposition(data, ncol = 1, color_no = "#2c3e50",
+  color_yes = "#e31a1c", alpha_dots = 1, alpha_circles = 1,
+  size_dots = 1.5, size_circles = 4, strip.position = "right")
+}
+\arguments{
+\item{data}{A \code{tibble} or \code{tbl_time} object.}
+
+\item{ncol}{Number of columns to display. Set to 1 for single column by default.}
+
+\item{color_no}{Color for non-anomalous data.}
+
+\item{color_yes}{Color for anomalous data.}
+
+\item{alpha_dots}{Controls the transparency of the dots. Reduce when too many dots on the screen.}
+
+\item{alpha_circles}{Controls the transparency of the circles that identify anomalies.}
+
+\item{size_dots}{Controls the size of the dots.}
+
+\item{size_circles}{Controls the size of the circles that identify anomalies.}
+
+\item{strip.position}{Controls the placement of the strip that identifies the time series decomposition components.}
+}
+\value{
+Returns a \code{ggplot2} object.
+}
+\description{
+Visualize the time series decomposition with anomalies shown
+}
+\details{
+The first step in reviewing the anomaly detection process is to evaluate
+a single time series to observe how the algorithm is selecting anomalies.
+The \code{plot_anomaly_decomposition()} function is used to assess
+whether the method is detecting anomalies correctly and
+whether parameters such as decomposition method, anomalize method,
+alpha, frequency, and so on should be adjusted.
+}
+\examples{
+
+library(dplyr)
+library(ggplot2)
+
+data(tidyverse_cran_downloads)
+
+tidyverse_cran_downloads \%>\%
+    filter(package == "tidyquant") \%>\%
+    ungroup() \%>\%
+    time_decompose(count, method = "stl") \%>\%
+    anomalize(remainder, method = "iqr") \%>\%
+    plot_anomaly_decomposition()
+
+}
+\seealso{
+\code{\link[=plot_anomalies]{plot_anomalies()}}
+}
diff --git a/man/prep_tbl_time.Rd b/man/prep_tbl_time.Rd
new file mode 100644
index 0000000..666c2e1
--- /dev/null
+++ b/man/prep_tbl_time.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prep_tbl_time.R
+\name{prep_tbl_time}
+\alias{prep_tbl_time}
+\title{Automatically create tibbletime objects from tibbles}
+\usage{
+prep_tbl_time(data, message = FALSE)
+}
+\arguments{
+\item{data}{A \code{tibble}.}
+
+\item{message}{A boolean.
+If \code{TRUE}, returns a message indicating any
+conversion details important to know during the conversion to \code{tbl_time} class.}
+}
+\value{
+Returns a \code{tibbletime} object of class \code{tbl_time}.
+}
+\description{
+Automatically create tibbletime objects from tibbles
+}
+\details{
+Detects a date or datetime index column and automatically
+converts the data to a \code{tbl_time} object.
+}
+\examples{
+
+library(dplyr)
+library(tibbletime)
+
+data_tbl <- tibble(
+    date  = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10),
+    value = rnorm(10)
+)
+
+prep_tbl_time(data_tbl)
+
+}
+\seealso{
+\code{\link[tibbletime:as_tbl_time]{tibbletime::as_tbl_time()}}
+}
diff --git a/man/tidyverse_cran_downloads.Rd b/man/tidyverse_cran_downloads.Rd
new file mode 100644
index 0000000..4242468
--- /dev/null
+++ b/man/tidyverse_cran_downloads.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/tidyverse_cran_downloads.R
+\docType{data}
+\name{tidyverse_cran_downloads}
+\alias{tidyverse_cran_downloads}
+\title{Downloads of various "tidyverse" packages from CRAN}
+\format{A grouped \code{tbl_time} object with 6,375 rows and 3 variables:
+\describe{
+\item{date}{Date of the daily observation}
+\item{count}{Number of downloads that day}
+\item{package}{The package corresponding to the daily download number}
+}}
+\source{
+The package downloads come from CRAN by way of the \code{cranlogs} package.
+}
+\usage{
+tidyverse_cran_downloads
+}
+\description{
+A dataset containing the daily download counts from 2017-01-01 to 2018-03-01
+for the following tidyverse packages:
+\itemize{
+\item \code{tidyr}
+\item \code{lubridate}
+\item \code{dplyr}
+\item \code{broom}
+\item \code{tidyquant}
+\item \code{tidytext}
+\item \code{ggplot2}
+\item \code{purrr}
+\item \code{glue}
+\item \code{stringr}
+\item \code{forcats}
+\item \code{knitr}
+\item \code{readr}
+\item \code{tibble}
+\item \code{tidyverse}
+}
+}
+\keyword{datasets}
diff --git a/man/time_apply.Rd b/man/time_apply.Rd
new file mode 100644
index 0000000..a0315c5
--- /dev/null
+++ b/man/time_apply.Rd
@@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time_apply.R
+\name{time_apply}
+\alias{time_apply}
+\title{Apply a function to a time series by period}
+\usage{
+time_apply(data, target, period, .fun, ..., start_date = NULL, side = "end",
+  clean = FALSE, message = TRUE)
+}
+\arguments{
+\item{data}{A \code{tibble} with a date or datetime index.}
+
+\item{target}{A column to apply the function to}
+
+\item{period}{A time-based definition (e.g. "2 weeks"),
+or a numeric number of observations per frequency (e.g. 10).
+See \code{\link[tibbletime:collapse_by]{tibbletime::collapse_by()}} for period notation.}
+
+\item{.fun}{A function to apply (e.g. \code{median})}
+
+\item{...}{Additional parameters passed to the function, \code{.fun}}
+
+\item{start_date}{Optional argument used to
+specify the start date for the
+first group. The default is to start at the closest period boundary
+below the minimum date in the supplied index.}
+
+\item{side}{Whether to return the date at the beginning or the end of
+the new period. By default, the "end" of the period.
+Use "start" to change to the start of the period.}
+
+\item{clean}{Whether or not to round the collapsed index up / down to the next
+period boundary. The decision to round up / down is controlled by the side
+argument.}
+
+\item{message}{A boolean.
+If \code{message = TRUE}, the frequency used is output
+along with the units in the scale of the data.}
+}
+\value{
+Returns a \code{tibbletime} object of class \code{tbl_time}.
+}
+\description{
+Apply a function to a time series by period
+}
+\details{
+Applies a function to a time series over a set time-based period. This is useful
+in circumstances where you want to compare the observation values to aggregated
+values such as \code{mean()} or \code{median()} computed over a set time-based period.
+The returned output extends the length of the data frame so the differences can
+easily be computed.
+}
+\examples{
+
+library(dplyr)
+
+data(tidyverse_cran_downloads)
+
+# Basic Usage
+tidyverse_cran_downloads \%>\%
+    time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE)
+
+}
diff --git a/man/time_decompose.Rd b/man/time_decompose.Rd
new file mode 100644
index 0000000..031c06f
--- /dev/null
+++ b/man/time_decompose.Rd
@@ -0,0 +1,115 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time_decompose.R
+\name{time_decompose}
+\alias{time_decompose}
+\title{Decompose a time series in preparation for anomaly detection}
+\usage{
+time_decompose(data, target, method = c("stl", "twitter", "multiplicative"),
+  frequency = "auto", ..., merge = FALSE, message = TRUE)
+}
+\arguments{
+\item{data}{A \code{tibble} or \code{tbl_time} object.}
+
+\item{target}{A column to apply the function to}
+
+\item{method}{The time series decomposition method. One of \code{"stl"}, \code{"twitter"}, or
+\code{"multiplicative"}. The STL method uses seasonal decomposition (see \code{\link[=decompose_stl]{decompose_stl()}}).
+The Twitter method uses \code{median_spans} to remove the trend (see \code{\link[=decompose_twitter]{decompose_twitter()}}).
+The Multiplicative method uses multiplicative decomposition (see \code{\link[=decompose_multiplicative]{decompose_multiplicative()}}).}
+
+\item{frequency}{Controls the seasonal adjustment (removal of seasonality).
+Input can be either "auto", a time-based definition (e.g. "2 weeks"),
+or a numeric number of observations per frequency (e.g. 10).
+Refer to \code{\link[=time_frequency]{time_frequency()}}.}
+
+\item{...}{Additional parameters passed to the underlying method functions.}
+
+\item{merge}{A boolean. \code{FALSE} by default. If \code{TRUE}, will append results to the original data.}
+
+\item{message}{A boolean. If \code{TRUE}, will output information related to \code{tbl_time} conversions, frequencies,
+and median spans (if applicable).}
+}
+\value{
+Returns a \code{tbl_time} object.
+}
+\description{
+Decompose a time series in preparation for anomaly detection
+}
+\details{
+The \code{time_decompose()} function generates a time series decomposition on
+\code{tbl_time} objects. The function is "tidy" in the sense that it works
+on data frames. It is designed to work with time-based data, and as such
+must have a column that contains date or datetime information. The function
+also works with grouped data. The function implements several methods
+of time series decomposition, each with benefits.
+
+\strong{STL}:
+
+The STL method (\code{method = "stl"}) implements time series decomposition using
+the underlying \code{\link[=decompose_stl]{decompose_stl()}} function. If you are familiar with \code{\link[stats:stl]{stats::stl()}},
+the function is a "tidy" version that is designed to work with \code{tbl_time} objects.
+The decomposition separates the "season" and "trend" components from
+the "observed" values, leaving the "remainder" for anomaly detection.
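As a quick aside on the additive structure described above, the relationship can be verified directly on the output — a sketch assuming the `stl` output columns are named `observed`, `season`, `trend`, and `remainder` as documented:

```r
# Verify the additive STL identity: observed ~= season + trend + remainder.
library(dplyr)
library(anomalize)

tidyverse_cran_downloads %>%
  filter(package == "tidyquant") %>%
  ungroup() %>%
  time_decompose(count, method = "stl") %>%
  mutate(check = observed - (season + trend + remainder)) %>%
  summarise(max_abs_error = max(abs(check)))  # ~0, up to floating point
```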
+The main parameter that controls the seasonal adjustment is \code{frequency}.
+Setting \code{frequency = "auto"} lets the \code{\link[=time_frequency]{time_frequency()}} function
+automatically determine the frequency based on the scale of the time series.
+
+\strong{Twitter}:
+
+The Twitter method (\code{method = "twitter"}) implements time series decomposition using
+the methodology from the Twitter \href{https://github.com/twitter/AnomalyDetection}{AnomalyDetection} package.
+The decomposition separates the "seasonal" component and then removes
+the median data, which is a different approach than the STL method for removing
+the trend. This approach works very well for low-growth + high seasonality data.
+STL may be a better approach when trend is a large factor.
+The user can control two parameters: \code{frequency} and \code{median_spans}.
+The \code{frequency} parameter adjusts the "season" component that is removed
+from the "observed" values. The \code{median_spans} parameter adjusts the
+number of median spans that are used. The user may supply both \code{frequency}
+and \code{median_spans} as time-based durations (e.g. "6 weeks") or numeric values
+(e.g. 180) or "auto", which predetermines the frequency and/or median spans
+based on the scale of the time series.
+
+\strong{Multiplicative}:
+
+The Multiplicative method (\code{method = "multiplicative"}) time series decomposition
+uses the \code{\link[stats:decompose]{stats::decompose()}} function with \code{type = "multiplicative"}. This
+method is useful in circumstances where variance is non-constant and typically
+growing in a multiplicative fashion. The parameters are the same as the STL method.
+Alternatively, users may wish to try a transformation (e.g. \code{log()} or \code{sqrt()}) in combination
+with the STL method to get near-constant variance.
+}
+\examples{
+
+library(dplyr)
+
+data(tidyverse_cran_downloads)
+
+# Basic Usage
+tidyverse_cran_downloads \%>\%
+    time_decompose(count, method = "stl")
+
+# twitter + median_spans
+tidyverse_cran_downloads \%>\%
+    time_decompose(count,
+                   method       = "twitter",
+                   frequency    = "1 week",
+                   median_spans = "3 months",
+                   merge        = TRUE,
+                   message      = FALSE)
+
+}
+\seealso{
+Decomposition Methods (Powers \code{time_decompose})
+\itemize{
+\item \code{\link[=decompose_stl]{decompose_stl()}}
+\item \code{\link[=decompose_twitter]{decompose_twitter()}}
+\item \code{\link[=decompose_multiplicative]{decompose_multiplicative()}}
+}
+
+Time Series Anomaly Detection Functions (anomaly detection workflow):
+\itemize{
+\item \code{\link[=anomalize]{anomalize()}}
+\item \code{\link[=time_recompose]{time_recompose()}}
+}
+}
diff --git a/man/time_frequency.Rd b/man/time_frequency.Rd
new file mode 100644
index 0000000..91ca714
--- /dev/null
+++ b/man/time_frequency.Rd
@@ -0,0 +1,106 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/time_frequency.R
+\name{time_frequency}
+\alias{time_frequency}
+\alias{time_trend}
+\alias{time_scale_template}
+\title{Generate a time series frequency from a periodicity}
+\usage{
+time_frequency(data, period = "auto", template = time_scale_template(),
+  message = TRUE)
+
+time_trend(data, period = "auto", template = time_scale_template(),
+  message = TRUE)
+
+time_scale_template()
+}
+\arguments{
+\item{data}{A \code{tibble} with a date or datetime index.}
+
+\item{period}{Either "auto", a time-based definition (e.g. "2 weeks"),
+or a numeric number of observations per frequency (e.g. 10).
+See \code{\link[tibbletime:collapse_by]{tibbletime::collapse_by()}} for period notation.}
+
+\item{template}{A template that converts the scale of the data to
+the expected frequency and trend spans.}
+
+\item{message}{A boolean. If \code{message = TRUE}, the frequency used is output
+along with the units in the scale of the data.}
+}
+\value{
+Returns a scalar numeric value indicating the number of observations in the frequency or trend span.
+}
+\description{
+Generate a time series frequency from a periodicity
+}
+\details{
+A frequency is loosely defined as the number of observations that comprise a cycle
+in a data set. The trend is loosely defined as the time span that can
+be aggregated across to visualize the central tendency of the data.
+It's often easiest to think of frequency and trend in terms of the time-based units
+that the data is already in. \strong{This is what \code{time_frequency()} and \code{time_trend()}
+enable: using time-based periods to define the frequency or trend.}
+
+\strong{Frequency}:
+
+As an example, a weekly cycle is often 5 days (for working
+days) or 7 days (for calendar days). Rather than specify a frequency of 5 or 7,
+the user can specify \code{period = "1 week"}, and
+\code{time_frequency()} will detect the scale of the time series and return 5 or 7
+based on the actual data.
+
+The \code{period} argument has three basic options for returning a frequency.
+Options include:
+\itemize{
+\item \code{"auto"}: A target frequency is determined using a pre-defined template (see \code{template} below).
+\item \code{time-based duration}: (e.g. "1 week" or "2 quarters" per cycle)
+\item \code{numeric number of observations}: (e.g. 5 for 5 observations per cycle)
+}
+
+The \code{template} argument is only used when \code{period = "auto"}. The template is a tibble
+of three features: \code{time_scale}, \code{frequency}, and \code{trend}. The algorithm will inspect
+the scale of the time series and select the frequency that best matches the scale and
+the number of observations per target frequency. The predefined template is stored in
+the function \code{time_scale_template()}. However, the user can supply his or her own
+template by changing the values for frequency and trend in the data frame.
+
+\strong{Trend}:
+
+As an example, the trend of daily data is often best aggregated by evaluating
+the moving average over a quarter or a month span. Rather than specify the number
+of days in a quarter or month, the user can specify "1 quarter" or "1 month",
+and the \code{time_trend()} function will return the correct number of observations
+per trend cycle. In addition, there is an option, \code{period = "auto"}, to
+auto-detect an appropriate trend span depending on the data. The \code{template}
+is used to define the appropriate trend span.
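As a concrete illustration of the template mechanism described above — a sketch using the `time_scale_template()` and `template` argument documented in this commit; the exact observation count returned depends on the data's calendar:

```r
# Override the default "day" trend span ("3 months") with "1 month" and let
# time_trend() resolve it against the daily scale of the data.
library(dplyr)
library(anomalize)

custom_template <- time_scale_template() %>%
  mutate(trend = ifelse(time_scale == "day", "1 month", trend))

tidyverse_cran_downloads %>%
  filter(package == "tidyquant") %>%
  ungroup() %>%
  time_trend(period = "auto", template = custom_template)  # ~30 observations
```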
+} +\examples{ + +library(dplyr) + +data(tidyverse_cran_downloads) + +#### FREQUENCY DETECTION #### + +# period = "auto" +tidyverse_cran_downloads \%>\% + filter(package == "tidyquant") \%>\% + ungroup() \%>\% + time_frequency(period = "auto", template = time_scale_template()) + +time_scale_template() + +# period = "1 month" +tidyverse_cran_downloads \%>\% + filter(package == "tidyquant") \%>\% + ungroup() \%>\% + time_frequency(period = "1 month") + +#### TREND DETECTION #### + +tidyverse_cran_downloads \%>\% + filter(package == "tidyquant") \%>\% + ungroup() \%>\% + time_trend(period = "auto") +} diff --git a/man/time_recompose.Rd b/man/time_recompose.Rd new file mode 100644 index 0000000..bb800dc --- /dev/null +++ b/man/time_recompose.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/time_recompose.R +\name{time_recompose} +\alias{time_recompose} +\title{Recompose bands separating anomalies from "normal" observations} +\usage{ +time_recompose(data) +} +\arguments{ +\item{data}{A \code{tibble} or \code{tbl_time} object that has been +processed with \code{time_decompose()} and \code{anomalize()}.} +} +\value{ +Returns a \code{tbl_time} object. +} +\description{ +Recompose bands separating anomalies from "normal" observations +} +\details{ +The \code{time_recompose()} function is used to generate bands around the +"normal" levels of observed values. The function uses the remainder_l1 +and remainder_l2 levels produced during the \code{\link[=anomalize]{anomalize()}} step +and the season and trend/median_spans values from the \code{\link[=time_decompose]{time_decompose()}} +step to reconstruct bands around the normal values. + +The following key names are required: observed:remainder from the +\code{time_decompose()} step and remainder_l1 and remainder_l2 from the +\code{anomalize()} step. 
+} +\examples{ + +library(dplyr) + +data(tidyverse_cran_downloads) + +# Basic Usage +tidyverse_cran_downloads \%>\% + time_decompose(count, method = "stl") \%>\% + anomalize(remainder, method = "iqr") \%>\% + time_recompose() + + +} +\seealso{ +Time Series Anomaly Detection Functions (anomaly detection workflow): +\itemize{ +\item \code{\link[=time_decompose]{time_decompose()}} +\item \code{\link[=anomalize]{anomalize()}} +} +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..5a18383 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(anomalize) + +test_check("anomalize") diff --git a/tests/testthat/test-anomalize.R b/tests/testthat/test-anomalize.R new file mode 100644 index 0000000..13a3b36 --- /dev/null +++ b/tests/testthat/test-anomalize.R @@ -0,0 +1,61 @@ +context("test-anomalize.R") + +# Setup + +library(tidyverse) + +tq_dloads <- tidyverse_cran_downloads %>% + dplyr::ungroup() %>% + dplyr::filter(package == "tidyquant") + +# Tests + +test_that("iqr_tbl_df works", { + + iqr_tbl_df <- tq_dloads %>% + anomalize(count, method = "iqr") + + expect_equal(nrow(iqr_tbl_df), 425) + expect_equal(ncol(iqr_tbl_df), 6) + +}) + +test_that("gesd_tbl_df works", { + + gesd_tbl_df <- tq_dloads %>% + anomalize(count, method = "gesd") + + expect_equal(nrow(gesd_tbl_df), 425) + expect_equal(ncol(gesd_tbl_df), 6) + +}) + +test_that("iqr_grouped_df works", { + + iqr_grouped_df <- tidyverse_cran_downloads %>% + dplyr::ungroup() %>% + dplyr::filter(package %in% c("tidyquant", "tidytext")) %>% + dplyr::group_by(package) %>% + anomalize(count, method = "iqr") + + expect_equal(nrow(iqr_grouped_df), 850) + expect_equal(ncol(iqr_grouped_df), 6) + +}) + +test_that("gesd_grouped_df works", { + + gesd_grouped_df <- tidyverse_cran_downloads %>% + dplyr::ungroup() %>% + dplyr::filter(package %in% c("tidyquant", "tidytext")) %>% + dplyr::group_by(package) %>% + anomalize(count, method = "gesd") + + expect_equal(nrow(gesd_grouped_df), 850) + expect_equal(ncol(gesd_grouped_df), 6) + +}) + + + + diff --git a/tests/testthat/test-time_frequency.R b/tests/testthat/test-time_frequency.R new file mode 100644 index 0000000..b5ba262 --- /dev/null +++ b/tests/testthat/test-time_frequency.R @@ -0,0 +1,77 @@ +context("test-time_frequency.R") + +# Setup + +tq_dloads <- tidyverse_cran_downloads %>% + dplyr::ungroup() %>% + dplyr::filter(package == "tidyquant") + +tq_dloads_small <- tq_dloads %>% + dplyr::slice(1:60) + +# Tests + +test_that("time_frequency works: period = 'auto'", { + + freq <- tq_dloads %>% + time_frequency() + + expect_equal(freq, 7) + +}) + +test_that("time_frequency works: period = '2 weeks'", { + + freq <- tq_dloads %>% + time_frequency(period = "2 weeks") + + expect_equal(freq, 14) + +}) + +test_that("time_frequency works: period = 5", { + + freq <- tq_dloads %>% + time_frequency(period = 5) + + expect_equal(freq, 5) + +}) + + + +test_that("time_trend works: period = 'auto'", { + + trend <- tq_dloads %>% + time_trend() + + expect_equal(trend, 91) + +}) + +test_that("time_trend works: period = '90 days'", { + + trend <- tq_dloads %>% + time_trend(period = "90 days") + + expect_equal(trend, 90) + +}) + +test_that("time_trend works: period = 90", { + + trend <- tq_dloads %>% + time_trend(period = 90) + + expect_equal(trend, 90) + +}) + +test_that("time_trend works with small data: period = 'auto'", { + + trend <- tq_dloads_small %>% + time_trend() + + expect_equal(trend, 28) + +})