initial commit

business-science · Mar 19, 2018 · 8030b4b · 8030b4b
commit 8030b4b
Show file tree

Hide file tree

Showing 40 changed files with 2,822 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,7 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^README\.Rmd$
+^cran-comments\.md$
+^_pkgdown\.yml$
+^docs$
+^data-raw$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,38 @@
+Package: anomalize
+Type: Package
+Title: Tidy anomaly detection
+Version: 0.1.0
+Authors@R: c(
+    person("Matt", "Dancho", email = "[email protected]", role = c("aut", "cre")),
+    person("Davis", "Vaughan", email = "[email protected]", role = c("aut"))
+  )
+Maintainer: Matt Dancho <[email protected]>
+Description:
+    The `anomalize` package enables a "tidy" workflow for detecting anomalies in data.
+    The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`.
+    When combined, it's quite simple to decompose time series, detect anomalies,
+    and create bands separating the "normal" data from the anomalous data. 
+URL: https://github.com/business-science/anomalize
+BugReports: https://github.com/business-science/anomalize/issues
+License: GPL (>= 3)
+Encoding: UTF-8
+LazyData: true
+Depends:
+    R (>= 3.0.0)
+Imports: 
+    dplyr,
+    glue,
+    tidyquant,
+    timetk,
+    sweep,
+    tibbletime,
+    purrr,
+    rlang,
+    tibble,
+    stringr,
+    tidyr,
+    ggplot2
+RoxygenNote: 6.0.1
+Roxygen: list(markdown = TRUE)
+Suggests: 
+    testthat
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,45 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(anomalize,default)
+S3method(anomalize,grouped_df)
+S3method(anomalize,tbl_df)
+S3method(plot_anomaly_decomposition,default)
+S3method(plot_anomaly_decomposition,grouped_tbl_time)
+S3method(plot_anomaly_decomposition,tbl_time)
+S3method(prep_tbl_time,data.frame)
+S3method(prep_tbl_time,tbl_time)
+S3method(time_apply,data.frame)
+S3method(time_decompose,default)
+S3method(time_decompose,grouped_df)
+S3method(time_decompose,grouped_tbl_time)
+S3method(time_decompose,tbl_df)
+S3method(time_decompose,tbl_time)
+S3method(time_recompose,default)
+S3method(time_recompose,grouped_df)
+S3method(time_recompose,grouped_tbl_time)
+S3method(time_recompose,tbl_df)
+S3method(time_recompose,tbl_time)
+export(anomalize)
+export(decompose_multiplicative)
+export(decompose_stl)
+export(decompose_twitter)
+export(gesd)
+export(iqr)
+export(plot_anomaly_decomposition)
+export(prep_tbl_time)
+export(time_apply)
+export(time_decompose)
+export(time_frequency)
+export(time_recompose)
+export(time_scale_template)
+export(time_trend)
+importFrom(dplyr,"%>%")
+importFrom(dplyr,contains)
+importFrom(dplyr,n)
+importFrom(dplyr,row_number)
+importFrom(rlang,"!!!")
+importFrom(rlang,"!!")
+importFrom(rlang,":=")
+importFrom(stats,mad)
+importFrom(stats,median)
+importFrom(stats,qt)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# anomalize 0.1.0
+
+* Added a `NEWS.md` file to track changes to the package.
diff --git a/R/anomalize-package.R b/R/anomalize-package.R
@@ -0,0 +1,19 @@
+#' anomalize: Tidy anomaly detection
+#'
+#' @details
+#' The `anomalize` package enables a tidy workflow for detecting anomalies in data.
+#' The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`.
+#' When combined, it's quite simple to decompose time series, detect anomalies,
+#' and create bands separating the "normal" data from the anomalous data.
+#'
+#' To learn more about `anomalize`, start with the vignettes:
+#'  `browseVignettes(package = "anomalize")`
+#'
+#' @docType package
+#' @name anomalize_package
+#'
+#' @importFrom rlang := !! !!!
+#' @importFrom dplyr %>% n row_number contains
+#' @importFrom stats median mad qt
+
+NULL
diff --git a/R/anomalize.R b/R/anomalize.R
@@ -0,0 +1,173 @@
+#' Detect anomalies using the tidyverse
+#'
+#' @inheritParams time_apply
+#' @param data A `tibble` or `tbl_time` object.
+#' @param method The anomaly detection method. One of `"iqr"` or `"gesd"`.
+#' The IQR method is faster at the expense of possibly not being quite as accurate.
+#' The GESD method has the best properties for outlier detection, but is loop-based
+#' and therefore a bit slower.
+#' @param alpha Controls the width of the "normal" range.
+#' Lower values are more conservative (it is harder for an observation to be
+#' flagged as an anomaly), while higher values make it easier to be flagged.
+#' @param max_anoms The maximum percent of anomalies permitted to be identified.
+#' @param verbose A boolean. If `TRUE`, will return a list containing useful information
+#' about the anomalies. If `FALSE`, just returns the data expanded with the anomalies and
+#' the lower (l1) and upper (l2) bounds.
+#'
+#' @return Returns a `tibble` / `tbl_time` object or list depending on the value of `verbose`.
+#'
+#' @details
+#' The `anomalize()` function is used to detect outliers in a distribution
+#' with no trend or seasonality present. The return adds three columns to the
+#' data, two of them named after the target (shown here for `remainder`):
+#' "remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for
+#' anomalies), and "anomaly" (Yes/No).
+#'
+#' Use [time_decompose()] to decompose a time series prior to performing
+#' anomaly detection with `anomalize()`.  Typically, `anomalize()` is
+#' performed on the "remainder" of the time series decomposition.
+#'
+#' For non-time series data (data without trend), the `anomalize()` function can
+#' be used without time series decomposition.
+#'
+#' The `anomalize()` function uses two methods for outlier detection
+#' each with benefits.
+#'
+#' __IQR__:
+#'
+#' The IQR Method uses an interquartile range of 25% and 75% to establish a baseline distribution around
+#' the median. With the default `alpha = 0.05`, the limits are established by expanding
+#' the 25/75 baseline by an IQR Factor of 3 (3X). The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05).
+#' To increase the IQR Factor controlling the limits, decrease the alpha, which makes
+#' it more difficult to be an outlier. Increase alpha to make it easier to be an outlier.
+#'
+#' __GESD__:
+#'
+#' The GESD Method (Generalized Extreme Studentized Deviate Test) progressively
+#' eliminates outliers using a Student's T-Test comparing the test statistic to a critical value.
+#' Each time an outlier is removed, the test statistic is updated. Once the test statistic
+#' drops below the critical value, all outliers are considered removed. Because this method
+#' involves continuous updating via a loop, it is slower than the IQR method. However, it
+#' tends to be the best performing method for outlier removal.
+#'
+#' @seealso
+#' Anomaly Detection Methods (Powers `anomalize`)
+#' - [iqr()]
+#' - [gesd()]
+#'
+#' Time Series Anomaly Detection Functions (anomaly detection workflow):
+#' - [time_decompose()]
+#' - [time_recompose()]
+#'
+#' @examples
+#'
+#' library(dplyr)
+#'
+#' data(tidyverse_cran_downloads)
+#'
+#' tidyverse_cran_downloads %>%
+#'     time_decompose(count, method = "stl") %>%
+#'     anomalize(remainder, method = "iqr")
+#'
+#' @references
+#' - The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast/blob/master/R/clean.R)
+#' - The GESD method is used in Twitter's [`AnomalyDetection`](https://github.com/twitter/AnomalyDetection) package and is also available as a function in [@raunakms's GESD method](https://github.com/raunakms/GESD/blob/master/runGESD.R)
+#'
+#' @export
+anomalize <- function(data, target, method = c("iqr", "gesd"),
+                      alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {
+    # S3 generic: the method is selected from the class of `data`, which is
+    # the first argument and therefore the default dispatch object.
+    UseMethod("anomalize")
+}
+
+#' @export
+anomalize.default <- function(data, target, method = c("iqr", "gesd"),
+                              alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {
+    # Fallback method: unconditionally rejects the input with an error that
+    # names the supported classes. None of the arguments are inspected.
+    unsupported_msg <- "Object is not of class `tbl_df` or `tbl_time`."
+    stop(unsupported_msg, call. = FALSE)
+}
+
+#' @export
+anomalize.tbl_df <- function(data, target, method = c("iqr", "gesd"),
+                      alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {
+
+    # Detects anomalies in one column of an ungrouped tibble.
+    #
+    # data      : tibble holding the column to test.
+    # target    : bare (unquoted) name of the column to test.
+    # method    : "iqr" or "gesd"; only the first element is used and it is
+    #             lower-cased before matching.
+    # alpha, max_anoms : forwarded unchanged to the selected detector.
+    # verbose   : if TRUE, return a list with `anomalized_tbl` and
+    #             `anomaly_details`; otherwise return just the tibble.
+    #
+    # Returns `data` with three added columns: "<target>_l1", "<target>_l2"
+    # and "anomaly" (or the list described above when `verbose = TRUE`).
+
+    # Checks
+    if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)
+
+    # Setup
+    target_expr <- rlang::enquo(target)
+    target_name <- dplyr::quo_name(target_expr)
+
+    method <- tolower(method[[1]])
+    x      <- data %>% dplyr::pull(!! target_expr)
+
+    # Pick the detector. The package's own functions are referenced directly
+    # rather than through `anomalize::`, so the code being loaded is what
+    # runs, not whichever copy of the package happens to be installed.
+    detector <- switch(method,
+        iqr  = iqr,
+        gesd = gesd,
+        stop("The `method` selected is invalid.", call. = FALSE)
+    )
+
+    # Always request the detailed result here: the limits and the outlier
+    # flags are both read from it below, whatever the caller's `verbose` is.
+    outlier_list <- detector(x         = x,
+                             alpha     = alpha,
+                             max_anoms = max_anoms,
+                             verbose   = TRUE)
+
+    limit_lower  <- outlier_list$critical_limits[[1]]
+    limit_upper  <- outlier_list$critical_limits[[2]]
+
+    # Returns
+    ret <- data %>%
+        dplyr::mutate(!! paste0(target_name, "_l1") := limit_lower,
+                      !! paste0(target_name, "_l2") := limit_upper) %>%
+        tibble::add_column(anomaly = outlier_list$outlier)
+
+    if (verbose) {
+        ret <- list(
+            anomalized_tbl       = ret,
+            anomaly_details      = outlier_list
+        )
+    }
+
+    ret
+
+}
+
+#' @export
+anomalize.grouped_df <- function(data, target, method = c("iqr", "gesd"),
+                                 alpha = 0.05, max_anoms = 0.20, verbose = FALSE, ...) {
+
+    # Applies `anomalize()` to a grouped data frame through
+    # `grouped_mapper()`.
+    #
+    # data      : grouped data frame.
+    # target    : bare (unquoted) name of the column to test.
+    # method    : "iqr" or "gesd"; only the first element is forwarded.
+    # alpha, max_anoms : forwarded unchanged to `anomalize()`.
+    # verbose   : not supported for grouped data; TRUE triggers a warning and
+    #             `verbose = FALSE` is forwarded regardless.
+    # ...       : forwarded to `grouped_mapper()`.
+
+    # Checks
+    if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)
+    if (verbose) warning("Cannot use 'verbose = TRUE' with grouped data.", call. = FALSE)
+
+    # Setup
+    target_expr <- rlang::enquo(target)
+
+    ret <- data %>%
+        grouped_mapper(
+            .f        = anomalize,
+            target    = !! target_expr,
+            method    = method[[1]],
+            alpha     = alpha,
+            max_anoms = max_anoms,
+            verbose   = FALSE,
+            ...)
+
+    ret
+
+}
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# anomalize 0.1.0

		* Added a `NEWS.md` file to track changes to the package.