Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mdancho84 committed Mar 19, 2018
0 parents commit 8030b4b
Show file tree
Hide file tree
Showing 40 changed files with 2,822 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^cran-comments\.md$
^_pkgdown\.yml$
^docs$
^data-raw$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
38 changes: 38 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Package: anomalize
Type: Package
Title: Tidy anomaly detection
Version: 0.1.0
Author: c(
person("Matt", "Dancho", email = "[email protected]", role = c("aut", "cre")),
person("Davis", "Vaughan", email = "[email protected]", role = c("aut"))
)
Maintainer: Matt Dancho <[email protected]>
Description:
The `anomalize` package enables a "tidy" workflow for detecting anomalies in data.
The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`.
When combined, it's quite simple to decompose time series, detect anomalies,
and create bands separating the "normal" data from the anomalous data.
URL: https://github.com/business-science/anomalize
BugReports: https://github.com/business-science/anomalize/issues
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Depends:
R (>= 3.0.0)
Imports:
dplyr,
glue,
tidyquant,
timetk,
sweep,
tibbletime,
purrr,
rlang,
tibble,
stringr,
tidyr,
ggplot2
RoxygenNote: 6.0.1
Roxygen: list(markdown = TRUE)
Suggests:
testthat
45 changes: 45 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Generated by roxygen2: do not edit by hand

S3method(anomalize,default)
S3method(anomalize,grouped_df)
S3method(anomalize,tbl_df)
S3method(plot_anomaly_decomposition,default)
S3method(plot_anomaly_decomposition,grouped_tbl_time)
S3method(plot_anomaly_decomposition,tbl_time)
S3method(prep_tbl_time,data.frame)
S3method(prep_tbl_time,tbl_time)
S3method(time_apply,data.frame)
S3method(time_decompose,default)
S3method(time_decompose,grouped_df)
S3method(time_decompose,grouped_tbl_time)
S3method(time_decompose,tbl_df)
S3method(time_decompose,tbl_time)
S3method(time_recompose,default)
S3method(time_recompose,grouped_df)
S3method(time_recompose,grouped_tbl_time)
S3method(time_recompose,tbl_df)
S3method(time_recompose,tbl_time)
export(anomalize)
export(decompose_multiplicative)
export(decompose_stl)
export(decompose_twitter)
export(gesd)
export(iqr)
export(plot_anomaly_decomposition)
export(prep_tbl_time)
export(time_apply)
export(time_decompose)
export(time_frequency)
export(time_recompose)
export(time_scale_template)
export(time_trend)
importFrom(dplyr,"%>%")
importFrom(dplyr,contains)
importFrom(dplyr,n)
importFrom(dplyr,row_number)
importFrom(rlang,"!!!")
importFrom(rlang,"!!")
importFrom(rlang,":=")
importFrom(stats,mad)
importFrom(stats,median)
importFrom(stats,qt)
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# anomalize 0.1.0

* Added a `NEWS.md` file to track changes to the package.
19 changes: 19 additions & 0 deletions R/anomalize-package.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' anomalize: Tidy anomaly detection
#'
#' @details
#' The `anomalize` package enables a tidy workflow for detecting anomalies in data.
#' The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`.
#' When combined, it's quite simple to decompose time series, detect anomalies,
#' and create bands separating the "normal" data from the anomalous data.
#'
#' To learn more about `anomalize`, start with the vignettes:
#' `browseVignettes(package = "anomalize")`
#'
#' @docType package
#' @name anomalize_package
#'
#' @importFrom rlang := !! !!!
#' @importFrom dplyr %>% n row_number contains
#' @importFrom stats median mad qt

NULL
173 changes: 173 additions & 0 deletions R/anomalize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#' Detect anomalies using the tidyverse
#'
#' @inheritParams time_apply
#' @param data A `tibble` or `tbl_time` object.
#' @param method The anomaly detection method. One of `"iqr"` or `"gesd"`.
#' The IQR method is faster at the expense of possibly not being quite as accurate.
#' The GESD method has the best properties for outlier detection, but is loop-based
#' and therefore a bit slower.
#' @param alpha Controls the width of the "normal" range.
#' Lower values are more conservative (a wider "normal" range, so fewer
#' observations are flagged as anomalies), while higher values narrow the
#' range and make it easier for an observation to be classified as an anomaly.
#' @param max_anoms The maximum percent of anomalies permitted to be identified.
#' @param verbose A boolean. If `TRUE`, will return a list containing useful information
#' about the anomalies. If `FALSE`, just returns the data expanded with the anomalies and
#' the lower (l1) and upper (l2) bounds.
#'
#' @return Returns a `tibble` / `tbl_time` object or list depending on the value of `verbose`.
#'
#' @details
#' The `anomalize()` function is used to detect outliers in a distribution
#' with no trend or seasonality present. The return has three columns:
#' "remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for
#' anomalies), and "anomaly" (Yes/No).
#'
#' Use [time_decompose()] to decompose a time series prior to performing
#' anomaly detection with `anomalize()`. Typically, `anomalize()` is
#' performed on the "remainder" of the time series decomposition.
#'
#' For non-time series data (data without trend), the `anomalize()` function can
#' be used without time series decomposition.
#'
#' The `anomalize()` function uses two methods for outlier detection
#' each with benefits.
#'
#' __IQR__:
#'
#' The IQR Method uses an interquartile range of 25% and 75% to establish a baseline distribution around
#' the median. With the default `alpha = 0.05`, the limits are established by expanding
#' the 25/75 baseline by an IQR Factor of 3 (3X). The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05).
#' To increase the IQR Factor controlling the limits, decrease the alpha, which makes
#' it more difficult to be an outlier. Increase alpha to make it easier to be an outlier.
#'
#' __GESD__:
#'
#' The GESD Method (Generalized Extreme Studentized Deviate Test) progressively
#' eliminates outliers using a Student's T-Test comparing the test statistic to a critical value.
#' Each time an outlier is removed, the test statistic is updated. Once the test statistic
#' drops below the critical value, all outliers are considered removed. Because this method
#' involves continuous updating via a loop, it is slower than the IQR method. However, it
#' tends to be the best performing method for outlier removal.
#'
#' @seealso
#' Anomaly Detection Methods (Powers `anomalize`)
#' - [iqr()]
#' - [gesd()]
#'
#' Time Series Anomaly Detection Functions (anomaly detection workflow):
#' - [time_decompose()]
#' - [time_recompose()]
#'
#' @examples
#'
#' library(dplyr)
#'
#' data(tidyverse_cran_downloads)
#'
#' tidyverse_cran_downloads %>%
#' time_decompose(count, method = "stl") %>%
#' anomalize(remainder, method = "iqr")
#'
#' @references
#' - The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast/blob/master/R/clean.R)
#' - The GESD method is used in Twitter's [`AnomalyDetection`](https://github.com/twitter/AnomalyDetection) package and is also available as a function in [@raunakms's GESD method](https://github.com/raunakms/GESD/blob/master/runGESD.R)
#'
#' @export
anomalize <- function(data,
                      target,
                      method = c("iqr", "gesd"),
                      alpha = 0.05,
                      max_anoms = 0.20,
                      verbose = FALSE) {
  # S3 generic: pick the `anomalize.*` method from the class of `data`.
  UseMethod("anomalize", data)
}

#' @export
anomalize.default <- function(data,
                              target,
                              method = c("iqr", "gesd"),
                              alpha = 0.05,
                              max_anoms = 0.20,
                              verbose = FALSE) {
  # Fallback method: reached when `data` has no more specific `anomalize()`
  # method, so the input is rejected with an error that hides the call.
  unsupported_msg <- "Object is not of class `tbl_df` or `tbl_time`."
  stop(unsupported_msg, call. = FALSE)
}

#' @export
anomalize.tbl_df <- function(data, target, method = c("iqr", "gesd"),
                             alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {

  # `target` has no default: fail early with an explicit message.
  if (missing(target)) {
    stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)
  }

  # Capture the bare column name and resolve the requested method.
  # Only the first element of `method` is used, compared case-insensitively.
  target_expr <- rlang::enquo(target)
  method <- tolower(method[[1]])
  x <- dplyr::pull(data, !! target_expr)

  # Pick the detector explicitly (no string-based lookup), then run it once.
  detector <- if (method == "iqr") {
    anomalize::iqr
  } else if (method == "gesd") {
    anomalize::gesd
  } else {
    stop("The `method` selected is invalid.", call. = FALSE)
  }

  # The detector is always run verbosely because its list output supplies
  # both the per-observation flags and the critical limits used below.
  outlier_list <- detector(
    x = x,
    alpha = alpha,
    max_anoms = max_anoms,
    verbose = TRUE
  )

  outlier <- outlier_list$outlier
  limit_lower <- outlier_list$critical_limits[[1]]
  limit_upper <- outlier_list$critical_limits[[2]]

  # New columns are named after the target: "<target>_l1" and "<target>_l2".
  target_name <- dplyr::quo_name(target_expr)
  lower_col <- paste0(target_name, "_l1")
  upper_col <- paste0(target_name, "_l2")

  ret <- dplyr::mutate(
    data,
    !! lower_col := limit_lower,
    !! upper_col := limit_upper
  )
  ret <- tibble::add_column(ret, anomaly = outlier)

  # Non-verbose callers get just the augmented data.
  if (!verbose) {
    return(ret)
  }

  # Verbose callers get the data plus the detector's full output.
  list(
    anomalized_tbl = ret,
    anomaly_details = outlier_list
  )
}

#' @export
anomalize.grouped_df <- function(data, target, method = c("iqr", "gesd"),
                                 alpha = 0.05, max_anoms = 0.20, verbose = FALSE, ...) {

  # `target` has no default: fail early with an explicit message.
  if (missing(target)) {
    stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)
  }

  # The per-group call below always runs with `verbose = FALSE`, so warn the
  # caller that their `verbose = TRUE` request is being ignored.
  if (verbose) {
    warning("Cannot use 'verbose = TRUE' with grouped data.")
  }

  # Capture the bare column name so it can be forwarded to each group's call.
  target_expr <- rlang::enquo(target)

  # Apply `anomalize()` to every group through the package helper
  # `grouped_mapper()`; extra arguments in `...` are forwarded unchanged.
  ret <- data %>%
    grouped_mapper(
      .f = anomalize,
      target = !! target_expr,
      method = method[[1]],
      alpha = alpha,
      max_anoms = max_anoms,
      verbose = FALSE,
      ...
    )

  ret
}

Loading

0 comments on commit 8030b4b

Please sign in to comment.