Skip to content

Commit

Permalink
add clean_anomalies()
Browse files Browse the repository at this point in the history
  • Loading branch information
mdancho84 committed Sep 15, 2019
1 parent af69165 commit 8182cbd
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 3 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: anomalize
Type: Package
Title: Tidy Anomaly Detection
Version: 0.1.2
Version: 0.2.0
Authors@R: c(
person("Matt", "Dancho", email = "[email protected]", role = c("aut", "cre")),
person("Davis", "Vaughan", email = "[email protected]", role = c("aut"))
Expand Down Expand Up @@ -44,7 +44,8 @@ Roxygen: list(markdown = TRUE)
Suggests:
tidyverse,
tidyquant,
testthat,
stringr,
testthat (>= 2.1.0),
covr,
knitr,
rmarkdown,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
S3method(anomalize,default)
S3method(anomalize,grouped_df)
S3method(anomalize,tbl_df)
S3method(clean_anomalies,default)
S3method(clean_anomalies,tbl_df)
S3method(plot_anomalies,default)
S3method(plot_anomalies,tbl_time)
S3method(plot_anomaly_decomposition,default)
Expand All @@ -25,6 +27,7 @@ S3method(time_recompose,grouped_tbl_time)
S3method(time_recompose,tbl_df)
S3method(time_recompose,tbl_time)
export(anomalize)
export(clean_anomalies)
export(decompose_stl)
export(decompose_twitter)
export(gesd)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# anomalize 0.2.0

* `clean_anomalies()` - A new function that simplifies cleaning anomalies by replacing anomalous observations with the sum of the trend and seasonal components. This is useful when preparing data for forecasting.

# anomalize 0.1.1

* [Issue #2](https://github.com/business-science/anomalize/issues/2): Bugfixes for various `ggplot2` issues in `plot_anomalies()`. Solves "Error in FUN(X[[i]], ...) : object '.group' not found".
Expand Down
101 changes: 101 additions & 0 deletions R/anomalize_clean.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#' Replace anomalous observations with their seasonal + trend estimate
#'
#' @param data A `tibble` or `tbl_time` object that has already been passed
#'   through `time_decompose()` and `anomalize()`.
#'
#' @return Returns a `tibble` / `tbl_time` object with a new column
#'   "observed_cleaned".
#'
#' @details
#' `clean_anomalies()` adds an `observed_cleaned` column. For rows where
#' `anomaly == "Yes"` the value is the seasonal component plus the trend
#' component (`trend` for STL, `median_spans` for Twitter); every other row
#' keeps its `observed` value. Removing outliers this way is often desirable
#' when forecasting noisy time series, because it improves trend detection.
#'
#' The input must have been detrended with `time_decompose()` and anomalized
#' with `anomalize()`. Data that was additionally recomposed with
#' `time_recompose()` is accepted as well.
#'
#' @seealso
#' Time Series Anomaly Detection Functions (anomaly detection workflow):
#' - [time_decompose()]
#' - [anomalize()]
#' - [time_recompose()]
#'
#' @examples
#'
#' library(dplyr)
#'
#' # Needed to pass CRAN check / This is loaded by default
#' set_time_scale_template(time_scale_template())
#'
#' data(tidyverse_cran_downloads)
#'
#' tidyverse_cran_downloads %>%
#'   time_decompose(count, method = "stl") %>%
#'   anomalize(remainder, method = "iqr") %>%
#'   clean_anomalies()
#'
#'
#' @export
clean_anomalies <- function(data) {
  # S3 generic: UseMethod() dispatches on the first argument (`data`).
  UseMethod("clean_anomalies")
}

#' @export
clean_anomalies.default <- function(data) {
  # Fallback method: reached for any object without a more specific
  # clean_anomalies() method, which is always treated as invalid input.
  unsupported_msg <- "Error clean_anomalies(): Object is not of class `tbl_df` or `tbl_time`."
  stop(unsupported_msg, call. = FALSE)
}

#' @export
clean_anomalies.tbl_df <- function(data) {

  # Adds an `observed_cleaned` column: rows flagged `anomaly == "Yes"` are
  # replaced by the seasonal component plus the trend-like component, all
  # other rows keep `observed`. Returns `data` with the extra column.

  # Fail early, with a descriptive error, if required columns are missing.
  check_clean_anomalies_input(data)

  # get_method_col() returns every matching column name, so it has length 2
  # when both `trend` and `median_spans` are present. `if ()` needs a scalar
  # condition (a longer one is an error from R 4.2.0), so keep only the first
  # match; `trend` is listed first in the candidates and therefore wins.
  method_col <- get_method_col(data)[1]

  if (method_col == "trend") {
    # STL decomposition: trend lives in `trend`
    data %>%
      dplyr::mutate(
        observed_cleaned = ifelse(anomaly == "Yes", season + trend, observed)
      )
  } else {
    # Twitter decomposition: trend lives in `median_spans`
    data %>%
      dplyr::mutate(
        observed_cleaned = ifelse(anomaly == "Yes", season + median_spans, observed)
      )
  }

}

check_clean_anomalies_input <- function(data) {

  # Validates that `data` carries the columns clean_anomalies() relies on.
  # Stops with a descriptive error on the first failed check; returns NULL
  # invisibly when every check passes.
  cols <- names(data)

  # 1. Trend-like column: `trend` (STL) or `median_spans` (Twitter)
  has_method_col <- "trend" %in% cols || "median_spans" %in% cols
  if (!has_method_col) {
    stop("Error clean_anomalies(): Output does not contain a column named trend or median_spans. This may occur if the output was not detrended with time_decompose().", call. = FALSE)
  }

  # 2. Remaining decomposition columns
  if (!all(c("observed", "season") %in% cols)) {
    stop("Error clean_anomalies(): Output does not contain columns named observed and season. This may occur if the output was not detrended with time_decompose().", call. = FALSE)
  }

  # 3. Anomaly flag column (expected from anomalize(), per the error message)
  if (!("anomaly" %in% cols)) {
    stop("Error clean_anomalies(): Output does not contain columns named anomaly. This may occur if the output was not anomalized with anomalize().", call. = FALSE)
  }

  invisible(NULL)
}


get_method_col <- function(data) {

  # Returns which of the trend-like column names are present in `data`:
  # "trend" (STL) and/or "median_spans" (Twitter), in that order.
  # The result is character(0) when neither column is present.
  candidates <- c("trend", "median_spans")

  # match() gives NA for candidates that are not among the column names
  positions <- match(candidates, names(data))

  candidates[!is.na(positions)]

}


3 changes: 2 additions & 1 deletion R/global_vars.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,6 @@ globalVariables(c(
"key",
"median_spans",
"recomposed_l1",
"recomposed_l2"
"recomposed_l2",
"data_names"
))
48 changes: 48 additions & 0 deletions man/clean_anomalies.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/testthat.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ library(anomalize)
library(dplyr)
library(ggplot2)
library(tibble)
library(stringr)

# set_time_scale_template(time_scale_template())

Expand Down
36 changes: 36 additions & 0 deletions tests/testthat/test-clean_anomalies.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@


# Shared fixtures: decompose and anomalize the example data once per
# decomposition method so each test only exercises clean_anomalies().
data_stl <- tidyverse_cran_downloads %>%
  time_decompose(count, method = "stl") %>%
  anomalize(remainder, method = "iqr")

data_twitter <- tidyverse_cran_downloads %>%
  time_decompose(count, method = "twitter") %>%
  anomalize(remainder, method = "iqr")


test_that("bad data returns error", {

  # Non-tibble input falls through to the default method
  expect_error(clean_anomalies(2))

  # A tibble without the decomposition / anomaly columns is rejected too
  expect_error(clean_anomalies(tibble::tibble(x = 1)))

})

test_that("Clean Anomalies from STL Method", {

  cleaned <- clean_anomalies(data_stl)

  # Exact column-name match (a regex substring match would also accept
  # unrelated columns that merely contain "observed_cleaned")
  expect_true("observed_cleaned" %in% names(cleaned))

  # Anomalies are replaced by season + trend; other rows keep `observed`
  is_anomaly <- cleaned$anomaly == "Yes"
  expect_equal(
    cleaned$observed_cleaned[is_anomaly],
    (cleaned$season + cleaned$trend)[is_anomaly]
  )
  expect_equal(
    cleaned$observed_cleaned[!is_anomaly],
    cleaned$observed[!is_anomaly]
  )

})

test_that("Clean Anomalies from Twitter Method", {

  cleaned <- clean_anomalies(data_twitter)

  expect_true("observed_cleaned" %in% names(cleaned))

  # Anomalies are replaced by season + median_spans; other rows keep `observed`
  is_anomaly <- cleaned$anomaly == "Yes"
  expect_equal(
    cleaned$observed_cleaned[is_anomaly],
    (cleaned$season + cleaned$median_spans)[is_anomaly]
  )
  expect_equal(
    cleaned$observed_cleaned[!is_anomaly],
    cleaned$observed[!is_anomaly]
  )

})

0 comments on commit 8182cbd

Please sign in to comment.