Forecast preprocessing schema (#772)

ahosler · web-flow · commit 6d80d1d3cc97 · 2024-04-12T09:34:11.000+01:00
diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py
@@ -10,7 +10,6 @@
     merge_category_columns,
 )
 from ads.opctl.operator.lowcode.common.data import AbstractData
-from ads.opctl.operator.lowcode.common.data import AbstractData
 from ads.opctl.operator.lowcode.anomaly.utils import get_frequency_of_datetime
 from ads.opctl import logger
 import pandas as pd
diff --git a/ads/opctl/operator/lowcode/anomaly/operator_config.py b/ads/opctl/operator/lowcode/anomaly/operator_config.py
@@ -36,6 +36,21 @@ class TestData(InputData):
     """Class representing operator specification test data details."""
 
 
+@dataclass(repr=True)
+class PreprocessingSteps(DataClassSerializable):
+    """Per-step on/off flags for the anomaly operator's preprocessing stage."""
+
+    missing_value_imputation: bool = True  # gates missing-value imputation of historical data
+    outlier_treatment: bool = False  # gates outlier treatment of historical data; off by default here
+
+
+@dataclass(repr=True)
+class DataPreprocessor(DataClassSerializable):
+    """Preprocessing section of the anomaly operator spec: a master switch plus per-step flags."""
+
+    enabled: bool = True  # master switch; when False the transformation run skips every step
+    steps: PreprocessingSteps = field(default_factory=PreprocessingSteps)  # new flags object per instance
+
 @dataclass(repr=True)
 class AnomalyOperatorSpec(DataClassSerializable):
     """Class representing operator specification."""
@@ -74,7 +89,9 @@ def __post_init__(self):
             self.generate_inliers if self.generate_inliers is not None else False
         )
         self.model_kwargs = self.model_kwargs or dict()
-
+        self.preprocessing = (
+            self.preprocessing if self.preprocessing is not None else DataPreprocessor(enabled=True)
+        )
 
 @dataclass(repr=True)
 class AnomalyOperatorConfig(OperatorConfig):
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -307,11 +307,23 @@ spec:
         description: "When provided, target_category_columns [list] indexes the data into multiple related datasets for anomaly detection"
 
     preprocessing:
-      type: boolean
+      type: dict
       required: false
-      default: true
-      meta:
-        description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+      schema:
+        enabled:
+          type: boolean
+          required: false
+          default: true
+          meta:
+            description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+        steps:
+          type: dict
+          required: false
+          schema:
+            missing_value_imputation:
+              type: boolean
+              required: false
+              default: true
 
     generate_report:
       type: boolean
diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py
@@ -58,20 +58,26 @@ def run(self, data):
         clean_df = self._format_datetime_col(clean_df)
         clean_df = self._set_multi_index(clean_df)
 
-        if self.name == "historical_data":
-            try:
-                clean_df = self._missing_value_imputation_hist(clean_df)
-            except Exception as e:
-                logger.debug(f"Missing value imputation failed with {e.args}")
-            if self.preprocessing:
-                try:
-                    clean_df = self._outlier_treatment(clean_df)
-                except Exception as e:
-                    logger.debug(f"Outlier Treatment failed with {e.args}")
-            else:
-                logger.debug("Skipping outlier treatment as preprocessing is disabled")
-        elif self.name == "additional_data":
-            clean_df = self._missing_value_imputation_add(clean_df)
+        if self.preprocessing and self.preprocessing.enabled:
+            if self.name == "historical_data":
+                if self.preprocessing.steps.missing_value_imputation:
+                    try:
+                        clean_df = self._missing_value_imputation_hist(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Missing value imputation failed with {e.args}")
+                else:
+                    logger.info("Skipping missing value imputation because it is disabled")
+                if self.preprocessing.steps.outlier_treatment:
+                    try:
+                        clean_df = self._outlier_treatment(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Outlier Treatment failed with {e.args}")
+                else:
+                    logger.info("Skipping outlier treatment because it is disabled")
+            elif self.name == "additional_data":
+                clean_df = self._missing_value_imputation_add(clean_df)
+        else:
+            logger.info("Skipping all preprocessing steps because preprocessing is disabled")
         return clean_df
 
     def _remove_trailing_whitespace(self, df):
diff --git a/ads/opctl/operator/lowcode/forecast/operator_config.py b/ads/opctl/operator/lowcode/forecast/operator_config.py
@@ -29,6 +29,22 @@ class DateTimeColumn(DataClassSerializable):
     format: str = None
 
 
+@dataclass(repr=True)
+class PreprocessingSteps(DataClassSerializable):
+    """Per-step on/off flags for the forecast operator's preprocessing stage."""
+
+    missing_value_imputation: bool = True  # gates missing-value imputation of historical data
+    outlier_treatment: bool = True  # gates outlier treatment of historical data; on by default
+
+
+@dataclass(repr=True)
+class DataPreprocessor(DataClassSerializable):
+    """Preprocessing section of the forecast operator spec: a master switch plus per-step flags."""
+
+    enabled: bool = True  # master switch; when False the transformation run skips every step
+    steps: PreprocessingSteps = field(default_factory=PreprocessingSteps)  # new flags object per instance
+
+
 @dataclass(repr=True)
 class Tuning(DataClassSerializable):
     """Class representing operator specification tuning details."""
@@ -54,7 +70,7 @@ class ForecastOperatorSpec(DataClassSerializable):
     global_explanation_filename: str = None
     local_explanation_filename: str = None
     target_column: str = None
-    preprocessing: bool = None
+    preprocessing: DataPreprocessor = field(default_factory=DataPreprocessor)
     datetime_column: DateTimeColumn = field(default_factory=DateTimeColumn)
     target_category_columns: List[str] = field(default_factory=list)
     generate_report: bool = None
@@ -79,7 +95,7 @@ def __post_init__(self):
         self.confidence_interval_width = self.confidence_interval_width or 0.80
         self.report_filename = self.report_filename or "report.html"
         self.preprocessing = (
-            self.preprocessing if self.preprocessing is not None else True
+            self.preprocessing if self.preprocessing is not None else DataPreprocessor(enabled=True)
         )
         # For Report Generation. When user doesn't specify defaults to True
         self.generate_report = (
diff --git a/ads/opctl/operator/lowcode/forecast/schema.yaml b/ads/opctl/operator/lowcode/forecast/schema.yaml
@@ -286,11 +286,27 @@ spec:
       default: target
 
     preprocessing:
-      type: boolean
+      type: dict
       required: false
-      default: true
-      meta:
-        description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+      schema:
+        enabled:
+          type: boolean
+          required: false
+          default: true
+          meta:
+            description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+        steps:
+          type: dict
+          required: false
+          schema:
+            missing_value_imputation:
+              type: boolean
+              required: false
+              default: true
+            outlier_treatment:
+              type: boolean
+              required: false
+              default: true
 
     generate_explanations:
       type: boolean
diff --git a/tests/operators/forecast/test_errors.py b/tests/operators/forecast/test_errors.py
@@ -26,7 +26,7 @@
     ForecastInputDataError,
 )
 from ads.opctl.operator.cmd import run
-
+import math
 
 NUM_ROWS = 1000
 NUM_SERIES = 10
@@ -172,8 +172,9 @@ def run_yaml(tmpdirname, yaml_i, output_data_path):
     run(yaml_i, backend="operator.local", debug=True)
     subprocess.run(f"ls -a {output_data_path}", shell=True)
 
-    test_metrics = pd.read_csv(f"{tmpdirname}/results/test_metrics.csv")
-    print(test_metrics)
+    if 'test_data' in yaml_i['spec']:
+        test_metrics = pd.read_csv(f"{tmpdirname}/results/test_metrics.csv")
+        print(test_metrics)
     train_metrics = pd.read_csv(f"{tmpdirname}/results/metrics.csv")
     print(train_metrics)
 
@@ -185,6 +186,7 @@ def populate_yaml(
     additional_data_path=None,
     test_data_path=None,
     output_data_path=None,
+    preprocessing=None,
 ):
     if historical_data_path is None:
         historical_data_path, additional_data_path, test_data_path = setup_rossman()
@@ -204,7 +206,8 @@ def populate_yaml(
     yaml_i["spec"]["datetime_column"]["name"] = "Date"
     yaml_i["spec"]["target_category_columns"] = ["Store"]
     yaml_i["spec"]["horizon"] = HORIZON
-
+    if preprocessing:
+        yaml_i["spec"]["preprocessing"] = preprocessing
     if generate_train_metrics:
         yaml_i["spec"]["generate_metrics"] = generate_train_metrics
     if model == "autots":
@@ -372,6 +375,7 @@ def test_0_series(operator_setup, model):
         historical_data_path=historical_data_path,
         additional_data_path=additional_data_path,
         test_data_path=test_data_path,
+        preprocessing={"enabled": False}
     )
     with pytest.raises(DataMismatchError):
         run_yaml(
@@ -429,6 +433,49 @@ def test_invalid_dates(operator_setup, model):
         )
 
 
+def test_disabling_outlier_treatment(operator_setup):
+    tmpdirname = operator_setup
+    NUM_ROWS = 100  # shadows the module-level NUM_ROWS for this test only
+    hist_data_0 = pd.concat(
+        [
+            HISTORICAL_DATETIME_COL[: NUM_ROWS - HORIZON],
+            TARGET_COL[: NUM_ROWS - HORIZON],
+        ],
+        axis=1,
+    )
+    outliers = [1000, -800]  # values planted in the 'Sales' column just below
+    hist_data_0.at[40, 'Sales'] = outliers[0]
+    hist_data_0.at[75, 'Sales'] = outliers[1]
+    historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
+        tmpdirname, hist_data_0
+    )
+
+    yaml_i, output_data_path = populate_yaml(
+        tmpdirname=tmpdirname,
+        model="arima",
+        historical_data_path=historical_data_path
+    )
+    yaml_i["spec"].pop("target_category_columns")  # populate_yaml sets this to ["Store"]
+    yaml_i["spec"].pop("additional_data")
+
+    # First run: no preprocessing section in the spec, so the defaults apply and the planted values should be treated
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_without_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")
+    input_vals_without_outlier = set(forecast_without_outlier['input_value'])
+    assert all(
+        item not in input_vals_without_outlier for item in outliers), "forecast file should not contain any outliers"
+
+    # Second run: same spec with only outlier_treatment switched off, so the planted values must survive
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
+    preprocessing = {"enabled": True, "steps": preprocessing_steps}
+    yaml_i["spec"]["preprocessing"] = preprocessing
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_with_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")  # same path, rewritten by the second run
+    input_vals_with_outlier = set(forecast_with_outlier['input_value'])
+    assert all(
+        item in input_vals_with_outlier for item in outliers), "forecast file should contain all the outliers"
+
+
 @pytest.mark.parametrize("model", MODELS)
 def test_2_series(operator_setup, model):
     # Test w and w/o add data
@@ -454,12 +501,14 @@ def split_df(df):
     historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
         tmpdirname, hist_data, add_data, test_data
     )
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
     yaml_i, output_data_path = populate_yaml(
         tmpdirname=tmpdirname,
         model=model,
         historical_data_path=historical_data_path,
         additional_data_path=additional_data_path,
         test_data_path=test_data_path,
+        preprocessing={"enabled": True, "steps": preprocessing_steps}
     )
     with pytest.raises(DataMismatchError):
         # 4 columns in historical data, but only 1 cat col specified

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,6 @@`
`10`	`10`	`merge_category_columns,`
`11`	`11`	`)`
`12`	`12`	`from ads.opctl.operator.lowcode.common.data import AbstractData`
`13`		`-from ads.opctl.operator.lowcode.common.data import AbstractData`
`14`	`13`	`from ads.opctl.operator.lowcode.anomaly.utils import get_frequency_of_datetime`
`15`	`14`	`from ads.opctl import logger`
`16`	`15`	`import pandas as pd`