Merge branch 'main' into forecast_series_failure_bugfixes

ahosler · web-flow · commit 1138d2dc9102 · 2024-04-12T10:46:57.000+01:00
diff --git a/.github/workflows/run-operators-unit-tests.yml b/.github/workflows/run-operators-unit-tests.yml
@@ -31,7 +31,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.10.8"]
+        python-version: ["3.8"]
 
     steps:
       - uses: actions/checkout@v4
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -0,0 +1 @@
+* @darenr @mayoor @mrDzurb @VipulMascarenhas @qiuosier
diff --git a/ads/common/serializer.py b/ads/common/serializer.py
@@ -464,7 +464,7 @@ def from_dict(
             )
 
         obj = cls(
-            **{key: obj_dict.get(key) for key in allowed_fields if key in obj_dict}
+            **{key: obj_dict.get(key) for key in allowed_fields}
         )
 
         for key, value in obj_dict.items():
diff --git a/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py b/ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py
@@ -10,7 +10,6 @@
     merge_category_columns,
 )
 from ads.opctl.operator.lowcode.common.data import AbstractData
-from ads.opctl.operator.lowcode.common.data import AbstractData
 from ads.opctl.operator.lowcode.anomaly.utils import get_frequency_of_datetime
 from ads.opctl import logger
 import pandas as pd
diff --git a/ads/opctl/operator/lowcode/anomaly/model/automlx.py b/ads/opctl/operator/lowcode/anomaly/model/automlx.py
@@ -26,8 +26,9 @@ class AutoMLXOperatorModel(AnomalyOperatorBaseModel):
     )
     def _build_model(self) -> pd.DataFrame:
         from automlx import init
+        import logging
         try:
-            init(engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}})
+            init(engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL)
         except Exception as e:
             logger.info("Ray already initialized")
         date_column = self.spec.datetime_column.name
diff --git a/ads/opctl/operator/lowcode/anomaly/operator_config.py b/ads/opctl/operator/lowcode/anomaly/operator_config.py
@@ -36,6 +36,21 @@ class TestData(InputData):
     """Class representing operator specification test data details."""
 
 
+@dataclass(repr=True)
+class PreprocessingSteps(DataClassSerializable):
+    """Class representing preprocessing steps for operator."""
+
+    missing_value_imputation: bool = True
+    outlier_treatment: bool = False
+
+
+@dataclass(repr=True)
+class DataPreprocessor(DataClassSerializable):
+    """Class representing operator specification preprocessing details."""
+
+    enabled: bool = True
+    steps: PreprocessingSteps = field(default_factory=PreprocessingSteps)
+
 @dataclass(repr=True)
 class AnomalyOperatorSpec(DataClassSerializable):
     """Class representing operator specification."""
@@ -74,7 +89,9 @@ def __post_init__(self):
             self.generate_inliers if self.generate_inliers is not None else False
         )
         self.model_kwargs = self.model_kwargs or dict()
-
+        self.preprocessing = (
+            self.preprocessing if self.preprocessing is not None else DataPreprocessor(enabled=True)
+        )
 
 @dataclass(repr=True)
 class AnomalyOperatorConfig(OperatorConfig):
diff --git a/ads/opctl/operator/lowcode/anomaly/schema.yaml b/ads/opctl/operator/lowcode/anomaly/schema.yaml
@@ -307,11 +307,23 @@ spec:
         description: "When provided, target_category_columns [list] indexes the data into multiple related datasets for anomaly detection"
 
     preprocessing:
-      type: boolean
+      type: dict
       required: false
-      default: true
-      meta:
-        description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+      schema:
+        enabled:
+          type: boolean
+          required: false
+          default: true
+          meta:
+            description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+        steps:
+          type: dict
+          required: false
+          schema:
+            missing_value_imputation:
+              type: boolean
+              required: false
+              default: true
 
     generate_report:
       type: boolean
diff --git a/ads/opctl/operator/lowcode/common/transformations.py b/ads/opctl/operator/lowcode/common/transformations.py
@@ -58,20 +58,26 @@ def run(self, data):
         clean_df = self._format_datetime_col(clean_df)
         clean_df = self._set_multi_index(clean_df)
 
-        if self.name == "historical_data":
-            try:
-                clean_df = self._missing_value_imputation_hist(clean_df)
-            except Exception as e:
-                logger.debug(f"Missing value imputation failed with {e.args}")
-            if self.preprocessing:
-                try:
-                    clean_df = self._outlier_treatment(clean_df)
-                except Exception as e:
-                    logger.debug(f"Outlier Treatment failed with {e.args}")
-            else:
-                logger.debug("Skipping outlier treatment as preprocessing is disabled")
-        elif self.name == "additional_data":
-            clean_df = self._missing_value_imputation_add(clean_df)
+        if self.preprocessing and self.preprocessing.enabled:
+            if self.name == "historical_data":
+                if self.preprocessing.steps.missing_value_imputation:
+                    try:
+                        clean_df = self._missing_value_imputation_hist(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Missing value imputation failed with {e.args}")
+                else:
+                    logger.info("Skipping missing value imputation because it is disabled")
+                if self.preprocessing.steps.outlier_treatment:
+                    try:
+                        clean_df = self._outlier_treatment(clean_df)
+                    except Exception as e:
+                        logger.debug(f"Outlier Treatment failed with {e.args}")
+                else:
+                    logger.info("Skipping outlier treatment because it is disabled")
+            elif self.name == "additional_data":
+                clean_df = self._missing_value_imputation_add(clean_df)
+        else:
+            logger.info("Skipping all preprocessing steps because preprocessing is disabled")
         return clean_df
 
     def _remove_trailing_whitespace(self, df):
diff --git a/ads/opctl/operator/lowcode/forecast/environment.yaml b/ads/opctl/operator/lowcode/forecast/environment.yaml
@@ -18,3 +18,4 @@ dependencies:
       - optuna==3.1.0
       - oracle-automlx==23.4.1
       - oracle-automlx[forecasting]==23.4.1
+      - fire
diff --git a/ads/opctl/operator/lowcode/forecast/model/automlx.py b/ads/opctl/operator/lowcode/forecast/model/automlx.py
@@ -76,17 +76,15 @@ def preprocess(self, data, series_id=None):  # TODO: re-use self.le for explanat
     )
     def _build_model(self) -> pd.DataFrame:
         from automlx import init
-        from sktime.forecasting.model_selection import temporal_train_test_split
+        import logging
         try:
-            init(engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}})
+            init(engine="ray", engine_opts={"ray_setup": {"_temp_dir": "/tmp/ray-temp"}}, loglevel=logging.CRITICAL)
         except Exception as e:
             logger.info("Ray already initialized")
 
-
         full_data_dict = self.datasets.get_data_by_series()
 
         self.models = dict()
-        date_column = self.spec.datetime_column.name
         horizon = self.spec.horizon
         self.spec.confidence_interval_width = self.spec.confidence_interval_width or 0.8
         self.forecast_output = ForecastOutput(
diff --git a/ads/opctl/operator/lowcode/forecast/operator_config.py b/ads/opctl/operator/lowcode/forecast/operator_config.py
@@ -29,6 +29,22 @@ class DateTimeColumn(DataClassSerializable):
     format: str = None
 
 
+@dataclass(repr=True)
+class PreprocessingSteps(DataClassSerializable):
+    """Class representing preprocessing steps for operator."""
+
+    missing_value_imputation: bool = True
+    outlier_treatment: bool = True
+
+
+@dataclass(repr=True)
+class DataPreprocessor(DataClassSerializable):
+    """Class representing operator specification preprocessing details."""
+
+    enabled: bool = True
+    steps: PreprocessingSteps = field(default_factory=PreprocessingSteps)
+
+
 @dataclass(repr=True)
 class Tuning(DataClassSerializable):
     """Class representing operator specification tuning details."""
@@ -54,7 +70,7 @@ class ForecastOperatorSpec(DataClassSerializable):
     global_explanation_filename: str = None
     local_explanation_filename: str = None
     target_column: str = None
-    preprocessing: bool = None
+    preprocessing: DataPreprocessor = field(default_factory=DataPreprocessor)
     datetime_column: DateTimeColumn = field(default_factory=DateTimeColumn)
     target_category_columns: List[str] = field(default_factory=list)
     generate_report: bool = None
@@ -79,7 +95,7 @@ def __post_init__(self):
         self.confidence_interval_width = self.confidence_interval_width or 0.80
         self.report_filename = self.report_filename or "report.html"
         self.preprocessing = (
-            self.preprocessing if self.preprocessing is not None else True
+            self.preprocessing if self.preprocessing is not None else DataPreprocessor(enabled=True)
         )
         # For Report Generation. When user doesn't specify defaults to True
         self.generate_report = (
diff --git a/ads/opctl/operator/lowcode/forecast/schema.yaml b/ads/opctl/operator/lowcode/forecast/schema.yaml
@@ -286,11 +286,27 @@ spec:
       default: target
 
     preprocessing:
-      type: boolean
+      type: dict
       required: false
-      default: true
-      meta:
-        description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+      schema:
+        enabled:
+          type: boolean
+          required: false
+          default: true
+          meta:
+            description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
+        steps:
+          type: dict
+          required: false
+          schema:
+            missing_value_imputation:
+              type: boolean
+              required: false
+              default: true
+            outlier_treatment:
+              type: boolean
+              required: false
+              default: true
 
     generate_explanations:
       type: boolean
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -79,6 +79,13 @@ Oracle Accelerated Data Science (ADS)
 
    modules
 
+.. admonition:: Introducing AI Quick Actions
+   :class: note
+
+   Deploy, fine-tune, and evaluate large language models such as `Mistral-7B-Instruct-v0.2 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2>`__, `CodeLlama-13b-Instruct-hf <https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf>`__, etc., with just a couple of clicks.
+
+   Visit `AI Quick Actions documentation <https://github.com/oracle-samples/oci-data-science-ai-samples/tree/main/ai-quick-actions>`__ for the latest information.
+  
 .. admonition:: Oracle Accelerated Data Science (ADS)
    :class: note
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -179,7 +179,7 @@ pii = [
   "spacy-transformers==1.2.5",
   "spacy==3.6.1",
 ]
-llm = ["langchain>=0.1.10", "evaluate>=0.4.0"]
+llm = ["langchain-community<0.0.32", "langchain>=0.1.10,<0.1.14", "evaluate>=0.4.0"]
 aqua = ["jupyter_server"]
 
 # To reduce backtracking (decrese deps install time) during test/dev env setup reducing number of versions pip is
diff --git a/test-requirements-operators.txt b/test-requirements-operators.txt
@@ -1,5 +1,5 @@
 -r test-requirements.txt
 -e ".[forecast]"
 -e ".[feature-store-marketplace]"
-darts
+darts>=0.28
 plotly
diff --git a/tests/operators/forecast/test_errors.py b/tests/operators/forecast/test_errors.py
@@ -32,6 +32,7 @@
 from ads.opctl.operator.cmd import run
 import os
 import json
+import math
 
 NUM_ROWS = 1000
 NUM_SERIES = 10
@@ -191,6 +192,7 @@ def populate_yaml(
     additional_data_path=None,
     test_data_path=None,
     output_data_path=None,
+    preprocessing=None,
 ):
     if historical_data_path is None:
         historical_data_path, additional_data_path, test_data_path = setup_rossman()
@@ -210,7 +212,8 @@ def populate_yaml(
     yaml_i["spec"]["datetime_column"]["name"] = "Date"
     yaml_i["spec"]["target_category_columns"] = ["Store"]
     yaml_i["spec"]["horizon"] = HORIZON
-
+    if preprocessing:
+        yaml_i["spec"]["preprocessing"] = preprocessing
     if generate_train_metrics:
         yaml_i["spec"]["generate_metrics"] = generate_train_metrics
     if model == "autots":
@@ -393,6 +396,7 @@ def test_0_series(operator_setup, model):
         historical_data_path=historical_data_path,
         additional_data_path=additional_data_path,
         test_data_path=test_data_path,
+        preprocessing={"enabled": False}
     )
     with pytest.raises(DataMismatchError):
         run_yaml(
@@ -450,6 +454,49 @@ def test_invalid_dates(operator_setup, model):
         )
 
 
+def test_disabling_outlier_treatment(operator_setup):
+    tmpdirname = operator_setup
+    NUM_ROWS = 100
+    hist_data_0 = pd.concat(
+        [
+            HISTORICAL_DATETIME_COL[: NUM_ROWS - HORIZON],
+            TARGET_COL[: NUM_ROWS - HORIZON],
+        ],
+        axis=1,
+    )
+    outliers = [1000, -800]
+    hist_data_0.at[40, 'Sales'] = outliers[0]
+    hist_data_0.at[75, 'Sales'] = outliers[1]
+    historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
+        tmpdirname, hist_data_0
+    )
+
+    yaml_i, output_data_path = populate_yaml(
+        tmpdirname=tmpdirname,
+        model="arima",
+        historical_data_path=historical_data_path
+    )
+    yaml_i["spec"].pop("target_category_columns")
+    yaml_i["spec"].pop("additional_data")
+
+    # running default pipeline where outlier will be treated
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_without_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")
+    input_vals_without_outlier = set(forecast_without_outlier['input_value'])
+    assert all(
+        item not in input_vals_without_outlier for item in outliers), "forecast file should not contain any outliers"
+
+    # switching off outlier_treatment
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
+    preprocessing = {"enabled": True, "steps": preprocessing_steps}
+    yaml_i["spec"]["preprocessing"] = preprocessing
+    run_yaml(tmpdirname=tmpdirname, yaml_i=yaml_i, output_data_path=output_data_path)
+    forecast_with_outlier = pd.read_csv(f"{tmpdirname}/results/forecast.csv")
+    input_vals_with_outlier = set(forecast_with_outlier['input_value'])
+    assert all(
+        item in input_vals_with_outlier for item in outliers), "forecast file should contain all the outliers"
+
+
 @pytest.mark.parametrize("model", MODELS)
 def test_2_series(operator_setup, model):
     # Test w and w/o add data
@@ -475,12 +522,14 @@ def split_df(df):
     historical_data_path, additional_data_path, test_data_path = setup_artificial_data(
         tmpdirname, hist_data, add_data, test_data
     )
+    preprocessing_steps = {"missing_value_imputation": True, "outlier_treatment": False}
     yaml_i, output_data_path = populate_yaml(
         tmpdirname=tmpdirname,
         model=model,
         historical_data_path=historical_data_path,
         additional_data_path=additional_data_path,
         test_data_path=test_data_path,
+        preprocessing={"enabled": True, "steps": preprocessing_steps}
     )
     with pytest.raises(DataMismatchError):
         # 4 columns in historical data, but only 1 cat col specified

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+* @darenr @mayoor @mrDzurb @VipulMascarenhas @qiuosier`
Original file line number	Diff line number	Diff line change
`@@ -464,7 +464,7 @@ def from_dict(`
`464`	`464`	`)`
`465`	`465`
`466`	`466`	`obj = cls(`
`467`		`- **{key: obj_dict.get(key) for key in allowed_fields if key in obj_dict}`
	`467`	`+ **{key: obj_dict.get(key) for key in allowed_fields}`
`468`	`468`	`)`
`469`	`469`
`470`	`470`	`for key, value in obj_dict.items():`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,6 @@`
`10`	`10`	`merge_category_columns,`
`11`	`11`	`)`
`12`	`12`	`from ads.opctl.operator.lowcode.common.data import AbstractData`
`13`		`-from ads.opctl.operator.lowcode.common.data import AbstractData`
`14`	`13`	`from ads.opctl.operator.lowcode.anomaly.utils import get_frequency_of_datetime`
`15`	`14`	`from ads.opctl import logger`
`16`	`15`	`import pandas as pd`