
Commit c5b5921

sync with main
2 parents 5b4d2d3 + 8cf0d7c commit c5b5921

File tree

13 files changed: +217 additions, -52 deletions


THIRD_PARTY_LICENSES.txt

Lines changed: 13 additions & 1 deletion
@@ -72,6 +72,12 @@ fastavro
 * Source code: https://github.com/fastavro/fastavro
 * Project home: https://github.com/fastavro/fastavro

+fiona
+* Copyright (c) 2007, Sean C. Gillies
+* License: BSD 3-Clause "New" or "Revised" License
+* Source code: https://github.com/Toblerity/Fiona
+* Project home: https://github.com/Toblerity/Fiona
+
 folium
 * Copyright (C) 2013, Rob Story
 * License: MIT License

@@ -459,7 +465,13 @@ pydantic
 * Source code: https://github.com/pydantic/pydantic
 * Project home: https://docs.pydantic.dev/latest/

-=======
+rrcf
+* Copyright 2018 kLabUM
+* License: MIT License
+* Source code: https://github.com/kLabUM/rrcf
+* Project home: https://github.com/kLabUM/rrcf
+
+
 =============================== Licenses ===============================
 ------------------------------------------------------------------------

ads/opctl/conda/cmds.py

Lines changed: 11 additions & 11 deletions
@@ -181,29 +181,29 @@ def _create(
     logger.info(
         f"Preparing manifest. Manifest in the environment: {conda_dep.get('manifest')}"
     )
-    manifest = _fetch_manifest_template()
+    manifest_template = _fetch_manifest_template()
     if "name" not in manifest:
-        manifest["manifest"]["name"] = name
-        manifest["manifest"]["slug"] = slug
+        manifest_template["manifest"]["name"] = name
+        manifest_template["manifest"]["slug"] = slug
     if "type" not in manifest:
         logger.info("Setting manifest to published")
-        manifest["manifest"]["type"] = "published"
+        manifest_template["manifest"]["type"] = "published"
     if "version" not in manifest:
-        manifest["manifest"]["version"] = version
-        manifest["manifest"]["arch_type"] = "GPU" if gpu else "CPU"
+        manifest_template["manifest"]["version"] = version
+        manifest_template["manifest"]["arch_type"] = "GPU" if gpu else "CPU"

-    manifest["manifest"]["create_date"] = datetime.utcnow().strftime(
+    manifest_template["manifest"]["create_date"] = datetime.utcnow().strftime(
         "%a, %b %d, %Y, %H:%M:%S %Z UTC"
     )

     if not "manifest_version" in manifest:
-        manifest["manifest"]["manifest_version"] = "1.0"
+        manifest_template["manifest"]["manifest_version"] = "1.0"

     logger.info(f"Creating conda environment {slug}")
     manifest_dict = {
-        k: manifest["manifest"][k]
-        for k in manifest["manifest"]
-        if manifest["manifest"][k]
+        k: manifest_template["manifest"][k]
+        for k in manifest_template["manifest"]
+        if manifest_template["manifest"][k]
     }
     if "manifest" in conda_dep:
         conda_dep["manifest"].update(manifest_dict)
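
For context, a minimal self-contained sketch of the filter-and-merge pattern this hunk renames: only truthy values from the fetched template are copied into manifest_dict and merged into the environment's existing manifest. The template contents and the shape of conda_dep below are illustrative assumptions, not the actual output of _fetch_manifest_template().

    # Sketch only: toy manifest template and conda_dep (assumed shapes).
    manifest_template = {
        "manifest": {
            "name": "mycondaenv",
            "slug": "mycondaenv_v1_0",
            "type": "published",
            "version": "1.0",
            "arch_type": "CPU",
            "create_date": None,        # falsy values are dropped below
            "manifest_version": "1.0",
        }
    }
    conda_dep = {"manifest": {"channels": ["conda-forge"]}}

    # Keep only keys with truthy values, then merge into the environment manifest.
    manifest_dict = {
        k: manifest_template["manifest"][k]
        for k in manifest_template["manifest"]
        if manifest_template["manifest"][k]
    }
    conda_dep["manifest"].update(manifest_dict)
    print(conda_dep["manifest"])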

ads/opctl/operator/lowcode/anomaly/const.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):

     OneClassSVM = "oneclasssvm"
     IsolationForest = "isolationforest"
+    RandomCutForest = "randomcutforest"
     # TODO : Add DBScan
     # DBScan = "dbscan"

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 58 additions & 37 deletions
@@ -16,7 +16,11 @@

 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
+from ads.opctl.operator.lowcode.anomaly.const import (
+    SUBSAMPLE_THRESHOLD,
+    OutputColumns,
+    SupportedMetrics,
+)
 from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
 from ads.opctl.operator.lowcode.common.utils import (
     disable_print,

@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
     def generate_report(self):
         """Generates the report."""
         import matplotlib.pyplot as plt
+        plt.rcParams.update({'figure.max_open_warning': 0})
         import report_creator as rc

         start_time = time.time()

@@ -87,43 +92,59 @@ def generate_report(self):
             self.spec.datetime_column.name if self.spec.datetime_column else "index"
         )

+        (
+            model_description,
+            other_sections,
+        ) = self._generate_report()
+
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
-            figure_blocks = []
-            time_col = df[date_column].reset_index(drop=True)
-            anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-                OutputColumns.ANOMALY_COL
-            ]
-            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-            downsampled_time_col = time_col
-            selected_indices = list(range(len(time_col)))
-            if self.spec.subsample_report_data:
-                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-                # Downsample non-anomalous data if it exceeds the threshold (1000)
-                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-                    selected_indices.sort()
-                    downsampled_time_col = time_col[selected_indices]
-
-            columns = set(df.columns).difference({date_column})
-            for col in columns:
-                y = df[col].reset_index(drop=True)
-
-                downsampled_y = y[selected_indices]
-
-                fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-                ax.grid()
-                ax.plot(downsampled_time_col, downsampled_y, color="black")
-                # Plot anomalies
-                for i in anomaly_indices:
-                    ax.scatter(time_col[i], y[i], color="red", marker="o")
-                plt.xlabel(date_column)
-                plt.ylabel(col)
-                plt.title(f"`{col}` with reference to anomalies")
-                figure_blocks.append(rc.Widget(ax))
-
-            blocks.append(rc.Group(*figure_blocks, label=target))
+            if target in anomaly_output.list_categories():
+                figure_blocks = []
+                time_col = df[date_column].reset_index(drop=True)
+                anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+                    OutputColumns.ANOMALY_COL
+                ]
+                anomaly_indices = [
+                    i for i, index in enumerate(anomaly_col) if index == 1
+                ]
+                downsampled_time_col = time_col
+                selected_indices = list(range(len(time_col)))
+                if self.spec.subsample_report_data:
+                    non_anomaly_indices = [
+                        i for i in range(len(time_col)) if i not in anomaly_indices
+                    ]
+                    # Downsample non-anomalous data if it exceeds the threshold (1000)
+                    if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                        downsampled_non_anomaly_indices = non_anomaly_indices[
+                            :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+                        ]
+                        selected_indices = (
+                            anomaly_indices + downsampled_non_anomaly_indices
+                        )
+                        selected_indices.sort()
+                        downsampled_time_col = time_col[selected_indices]
+
+                columns = set(df.columns).difference({date_column})
+                for col in columns:
+                    y = df[col].reset_index(drop=True)
+
+                    downsampled_y = y[selected_indices]
+
+                    fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+                    ax.grid()
+                    ax.plot(downsampled_time_col, downsampled_y, color="black")
+                    # Plot anomalies
+                    for i in anomaly_indices:
+                        ax.scatter(time_col[i], y[i], color="red", marker="o")
+                    plt.xlabel(date_column)
+                    plt.ylabel(col)
+                    plt.title(f"`{col}` with reference to anomalies")
+                    figure_blocks.append(rc.Widget(ax))
+            else:
+                figure_blocks = None
+
+            blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
         plots = rc.Select(blocks)

         report_sections = []

@@ -133,7 +154,7 @@ def generate_report(self):
         yaml_appendix = rc.Yaml(self.config.to_dict())
         summary = rc.Block(
             rc.Group(
-                rc.Text(f"You selected the **`{self.spec.model}`** model."),
+                rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
                 rc.Text(
                     "Based on your dataset, you could have also selected "
                     f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."

ads/opctl/operator/lowcode/anomaly/model/factory.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 from .base_model import AnomalyOperatorBaseModel
 from .isolationforest import IsolationForestOperatorModel
 from .oneclasssvm import OneClassSVMOperatorModel
+from .randomcutforest import RandomCutForestOperatorModel


 class UnSupportedModelError(Exception):

@@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
     _NonTime_MAP = {
         NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
         NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
+        NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
         # TODO: Add DBScan model for non time based anomaly
         # NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
     }
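
The factory change simply registers the new class in _NonTime_MAP. A hypothetical, self-contained illustration of this name-to-class dispatch (the get_model helper and the placeholder classes below are invented for the sketch, not the factory's actual API):

    # Placeholder classes standing in for the operator model classes.
    class OneClassSVMOperatorModel: ...
    class IsolationForestOperatorModel: ...
    class RandomCutForestOperatorModel: ...

    _NON_TIME_MAP = {
        "oneclasssvm": OneClassSVMOperatorModel,
        "isolationforest": IsolationForestOperatorModel,
        "randomcutforest": RandomCutForestOperatorModel,
    }

    def get_model(model_name: str):
        """Return the model class registered for the given config string."""
        try:
            return _NON_TIME_MAP[model_name.lower()]
        except KeyError as e:
            raise ValueError(f"Unsupported non-time-series model: {model_name}") from e

    print(get_model("randomcutforest").__name__)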
ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+from ads.opctl import logger
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+from .anomaly_dataset import AnomalyOutput
+from .base_model import AnomalyOperatorBaseModel
+
+
+class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
+    """
+    Class representing Random Cut Forest Anomaly Detection operator model.
+    """
+
+    @runtime_dependency(
+        module="rrcf",
+        err_msg=(
+            "Please run `pip install rrcf` to "
+            "install the required dependencies for RandomCutForest."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        from rrcf import RCTree
+
+        model_kwargs = self.spec.model_kwargs
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        # Set tree parameters
+        num_trees = model_kwargs.get("num_trees", 200)
+        shingle_size = model_kwargs.get("shingle_size", None)
+        anomaly_threshold = model_kwargs.get("anamoly_threshold", 95)
+
+        for target, df in self.datasets.full_data_dict.items():
+            try:
+                if df.shape[0] == 1:
+                    raise ValueError("Dataset size must be greater than 1")
+                df_values = df[self.spec.target_column].astype(float).values
+
+                cal_shingle_size = (
+                    shingle_size
+                    if shingle_size
+                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
+                )
+                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
+
+                sample_size_range = (1, points.shape[0])
+                n = points.shape[0]
+                avg_codisp = pd.Series(0.0, index=np.arange(n))
+                index = np.zeros(n)
+
+                forest = []
+                while len(forest) < num_trees:
+                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
+                    trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
+                    forest.extend(trees)
+
+                for tree in forest:
+                    codisp = pd.Series(
+                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
+                    )
+                    avg_codisp[codisp.index] += codisp
+                    np.add.at(index, codisp.index.values, 1)
+
+                avg_codisp /= index
+                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
+                avg_codisp = (avg_codisp - avg_codisp.min()) / (
+                    avg_codisp.max() - avg_codisp.min()
+                )
+
+                y_pred = (
+                    avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
+                ).astype(int)
+
+                index_col = df.columns[0]
+
+                anomaly = pd.DataFrame(
+                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
+                ).reset_index(drop=True)
+                score = pd.DataFrame(
+                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
+                ).reset_index(drop=True)
+
+                anomaly_output.add_output(target, anomaly, score)
+            except Exception as e:
+                logger.warn(f"Encountered Error: {e}. Skipping series {target}.")
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Generates the report."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
+            " It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
+        )
+
+        return (
+            model_description,
+            other_sections,
+        )
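
For readers unfamiliar with rrcf, here is a minimal standalone sketch of the same batch workflow _build_model follows: shingle a series, fit random cut trees on sampled rows, and average collusive displacement (CoDisp) as the anomaly score. The synthetic data, tree count, and sample size below are illustrative choices, not the operator defaults.

    import numpy as np
    import pandas as pd
    import rrcf

    rng = np.random.default_rng(0)
    series = np.sin(np.linspace(0, 20 * np.pi, 730)) + rng.normal(0, 0.1, 730)
    series[500:505] += 4  # inject an obvious anomaly

    shingle_size = 4
    points = np.vstack(list(rrcf.shingle(series, size=shingle_size)))
    n = points.shape[0]

    num_trees, tree_size = 100, 128  # illustrative values
    avg_codisp = pd.Series(0.0, index=np.arange(n))
    hits = np.zeros(n)

    for _ in range(num_trees):
        ix = rng.choice(n, size=min(tree_size, n), replace=False)
        tree = rrcf.RCTree(points[ix], index_labels=ix)
        codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(hits, codisp.index.values, 1)

    avg_codisp /= np.where(hits == 0, 1, hits)  # avoid division by zero
    is_anomaly = avg_codisp > np.percentile(avg_codisp, 95)
    print(f"{int(is_anomaly.sum())} of {n} shingles flagged as anomalous")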

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 1 addition & 0 deletions
@@ -363,6 +363,7 @@ spec:
         - auto
         - oneclasssvm
         - isolationforest
+        - randomcutforest
       meta:
         description: "The model to be used for anomaly detection"


docs/source/release_notes.rst

Lines changed: 6 additions & 0 deletions
@@ -2,6 +2,12 @@
 Release Notes
 =============

+2.11.18
+-------
+Release date: September 20, 2024
+
+* Added ``with_artifact()`` in the ``ContainerRuntime`` class to support running a container job with an additional artifact.
+
 2.11.17
 -------
 Release date: August 9, 2024
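
A minimal sketch of the builder call described in the 2.11.18 note; the image URI and artifact path are placeholders, and no job is actually created or run here:

    from ads.jobs import ContainerRuntime

    runtime = (
        ContainerRuntime()
        .with_image("<region>.ocir.io/<your_tenancy>/<your_image>")
        .with_artifact("<path/to/artifact>")  # a single file, or a directory (uploaded as a zip)
    )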

docs/source/user_guide/jobs/run_container.rst

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ Here is an example to create and run a container job:

 To configure ``ContainerRuntime``, you must specify the container ``image``.
 Similar to other runtime, you can add environment variables.
-You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container.
+You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container. You may also add an additional artifact (a file or a directory) if needed. Note that if you add a directory, it will be compressed as a zip file under `/home/datascience`, and you will need to unzip it in your container.

 See also:

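As the added sentence notes, a directory artifact arrives inside the container as a zip under /home/datascience. A small sketch of unpacking it at container start-up (the artifact name "my_code" is hypothetical):

    import zipfile
    from pathlib import Path

    # Hypothetical: a directory artifact named "my_code" lands as /home/datascience/my_code.zip.
    zip_path = Path("/home/datascience/my_code.zip")
    extract_dir = zip_path.with_suffix("")  # /home/datascience/my_code

    if zip_path.exists():
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(extract_dir)
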
docs/source/user_guide/jobs/tabs/container_runtime.rst

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,7 @@
         .with_environment_variable(GREETINGS="Welcome to OCI Data Science")
         .with_entrypoint(["/bin/sh", "-c"])
         .with_cmd("sleep 5 && echo $GREETINGS")
+        .artifact("<path/to/artifact>")
     )
 )

@@ -69,6 +70,7 @@
       - name: GREETINGS
         value: Welcome to OCI Data Science
       image: <region>.ocir.io/<your_tenancy>/<your_image>
+      scriptPathURI: path/to/artifact


 .. code-block:: python
