
Commit c5b5921

sync with main
2 parents 5b4d2d3 + 8cf0d7c commit c5b5921

File tree

13 files changed: +217 additions, -52 deletions


THIRD_PARTY_LICENSES.txt

Lines changed: 13 additions & 1 deletion
@@ -72,6 +72,12 @@ fastavro
 * Source code: https://github.com/fastavro/fastavro
 * Project home: https://github.com/fastavro/fastavro

+fiona
+* Copyright (c) 2007, Sean C. Gillies
+* License: BSD 3-Clause "New" or "Revised" License
+* Source code: https://github.com/Toblerity/Fiona
+* Project home: https://github.com/Toblerity/Fiona
+
 folium
 * Copyright (C) 2013, Rob Story
 * License: MIT License

@@ -459,7 +465,13 @@ pydantic
 * Source code: https://github.com/pydantic/pydantic
 * Project home: https://docs.pydantic.dev/latest/

-=======
+rrcf
+* Copyright 2018 kLabUM
+* License: MIT License
+* Source code: https://github.com/kLabUM/rrcf
+* Project home: https://github.com/kLabUM/rrcf
+
+
 =============================== Licenses ===============================
 ------------------------------------------------------------------------

ads/opctl/conda/cmds.py

Lines changed: 11 additions & 11 deletions
@@ -181,29 +181,29 @@ def _create(
     logger.info(
         f"Preparing manifest. Manifest in the environment: {conda_dep.get('manifest')}"
     )
-    manifest = _fetch_manifest_template()
+    manifest_template = _fetch_manifest_template()
     if "name" not in manifest:
-        manifest["manifest"]["name"] = name
-        manifest["manifest"]["slug"] = slug
+        manifest_template["manifest"]["name"] = name
+        manifest_template["manifest"]["slug"] = slug
     if "type" not in manifest:
         logger.info("Setting manifest to published")
-        manifest["manifest"]["type"] = "published"
+        manifest_template["manifest"]["type"] = "published"
     if "version" not in manifest:
-        manifest["manifest"]["version"] = version
-        manifest["manifest"]["arch_type"] = "GPU" if gpu else "CPU"
+        manifest_template["manifest"]["version"] = version
+        manifest_template["manifest"]["arch_type"] = "GPU" if gpu else "CPU"

-    manifest["manifest"]["create_date"] = datetime.utcnow().strftime(
+    manifest_template["manifest"]["create_date"] = datetime.utcnow().strftime(
         "%a, %b %d, %Y, %H:%M:%S %Z UTC"
     )

     if not "manifest_version" in manifest:
-        manifest["manifest"]["manifest_version"] = "1.0"
+        manifest_template["manifest"]["manifest_version"] = "1.0"

     logger.info(f"Creating conda environment {slug}")
     manifest_dict = {
-        k: manifest["manifest"][k]
-        for k in manifest["manifest"]
-        if manifest["manifest"][k]
+        k: manifest_template["manifest"][k]
+        for k in manifest_template["manifest"]
+        if manifest_template["manifest"][k]
     }
     if "manifest" in conda_dep:
         conda_dep["manifest"].update(manifest_dict)
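
For context, a minimal self-contained sketch of the filter-and-merge pattern this hunk renames: only truthy values from the fetched template are copied into manifest_dict and merged into the environment's existing manifest. The template contents and the shape of conda_dep below are illustrative assumptions, not the actual output of _fetch_manifest_template().

    # Sketch only: toy manifest template and conda_dep (assumed shapes).
    manifest_template = {
        "manifest": {
            "name": "mycondaenv",
            "slug": "mycondaenv_v1_0",
            "type": "published",
            "version": "1.0",
            "arch_type": "CPU",
            "create_date": None,        # falsy values are dropped below
            "manifest_version": "1.0",
        }
    }
    conda_dep = {"manifest": {"channels": ["conda-forge"]}}

    # Keep only keys with truthy values, then merge into the environment manifest.
    manifest_dict = {
        k: manifest_template["manifest"][k]
        for k in manifest_template["manifest"]
        if manifest_template["manifest"][k]
    }
    conda_dep["manifest"].update(manifest_dict)
    print(conda_dep["manifest"])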

ads/opctl/operator/lowcode/anomaly/const.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ class NonTimeADSupportedModels(str, metaclass=ExtendedEnumMeta):

     OneClassSVM = "oneclasssvm"
     IsolationForest = "isolationforest"
+    RandomCutForest = "randomcutforest"
     # TODO : Add DBScan
     # DBScan = "dbscan"

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 58 additions & 37 deletions
@@ -16,7 +16,11 @@

 from ads.common.object_storage_details import ObjectStorageDetails
 from ads.opctl import logger
-from ads.opctl.operator.lowcode.anomaly.const import OutputColumns, SupportedMetrics, SUBSAMPLE_THRESHOLD
+from ads.opctl.operator.lowcode.anomaly.const import (
+    SUBSAMPLE_THRESHOLD,
+    OutputColumns,
+    SupportedMetrics,
+)
 from ads.opctl.operator.lowcode.anomaly.utils import _build_metrics_df, default_signer
 from ads.opctl.operator.lowcode.common.utils import (
     disable_print,

@@ -55,6 +59,7 @@ def __init__(self, config: AnomalyOperatorConfig, datasets: AnomalyDatasets):
     def generate_report(self):
         """Generates the report."""
         import matplotlib.pyplot as plt
+        plt.rcParams.update({'figure.max_open_warning': 0})
         import report_creator as rc

         start_time = time.time()

@@ -87,43 +92,59 @@ def generate_report(self):
             self.spec.datetime_column.name if self.spec.datetime_column else "index"
         )

+        (
+            model_description,
+            other_sections,
+        ) = self._generate_report()
+
         blocks = []
         for target, df in self.datasets.full_data_dict.items():
-            figure_blocks = []
-            time_col = df[date_column].reset_index(drop=True)
-            anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
-                OutputColumns.ANOMALY_COL
-            ]
-            anomaly_indices = [i for i, index in enumerate(anomaly_col) if index == 1]
-            downsampled_time_col = time_col
-            selected_indices = list(range(len(time_col)))
-            if self.spec.subsample_report_data:
-                non_anomaly_indices = [i for i in range(len(time_col)) if i not in anomaly_indices]
-                # Downsample non-anomalous data if it exceeds the threshold (1000)
-                if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
-                    downsampled_non_anomaly_indices = non_anomaly_indices[::len(non_anomaly_indices)//SUBSAMPLE_THRESHOLD]
-                    selected_indices = anomaly_indices + downsampled_non_anomaly_indices
-                    selected_indices.sort()
-                    downsampled_time_col = time_col[selected_indices]
-
-            columns = set(df.columns).difference({date_column})
-            for col in columns:
-                y = df[col].reset_index(drop=True)
-
-                downsampled_y = y[selected_indices]
-
-                fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
-                ax.grid()
-                ax.plot(downsampled_time_col, downsampled_y, color="black")
-                # Plot anomalies
-                for i in anomaly_indices:
-                    ax.scatter(time_col[i], y[i], color="red", marker="o")
-                plt.xlabel(date_column)
-                plt.ylabel(col)
-                plt.title(f"`{col}` with reference to anomalies")
-                figure_blocks.append(rc.Widget(ax))
-
-            blocks.append(rc.Group(*figure_blocks, label=target))
+            if target in anomaly_output.list_categories():
+                figure_blocks = []
+                time_col = df[date_column].reset_index(drop=True)
+                anomaly_col = anomaly_output.get_anomalies_by_cat(category=target)[
+                    OutputColumns.ANOMALY_COL
+                ]
+                anomaly_indices = [
+                    i for i, index in enumerate(anomaly_col) if index == 1
+                ]
+                downsampled_time_col = time_col
+                selected_indices = list(range(len(time_col)))
+                if self.spec.subsample_report_data:
+                    non_anomaly_indices = [
+                        i for i in range(len(time_col)) if i not in anomaly_indices
+                    ]
+                    # Downsample non-anomalous data if it exceeds the threshold (1000)
+                    if len(non_anomaly_indices) > SUBSAMPLE_THRESHOLD:
+                        downsampled_non_anomaly_indices = non_anomaly_indices[
+                            :: len(non_anomaly_indices) // SUBSAMPLE_THRESHOLD
+                        ]
+                        selected_indices = (
+                            anomaly_indices + downsampled_non_anomaly_indices
+                        )
+                        selected_indices.sort()
+                        downsampled_time_col = time_col[selected_indices]
+
+                columns = set(df.columns).difference({date_column})
+                for col in columns:
+                    y = df[col].reset_index(drop=True)
+
+                    downsampled_y = y[selected_indices]
+
+                    fig, ax = plt.subplots(figsize=(8, 3), layout="constrained")
+                    ax.grid()
+                    ax.plot(downsampled_time_col, downsampled_y, color="black")
+                    # Plot anomalies
+                    for i in anomaly_indices:
+                        ax.scatter(time_col[i], y[i], color="red", marker="o")
+                    plt.xlabel(date_column)
+                    plt.ylabel(col)
+                    plt.title(f"`{col}` with reference to anomalies")
+                    figure_blocks.append(rc.Widget(ax))
+            else:
+                figure_blocks = None
+
+            blocks.append(rc.Group(*figure_blocks, label=target)) if figure_blocks else None
         plots = rc.Select(blocks)

         report_sections = []

@@ -133,7 +154,7 @@ def generate_report(self):
         yaml_appendix = rc.Yaml(self.config.to_dict())
         summary = rc.Block(
             rc.Group(
-                rc.Text(f"You selected the **`{self.spec.model}`** model."),
+                rc.Text(f"You selected the **`{self.spec.model}`** model.\n{model_description.text}\n"),
                 rc.Text(
                     "Based on your dataset, you could have also selected "
                     f"any of the models: `{'`, `'.join(SupportedModels.keys() if self.spec.datetime_column else NonTimeADSupportedModels.keys())}`."

ads/opctl/operator/lowcode/anomaly/model/factory.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@
 from .base_model import AnomalyOperatorBaseModel
 from .isolationforest import IsolationForestOperatorModel
 from .oneclasssvm import OneClassSVMOperatorModel
+from .randomcutforest import RandomCutForestOperatorModel


 class UnSupportedModelError(Exception):

@@ -52,6 +53,7 @@ class AnomalyOperatorModelFactory:
     _NonTime_MAP = {
         NonTimeADSupportedModels.OneClassSVM: OneClassSVMOperatorModel,
         NonTimeADSupportedModels.IsolationForest: IsolationForestOperatorModel,
+        NonTimeADSupportedModels.RandomCutForest: RandomCutForestOperatorModel,
         # TODO: Add DBScan model for non time based anomaly
         # NonTimeADSupportedModels.DBScan: DBScanOperatorModel,
     }
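
The factory change simply registers the new class in _NonTime_MAP. A hypothetical, self-contained illustration of this name-to-class dispatch (the get_model helper and the placeholder classes below are invented for the sketch, not the factory's actual API):

    # Placeholder classes standing in for the operator model classes.
    class OneClassSVMOperatorModel: ...
    class IsolationForestOperatorModel: ...
    class RandomCutForestOperatorModel: ...

    _NON_TIME_MAP = {
        "oneclasssvm": OneClassSVMOperatorModel,
        "isolationforest": IsolationForestOperatorModel,
        "randomcutforest": RandomCutForestOperatorModel,
    }

    def get_model(model_name: str):
        """Return the model class registered for the given config string."""
        try:
            return _NON_TIME_MAP[model_name.lower()]
        except KeyError as e:
            raise ValueError(f"Unsupported non-time-series model: {model_name}") from e

    print(get_model("randomcutforest").__name__)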
ads/opctl/operator/lowcode/anomaly/model/randomcutforest.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2023, 2024 Oracle and/or its affiliates.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
+
+import numpy as np
+import pandas as pd
+
+from ads.common.decorator.runtime_dependency import runtime_dependency
+from ads.opctl import logger
+from ads.opctl.operator.lowcode.anomaly.const import OutputColumns
+
+from .anomaly_dataset import AnomalyOutput
+from .base_model import AnomalyOperatorBaseModel
+
+
+class RandomCutForestOperatorModel(AnomalyOperatorBaseModel):
+    """
+    Class representing Random Cut Forest Anomaly Detection operator model.
+    """
+
+    @runtime_dependency(
+        module="rrcf",
+        err_msg=(
+            "Please run `pip install rrcf` to "
+            "install the required dependencies for RandomCutForest."
+        ),
+    )
+    def _build_model(self) -> AnomalyOutput:
+        from rrcf import RCTree
+
+        model_kwargs = self.spec.model_kwargs
+
+        anomaly_output = AnomalyOutput(date_column="index")
+
+        # Set tree parameters
+        num_trees = model_kwargs.get("num_trees", 200)
+        shingle_size = model_kwargs.get("shingle_size", None)
+        anomaly_threshold = model_kwargs.get("anamoly_threshold", 95)
+
+        for target, df in self.datasets.full_data_dict.items():
+            try:
+                if df.shape[0] == 1:
+                    raise ValueError("Dataset size must be greater than 1")
+                df_values = df[self.spec.target_column].astype(float).values
+
+                cal_shingle_size = (
+                    shingle_size
+                    if shingle_size
+                    else int(2 ** np.floor(np.log2(df.shape[0])) / 2)
+                )
+                points = np.vstack(list(rrcf.shingle(df_values, size=cal_shingle_size)))
+
+                sample_size_range = (1, points.shape[0])
+                n = points.shape[0]
+                avg_codisp = pd.Series(0.0, index=np.arange(n))
+                index = np.zeros(n)
+
+                forest = []
+                while len(forest) < num_trees:
+                    ixs = np.random.choice(n, size=sample_size_range, replace=False)
+                    trees = [rrcf.RCTree(points[ix], index_labels=ix) for ix in ixs]
+                    forest.extend(trees)
+
+                for tree in forest:
+                    codisp = pd.Series(
+                        {leaf: tree.codisp(leaf) for leaf in tree.leaves}
+                    )
+                    avg_codisp[codisp.index] += codisp
+                    np.add.at(index, codisp.index.values, 1)
+
+                avg_codisp /= index
+                avg_codisp.index = df.iloc[(cal_shingle_size - 1) :].index
+                avg_codisp = (avg_codisp - avg_codisp.min()) / (
+                    avg_codisp.max() - avg_codisp.min()
+                )
+
+                y_pred = (
+                    avg_codisp > np.percentile(avg_codisp, anomaly_threshold)
+                ).astype(int)
+
+                index_col = df.columns[0]
+
+                anomaly = pd.DataFrame(
+                    {index_col: y_pred.index, OutputColumns.ANOMALY_COL: y_pred}
+                ).reset_index(drop=True)
+                score = pd.DataFrame(
+                    {"index": avg_codisp.index, OutputColumns.SCORE_COL: avg_codisp}
+                ).reset_index(drop=True)
+
+                anomaly_output.add_output(target, anomaly, score)
+            except Exception as e:
+                logger.warn(f"Encountered Error: {e}. Skipping series {target}.")
+
+        return anomaly_output
+
+    def _generate_report(self):
+        """Generates the report."""
+        import report_creator as rc
+
+        other_sections = [
+            rc.Heading("Selected Models Overview", level=2),
+            rc.Text(
+                "The following tables provide information regarding the chosen model."
+            ),
+        ]
+
+        model_description = rc.Text(
+            "The Random Cut Forest (RCF) is an unsupervised machine learning algorithm that is used for anomaly detection."
+            " It works by building an ensemble of binary trees (random cut trees) and using them to compute anomaly scores for data points."
+        )
+
+        return (
+            model_description,
+            other_sections,
+        )
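
For readers unfamiliar with rrcf, here is a minimal standalone sketch of the same batch workflow _build_model follows: shingle a series, fit random cut trees on sampled rows, and average collusive displacement (CoDisp) as the anomaly score. The synthetic data, tree count, and sample size below are illustrative choices, not the operator defaults.

    import numpy as np
    import pandas as pd
    import rrcf

    rng = np.random.default_rng(0)
    series = np.sin(np.linspace(0, 20 * np.pi, 730)) + rng.normal(0, 0.1, 730)
    series[500:505] += 4  # inject an obvious anomaly

    shingle_size = 4
    points = np.vstack(list(rrcf.shingle(series, size=shingle_size)))
    n = points.shape[0]

    num_trees, tree_size = 100, 128  # illustrative values
    avg_codisp = pd.Series(0.0, index=np.arange(n))
    hits = np.zeros(n)

    for _ in range(num_trees):
        ix = rng.choice(n, size=min(tree_size, n), replace=False)
        tree = rrcf.RCTree(points[ix], index_labels=ix)
        codisp = pd.Series({leaf: tree.codisp(leaf) for leaf in tree.leaves})
        avg_codisp[codisp.index] += codisp
        np.add.at(hits, codisp.index.values, 1)

    avg_codisp /= np.where(hits == 0, 1, hits)  # avoid division by zero
    is_anomaly = avg_codisp > np.percentile(avg_codisp, 95)
    print(f"{int(is_anomaly.sum())} of {n} shingles flagged as anomalous")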

ads/opctl/operator/lowcode/anomaly/schema.yaml

Lines changed: 1 addition & 0 deletions
@@ -363,6 +363,7 @@ spec:
         - auto
         - oneclasssvm
         - isolationforest
+        - randomcutforest
       meta:
         description: "The model to be used for anomaly detection"


docs/source/release_notes.rst

Lines changed: 6 additions & 0 deletions
@@ -2,6 +2,12 @@
 Release Notes
 =============

+2.11.18
+-------
+Release date: September 20, 2024
+
+* Added ``with_artifact()`` in the ``ContainerRuntime`` class to support running a container job with an additional artifact.
+
 2.11.17
 -------
 Release date: August 9, 2024
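
A minimal sketch of the builder call described in the 2.11.18 note; the image URI and artifact path are placeholders, and no job is actually created or run here:

    from ads.jobs import ContainerRuntime

    runtime = (
        ContainerRuntime()
        .with_image("<region>.ocir.io/<your_tenancy>/<your_image>")
        .with_artifact("<path/to/artifact>")  # a single file, or a directory (uploaded as a zip)
    )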

docs/source/user_guide/jobs/run_container.rst

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ Here is an example to create and run a container job:

 To configure ``ContainerRuntime``, you must specify the container ``image``.
 Similar to other runtime, you can add environment variables.
-You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container.
+You can optionally specify the `entrypoint`, `cmd`, `image_digest` and `image_signature_id` for running the container. You may also add an additional artifact (a file or a directory) if needed. Note that if you add a directory, it will be compressed as a zip file under `/home/datascience`, and you will need to unzip it in your container.

 See also:

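As the added sentence notes, a directory artifact arrives inside the container as a zip under /home/datascience. A small sketch of unpacking it at container start-up (the artifact name "my_code" is hypothetical):

    import zipfile
    from pathlib import Path

    # Hypothetical: a directory artifact named "my_code" lands as /home/datascience/my_code.zip.
    zip_path = Path("/home/datascience/my_code.zip")
    extract_dir = zip_path.with_suffix("")  # /home/datascience/my_code

    if zip_path.exists():
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(extract_dir)
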
docs/source/user_guide/jobs/tabs/container_runtime.rst

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,7 @@
         .with_environment_variable(GREETINGS="Welcome to OCI Data Science")
         .with_entrypoint(["/bin/sh", "-c"])
         .with_cmd("sleep 5 && echo $GREETINGS")
+        .artifact("<path/to/artifact>")
     )
 )

@@ -69,6 +70,7 @@
       - name: GREETINGS
         value: Welcome to OCI Data Science
       image: <region>.ocir.io/<your_tenancy>/<your_image>
+      scriptPathURI: path/to/artifact


 .. code-block:: python
