Skip to content

Commit d04075b

Browse files
committed
Merge branch 'feature/repository_maintanance' into ODSC-54710/python311_312_support
2 parents a2f4fd9 + fc05985 commit d04075b

22 files changed

+4404
-430
lines changed

ads/dataset/label_encoder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def fit(self, X: "pandas.DataFrame"):
5252
5353
"""
5454
for column in X.columns:
55-
if X[column].dtype.name in ["object", "category"]:
55+
if X[column].dtype.name in ["object", "category", "bool"]:
5656
X[column] = X[column].astype(str)
5757
self.label_encoders[column] = LabelEncoder()
5858
self.label_encoders[column].fit(X[column])

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

+8-15
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
merge_category_columns,
1111
)
1212
from ads.opctl.operator.lowcode.common.data import AbstractData
13-
from ads.opctl.operator.lowcode.common.data import AbstractData
1413
from ads.opctl.operator.lowcode.anomaly.utils import get_frequency_of_datetime
1514
from ads.opctl import logger
1615
import pandas as pd
@@ -56,6 +55,10 @@ def __init__(self, spec: AnomalyOperatorSpec):
5655
self.X_valid_dict = self.valid_data.X_valid_dict
5756
self.y_valid_dict = self.valid_data.y_valid_dict
5857

58+
# Returns raw data based on the series_id, i.e., the merged target_category_column value
59+
def get_raw_data_by_cat(self, category):
60+
return self._data.get_raw_data_by_cat(category)
61+
5962

6063
class AnomalyOutput:
6164
def __init__(self, date_column):
@@ -94,38 +97,28 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
9497
outliers = pd.merge(outliers, scores, on=self.date_column, how="inner")
9598
return outliers
9699

97-
def get_inliers(self, data):
100+
def get_inliers(self, datasets):
98101
inliers = pd.DataFrame()
99102

100103
for category in self.list_categories():
101104
inliers = pd.concat(
102105
[
103106
inliers,
104-
self.get_inliers_by_cat(
105-
category,
106-
data[data[OutputColumns.Series] == category]
107-
.reset_index(drop=True)
108-
.drop(OutputColumns.Series, axis=1),
109-
),
107+
self.get_inliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
110108
],
111109
axis=0,
112110
ignore_index=True,
113111
)
114112
return inliers
115113

116-
def get_outliers(self, data):
114+
def get_outliers(self, datasets):
117115
outliers = pd.DataFrame()
118116

119117
for category in self.list_categories():
120118
outliers = pd.concat(
121119
[
122120
outliers,
123-
self.get_outliers_by_cat(
124-
category,
125-
data[data[OutputColumns.Series] == category]
126-
.reset_index(drop=True)
127-
.drop(OutputColumns.Series, axis=1),
128-
),
121+
self.get_outliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
129122
],
130123
axis=0,
131124
ignore_index=True,

ads/opctl/operator/lowcode/anomaly/model/base_model.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -272,15 +272,15 @@ def _save_report(
272272
f2.write(f1.read())
273273

274274
if self.spec.generate_inliers:
275-
inliers = anomaly_output.get_inliers(self.datasets.data)
275+
inliers = anomaly_output.get_inliers(self.datasets)
276276
write_data(
277277
data=inliers,
278278
filename=os.path.join(unique_output_dir, self.spec.inliers_filename),
279279
format="csv",
280280
storage_options=storage_options,
281281
)
282282

283-
outliers = anomaly_output.get_outliers(self.datasets.data)
283+
outliers = anomaly_output.get_outliers(self.datasets)
284284
write_data(
285285
data=outliers,
286286
filename=os.path.join(unique_output_dir, self.spec.outliers_filename),

ads/opctl/operator/lowcode/anomaly/operator_config.py

+18-1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,21 @@ class TestData(InputData):
3636
"""Class representing operator specification test data details."""
3737

3838

39+
@dataclass(repr=True)
40+
class PreprocessingSteps(DataClassSerializable):
41+
"""Class representing preprocessing steps for operator."""
42+
43+
missing_value_imputation: bool = True
44+
outlier_treatment: bool = False
45+
46+
47+
@dataclass(repr=True)
48+
class DataPreprocessor(DataClassSerializable):
49+
"""Class representing operator specification preprocessing details."""
50+
51+
enabled: bool = True
52+
steps: PreprocessingSteps = field(default_factory=PreprocessingSteps)
53+
3954
@dataclass(repr=True)
4055
class AnomalyOperatorSpec(DataClassSerializable):
4156
"""Class representing operator specification."""
@@ -74,7 +89,9 @@ def __post_init__(self):
7489
self.generate_inliers if self.generate_inliers is not None else False
7590
)
7691
self.model_kwargs = self.model_kwargs or dict()
77-
92+
self.preprocessing = (
93+
self.preprocessing if self.preprocessing is not None else DataPreprocessor(enabled=True)
94+
)
7895

7996
@dataclass(repr=True)
8097
class AnomalyOperatorConfig(OperatorConfig):

ads/opctl/operator/lowcode/anomaly/schema.yaml

+16-4
Original file line numberDiff line numberDiff line change
@@ -307,11 +307,23 @@ spec:
307307
description: "When provided, target_category_columns [list] indexes the data into multiple related datasets for anomaly detection"
308308

309309
preprocessing:
310-
type: boolean
310+
type: dict
311311
required: false
312-
default: true
313-
meta:
314-
description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
312+
schema:
313+
enabled:
314+
type: boolean
315+
required: false
316+
default: true
317+
meta:
318+
description: "preprocessing and feature engineering can be disabled using this flag, Defaults to true"
319+
steps:
320+
type: dict
321+
required: false
322+
schema:
323+
missing_value_imputation:
324+
type: boolean
325+
required: false
326+
default: true
315327

316328
generate_report:
317329
type: boolean

ads/opctl/operator/lowcode/common/data.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
DataMismatchError,
1717
)
1818
from abc import ABC
19+
import pandas as pd
1920

2021

2122
class AbstractData(ABC):
@@ -26,6 +27,19 @@ def __init__(self, spec: dict, name="input_data"):
2627
self.name = name
2728
self.load_transform_ingest_data(spec)
2829

30+
def get_raw_data_by_cat(self, category):
31+
mapping = self._data_transformer.get_target_category_columns_map()
32+
# For the given category, mapping gives the target_category_columns and its values.
33+
# condition filters raw_data based on the values of target_category_columns for the given category
34+
condition = pd.Series(True, index=self.raw_data.index)
35+
if category in mapping:
36+
for col, val in mapping[category].items():
37+
condition &= (self.raw_data[col] == val)
38+
data_by_cat = self.raw_data[condition].reset_index(drop=True)
39+
data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
40+
return data_by_cat
41+
42+
2943
def get_dict_by_series(self):
3044
if not self._data_dict:
3145
for s_id in self.list_series_ids():
@@ -73,8 +87,8 @@ def _transform_data(self, spec, raw_data, **kwargs):
7387
return data
7488

7589
def load_transform_ingest_data(self, spec):
76-
raw_data = self._load_data(getattr(spec, self.name))
77-
self.data = self._transform_data(spec, raw_data)
90+
self.raw_data = self._load_data(getattr(spec, self.name))
91+
self.data = self._transform_data(spec, self.raw_data)
7892
self._ingest_data(spec)
7993

8094
def _ingest_data(self, spec):

ads/opctl/operator/lowcode/common/transformations.py

+48-14
Original file line numberDiff line numberDiff line change
@@ -58,33 +58,45 @@ def run(self, data):
5858
clean_df = self._format_datetime_col(clean_df)
5959
clean_df = self._set_multi_index(clean_df)
6060

61-
if self.name == "historical_data":
62-
try:
63-
clean_df = self._missing_value_imputation_hist(clean_df)
64-
except Exception as e:
65-
logger.debug(f"Missing value imputation failed with {e.args}")
66-
if self.preprocessing:
67-
try:
68-
clean_df = self._outlier_treatment(clean_df)
69-
except Exception as e:
70-
logger.debug(f"Outlier Treatment failed with {e.args}")
71-
else:
72-
logger.debug("Skipping outlier treatment as preprocessing is disabled")
73-
elif self.name == "additional_data":
74-
clean_df = self._missing_value_imputation_add(clean_df)
61+
if self.preprocessing and self.preprocessing.enabled:
62+
if self.name == "historical_data":
63+
if self.preprocessing.steps.missing_value_imputation:
64+
try:
65+
clean_df = self._missing_value_imputation_hist(clean_df)
66+
except Exception as e:
67+
logger.debug(f"Missing value imputation failed with {e.args}")
68+
else:
69+
logger.info("Skipping missing value imputation because it is disabled")
70+
if self.preprocessing.steps.outlier_treatment:
71+
try:
72+
clean_df = self._outlier_treatment(clean_df)
73+
except Exception as e:
74+
logger.debug(f"Outlier Treatment failed with {e.args}")
75+
else:
76+
logger.info("Skipping outlier treatment because it is disabled")
77+
elif self.name == "additional_data":
78+
clean_df = self._missing_value_imputation_add(clean_df)
79+
else:
80+
logger.info("Skipping all preprocessing steps because preprocessing is disabled")
7581
return clean_df
7682

7783
def _remove_trailing_whitespace(self, df):
7884
return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
7985

8086
def _set_series_id_column(self, df):
87+
self._target_category_columns_map = dict()
8188
if not self.target_category_columns:
8289
df[DataColumns.Series] = "Series 1"
8390
self.has_artificial_series = True
8491
else:
8592
df[DataColumns.Series] = merge_category_columns(
8693
df, self.target_category_columns
8794
)
95+
merged_values = df[DataColumns.Series].unique().tolist()
96+
if self.target_category_columns:
97+
for value in merged_values:
98+
self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
99+
88100
df = df.drop(self.target_category_columns, axis=1)
89101
return df
90102

@@ -189,3 +201,25 @@ def _check_historical_dataset(self, df):
189201
raise DataMismatchError(
190202
f"Expected {self.name} to have columns: {expected_names}, but instead found column names: {df.columns}. Is the {self.name} path correct?"
191203
)
204+
205+
"""
206+
Map between merged target category column values and target category column and its value
207+
If target category columns are PPG_Code, Class, Num
208+
Merged target category column values are Product Category 1__A__1, Product Category 2__A__2
209+
Then target_category_columns_map would be
210+
{
211+
"Product Category 1__A__1": {
212+
"PPG_Code": "Product Category 1",
213+
"Class": "A",
214+
"Num": 1
215+
},
216+
"Product Category 2__A__2": {
217+
"PPG_Code": "Product Category 2",
218+
"Class": "A",
219+
"Num": 2
220+
},
221+
222+
}
223+
"""
224+
def get_target_category_columns_map(self):
225+
return self._target_category_columns_map

ads/opctl/operator/lowcode/forecast/model/arima.py

+21-12
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def __init__(self, config: ForecastOperatorConfig, datasets: ForecastDatasets):
2929
self.local_explanation = {}
3030
self.formatted_global_explanation = None
3131
self.formatted_local_explanation = None
32+
self.constant_cols = {}
3233

3334
def set_kwargs(self):
3435
# Extract the Confidence Interval Width and convert to arima's equivalent - alpha
@@ -64,6 +65,10 @@ def _train_model(self, i, s_id, df, model_kwargs):
6465
try:
6566
target = self.original_target_column
6667
self.forecast_output.init_series_output(series_id=s_id, data_at_series=df)
68+
# If trend is constant, remove constant columns
69+
if 'trend' not in model_kwargs or model_kwargs['trend'] == 'c':
70+
self.constant_cols[s_id] = df.columns[df.nunique() == 1]
71+
df = df.drop(columns=self.constant_cols[s_id])
6772

6873
# format the dataframe for this target. Dropping NA on target[df] will remove all future data
6974
data = self.preprocess(df, s_id)
@@ -74,7 +79,7 @@ def _train_model(self, i, s_id, df, model_kwargs):
7479
X_in = data_i.drop(target, axis=1) if len(data_i.columns) > 1 else None
7580
X_pred = self.get_horizon(data).drop(target, axis=1)
7681

77-
if self.loaded_models is not None:
82+
if self.loaded_models is not None and s_id in self.loaded_models:
7883
model = self.loaded_models[s_id]
7984
else:
8085
# Build and fit model
@@ -143,17 +148,18 @@ def _build_model(self) -> pd.DataFrame:
143148
def _generate_report(self):
144149
"""The method that needs to be implemented on the particular model level."""
145150
import datapane as dp
146-
147-
sec5_text = dp.Text(f"## ARIMA Model Parameters")
148-
blocks = [
149-
dp.HTML(
150-
m.summary().as_html(),
151-
label=s_id,
152-
)
153-
for i, (s_id, m) in enumerate(self.models.items())
154-
]
155-
sec5 = dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]
156-
all_sections = [sec5_text, sec5]
151+
all_sections = []
152+
if len(self.models) > 0:
153+
sec5_text = dp.Text(f"## ARIMA Model Parameters")
154+
blocks = [
155+
dp.HTML(
156+
m.summary().as_html(),
157+
label=s_id,
158+
)
159+
for i, (s_id, m) in enumerate(self.models.items())
160+
]
161+
sec5 = dp.Select(blocks=blocks) if len(blocks) > 1 else blocks[0]
162+
all_sections = [sec5_text, sec5]
157163

158164
if self.spec.generate_explanations:
159165
try:
@@ -239,6 +245,9 @@ def _custom_predict(
239245
"""
240246
data: ForecastDatasets.get_data_at_series(s_id)
241247
"""
248+
if series_id in self.constant_cols:
249+
data = data.drop(columns=self.constant_cols[series_id])
250+
242251
data = data.drop([target_col], axis=1)
243252
data[dt_column_name] = seconds_to_datetime(
244253
data[dt_column_name], dt_format=self.spec.datetime_column.format

0 commit comments

Comments
 (0)