Skip to content

Commit fc05985

Browse files
authored
adding all original columns in anomaly outputs (#708)
2 parents 703b296 + 28ece26 commit fc05985

File tree

4 files changed

+54
-18
lines changed

4 files changed

+54
-18
lines changed

ads/opctl/operator/lowcode/anomaly/model/anomaly_dataset.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def __init__(self, spec: AnomalyOperatorSpec):
5555
self.X_valid_dict = self.valid_data.X_valid_dict
5656
self.y_valid_dict = self.valid_data.y_valid_dict
5757

58+
# Returns raw data based on the series_id, i.e., the merged target_category_column value
59+
def get_raw_data_by_cat(self, category):
60+
return self._data.get_raw_data_by_cat(category)
61+
5862

5963
class AnomalyOutput:
6064
def __init__(self, date_column):
@@ -93,38 +97,28 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
9397
outliers = pd.merge(outliers, scores, on=self.date_column, how="inner")
9498
return outliers
9599

96-
def get_inliers(self, data):
100+
def get_inliers(self, datasets):
97101
inliers = pd.DataFrame()
98102

99103
for category in self.list_categories():
100104
inliers = pd.concat(
101105
[
102106
inliers,
103-
self.get_inliers_by_cat(
104-
category,
105-
data[data[OutputColumns.Series] == category]
106-
.reset_index(drop=True)
107-
.drop(OutputColumns.Series, axis=1),
108-
),
107+
self.get_inliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
109108
],
110109
axis=0,
111110
ignore_index=True,
112111
)
113112
return inliers
114113

115-
def get_outliers(self, data):
114+
def get_outliers(self, datasets):
116115
outliers = pd.DataFrame()
117116

118117
for category in self.list_categories():
119118
outliers = pd.concat(
120119
[
121120
outliers,
122-
self.get_outliers_by_cat(
123-
category,
124-
data[data[OutputColumns.Series] == category]
125-
.reset_index(drop=True)
126-
.drop(OutputColumns.Series, axis=1),
127-
),
121+
self.get_outliers_by_cat(category, datasets.get_raw_data_by_cat(category)),
128122
],
129123
axis=0,
130124
ignore_index=True,

ads/opctl/operator/lowcode/anomaly/model/base_model.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -272,15 +272,15 @@ def _save_report(
272272
f2.write(f1.read())
273273

274274
if self.spec.generate_inliers:
275-
inliers = anomaly_output.get_inliers(self.datasets.data)
275+
inliers = anomaly_output.get_inliers(self.datasets)
276276
write_data(
277277
data=inliers,
278278
filename=os.path.join(unique_output_dir, self.spec.inliers_filename),
279279
format="csv",
280280
storage_options=storage_options,
281281
)
282282

283-
outliers = anomaly_output.get_outliers(self.datasets.data)
283+
outliers = anomaly_output.get_outliers(self.datasets)
284284
write_data(
285285
data=outliers,
286286
filename=os.path.join(unique_output_dir, self.spec.outliers_filename),

ads/opctl/operator/lowcode/common/data.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
DataMismatchError,
1717
)
1818
from abc import ABC
19+
import pandas as pd
1920

2021

2122
class AbstractData(ABC):
@@ -26,6 +27,19 @@ def __init__(self, spec: dict, name="input_data"):
2627
self.name = name
2728
self.load_transform_ingest_data(spec)
2829

30+
def get_raw_data_by_cat(self, category):
31+
mapping = self._data_transformer.get_target_category_columns_map()
32+
# For a given category, the mapping gives the target_category_columns and their values.
33+
# condition filters raw_data based on the values of target_category_columns for the given category
34+
condition = pd.Series(True, index=self.raw_data.index)
35+
if category in mapping:
36+
for col, val in mapping[category].items():
37+
condition &= (self.raw_data[col] == val)
38+
data_by_cat = self.raw_data[condition].reset_index(drop=True)
39+
data_by_cat = self._data_transformer._format_datetime_col(data_by_cat)
40+
return data_by_cat
41+
42+
2943
def get_dict_by_series(self):
3044
if not self._data_dict:
3145
for s_id in self.list_series_ids():
@@ -73,8 +87,8 @@ def _transform_data(self, spec, raw_data, **kwargs):
7387
return data
7488

7589
def load_transform_ingest_data(self, spec):
76-
raw_data = self._load_data(getattr(spec, self.name))
77-
self.data = self._transform_data(spec, raw_data)
90+
self.raw_data = self._load_data(getattr(spec, self.name))
91+
self.data = self._transform_data(spec, self.raw_data)
7892
self._ingest_data(spec)
7993

8094
def _ingest_data(self, spec):

ads/opctl/operator/lowcode/common/transformations.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,19 @@ def _remove_trailing_whitespace(self, df):
8484
return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
8585

8686
def _set_series_id_column(self, df):
87+
self._target_category_columns_map = dict()
8788
if not self.target_category_columns:
8889
df[DataColumns.Series] = "Series 1"
8990
self.has_artificial_series = True
9091
else:
9192
df[DataColumns.Series] = merge_category_columns(
9293
df, self.target_category_columns
9394
)
95+
merged_values = df[DataColumns.Series].unique().tolist()
96+
if self.target_category_columns:
97+
for value in merged_values:
98+
self._target_category_columns_map[value] = df[df[DataColumns.Series] == value][self.target_category_columns].drop_duplicates().iloc[0].to_dict()
99+
94100
df = df.drop(self.target_category_columns, axis=1)
95101
return df
96102

@@ -195,3 +201,25 @@ def _check_historical_dataset(self, df):
195201
raise DataMismatchError(
196202
f"Expected {self.name} to have columns: {expected_names}, but instead found column names: {df.columns}. Is the {self.name} path correct?"
197203
)
204+
205+
"""
206+
Map from each merged target category column value to its target category columns and their values
207+
If target category columns are PPG_Code, Class, Num
208+
Merged target category column values are Product Category 1__A__1, Product Category 2__A__2
209+
Then target_category_columns_map would be
210+
{
211+
"Product Category 1__A__1": {
212+
"PPG_Code": "Product Category 1",
213+
"Class": "A",
214+
"Num": 1
215+
},
216+
"Product Category 2__A__2": {
217+
"PPG_Code": "Product Category 2",
218+
"Class": "A",
219+
"Num": 2
220+
},
221+
222+
}
223+
"""
224+
def get_target_category_columns_map(self):
225+
return self._target_category_columns_map

0 commit comments

Comments
 (0)