
Commit d114ba3

Author: Minh Khue Tran (committed)
Merge commit with 2 parents: 3170bd3 + 3018a3d

File tree

9 files changed: +7871 -9 lines changed


Deliverables/documentation/Flatline Filter - Real Time Data Ingestion Platform.htm

Lines changed: 7619 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+ ::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.prediction.arima

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+ ::: src.sdk.python.rtdip_sdk.pipelines.data_quality.data_manipulation.spark.prediction.auto_arima

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+ ::: src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average

mkdocs.yml

Lines changed: 22 additions & 5 deletions
@@ -236,18 +236,35 @@ nav:
   - Deploy:
       - Databricks: sdk/code-reference/pipelines/deploy/databricks.md
   - Data Quality:
-      - Monitoring:
+      - Monitoring:
           - Check Value Ranges: sdk/code-reference/pipelines/data_quality/monitoring/spark/check_value_ranges.md
-          - Great Expectations:
-              - Data Quality Monitoring: sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md
+          - Great Expectations:
+              - Data Quality Monitoring: sdk/code-reference/pipelines/data_quality/monitoring/spark/great_expectations.md
           - Flatline Detection: sdk/code-reference/pipelines/data_quality/monitoring/spark/flatline_detection.md
           - Identify Missing Data:
               - Interval Based: sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_interval.md
               - Pattern Based: sdk/code-reference/pipelines/data_quality/monitoring/spark/identify_missing_data_pattern.md
+          - Moving Average: sdk/code-reference/pipelines/data_quality/monitoring/spark/moving_average.md
       - Data Manipulation:
           - Duplicate Detection: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/duplicate_detection.md
-          - Filter Out of Range Values: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/delete_out_of_range_values.md
+          - Filter Out of Range Values: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/out_of_range_value_filter.md
           - Flatline Filter: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/flatline_filter.md
+          - Dimensionality Reduction: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/dimensionality_reduction.md
+          - Interval Filtering: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/interval_filtering.md
+          - K-Sigma Anomaly Detection: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/k_sigma_anomaly_detection.md
+          - Missing Value Imputation: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/missing_value_imputation.md
+          - Normalization:
+              - Normalization: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization.md
+              - Normalization Mean: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_mean.md
+              - Normalization MinMax: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_minmax.md
+              - Normalization ZScore: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/normalization/normalization_zscore.md
+          - Prediction:
+              - Arima: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/prediction/arima.md
+              - Auto Arima: sdk/code-reference/pipelines/data_quality/data_manipulation/spark/prediction/auto_arima.md
+  - Machine Learning:
+      - Data Binning: sdk/code-reference/pipelines/machine_learning/spark/data_binning.md
+      - Linear Regression: sdk/code-reference/pipelines/machine_learning/spark/linear_regression.md
+
   - Jobs: sdk/pipelines/jobs.md
   - Deploy:
       - Databricks Workflows: sdk/pipelines/deploy/databricks.md
@@ -339,4 +356,4 @@ nav:
   - blog/index.md
   - University:
       - University: university/overview.md
-
+

src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/prediction/arima.py

Lines changed: 1 addition & 2 deletions
@@ -99,7 +99,6 @@ class ArimaPrediction(DataManipulationBaseInterface, InputValidator):
         timestamp_name (str): Name of column, where event timestamps are stored
         source_name (str): Name of column in source-based format, where source of events are stored
         status_name (str): Name of column in source-based format, where status of events are stored
-        # Options for ARIMA
         external_regressor_names (List[str]): Currently not working. Names of the columns with data to use for prediction, but not extend
         number_of_data_points_to_predict (int): Amount of points to forecast
         number_of_data_points_to_analyze (int): Amount of most recent points to train on
@@ -319,7 +318,7 @@ def filter(self) -> PySparkDataFrame:
         value imputation to prevent learning on dirty data.

         Returns:
-            DataFrame: A PySpark DataFrame with forcasted value entries depending on constructor parameters.
+            DataFrame: A PySpark DataFrame with forecasted value entries depending on constructor parameters.
         """
         # expected_scheme = StructType(
         #     [

src/sdk/python/rtdip_sdk/pipelines/data_quality/data_manipulation/spark/prediction/auto_arima.py

Lines changed: 0 additions & 2 deletions
@@ -77,8 +77,6 @@ class ArimaAutoPrediction(ArimaPrediction):
         number_of_data_points_to_predict (int): Amount of points to forecast
         number_of_data_points_to_analyze (int): Amount of most recent points to train on
         seasonal (bool): Setting for AutoArima, is past_data seasonal?
-        # Options for ARIMA
-        trend (str): ARIMA-Specific setting
         enforce_stationarity (bool): ARIMA-Specific setting
         enforce_invertibility (bool): ARIMA-Specific setting
         concentrate_scale (bool): ARIMA-Specific setting
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
import logging

from pyspark.sql import DataFrame as PySparkDataFrame
from pyspark.sql.functions import col, avg
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    TimestampType,
    FloatType,
)

from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.interfaces import (
    MonitoringBaseInterface,
)
from src.sdk.python.rtdip_sdk.pipelines._pipeline_utils.models import (
    Libraries,
    SystemType,
)
from ...input_validator import InputValidator


class MovingAverage(MonitoringBaseInterface, InputValidator):
    """
    Computes and logs the moving average over a specified window size for a given PySpark DataFrame.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to process.
        window_size (int): The size of the moving window.

    Example:
        ```python
        from pyspark.sql import SparkSession
        from rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average import MovingAverage

        spark = SparkSession.builder.master("local[1]").appName("MovingAverageExample").getOrCreate()

        data = [
            ("A2PS64V0J.:ZUX09R", "2024-01-02 03:49:45.000", "Good", 1.0),
            ("A2PS64V0J.:ZUX09R", "2024-01-02 07:53:11.000", "Good", 2.0),
            ("A2PS64V0J.:ZUX09R", "2024-01-02 11:56:42.000", "Good", 3.0),
            ("A2PS64V0J.:ZUX09R", "2024-01-02 16:00:12.000", "Good", 4.0),
            ("A2PS64V0J.:ZUX09R", "2024-01-02 20:03:46.000", "Good", 5.0),
        ]

        columns = ["TagName", "EventTime", "Status", "Value"]

        df = spark.createDataFrame(data, columns)

        moving_avg = MovingAverage(
            df=df,
            window_size=3,
        )

        moving_avg.check()
        ```
    """

    df: PySparkDataFrame
    window_size: int
    EXPECTED_SCHEMA = StructType(
        [
            StructField("TagName", StringType(), True),
            StructField("EventTime", TimestampType(), True),
            StructField("Status", StringType(), True),
            StructField("Value", FloatType(), True),
        ]
    )

    def __init__(
        self,
        df: PySparkDataFrame,
        window_size: int,
    ) -> None:
        if not isinstance(window_size, int) or window_size <= 0:
            raise ValueError("window_size must be a positive integer.")

        self.df = df
        self.validate(self.EXPECTED_SCHEMA)
        self.window_size = window_size

        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    @staticmethod
    def system_type():
        """
        Attributes:
            SystemType (Environment): Requires PYSPARK
        """
        return SystemType.PYSPARK

    @staticmethod
    def libraries():
        libraries = Libraries()
        return libraries

    @staticmethod
    def settings() -> dict:
        return {}

    def check(self) -> None:
        """
        Computes and logs the moving average using a specified window size.
        """

        self._validate_inputs()

        window_spec = (
            Window.partitionBy("TagName")
            .orderBy("EventTime")
            .rowsBetween(-(self.window_size - 1), 0)
        )

        self.logger.info("Computing moving averages:")

        for row in (
            self.df.withColumn("MovingAverage", avg(col("Value")).over(window_spec))
            .select("TagName", "EventTime", "Value", "MovingAverage")
            .collect()
        ):
            self.logger.info(
                f"Tag: {row.TagName}, Time: {row.EventTime}, Value: {row.Value}, Moving Avg: {row.MovingAverage}"
            )

    def _validate_inputs(self):
        if not isinstance(self.window_size, int) or self.window_size <= 0:
            raise ValueError("window_size must be a positive integer.")
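
Note on the new check: `rowsBetween(-(self.window_size - 1), 0)` defines a trailing window per `TagName`, ordered by `EventTime`, i.e. the current row plus at most `window_size - 1` preceding rows, so the first rows of each tag are averaged over fewer points. The following minimal sketch (not part of the commit; plain Python over the same values as the docstring example) reproduces the averages asserted in the test file below:

```python
# Minimal sketch, not part of this commit: reproduces the trailing moving
# average that MovingAverage.check() computes via
# Window.partitionBy("TagName").orderBy("EventTime").rowsBetween(-(window_size - 1), 0).
values = [1.0, 2.0, 3.0, 4.0, 5.0]  # the "Value" column of one tag, ordered by EventTime
window_size = 3

moving_averages = []
for i in range(len(values)):
    # current row plus up to window_size - 1 preceding rows
    window = values[max(0, i - window_size + 1): i + 1]
    moving_averages.append(sum(window) / len(window))

print(moving_averages)  # [1.0, 1.5, 2.0, 3.0, 4.0] -- matches the expected logs in the tests below
```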
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
import pytest
import os
from pyspark.sql import SparkSession
from src.sdk.python.rtdip_sdk.pipelines.data_quality.monitoring.spark.moving_average import (
    MovingAverage,
)
import logging
from io import StringIO


@pytest.fixture(scope="session")
def spark():
    spark = (
        SparkSession.builder.master("local[2]")
        .appName("MovingAverageTest")
        .getOrCreate()
    )
    yield spark
    spark.stop()


@pytest.fixture
def log_capture():
    log_stream = StringIO()
    logger = logging.getLogger("MovingAverage")
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(log_stream)
    formatter = logging.Formatter("%(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    yield log_stream
    logger.removeHandler(handler)
    handler.close()


def test_moving_average_basic(spark, log_capture):
    df = spark.createDataFrame(
        [
            ("Tag1", "2024-01-02 03:49:45.000", "Good", 1.0),
            ("Tag1", "2024-01-02 07:53:11.000", "Good", 2.0),
            ("Tag1", "2024-01-02 11:56:42.000", "Good", 3.0),
            ("Tag1", "2024-01-02 16:00:12.000", "Good", 4.0),
            ("Tag1", "2024-01-02 20:03:46.000", "Good", 5.0),
        ],
        ["TagName", "EventTime", "Status", "Value"],
    )

    detector = MovingAverage(df, window_size=3)
    detector.check()

    expected_logs = [
        "Computing moving averages:",
        "Tag: Tag1, Time: 2024-01-02 03:49:45, Value: 1.0, Moving Avg: 1.0",
        "Tag: Tag1, Time: 2024-01-02 07:53:11, Value: 2.0, Moving Avg: 1.5",
        "Tag: Tag1, Time: 2024-01-02 11:56:42, Value: 3.0, Moving Avg: 2.0",
        "Tag: Tag1, Time: 2024-01-02 16:00:12, Value: 4.0, Moving Avg: 3.0",
        "Tag: Tag1, Time: 2024-01-02 20:03:46, Value: 5.0, Moving Avg: 4.0",
    ]

    actual_logs = log_capture.getvalue().strip().split("\n")

    assert len(expected_logs) == len(
        actual_logs
    ), f"Expected {len(expected_logs)} logs, got {len(actual_logs)}"

    for expected, actual in zip(expected_logs, actual_logs):
        assert expected in actual, f"Expected: '{expected}', got: '{actual}'"


def test_moving_average_invalid_window_size(spark):
    df = spark.createDataFrame(
        [
            ("Tag1", "2024-01-02 03:49:45.000", "Good", 1.0),
            ("Tag1", "2024-01-02 07:53:11.000", "Good", 2.0),
        ],
        ["TagName", "EventTime", "Status", "Value"],
    )

    with pytest.raises(ValueError, match="window_size must be a positive integer."):
        MovingAverage(df, window_size=-2)


def test_large_dataset(spark):
    base_path = os.path.dirname(__file__)
    file_path = os.path.join(base_path, "../../test_data.csv")
    df = spark.read.option("header", "true").csv(file_path)

    assert df.count() > 0, "DataFrame was not loaded."

    detector = MovingAverage(df, window_size=5)
    detector.check()
