Skip to content

Commit d665017

Browse files
authored
set customisable thresholds for zenduty alerts (#75)
* set customisable thresholds for zenduty alerts * mention defaults * bump version * cleanup readme * fix
1 parent 89fdfd3 commit d665017

File tree

4 files changed

+39
-13
lines changed

4 files changed

+39
-13
lines changed

README.md

+9-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Observe Pyth on-chain price feeds and run sanity checks on the data.
44

55
## Usage
66

7-
Container images are available at https://gallery.ecr.aws/pyth-network/observer.
7+
Container images are available at https://github.com/pyth-network/pyth-observer/pkgs/container/pyth-observer
88

99
To run Observer locally, make sure you have a recent version of [Poetry](https://python-poetry.org) installed and run:
1010

@@ -38,6 +38,14 @@ Event types are configured via environment variables:
3838
- `ZENDUTY_INTEGRATION_KEY` - Integration key for Zenduty service API integration
3939
- `OPEN_ALERTS_FILE` - Path to local file used for persisting open alerts
4040

41+
### Zenduty Alert Thresholds
42+
- Zenduty alert will fire if a check fails 5 or more times within 5 minutes.
43+
- The alert will be resolved if the check failed 3 or fewer times within 5 minutes.
44+
- Checks run approximately once per minute.
45+
- These thresholds can be overridden per check type in config.yaml:
46+
- `zenduty_alert_threshold`: an alert is triggered when the number of failures in 5 minutes is >= this value (default: 5)
47+
- `zenduty_resolution_threshold`: the alert is resolved when the number of failures in 5 minutes is <= this value (default: 3)
48+
4149
## Finding the Telegram Group Chat ID
4250

4351
To integrate Telegram events with the Observer, you need the Telegram group chat ID. Here's how you can find it:

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ ignore_missing_imports = true
44

55
[tool.poetry]
66
name = "pyth-observer"
7-
version = "0.2.10"
7+
version = "0.2.11"
88
description = "Alerts and stuff"
99
authors = []
1010
readme = "README.md"

pyth_observer/dispatch.py

+22-11
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ async def run(self, states: List[State]):
8484
alert = self.open_alerts.get(alert_identifier)
8585
if alert is None:
8686
self.open_alerts[alert_identifier] = {
87+
"type": check.__class__.__name__,
8788
"window_start": current_time.isoformat(),
8889
"failures": 1,
8990
"last_window_failures": None,
@@ -175,21 +176,31 @@ async def process_zenduty_events(self, current_time):
175176

176177
for identifier, info in self.open_alerts.items():
177178
self.check_zd_alert_status(identifier, current_time)
178-
# Resolve the alert if raised and failed < 5 times in the last 5m window
179+
check_config = self.config["checks"]["global"][info["type"]]
180+
alert_threshold = check_config.get("zenduty_alert_threshold", 5)
181+
resolution_threshold = check_config.get("zenduty_resolution_threshold", 3)
182+
# Resolve the alert if it failed <= $threshold times in the last 5m window
# (Zenduty is only notified if the alert was actually sent).
183+
resolved = False
179184
if (
180-
info["sent"]
181-
and info["last_window_failures"] is not None
182-
and info["last_window_failures"] < 5
185+
info["last_window_failures"] is not None
186+
and info["last_window_failures"] <= resolution_threshold
183187
):
184188
logger.debug(f"Resolving Zenduty alert {identifier}")
185-
response = await send_zenduty_alert(
186-
identifier, identifier, resolved=True
187-
)
188-
if response and 200 <= response.status < 300:
189+
resolved = True
190+
if info["sent"]:
191+
response = await send_zenduty_alert(
192+
identifier, identifier, resolved=True
193+
)
194+
if response and 200 <= response.status < 300:
195+
to_remove.append(identifier)
196+
else:
189197
to_remove.append(identifier)
190-
# Raise alert if failed > 5 times within the last 5m window
191-
# re-alert every 5 minutes
192-
elif info["failures"] >= 5 and (
198+
# Raise alert if failed >= $threshold times within the last 5m window
199+
# or if already alerted and not yet resolved.
200+
# Re-alert every 5 minutes but not more often.
201+
elif (
202+
info["failures"] >= alert_threshold or (info["sent"] and not resolved)
203+
) and (
193204
not info.get("last_alert")
194205
or current_time - datetime.fromisoformat(info["last_alert"])
195206
> timedelta(minutes=5)

sample.config.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,16 @@ events:
1111
# - DatadogEvent
1212
- LogEvent
1313
# - TelegramEvent
14+
- ZendutyEvent
1415
checks:
1516
global:
1617
# Price feed checks
1718
PriceFeedOfflineCheck:
1819
enable: true
1920
max_slot_distance: 25
2021
abandoned_slot_distance: 100000
22+
zenduty_alert_threshold: 3
23+
zenduty_resolution_threshold: 0
2124
PriceFeedCoinGeckoCheck:
2225
enable: true
2326
max_deviation: 5
@@ -44,11 +47,15 @@ checks:
4447
enable: true
4548
max_slot_distance: 25
4649
max_aggregate_distance: 6
50+
zenduty_alert_threshold: 5
51+
zenduty_resolution_threshold: 2
4752
PublisherStalledCheck:
4853
enable: false
4954
stall_time_limit: 30
5055
abandoned_time_limit: 600
5156
max_slot_distance: 25
57+
zenduty_alert_threshold: 1
58+
zenduty_resolution_threshold: 0
5259
# Per-symbol config
5360
Crypto.MNGO/USD:
5461
PriceFeedOfflineCheck:

0 commit comments

Comments
 (0)