@@ -84,6 +84,7 @@ async def run(self, states: List[State]):
84
84
alert = self .open_alerts .get (alert_identifier )
85
85
if alert is None :
86
86
self .open_alerts [alert_identifier ] = {
87
+ "type" : check .__class__ .__name__ ,
87
88
"window_start" : current_time .isoformat (),
88
89
"failures" : 1 ,
89
90
"last_window_failures" : None ,
@@ -175,21 +176,31 @@ async def process_zenduty_events(self, current_time):
175
176
176
177
for identifier , info in self .open_alerts .items ():
177
178
self .check_zd_alert_status (identifier , current_time )
178
- # Resolve the alert if raised and failed < 5 times in the last 5m window
179
+ check_config = self .config ["checks" ]["global" ][info ["type" ]]
180
+ alert_threshold = check_config .get ("zenduty_alert_threshold" , 5 )
181
+ resolution_threshold = check_config .get ("zenduty_resolution_threshold" , 3 )
182
+ # Resolve the alert if it failed <= resolution_threshold times in the last 5m window (Zenduty is notified only if the alert was sent)
183
+ resolved = False
179
184
if (
180
- info ["sent" ]
181
- and info ["last_window_failures" ] is not None
182
- and info ["last_window_failures" ] < 5
185
+ info ["last_window_failures" ] is not None
186
+ and info ["last_window_failures" ] <= resolution_threshold
183
187
):
184
188
logger .debug (f"Resolving Zenduty alert { identifier } " )
185
- response = await send_zenduty_alert (
186
- identifier , identifier , resolved = True
187
- )
188
- if response and 200 <= response .status < 300 :
189
+ resolved = True
190
+ if info ["sent" ]:
191
+ response = await send_zenduty_alert (
192
+ identifier , identifier , resolved = True
193
+ )
194
+ if response and 200 <= response .status < 300 :
195
+ to_remove .append (identifier )
196
+ else :
189
197
to_remove .append (identifier )
190
- # Raise alert if failed > 5 times within the last 5m window
191
- # re-alert every 5 minutes
192
- elif info ["failures" ] >= 5 and (
198
+ # Raise alert if failed >= alert_threshold times within the last 5m window
199
+ # or if already alerted and not yet resolved.
200
+ # Re-alert every 5 minutes but not more often.
201
+ elif (
202
+ info ["failures" ] >= alert_threshold or (info ["sent" ] and not resolved )
203
+ ) and (
193
204
not info .get ("last_alert" )
194
205
or current_time - datetime .fromisoformat (info ["last_alert" ])
195
206
> timedelta (minutes = 5 )
0 commit comments