@@ -170,11 +170,19 @@ type ThresholdConfig struct {
 
 // QuietHours represents quiet hours configuration
 type QuietHours struct {
-    Enabled  bool            `json:"enabled"`
-    Start    string          `json:"start"`    // 24-hour format "HH:MM"
-    End      string          `json:"end"`      // 24-hour format "HH:MM"
-    Timezone string          `json:"timezone"`
-    Days     map[string]bool `json:"days"` // monday, tuesday, etc.
+    Enabled  bool                  `json:"enabled"`
+    Start    string                `json:"start"`    // 24-hour format "HH:MM"
+    End      string                `json:"end"`      // 24-hour format "HH:MM"
+    Timezone string                `json:"timezone"`
+    Days     map[string]bool       `json:"days"` // monday, tuesday, etc.
+    Suppress QuietHoursSuppression `json:"suppress"`
+}
+
+// QuietHoursSuppression controls which alert categories are silenced during quiet hours.
+type QuietHoursSuppression struct {
+    Performance bool `json:"performance"`
+    Storage     bool `json:"storage"`
+    Offline     bool `json:"offline"`
 }
 
 // EscalationLevel represents an escalation rule
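For reference, opting into the new field might look like this in code. The field names come from the structs above; the concrete values (and the variable name) are illustrative, not taken from this diff:

```go
quiet := QuietHours{
    Enabled:  true,
    Start:    "22:00",
    End:      "07:00",
    Timezone: "America/New_York",
    Days: map[string]bool{
        "monday": true, "tuesday": true, "wednesday": true, "thursday": true,
        "friday": true, "saturday": false, "sunday": false,
    },
    Suppress: QuietHoursSuppression{Performance: true}, // also mute critical performance alerts
}
```

The zero value of QuietHoursSuppression (all three flags false) preserves the pre-existing behavior: during quiet hours, non-critical alerts are muted but critical alerts always get through.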
@@ -333,10 +341,10 @@ type pmgBaselineCache struct {
 
 // pmgAnomalyTracker tracks history and baselines for anomaly detection
 type pmgAnomalyTracker struct {
-    Samples        []pmgMailMetricSample       // Ring buffer (max 48 samples)
-    Baselines      map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
-    LastSampleTime time.Time                   // Timestamp of most recent sample
-    SampleCount    int                         // Total samples collected (for warmup check)
+    Samples        []pmgMailMetricSample       // Ring buffer (max 48 samples)
+    Baselines      map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
+    LastSampleTime time.Time                   // Timestamp of most recent sample
+    SampleCount    int                         // Total samples collected (for warmup check)
 }
 
 // Manager handles alert monitoring and state
@@ -381,7 +389,7 @@ type Manager struct {
     // PMG quarantine growth tracking
     pmgQuarantineHistory map[string][]pmgQuarantineSnapshot // Track quarantine snapshots for growth detection
     // PMG anomaly detection tracking
-    pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
+    pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
     // Persistent acknowledgement state so quick alert rebuilds keep user acknowledgements
     ackState map[string]ackRecord
 }
@@ -468,7 +476,7 @@ func NewManager() *Manager {
             MinimumDelta:      2.0, // 2% minimum change
             SuppressionWindow: 5,   // 5 minutes
             HysteresisMargin:  5.0, // 5% default margin
-            TimeThreshold:     5,
+            TimeThreshold:     5,
             TimeThresholds: map[string]int{
                 "guest": 5,
                 "node":  5,
@@ -491,6 +499,7 @@ func NewManager() *Manager {
                 "saturday": false,
                 "sunday":   false,
             },
+            Suppress: QuietHoursSuppression{},
         },
         Cooldown:       5,  // ON - 5 minutes prevents spam
         GroupingWindow: 30, // ON - 30 seconds groups related alerts
@@ -565,9 +574,19 @@ func (m *Manager) SetEscalateCallback(cb func(alert *Alert, level int)) {
 
 // dispatchAlert delivers an alert to the configured callback, cloning it first to
 // prevent concurrent mutations from racing with consumers.
-func (m *Manager) dispatchAlert(alert *Alert, async bool) {
+func (m *Manager) dispatchAlert(alert *Alert, async bool) bool {
     if m.onAlert == nil || alert == nil {
-        return
+        return false
+    }
+
+    if suppressed, reason := m.shouldSuppressNotification(alert); suppressed {
+        log.Debug().
+            Str("alertID", alert.ID).
+            Str("type", alert.Type).
+            Str("level", string(alert.Level)).
+            Str("quietHoursRule", reason).
+            Msg("Alert notification suppressed during quiet hours")
+        return false
     }
 
     alertCopy := alert.Clone()
@@ -576,6 +595,7 @@ func (m *Manager) dispatchAlert(alert *Alert, async bool) {
     } else {
         m.onAlert(alertCopy)
     }
+    return true
 }
 
 // UpdateConfig updates the alert configuration
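dispatchAlert now reports whether the callback actually ran, returning false when no callback is registered, the alert is nil, or quiet-hours suppression applies. A minimal sketch of how that contract might be exercised in a test, assuming it lives in the same package and that NewManager leaves onAlert unset (consistent with the "No onAlert callback set!" branch in checkMetric below):

```go
package alerts // hypothetical package name for this file

import "testing"

// With no callback registered, dispatchAlert must report that nothing was delivered.
func TestDispatchAlertWithoutCallback(t *testing.T) {
    m := NewManager() // assumption: no onAlert callback is set by default
    if m.dispatchAlert(&Alert{ID: "a1", Type: "cpu"}, false) {
        t.Fatal("expected dispatchAlert to return false when no onAlert callback is set")
    }
}
```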
@@ -1210,6 +1230,66 @@ func (m *Manager) isInQuietHours() bool {
     return false
 }
 
+func quietHoursCategoryForAlert(alert *Alert) string {
+    if alert == nil {
+        return ""
+    }
+
+    switch alert.Type {
+    case "cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut", "temperature":
+        return "performance"
+    case "queue-depth", "queue-deferred", "queue-hold", "message-age",
+        "docker-container-health", "docker-container-restart-loop",
+        "docker-container-oom-kill", "docker-container-memory-limit":
+        return "performance"
+    case "usage", "disk-health", "disk-wearout", "zfs-pool-state", "zfs-pool-errors", "zfs-device":
+        return "storage"
+    case "connectivity", "offline", "powered-off", "docker-host-offline":
+        return "offline"
+    }
+
+    if strings.HasPrefix(alert.Type, "docker-container-") {
+        if alert.Type == "docker-container-state" {
+            return "offline"
+        }
+        return "performance"
+    }
+
+    return ""
+}
+
+func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
+    if alert == nil {
+        return false, ""
+    }
+
+    if !m.isInQuietHours() {
+        return false, ""
+    }
+
+    if alert.Level != AlertLevelCritical {
+        return true, "non-critical"
+    }
+
+    category := quietHoursCategoryForAlert(alert)
+    switch category {
+    case "performance":
+        if m.config.Schedule.QuietHours.Suppress.Performance {
+            return true, category
+        }
+    case "storage":
+        if m.config.Schedule.QuietHours.Suppress.Storage {
+            return true, category
+        }
+    case "offline":
+        if m.config.Schedule.QuietHours.Suppress.Offline {
+            return true, category
+        }
+    }
+
+    return false, ""
+}
+
 // shouldNotifyAfterCooldown checks if enough time has passed since the last notification
 // Returns true if notification should be sent, false if still in cooldown period
 func (m *Manager) shouldNotifyAfterCooldown(alert *Alert) bool {
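The category mapping is pure string dispatch, so it lends itself to a table-driven test. A sketch, hypothetical but relying only on behavior visible in the function above (Alert.Type is the only field consulted):

```go
package alerts // hypothetical package name for this file

import "testing"

func TestQuietHoursCategoryForAlert(t *testing.T) {
    cases := map[string]string{
        "cpu":                    "performance",
        "usage":                  "storage",
        "powered-off":            "offline",
        "docker-container-state": "offline",     // a stopped container is an availability problem
        "docker-container-cpu":   "performance", // other docker-container-* types fall back to performance
        "unknown-type":           "",            // uncategorized types are never category-suppressed
    }
    for typ, want := range cases {
        if got := quietHoursCategoryForAlert(&Alert{Type: typ}); got != want {
            t.Errorf("quietHoursCategoryForAlert(%q) = %q, want %q", typ, got, want)
        }
    }
}
```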
@@ -3099,21 +3179,17 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
             return
         }
 
-        // Check if we should suppress notifications due to quiet hours
-        if m.isInQuietHours() && alert.Level != AlertLevelCritical {
-            log.Debug().
-                Str("alertID", alertID).
-                Msg("Alert notification suppressed due to quiet hours (non-critical)")
-        } else {
-            // Notify callback
-            if m.onAlert != nil {
+        // Notify callback (may be suppressed by quiet hours)
+        if m.onAlert != nil {
+            now := time.Now()
+            alert.LastNotified = &now
+            if m.dispatchAlert(alert, true) {
                 log.Info().Str("alertID", alertID).Msg("Calling onAlert callback")
-                now := time.Now()
-                alert.LastNotified = &now
-                m.dispatchAlert(alert, true)
             } else {
-                log.Warn().Msg("No onAlert callback set!")
+                alert.LastNotified = nil
             }
+        } else {
+            log.Warn().Msg("No onAlert callback set!")
         }
     } else {
         // Update existing alert
@@ -3159,16 +3235,17 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
                 Msg("Alert escalated to critical, will re-notify despite cooldown")
         }
 
-        // Send re-notification if appropriate
-        if shouldRenotify && !m.isInQuietHours() {
-            if m.onAlert != nil {
+        // Send re-notification if appropriate (may be suppressed by quiet hours)
+        if shouldRenotify && m.onAlert != nil {
+            now := time.Now()
+            existingAlert.LastNotified = &now
+            if m.dispatchAlert(existingAlert, false) {
                 log.Info().
                     Str("alertID", alertID).
                     Str("level", string(existingAlert.Level)).
                     Msg("Re-notifying for existing alert")
-                now := time.Now()
-                existingAlert.LastNotified = &now
-                m.dispatchAlert(existingAlert, false) // false = not a new alert
+            } else {
+                existingAlert.LastNotified = nil
             }
         }
     }
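Both call sites now share the same set-then-roll-back shape: LastNotified is stamped before the dispatch so the clone handed to the callback carries the timestamp, then cleared again when dispatchAlert reports that nothing went out, so cooldown checks don't count a suppressed notification. Condensed, with names as in the diff (new alerts pass async as true, re-notifications as false):

```go
now := time.Now()
existingAlert.LastNotified = &now
if !m.dispatchAlert(existingAlert, false) { // false = re-notification, per the diff
    existingAlert.LastNotified = nil // suppressed or undeliverable; record no notification
}
```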