Skip to content

Commit 6e6d2de

Browse files
committed
Add quiet hours category suppression
1 parent 11e80bb commit 6e6d2de

File tree

4 files changed

+304
-31
lines changed

4 files changed

+304
-31
lines changed

frontend-modern/src/pages/Alerts.tsx

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ interface QuietHoursConfig {
104104
end: string;
105105
timezone: string;
106106
days: Record<string, boolean>;
107+
suppress: {
108+
performance: boolean;
109+
storage: boolean;
110+
offline: boolean;
111+
};
107112
}
108113

109114
interface CooldownConfig {
@@ -148,6 +153,11 @@ const createDefaultQuietHours = (): QuietHoursConfig => ({
148153
saturday: false,
149154
sunday: false,
150155
},
156+
suppress: {
157+
performance: false,
158+
storage: false,
159+
offline: false,
160+
},
151161
});
152162

153163
const createDefaultCooldown = (): CooldownConfig => ({
@@ -730,13 +740,19 @@ export function Alerts() {
730740
} else {
731741
days = (qh.days as Record<string, boolean>) || createDefaultQuietHours().days;
732742
}
743+
const suppress = {
744+
performance: qh.suppress?.performance ?? false,
745+
storage: qh.suppress?.storage ?? false,
746+
offline: qh.suppress?.offline ?? false,
747+
};
733748

734749
setScheduleQuietHours({
735750
enabled: qh.enabled || false,
736751
start: qh.start || '22:00',
737752
end: qh.end || '08:00',
738753
timezone: qh.timezone || Intl.DateTimeFormat().resolvedOptions().timeZone || 'UTC',
739754
days,
755+
suppress,
740756
});
741757
}
742758

@@ -2172,6 +2188,28 @@ function ScheduleTab(props: ScheduleTabProps) {
21722188
'Pacific/Honolulu',
21732189
];
21742190

2191+
// Toggle definitions for the "Suppress categories" section of the quiet hours
// panel. Each `key` is one of the flags on `QuietHoursConfig['suppress']`; the
// `label` and `description` are shown next to the checkbox and the labels are
// reused in the schedule summary line.
const quietHourSuppressOptions: Array<{
  key: keyof QuietHoursConfig['suppress'];
  label: string;
  description: string;
}> = [
  {
    key: 'performance',
    label: 'Performance alerts',
    // The backend classifier files temperature, mail-queue and docker
    // container health/resource alerts under "performance" as well, so the
    // description names them: ticking this box silences those too.
    description:
      'CPU, memory, disk, network, temperature, mail queue, and container health thresholds stay quiet.',
  },
  {
    key: 'storage',
    label: 'Storage alerts',
    description: 'Silence storage usage, disk health, and ZFS events.',
  },
  {
    key: 'offline',
    label: 'Offline & power state',
    description: 'Skip connectivity and powered-off alerts during backups.',
  },
];
2212+
21752213
const days = [
21762214
{ id: 'monday', label: 'M', fullLabel: 'Monday' },
21772215
{ id: 'tuesday', label: 'T', fullLabel: 'Tuesday' },
@@ -2340,6 +2378,71 @@ function ScheduleTab(props: ScheduleTabProps) {
23402378
</Show>
23412379
</p>
23422380
</div>
2381+
2382+
<div class="space-y-3 border-t border-gray-200 pt-4 dark:border-gray-700">
2383+
<span class={`${labelClass('text-xs uppercase tracking-[0.08em]')} block`}>
2384+
Suppress categories
2385+
</span>
2386+
<p class="text-xs text-gray-500 dark:text-gray-400">
2387+
Critical alerts in selected categories will stay silent during quiet hours.
2388+
</p>
2389+
<div class="flex flex-col gap-2 sm:flex-row sm:flex-wrap sm:gap-3">
2390+
<For each={quietHourSuppressOptions}>
2391+
{(option) => (
2392+
<label
2393+
class={`flex cursor-pointer items-start gap-3 rounded-lg border px-3 py-2 transition-colors ${
2394+
quietHours().suppress[option.key]
2395+
? 'border-blue-500 bg-blue-50 dark:border-blue-400 dark:bg-blue-500/10'
2396+
: 'border-gray-200 hover:bg-gray-50 dark:border-gray-600 dark:hover:bg-gray-700'
2397+
}`}
2398+
>
2399+
<input
2400+
type="checkbox"
2401+
checked={quietHours().suppress[option.key]}
2402+
onChange={(e) => {
2403+
setQuietHours({
2404+
...quietHours(),
2405+
suppress: {
2406+
...quietHours().suppress,
2407+
[option.key]: e.currentTarget.checked,
2408+
},
2409+
});
2410+
props.setHasUnsavedChanges(true);
2411+
}}
2412+
class="sr-only"
2413+
/>
2414+
<div
2415+
class={`mt-1 flex h-4 w-4 items-center justify-center rounded border-2 ${
2416+
quietHours().suppress[option.key]
2417+
? 'border-blue-500 bg-blue-500'
2418+
: 'border-gray-300 dark:border-gray-600'
2419+
}`}
2420+
>
2421+
<Show when={quietHours().suppress[option.key]}>
2422+
<svg
2423+
class="h-3 w-3 text-white"
2424+
fill="none"
2425+
viewBox="0 0 24 24"
2426+
stroke="currentColor"
2427+
stroke-width="3"
2428+
>
2429+
<path stroke-linecap="round" stroke-linejoin="round" d="M5 13l4 4L19 7" />
2430+
</svg>
2431+
</Show>
2432+
</div>
2433+
<div>
2434+
<p class="text-sm font-medium text-gray-700 dark:text-gray-200">
2435+
{option.label}
2436+
</p>
2437+
<p class="text-xs text-gray-500 dark:text-gray-400">
2438+
{option.description}
2439+
</p>
2440+
</div>
2441+
</label>
2442+
)}
2443+
</For>
2444+
</div>
2445+
</div>
23432446
</div>
23442447
</Show>
23452448
</SettingsPanel>
@@ -2700,6 +2803,23 @@ function ScheduleTab(props: ScheduleTabProps) {
27002803
{quietHours().timezone})
27012804
</p>
27022805
</Show>
2806+
<Show
2807+
when={
2808+
quietHours().enabled &&
2809+
(quietHours().suppress.performance ||
2810+
quietHours().suppress.storage ||
2811+
quietHours().suppress.offline)
2812+
}
2813+
>
2814+
<p>
2815+
• Suppressing{' '}
2816+
{quietHourSuppressOptions
2817+
.filter((option) => quietHours().suppress[option.key])
2818+
.map((option) => option.label)
2819+
.join(', ')}{' '}
2820+
during quiet hours
2821+
</p>
2822+
</Show>
27032823
<Show when={cooldown().enabled}>
27042824
<p>
27052825
{cooldown().minutes} minute cooldown between alerts, max {cooldown().maxAlerts}{' '}

frontend-modern/src/types/alerts.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ export interface AlertConfig {
127127
end: string;
128128
timezone?: string;
129129
days: number[] | Record<string, boolean>;
130+
suppress?: {
131+
performance?: boolean;
132+
storage?: boolean;
133+
offline?: boolean;
134+
};
130135
};
131136
cooldown?: number;
132137
groupingWindow?: number;

internal/alerts/alerts.go

Lines changed: 108 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,19 @@ type ThresholdConfig struct {
170170

171171
// QuietHours represents quiet hours configuration
172172
type QuietHours struct {
173-
Enabled bool `json:"enabled"`
174-
Start string `json:"start"` // 24-hour format "HH:MM"
175-
End string `json:"end"` // 24-hour format "HH:MM"
176-
Timezone string `json:"timezone"`
177-
Days map[string]bool `json:"days"` // monday, tuesday, etc.
173+
Enabled bool `json:"enabled"`
174+
Start string `json:"start"` // 24-hour format "HH:MM"
175+
End string `json:"end"` // 24-hour format "HH:MM"
176+
Timezone string `json:"timezone"`
177+
Days map[string]bool `json:"days"` // monday, tuesday, etc.
178+
Suppress QuietHoursSuppression `json:"suppress"`
179+
}
180+
181+
// QuietHoursSuppression controls which alert categories are silenced during quiet hours.
// Each flag is consulted by shouldSuppressNotification only for critical alerts;
// non-critical alerts are suppressed during quiet hours regardless of these flags.
// The zero value (all false) lets every critical alert through.
182+
type QuietHoursSuppression struct {
183+
// Performance silences critical alerts that quietHoursCategoryForAlert maps to "performance".
Performance bool `json:"performance"`
184+
// Storage silences critical alerts that quietHoursCategoryForAlert maps to "storage".
Storage bool `json:"storage"`
185+
// Offline silences critical alerts that quietHoursCategoryForAlert maps to "offline".
Offline bool `json:"offline"`
178186
}
179187

180188
// EscalationLevel represents an escalation rule
@@ -333,10 +341,10 @@ type pmgBaselineCache struct {
333341

334342
// pmgAnomalyTracker tracks history and baselines for anomaly detection
335343
type pmgAnomalyTracker struct {
336-
Samples []pmgMailMetricSample // Ring buffer (max 48 samples)
337-
Baselines map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
338-
LastSampleTime time.Time // Timestamp of most recent sample
339-
SampleCount int // Total samples collected (for warmup check)
344+
Samples []pmgMailMetricSample // Ring buffer (max 48 samples)
345+
Baselines map[string]pmgBaselineCache // Cached baselines per metric (spamIn, spamOut, virusIn, virusOut)
346+
LastSampleTime time.Time // Timestamp of most recent sample
347+
SampleCount int // Total samples collected (for warmup check)
340348
}
341349

342350
// Manager handles alert monitoring and state
@@ -381,7 +389,7 @@ type Manager struct {
381389
// PMG quarantine growth tracking
382390
pmgQuarantineHistory map[string][]pmgQuarantineSnapshot // Track quarantine snapshots for growth detection
383391
// PMG anomaly detection tracking
384-
pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
392+
pmgAnomalyTrackers map[string]*pmgAnomalyTracker // Track mail metrics for anomaly detection per PMG instance
385393
// Persistent acknowledgement state so quick alert rebuilds keep user acknowledgements
386394
ackState map[string]ackRecord
387395
}
@@ -468,7 +476,7 @@ func NewManager() *Manager {
468476
MinimumDelta: 2.0, // 2% minimum change
469477
SuppressionWindow: 5, // 5 minutes
470478
HysteresisMargin: 5.0, // 5% default margin
471-
TimeThreshold: 5,
479+
TimeThreshold: 5,
472480
TimeThresholds: map[string]int{
473481
"guest": 5,
474482
"node": 5,
@@ -491,6 +499,7 @@ func NewManager() *Manager {
491499
"saturday": false,
492500
"sunday": false,
493501
},
502+
Suppress: QuietHoursSuppression{},
494503
},
495504
Cooldown: 5, // ON - 5 minutes prevents spam
496505
GroupingWindow: 30, // ON - 30 seconds groups related alerts
@@ -565,9 +574,19 @@ func (m *Manager) SetEscalateCallback(cb func(alert *Alert, level int)) {
565574

566575
// dispatchAlert delivers an alert to the configured callback, cloning it first to
567576
// prevent concurrent mutations from racing with consumers.
568-
func (m *Manager) dispatchAlert(alert *Alert, async bool) {
577+
func (m *Manager) dispatchAlert(alert *Alert, async bool) bool {
569578
if m.onAlert == nil || alert == nil {
570-
return
579+
return false
580+
}
581+
582+
if suppressed, reason := m.shouldSuppressNotification(alert); suppressed {
583+
log.Debug().
584+
Str("alertID", alert.ID).
585+
Str("type", alert.Type).
586+
Str("level", string(alert.Level)).
587+
Str("quietHoursRule", reason).
588+
Msg("Alert notification suppressed during quiet hours")
589+
return false
571590
}
572591

573592
alertCopy := alert.Clone()
@@ -576,6 +595,7 @@ func (m *Manager) dispatchAlert(alert *Alert, async bool) {
576595
} else {
577596
m.onAlert(alertCopy)
578597
}
598+
return true
579599
}
580600

581601
// UpdateConfig updates the alert configuration
@@ -1210,6 +1230,66 @@ func (m *Manager) isInQuietHours() bool {
12101230
return false
12111231
}
12121232

1233+
func quietHoursCategoryForAlert(alert *Alert) string {
1234+
if alert == nil {
1235+
return ""
1236+
}
1237+
1238+
switch alert.Type {
1239+
case "cpu", "memory", "disk", "diskRead", "diskWrite", "networkIn", "networkOut", "temperature":
1240+
return "performance"
1241+
case "queue-depth", "queue-deferred", "queue-hold", "message-age",
1242+
"docker-container-health", "docker-container-restart-loop",
1243+
"docker-container-oom-kill", "docker-container-memory-limit":
1244+
return "performance"
1245+
case "usage", "disk-health", "disk-wearout", "zfs-pool-state", "zfs-pool-errors", "zfs-device":
1246+
return "storage"
1247+
case "connectivity", "offline", "powered-off", "docker-host-offline":
1248+
return "offline"
1249+
}
1250+
1251+
if strings.HasPrefix(alert.Type, "docker-container-") {
1252+
if alert.Type == "docker-container-state" {
1253+
return "offline"
1254+
}
1255+
return "performance"
1256+
}
1257+
1258+
return ""
1259+
}
1260+
1261+
func (m *Manager) shouldSuppressNotification(alert *Alert) (bool, string) {
1262+
if alert == nil {
1263+
return false, ""
1264+
}
1265+
1266+
if !m.isInQuietHours() {
1267+
return false, ""
1268+
}
1269+
1270+
if alert.Level != AlertLevelCritical {
1271+
return true, "non-critical"
1272+
}
1273+
1274+
category := quietHoursCategoryForAlert(alert)
1275+
switch category {
1276+
case "performance":
1277+
if m.config.Schedule.QuietHours.Suppress.Performance {
1278+
return true, category
1279+
}
1280+
case "storage":
1281+
if m.config.Schedule.QuietHours.Suppress.Storage {
1282+
return true, category
1283+
}
1284+
case "offline":
1285+
if m.config.Schedule.QuietHours.Suppress.Offline {
1286+
return true, category
1287+
}
1288+
}
1289+
1290+
return false, ""
1291+
}
1292+
12131293
// shouldNotifyAfterCooldown checks if enough time has passed since the last notification
12141294
// Returns true if notification should be sent, false if still in cooldown period
12151295
func (m *Manager) shouldNotifyAfterCooldown(alert *Alert) bool {
@@ -3099,21 +3179,17 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
30993179
return
31003180
}
31013181

3102-
// Check if we should suppress notifications due to quiet hours
3103-
if m.isInQuietHours() && alert.Level != AlertLevelCritical {
3104-
log.Debug().
3105-
Str("alertID", alertID).
3106-
Msg("Alert notification suppressed due to quiet hours (non-critical)")
3107-
} else {
3108-
// Notify callback
3109-
if m.onAlert != nil {
3182+
// Notify callback (may be suppressed by quiet hours)
3183+
if m.onAlert != nil {
3184+
now := time.Now()
3185+
alert.LastNotified = &now
3186+
if m.dispatchAlert(alert, true) {
31103187
log.Info().Str("alertID", alertID).Msg("Calling onAlert callback")
3111-
now := time.Now()
3112-
alert.LastNotified = &now
3113-
m.dispatchAlert(alert, true)
31143188
} else {
3115-
log.Warn().Msg("No onAlert callback set!")
3189+
alert.LastNotified = nil
31163190
}
3191+
} else {
3192+
log.Warn().Msg("No onAlert callback set!")
31173193
}
31183194
} else {
31193195
// Update existing alert
@@ -3159,16 +3235,17 @@ func (m *Manager) checkMetric(resourceID, resourceName, node, instance, resource
31593235
Msg("Alert escalated to critical, will re-notify despite cooldown")
31603236
}
31613237

3162-
// Send re-notification if appropriate
3163-
if shouldRenotify && !m.isInQuietHours() {
3164-
if m.onAlert != nil {
3238+
// Send re-notification if appropriate (may be suppressed by quiet hours)
3239+
if shouldRenotify && m.onAlert != nil {
3240+
now := time.Now()
3241+
existingAlert.LastNotified = &now
3242+
if m.dispatchAlert(existingAlert, false) {
31653243
log.Info().
31663244
Str("alertID", alertID).
31673245
Str("level", string(existingAlert.Level)).
31683246
Msg("Re-notifying for existing alert")
3169-
now := time.Now()
3170-
existingAlert.LastNotified = &now
3171-
m.dispatchAlert(existingAlert, false) // false = not a new alert
3247+
} else {
3248+
existingAlert.LastNotified = nil
31723249
}
31733250
}
31743251
}

0 commit comments

Comments
 (0)