From 28e2180f7b8bc2ed37d6e0676de8e5849ecc57db Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Sat, 2 May 2026 14:27:18 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20suppress=20notifications=20on=20alert=20?= =?UTF-8?q?de-escalation=20(e.g.=20CRITICAL=E2=86=92WARNING)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only notify on worsening transitions (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL) and recovery (any→OK). De-escalation within alert states no longer sends a duplicate notification since the metric never recovered. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- hbd/server/threshold.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 0e11993..f8baca1 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -1114,7 +1114,9 @@ class ThresholdChecker: ) -> None: """Handle a state-change transition with grace-period logic. - Transitioning INTO alert: defers the notification for grace_seconds. + Transitioning INTO alert (worsening): defers the notification for grace_seconds. + De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification; + the metric is still alerting so no RECOVER was sent. Transitioning TO OK: - Still in grace window (pending_since set): suppresses both the alert and the recovery — the spike never warranted a page. @@ -1134,12 +1136,20 @@ class ThresholdChecker: alert_state.pending_since = None else: self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) - else: + elif new_level.value > old_level.value: + # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification. alert_state.pending_since = time.time() logger.debug( "Alert deferred (%.0fs grace): %s on %s = %s", self.grace_seconds, metric_path, host_name, value, ) + else: + # De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still + # alerting but did not recover, so no new notification. + logger.debug( + "De-escalation %s→%s for %s on %s, no notification", + old_level.name, new_level.name, metric_path, host_name, + ) def _check_pending_or_renotify( self,