diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 0e11993..f8baca1 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -1114,7 +1114,9 @@ class ThresholdChecker: ) -> None: """Handle a state-change transition with grace-period logic. - Transitioning INTO alert: defers the notification for grace_seconds. + Transitioning INTO alert (worsening): defers the notification for grace_seconds. + De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification; + the metric is still alerting so no RECOVER was sent. Transitioning TO OK: - Still in grace window (pending_since set): suppresses both the alert and the recovery — the spike never warranted a page. @@ -1134,12 +1136,20 @@ class ThresholdChecker: alert_state.pending_since = None else: self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) - else: + elif new_level.value > old_level.value: + # Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification. alert_state.pending_since = time.time() logger.debug( "Alert deferred (%.0fs grace): %s on %s = %s", self.grace_seconds, metric_path, host_name, value, ) + else: + # De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still + # alerting but did not recover, so no new notification. + logger.debug( + "De-escalation %s→%s for %s on %s, no notification", + old_level.name, new_level.name, metric_path, host_name, + ) def _check_pending_or_renotify( self,