From 990c658e6573b1bad014cee0bc96e4e22e74a438 Mon Sep 17 00:00:00 2001
From: Andreas Wrede
Date: Fri, 24 Apr 2026 12:00:40 +0200
Subject: [PATCH] Apply grace period to all threshold alerts before logging/notifying
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Threshold alerts (plugin metrics, RTT) were firing immediately on the
first breach. Now every state transition to WARNING/CRITICAL starts a
grace-period timer (grace_seconds from the 'grace' config key). The
notification is deferred until the next heartbeat after grace_seconds
have elapsed. If the metric recovers within the grace window, both the
alert and the recovery are suppressed — no spurious pages for transient
spikes.

Two helper methods added to ThresholdChecker:
- _apply_grace: handles the state-change path (defer or suppress)
- _check_pending_or_renotify: handles the stable-alert path (fire
  deferred notification once grace expires, or fall through to reminders)

The overdue case is unchanged — on_overdue already fires only after
interval+grace seconds of silence, which is equivalent behaviour.

Co-Authored-By: Claude Sonnet 4.6 --- hbd/server/threshold.py | 109 +++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 29 deletions(-) diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 46a2ff7..b87e7d7 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -60,6 +60,7 @@ class AlertState: self.acknowledged = False # Whether alert has been acknowledged self.acknowledged_at = None # Timestamp when acknowledged self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating) + self.pending_since: Optional[float] = None # non-None while waiting out grace period before notifying def update( self, @@ -340,8 +341,9 @@ class ThresholdChecker: self.default_config = "default" self.renotify_interval = renotify_interval + self.grace_seconds: float = float(config.get("grace", 2)) self.journal = journal - + # Parse configuration self._parse_config(config) @@ -372,7 +374,8 @@ class ThresholdChecker: self.threshold_configs.clear() self.thresholds.clear() self.host_config_mapping.clear() - + self.grace_seconds = float(config.get("grace", 2)) + # Parse new configuration self._parse_config(config) @@ -760,15 +763,10 @@ class ThresholdChecker: # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): - # For check_value, we don't have full plugin data, pass None - lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None) - # Update alert state with formatted message - alert_state.formatted_message = formatted_msg - self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) + self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, None) return (old_level, new_level) elif new_level != AlertLevel.OK: - # Check if we should re-notify - self._check_renotify(host_name, alert_state, 
metric_path, value, threshold, None) + self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None) return None def check_plugin_data( @@ -827,14 +825,10 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data) - # Update alert state with formatted message - alert_state.formatted_message = formatted_msg - self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) + self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data) elif new_level != AlertLevel.OK: - # Check if we should re-notify - self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) - + self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data) + # Check nested metrics (e.g., partition data in disk_monitor) self._check_nested_metrics( host_name, @@ -896,20 +890,9 @@ class ThresholdChecker: old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value)) - lvl, message, formatted_msg = self._trigger_notification( - host_name, - metric_path, - old_level, - new_level, - value, - threshold, - data # Pass full plugin data for format string - ) - # Update alert state with formatted message - alert_state.formatted_message = formatted_msg - self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) + self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data) elif new_level != AlertLevel.OK: - self._check_renotify(host_name, alert_state, metric_path, value, threshold, data) + 
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data) def _trigger_notification( self, @@ -1084,6 +1067,74 @@ class ThresholdChecker: ) return f"(threshold: {op_symbol} {threshold_value})" + def _apply_grace( + self, + host_name: str, + alert_state: AlertState, + metric_path: str, + old_level: AlertLevel, + new_level: AlertLevel, + value: Any, + threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]], + ) -> None: + """Handle a state-change transition with grace-period logic. + + Transitioning INTO alert: defers the notification for grace_seconds. + Transitioning TO OK: + - Still in grace window (pending_since set): suppresses both the alert + and the recovery — the spike never warranted a page. + - Past grace: fires the RECOVER notification normally. + """ + lvl, message, formatted_msg = self._trigger_notification( + host_name, metric_path, old_level, new_level, value, threshold, plugin_data + ) + alert_state.formatted_message = formatted_msg + + if new_level == AlertLevel.OK: + if alert_state.pending_since is not None: + logger.info( + "Alert suppressed (recovered within %.0fs grace): %s on %s", + self.grace_seconds, metric_path, host_name, + ) + alert_state.pending_since = None + else: + self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value) + else: + alert_state.pending_since = time.time() + logger.debug( + "Alert deferred (%.0fs grace): %s on %s = %s", + self.grace_seconds, metric_path, host_name, value, + ) + + def _check_pending_or_renotify( + self, + host_name: str, + alert_state: AlertState, + metric_path: str, + value: Any, + threshold: ThresholdConfig, + plugin_data: Optional[Dict[str, Any]], + ) -> None: + """Called when alert level is unchanged and non-OK. + + If a deferred notification is pending and grace_seconds have elapsed, + fires it now. Otherwise falls through to normal reminder logic. 
+ """ + if alert_state.pending_since is not None: + if time.time() - alert_state.pending_since >= self.grace_seconds: + lvl, message, formatted_msg = self._trigger_notification( + host_name, metric_path, AlertLevel.OK, alert_state.level, value, threshold, plugin_data + ) + alert_state.formatted_message = formatted_msg + self._send_notification( + host_name, lvl, message, metric_path, AlertLevel.OK, alert_state.level, value + ) + alert_state.pending_since = None + # else: still within grace window, do nothing + else: + self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data) + def _check_renotify( self, host_name: str,