Apply grace period to all threshold alerts before logging/notifying
Threshold alerts (plugin metrics, RTT) were firing immediately on the first breach. Now every state transition to WARNING/CRITICAL starts a grace-period timer (grace_seconds, read from the 'grace' config key). The notification is deferred until the next heartbeat after grace_seconds have elapsed. If the metric recovers within the grace window, both the alert and the recovery are suppressed — no spurious pages for transient spikes.

Two helper methods were added to ThresholdChecker:
- _apply_grace: handles the state-change path (defer or suppress).
- _check_pending_or_renotify: handles the stable-alert path (fire the deferred notification once grace expires, or fall through to reminders).

The overdue case is unchanged — on_overdue already fires only after interval+grace seconds of silence, which is equivalent behaviour.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+77
-26
@@ -60,6 +60,7 @@ class AlertState:
|
||||
self.acknowledged = False # Whether alert has been acknowledged
|
||||
self.acknowledged_at = None # Timestamp when acknowledged
|
||||
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||
self.pending_since: Optional[float] = None # non-None while waiting out grace period before notifying
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -340,6 +341,7 @@ class ThresholdChecker:
|
||||
self.default_config = "default"
|
||||
|
||||
self.renotify_interval = renotify_interval
|
||||
self.grace_seconds: float = float(config.get("grace", 2))
|
||||
self.journal = journal
|
||||
|
||||
# Parse configuration
|
||||
@@ -372,6 +374,7 @@ class ThresholdChecker:
|
||||
self.threshold_configs.clear()
|
||||
self.thresholds.clear()
|
||||
self.host_config_mapping.clear()
|
||||
self.grace_seconds = float(config.get("grace", 2))
|
||||
|
||||
# Parse new configuration
|
||||
self._parse_config(config)
|
||||
@@ -760,15 +763,10 @@ class ThresholdChecker:
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
# For check_value, we don't have full plugin data, pass None
|
||||
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
|
||||
# Update alert state with formatted message
|
||||
alert_state.formatted_message = formatted_msg
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, None)
|
||||
return (old_level, new_level)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||
|
||||
return None
|
||||
def check_plugin_data(
|
||||
@@ -827,13 +825,9 @@ class ThresholdChecker:
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
|
||||
# Update alert state with formatted message
|
||||
alert_state.formatted_message = formatted_msg
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||
|
||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||
self._check_nested_metrics(
|
||||
@@ -896,20 +890,9 @@ class ThresholdChecker:
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
lvl, message, formatted_msg = self._trigger_notification(
|
||||
host_name,
|
||||
metric_path,
|
||||
old_level,
|
||||
new_level,
|
||||
value,
|
||||
threshold,
|
||||
data # Pass full plugin data for format string
|
||||
)
|
||||
# Update alert state with formatted message
|
||||
alert_state.formatted_message = formatted_msg
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||
|
||||
def _trigger_notification(
|
||||
self,
|
||||
@@ -1084,6 +1067,74 @@ class ThresholdChecker:
|
||||
)
|
||||
return f"(threshold: {op_symbol} {threshold_value})"
|
||||
|
||||
def _apply_grace(
    self,
    host_name: str,
    alert_state: AlertState,
    metric_path: str,
    old_level: AlertLevel,
    new_level: AlertLevel,
    value: Any,
    threshold: ThresholdConfig,
    plugin_data: Optional[Dict[str, Any]],
) -> None:
    """Handle a level change for one metric, applying the grace period.

    Transition to OK:
      - A deferred alert is still pending (``pending_since`` is set): the
        pending alert is dropped and no recovery is sent, because the
        alert itself was never delivered.
      - Nothing pending: the recovery notification is sent immediately.

    Transition to a non-OK level:
      - Nothing pending: the grace timer starts and delivery is deferred.
      - Already pending and the grace period has not elapsed: the existing
        timer is kept (it is NOT restarted).
      - Already pending and the grace period has elapsed: the notification
        for this transition is sent now and the pending marker is cleared.

    Args:
        host_name: Host the metric belongs to.
        alert_state: Per-metric state; ``formatted_message`` and
            ``pending_since`` are updated here.
        metric_path: Identifier of the metric being evaluated.
        old_level: Level before this update.
        new_level: Level after this update.
        value: Current metric value.
        threshold: Threshold configuration that produced ``new_level``.
        plugin_data: Full plugin payload for message formatting, or None
            when the caller does not have it.
    """
    # NOTE(review): _trigger_notification runs even when delivery ends up
    # deferred or suppressed; if it logs or journals on its own, that
    # happens before the grace period has passed — confirm.
    lvl, message, formatted_msg = self._trigger_notification(
        host_name, metric_path, old_level, new_level, value, threshold, plugin_data
    )
    alert_state.formatted_message = formatted_msg

    if new_level == AlertLevel.OK:
        if alert_state.pending_since is not None:
            logger.info(
                "Alert suppressed (recovered within %.0fs grace): %s on %s",
                self.grace_seconds, metric_path, host_name,
            )
            alert_state.pending_since = None
        else:
            self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
        return

    now = time.time()
    pending_since = alert_state.pending_since

    if pending_since is None:
        # First breach (or a level change after the previous alert was
        # already delivered): start the grace timer.
        alert_state.pending_since = now
        logger.debug(
            "Alert deferred (%.0fs grace): %s on %s = %s",
            self.grace_seconds, metric_path, host_name, value,
        )
    elif now - pending_since >= self.grace_seconds:
        # The metric has been in breach for the whole grace period, only at
        # changing non-OK levels. Level changes are routed here rather than
        # to the stable-alert path, so the deferred alert must be delivered
        # from this branch or it would never fire.
        self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
        alert_state.pending_since = None
    else:
        # Still inside the grace window: keep the original timer so that a
        # WARNING <-> CRITICAL change does not postpone the alert.
        logger.debug(
            "Alert still deferred (%.0fs grace, level changed): %s on %s = %s",
            self.grace_seconds, metric_path, host_name, value,
        )
|
||||
|
||||
def _check_pending_or_renotify(
    self,
    host_name: str,
    alert_state: AlertState,
    metric_path: str,
    value: Any,
    threshold: ThresholdConfig,
    plugin_data: Optional[Dict[str, Any]],
) -> None:
    """Deliver a grace-deferred alert once it is due, else run reminders.

    Intended for the case where the level did not change and is non-OK.

    - No deferred alert (``pending_since`` is None): delegates to
      ``_check_renotify``.
    - Deferred alert, grace period not yet elapsed: does nothing.
    - Deferred alert, grace period elapsed: builds and sends the
      notification as a transition from ``AlertLevel.OK`` to the current
      ``alert_state.level``, stores the formatted message on the state and
      clears ``pending_since``.

    Args:
        host_name: Host the metric belongs to.
        alert_state: Per-metric state holding ``pending_since`` and ``level``.
        metric_path: Identifier of the metric being evaluated.
        value: Current metric value.
        threshold: Threshold configuration for the metric.
        plugin_data: Full plugin payload for message formatting, or None.
    """
    if alert_state.pending_since is None:
        # Nothing is waiting on the grace timer: ordinary reminder handling.
        self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data)
        return

    waited = time.time() - alert_state.pending_since
    if not (waited >= self.grace_seconds):
        # Still within the grace window, do nothing.
        return

    # Grace period is over and the metric is still in breach: send the
    # notification that was held back when the level first changed.
    lvl, message, formatted_msg = self._trigger_notification(
        host_name,
        metric_path,
        AlertLevel.OK,
        alert_state.level,
        value,
        threshold,
        plugin_data,
    )
    alert_state.formatted_message = formatted_msg
    self._send_notification(
        host_name,
        lvl,
        message,
        metric_path,
        AlertLevel.OK,
        alert_state.level,
        value,
    )
    alert_state.pending_since = None
|
||||
|
||||
def _check_renotify(
|
||||
self,
|
||||
host_name: str,
|
||||
|
||||
Reference in New Issue
Block a user