Apply grace period to all threshold alerts before logging/notifying
Threshold alerts (plugin metrics, RTT) were firing immediately on the first breach. Now every state transition to WARNING/CRITICAL starts a grace-period timer (grace_seconds, read from the 'grace' config key). The notification is deferred until the next heartbeat after grace_seconds have elapsed. If the metric recovers within the grace window, both the alert and the recovery are suppressed — no spurious pages for transient spikes.

Two helper methods were added to ThresholdChecker:
- _apply_grace: handles the state-change path (defer or suppress)
- _check_pending_or_renotify: handles the stable-alert path (fire the deferred notification once grace expires, or fall through to reminders)

The overdue case is unchanged — on_overdue already fires only after interval+grace seconds of silence, which is equivalent behaviour.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+80
-29
@@ -60,6 +60,7 @@ class AlertState:
|
|||||||
self.acknowledged = False # Whether alert has been acknowledged
|
self.acknowledged = False # Whether alert has been acknowledged
|
||||||
self.acknowledged_at = None # Timestamp when acknowledged
|
self.acknowledged_at = None # Timestamp when acknowledged
|
||||||
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||||
|
self.pending_since: Optional[float] = None # non-None while waiting out grace period before notifying
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
@@ -340,8 +341,9 @@ class ThresholdChecker:
|
|||||||
self.default_config = "default"
|
self.default_config = "default"
|
||||||
|
|
||||||
self.renotify_interval = renotify_interval
|
self.renotify_interval = renotify_interval
|
||||||
|
self.grace_seconds: float = float(config.get("grace", 2))
|
||||||
self.journal = journal
|
self.journal = journal
|
||||||
|
|
||||||
# Parse configuration
|
# Parse configuration
|
||||||
self._parse_config(config)
|
self._parse_config(config)
|
||||||
|
|
||||||
@@ -372,7 +374,8 @@ class ThresholdChecker:
|
|||||||
self.threshold_configs.clear()
|
self.threshold_configs.clear()
|
||||||
self.thresholds.clear()
|
self.thresholds.clear()
|
||||||
self.host_config_mapping.clear()
|
self.host_config_mapping.clear()
|
||||||
|
self.grace_seconds = float(config.get("grace", 2))
|
||||||
|
|
||||||
# Parse new configuration
|
# Parse new configuration
|
||||||
self._parse_config(config)
|
self._parse_config(config)
|
||||||
|
|
||||||
@@ -760,15 +763,10 @@ class ThresholdChecker:
|
|||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
# For check_value, we don't have full plugin data, pass None
|
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, None)
|
||||||
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, None)
|
|
||||||
# Update alert state with formatted message
|
|
||||||
alert_state.formatted_message = formatted_msg
|
|
||||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
|
||||||
return (old_level, new_level)
|
return (old_level, new_level)
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
# Check if we should re-notify
|
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
def check_plugin_data(
|
def check_plugin_data(
|
||||||
@@ -827,14 +825,10 @@ class ThresholdChecker:
|
|||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
lvl, message, formatted_msg = self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold, data)
|
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
||||||
# Update alert state with formatted message
|
|
||||||
alert_state.formatted_message = formatted_msg
|
|
||||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
# Check if we should re-notify
|
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
|
||||||
|
|
||||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||||
self._check_nested_metrics(
|
self._check_nested_metrics(
|
||||||
host_name,
|
host_name,
|
||||||
@@ -896,20 +890,9 @@ class ThresholdChecker:
|
|||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
lvl, message, formatted_msg = self._trigger_notification(
|
self._apply_grace(host_name, alert_state, metric_path, old_level, new_level, value, threshold, data)
|
||||||
host_name,
|
|
||||||
metric_path,
|
|
||||||
old_level,
|
|
||||||
new_level,
|
|
||||||
value,
|
|
||||||
threshold,
|
|
||||||
data # Pass full plugin data for format string
|
|
||||||
)
|
|
||||||
# Update alert state with formatted message
|
|
||||||
alert_state.formatted_message = formatted_msg
|
|
||||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
|
||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, data)
|
||||||
|
|
||||||
def _trigger_notification(
|
def _trigger_notification(
|
||||||
self,
|
self,
|
||||||
@@ -1084,6 +1067,74 @@ class ThresholdChecker:
|
|||||||
)
|
)
|
||||||
return f"(threshold: {op_symbol} {threshold_value})"
|
return f"(threshold: {op_symbol} {threshold_value})"
|
||||||
|
|
||||||
|
def _apply_grace(
    self,
    host_name: str,
    alert_state: AlertState,
    metric_path: str,
    old_level: AlertLevel,
    new_level: AlertLevel,
    value: Any,
    threshold: ThresholdConfig,
    plugin_data: Optional[Dict[str, Any]],
) -> None:
    """Handle a state-change transition with grace-period logic.

    Invariant maintained here: ``alert_state.pending_since`` is non-None
    only while NO notification has been sent since the metric last left OK.
    That is what makes it safe to (a) swallow the recovery when the metric
    returns to OK while pending, and (b) report ``AlertLevel.OK`` as the
    previous level when the deferred notification finally fires.

    Transitions:
      * to OK while pending      -> alert and recovery are both suppressed.
      * to OK, nothing pending   -> the recovery notification is sent.
      * OK -> alert              -> the grace timer starts; the notification
                                    is deferred (sent immediately when
                                    grace_seconds <= 0, i.e. grace disabled).
      * alert -> alert, pending  -> the running timer is kept (not restarted)
                                    so a metric flapping between severities
                                    cannot postpone the page indefinitely.
      * alert -> alert, notified -> the severity change is sent immediately;
                                    the breach already outlasted one grace
                                    window, and re-arming the timer would let
                                    a later recovery be swallowed even though
                                    an alert had gone out.

    Args:
        host_name: Host the metric belongs to.
        alert_state: Per-metric state; ``formatted_message`` and
            ``pending_since`` are updated in place.
        metric_path: Identifier of the metric that changed level.
        old_level: Level before this transition.
        new_level: Level after this transition.
        value: Current metric value.
        threshold: Threshold configuration that produced ``new_level``.
        plugin_data: Full plugin payload for message formatting, or None.
    """
    # The formatted message is refreshed on every transition, including
    # deferred and suppressed ones, exactly as before.
    lvl, message, formatted_msg = self._trigger_notification(
        host_name, metric_path, old_level, new_level, value, threshold, plugin_data
    )
    alert_state.formatted_message = formatted_msg

    is_pending = alert_state.pending_since is not None

    if new_level == AlertLevel.OK:
        if is_pending:
            # Nothing was ever sent for this excursion, so stay silent.
            logger.info(
                "Alert suppressed (recovered within %.0fs grace): %s on %s",
                self.grace_seconds, metric_path, host_name,
            )
            alert_state.pending_since = None
        else:
            self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
        return

    if is_pending:
        # Severity changed inside the grace window: keep the original
        # timer. The deferred notification reports the level current at
        # the time it fires.
        return

    if old_level != AlertLevel.OK or self.grace_seconds <= 0:
        # Either an alert for this metric was already delivered (severity
        # change of a confirmed breach) or grace is disabled: notify now
        # and leave pending_since unset so the eventual recovery is sent.
        # NOTE(review): assumes every non-OK old_level with no pending
        # timer means a notification went out -- confirm for states
        # restored from persistence.
        self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
        return

    alert_state.pending_since = time.time()
    logger.debug(
        "Alert deferred (%.0fs grace): %s on %s = %s",
        self.grace_seconds, metric_path, host_name, value,
    )
|
||||||
|
|
||||||
|
def _check_pending_or_renotify(
    self,
    host_name: str,
    alert_state: AlertState,
    metric_path: str,
    value: Any,
    threshold: ThresholdConfig,
    plugin_data: Optional[Dict[str, Any]],
) -> None:
    """Process a heartbeat whose alert level is unchanged and non-OK.

    Three outcomes, depending on ``alert_state.pending_since``:
      * None              -> no deferred notification; delegate to the
                             regular reminder logic (``_check_renotify``).
      * set, grace running -> do nothing this heartbeat.
      * set, grace elapsed -> send the deferred notification (reported as a
                             transition from ``AlertLevel.OK`` to the current
                             level) and clear the pending marker.
    """
    deferred_at = alert_state.pending_since

    # No notification is waiting: this is the ordinary reminder path.
    if deferred_at is None:
        self._check_renotify(host_name, alert_state, metric_path, value, threshold, plugin_data)
        return

    # Still inside the grace window: stay quiet until a later heartbeat.
    waited = time.time() - deferred_at
    if not waited >= self.grace_seconds:
        return

    # Grace has run out while the metric stayed in alert: deliver the
    # notification that was held back at the state change.
    lvl, message, formatted_msg = self._trigger_notification(
        host_name,
        metric_path,
        AlertLevel.OK,
        alert_state.level,
        value,
        threshold,
        plugin_data,
    )
    alert_state.formatted_message = formatted_msg
    self._send_notification(
        host_name,
        lvl,
        message,
        metric_path,
        AlertLevel.OK,
        alert_state.level,
        value,
    )
    alert_state.pending_since = None
|
||||||
|
|
||||||
def _check_renotify(
|
def _check_renotify(
|
||||||
self,
|
self,
|
||||||
host_name: str,
|
host_name: str,
|
||||||
|
|||||||
Reference in New Issue
Block a user