fix: reduce default hysteresis 10%→2%; show recovery threshold in alerts UI

The 10% default hysteresis created an unreasonably wide recovery band:
a 95% threshold would only clear once the value dropped below 85.5%,
causing alerts to linger long after the metric was well below the
trigger level.

Change default hysteresis to 2% across all threshold parsers (plugin
metrics, partitions, RTT). For a 95% threshold, recovery is now at
93.1% instead of 85.5%.

Add AlertState.hysteresis field (set on every check, cleared on OK) and
expose recovery_threshold in to_dict() so the Alerts dashboard can
display "recovers < 93.1" alongside the trigger threshold, making the
hysteresis band visible to the user. Pickle backward-compatible via
__setstate__.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-04 14:47:50 -04:00
parent d7b368c7c6
commit de778f680f
2 changed files with 31 additions and 5 deletions
+4
View File
@@ -405,6 +405,10 @@
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`; valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
} }
if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
}
// Build actions section // Build actions section
let actionsHtml = ''; let actionsHtml = '';
+24 -2
View File
@@ -57,6 +57,7 @@ class AlertState:
self.last_notification = None self.last_notification = None
self.threshold_value = None # The threshold value that triggered alert self.threshold_value = None # The threshold value that triggered alert
self.operator = None # The comparison operator (>, <, >=, etc.) self.operator = None # The comparison operator (>, <, >=, etc.)
self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery
self.formatted_message = None # Formatted display message for UI self.formatted_message = None # Formatted display message for UI
self.acknowledged = False # Whether alert has been acknowledged self.acknowledged = False # Whether alert has been acknowledged
self.acknowledged_at = None # Timestamp when acknowledged self.acknowledged_at = None # Timestamp when acknowledged
@@ -152,6 +153,15 @@ class AlertState:
if self.formatted_message is not None: if self.formatted_message is not None:
result["formatted_message"] = self.formatted_message result["formatted_message"] = self.formatted_message
# Compute and expose the recovery threshold so the UI can display it
if (self.hysteresis and self.threshold_value is not None
and self.operator is not None):
ha = abs(self.threshold_value * self.hysteresis)
if self.operator in ('>', '>='):
result["recovery_threshold"] = round(self.threshold_value - ha, 4)
elif self.operator in ('<', '<='):
result["recovery_threshold"] = round(self.threshold_value + ha, 4)
return result return result
def __setstate__(self, state): def __setstate__(self, state):
@@ -159,6 +169,8 @@ class AlertState:
self.__dict__.update(state) self.__dict__.update(state)
if not hasattr(self, 'consecutive_count'): if not hasattr(self, 'consecutive_count'):
self.consecutive_count = 0 self.consecutive_count = 0
if not hasattr(self, 'hysteresis'):
self.hysteresis = None
def acknowledge(self): def acknowledge(self):
"""Acknowledge this alert to stop reminder notifications.""" """Acknowledge this alert to stop reminder notifications."""
@@ -546,7 +558,7 @@ class ThresholdChecker:
critical = threshold_config.get("critical") critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">") operator = threshold_config.get("operator", ">")
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})") display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
if warning is None and critical is None: if warning is None and critical is None:
@@ -649,7 +661,7 @@ class ThresholdChecker:
warning = rtt_thresholds.get("warning") warning = rtt_thresholds.get("warning")
critical = rtt_thresholds.get("critical") critical = rtt_thresholds.get("critical")
operator = rtt_thresholds.get("operator", ">") operator = rtt_thresholds.get("operator", ">")
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default
enabled = rtt_thresholds.get("enabled", True) enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display") display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1) count = rtt_thresholds.get("count", 1)
@@ -794,6 +806,12 @@ class ThresholdChecker:
elif new_level == AlertLevel.WARNING and threshold.warning is not None: elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning threshold_value = threshold.warning
# Keep hysteresis on the state so the UI can show the recovery threshold
if new_level != AlertLevel.OK:
alert_state.hysteresis = threshold.hysteresis
else:
alert_state.hysteresis = None
# Update state and check for changes # Update state and check for changes
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
@@ -877,6 +895,8 @@ class ThresholdChecker:
elif new_level == AlertLevel.WARNING and threshold.warning is not None: elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning threshold_value = threshold.warning
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
# Update state and check for changes # Update state and check for changes
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
@@ -943,6 +963,8 @@ class ThresholdChecker:
elif new_level == AlertLevel.WARNING and threshold.warning is not None: elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning threshold_value = threshold.warning
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value)) state_changes.append((metric_path, old_level, new_level, value))