fix: reduce default hysteresis 10%→2%; show recovery threshold in alerts UI
The 10% default hysteresis created an unreasonably wide recovery band: a 95% threshold would only clear once the value dropped below 85.5%, causing alerts to linger long after the metric was well below the trigger level. Change the default hysteresis to 2% across all threshold parsers (plugin metrics, partitions, RTT). For a 95% threshold, recovery is now at 93.1% instead of 85.5%. Add an AlertState.hysteresis field (set on every check that yields a non-OK level, reset to None on OK) and expose recovery_threshold in to_dict() so the Alerts dashboard can display "recovers < 93.1" alongside the trigger threshold, making the hysteresis band visible to the user. Previously pickled AlertState objects remain loadable: __setstate__ defaults a missing hysteresis attribute to None. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -405,6 +405,10 @@
|
||||
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||
}
|
||||
if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
|
||||
const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
|
||||
valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
|
||||
}
|
||||
|
||||
// Build actions section
|
||||
let actionsHtml = '';
|
||||
|
||||
+24
-2
@@ -57,6 +57,7 @@ class AlertState:
|
||||
self.last_notification = None
|
||||
self.threshold_value = None # The threshold value that triggered alert
|
||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||
self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery
|
||||
self.formatted_message = None # Formatted display message for UI
|
||||
self.acknowledged = False # Whether alert has been acknowledged
|
||||
self.acknowledged_at = None # Timestamp when acknowledged
|
||||
@@ -152,6 +153,15 @@ class AlertState:
|
||||
if self.formatted_message is not None:
|
||||
result["formatted_message"] = self.formatted_message
|
||||
|
||||
# Compute and expose the recovery threshold so the UI can display it
|
||||
if (self.hysteresis and self.threshold_value is not None
|
||||
and self.operator is not None):
|
||||
ha = abs(self.threshold_value * self.hysteresis)
|
||||
if self.operator in ('>', '>='):
|
||||
result["recovery_threshold"] = round(self.threshold_value - ha, 4)
|
||||
elif self.operator in ('<', '<='):
|
||||
result["recovery_threshold"] = round(self.threshold_value + ha, 4)
|
||||
|
||||
return result
|
||||
|
||||
def __setstate__(self, state):
|
||||
@@ -159,6 +169,8 @@ class AlertState:
|
||||
self.__dict__.update(state)
|
||||
if not hasattr(self, 'consecutive_count'):
|
||||
self.consecutive_count = 0
|
||||
if not hasattr(self, 'hysteresis'):
|
||||
self.hysteresis = None
|
||||
|
||||
def acknowledge(self):
|
||||
"""Acknowledge this alert to stop reminder notifications."""
|
||||
@@ -546,7 +558,7 @@ class ThresholdChecker:
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
if warning is None and critical is None:
|
||||
@@ -649,7 +661,7 @@ class ThresholdChecker:
|
||||
warning = rtt_thresholds.get("warning")
|
||||
critical = rtt_thresholds.get("critical")
|
||||
operator = rtt_thresholds.get("operator", ">")
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
count = rtt_thresholds.get("count", 1)
|
||||
@@ -794,6 +806,12 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Keep hysteresis on the state so the UI can show the recovery threshold
|
||||
if new_level != AlertLevel.OK:
|
||||
alert_state.hysteresis = threshold.hysteresis
|
||||
else:
|
||||
alert_state.hysteresis = None
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
@@ -877,6 +895,8 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
@@ -943,6 +963,8 @@ class ThresholdChecker:
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
|
||||
Reference in New Issue
Block a user