fix: reduce default hysteresis 10%→2%; show recovery threshold in alerts UI
The 10% default hysteresis created an unreasonably wide recovery band: a 95% threshold would only clear once the value dropped below 85.5%, causing alerts to linger long after the metric was well below the trigger level.

Change the default hysteresis to 2% across all threshold parsers (plugin metrics, partitions, RTT). For a 95% threshold, recovery now happens at 93.1% instead of 85.5%.

Add an AlertState.hysteresis field (stored on each check that yields a non-OK level, reset to None when the level is OK) and expose recovery_threshold in to_dict() so the Alerts dashboard can display "recovers < 93.1" alongside the trigger threshold, making the hysteresis band visible to the user.

Pickle backward compatibility is preserved via __setstate__, which defaults hysteresis to None for states saved before this change.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -405,6 +405,10 @@
|
|||||||
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
} else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||||
}
|
}
|
||||||
|
if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) {
|
||||||
|
const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>';
|
||||||
|
valueText += ` <span class="threshold-info" style="color:#888">(recovers ${recOp} ${formatValue(alert.recovery_threshold)})</span>`;
|
||||||
|
}
|
||||||
|
|
||||||
// Build actions section
|
// Build actions section
|
||||||
let actionsHtml = '';
|
let actionsHtml = '';
|
||||||
|
|||||||
+27
-5
@@ -57,6 +57,7 @@ class AlertState:
|
|||||||
self.last_notification = None
|
self.last_notification = None
|
||||||
self.threshold_value = None # The threshold value that triggered alert
|
self.threshold_value = None # The threshold value that triggered alert
|
||||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||||
|
self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery
|
||||||
self.formatted_message = None # Formatted display message for UI
|
self.formatted_message = None # Formatted display message for UI
|
||||||
self.acknowledged = False # Whether alert has been acknowledged
|
self.acknowledged = False # Whether alert has been acknowledged
|
||||||
self.acknowledged_at = None # Timestamp when acknowledged
|
self.acknowledged_at = None # Timestamp when acknowledged
|
||||||
@@ -151,7 +152,16 @@ class AlertState:
|
|||||||
result["operator"] = self.operator
|
result["operator"] = self.operator
|
||||||
if self.formatted_message is not None:
|
if self.formatted_message is not None:
|
||||||
result["formatted_message"] = self.formatted_message
|
result["formatted_message"] = self.formatted_message
|
||||||
|
|
||||||
|
# Compute and expose the recovery threshold so the UI can display it
|
||||||
|
if (self.hysteresis and self.threshold_value is not None
|
||||||
|
and self.operator is not None):
|
||||||
|
ha = abs(self.threshold_value * self.hysteresis)
|
||||||
|
if self.operator in ('>', '>='):
|
||||||
|
result["recovery_threshold"] = round(self.threshold_value - ha, 4)
|
||||||
|
elif self.operator in ('<', '<='):
|
||||||
|
result["recovery_threshold"] = round(self.threshold_value + ha, 4)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def __setstate__(self, state):
|
def __setstate__(self, state):
|
||||||
@@ -159,6 +169,8 @@ class AlertState:
|
|||||||
self.__dict__.update(state)
|
self.__dict__.update(state)
|
||||||
if not hasattr(self, 'consecutive_count'):
|
if not hasattr(self, 'consecutive_count'):
|
||||||
self.consecutive_count = 0
|
self.consecutive_count = 0
|
||||||
|
if not hasattr(self, 'hysteresis'):
|
||||||
|
self.hysteresis = None
|
||||||
|
|
||||||
def acknowledge(self):
|
def acknowledge(self):
|
||||||
"""Acknowledge this alert to stop reminder notifications."""
|
"""Acknowledge this alert to stop reminder notifications."""
|
||||||
@@ -546,7 +558,7 @@ class ThresholdChecker:
|
|||||||
critical = threshold_config.get("critical")
|
critical = threshold_config.get("critical")
|
||||||
operator = threshold_config.get("operator", ">")
|
operator = threshold_config.get("operator", ">")
|
||||||
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})")
|
||||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default
|
||||||
enabled = threshold_config.get("enabled", True)
|
enabled = threshold_config.get("enabled", True)
|
||||||
|
|
||||||
if warning is None and critical is None:
|
if warning is None and critical is None:
|
||||||
@@ -649,7 +661,7 @@ class ThresholdChecker:
|
|||||||
warning = rtt_thresholds.get("warning")
|
warning = rtt_thresholds.get("warning")
|
||||||
critical = rtt_thresholds.get("critical")
|
critical = rtt_thresholds.get("critical")
|
||||||
operator = rtt_thresholds.get("operator", ">")
|
operator = rtt_thresholds.get("operator", ">")
|
||||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default
|
||||||
enabled = rtt_thresholds.get("enabled", True)
|
enabled = rtt_thresholds.get("enabled", True)
|
||||||
display = rtt_thresholds.get("display")
|
display = rtt_thresholds.get("display")
|
||||||
count = rtt_thresholds.get("count", 1)
|
count = rtt_thresholds.get("count", 1)
|
||||||
@@ -794,6 +806,12 @@ class ThresholdChecker:
|
|||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
# Keep hysteresis on the state so the UI can show the recovery threshold
|
||||||
|
if new_level != AlertLevel.OK:
|
||||||
|
alert_state.hysteresis = threshold.hysteresis
|
||||||
|
else:
|
||||||
|
alert_state.hysteresis = None
|
||||||
|
|
||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
@@ -876,7 +894,9 @@ class ThresholdChecker:
|
|||||||
threshold_value = threshold.critical
|
threshold_value = threshold.critical
|
||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||||
|
|
||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
@@ -942,7 +962,9 @@ class ThresholdChecker:
|
|||||||
threshold_value = threshold.critical
|
threshold_value = threshold.critical
|
||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
|
alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None
|
||||||
|
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
state_changes.append((metric_path, old_level, new_level, value))
|
state_changes.append((metric_path, old_level, new_level, value))
|
||||||
|
|||||||
Reference in New Issue
Block a user