From de778f680f9d1075b73c13b64ca7818c58297d5f Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Mon, 4 May 2026 14:47:50 -0400 Subject: [PATCH] =?UTF-8?q?fix:=20reduce=20default=20hysteresis=2010%?= =?UTF-8?q?=E2=86=922%;=20show=20recovery=20threshold=20in=20alerts=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 10% default hysteresis created an unreasonably wide recovery band: a 95% threshold would only clear once the value dropped below 85.5%, causing alerts to linger long after the metric was well below the trigger level. Change default hysteresis to 2% across all threshold parsers (plugin metrics, partitions, RTT). For a 95% threshold, recovery is now at 93.1% instead of 85.5%. Add AlertState.hysteresis field (set on every check, cleared on OK) and expose recovery_threshold in to_dict() so the Alerts dashboard can display "recovers < 93.1" alongside the trigger threshold, making the hysteresis band visible to the user. Pickle backward-compatible via __setstate__. Co-Authored-By: Claude Sonnet 4.6 --- hbd/server/templates/alerts.html | 4 ++++ hbd/server/threshold.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/hbd/server/templates/alerts.html b/hbd/server/templates/alerts.html index 1769cc2..493ff1d 100644 --- a/hbd/server/templates/alerts.html +++ b/hbd/server/templates/alerts.html @@ -405,6 +405,10 @@ } else if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) { valueText += ` (threshold: ${alert.operator} ${formatValue(alert.threshold_value)})`; } + if (alert.recovery_threshold !== undefined && alert.recovery_threshold !== null) { + const recOp = (alert.operator === '>' || alert.operator === '>=') ? '<' : '>'; + valueText += ` (recovers ${recOp} ${formatValue(alert.recovery_threshold)})`; + } // Build actions section let actionsHtml = ''; diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 9d86a00..66c6619 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -57,6 +57,7 @@ class AlertState: self.last_notification = None self.threshold_value = None # The threshold value that triggered alert self.operator = None # The comparison operator (>, <, >=, etc.) + self.hysteresis: Optional[float] = None # Hysteresis fraction used for recovery self.formatted_message = None # Formatted display message for UI self.acknowledged = False # Whether alert has been acknowledged self.acknowledged_at = None # Timestamp when acknowledged @@ -151,7 +152,16 @@ class AlertState: result["operator"] = self.operator if self.formatted_message is not None: result["formatted_message"] = self.formatted_message - + + # Compute and expose the recovery threshold so the UI can display it + if (self.hysteresis and self.threshold_value is not None + and self.operator is not None): + ha = abs(self.threshold_value * self.hysteresis) + if self.operator in ('>', '>='): + result["recovery_threshold"] = round(self.threshold_value - ha, 4) + elif self.operator in ('<', '<='): + result["recovery_threshold"] = round(self.threshold_value + ha, 4) + return result def __setstate__(self, state): @@ -159,6 +169,8 @@ class AlertState: self.__dict__.update(state) if not hasattr(self, 'consecutive_count'): self.consecutive_count = 0 + if not hasattr(self, 'hysteresis'): + self.hysteresis = None def acknowledge(self): """Acknowledge this alert to stop reminder notifications.""" @@ -546,7 +558,7 @@ class ThresholdChecker: critical = threshold_config.get("critical") operator = threshold_config.get("operator", ">") display = threshold_config.get("display", "(threshold: {op_symbol} {threshold_value})") - hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default + hysteresis = threshold_config.get("hysteresis", 0.02) # 2% default enabled = threshold_config.get("enabled", True) if warning is None and critical is None: @@ -649,7 +661,7 @@ class ThresholdChecker: warning = rtt_thresholds.get("warning") critical = rtt_thresholds.get("critical") operator = rtt_thresholds.get("operator", ">") - hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default + hysteresis = rtt_thresholds.get("hysteresis", 0.02) # 2% default enabled = rtt_thresholds.get("enabled", True) display = rtt_thresholds.get("display") count = rtt_thresholds.get("count", 1) @@ -794,6 +806,12 @@ class ThresholdChecker: elif new_level == AlertLevel.WARNING and threshold.warning is not None: threshold_value = threshold.warning + # Keep hysteresis on the state so the UI can show the recovery threshold + if new_level != AlertLevel.OK: + alert_state.hysteresis = threshold.hysteresis + else: + alert_state.hysteresis = None + # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): @@ -876,7 +894,9 @@ class ThresholdChecker: threshold_value = threshold.critical elif new_level == AlertLevel.WARNING and threshold.warning is not None: threshold_value = threshold.warning - + + alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None + # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): @@ -942,7 +962,9 @@ class ThresholdChecker: threshold_value = threshold.critical elif new_level == AlertLevel.WARNING and threshold.warning is not None: threshold_value = threshold.warning - + + alert_state.hysteresis = threshold.hysteresis if new_level != AlertLevel.OK else None + old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): state_changes.append((metric_path, old_level, new_level, value))