From ba27d2e3005f8ab8826a2ed345755762ce6eeec8 Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Fri, 10 Apr 2026 08:07:35 -0400 Subject: [PATCH] Add count to rtt threshold --- hbd/server/config.py | 3 ++- hbd/server/threshold.py | 53 +++++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/hbd/server/config.py b/hbd/server/config.py index 53a57a4..67e8fdc 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = { 'rtt': { 'warning': 200, - 'critical': 250.0 + 'critical': 250.0, + 'count': 3 # Optional: number of consecutive breaches before alerting } } } diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 031c1c2..281d468 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -39,11 +39,11 @@ class ComparisonOperator(Enum): class AlertState: """Represents the current alert state for a specific metric.""" - + def __init__(self, metric_path: str): """ Initialize alert state. - + Args: metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent") """ @@ -59,6 +59,7 @@ class AlertState: self.formatted_message = None # Formatted display message for UI self.acknowledged = False # Whether alert has been acknowledged self.acknowledged_at = None # Timestamp when acknowledged + self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating) def update( self, @@ -158,7 +159,7 @@ class AlertState: class ThresholdConfig: """Configuration for a single threshold check.""" - + def __init__( self, metric_path: str, @@ -168,10 +169,11 @@ class ThresholdConfig: operator: str = ">", hysteresis: float = 0.0, enabled: bool = True, + count: int = 1, ): """ Initialize threshold configuration. 
- + Args: metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent") warning: Warning threshold value @@ -179,6 +181,7 @@ class ThresholdConfig: operator: Comparison operator (>, >=, <, <=, ==, !=) hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0) enabled: Whether this threshold is enabled + count: Number of consecutive exceedances required before alerting (default 1) """ self.metric_path = metric_path self.warning = warning @@ -186,6 +189,7 @@ class ThresholdConfig: self.enabled = enabled self.hysteresis = hysteresis self.display = display + self.count = max(1, int(count)) # Parse operator try: @@ -621,11 +625,12 @@ class ThresholdChecker: hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default enabled = rtt_thresholds.get("enabled", True) display = rtt_thresholds.get("display") - + count = rtt_thresholds.get("count", 1) + if warning is None and critical is None: logger.warning("No RTT thresholds defined, skipping") return - + threshold = ThresholdConfig( metric_path=metric_path, warning=warning, @@ -633,14 +638,16 @@ class ThresholdChecker: operator=operator, hysteresis=hysteresis, enabled=enabled, - display=display + display=display, + count=count, ) - + target_dict[metric_path] = threshold logger.debug( - "Registered RTT threshold: warn=%s ms, crit=%s ms", + "Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d", warning, - critical + critical, + count, ) def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: @@ -712,14 +719,34 @@ class ThresholdChecker: value, alert_state.level ) - + + # Apply consecutive-count gating: when currently OK, require threshold.count + # consecutive exceedances before escalating to WARNING/CRITICAL. + if new_level == AlertLevel.OK: + # Value is fine (or recovered) — reset the pending counter immediately. + alert_state.consecutive_count = 0 + elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK: + # First time we exceed while still OK: count up. 
+ alert_state.consecutive_count += 1 + if alert_state.consecutive_count < threshold.count: + logger.debug( + "RTT threshold exceeded %d/%d consecutive times for %s on %s", + alert_state.consecutive_count, + threshold.count, + metric_path, + host_name, + ) + return None + # Count reached — fire the alert and reset the counter. + alert_state.consecutive_count = 0 + # Determine which threshold was exceeded threshold_value = None if new_level == AlertLevel.CRITICAL and threshold.critical is not None: threshold_value = threshold.critical elif new_level == AlertLevel.WARNING and threshold.warning is not None: threshold_value = threshold.warning - + # Update state and check for changes old_level = alert_state.level if alert_state.update(new_level, value, threshold_value, threshold.operator.value): @@ -732,7 +759,7 @@ class ThresholdChecker: elif new_level != AlertLevel.OK: # Check if we should re-notify self._check_renotify(host_name, alert_state, metric_path, value, threshold, None) - + return None def check_plugin_data( self,