Add consecutive-breach count to RTT threshold

This commit is contained in:
Andreas Wrede
2026-04-10 08:07:35 -04:00
parent 381e37efce
commit ba27d2e300
2 changed files with 41 additions and 13 deletions
+1
View File
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
'rtt': { 'rtt': {
'warning': 200, 'warning': 200,
'critical': 250.0 'critical': 250.0
'count': 3 # Optional: number of consecutive breaches before alerting
} }
} }
} }
+30 -3
View File
@@ -59,6 +59,7 @@ class AlertState:
self.formatted_message = None # Formatted display message for UI self.formatted_message = None # Formatted display message for UI
self.acknowledged = False # Whether alert has been acknowledged self.acknowledged = False # Whether alert has been acknowledged
self.acknowledged_at = None # Timestamp when acknowledged self.acknowledged_at = None # Timestamp when acknowledged
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
def update( def update(
self, self,
@@ -168,6 +169,7 @@ class ThresholdConfig:
operator: str = ">", operator: str = ">",
hysteresis: float = 0.0, hysteresis: float = 0.0,
enabled: bool = True, enabled: bool = True,
count: int = 1,
): ):
""" """
Initialize threshold configuration. Initialize threshold configuration.
@@ -179,6 +181,7 @@ class ThresholdConfig:
operator: Comparison operator (>, >=, <, <=, ==, !=) operator: Comparison operator (>, >=, <, <=, ==, !=)
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0) hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
enabled: Whether this threshold is enabled enabled: Whether this threshold is enabled
count: Number of consecutive exceedances required before alerting (default 1)
""" """
self.metric_path = metric_path self.metric_path = metric_path
self.warning = warning self.warning = warning
@@ -186,6 +189,7 @@ class ThresholdConfig:
self.enabled = enabled self.enabled = enabled
self.hysteresis = hysteresis self.hysteresis = hysteresis
self.display = display self.display = display
self.count = max(1, int(count))
# Parse operator # Parse operator
try: try:
@@ -621,6 +625,7 @@ class ThresholdChecker:
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
enabled = rtt_thresholds.get("enabled", True) enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display") display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1)
if warning is None and critical is None: if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping") logger.warning("No RTT thresholds defined, skipping")
@@ -633,14 +638,16 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display display=display,
count=count,
) )
target_dict[metric_path] = threshold target_dict[metric_path] = threshold
logger.debug( logger.debug(
"Registered RTT threshold: warn=%s ms, crit=%s ms", "Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
warning, warning,
critical critical,
count,
) )
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
@@ -713,6 +720,26 @@ class ThresholdChecker:
alert_state.level alert_state.level
) )
# Apply consecutive-count gating: when currently OK, require threshold.count
# consecutive exceedances before escalating to WARNING/CRITICAL.
if new_level == AlertLevel.OK:
# Value is fine (or recovered) — reset the pending counter immediately.
alert_state.consecutive_count = 0
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
# Exceeded while the alert is still OK: count this consecutive breach.
alert_state.consecutive_count += 1
if alert_state.consecutive_count < threshold.count:
logger.debug(
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
alert_state.consecutive_count,
threshold.count,
metric_path,
host_name,
)
return None
# Count reached — fire the alert and reset the counter.
alert_state.consecutive_count = 0
# Determine which threshold was exceeded # Determine which threshold was exceeded
threshold_value = None threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None: if new_level == AlertLevel.CRITICAL and threshold.critical is not None: