Add count to rtt threshold

This commit is contained in:
Andreas Wrede
2026-04-10 08:07:35 -04:00
parent 381e37efce
commit ba27d2e300
2 changed files with 41 additions and 13 deletions
+1
View File
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
'rtt': {
'warning': 200,
'critical': 250.0,
'count': 3 # Optional: number of consecutive breaches before alerting
}
}
}
+30 -3
View File
@@ -59,6 +59,7 @@ class AlertState:
self.formatted_message = None # Formatted display message for UI
self.acknowledged = False # Whether alert has been acknowledged
self.acknowledged_at = None # Timestamp when acknowledged
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
def update(
self,
@@ -168,6 +169,7 @@ class ThresholdConfig:
operator: str = ">",
hysteresis: float = 0.0,
enabled: bool = True,
count: int = 1,
):
"""
Initialize threshold configuration.
@@ -179,6 +181,7 @@ class ThresholdConfig:
operator: Comparison operator (>, >=, <, <=, ==, !=)
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
enabled: Whether this threshold is enabled
count: Number of consecutive exceedances required before alerting (default 1)
"""
self.metric_path = metric_path
self.warning = warning
@@ -186,6 +189,7 @@ class ThresholdConfig:
self.enabled = enabled
self.hysteresis = hysteresis
self.display = display
self.count = max(1, int(count))
# Parse operator
try:
@@ -621,6 +625,7 @@ class ThresholdChecker:
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1)
if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping")
@@ -633,14 +638,16 @@ class ThresholdChecker:
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
display=display,
count=count,
)
target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold: warn=%s ms, crit=%s ms",
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
warning,
critical
critical,
count,
)
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
@@ -713,6 +720,26 @@ class ThresholdChecker:
alert_state.level
)
# Apply consecutive-count gating: when currently OK, require threshold.count
# consecutive exceedances before escalating to WARNING/CRITICAL.
if new_level == AlertLevel.OK:
# Value is fine (or recovered) — reset the pending counter immediately.
alert_state.consecutive_count = 0
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
# Value exceeds a threshold while the alert is still OK: count this consecutive breach.
alert_state.consecutive_count += 1
if alert_state.consecutive_count < threshold.count:
logger.debug(
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
alert_state.consecutive_count,
threshold.count,
metric_path,
host_name,
)
return None
# Count reached — fire the alert and reset the counter.
alert_state.consecutive_count = 0
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None: