Add count to rtt threshold
This commit is contained in:
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
|
||||
'rtt': {
|
||||
'warning': 200,
|
||||
'critical': 250.0,
|
||||
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+30
-3
@@ -59,6 +59,7 @@ class AlertState:
|
||||
self.formatted_message = None # Formatted display message for UI
|
||||
self.acknowledged = False # Whether alert has been acknowledged
|
||||
self.acknowledged_at = None # Timestamp when acknowledged
|
||||
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||
|
||||
def update(
|
||||
self,
|
||||
@@ -168,6 +169,7 @@ class ThresholdConfig:
|
||||
operator: str = ">",
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
count: int = 1,
|
||||
):
|
||||
"""
|
||||
Initialize threshold configuration.
|
||||
@@ -179,6 +181,7 @@ class ThresholdConfig:
|
||||
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
||||
hysteresis: Hysteresis fraction to prevent flapping (0.0-1.0, e.g. 0.1 = 10%)
|
||||
enabled: Whether this threshold is enabled
|
||||
count: Number of consecutive exceedances required before alerting (default 1; values below 1 are clamped to 1)
|
||||
"""
|
||||
self.metric_path = metric_path
|
||||
self.warning = warning
|
||||
@@ -186,6 +189,7 @@ class ThresholdConfig:
|
||||
self.enabled = enabled
|
||||
self.hysteresis = hysteresis
|
||||
self.display = display
|
||||
self.count = max(1, int(count))
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
@@ -621,6 +625,7 @@ class ThresholdChecker:
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
count = rtt_thresholds.get("count", 1)
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined, skipping")
|
||||
@@ -633,14 +638,16 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
display=display,
|
||||
count=count,
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
|
||||
warning,
|
||||
critical
|
||||
critical,
|
||||
count,
|
||||
)
|
||||
|
||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||
@@ -713,6 +720,26 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Apply consecutive-count gating: when currently OK, require threshold.count
|
||||
# consecutive exceedances before escalating to WARNING/CRITICAL.
|
||||
if new_level == AlertLevel.OK:
|
||||
# Value is fine (or recovered) — reset the pending counter immediately.
|
||||
alert_state.consecutive_count = 0
|
||||
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
|
||||
# Exceeded while still OK: count this breach toward the required consecutive total.
|
||||
alert_state.consecutive_count += 1
|
||||
if alert_state.consecutive_count < threshold.count:
|
||||
logger.debug(
|
||||
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
|
||||
alert_state.consecutive_count,
|
||||
threshold.count,
|
||||
metric_path,
|
||||
host_name,
|
||||
)
|
||||
return None
|
||||
# Count reached — fire the alert and reset the counter.
|
||||
alert_state.consecutive_count = 0
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
|
||||
Reference in New Issue
Block a user