Add count to rtt threshold

This commit is contained in:
Andreas Wrede
2026-04-10 08:07:35 -04:00
parent 381e37efce
commit ba27d2e300
2 changed files with 41 additions and 13 deletions
+1
View File
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
'rtt': {
'warning': 200,
'critical': 250.0
'count': 3 # Optional: number of consecutive breaches before alerting
}
}
}
+40 -13
View File
@@ -39,11 +39,11 @@ class ComparisonOperator(Enum):
class AlertState:
"""Represents the current alert state for a specific metric."""
def __init__(self, metric_path: str):
"""
Initialize alert state.
Args:
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
"""
@@ -59,6 +59,7 @@ class AlertState:
self.formatted_message = None # Formatted display message for UI
self.acknowledged = False # Whether alert has been acknowledged
self.acknowledged_at = None # Timestamp when acknowledged
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
def update(
self,
@@ -158,7 +159,7 @@ class AlertState:
class ThresholdConfig:
"""Configuration for a single threshold check."""
def __init__(
self,
metric_path: str,
@@ -168,10 +169,11 @@ class ThresholdConfig:
operator: str = ">",
hysteresis: float = 0.0,
enabled: bool = True,
count: int = 1,
):
"""
Initialize threshold configuration.
Args:
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
warning: Warning threshold value
@@ -179,6 +181,7 @@ class ThresholdConfig:
operator: Comparison operator (>, >=, <, <=, ==, !=)
hysteresis: Hysteresis as a fraction of the threshold (0.0-1.0, e.g. 0.1 = 10%) to prevent flapping
enabled: Whether this threshold is enabled
count: Number of consecutive exceedances required before alerting (default 1; values below 1 are clamped to 1)
"""
self.metric_path = metric_path
self.warning = warning
@@ -186,6 +189,7 @@ class ThresholdConfig:
self.enabled = enabled
self.hysteresis = hysteresis
self.display = display
self.count = max(1, int(count))
# Parse operator
try:
@@ -621,11 +625,12 @@ class ThresholdChecker:
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display")
count = rtt_thresholds.get("count", 1)
if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping")
return
threshold = ThresholdConfig(
metric_path=metric_path,
warning=warning,
@@ -633,14 +638,16 @@ class ThresholdChecker:
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
display=display,
count=count,
)
target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold: warn=%s ms, crit=%s ms",
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
warning,
critical
critical,
count,
)
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
@@ -712,14 +719,34 @@ class ThresholdChecker:
value,
alert_state.level
)
# Apply consecutive-count gating: when currently OK, require threshold.count
# consecutive exceedances before escalating to WARNING/CRITICAL.
if new_level == AlertLevel.OK:
# Value is fine (or recovered) — reset the pending counter immediately.
alert_state.consecutive_count = 0
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
# Exceeding while the state is still OK: count this breach toward threshold.count.
alert_state.consecutive_count += 1
if alert_state.consecutive_count < threshold.count:
logger.debug(
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
alert_state.consecutive_count,
threshold.count,
metric_path,
host_name,
)
return None
# Count reached — fire the alert and reset the counter.
alert_state.consecutive_count = 0
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
@@ -732,7 +759,7 @@ class ThresholdChecker:
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
return None
def check_plugin_data(
self,