Add count to rtt threshold
This commit is contained in:
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
|
|||||||
'rtt': {
|
'rtt': {
|
||||||
'warning': 200,
|
'warning': 200,
|
||||||
'critical': 250.0
|
'critical': 250.0
|
||||||
|
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+30
-3
@@ -59,6 +59,7 @@ class AlertState:
|
|||||||
self.formatted_message = None # Formatted display message for UI
|
self.formatted_message = None # Formatted display message for UI
|
||||||
self.acknowledged = False # Whether alert has been acknowledged
|
self.acknowledged = False # Whether alert has been acknowledged
|
||||||
self.acknowledged_at = None # Timestamp when acknowledged
|
self.acknowledged_at = None # Timestamp when acknowledged
|
||||||
|
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
@@ -168,6 +169,7 @@ class ThresholdConfig:
|
|||||||
operator: str = ">",
|
operator: str = ">",
|
||||||
hysteresis: float = 0.0,
|
hysteresis: float = 0.0,
|
||||||
enabled: bool = True,
|
enabled: bool = True,
|
||||||
|
count: int = 1,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize threshold configuration.
|
Initialize threshold configuration.
|
||||||
@@ -179,6 +181,7 @@ class ThresholdConfig:
|
|||||||
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
||||||
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
||||||
enabled: Whether this threshold is enabled
|
enabled: Whether this threshold is enabled
|
||||||
|
count: Number of consecutive exceedances required before alerting (default 1)
|
||||||
"""
|
"""
|
||||||
self.metric_path = metric_path
|
self.metric_path = metric_path
|
||||||
self.warning = warning
|
self.warning = warning
|
||||||
@@ -186,6 +189,7 @@ class ThresholdConfig:
|
|||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.hysteresis = hysteresis
|
self.hysteresis = hysteresis
|
||||||
self.display = display
|
self.display = display
|
||||||
|
self.count = max(1, int(count))
|
||||||
|
|
||||||
# Parse operator
|
# Parse operator
|
||||||
try:
|
try:
|
||||||
@@ -621,6 +625,7 @@ class ThresholdChecker:
|
|||||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||||
enabled = rtt_thresholds.get("enabled", True)
|
enabled = rtt_thresholds.get("enabled", True)
|
||||||
display = rtt_thresholds.get("display")
|
display = rtt_thresholds.get("display")
|
||||||
|
count = rtt_thresholds.get("count", 1)
|
||||||
|
|
||||||
if warning is None and critical is None:
|
if warning is None and critical is None:
|
||||||
logger.warning("No RTT thresholds defined, skipping")
|
logger.warning("No RTT thresholds defined, skipping")
|
||||||
@@ -633,14 +638,16 @@ class ThresholdChecker:
|
|||||||
operator=operator,
|
operator=operator,
|
||||||
hysteresis=hysteresis,
|
hysteresis=hysteresis,
|
||||||
enabled=enabled,
|
enabled=enabled,
|
||||||
display=display
|
display=display,
|
||||||
|
count=count,
|
||||||
)
|
)
|
||||||
|
|
||||||
target_dict[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
|
||||||
warning,
|
warning,
|
||||||
critical
|
critical,
|
||||||
|
count,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||||
@@ -713,6 +720,26 @@ class ThresholdChecker:
|
|||||||
alert_state.level
|
alert_state.level
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Apply consecutive-count gating: when currently OK, require threshold.count
|
||||||
|
# consecutive exceedances before escalating to WARNING/CRITICAL.
|
||||||
|
if new_level == AlertLevel.OK:
|
||||||
|
# Value is fine (or recovered) — reset the pending counter immediately.
|
||||||
|
alert_state.consecutive_count = 0
|
||||||
|
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
|
||||||
|
# Value exceeds a threshold while the alert is still OK: count this breach toward threshold.count.
|
||||||
|
alert_state.consecutive_count += 1
|
||||||
|
if alert_state.consecutive_count < threshold.count:
|
||||||
|
logger.debug(
|
||||||
|
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
|
||||||
|
alert_state.consecutive_count,
|
||||||
|
threshold.count,
|
||||||
|
metric_path,
|
||||||
|
host_name,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
# Count reached — fire the alert and reset the counter.
|
||||||
|
alert_state.consecutive_count = 0
|
||||||
|
|
||||||
# Determine which threshold was exceeded
|
# Determine which threshold was exceeded
|
||||||
threshold_value = None
|
threshold_value = None
|
||||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user