Add count to RTT threshold
This commit is contained in:
@@ -96,6 +96,7 @@ THRESHOLD_DEFAULTS = {
|
|||||||
'rtt': {
|
'rtt': {
|
||||||
'warning': 200,
|
'warning': 200,
|
||||||
'critical': 250.0
|
'critical': 250.0
|
||||||
|
'count': 3 # Optional: number of consecutive breaches before alerting
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+40
-13
@@ -39,11 +39,11 @@ class ComparisonOperator(Enum):
|
|||||||
|
|
||||||
class AlertState:
|
class AlertState:
|
||||||
"""Represents the current alert state for a specific metric."""
|
"""Represents the current alert state for a specific metric."""
|
||||||
|
|
||||||
def __init__(self, metric_path: str):
|
def __init__(self, metric_path: str):
|
||||||
"""
|
"""
|
||||||
Initialize alert state.
|
Initialize alert state.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
||||||
"""
|
"""
|
||||||
@@ -59,6 +59,7 @@ class AlertState:
|
|||||||
self.formatted_message = None # Formatted display message for UI
|
self.formatted_message = None # Formatted display message for UI
|
||||||
self.acknowledged = False # Whether alert has been acknowledged
|
self.acknowledged = False # Whether alert has been acknowledged
|
||||||
self.acknowledged_at = None # Timestamp when acknowledged
|
self.acknowledged_at = None # Timestamp when acknowledged
|
||||||
|
self.consecutive_count = 0 # Consecutive exceedances while still OK (for count gating)
|
||||||
|
|
||||||
def update(
|
def update(
|
||||||
self,
|
self,
|
||||||
@@ -158,7 +159,7 @@ class AlertState:
|
|||||||
|
|
||||||
class ThresholdConfig:
|
class ThresholdConfig:
|
||||||
"""Configuration for a single threshold check."""
|
"""Configuration for a single threshold check."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
metric_path: str,
|
metric_path: str,
|
||||||
@@ -168,10 +169,11 @@ class ThresholdConfig:
|
|||||||
operator: str = ">",
|
operator: str = ">",
|
||||||
hysteresis: float = 0.0,
|
hysteresis: float = 0.0,
|
||||||
enabled: bool = True,
|
enabled: bool = True,
|
||||||
|
count: int = 1,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize threshold configuration.
|
Initialize threshold configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
metric_path: Full path to metric (e.g., "cpu_monitor.cpu_percent")
|
||||||
warning: Warning threshold value
|
warning: Warning threshold value
|
||||||
@@ -179,6 +181,7 @@ class ThresholdConfig:
|
|||||||
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
operator: Comparison operator (>, >=, <, <=, ==, !=)
|
||||||
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
hysteresis: Hysteresis percentage to prevent flapping (0.0-1.0)
|
||||||
enabled: Whether this threshold is enabled
|
enabled: Whether this threshold is enabled
|
||||||
|
count: Number of consecutive exceedances required before alerting (default 1; values below 1 are treated as 1)
|
||||||
"""
|
"""
|
||||||
self.metric_path = metric_path
|
self.metric_path = metric_path
|
||||||
self.warning = warning
|
self.warning = warning
|
||||||
@@ -186,6 +189,7 @@ class ThresholdConfig:
|
|||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.hysteresis = hysteresis
|
self.hysteresis = hysteresis
|
||||||
self.display = display
|
self.display = display
|
||||||
|
self.count = max(1, int(count))
|
||||||
|
|
||||||
# Parse operator
|
# Parse operator
|
||||||
try:
|
try:
|
||||||
@@ -621,11 +625,12 @@ class ThresholdChecker:
|
|||||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||||
enabled = rtt_thresholds.get("enabled", True)
|
enabled = rtt_thresholds.get("enabled", True)
|
||||||
display = rtt_thresholds.get("display")
|
display = rtt_thresholds.get("display")
|
||||||
|
count = rtt_thresholds.get("count", 1)
|
||||||
|
|
||||||
if warning is None and critical is None:
|
if warning is None and critical is None:
|
||||||
logger.warning("No RTT thresholds defined, skipping")
|
logger.warning("No RTT thresholds defined, skipping")
|
||||||
return
|
return
|
||||||
|
|
||||||
threshold = ThresholdConfig(
|
threshold = ThresholdConfig(
|
||||||
metric_path=metric_path,
|
metric_path=metric_path,
|
||||||
warning=warning,
|
warning=warning,
|
||||||
@@ -633,14 +638,16 @@ class ThresholdChecker:
|
|||||||
operator=operator,
|
operator=operator,
|
||||||
hysteresis=hysteresis,
|
hysteresis=hysteresis,
|
||||||
enabled=enabled,
|
enabled=enabled,
|
||||||
display=display
|
display=display,
|
||||||
|
count=count,
|
||||||
)
|
)
|
||||||
|
|
||||||
target_dict[metric_path] = threshold
|
target_dict[metric_path] = threshold
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
"Registered RTT threshold: warn=%s ms, crit=%s ms, count=%d",
|
||||||
warning,
|
warning,
|
||||||
critical
|
critical,
|
||||||
|
count,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||||
@@ -712,14 +719,34 @@ class ThresholdChecker:
|
|||||||
value,
|
value,
|
||||||
alert_state.level
|
alert_state.level
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Apply consecutive-count gating: when currently OK, require threshold.count
|
||||||
|
# consecutive exceedances before escalating to WARNING/CRITICAL.
|
||||||
|
if new_level == AlertLevel.OK:
|
||||||
|
# Value is fine (or recovered) — reset the pending counter immediately.
|
||||||
|
alert_state.consecutive_count = 0
|
||||||
|
elif alert_state.level == AlertLevel.OK and new_level != AlertLevel.OK:
|
||||||
|
# Exceeding the threshold while still OK: count this consecutive exceedance.
|
||||||
|
alert_state.consecutive_count += 1
|
||||||
|
if alert_state.consecutive_count < threshold.count:
|
||||||
|
logger.debug(
|
||||||
|
"RTT threshold exceeded %d/%d consecutive times for %s on %s",
|
||||||
|
alert_state.consecutive_count,
|
||||||
|
threshold.count,
|
||||||
|
metric_path,
|
||||||
|
host_name,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
# Count reached — fire the alert and reset the counter.
|
||||||
|
alert_state.consecutive_count = 0
|
||||||
|
|
||||||
# Determine which threshold was exceeded
|
# Determine which threshold was exceeded
|
||||||
threshold_value = None
|
threshold_value = None
|
||||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||||
threshold_value = threshold.critical
|
threshold_value = threshold.critical
|
||||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||||
threshold_value = threshold.warning
|
threshold_value = threshold.warning
|
||||||
|
|
||||||
# Update state and check for changes
|
# Update state and check for changes
|
||||||
old_level = alert_state.level
|
old_level = alert_state.level
|
||||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||||
@@ -732,7 +759,7 @@ class ThresholdChecker:
|
|||||||
elif new_level != AlertLevel.OK:
|
elif new_level != AlertLevel.OK:
|
||||||
# Check if we should re-notify
|
# Check if we should re-notify
|
||||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
self._check_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
def check_plugin_data(
|
def check_plugin_data(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Reference in New Issue
Block a user