display tag for alerts, cleanup udp

This commit is contained in:
Andreas Wrede
2026-04-01 11:49:55 -04:00
parent dd23d9d163
commit 079e84f729
15 changed files with 277 additions and 540 deletions
+102 -15
View File
@@ -53,14 +53,24 @@ class AlertState:
self.last_check = time.time()
self.notification_count = 0
self.last_notification = None
self.threshold_value = None # The threshold value that triggered alert
self.operator = None # The comparison operator (>, <, >=, etc.)
def update(self, level: AlertLevel, value: Any) -> bool:
def update(
self,
level: AlertLevel,
value: Any,
threshold_value: Optional[float] = None,
operator: Optional[str] = None
) -> bool:
"""
Update alert state.
Args:
level: New alert level
value: Current metric value
threshold_value: The threshold value that was exceeded (if applicable)
operator: The comparison operator (>, <, >=, etc.)
Returns:
True if state changed (notification needed), False otherwise
@@ -69,6 +79,15 @@ class AlertState:
self.last_check = now
self.last_value = value
# Update threshold info when alert is active
if level != AlertLevel.OK:
self.threshold_value = threshold_value
self.operator = operator
else:
# Clear threshold info when returning to OK
self.threshold_value = None
self.operator = None
# Check if state changed
if level != self.level:
logger.info(
@@ -87,7 +106,7 @@ class AlertState:
def to_dict(self) -> dict:
"""Convert alert state to dictionary for serialization."""
return {
result = {
"metric_path": self.metric_path,
"level": self.level.name,
"since": self.since,
@@ -95,6 +114,14 @@ class AlertState:
"last_check": self.last_check,
"notification_count": self.notification_count,
}
# Include threshold info if available
if self.threshold_value is not None:
result["threshold_value"] = self.threshold_value
if self.operator is not None:
result["operator"] = self.operator
return result
def __str__(self):
return self.to_dict().__str__()
@@ -107,6 +134,7 @@ class ThresholdConfig:
metric_path: str,
warning: Optional[float] = None,
critical: Optional[float] = None,
display: Optional[str] = None,
operator: str = ">",
hysteresis: float = 0.0,
enabled: bool = True,
@@ -127,6 +155,7 @@ class ThresholdConfig:
self.critical = critical
self.enabled = enabled
self.hysteresis = hysteresis
self.display = display
# Parse operator
try:
@@ -302,6 +331,7 @@ class ThresholdChecker:
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
display = threshold_config.get("display")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
@@ -316,6 +346,7 @@ class ThresholdChecker:
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
)
self.thresholds[metric_path] = threshold
@@ -345,7 +376,7 @@ class ThresholdChecker:
operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1)
enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display")
if warning is None and critical is None:
continue
@@ -356,6 +387,7 @@ class ThresholdChecker:
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
)
self.thresholds[metric_path] = threshold
@@ -382,6 +414,7 @@ class ThresholdChecker:
operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display")
if warning is None and critical is None:
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
@@ -394,6 +427,7 @@ class ThresholdChecker:
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
)
self.thresholds[metric_path] = threshold
@@ -440,14 +474,21 @@ class ThresholdChecker:
alert_state.level
)
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value):
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
return (old_level, new_level)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value)
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
return None
@@ -493,14 +534,21 @@ class ThresholdChecker:
alert_state.level
)
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Update state and check for changes
old_level = alert_state.level
if alert_state.update(new_level, value):
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
elif new_level != AlertLevel.OK:
# Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value)
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
# Check nested metrics (e.g., partition data in disk_monitor)
self._check_nested_metrics(
@@ -550,18 +598,26 @@ class ThresholdChecker:
alert_state.level
)
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
old_level = alert_state.level
if alert_state.update(new_level, value):
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification(
host_name,
metric_path,
old_level,
new_level,
value
value,
threshold
)
elif new_level != AlertLevel.OK:
self._check_renotify(host_name, alert_state, metric_path, value)
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
def _trigger_notification(
self,
@@ -570,18 +626,35 @@ class ThresholdChecker:
old_level: AlertLevel,
new_level: AlertLevel,
value: Any,
threshold: ThresholdConfig,
):
"""Trigger a notification for an alert state change."""
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Format operator symbol
op_symbol = threshold.operator.value
# Format message
if new_level == AlertLevel.OK:
lvl = "RECOVERED"
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
message = f"{metric_path} = {value}"
if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
else:
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
message = f"{metric_path} = {value}"
if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
else:
message = f"{metric_path} = {value}"
else:
lvl = "UNKNOWN"
message = f"{metric_path} = {value}"
@@ -617,6 +690,7 @@ class ThresholdChecker:
alert_state: AlertState,
metric_path: str,
value: Any,
threshold: ThresholdConfig,
):
"""Check if we should send a repeat notification."""
if alert_state.level == AlertLevel.OK:
@@ -632,8 +706,21 @@ class ThresholdChecker:
return
if (now - alert_state.last_notification) >= self.renotify_interval:
# Determine which threshold is active
threshold_value = None
if alert_state.level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif alert_state.level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Format operator symbol
op_symbol = threshold.operator.value
# Time to re-notify
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
if threshold_value is not None:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
if self.notification_callback:
try: