display tag for alerts, cleanup udp
This commit is contained in:
+102
-15
@@ -53,14 +53,24 @@ class AlertState:
|
||||
self.last_check = time.time()
|
||||
self.notification_count = 0
|
||||
self.last_notification = None
|
||||
self.threshold_value = None # The threshold value that triggered alert
|
||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||
|
||||
def update(self, level: AlertLevel, value: Any) -> bool:
|
||||
def update(
|
||||
self,
|
||||
level: AlertLevel,
|
||||
value: Any,
|
||||
threshold_value: Optional[float] = None,
|
||||
operator: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Update alert state.
|
||||
|
||||
Args:
|
||||
level: New alert level
|
||||
value: Current metric value
|
||||
threshold_value: The threshold value that was exceeded (if applicable)
|
||||
operator: The comparison operator (>, <, >=, etc.)
|
||||
|
||||
Returns:
|
||||
True if state changed (notification needed), False otherwise
|
||||
@@ -69,6 +79,15 @@ class AlertState:
|
||||
self.last_check = now
|
||||
self.last_value = value
|
||||
|
||||
# Update threshold info when alert is active
|
||||
if level != AlertLevel.OK:
|
||||
self.threshold_value = threshold_value
|
||||
self.operator = operator
|
||||
else:
|
||||
# Clear threshold info when returning to OK
|
||||
self.threshold_value = None
|
||||
self.operator = None
|
||||
|
||||
# Check if state changed
|
||||
if level != self.level:
|
||||
logger.info(
|
||||
@@ -87,7 +106,7 @@ class AlertState:
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert alert state to dictionary for serialization."""
|
||||
return {
|
||||
result = {
|
||||
"metric_path": self.metric_path,
|
||||
"level": self.level.name,
|
||||
"since": self.since,
|
||||
@@ -95,6 +114,14 @@ class AlertState:
|
||||
"last_check": self.last_check,
|
||||
"notification_count": self.notification_count,
|
||||
}
|
||||
|
||||
# Include threshold info if available
|
||||
if self.threshold_value is not None:
|
||||
result["threshold_value"] = self.threshold_value
|
||||
if self.operator is not None:
|
||||
result["operator"] = self.operator
|
||||
|
||||
return result
|
||||
|
||||
def __str__(self):
|
||||
return self.to_dict().__str__()
|
||||
@@ -107,6 +134,7 @@ class ThresholdConfig:
|
||||
metric_path: str,
|
||||
warning: Optional[float] = None,
|
||||
critical: Optional[float] = None,
|
||||
display: Optional[str] = None,
|
||||
operator: str = ">",
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
@@ -127,6 +155,7 @@ class ThresholdConfig:
|
||||
self.critical = critical
|
||||
self.enabled = enabled
|
||||
self.hysteresis = hysteresis
|
||||
self.display = display
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
@@ -302,6 +331,7 @@ class ThresholdChecker:
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
display = threshold_config.get("display")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
@@ -316,6 +346,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -345,7 +376,7 @@ class ThresholdChecker:
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
display = threshold_config.get("display")
|
||||
if warning is None and critical is None:
|
||||
continue
|
||||
|
||||
@@ -356,6 +387,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -382,6 +414,7 @@ class ThresholdChecker:
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
||||
@@ -394,6 +427,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -440,14 +474,21 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
||||
return (old_level, new_level)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
return None
|
||||
|
||||
@@ -493,14 +534,21 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||
self._check_nested_metrics(
|
||||
@@ -550,18 +598,26 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(
|
||||
host_name,
|
||||
metric_path,
|
||||
old_level,
|
||||
new_level,
|
||||
value
|
||||
value,
|
||||
threshold
|
||||
)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
def _trigger_notification(
|
||||
self,
|
||||
@@ -570,18 +626,35 @@ class ThresholdChecker:
|
||||
old_level: AlertLevel,
|
||||
new_level: AlertLevel,
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
):
|
||||
"""Trigger a notification for an alert state change."""
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVERED"
|
||||
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
message = f"{metric_path} = {value}"
|
||||
if threshold_value is not None:
|
||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
message = f"{metric_path} = {value}"
|
||||
if threshold_value is not None:
|
||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {value}"
|
||||
@@ -617,6 +690,7 @@ class ThresholdChecker:
|
||||
alert_state: AlertState,
|
||||
metric_path: str,
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
):
|
||||
"""Check if we should send a repeat notification."""
|
||||
if alert_state.level == AlertLevel.OK:
|
||||
@@ -632,8 +706,21 @@ class ThresholdChecker:
|
||||
return
|
||||
|
||||
if (now - alert_state.last_notification) >= self.renotify_interval:
|
||||
# Determine which threshold is active
|
||||
threshold_value = None
|
||||
if alert_state.level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif alert_state.level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Time to re-notify
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
if threshold_value is not None:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
if self.notification_callback:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user