Fix RTT handling, including a bug in the time computation

This commit is contained in:
Andreas Wrede
2026-04-01 19:41:53 -04:00
parent 090d341244
commit 460d2be9e9
13 changed files with 1366 additions and 372 deletions
+64 -58
View File
@@ -275,7 +275,6 @@ class ThresholdChecker:
def __init__(
self,
config: Dict[str, Any],
notification_callback: Optional[Callable] = None,
renotify_interval: int = 3600,
journal: Optional[Any] = None,
):
@@ -284,7 +283,6 @@ class ThresholdChecker:
Args:
config: Threshold configuration dictionary from YAML
notification_callback: Function to call for notifications
renotify_interval: Seconds between repeat notifications (default: 1 hour)
journal: Optional MessageJournal instance for logging threshold events
"""
@@ -300,7 +298,6 @@ class ThresholdChecker:
# Default config name to use when no mapping exists
self.default_config = "default"
self.notification_callback = notification_callback
self.renotify_interval = renotify_interval
self.journal = journal
@@ -367,8 +364,20 @@ class ThresholdChecker:
target_dict=self.threshold_configs[config_name]
)
# Parse host to config mapping
self.host_config_mapping = config.get("host_threshold_mapping", {})
# Parse host to config mapping from two possible sources
# 1. New format: hosts section with threshold_config attribute
if "hosts" in config:
hosts_config = config["hosts"]
if isinstance(hosts_config, dict):
for host_name, host_attrs in hosts_config.items():
if isinstance(host_attrs, dict) and "threshold_config" in host_attrs:
self.host_config_mapping[host_name] = host_attrs["threshold_config"]
# 2. Legacy format: host_threshold_mapping section (for backward compatibility)
if "host_threshold_mapping" in config:
legacy_mapping = config.get("host_threshold_mapping", {})
if isinstance(legacy_mapping, dict):
self.host_config_mapping.update(legacy_mapping)
# Set default config (first one alphabetically or explicitly set)
self.default_config = config.get("default_threshold_config", "default")
@@ -513,14 +522,13 @@ class ThresholdChecker:
rtt_thresholds: Dict[str, Any],
target_dict: Optional[Dict[str, ThresholdConfig]] = None
):
"""Parse RTT thresholds (per-host network latency thresholds).
"""Parse RTT thresholds (network latency thresholds).
RTT thresholds are configured as:
thresholds:
rtt:
hostname1:
warning: 100.0 # ms
critical: 500.0 # ms
warning: 100.0 # ms
critical: 500.0 # ms
Args:
rtt_thresholds: RTT threshold configuration
@@ -529,41 +537,39 @@ class ThresholdChecker:
if target_dict is None:
target_dict = self.thresholds
for hostname, threshold_config in rtt_thresholds.items():
if not isinstance(threshold_config, dict):
continue
# Metric path is "rtt.<hostname>"
metric_path = f"rtt.{hostname}"
warning = threshold_config.get("warning")
critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display")
if warning is None and critical is None:
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
continue
threshold = ThresholdConfig(
metric_path=metric_path,
warning=warning,
critical=critical,
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
)
target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
hostname,
warning,
critical
)
if not isinstance(rtt_thresholds, dict):
return
# Metric path is simply "rtt" (not per-host)
metric_path = "rtt"
warning = rtt_thresholds.get("warning")
critical = rtt_thresholds.get("critical")
operator = rtt_thresholds.get("operator", ">")
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
enabled = rtt_thresholds.get("enabled", True)
display = rtt_thresholds.get("display")
if warning is None and critical is None:
logger.warning("No RTT thresholds defined, skipping")
return
threshold = ThresholdConfig(
metric_path=metric_path,
warning=warning,
critical=critical,
operator=operator,
hysteresis=hysteresis,
enabled=enabled,
display=display
)
target_dict[metric_path] = threshold
logger.debug(
"Registered RTT threshold: warn=%s ms, crit=%s ms",
warning,
critical
)
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
"""Get the appropriate threshold configuration for a host.
@@ -887,12 +893,12 @@ class ThresholdChecker:
value: Any,
):
"""Send notification and log to journal/eventlog."""
if self.notification_callback is not None:
try:
self.notification_callback(f"{lvl}: {host_name} - {message}")
logger.info("Notification sent: %s", message)
except Exception as e:
logger.error("Failed to send notification: %s", e)
# Send notification using host-specific channels
try:
notify_mod.pushmsg_for_host(host_name, f"{lvl}: {host_name} - {message}")
logger.info("Notification sent: %s", message)
except Exception as e:
logger.error("Failed to send notification: %s", e)
# Log to journal
if self.journal is not None:
@@ -1017,14 +1023,14 @@ class ThresholdChecker:
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
if self.notification_callback:
try:
self.notification_callback(message)
alert_state.last_notification = now
alert_state.notification_count += 1
logger.info("Re-notification sent: %s", message)
except Exception as e:
logger.error("Failed to send re-notification: %s", e)
# Send re-notification using host-specific channels
try:
notify_mod.pushmsg_for_host(host_name, message)
alert_state.last_notification = now
alert_state.notification_count += 1
logger.info("Re-notification sent: %s", message)
except Exception as e:
logger.error("Failed to send re-notification: %s", e)
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
"""