Fix RTT handling, including a bug in the time computation
This commit is contained in:
+64
-58
@@ -275,7 +275,6 @@ class ThresholdChecker:
|
||||
def __init__(
|
||||
self,
|
||||
config: Dict[str, Any],
|
||||
notification_callback: Optional[Callable] = None,
|
||||
renotify_interval: int = 3600,
|
||||
journal: Optional[Any] = None,
|
||||
):
|
||||
@@ -284,7 +283,6 @@ class ThresholdChecker:
|
||||
|
||||
Args:
|
||||
config: Threshold configuration dictionary from YAML
|
||||
notification_callback: Function to call for notifications
|
||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||
journal: Optional MessageJournal instance for logging threshold events
|
||||
"""
|
||||
@@ -300,7 +298,6 @@ class ThresholdChecker:
|
||||
# Default config name to use when no mapping exists
|
||||
self.default_config = "default"
|
||||
|
||||
self.notification_callback = notification_callback
|
||||
self.renotify_interval = renotify_interval
|
||||
self.journal = journal
|
||||
|
||||
@@ -367,8 +364,20 @@ class ThresholdChecker:
|
||||
target_dict=self.threshold_configs[config_name]
|
||||
)
|
||||
|
||||
# Parse host to config mapping
|
||||
self.host_config_mapping = config.get("host_threshold_mapping", {})
|
||||
# Parse host to config mapping from two possible sources
|
||||
# 1. New format: hosts section with threshold_config attribute
|
||||
if "hosts" in config:
|
||||
hosts_config = config["hosts"]
|
||||
if isinstance(hosts_config, dict):
|
||||
for host_name, host_attrs in hosts_config.items():
|
||||
if isinstance(host_attrs, dict) and "threshold_config" in host_attrs:
|
||||
self.host_config_mapping[host_name] = host_attrs["threshold_config"]
|
||||
|
||||
# 2. Legacy format: host_threshold_mapping section (for backward compatibility)
|
||||
if "host_threshold_mapping" in config:
|
||||
legacy_mapping = config.get("host_threshold_mapping", {})
|
||||
if isinstance(legacy_mapping, dict):
|
||||
self.host_config_mapping.update(legacy_mapping)
|
||||
|
||||
# Set default config (first one alphabetically or explicitly set)
|
||||
self.default_config = config.get("default_threshold_config", "default")
|
||||
@@ -513,14 +522,13 @@ class ThresholdChecker:
|
||||
rtt_thresholds: Dict[str, Any],
|
||||
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||
):
|
||||
"""Parse RTT thresholds (per-host network latency thresholds).
|
||||
"""Parse RTT thresholds (network latency thresholds).
|
||||
|
||||
RTT thresholds are configured as:
|
||||
thresholds:
|
||||
rtt:
|
||||
hostname1:
|
||||
warning: 100.0 # ms
|
||||
critical: 500.0 # ms
|
||||
warning: 100.0 # ms
|
||||
critical: 500.0 # ms
|
||||
|
||||
Args:
|
||||
rtt_thresholds: RTT threshold configuration
|
||||
@@ -529,41 +537,39 @@ class ThresholdChecker:
|
||||
if target_dict is None:
|
||||
target_dict = self.thresholds
|
||||
|
||||
for hostname, threshold_config in rtt_thresholds.items():
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
|
||||
# Metric path is "rtt.<hostname>"
|
||||
metric_path = f"rtt.{hostname}"
|
||||
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
||||
continue
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
||||
hostname,
|
||||
warning,
|
||||
critical
|
||||
)
|
||||
if not isinstance(rtt_thresholds, dict):
|
||||
return
|
||||
|
||||
# Metric path is simply "rtt" (not per-host)
|
||||
metric_path = "rtt"
|
||||
|
||||
warning = rtt_thresholds.get("warning")
|
||||
critical = rtt_thresholds.get("critical")
|
||||
operator = rtt_thresholds.get("operator", ">")
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined, skipping")
|
||||
return
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
||||
warning,
|
||||
critical
|
||||
)
|
||||
|
||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||
"""Get the appropriate threshold configuration for a host.
|
||||
@@ -887,12 +893,12 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
):
|
||||
"""Send notification and log to journal/eventlog."""
|
||||
if self.notification_callback is not None:
|
||||
try:
|
||||
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
# Send notification using host-specific channels
|
||||
try:
|
||||
notify_mod.pushmsg_for_host(host_name, f"{lvl}: {host_name} - {message}")
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
|
||||
# Log to journal
|
||||
if self.journal is not None:
|
||||
@@ -1017,14 +1023,14 @@ class ThresholdChecker:
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
if self.notification_callback:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send re-notification: %s", e)
|
||||
# Send re-notification using host-specific channels
|
||||
try:
|
||||
notify_mod.pushmsg_for_host(host_name, message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send re-notification: %s", e)
|
||||
|
||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user