Fix rtt, including bug in time compute
This commit is contained in:
+165
-14
@@ -21,10 +21,9 @@ SERVER_DEFAULTS = {
|
||||
"logfile": "/var/log/heartbeat.log",
|
||||
"logfmt": "text", # text or msg or json
|
||||
|
||||
# Notification settings
|
||||
"pushsrv": "pushover", # pushover, mattermost, or all
|
||||
"pushover_token": "",
|
||||
"pushover_user": "",
|
||||
# Notification channels
|
||||
"notification_channels": {}, # Named channels with type and credentials
|
||||
"default_notification_channels": [], # Default channels if host doesn't specify
|
||||
|
||||
# Monitoring settings
|
||||
"interval": 20, # Expected heartbeat interval (for server checks)
|
||||
@@ -32,22 +31,15 @@ SERVER_DEFAULTS = {
|
||||
"threshold_renotify_interval": 3600, # Seconds between threshold re-notifications
|
||||
|
||||
# Host management
|
||||
"watchhosts": [], # Hosts to monitor and notify about
|
||||
"dyndnshosts": [], # Hosts with dynamic DNS
|
||||
"hosts": {}, # New unified host definitions (optional)
|
||||
"watchhosts": [], # Hosts to monitor and notify about (legacy)
|
||||
"dyndnshosts": [], # Hosts with dynamic DNS (legacy)
|
||||
"drophosts": [], # Hosts to ignore
|
||||
"dyndomains": ["wrede.org"],
|
||||
|
||||
# DNS updates
|
||||
"nsupdate_bin": "/usr/bin/nsupdate",
|
||||
|
||||
# Email settings
|
||||
"smtpserver": "smtp.fastmail.com",
|
||||
"smtpuser": "andreas@wrede.ca",
|
||||
"smtppassword": "pvtvefyp5gbhnch2",
|
||||
"smtpport": 587,
|
||||
"toemail": ["aew.hbd.notify@wrede.ca"],
|
||||
"fromemail": "aew.hbd@wrede.ca",
|
||||
|
||||
# WebSocket settings
|
||||
"ws_port": 50005,
|
||||
"wss_port": None,
|
||||
@@ -101,3 +93,162 @@ def load_config(path=None):
|
||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||
pass
|
||||
return cfg
|
||||
|
||||
|
||||
def get_watchhosts(config):
    """Extract watchhosts from config, supporting both new and legacy formats.

    Two sources are merged:

    * New format: the ``hosts`` mapping; a host is watched when its attribute
      dict has a truthy ``watch`` entry.
    * Legacy format: the ``watchhosts`` key, either a list/set of host names
      or a dict keyed by host name (``{"host1": {attrs}, ...}``).

    Args:
        config: Configuration dictionary

    Returns:
        List of unique hostnames to watch, in first-seen order (entries from
        ``hosts`` first, then legacy entries).
    """
    watchhosts = []

    # New format: hosts section with watch attribute
    hosts_config = config.get("hosts")
    if isinstance(hosts_config, dict):
        watchhosts.extend(
            host_name
            for host_name, host_attrs in hosts_config.items()
            if isinstance(host_attrs, dict) and host_attrs.get("watch", False)
        )

    # Legacy format: watchhosts list/set, or old dict format keyed by host
    legacy_watchhosts = config.get("watchhosts", [])
    if isinstance(legacy_watchhosts, (list, set, dict)):
        # Iterating a dict yields its keys, i.e. the host names.
        watchhosts.extend(legacy_watchhosts)

    # dict.fromkeys removes duplicates while keeping a stable order;
    # list(set(...)) would return the names in arbitrary order.
    return list(dict.fromkeys(watchhosts))
|
||||
|
||||
|
||||
def get_dyndnshosts(config):
    """Extract dyndnshosts from config, supporting both new and legacy formats.

    Two sources are merged:

    * New format: the ``hosts`` mapping; a host uses dynamic DNS when its
      attribute dict has a truthy ``dyndns`` entry.
    * Legacy format: the ``dyndnshosts`` key as a list or set of host names.

    Args:
        config: Configuration dictionary

    Returns:
        List of unique hostnames with dynamic DNS, in first-seen order
        (entries from ``hosts`` first, then legacy entries).
    """
    dyndnshosts = []

    # New format: hosts section with dyndns attribute
    hosts_config = config.get("hosts")
    if isinstance(hosts_config, dict):
        dyndnshosts.extend(
            host_name
            for host_name, host_attrs in hosts_config.items()
            if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False)
        )

    # Legacy format: dyndnshosts list/set
    legacy_dyndnshosts = config.get("dyndnshosts", [])
    if isinstance(legacy_dyndnshosts, (list, set)):
        dyndnshosts.extend(legacy_dyndnshosts)

    # dict.fromkeys removes duplicates while keeping a stable order;
    # list(set(...)) would return the names in arbitrary order.
    return list(dict.fromkeys(dyndnshosts))
|
||||
|
||||
|
||||
def get_host_config(config, hostname):
    """Get configuration for a specific host.

    The unified ``hosts`` section is consulted first. If it has no usable
    (dict) entry for the host, the legacy dict-style ``watchhosts`` section is
    checked and its attributes are translated to the new key names.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        Dictionary with host attributes or empty dict
    """
    # New format: hosts[hostname] is the attribute dict itself.
    hosts_config = config.get("hosts")
    if isinstance(hosts_config, dict):
        host_attrs = hosts_config.get(hostname)
        if isinstance(host_attrs, dict):
            return host_attrs
        # A missing or non-dict entry (e.g. an empty YAML value) must not hide
        # a legacy definition of the same host, so fall through.

    # Check legacy watchhosts for notification settings
    legacy_watchhosts = config.get("watchhosts")
    if isinstance(legacy_watchhosts, dict):
        legacy_attrs = legacy_watchhosts.get(hostname)
        if isinstance(legacy_attrs, dict):
            # Convert legacy format to new format
            return {
                "watch": True,
                "notify": legacy_attrs.get("notify"),
                "notify_src": legacy_attrs.get("src"),
            }

    return {}
|
||||
|
||||
|
||||
def get_notification_channels_for_host(config, hostname):
    """Resolve the notification channel names that apply to a host.

    The host's own ``notification_channels`` setting wins; when it is empty
    (or of an unsupported type) the global ``default_notification_channels``
    setting is used instead. A single string is treated as a one-element
    list.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        List of channel names to use for this host
    """
    host_level = get_host_config(config, hostname).get("notification_channels", [])
    global_default = config.get("default_notification_channels", [])

    # Try the host-specific setting first, then the global default.
    for candidate in (host_level, global_default):
        if not candidate:
            continue
        if isinstance(candidate, str):
            return [candidate]
        if isinstance(candidate, list):
            return candidate

    # No channels configured, return empty list (will use legacy global config)
    return []
|
||||
|
||||
|
||||
def get_channel_config(config, channel_name):
    """Get configuration for a specific notification channel.

    Args:
        config: Configuration dictionary
        channel_name: Name of the notification channel

    Returns:
        Dictionary with channel configuration, or None if the channel is not
        defined or its definition is not a dictionary
    """
    channels = config.get("notification_channels", {})
    if not isinstance(channels, dict):
        return None

    channel_config = channels.get(channel_name)
    # Only a dict is a usable channel definition; a malformed entry such as
    # ``mychannel: "pushover"`` is reported as "not found" so callers that
    # call ``.get(...)`` on the result do not crash.
    return channel_config if isinstance(channel_config, dict) else None
|
||||
|
||||
|
||||
def get_notification_channels_config(config, hostname):
    """Get list of notification channel configurations for a host.

    Channel names are resolved with ``get_notification_channels_for_host`` and
    looked up with ``get_channel_config``. Names that have no definition, whose
    definition is not a dictionary, or whose definition lacks a ``type`` are
    skipped.

    Args:
        config: Configuration dictionary
        hostname: Host name

    Returns:
        List of (channel_name, channel_config) tuples
    """
    channels = []
    for channel_name in get_notification_channels_for_host(config, hostname):
        channel_config = get_channel_config(config, channel_name)
        # isinstance guard: a truthy non-dict definition (e.g. a bare string)
        # has no .get() and would otherwise raise AttributeError here.
        if isinstance(channel_config, dict) and channel_config.get("type"):
            channels.append((channel_name, channel_config))

    return channels
|
||||
|
||||
+2
-12
@@ -136,16 +136,7 @@ async def dns_update_worker(
|
||||
)
|
||||
if err:
|
||||
m += f", DNS update failed: {err}"
|
||||
if pushmsg:
|
||||
try:
|
||||
await loop.run_in_executor(
|
||||
None,
|
||||
pushmsg,
|
||||
"error: nsupdate failed",
|
||||
f"{name}.dy.{dyndomain}: {m}",
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
logger.error("DNS update failed for %s: %s", name, err)
|
||||
else:
|
||||
m += ", DNS updated."
|
||||
|
||||
@@ -171,7 +162,6 @@ def start_dns_worker(
|
||||
hbdclass,
|
||||
cfg: dict,
|
||||
log: Optional[callable] = None,
|
||||
pushmsg: Optional[callable] = None,
|
||||
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||
):
|
||||
"""Start the async DNS worker and return the Task.
|
||||
@@ -218,7 +208,7 @@ def start_dns_worker(
|
||||
|
||||
task = loop.create_task(
|
||||
dns_update_worker(
|
||||
hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
|
||||
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||
)
|
||||
)
|
||||
return task
|
||||
|
||||
@@ -25,12 +25,7 @@ async def start(
|
||||
port: int,
|
||||
config,
|
||||
hbdclass,
|
||||
log=None,
|
||||
email=None,
|
||||
pushmsg=None,
|
||||
msg_to_websockets=None,
|
||||
tcss=None,
|
||||
DEBUG=0,
|
||||
verbose=False,
|
||||
get_now=None,
|
||||
VER="",
|
||||
|
||||
+4
-11
@@ -79,14 +79,11 @@ async def _run_async(config):
|
||||
# Initialize threshold checker
|
||||
threshold_checker = threshold_mod.ThresholdChecker(
|
||||
config=config,
|
||||
notification_callback=notify_mod.pushmsg_from_config,
|
||||
renotify_interval=config.get("threshold_renotify_interval", 3600),
|
||||
journal=msg_journal,
|
||||
)
|
||||
logger.info("Threshold checker initialized")
|
||||
|
||||
pushmsg = notify_mod.pushmsg_from_config
|
||||
|
||||
sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
|
||||
# Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
|
||||
# This option is system-dependent; on many systems, setting it to False enables
|
||||
@@ -110,7 +107,6 @@ async def _run_async(config):
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
log=eventlog,
|
||||
pushmsg=pushmsg,
|
||||
msg_to_websockets=msg_to_websockets,
|
||||
msg_journal=msg_journal,
|
||||
threshold_checker=threshold_checker,
|
||||
@@ -132,12 +128,8 @@ async def _run_async(config):
|
||||
port=config.get("hbd_port", 50004),
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
log=eventlog,
|
||||
pushmsg=pushmsg,
|
||||
msg_to_websockets=msg_to_websockets,
|
||||
threshold_checker=threshold_checker,
|
||||
tcss=None,
|
||||
DEBUG=config.get("debug", 0),
|
||||
verbose=config.get("verbose", False),
|
||||
get_now=lambda: time.time(),
|
||||
VER="",
|
||||
@@ -155,7 +147,7 @@ async def _run_async(config):
|
||||
dns_task = None
|
||||
try:
|
||||
dns_task = dns_mod.start_dns_worker(
|
||||
hbdclass, config, log=eventlog, pushmsg=pushmsg, loop=loop
|
||||
hbdclass, config, log=eventlog, loop=loop
|
||||
)
|
||||
logger.info("dns update worker started")
|
||||
except Exception as e:
|
||||
@@ -273,10 +265,11 @@ def load_pickled_hosts(config, hbdclass):
|
||||
"""Load pickled hosts from file, if available."""
|
||||
import os
|
||||
import pickle
|
||||
from . import config as config_mod
|
||||
|
||||
pickfile = config.get("pickfile", "hbd.pickle")
|
||||
dyndnshosts = config.get("dyndnshosts", [])
|
||||
watchhosts = config.get("watchhosts", [])
|
||||
dyndnshosts = config_mod.get_dyndnshosts(config)
|
||||
watchhosts = config_mod.get_watchhosts(config)
|
||||
drophosts = config.get("drophosts", [])
|
||||
if 1 and os.path.exists(pickfile):
|
||||
if config.get("verbose", False):
|
||||
|
||||
+117
-49
@@ -190,55 +190,123 @@ def pushsignal(
|
||||
return False
|
||||
|
||||
|
||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
|
||||
"""Dispatch push notifications according to `cfg['pushsrv']`.
|
||||
|
||||
cfg is expected to contain keys for different services when needed, e.g.
|
||||
- cfg['pushsrv'] : one of 'all', 'pushover', 'mattermost', 'signal'
|
||||
- cfg['pushover_token'], cfg['pushover_user']
|
||||
- cfg['matter_host'], cfg['matter_token'], cfg['matter_channel']
|
||||
- cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']
|
||||
|
||||
Returns a dict of results per provider.
|
||||
"""
|
||||
def _dispatch_to_channel(channel_name: str, channel_config: dict, msg: str, debug: int = 0) -> bool:
    """Dispatch a message to a specific notification channel.

    The channel's ``type`` selects the transport: ``pushover``, ``email``,
    ``signal`` or ``mattermost``. Any other type is logged and rejected.

    Args:
        channel_name: Name of the channel (for logging)
        channel_config: Channel configuration dictionary with 'type' and type-specific fields
        msg: Message to send
        debug: Debug level

    Returns:
        True if notification sent successfully, False otherwise
    """
    channel_type = channel_config.get("type")

    if channel_type == "pushover":
        return pushover(
            channel_config.get("token", ""),
            channel_config.get("user", ""),
            msg,
            debug=debug,
        )

    if channel_type == "email":
        # Build email from channel config
        recipients = channel_config.get("recipients", [])
        sender = channel_config.get("sender", "")
        smtp_server = channel_config.get("smtp_server", "")
        smtp_port = channel_config.get("smtp_port", 587)
        smtp_user = channel_config.get("smtp_user")
        smtp_password = channel_config.get("smtp_password")

        if not recipients or not sender or not smtp_server:
            logger.warning(
                "Email channel '%s' missing required fields: recipients=%s, sender=%s, smtp_server=%s",
                channel_name, recipients, sender, smtp_server
            )
            return False

        # email() reads its settings from the module-level _config, so the
        # channel's values are swapped in for the duration of the call.
        # NOTE(review): this mutates shared state; presumably callers do not
        # dispatch concurrently from several threads - confirm.
        old_config = dict(_config)
        try:
            _config["toemail"] = recipients
            _config["fromemail"] = sender
            _config["smtpserver"] = smtp_server
            _config["smtpport"] = smtp_port
            # Credentials are optional: keep the global ones unless overridden.
            if smtp_user:
                _config["smtpuser"] = smtp_user
            if smtp_password:
                _config["smtppassword"] = smtp_password

            return email("Heartbeat notification", msg, debug=debug)
        finally:
            # Restore config even if email() raises, so the channel's settings
            # never leak into the global configuration.
            _config.clear()
            _config.update(old_config)

    if channel_type == "signal":
        return pushsignal(
            channel_config.get("cli_path", "/usr/local/bin/signal-cli"),
            channel_config.get("user", ""),
            channel_config.get("recipient", ""),
            msg,
            debug=debug,
        )

    if channel_type == "mattermost":
        return pushmattermost(
            channel_config.get("host", ""),
            channel_config.get("token", ""),
            channel_config.get("channel", ""),
            msg,
            username=channel_config.get("username", "hbd"),
            icon=channel_config.get("icon"),
            debug=debug,
        )

    logger.warning("Unknown channel type '%s' for channel '%s'", channel_type, channel_name)
    return False
|
||||
|
||||
|
||||
def pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict:
|
||||
"""Send notification for a specific host using its configured channels.
|
||||
|
||||
This function looks up the host's notification channels from the config
|
||||
and sends the message to those channels.
|
||||
|
||||
Args:
|
||||
hostname: Name of the host to send notification for
|
||||
msg: Message to send
|
||||
debug: Debug level
|
||||
|
||||
Returns:
|
||||
Dictionary of results per channel: {"channel_name": True/False}
|
||||
"""
|
||||
from . import config as config_mod
|
||||
|
||||
# Get notification channels for this host
|
||||
channels = config_mod.get_notification_channels_config(_config, hostname)
|
||||
|
||||
if not channels:
|
||||
logger.warning("No notification channels configured for host '%s'", hostname)
|
||||
return {}
|
||||
|
||||
# Dispatch to each channel
|
||||
results = {}
|
||||
p = cfg.get("pushsrv", "pushover")
|
||||
if p in ("all", "pushover"):
|
||||
ok = pushover(
|
||||
cfg.get("pushover_token", ""),
|
||||
cfg.get("pushover_user", ""),
|
||||
msg,
|
||||
debug=debug,
|
||||
)
|
||||
results["pushover"] = ok
|
||||
if p in ("all", "mattermost"):
|
||||
ok = pushmattermost(
|
||||
cfg.get("matter_host", ""),
|
||||
cfg.get("matter_token", ""),
|
||||
cfg.get("matter_channel", ""),
|
||||
msg,
|
||||
username=cfg.get("matter_username", "hbd"),
|
||||
icon=cfg.get("matter_icon"),
|
||||
debug=debug,
|
||||
)
|
||||
results["mattermost"] = ok
|
||||
if p in ("all", "signal"):
|
||||
ok = pushsignal(
|
||||
cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
|
||||
cfg.get("signal_user", ""),
|
||||
cfg.get("signal_recipient", ""),
|
||||
msg,
|
||||
debug=debug,
|
||||
)
|
||||
results["signal"] = ok
|
||||
if p in ("all", "email"):
|
||||
ok = email("Heartbeat notification", msg, debug=debug)
|
||||
results["email"] = ok
|
||||
logger.debug("push results: %s", results)
|
||||
for channel_name, channel_config in channels:
|
||||
try:
|
||||
success = _dispatch_to_channel(channel_name, channel_config, msg, debug=debug)
|
||||
results[channel_name] = success
|
||||
if success:
|
||||
logger.info("Notification sent to channel '%s': %s", channel_name, msg)
|
||||
else:
|
||||
logger.warning("Failed to send notification to channel '%s'", channel_name)
|
||||
except Exception as e:
|
||||
logger.error("Error sending to channel '%s': %s", channel_name, e)
|
||||
results[channel_name] = False
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
    """Dispatch a push message using the module-level configuration dict.

    Thin convenience wrapper: forwards ``msg`` and ``debug`` to ``pushmsg``
    together with the module's ``_config``.

    NOTE(review): this commit appears to replace ``pushmsg`` with per-host
    channel dispatch - confirm ``pushmsg`` is still defined in this module.
    """
    results = pushmsg(_config, msg, debug=debug)
    return results
|
||||
|
||||
+64
-58
@@ -275,7 +275,6 @@ class ThresholdChecker:
|
||||
def __init__(
|
||||
self,
|
||||
config: Dict[str, Any],
|
||||
notification_callback: Optional[Callable] = None,
|
||||
renotify_interval: int = 3600,
|
||||
journal: Optional[Any] = None,
|
||||
):
|
||||
@@ -284,7 +283,6 @@ class ThresholdChecker:
|
||||
|
||||
Args:
|
||||
config: Threshold configuration dictionary from YAML
|
||||
notification_callback: Function to call for notifications
|
||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||
journal: Optional MessageJournal instance for logging threshold events
|
||||
"""
|
||||
@@ -300,7 +298,6 @@ class ThresholdChecker:
|
||||
# Default config name to use when no mapping exists
|
||||
self.default_config = "default"
|
||||
|
||||
self.notification_callback = notification_callback
|
||||
self.renotify_interval = renotify_interval
|
||||
self.journal = journal
|
||||
|
||||
@@ -367,8 +364,20 @@ class ThresholdChecker:
|
||||
target_dict=self.threshold_configs[config_name]
|
||||
)
|
||||
|
||||
# Parse host to config mapping
|
||||
self.host_config_mapping = config.get("host_threshold_mapping", {})
|
||||
# Parse host to config mapping from two possible sources
|
||||
# 1. New format: hosts section with threshold_config attribute
|
||||
if "hosts" in config:
|
||||
hosts_config = config["hosts"]
|
||||
if isinstance(hosts_config, dict):
|
||||
for host_name, host_attrs in hosts_config.items():
|
||||
if isinstance(host_attrs, dict) and "threshold_config" in host_attrs:
|
||||
self.host_config_mapping[host_name] = host_attrs["threshold_config"]
|
||||
|
||||
# 2. Legacy format: host_threshold_mapping section (for backward compatibility)
|
||||
if "host_threshold_mapping" in config:
|
||||
legacy_mapping = config.get("host_threshold_mapping", {})
|
||||
if isinstance(legacy_mapping, dict):
|
||||
self.host_config_mapping.update(legacy_mapping)
|
||||
|
||||
# Set default config (first one alphabetically or explicitly set)
|
||||
self.default_config = config.get("default_threshold_config", "default")
|
||||
@@ -513,14 +522,13 @@ class ThresholdChecker:
|
||||
rtt_thresholds: Dict[str, Any],
|
||||
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||
):
|
||||
"""Parse RTT thresholds (per-host network latency thresholds).
|
||||
"""Parse RTT thresholds (network latency thresholds).
|
||||
|
||||
RTT thresholds are configured as:
|
||||
thresholds:
|
||||
rtt:
|
||||
hostname1:
|
||||
warning: 100.0 # ms
|
||||
critical: 500.0 # ms
|
||||
warning: 100.0 # ms
|
||||
critical: 500.0 # ms
|
||||
|
||||
Args:
|
||||
rtt_thresholds: RTT threshold configuration
|
||||
@@ -529,41 +537,39 @@ class ThresholdChecker:
|
||||
if target_dict is None:
|
||||
target_dict = self.thresholds
|
||||
|
||||
for hostname, threshold_config in rtt_thresholds.items():
|
||||
if not isinstance(threshold_config, dict):
|
||||
continue
|
||||
|
||||
# Metric path is "rtt.<hostname>"
|
||||
metric_path = f"rtt.{hostname}"
|
||||
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
||||
continue
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
||||
hostname,
|
||||
warning,
|
||||
critical
|
||||
)
|
||||
if not isinstance(rtt_thresholds, dict):
|
||||
return
|
||||
|
||||
# Metric path is simply "rtt" (not per-host)
|
||||
metric_path = "rtt"
|
||||
|
||||
warning = rtt_thresholds.get("warning")
|
||||
critical = rtt_thresholds.get("critical")
|
||||
operator = rtt_thresholds.get("operator", ">")
|
||||
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||
enabled = rtt_thresholds.get("enabled", True)
|
||||
display = rtt_thresholds.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined, skipping")
|
||||
return
|
||||
|
||||
threshold = ThresholdConfig(
|
||||
metric_path=metric_path,
|
||||
warning=warning,
|
||||
critical=critical,
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
target_dict[metric_path] = threshold
|
||||
logger.debug(
|
||||
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
||||
warning,
|
||||
critical
|
||||
)
|
||||
|
||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||
"""Get the appropriate threshold configuration for a host.
|
||||
@@ -887,12 +893,12 @@ class ThresholdChecker:
|
||||
value: Any,
|
||||
):
|
||||
"""Send notification and log to journal/eventlog."""
|
||||
if self.notification_callback is not None:
|
||||
try:
|
||||
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
# Send notification using host-specific channels
|
||||
try:
|
||||
notify_mod.pushmsg_for_host(host_name, f"{lvl}: {host_name} - {message}")
|
||||
logger.info("Notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send notification: %s", e)
|
||||
|
||||
# Log to journal
|
||||
if self.journal is not None:
|
||||
@@ -1017,14 +1023,14 @@ class ThresholdChecker:
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
if self.notification_callback:
|
||||
try:
|
||||
self.notification_callback(message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send re-notification: %s", e)
|
||||
# Send re-notification using host-specific channels
|
||||
try:
|
||||
notify_mod.pushmsg_for_host(host_name, message)
|
||||
alert_state.last_notification = now
|
||||
alert_state.notification_count += 1
|
||||
logger.info("Re-notification sent: %s", message)
|
||||
except Exception as e:
|
||||
logger.error("Failed to send re-notification: %s", e)
|
||||
|
||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||
"""
|
||||
|
||||
+33
-24
@@ -68,7 +68,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
- config: dict of configuration
|
||||
- hbdclass: module providing Host/Connection classes
|
||||
- log: callable(loghost, message)
|
||||
- pushmsg: callable(message)
|
||||
- msg_to_websockets: callable(typ, data)
|
||||
- msg_journal: MessageJournal instance for logging all messages
|
||||
- DEBUG, verbose
|
||||
@@ -91,7 +90,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
cfg = ctx.get("config", {})
|
||||
hbdcls = ctx.get("hbdclass")
|
||||
log = ctx.get("log")
|
||||
pushmsg = ctx.get("pushmsg")
|
||||
msg_to_websockets = ctx.get("msg_to_websockets")
|
||||
DEBUG = ctx.get("DEBUG", 0)
|
||||
verbose = ctx.get("verbose", False)
|
||||
@@ -100,18 +98,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
|
||||
name = msg.get("name", "unknown")
|
||||
from ..common.utils import shortname
|
||||
from . import config as config_mod
|
||||
|
||||
uname = shortname(name)
|
||||
|
||||
if uname not in hbdcls.Host.hosts:
|
||||
host = hbdcls.Host(uname)
|
||||
host.dyn = uname in cfg.get("dyndnshosts", [])
|
||||
# Use new config function to check dyndns
|
||||
dyndnshosts = config_mod.get_dyndnshosts(cfg)
|
||||
host.dyn = uname in dyndnshosts
|
||||
if verbose:
|
||||
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
|
||||
newh = True
|
||||
else:
|
||||
host = hbdcls.Host.hosts[uname]
|
||||
newh = False
|
||||
|
||||
# Get watchhosts once for use throughout message handling
|
||||
watchhosts = config_mod.get_watchhosts(cfg)
|
||||
|
||||
cid = msg.get("id", 0)
|
||||
try:
|
||||
@@ -181,9 +185,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if res:
|
||||
eventlog(uname, "WARNING", res)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s" % (host.name, res))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
|
||||
|
||||
interval = int(msg.get("interval", 0) or 0)
|
||||
shutdown = msg.get("shutdown", 0)
|
||||
@@ -193,15 +196,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if boot:
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if uname in watchhosts:
|
||||
m = "%s booted" % (host.name)
|
||||
if pushmsg:
|
||||
pushmsg(m)
|
||||
notify_mod.pushmsg_for_host(uname, m)
|
||||
if message:
|
||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(message)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, message)
|
||||
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
lasts = conn.state
|
||||
@@ -211,9 +212,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s is back" % (uname, conn.afam))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||
|
||||
if boot or newh:
|
||||
host.upcount = host.doesack
|
||||
@@ -222,9 +222,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if shutdown:
|
||||
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
|
||||
conn.newstate(hbdcls.Connection.DOWN, now)
|
||||
|
||||
if interval > 0:
|
||||
@@ -247,11 +246,21 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
|
||||
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||
|
||||
if uname in cfg.get("watchhosts", []):
|
||||
if pushmsg:
|
||||
pushmsg(f"{uname} {msg}")
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||
|
||||
# Check RTT thresholds with infinite RTT for overdue hosts
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker:
|
||||
metric_path = "rtt"
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
metric_path=metric_path,
|
||||
value=float('inf'),
|
||||
alert_states=host.alert_states
|
||||
)
|
||||
|
||||
# Notify websockets
|
||||
if msg_to_websockets:
|
||||
@@ -274,8 +283,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
# Check RTT thresholds using the threshold checker
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker and rtt and rtt > 0:
|
||||
# Metric path for RTT is "rtt.<hostname>"
|
||||
metric_path = f"rtt.{uname}"
|
||||
# Metric path for RTT is simply "rtt"
|
||||
metric_path = "rtt"
|
||||
|
||||
# Check against configured thresholds (handles alerts, notifications, etc.)
|
||||
threshold_checker.check_value(
|
||||
|
||||
Reference in New Issue
Block a user