refactor monitor, add threshold rtesting

2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
@@ -64,4 +64,17 @@ thresholds:
        percent:
          warning: 8.0
          critical: 90.0
-
+  nagios_runner:
    load_status:
      warning: WARNING
      operator: = "="
      critical: CRITICAL
      operator: = "="
    UPS_status_code:
      warning: 1
      critical: 2
      operator: ">="
  rtt:
    y:
      warning: 0.1
      critical: 10.0
@@ -9,7 +9,7 @@
      "type": "debugpy",
      "request": "launch",
      "module": "hbd.server.cli",
-      "args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x"],
+      "args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x", "-x"],
      "cwd": "${workspaceFolder}",
      "env": {
        "PYTHONPATH": "${workspaceFolder}"
@@ -150,6 +150,18 @@ Heartbeat includes a sophisticated threshold alerting system that monitors plugi
 ```yaml
 thresholds:
  # RTT (Round-Trip Time) thresholds for heartbeat monitoring
  # These are checked on every HTB message arrival
  rtt:
    webserver01:
      warning: 100.0   # Warn when RTT > 100ms
      critical: 500.0  # Critical when RTT > 500ms
    database01:
      warning: 50.0
      critical: 200.0
  # Plugin metric thresholds
  cpu_monitor:
    cpu_percent:
      warning: 80.0      # Warn when CPU > 80%
@@ -177,6 +189,38 @@ thresholds:
 threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts
 ```
 ### RTT Monitoring
 Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
 - **Per-host configuration**: Set different thresholds for each monitored host
 - **Real-time checking**: Thresholds evaluated on every HTB message arrival
 - **Alert state tracking**: RTT alerts use the same state management as plugin metrics
 - **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
 - **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
 - **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
 - **Re-notification**: Periodic reminders for ongoing RTT issues
 - **Event & journal logging**: All RTT events logged for audit trail
 **Configuration format:**
 ```yaml
 thresholds:
  rtt:
    <hostname>:
      warning: <milliseconds>   # Warn when RTT > this value
      critical: <milliseconds>  # Critical when RTT > this value
      hysteresis: 0.1           # Optional: 10% hysteresis (default)
 ```
 **Example alerts:**
 ```
 WARNING: webserver01 - rtt.webserver01 = 125.3
 CRITICAL: database01 - rtt.database01 = 520.1
 RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
 ```
 RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
 ### Alert Behavior
 1. **State Changes**: Notifications sent when crossing thresholds
@@ -43,9 +43,8 @@ def main(argv=None):
        config["verbose"] = True
    if args.pushsrv:
        config["pushsrv"] = args.pushsrv
-    if args.debug:
+    if args.debug > 0:
-        config.setdefault("debug", 0)
+        config["debug"] = args.debug
        config["debug"] += args.debug
    run_server(config)
@@ -0,0 +1,12 @@
 msgs = []  # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
 class Data:
    def __init__(self, config):
        self.config = config
        self.data = {}
    def update(self, new_data):
        self.data.update(new_data)
    def get(self, key, default=None):
        return self.data.get(key, default)
@@ -43,12 +43,38 @@ class Connection:
        self.deltastatetime = "computed"
        self.state = Connection.UNKNOWN
        # Timer-based reachability monitoring
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None
        if host:
            Connection.htab[addr] = self.host.name
            if self.host.isDynDns():
                log(self.host.name, "dns update %s" % self.addr)
                Host.dnsQ.put((self.host.name, self.addr))
    def __getstate__(self):
        """Prepare Connection for pickling by excluding non-serializable timer objects."""
        state = self.__dict__.copy()
        # Remove asyncio timer objects that can't be pickled
        # These will be recreated when the next HTB arrives after unpickling
        state['overdue_timer'] = None
        state['overdue_callback'] = None
        state['timeout_duration'] = None
        return state
    def __setstate__(self, state):
        """Restore Connection from pickle, reinitializing timer fields."""
        self.__dict__.update(state)
        # Ensure timer fields are initialized (they'll be recreated when HTB arrives)
        if not hasattr(self, 'overdue_timer'):
            self.overdue_timer = None
        if not hasattr(self, 'overdue_callback'):
            self.overdue_callback = None
        if not hasattr(self, 'timeout_duration'):
            self.timeout_duration = None
    def registerDns(self):
        Host.dnsQ.put((self.host.name, self.addr))
@@ -123,7 +149,18 @@ class Connection:
        return d
    def jsons(self):
-        return json.dumps(self.__dict__)
+        """Serialize connection to JSON, excluding non-serializable timer objects."""
        data = {}
        for key, value in self.__dict__.items():
            # Skip timer-related fields that can't be serialized
            if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
                continue
            # Handle host backpointer by converting to name
            if key == 'host':
                data[key] = value.name if value else None
            else:
                data[key] = value
        return json.dumps(data)
    # set new state, return number of secs in previous state
    def newstate(self, state, now, when=0):
@@ -151,11 +188,88 @@ class Connection:
            except Exception:
                pass
            self.addr = addr
-            Connection.htab[addr] = self.host.name
+            Connection.htab[addr] = self.host.nameconnection_count
            if self.host.isDynDns():
                Host.dnsQ.put((self.host.name, self.addr))
        return r
    def reset_overdue_timer(self, timeout_seconds, callback):
        """Reset the overdue timer for this connection.
        Cancels any existing timer and sets a new one that will mark
        the connection as overdue if no heartbeat arrives before timeout.
        Args:
            timeout_seconds: Seconds before marking as overdue
            callback: Async function to call when timer expires
        """
        import asyncio
        # Cancel existing timer if any
        if self.overdue_timer and not self.overdue_timer.cancelled():
            self.overdue_timer.cancel()
        # Store parameters for later reference
        self.timeout_duration = timeout_seconds
        self.overdue_callback = callback
        # Create new timer
        async def timer_expired():
            await callback(self)
        try:
            loop = asyncio.get_event_loop()
            self.overdue_timer = loop.call_later(timeout_seconds, 
                                                  lambda: asyncio.create_task(timer_expired()))
        except RuntimeError:
            # No event loop running yet
            pass
    def cancel_overdue_timer(self):
        """Cancel the overdue timer if it exists and clear all timer references."""
        if self.overdue_timer:
            try:
                if not self.overdue_timer.cancelled():
                    self.overdue_timer.cancel()
            except Exception:
                pass
        # Clear all timer-related references
        self.overdue_timer = None
        self.overdue_callback = None
        self.timeout_duration = None
    def get_avg_rtt(self):
        """Get average RTT from recent samples."""
        valid_rtts = [r for r in self.rtts if r > 0]
        if valid_rtts:
            return sum(valid_rtts) / len(valid_rtts)
        return 0
    def get_current_rtt(self):
        """Get most recent RTT value."""
        return self.rtts[-1] if self.rtts else 0
    def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
        """Check if RTT exceeds thresholds.
        Args:
            warning_threshold: RTT in ms for warning level
            critical_threshold: RTT in ms for critical level
        Returns:
            Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
        """
        rtt = self.get_current_rtt()
        if rtt <= 0:
            return (None, rtt)
        if critical_threshold and rtt > critical_threshold:
            return ('CRITICAL', rtt)
        elif warning_threshold and rtt > warning_threshold:
            return ('WARNING', rtt)
        return (None, rtt)
 #
 class Host:
@@ -224,14 +338,30 @@ class Host:
    def stateinfo(self):
        ddict = {}
        for d in self.__dict__:
            if d in ["alert_states", "plugin_data"]:
                continue
            if d == "connections":
                cl = []
                for c in ["IPv4", "IPv6"]:
                    if c not in self.connections:
                        continue
-                    # dirty ugly hack: fix conn to host backpointer
+                    # Create connection dict, excluding non-serializable timer objects
-                    cld = copy.deepcopy(self.connections[c].__dict__)
+                    conn = self.connections[c]
-                    cld["host"] = cld["host"].name
+                    cld = {}
                    for key, value in conn.__dict__.items():
                        # Skip timer-related fields that can't be serialized
                        if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
                            continue
                        # Handle host backpointer by converting to name
                        if key == 'host':
                            cld[key] = value.name if value else None
                        else:
                            # Safe copy for serializable values
                            try:
                                cld[key] = copy.deepcopy(value)
                            except Exception:
                                # If deepcopy fails, use shallow copy
                                cld[key] = value
                    cl.append(cld)
                ddict[d] = cl
            else:
@@ -8,6 +8,7 @@ import os
 import logging
 from aiohttp import web
 import jinja2
 from . import data
 logger = logging.getLogger(__name__)
@@ -22,7 +23,6 @@ async def start(
    port: int,
    config,
    hbdclass,
    msgs_getter,
    log=None,
    email=None,
    pushmsg=None,
@@ -52,7 +52,7 @@ async def start(
        res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
        res.append(f"<H2>Heartbeat status {VER}</h2>")
        res += hbdclass.ubHost.buildhosttable()
-        res += hbdclass.ubHost.buildmsgtable(msgs_getter())
+        res += hbdclass.ubHost.buildmsgtable(data.msgs)
        res.append(
            "<p> %s (%s)</p>"
            % (
@@ -69,7 +69,7 @@ async def start(
        return web.json_response(json.loads("[" + ",".join(lst) + "]"))
    async def api_messages(request):
-        lst = msgs_getter()[-30:]
+        lst = data.msgs[-30:]
        return web.json_response(lst)
    async def cmd(request):
@@ -155,7 +155,7 @@ async def start(
            hosts=[
                hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
            ],
-            messages=msgs_getter()[-30:],
+            messages=data.msgs[-30:],
        )
        return web.Response(text=body, content_type="text/html")
@@ -14,28 +14,35 @@ from . import hbdclass
 from . import ws as ws_mod
 from . import notify as notify_mod
 from . import data 
 logger = logging.getLogger(__name__)
 msg_to_websockets = ws_mod.broadcast
-eventlog = notify_mod.log
+eventlog = notify_mod.eventlog
 lastfm = ["", "", ""]
 # shared runtime collections and helpers
 msgs = notify_mod.msgs
-def cleanup_function(config):
+def cleanup_function(config, hbdclass):
    """This function will be executed upon program exit."""
    logger.info("Running cleanup function...")
    import pickle
    # Ensure all timer references are cleared before pickling
    for hostname, host in list(hbdclass.Host.hosts.items()):
        for conn_type, conn in host.connections.items():
            if hasattr(conn, 'overdue_timer'):
                conn.overdue_timer = None
            if hasattr(conn, 'overdue_callback'):
                conn.overdue_callback = None
            if hasattr(conn, 'timeout_duration'):
                conn.timeout_duration = None
    pickfile = config.get("pickfile", "hbd.pickle")
    pickf = open(pickfile, "wb")
    pick = pickle.Pickler(pickf)
    pick.dump(hbdclass.Host.hosts)
-    pick.dump(msgs)
+    pick.dump(data.msgs)
    pick.dump(lastfm)
    pickf.close()
    logger.info("Cleanup complete.")
@@ -124,7 +131,6 @@ async def _run_async(config):
                port=config.get("hbd_port", 50004),
                config=config,
                hbdclass=hbdclass,
                msgs_getter=lambda: msgs,
                log=eventlog,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
@@ -186,7 +192,7 @@ async def _run_async(config):
                    hbdclass.Host.hosts[h].stateinfo()
                    for h in sorted(hbdclass.Host.hosts)
                ],
-                get_msgs=lambda: msgs,
+#                get_msgs=lambda: msgs,
                config=config,
            )
        )
@@ -200,7 +206,6 @@ async def _run_async(config):
            monitor_mod.start(
                config=config,
                hbdclass=hbdclass,
                log=eventlog,
                pushmsg=pushmsg,
                msg_to_websockets=msg_to_websockets,
            )
@@ -216,6 +221,13 @@ async def _run_async(config):
    except Exception as e:
        logger.exception("Error in main loop: %s", e)
    finally:
        # Clean up connection timers
        try:
            logger.info("Cleaning up connection timers...")
            await monitor_mod.cleanup_connections(hbdclass)
        except Exception as e:
            logger.warning("Error cleaning up connection timers: %s", e)
        # Cancel all running tasks
        logger.info("Cancelling tasks...")
        try:
@@ -279,7 +291,6 @@ async def _run_async(config):
 def load_pickled_hosts(config, hbdclass):
    """Load pickled hosts from file, if available."""
    global lastfm, msgs
    import os
    import pickle
@@ -294,11 +305,7 @@ def load_pickled_hosts(config, hbdclass):
        pick = pickle.Unpickler(pickf)
        try:
            hbdclass.Host.hosts = pick.load()
-            msgs = pick.load()
+            data.msgs = pick.load()
            try:
                lastfm = pick.load()
            except Exception:
                lastfm = ["", "", ""]
            pickf.close()
        except Exception as e:
            logger.exception("load pickled failed: %s", e)
@@ -331,7 +338,7 @@ def run(config):
    load_pickled_hosts(config, hbdclass)
    notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
-    eventlog(None, f"hbd version {__version__} starting up")
+    eventlog(None, "INFO", f"hbd version {__version__} starting up")
    # Create and set the event loop manually
    loop = asyncio.new_event_loop()
@@ -344,8 +351,9 @@ def run(config):
    except Exception as e:
        logger.exception("Unhandled exception in main: %s", e)
    finally:
-        cleanup_function(config)
+        cleanup_function(config, hbdclass)
        logger.info("hbd shutdown complete")
        eventlog(None, "INFO", f"hbd version {__version__} shutdown")
        notify_mod.closelog()
        # Explicitly close the loop
        try:
@@ -1,50 +1,66 @@
-"""monitor helper and thread for heartbeat daemon."""
+"""Monitor helper for heartbeat daemon.
 This module provides monitoring tasks for the heartbeat daemon.
 The primary reachability monitoring is now event-driven (timers set/reset 
 on HTB arrival in udp.py) rather than periodic polling.
 This module can be extended for additional monitoring tasks.
 """
 from __future__ import annotations
 import asyncio
 import time
 from . import notify as notify_mod
 DROPOVERDUE = 7 * 24 * 3600
 eventlog = notify_mod.eventlog
-def checkoverdue(
+async def cleanup_connections(hbdclass):
-    config: dict,
+    """Clean up connection timers on shutdown.
-    hbdclass,
+    
-    log: callable,
+    Cancels all active overdue timers to prevent callbacks after shutdown.
-    pushmsg: callable,
+    """
-    msg_to_websockets: callable,
+    for hostname, host in list(hbdclass.Host.hosts.items()):
-):
+        for conn_type, conn in host.connections.items():
-    now = time.time()
+            if hasattr(conn, 'cancel_overdue_timer'):
-    for h in list(hbdclass.Host.hosts.keys()):
+                conn.cancel_overdue_timer()
        pmsg = []
        for c in hbdclass.Host.hosts[h].connections:
            conn = hbdclass.Host.hosts[h].connections[c]
            if conn.state == hbdclass.Connection.DOWN:
                continue
            timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
            if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
                conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
                pmsg.append(conn.afam)
            if (
                conn.state == hbdclass.Connection.OVERDUE
                and (now - conn.lastbeat) > DROPOVERDUE
            ):
                conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
        if pmsg != []:
            if h in config.get("watchhosts", []):
                pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
            log(h, "%s overdue" % " and ".join(pmsg))
            msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
 async def start(
    config: dict,
    hbdclass: callable,
    log=None,
    pushmsg=None,
    msg_to_websockets=None,
 ):
-    """start a monitor loop that checks for overdue hosts every minute"""
+    """Start monitor background tasks.
    Note: Reachability monitoring is now timer-based and happens in udp.py
    when HTB messages arrive. This function can be used for additional
    monitoring tasks.
    Currently runs a simple status logger every 5 minutes.
    """
    import logging
    logger = logging.getLogger(__name__)
    logger_interval = 300  # Log status every 5 minutes
    while True:
-        await asyncio.sleep(15)  # 15 seconds between checks
+        await asyncio.sleep(logger_interval)
-        checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
+        
        # Log monitoring status
        total_hosts = len(hbdclass.Host.hosts)
        up_count = sum(
            1 for h in hbdclass.Host.hosts.values()
            for c in h.connections.values()
            if c.state == hbdclass.Connection.UP
        )
        overdue_count = sum(
            1 for h in hbdclass.Host.hosts.values()
            for c in h.connections.values()
            if c.state == hbdclass.Connection.OVERDUE
        )
        logger.debug(
            f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
        )
@@ -8,7 +8,9 @@ import subprocess
 import smtplib
 import time
 import sys
 from . import data
 from . import ws as ws_mod
 from . import main as main_mod
 DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
 msg_to_websockets = ws_mod.broadcast
@@ -17,19 +19,18 @@ msg_to_websockets = ws_mod.broadcast
 _config = {}
 logger = logging.getLogger(__name__)
 msgs = []
 logf = None
 def initlog(logfile):
    global logf
    try:
        logf = open(logfile, "a+")
        return logf
    except Exception as e:
        import sys
        print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
-        return sys.stderr
+        logf = sys.stderr
    return logf
 def closelog():
    global logf
@@ -39,10 +40,13 @@ def closelog():
        except Exception:
            pass
-def log(host, m, service=None):
+def eventlog(host, lvl, m, service=None):
    ts = time.time()
-    s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {host or ''} {m}"
+    s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
-    msgs.append(s)
+    if host:
        s += f"{host} "
    s += m
    data.msgs.append(s)
    logger.info(s)
    if logf:
        try:
@@ -16,7 +16,7 @@ from typing import Dict, Any, Optional, Tuple, Callable
 from . import notify as notify_mod
 logger = logging.getLogger(__name__)
-eventlog = notify_mod.log
+eventlog = notify_mod.eventlog
 class AlertLevel(Enum):
    """Alert severity levels."""
@@ -96,6 +96,8 @@ class AlertState:
            "notification_count": self.notification_count,
        }
    def __str__(self):
        return self.to_dict().__str__()
 class ThresholdConfig:
    """Configuration for a single threshold check."""
@@ -280,6 +282,11 @@ class ThresholdChecker:
    def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
        """Parse thresholds for a specific plugin."""
        # Special handling for RTT thresholds (per-host)
        if plugin_name == "rtt":
            self._parse_rtt_thresholds(thresholds)
            return
        for metric_name, threshold_config in thresholds.items():
            if not isinstance(threshold_config, dict):
                continue
@@ -353,6 +360,97 @@ class ThresholdChecker:
                self.thresholds[metric_path] = threshold
    def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
        """Parse RTT thresholds (per-host network latency thresholds).
        RTT thresholds are configured as:
        thresholds:
          rtt:
            hostname1:
              warning: 100.0   # ms
              critical: 500.0  # ms
        """
        for hostname, threshold_config in rtt_thresholds.items():
            if not isinstance(threshold_config, dict):
                continue
            # Metric path is "rtt.<hostname>"
            metric_path = f"rtt.{hostname}"
            warning = threshold_config.get("warning")
            critical = threshold_config.get("critical")
            operator = threshold_config.get("operator", ">")
            hysteresis = threshold_config.get("hysteresis", 0.1)  # 10% default
            enabled = threshold_config.get("enabled", True)
            if warning is None and critical is None:
                logger.warning("No RTT thresholds defined for %s, skipping", hostname)
                continue
            threshold = ThresholdConfig(
                metric_path=metric_path,
                warning=warning,
                critical=critical,
                operator=operator,
                hysteresis=hysteresis,
                enabled=enabled,
            )
            self.thresholds[metric_path] = threshold
            logger.debug(
                "Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
                hostname,
                warning,
                critical
            )
    def check_value(
        self,
        host_name: str,
        metric_path: str,
        value: float,
        alert_states: Dict[str, AlertState],
    ) -> Optional[Tuple[AlertLevel, AlertLevel]]:
        """
        Check a single value against configured threshold.
        Args:
            host_name: Name of the host
            metric_path: Full metric path (e.g., "rtt.hostname")
            value: The metric value to check
            alert_states: Host's alert_states dictionary
        Returns:
            Tuple of (old_level, new_level) if state changed, None otherwise
        """
        if metric_path not in self.thresholds:
            return None
        threshold = self.thresholds[metric_path]
        # Get or create alert state
        if metric_path not in alert_states:
            alert_states[metric_path] = AlertState(metric_path)
        alert_state = alert_states[metric_path]
        # Evaluate threshold with hysteresis
        new_level = threshold.evaluate_with_hysteresis(
            value,
            alert_state.level
        )
        # Update state and check for changes
        old_level = alert_state.level
        if alert_state.update(new_level, value):
            self._trigger_notification(host_name, metric_path, old_level, new_level, value)
            return (old_level, new_level)
        elif new_level != AlertLevel.OK:
            # Check if we should re-notify
            self._check_renotify(host_name, alert_state, metric_path, value)
        return None
    def check_plugin_data(
        self,
        host_name: str,
@@ -476,18 +574,22 @@ class ThresholdChecker:
        """Trigger a notification for an alert state change."""
        # Format message
        if new_level == AlertLevel.OK:
-            message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
+            lvl = "RECOVERED" 
            message = f"{metric_path} = {value} ({old_level.name} -> OK)"
        elif new_level == AlertLevel.WARNING:
-            message = f"WARNING: {host_name} - {metric_path} = {value}"
+            lvl = "WARNING"
            message = f"{metric_path} = {value}"
        elif new_level == AlertLevel.CRITICAL:
-            message = f"CRITICAL: {host_name} - {metric_path} = {value}"
+            lvl = "CRITICAL"
            message = f"{metric_path} = {value}"
        else:
-            message = f"UNKNOWN: {host_name} - {metric_path} = {value}"
+            lvl = "UNKNOWN"
            message = f"{metric_path} = {value}"
        # Send notification
        if self.notification_callback is not None:
            try:
-                self.notification_callback(message)
+                self.notification_callback(f"{lvl}: {host_name} - {message}")
                logger.info("Notification sent: %s", message)
            except Exception as e:
                logger.error("Failed to send notification: %s", e)
@@ -507,7 +609,7 @@ class ThresholdChecker:
            except Exception as e:
                logger.debug(f"Failed to log threshold event to journal: {e}")
        # Log to eventlog as well
-        eventlog(host_name, message, service="threshold")
+        eventlog(host_name, lvl,  message, service="threshold")
    def _check_renotify(
        self,
@@ -6,8 +6,10 @@ import logging
 from ..common.proto import stodict, oldmtodict
 from ..common.utils import dur
 from . import notify as notify_mod
 logger = logging.getLogger(__name__)
 eventlog = notify_mod.eventlog
 class EchoServerProtocol(asyncio.DatagramProtocol):
@@ -170,8 +172,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        return
    if res:
-        if log:
+        eventlog(uname, "WARNING", res)
            log(uname, res)
        if uname in cfg.get("watchhosts", []):
            if pushmsg:
                pushmsg("%s %s" % (host.name, res))
@@ -183,15 +184,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    boot = msg.get("boot", 0)
    if boot:
-        if log:
+        eventlog(uname, "INFO", "booted")
            log(uname, "booted")
        if uname in cfg.get("watchhosts", []):
            m = "%s booted" % (host.name)
            if pushmsg:
                pushmsg(m)
    if message:
-        if log:
+        eventlog(uname, "INFO", "msg: %s" % message, service=service)
            log(uname, "msg: %s" % message, service=service)
        if uname in cfg.get("watchhosts", []):
            if pushmsg:
                pushmsg(message)
@@ -199,9 +198,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    if conn.getstate() != hbdcls.Connection.UP:
        lasts = conn.state
        d = conn.newstate(hbdcls.Connection.UP, now)
-        m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
+        if d == 0 or lasts == "unknown":
-        if log:
+            m = "%s is up" % (conn.afam)
-            log(uname, m)
+        else:
            m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
        eventlog(uname, "RECOVER", m)
        if uname in cfg.get("watchhosts", []):
            if pushmsg:
                pushmsg("%s %s is back" % (uname, conn.afam))
@@ -212,8 +213,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
        host.upcount += 1
    if shutdown:
-        if log:
+        eventlog(uname, "INFO", "%s shutdown" % conn.afam)
            log(uname, "%s shutdown" % conn.afam)
        if uname in cfg.get("watchhosts", []):
            if pushmsg:
                pushmsg("%s %s shutdown" % (uname, conn.afam))
@@ -222,6 +222,61 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    if interval > 0:
        host.interval = interval
    # Timer-based reachability monitoring
    # Reset overdue timer on every heartbeat
    if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
        grace = cfg.get("grace", 2)
        timeout_seconds = (interval + grace) if interval > 0 else 30
        # Create callback for timer expiration
        async def on_overdue(connection):
            """Called when connection timer expires (no heartbeat received)."""
            import time
            now = time.time()
            # Only mark as overdue if still in UP state (not already marked)
            if connection.getstate() == hbdcls.Connection.UP:
                connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
                msg = f"{connection.afam} overdue"
                eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
                if uname in cfg.get("watchhosts", []):
                    if pushmsg:
                        pushmsg(f"{uname} {msg}")
                # Notify websockets
                if msg_to_websockets:
                    msg_to_websockets("host", host.stateinfo())
                # Set a longer timer for marking as UNKNOWN (7 days)
                DROPOVERDUE = 7 * 24 * 3600
                async def on_unknown(connection):
                    """Mark connection as unknown after extended absence."""
                    connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
                    if msg_to_websockets:
                        msg_to_websockets("host", host.stateinfo())
                connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
        # Reset the timer
        conn.reset_overdue_timer(timeout_seconds, on_overdue)
    # Check RTT thresholds using the threshold checker
    threshold_checker = ctx.get("threshold_checker")
    if threshold_checker and rtt and rtt > 0:
        # Metric path for RTT is "rtt.<hostname>"
        metric_path = f"rtt.{uname}"
        # Check against configured thresholds (handles alerts, notifications, etc.)
        threshold_checker.check_value(
            host_name=uname,
            metric_path=metric_path,
            value=rtt,
            alert_states=host.alert_states
        )
    # send ACK back
    rmsg = {"time": __import__("time").time()}
    if host.cver < 1:
@@ -266,5 +321,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
    if msg_to_websockets:
        try:
            msg_to_websockets("host", host.stateinfo())
-        except Exception:
+        except Exception as e:
-            pass
+            if DEBUG > 0:
                print(("cannot send websocket message: %s" % e))
@@ -8,6 +8,7 @@ import asyncio
 import json
 import logging
 from typing import Callable, Iterable, Optional
 from . import data
 import websockets
@@ -16,7 +17,7 @@ logger.setLevel(logging.INFO)
 _connections = set()
 _loop: Optional[asyncio.AbstractEventLoop] = None
 _get_hosts: Optional[Callable[[], Iterable]] = None
-_get_msgs: Optional[Callable[[], Iterable]] = None
+#_get_msgs: Optional[Callable[[], Iterable]] = None
 _verbose = False
@@ -38,11 +39,11 @@ async def _handler(websocket, path=None):
            except Exception as e:
                logger.error("Error sending initial hosts: %s", e, exc_info=True)
        # send recent messages
-        if _get_msgs:
+        if data.msgs:
            try:
-                msgs = list(_get_msgs())[-100:]
+#                msgs = list(_get_msgs())[-100:]
-                logger.debug("Sending %d recent messages to new WebSocket client", len(msgs))
+                logger.debug("Sending %d recent messages to new WebSocket client", len(data.msgs))
-                for m in msgs:
+                for m in data.msgs:
                    jmsg = json.dumps({"type": "message", "data": m})
                    await websocket.send(jmsg)
            except Exception as e:
@@ -77,7 +78,7 @@ async def start(
    wss_port: Optional[int] = None,
    ssl_context=None,
    get_hosts: Optional[Callable] = None,
-    get_msgs: Optional[Callable] = None,
+#    get_msgs: Optional[Callable] = None,
    config: dict = {},
 ):
    """Start WebSocket servers and block until cancelled.
@@ -86,17 +87,19 @@ async def start(
    If `wss_port` and `ssl_context` are provided, a WSS server will also be
    started.
    """
-    global _loop, _get_hosts, _get_msgs, _verbose
+    global _loop, _get_hosts, _verbose
    _loop = asyncio.get_running_loop()
    _get_hosts = get_hosts
    _get_msgs = get_msgs
    _verbose = config.get("verbose", False),
-    _debug = config.get("debug", False),
+    _debug = config.get("debug", 0),
    servers = []
    # plain WebSocket
    websockets_logger = logging.getLogger("websockets.server")
-    websockets_logger.setLevel(logging.DEBUG if _debug > 2 else logging.INFO)
+    #if _debug > 2:
    #    websockets_logger.setLevel(logging.DEBUG)
    #else:   
    #    websockets_logger.setLevel(logging.INFO)
    # regular WebSocket
    ws_server = websockets.serve(_handler, host, ws_port)  # , subprotocols=["hbd"])
    servers.append(ws_server)