refactor monitor, add threshold rtesting
This commit is contained in:
@@ -64,4 +64,17 @@ thresholds:
|
|||||||
percent:
|
percent:
|
||||||
warning: 8.0
|
warning: 8.0
|
||||||
critical: 90.0
|
critical: 90.0
|
||||||
|
nagios_runner:
|
||||||
|
load_status:
|
||||||
|
warning: WARNING
|
||||||
|
operator: = "="
|
||||||
|
critical: CRITICAL
|
||||||
|
operator: = "="
|
||||||
|
UPS_status_code:
|
||||||
|
warning: 1
|
||||||
|
critical: 2
|
||||||
|
operator: ">="
|
||||||
|
rtt:
|
||||||
|
y:
|
||||||
|
warning: 0.1
|
||||||
|
critical: 10.0
|
||||||
|
|||||||
Binary file not shown.
Vendored
+1
-1
@@ -9,7 +9,7 @@
|
|||||||
"type": "debugpy",
|
"type": "debugpy",
|
||||||
"request": "launch",
|
"request": "launch",
|
||||||
"module": "hbd.server.cli",
|
"module": "hbd.server.cli",
|
||||||
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x"],
|
"args": ["-c", "/home/andreas/git/heartbeat/.hb.yaml", "-f", "-v", "-x", "-x", "-x", "-x"],
|
||||||
"cwd": "${workspaceFolder}",
|
"cwd": "${workspaceFolder}",
|
||||||
"env": {
|
"env": {
|
||||||
"PYTHONPATH": "${workspaceFolder}"
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
|||||||
@@ -150,6 +150,18 @@ Heartbeat includes a sophisticated threshold alerting system that monitors plugi
|
|||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
thresholds:
|
thresholds:
|
||||||
|
# RTT (Round-Trip Time) thresholds for heartbeat monitoring
|
||||||
|
# These are checked on every HTB message arrival
|
||||||
|
rtt:
|
||||||
|
webserver01:
|
||||||
|
warning: 100.0 # Warn when RTT > 100ms
|
||||||
|
critical: 500.0 # Critical when RTT > 500ms
|
||||||
|
|
||||||
|
database01:
|
||||||
|
warning: 50.0
|
||||||
|
critical: 200.0
|
||||||
|
|
||||||
|
# Plugin metric thresholds
|
||||||
cpu_monitor:
|
cpu_monitor:
|
||||||
cpu_percent:
|
cpu_percent:
|
||||||
warning: 80.0 # Warn when CPU > 80%
|
warning: 80.0 # Warn when CPU > 80%
|
||||||
@@ -177,6 +189,38 @@ thresholds:
|
|||||||
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
|
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### RTT Monitoring
|
||||||
|
|
||||||
|
Heartbeat monitors network latency (Round-Trip Time) for each host's heartbeat messages. RTT thresholds are **fully integrated with the threshold alerting system**:
|
||||||
|
|
||||||
|
- **Per-host configuration**: Set different thresholds for each monitored host
|
||||||
|
- **Real-time checking**: Thresholds evaluated on every HTB message arrival
|
||||||
|
- **Alert state tracking**: RTT alerts use the same state management as plugin metrics
|
||||||
|
- **Hysteresis support**: Configurable hysteresis prevents rapid state transitions
|
||||||
|
- **Alerts dashboard**: RTT alerts visible on the `/alerts` web page alongside plugin alerts
|
||||||
|
- **Smart notifications**: Only triggers on state changes (OK → WARNING → CRITICAL)
|
||||||
|
- **Re-notification**: Periodic reminders for ongoing RTT issues
|
||||||
|
- **Event & journal logging**: All RTT events logged for audit trail
|
||||||
|
|
||||||
|
**Configuration format:**
|
||||||
|
```yaml
|
||||||
|
thresholds:
|
||||||
|
rtt:
|
||||||
|
<hostname>:
|
||||||
|
warning: <milliseconds> # Warn when RTT > this value
|
||||||
|
critical: <milliseconds> # Critical when RTT > this value
|
||||||
|
hysteresis: 0.1 # Optional: 10% hysteresis (default)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example alerts:**
|
||||||
|
```
|
||||||
|
WARNING: webserver01 - rtt.webserver01 = 125.3
|
||||||
|
CRITICAL: database01 - rtt.database01 = 520.1
|
||||||
|
RECOVERED: webserver01 - rtt.webserver01 = 45.2 (WARNING -> OK)
|
||||||
|
```
|
||||||
|
|
||||||
|
RTT alerts appear on the Alerts dashboard and can be filtered by severity level. The `metric_path` format is `rtt.<hostname>`, making it easy to distinguish from plugin metrics.
|
||||||
|
|
||||||
### Alert Behavior
|
### Alert Behavior
|
||||||
|
|
||||||
1. **State Changes**: Notifications sent when crossing thresholds
|
1. **State Changes**: Notifications sent when crossing thresholds
|
||||||
|
|||||||
+2
-3
@@ -43,9 +43,8 @@ def main(argv=None):
|
|||||||
config["verbose"] = True
|
config["verbose"] = True
|
||||||
if args.pushsrv:
|
if args.pushsrv:
|
||||||
config["pushsrv"] = args.pushsrv
|
config["pushsrv"] = args.pushsrv
|
||||||
if args.debug:
|
if args.debug > 0:
|
||||||
config.setdefault("debug", 0)
|
config["debug"] = args.debug
|
||||||
config["debug"] += args.debug
|
|
||||||
|
|
||||||
run_server(config)
|
run_server(config)
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,12 @@
|
|||||||
|
msgs = [] # in-memory list of recent messages for new websocket clients; also logged to file via notify.eventlog
|
||||||
|
class Data:
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.data = {}
|
||||||
|
|
||||||
|
def update(self, new_data):
|
||||||
|
self.data.update(new_data)
|
||||||
|
|
||||||
|
def get(self, key, default=None):
|
||||||
|
return self.data.get(key, default)
|
||||||
+135
-5
@@ -43,12 +43,38 @@ class Connection:
|
|||||||
self.deltastatetime = "computed"
|
self.deltastatetime = "computed"
|
||||||
self.state = Connection.UNKNOWN
|
self.state = Connection.UNKNOWN
|
||||||
|
|
||||||
|
# Timer-based reachability monitoring
|
||||||
|
self.overdue_timer = None
|
||||||
|
self.overdue_callback = None
|
||||||
|
self.timeout_duration = None
|
||||||
|
|
||||||
if host:
|
if host:
|
||||||
Connection.htab[addr] = self.host.name
|
Connection.htab[addr] = self.host.name
|
||||||
if self.host.isDynDns():
|
if self.host.isDynDns():
|
||||||
log(self.host.name, "dns update %s" % self.addr)
|
log(self.host.name, "dns update %s" % self.addr)
|
||||||
Host.dnsQ.put((self.host.name, self.addr))
|
Host.dnsQ.put((self.host.name, self.addr))
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
"""Prepare Connection for pickling by excluding non-serializable timer objects."""
|
||||||
|
state = self.__dict__.copy()
|
||||||
|
# Remove asyncio timer objects that can't be pickled
|
||||||
|
# These will be recreated when the next HTB arrives after unpickling
|
||||||
|
state['overdue_timer'] = None
|
||||||
|
state['overdue_callback'] = None
|
||||||
|
state['timeout_duration'] = None
|
||||||
|
return state
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
"""Restore Connection from pickle, reinitializing timer fields."""
|
||||||
|
self.__dict__.update(state)
|
||||||
|
# Ensure timer fields are initialized (they'll be recreated when HTB arrives)
|
||||||
|
if not hasattr(self, 'overdue_timer'):
|
||||||
|
self.overdue_timer = None
|
||||||
|
if not hasattr(self, 'overdue_callback'):
|
||||||
|
self.overdue_callback = None
|
||||||
|
if not hasattr(self, 'timeout_duration'):
|
||||||
|
self.timeout_duration = None
|
||||||
|
|
||||||
def registerDns(self):
|
def registerDns(self):
|
||||||
Host.dnsQ.put((self.host.name, self.addr))
|
Host.dnsQ.put((self.host.name, self.addr))
|
||||||
|
|
||||||
@@ -123,7 +149,18 @@ class Connection:
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def jsons(self):
|
def jsons(self):
|
||||||
return json.dumps(self.__dict__)
|
"""Serialize connection to JSON, excluding non-serializable timer objects."""
|
||||||
|
data = {}
|
||||||
|
for key, value in self.__dict__.items():
|
||||||
|
# Skip timer-related fields that can't be serialized
|
||||||
|
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
|
||||||
|
continue
|
||||||
|
# Handle host backpointer by converting to name
|
||||||
|
if key == 'host':
|
||||||
|
data[key] = value.name if value else None
|
||||||
|
else:
|
||||||
|
data[key] = value
|
||||||
|
return json.dumps(data)
|
||||||
|
|
||||||
# set new state, return number of secs in previous state
|
# set new state, return number of secs in previous state
|
||||||
def newstate(self, state, now, when=0):
|
def newstate(self, state, now, when=0):
|
||||||
@@ -151,11 +188,88 @@ class Connection:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
self.addr = addr
|
self.addr = addr
|
||||||
Connection.htab[addr] = self.host.name
|
Connection.htab[addr] = self.host.nameconnection_count
|
||||||
if self.host.isDynDns():
|
if self.host.isDynDns():
|
||||||
Host.dnsQ.put((self.host.name, self.addr))
|
Host.dnsQ.put((self.host.name, self.addr))
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
def reset_overdue_timer(self, timeout_seconds, callback):
|
||||||
|
"""Reset the overdue timer for this connection.
|
||||||
|
|
||||||
|
Cancels any existing timer and sets a new one that will mark
|
||||||
|
the connection as overdue if no heartbeat arrives before timeout.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
timeout_seconds: Seconds before marking as overdue
|
||||||
|
callback: Async function to call when timer expires
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
# Cancel existing timer if any
|
||||||
|
if self.overdue_timer and not self.overdue_timer.cancelled():
|
||||||
|
self.overdue_timer.cancel()
|
||||||
|
|
||||||
|
# Store parameters for later reference
|
||||||
|
self.timeout_duration = timeout_seconds
|
||||||
|
self.overdue_callback = callback
|
||||||
|
|
||||||
|
# Create new timer
|
||||||
|
async def timer_expired():
|
||||||
|
await callback(self)
|
||||||
|
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
self.overdue_timer = loop.call_later(timeout_seconds,
|
||||||
|
lambda: asyncio.create_task(timer_expired()))
|
||||||
|
except RuntimeError:
|
||||||
|
# No event loop running yet
|
||||||
|
pass
|
||||||
|
|
||||||
|
def cancel_overdue_timer(self):
|
||||||
|
"""Cancel the overdue timer if it exists and clear all timer references."""
|
||||||
|
if self.overdue_timer:
|
||||||
|
try:
|
||||||
|
if not self.overdue_timer.cancelled():
|
||||||
|
self.overdue_timer.cancel()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Clear all timer-related references
|
||||||
|
self.overdue_timer = None
|
||||||
|
self.overdue_callback = None
|
||||||
|
self.timeout_duration = None
|
||||||
|
|
||||||
|
def get_avg_rtt(self):
|
||||||
|
"""Get average RTT from recent samples."""
|
||||||
|
valid_rtts = [r for r in self.rtts if r > 0]
|
||||||
|
if valid_rtts:
|
||||||
|
return sum(valid_rtts) / len(valid_rtts)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def get_current_rtt(self):
|
||||||
|
"""Get most recent RTT value."""
|
||||||
|
return self.rtts[-1] if self.rtts else 0
|
||||||
|
|
||||||
|
def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
|
||||||
|
"""Check if RTT exceeds thresholds.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
warning_threshold: RTT in ms for warning level
|
||||||
|
critical_threshold: RTT in ms for critical level
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (level, rtt_value) where level is None, 'WARNING', or 'CRITICAL'
|
||||||
|
"""
|
||||||
|
rtt = self.get_current_rtt()
|
||||||
|
if rtt <= 0:
|
||||||
|
return (None, rtt)
|
||||||
|
|
||||||
|
if critical_threshold and rtt > critical_threshold:
|
||||||
|
return ('CRITICAL', rtt)
|
||||||
|
elif warning_threshold and rtt > warning_threshold:
|
||||||
|
return ('WARNING', rtt)
|
||||||
|
|
||||||
|
return (None, rtt)
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
class Host:
|
class Host:
|
||||||
@@ -224,14 +338,30 @@ class Host:
|
|||||||
def stateinfo(self):
|
def stateinfo(self):
|
||||||
ddict = {}
|
ddict = {}
|
||||||
for d in self.__dict__:
|
for d in self.__dict__:
|
||||||
|
if d in ["alert_states", "plugin_data"]:
|
||||||
|
continue
|
||||||
if d == "connections":
|
if d == "connections":
|
||||||
cl = []
|
cl = []
|
||||||
for c in ["IPv4", "IPv6"]:
|
for c in ["IPv4", "IPv6"]:
|
||||||
if c not in self.connections:
|
if c not in self.connections:
|
||||||
continue
|
continue
|
||||||
# dirty ugly hack: fix conn to host backpointer
|
# Create connection dict, excluding non-serializable timer objects
|
||||||
cld = copy.deepcopy(self.connections[c].__dict__)
|
conn = self.connections[c]
|
||||||
cld["host"] = cld["host"].name
|
cld = {}
|
||||||
|
for key, value in conn.__dict__.items():
|
||||||
|
# Skip timer-related fields that can't be serialized
|
||||||
|
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
|
||||||
|
continue
|
||||||
|
# Handle host backpointer by converting to name
|
||||||
|
if key == 'host':
|
||||||
|
cld[key] = value.name if value else None
|
||||||
|
else:
|
||||||
|
# Safe copy for serializable values
|
||||||
|
try:
|
||||||
|
cld[key] = copy.deepcopy(value)
|
||||||
|
except Exception:
|
||||||
|
# If deepcopy fails, use shallow copy
|
||||||
|
cld[key] = value
|
||||||
cl.append(cld)
|
cl.append(cld)
|
||||||
ddict[d] = cl
|
ddict[d] = cl
|
||||||
else:
|
else:
|
||||||
|
|||||||
+4
-4
@@ -8,6 +8,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
from aiohttp import web
|
from aiohttp import web
|
||||||
import jinja2
|
import jinja2
|
||||||
|
from . import data
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -22,7 +23,6 @@ async def start(
|
|||||||
port: int,
|
port: int,
|
||||||
config,
|
config,
|
||||||
hbdclass,
|
hbdclass,
|
||||||
msgs_getter,
|
|
||||||
log=None,
|
log=None,
|
||||||
email=None,
|
email=None,
|
||||||
pushmsg=None,
|
pushmsg=None,
|
||||||
@@ -52,7 +52,7 @@ async def start(
|
|||||||
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
|
res.append('<body BGCOLOR = "#FFFFFF" LINK = "#008000" VLINK = "#008000">')
|
||||||
res.append(f"<H2>Heartbeat status {VER}</h2>")
|
res.append(f"<H2>Heartbeat status {VER}</h2>")
|
||||||
res += hbdclass.ubHost.buildhosttable()
|
res += hbdclass.ubHost.buildhosttable()
|
||||||
res += hbdclass.ubHost.buildmsgtable(msgs_getter())
|
res += hbdclass.ubHost.buildmsgtable(data.msgs)
|
||||||
res.append(
|
res.append(
|
||||||
"<p> %s (%s)</p>"
|
"<p> %s (%s)</p>"
|
||||||
% (
|
% (
|
||||||
@@ -69,7 +69,7 @@ async def start(
|
|||||||
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
return web.json_response(json.loads("[" + ",".join(lst) + "]"))
|
||||||
|
|
||||||
async def api_messages(request):
|
async def api_messages(request):
|
||||||
lst = msgs_getter()[-30:]
|
lst = data.msgs[-30:]
|
||||||
return web.json_response(lst)
|
return web.json_response(lst)
|
||||||
|
|
||||||
async def cmd(request):
|
async def cmd(request):
|
||||||
@@ -155,7 +155,7 @@ async def start(
|
|||||||
hosts=[
|
hosts=[
|
||||||
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
|
hbdclass.Host.hosts[h].stateinfo() for h in sorted(hbdclass.Host.hosts)
|
||||||
],
|
],
|
||||||
messages=msgs_getter()[-30:],
|
messages=data.msgs[-30:],
|
||||||
)
|
)
|
||||||
return web.Response(text=body, content_type="text/html")
|
return web.Response(text=body, content_type="text/html")
|
||||||
|
|
||||||
|
|||||||
+26
-18
@@ -14,28 +14,35 @@ from . import hbdclass
|
|||||||
|
|
||||||
from . import ws as ws_mod
|
from . import ws as ws_mod
|
||||||
from . import notify as notify_mod
|
from . import notify as notify_mod
|
||||||
|
from . import data
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
msg_to_websockets = ws_mod.broadcast
|
msg_to_websockets = ws_mod.broadcast
|
||||||
eventlog = notify_mod.log
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
lastfm = ["", "", ""]
|
|
||||||
|
|
||||||
# shared runtime collections and helpers
|
# shared runtime collections and helpers
|
||||||
msgs = notify_mod.msgs
|
|
||||||
|
|
||||||
def cleanup_function(config):
|
def cleanup_function(config, hbdclass):
|
||||||
"""This function will be executed upon program exit."""
|
"""This function will be executed upon program exit."""
|
||||||
logger.info("Running cleanup function...")
|
logger.info("Running cleanup function...")
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
|
# Ensure all timer references are cleared before pickling
|
||||||
|
for hostname, host in list(hbdclass.Host.hosts.items()):
|
||||||
|
for conn_type, conn in host.connections.items():
|
||||||
|
if hasattr(conn, 'overdue_timer'):
|
||||||
|
conn.overdue_timer = None
|
||||||
|
if hasattr(conn, 'overdue_callback'):
|
||||||
|
conn.overdue_callback = None
|
||||||
|
if hasattr(conn, 'timeout_duration'):
|
||||||
|
conn.timeout_duration = None
|
||||||
|
|
||||||
pickfile = config.get("pickfile", "hbd.pickle")
|
pickfile = config.get("pickfile", "hbd.pickle")
|
||||||
|
|
||||||
pickf = open(pickfile, "wb")
|
pickf = open(pickfile, "wb")
|
||||||
pick = pickle.Pickler(pickf)
|
pick = pickle.Pickler(pickf)
|
||||||
pick.dump(hbdclass.Host.hosts)
|
pick.dump(hbdclass.Host.hosts)
|
||||||
pick.dump(msgs)
|
pick.dump(data.msgs)
|
||||||
pick.dump(lastfm)
|
|
||||||
pickf.close()
|
pickf.close()
|
||||||
|
|
||||||
logger.info("Cleanup complete.")
|
logger.info("Cleanup complete.")
|
||||||
@@ -124,7 +131,6 @@ async def _run_async(config):
|
|||||||
port=config.get("hbd_port", 50004),
|
port=config.get("hbd_port", 50004),
|
||||||
config=config,
|
config=config,
|
||||||
hbdclass=hbdclass,
|
hbdclass=hbdclass,
|
||||||
msgs_getter=lambda: msgs,
|
|
||||||
log=eventlog,
|
log=eventlog,
|
||||||
pushmsg=pushmsg,
|
pushmsg=pushmsg,
|
||||||
msg_to_websockets=msg_to_websockets,
|
msg_to_websockets=msg_to_websockets,
|
||||||
@@ -186,7 +192,7 @@ async def _run_async(config):
|
|||||||
hbdclass.Host.hosts[h].stateinfo()
|
hbdclass.Host.hosts[h].stateinfo()
|
||||||
for h in sorted(hbdclass.Host.hosts)
|
for h in sorted(hbdclass.Host.hosts)
|
||||||
],
|
],
|
||||||
get_msgs=lambda: msgs,
|
# get_msgs=lambda: msgs,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -200,7 +206,6 @@ async def _run_async(config):
|
|||||||
monitor_mod.start(
|
monitor_mod.start(
|
||||||
config=config,
|
config=config,
|
||||||
hbdclass=hbdclass,
|
hbdclass=hbdclass,
|
||||||
log=eventlog,
|
|
||||||
pushmsg=pushmsg,
|
pushmsg=pushmsg,
|
||||||
msg_to_websockets=msg_to_websockets,
|
msg_to_websockets=msg_to_websockets,
|
||||||
)
|
)
|
||||||
@@ -216,6 +221,13 @@ async def _run_async(config):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("Error in main loop: %s", e)
|
logger.exception("Error in main loop: %s", e)
|
||||||
finally:
|
finally:
|
||||||
|
# Clean up connection timers
|
||||||
|
try:
|
||||||
|
logger.info("Cleaning up connection timers...")
|
||||||
|
await monitor_mod.cleanup_connections(hbdclass)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Error cleaning up connection timers: %s", e)
|
||||||
|
|
||||||
# Cancel all running tasks
|
# Cancel all running tasks
|
||||||
logger.info("Cancelling tasks...")
|
logger.info("Cancelling tasks...")
|
||||||
try:
|
try:
|
||||||
@@ -279,7 +291,6 @@ async def _run_async(config):
|
|||||||
|
|
||||||
def load_pickled_hosts(config, hbdclass):
|
def load_pickled_hosts(config, hbdclass):
|
||||||
"""Load pickled hosts from file, if available."""
|
"""Load pickled hosts from file, if available."""
|
||||||
global lastfm, msgs
|
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
|
|
||||||
@@ -294,11 +305,7 @@ def load_pickled_hosts(config, hbdclass):
|
|||||||
pick = pickle.Unpickler(pickf)
|
pick = pickle.Unpickler(pickf)
|
||||||
try:
|
try:
|
||||||
hbdclass.Host.hosts = pick.load()
|
hbdclass.Host.hosts = pick.load()
|
||||||
msgs = pick.load()
|
data.msgs = pick.load()
|
||||||
try:
|
|
||||||
lastfm = pick.load()
|
|
||||||
except Exception:
|
|
||||||
lastfm = ["", "", ""]
|
|
||||||
pickf.close()
|
pickf.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("load pickled failed: %s", e)
|
logger.exception("load pickled failed: %s", e)
|
||||||
@@ -331,7 +338,7 @@ def run(config):
|
|||||||
load_pickled_hosts(config, hbdclass)
|
load_pickled_hosts(config, hbdclass)
|
||||||
|
|
||||||
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
||||||
eventlog(None, f"hbd version {__version__} starting up")
|
eventlog(None, "INFO", f"hbd version {__version__} starting up")
|
||||||
|
|
||||||
# Create and set the event loop manually
|
# Create and set the event loop manually
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
@@ -344,8 +351,9 @@ def run(config):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.exception("Unhandled exception in main: %s", e)
|
logger.exception("Unhandled exception in main: %s", e)
|
||||||
finally:
|
finally:
|
||||||
cleanup_function(config)
|
cleanup_function(config, hbdclass)
|
||||||
logger.info("hbd shutdown complete")
|
logger.info("hbd shutdown complete")
|
||||||
|
eventlog(None, "INFO", f"hbd version {__version__} shutdown")
|
||||||
notify_mod.closelog()
|
notify_mod.closelog()
|
||||||
# Explicitly close the loop
|
# Explicitly close the loop
|
||||||
try:
|
try:
|
||||||
|
|||||||
+49
-33
@@ -1,50 +1,66 @@
|
|||||||
"""monitor helper and thread for heartbeat daemon."""
|
"""Monitor helper for heartbeat daemon.
|
||||||
|
|
||||||
|
This module provides monitoring tasks for the heartbeat daemon.
|
||||||
|
The primary reachability monitoring is now event-driven (timers set/reset
|
||||||
|
on HTB arrival in udp.py) rather than periodic polling.
|
||||||
|
|
||||||
|
This module can be extended for additional monitoring tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
DROPOVERDUE = 7 * 24 * 3600
|
DROPOVERDUE = 7 * 24 * 3600
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
|
||||||
def checkoverdue(
|
async def cleanup_connections(hbdclass):
|
||||||
config: dict,
|
"""Clean up connection timers on shutdown.
|
||||||
hbdclass,
|
|
||||||
log: callable,
|
Cancels all active overdue timers to prevent callbacks after shutdown.
|
||||||
pushmsg: callable,
|
"""
|
||||||
msg_to_websockets: callable,
|
for hostname, host in list(hbdclass.Host.hosts.items()):
|
||||||
):
|
for conn_type, conn in host.connections.items():
|
||||||
now = time.time()
|
if hasattr(conn, 'cancel_overdue_timer'):
|
||||||
for h in list(hbdclass.Host.hosts.keys()):
|
conn.cancel_overdue_timer()
|
||||||
pmsg = []
|
|
||||||
for c in hbdclass.Host.hosts[h].connections:
|
|
||||||
conn = hbdclass.Host.hosts[h].connections[c]
|
|
||||||
if conn.state == hbdclass.Connection.DOWN:
|
|
||||||
continue
|
|
||||||
timeout = hbdclass.Host.hosts[h].interval + config.get("grace", 10)
|
|
||||||
if conn.state == hbdclass.Connection.UP and (now - conn.lastbeat) > timeout:
|
|
||||||
conn.newstate(hbdclass.Connection.OVERDUE, now, config.get("grace", 10))
|
|
||||||
pmsg.append(conn.afam)
|
|
||||||
if (
|
|
||||||
conn.state == hbdclass.Connection.OVERDUE
|
|
||||||
and (now - conn.lastbeat) > DROPOVERDUE
|
|
||||||
):
|
|
||||||
conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
|
|
||||||
if pmsg != []:
|
|
||||||
if h in config.get("watchhosts", []):
|
|
||||||
pushmsg("%s %s overdue" % (h, " and ".join(pmsg)))
|
|
||||||
log(h, "%s overdue" % " and ".join(pmsg))
|
|
||||||
msg_to_websockets("host", hbdclass.Host.hosts[h].stateinfo())
|
|
||||||
|
|
||||||
|
|
||||||
async def start(
|
async def start(
|
||||||
config: dict,
|
config: dict,
|
||||||
hbdclass: callable,
|
hbdclass: callable,
|
||||||
log=None,
|
|
||||||
pushmsg=None,
|
pushmsg=None,
|
||||||
msg_to_websockets=None,
|
msg_to_websockets=None,
|
||||||
):
|
):
|
||||||
"""start a monitor loop that checks for overdue hosts every minute"""
|
"""Start monitor background tasks.
|
||||||
|
|
||||||
|
Note: Reachability monitoring is now timer-based and happens in udp.py
|
||||||
|
when HTB messages arrive. This function can be used for additional
|
||||||
|
monitoring tasks.
|
||||||
|
|
||||||
|
Currently runs a simple status logger every 5 minutes.
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger_interval = 300 # Log status every 5 minutes
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
await asyncio.sleep(15) # 15 seconds between checks
|
await asyncio.sleep(logger_interval)
|
||||||
checkoverdue(config, hbdclass, log, pushmsg, msg_to_websockets)
|
|
||||||
|
# Log monitoring status
|
||||||
|
total_hosts = len(hbdclass.Host.hosts)
|
||||||
|
up_count = sum(
|
||||||
|
1 for h in hbdclass.Host.hosts.values()
|
||||||
|
for c in h.connections.values()
|
||||||
|
if c.state == hbdclass.Connection.UP
|
||||||
|
)
|
||||||
|
overdue_count = sum(
|
||||||
|
1 for h in hbdclass.Host.hosts.values()
|
||||||
|
for c in h.connections.values()
|
||||||
|
if c.state == hbdclass.Connection.OVERDUE
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
|
||||||
|
)
|
||||||
|
|||||||
+10
-6
@@ -8,7 +8,9 @@ import subprocess
|
|||||||
import smtplib
|
import smtplib
|
||||||
import time
|
import time
|
||||||
import sys
|
import sys
|
||||||
|
from . import data
|
||||||
from . import ws as ws_mod
|
from . import ws as ws_mod
|
||||||
|
from . import main as main_mod
|
||||||
|
|
||||||
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
|
DEFAULT_PUSHPROVIDERS = ["all", "pushover", "mattermost", "signal"]
|
||||||
msg_to_websockets = ws_mod.broadcast
|
msg_to_websockets = ws_mod.broadcast
|
||||||
@@ -17,19 +19,18 @@ msg_to_websockets = ws_mod.broadcast
|
|||||||
_config = {}
|
_config = {}
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
msgs = []
|
|
||||||
logf = None
|
logf = None
|
||||||
|
|
||||||
def initlog(logfile):
|
def initlog(logfile):
|
||||||
global logf
|
global logf
|
||||||
try:
|
try:
|
||||||
logf = open(logfile, "a+")
|
logf = open(logfile, "a+")
|
||||||
return logf
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
|
print("cannot open logfile %s, using STDERR: %s" % (logfile, e))
|
||||||
return sys.stderr
|
logf = sys.stderr
|
||||||
|
return logf
|
||||||
|
|
||||||
def closelog():
|
def closelog():
|
||||||
global logf
|
global logf
|
||||||
@@ -39,10 +40,13 @@ def closelog():
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def log(host, m, service=None):
|
def eventlog(host, lvl, m, service=None):
|
||||||
ts = time.time()
|
ts = time.time()
|
||||||
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {host or ''} {m}"
|
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
|
||||||
msgs.append(s)
|
if host:
|
||||||
|
s += f"{host} "
|
||||||
|
s += m
|
||||||
|
data.msgs.append(s)
|
||||||
logger.info(s)
|
logger.info(s)
|
||||||
if logf:
|
if logf:
|
||||||
try:
|
try:
|
||||||
|
|||||||
+109
-7
@@ -16,7 +16,7 @@ from typing import Dict, Any, Optional, Tuple, Callable
|
|||||||
from . import notify as notify_mod
|
from . import notify as notify_mod
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
eventlog = notify_mod.log
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
class AlertLevel(Enum):
|
class AlertLevel(Enum):
|
||||||
"""Alert severity levels."""
|
"""Alert severity levels."""
|
||||||
@@ -96,6 +96,8 @@ class AlertState:
|
|||||||
"notification_count": self.notification_count,
|
"notification_count": self.notification_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.to_dict().__str__()
|
||||||
|
|
||||||
class ThresholdConfig:
|
class ThresholdConfig:
|
||||||
"""Configuration for a single threshold check."""
|
"""Configuration for a single threshold check."""
|
||||||
@@ -280,6 +282,11 @@ class ThresholdChecker:
|
|||||||
|
|
||||||
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
|
def _parse_plugin_thresholds(self, plugin_name: str, thresholds: Dict[str, Any]):
|
||||||
"""Parse thresholds for a specific plugin."""
|
"""Parse thresholds for a specific plugin."""
|
||||||
|
# Special handling for RTT thresholds (per-host)
|
||||||
|
if plugin_name == "rtt":
|
||||||
|
self._parse_rtt_thresholds(thresholds)
|
||||||
|
return
|
||||||
|
|
||||||
for metric_name, threshold_config in thresholds.items():
|
for metric_name, threshold_config in thresholds.items():
|
||||||
if not isinstance(threshold_config, dict):
|
if not isinstance(threshold_config, dict):
|
||||||
continue
|
continue
|
||||||
@@ -353,6 +360,97 @@ class ThresholdChecker:
|
|||||||
|
|
||||||
self.thresholds[metric_path] = threshold
|
self.thresholds[metric_path] = threshold
|
||||||
|
|
||||||
|
def _parse_rtt_thresholds(self, rtt_thresholds: Dict[str, Any]):
|
||||||
|
"""Parse RTT thresholds (per-host network latency thresholds).
|
||||||
|
|
||||||
|
RTT thresholds are configured as:
|
||||||
|
thresholds:
|
||||||
|
rtt:
|
||||||
|
hostname1:
|
||||||
|
warning: 100.0 # ms
|
||||||
|
critical: 500.0 # ms
|
||||||
|
"""
|
||||||
|
for hostname, threshold_config in rtt_thresholds.items():
|
||||||
|
if not isinstance(threshold_config, dict):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Metric path is "rtt.<hostname>"
|
||||||
|
metric_path = f"rtt.{hostname}"
|
||||||
|
|
||||||
|
warning = threshold_config.get("warning")
|
||||||
|
critical = threshold_config.get("critical")
|
||||||
|
operator = threshold_config.get("operator", ">")
|
||||||
|
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||||
|
enabled = threshold_config.get("enabled", True)
|
||||||
|
|
||||||
|
if warning is None and critical is None:
|
||||||
|
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
||||||
|
continue
|
||||||
|
|
||||||
|
threshold = ThresholdConfig(
|
||||||
|
metric_path=metric_path,
|
||||||
|
warning=warning,
|
||||||
|
critical=critical,
|
||||||
|
operator=operator,
|
||||||
|
hysteresis=hysteresis,
|
||||||
|
enabled=enabled,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.thresholds[metric_path] = threshold
|
||||||
|
logger.debug(
|
||||||
|
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
||||||
|
hostname,
|
||||||
|
warning,
|
||||||
|
critical
|
||||||
|
)
|
||||||
|
|
||||||
|
def check_value(
|
||||||
|
self,
|
||||||
|
host_name: str,
|
||||||
|
metric_path: str,
|
||||||
|
value: float,
|
||||||
|
alert_states: Dict[str, AlertState],
|
||||||
|
) -> Optional[Tuple[AlertLevel, AlertLevel]]:
|
||||||
|
"""
|
||||||
|
Check a single value against configured threshold.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
host_name: Name of the host
|
||||||
|
metric_path: Full metric path (e.g., "rtt.hostname")
|
||||||
|
value: The metric value to check
|
||||||
|
alert_states: Host's alert_states dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (old_level, new_level) if state changed, None otherwise
|
||||||
|
"""
|
||||||
|
if metric_path not in self.thresholds:
|
||||||
|
return None
|
||||||
|
|
||||||
|
threshold = self.thresholds[metric_path]
|
||||||
|
|
||||||
|
# Get or create alert state
|
||||||
|
if metric_path not in alert_states:
|
||||||
|
alert_states[metric_path] = AlertState(metric_path)
|
||||||
|
|
||||||
|
alert_state = alert_states[metric_path]
|
||||||
|
|
||||||
|
# Evaluate threshold with hysteresis
|
||||||
|
new_level = threshold.evaluate_with_hysteresis(
|
||||||
|
value,
|
||||||
|
alert_state.level
|
||||||
|
)
|
||||||
|
|
||||||
|
# Update state and check for changes
|
||||||
|
old_level = alert_state.level
|
||||||
|
if alert_state.update(new_level, value):
|
||||||
|
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||||
|
return (old_level, new_level)
|
||||||
|
elif new_level != AlertLevel.OK:
|
||||||
|
# Check if we should re-notify
|
||||||
|
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
def check_plugin_data(
|
def check_plugin_data(
|
||||||
self,
|
self,
|
||||||
host_name: str,
|
host_name: str,
|
||||||
@@ -476,18 +574,22 @@ class ThresholdChecker:
|
|||||||
"""Trigger a notification for an alert state change."""
|
"""Trigger a notification for an alert state change."""
|
||||||
# Format message
|
# Format message
|
||||||
if new_level == AlertLevel.OK:
|
if new_level == AlertLevel.OK:
|
||||||
message = f"RECOVERED: {host_name} - {metric_path} = {value} ({old_level.name} -> OK)"
|
lvl = "RECOVERED"
|
||||||
|
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
|
||||||
elif new_level == AlertLevel.WARNING:
|
elif new_level == AlertLevel.WARNING:
|
||||||
message = f"WARNING: {host_name} - {metric_path} = {value}"
|
lvl = "WARNING"
|
||||||
|
message = f"{metric_path} = {value}"
|
||||||
elif new_level == AlertLevel.CRITICAL:
|
elif new_level == AlertLevel.CRITICAL:
|
||||||
message = f"CRITICAL: {host_name} - {metric_path} = {value}"
|
lvl = "CRITICAL"
|
||||||
|
message = f"{metric_path} = {value}"
|
||||||
else:
|
else:
|
||||||
message = f"UNKNOWN: {host_name} - {metric_path} = {value}"
|
lvl = "UNKNOWN"
|
||||||
|
message = f"{metric_path} = {value}"
|
||||||
|
|
||||||
# Send notification
|
# Send notification
|
||||||
if self.notification_callback is not None:
|
if self.notification_callback is not None:
|
||||||
try:
|
try:
|
||||||
self.notification_callback(message)
|
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
||||||
logger.info("Notification sent: %s", message)
|
logger.info("Notification sent: %s", message)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to send notification: %s", e)
|
logger.error("Failed to send notification: %s", e)
|
||||||
@@ -507,7 +609,7 @@ class ThresholdChecker:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to log threshold event to journal: {e}")
|
logger.debug(f"Failed to log threshold event to journal: {e}")
|
||||||
# Log to eventlog as well
|
# Log to eventlog as well
|
||||||
eventlog(host_name, message, service="threshold")
|
eventlog(host_name, lvl, message, service="threshold")
|
||||||
|
|
||||||
def _check_renotify(
|
def _check_renotify(
|
||||||
self,
|
self,
|
||||||
|
|||||||
+69
-13
@@ -6,8 +6,10 @@ import logging
|
|||||||
|
|
||||||
from ..common.proto import stodict, oldmtodict
|
from ..common.proto import stodict, oldmtodict
|
||||||
from ..common.utils import dur
|
from ..common.utils import dur
|
||||||
|
from . import notify as notify_mod
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
eventlog = notify_mod.eventlog
|
||||||
|
|
||||||
|
|
||||||
class EchoServerProtocol(asyncio.DatagramProtocol):
|
class EchoServerProtocol(asyncio.DatagramProtocol):
|
||||||
@@ -170,8 +172,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
return
|
return
|
||||||
|
|
||||||
if res:
|
if res:
|
||||||
if log:
|
eventlog(uname, "WARNING", res)
|
||||||
log(uname, res)
|
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in cfg.get("watchhosts", []):
|
||||||
if pushmsg:
|
if pushmsg:
|
||||||
pushmsg("%s %s" % (host.name, res))
|
pushmsg("%s %s" % (host.name, res))
|
||||||
@@ -183,15 +184,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
boot = msg.get("boot", 0)
|
boot = msg.get("boot", 0)
|
||||||
|
|
||||||
if boot:
|
if boot:
|
||||||
if log:
|
eventlog(uname, "INFO", "booted")
|
||||||
log(uname, "booted")
|
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in cfg.get("watchhosts", []):
|
||||||
m = "%s booted" % (host.name)
|
m = "%s booted" % (host.name)
|
||||||
if pushmsg:
|
if pushmsg:
|
||||||
pushmsg(m)
|
pushmsg(m)
|
||||||
if message:
|
if message:
|
||||||
if log:
|
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||||
log(uname, "msg: %s" % message, service=service)
|
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in cfg.get("watchhosts", []):
|
||||||
if pushmsg:
|
if pushmsg:
|
||||||
pushmsg(message)
|
pushmsg(message)
|
||||||
@@ -199,9 +198,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
if conn.getstate() != hbdcls.Connection.UP:
|
if conn.getstate() != hbdcls.Connection.UP:
|
||||||
lasts = conn.state
|
lasts = conn.state
|
||||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
if d == 0 or lasts == "unknown":
|
||||||
if log:
|
m = "%s is up" % (conn.afam)
|
||||||
log(uname, m)
|
else:
|
||||||
|
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||||
|
eventlog(uname, "RECOVER", m)
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in cfg.get("watchhosts", []):
|
||||||
if pushmsg:
|
if pushmsg:
|
||||||
pushmsg("%s %s is back" % (uname, conn.afam))
|
pushmsg("%s %s is back" % (uname, conn.afam))
|
||||||
@@ -212,8 +213,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
host.upcount += 1
|
host.upcount += 1
|
||||||
|
|
||||||
if shutdown:
|
if shutdown:
|
||||||
if log:
|
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||||
log(uname, "%s shutdown" % conn.afam)
|
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in cfg.get("watchhosts", []):
|
||||||
if pushmsg:
|
if pushmsg:
|
||||||
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
||||||
@@ -222,6 +222,61 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
if interval > 0:
|
if interval > 0:
|
||||||
host.interval = interval
|
host.interval = interval
|
||||||
|
|
||||||
|
# Timer-based reachability monitoring
|
||||||
|
# Reset overdue timer on every heartbeat
|
||||||
|
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
|
||||||
|
grace = cfg.get("grace", 2)
|
||||||
|
timeout_seconds = (interval + grace) if interval > 0 else 30
|
||||||
|
|
||||||
|
# Create callback for timer expiration
|
||||||
|
async def on_overdue(connection):
|
||||||
|
"""Called when connection timer expires (no heartbeat received)."""
|
||||||
|
import time
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
# Only mark as overdue if still in UP state (not already marked)
|
||||||
|
if connection.getstate() == hbdcls.Connection.UP:
|
||||||
|
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||||
|
|
||||||
|
msg = f"{connection.afam} overdue"
|
||||||
|
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
|
||||||
|
|
||||||
|
if uname in cfg.get("watchhosts", []):
|
||||||
|
if pushmsg:
|
||||||
|
pushmsg(f"{uname} {msg}")
|
||||||
|
|
||||||
|
# Notify websockets
|
||||||
|
if msg_to_websockets:
|
||||||
|
msg_to_websockets("host", host.stateinfo())
|
||||||
|
|
||||||
|
# Set a longer timer for marking as UNKNOWN (7 days)
|
||||||
|
DROPOVERDUE = 7 * 24 * 3600
|
||||||
|
|
||||||
|
async def on_unknown(connection):
|
||||||
|
"""Mark connection as unknown after extended absence."""
|
||||||
|
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
|
||||||
|
if msg_to_websockets:
|
||||||
|
msg_to_websockets("host", host.stateinfo())
|
||||||
|
|
||||||
|
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
|
||||||
|
|
||||||
|
# Reset the timer
|
||||||
|
conn.reset_overdue_timer(timeout_seconds, on_overdue)
|
||||||
|
|
||||||
|
# Check RTT thresholds using the threshold checker
|
||||||
|
threshold_checker = ctx.get("threshold_checker")
|
||||||
|
if threshold_checker and rtt and rtt > 0:
|
||||||
|
# Metric path for RTT is "rtt.<hostname>"
|
||||||
|
metric_path = f"rtt.{uname}"
|
||||||
|
|
||||||
|
# Check against configured thresholds (handles alerts, notifications, etc.)
|
||||||
|
threshold_checker.check_value(
|
||||||
|
host_name=uname,
|
||||||
|
metric_path=metric_path,
|
||||||
|
value=rtt,
|
||||||
|
alert_states=host.alert_states
|
||||||
|
)
|
||||||
|
|
||||||
# send ACK back
|
# send ACK back
|
||||||
rmsg = {"time": __import__("time").time()}
|
rmsg = {"time": __import__("time").time()}
|
||||||
if host.cver < 1:
|
if host.cver < 1:
|
||||||
@@ -266,5 +321,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
if msg_to_websockets:
|
if msg_to_websockets:
|
||||||
try:
|
try:
|
||||||
msg_to_websockets("host", host.stateinfo())
|
msg_to_websockets("host", host.stateinfo())
|
||||||
except Exception:
|
except Exception as e:
|
||||||
pass
|
if DEBUG > 0:
|
||||||
|
print(("cannot send websocket message: %s" % e))
|
||||||
|
|||||||
+13
-10
@@ -8,6 +8,7 @@ import asyncio
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from typing import Callable, Iterable, Optional
|
from typing import Callable, Iterable, Optional
|
||||||
|
from . import data
|
||||||
|
|
||||||
import websockets
|
import websockets
|
||||||
|
|
||||||
@@ -16,7 +17,7 @@ logger.setLevel(logging.INFO)
|
|||||||
_connections = set()
|
_connections = set()
|
||||||
_loop: Optional[asyncio.AbstractEventLoop] = None
|
_loop: Optional[asyncio.AbstractEventLoop] = None
|
||||||
_get_hosts: Optional[Callable[[], Iterable]] = None
|
_get_hosts: Optional[Callable[[], Iterable]] = None
|
||||||
_get_msgs: Optional[Callable[[], Iterable]] = None
|
#_get_msgs: Optional[Callable[[], Iterable]] = None
|
||||||
_verbose = False
|
_verbose = False
|
||||||
|
|
||||||
|
|
||||||
@@ -38,11 +39,11 @@ async def _handler(websocket, path=None):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Error sending initial hosts: %s", e, exc_info=True)
|
logger.error("Error sending initial hosts: %s", e, exc_info=True)
|
||||||
# send recent messages
|
# send recent messages
|
||||||
if _get_msgs:
|
if data.msgs:
|
||||||
try:
|
try:
|
||||||
msgs = list(_get_msgs())[-100:]
|
# msgs = list(_get_msgs())[-100:]
|
||||||
logger.debug("Sending %d recent messages to new WebSocket client", len(msgs))
|
logger.debug("Sending %d recent messages to new WebSocket client", len(data.msgs))
|
||||||
for m in msgs:
|
for m in data.msgs:
|
||||||
jmsg = json.dumps({"type": "message", "data": m})
|
jmsg = json.dumps({"type": "message", "data": m})
|
||||||
await websocket.send(jmsg)
|
await websocket.send(jmsg)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -77,7 +78,7 @@ async def start(
|
|||||||
wss_port: Optional[int] = None,
|
wss_port: Optional[int] = None,
|
||||||
ssl_context=None,
|
ssl_context=None,
|
||||||
get_hosts: Optional[Callable] = None,
|
get_hosts: Optional[Callable] = None,
|
||||||
get_msgs: Optional[Callable] = None,
|
# get_msgs: Optional[Callable] = None,
|
||||||
config: dict = {},
|
config: dict = {},
|
||||||
):
|
):
|
||||||
"""Start WebSocket servers and block until cancelled.
|
"""Start WebSocket servers and block until cancelled.
|
||||||
@@ -86,17 +87,19 @@ async def start(
|
|||||||
If `wss_port` and `ssl_context` are provided, a WSS server will also be
|
If `wss_port` and `ssl_context` are provided, a WSS server will also be
|
||||||
started.
|
started.
|
||||||
"""
|
"""
|
||||||
global _loop, _get_hosts, _get_msgs, _verbose
|
global _loop, _get_hosts, _verbose
|
||||||
_loop = asyncio.get_running_loop()
|
_loop = asyncio.get_running_loop()
|
||||||
_get_hosts = get_hosts
|
_get_hosts = get_hosts
|
||||||
_get_msgs = get_msgs
|
|
||||||
_verbose = config.get("verbose", False),
|
_verbose = config.get("verbose", False),
|
||||||
_debug = config.get("debug", False),
|
_debug = config.get("debug", 0),
|
||||||
|
|
||||||
servers = []
|
servers = []
|
||||||
# plain WebSocket
|
# plain WebSocket
|
||||||
websockets_logger = logging.getLogger("websockets.server")
|
websockets_logger = logging.getLogger("websockets.server")
|
||||||
websockets_logger.setLevel(logging.DEBUG if _debug > 2 else logging.INFO)
|
#if _debug > 2:
|
||||||
|
# websockets_logger.setLevel(logging.DEBUG)
|
||||||
|
#else:
|
||||||
|
# websockets_logger.setLevel(logging.INFO)
|
||||||
# regular WebSocket
|
# regular WebSocket
|
||||||
ws_server = websockets.serve(_handler, host, ws_port) # , subprotocols=["hbd"])
|
ws_server = websockets.serve(_handler, host, ws_port) # , subprotocols=["hbd"])
|
||||||
servers.append(ws_server)
|
servers.append(ws_server)
|
||||||
|
|||||||
Reference in New Issue
Block a user