save state to pickle file, restart timers on restart
This commit is contained in:
+101
-47
@@ -61,6 +61,102 @@ def dicttos(ID, d):
|
||||
return opk
|
||||
|
||||
|
||||
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||
|
||||
|
||||
def _make_timer_callbacks(uname, host, watchhosts, ctx):
|
||||
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
|
||||
|
||||
Captured values are bound at call time so callbacks are safe to use in loops.
|
||||
"""
|
||||
msg_to_websockets = ctx.get("msg_to_websockets")
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
cfg = ctx.get("config", {})
|
||||
|
||||
async def on_unknown(connection):
|
||||
connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
|
||||
async def on_overdue(connection):
|
||||
import time
|
||||
if connection.getstate() != connection.__class__.UP:
|
||||
return
|
||||
now = time.time()
|
||||
connection.newstate(connection.__class__.OVERDUE, now, cfg.get("grace", 2))
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||
if threshold_checker:
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
metric_path="rtt",
|
||||
value=float("inf"),
|
||||
alert_states=host.alert_states,
|
||||
)
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
|
||||
|
||||
return on_overdue, on_unknown
|
||||
|
||||
|
||||
def restore_connection_timers(hbdclass, ctx):
|
||||
"""Restore overdue timers for all loaded connections after a pickle restore.
|
||||
|
||||
For UP connections, the remaining time until overdue is calculated from
|
||||
lastbeat so that clients that vanished during hbd's downtime are detected.
|
||||
For OVERDUE connections, the UNKNOWN drop timer is restored.
|
||||
"""
|
||||
import time
|
||||
now = time.time()
|
||||
cfg = ctx.get("config", {})
|
||||
grace = cfg.get("grace", 2)
|
||||
from . import config as config_mod
|
||||
watchhosts = config_mod.get_watchhosts(cfg)
|
||||
|
||||
restored = 0
|
||||
for uname, host in list(hbdclass.Host.hosts.items()):
|
||||
interval = host.interval
|
||||
for afam, conn in list(host.connections.items()):
|
||||
state = conn.getstate()
|
||||
if state == hbdclass.Connection.DOWN:
|
||||
continue
|
||||
|
||||
on_overdue, on_unknown = _make_timer_callbacks(uname, host, watchhosts, ctx)
|
||||
|
||||
if state == hbdclass.Connection.UP and interval > 0:
|
||||
elapsed = now - conn.lastbeat
|
||||
remaining = max(1.0, (interval + grace) - elapsed)
|
||||
conn.reset_overdue_timer(remaining, on_overdue)
|
||||
logger.debug(
|
||||
"Restored UP timer %s/%s: %.0fs remaining (elapsed %.0fs)",
|
||||
uname, afam, remaining, elapsed,
|
||||
)
|
||||
restored += 1
|
||||
|
||||
elif state == hbdclass.Connection.OVERDUE:
|
||||
elapsed_overdue = now - conn.statetime
|
||||
remaining = DROPOVERDUE - elapsed_overdue
|
||||
if remaining <= 1:
|
||||
# Already past the drop window — mark UNKNOWN immediately
|
||||
conn.newstate(hbdclass.Connection.UNKNOWN, conn.lastbeat)
|
||||
logger.info(
|
||||
"Marking %s/%s UNKNOWN (overdue %.1f days)",
|
||||
uname, afam, elapsed_overdue / 86400,
|
||||
)
|
||||
else:
|
||||
conn.reset_overdue_timer(remaining, on_unknown)
|
||||
logger.debug(
|
||||
"Restored OVERDUE timer %s/%s: %.0fs remaining",
|
||||
uname, afam, remaining,
|
||||
)
|
||||
restored += 1
|
||||
|
||||
logger.info("Restored timers for %d connection(s)", restored)
|
||||
|
||||
|
||||
def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
"""Handle a parsed datagram message.
|
||||
|
||||
@@ -138,8 +234,9 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
# Handle plugin data message
|
||||
plugin_name = msg.get("plugin")
|
||||
if plugin_name:
|
||||
# Extract all fields except ID and plugin name
|
||||
plugin_data = {k: v for k, v in msg.items() if k not in ["ID", "plugin"]}
|
||||
# Extract plugin fields, dropping protocol metadata fields
|
||||
plugin_data = {k: v for k, v in msg.items()
|
||||
if k not in ("ID", "plugin", "id", "name")}
|
||||
# Store plugin data with timestamp
|
||||
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
|
||||
if DEBUG > 1:
|
||||
@@ -229,51 +326,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
# Reset overdue timer on every heartbeat
|
||||
if interval > 0 and conn.getstate() != hbdcls.Connection.DOWN:
|
||||
grace = cfg.get("grace", 2)
|
||||
timeout_seconds = (interval + grace) if interval > 0 else 30
|
||||
|
||||
# Create callback for timer expiration
|
||||
async def on_overdue(connection):
|
||||
"""Called when connection timer expires (no heartbeat received)."""
|
||||
import time
|
||||
now = time.time()
|
||||
|
||||
# Only mark as overdue if still in UP state (not already marked)
|
||||
if connection.getstate() == hbdcls.Connection.UP:
|
||||
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||
|
||||
msg = f"{connection.afam} overdue"
|
||||
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||
|
||||
if uname in watchhosts:
|
||||
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||
|
||||
# Check RTT thresholds with infinite RTT for overdue hosts
|
||||
threshold_checker = ctx.get("threshold_checker")
|
||||
if threshold_checker:
|
||||
metric_path = "rtt"
|
||||
threshold_checker.check_value(
|
||||
host_name=uname,
|
||||
metric_path=metric_path,
|
||||
value=float('inf'),
|
||||
alert_states=host.alert_states
|
||||
)
|
||||
|
||||
# Notify websockets
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
|
||||
# Set a longer timer for marking as UNKNOWN (7 days)
|
||||
DROPOVERDUE = 7 * 24 * 3600
|
||||
|
||||
async def on_unknown(connection):
|
||||
"""Mark connection as unknown after extended absence."""
|
||||
connection.newstate(hbdcls.Connection.UNKNOWN, connection.lastbeat)
|
||||
if msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
|
||||
connection.reset_overdue_timer(DROPOVERDUE, on_unknown)
|
||||
|
||||
# Reset the timer
|
||||
timeout_seconds = interval + grace
|
||||
on_overdue, _ = _make_timer_callbacks(uname, host, watchhosts, ctx)
|
||||
conn.reset_overdue_timer(timeout_seconds, on_overdue)
|
||||
|
||||
# Check RTT thresholds using the threshold checker
|
||||
|
||||
Reference in New Issue
Block a user