refactor monitor, add threshold testing

This commit is contained in:
Andreas Wrede
2026-03-31 12:22:03 -04:00
parent ad7178ebcb
commit dd23d9d163
15 changed files with 488 additions and 101 deletions
+135 -5
View File
@@ -42,6 +42,11 @@ class Connection:
self.statetime = self.lastbeat
self.deltastatetime = "computed"
self.state = Connection.UNKNOWN
# Timer-based reachability monitoring
self.overdue_timer = None
self.overdue_callback = None
self.timeout_duration = None
if host:
Connection.htab[addr] = self.host.name
@@ -49,6 +54,27 @@ class Connection:
log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr))
def __getstate__(self):
    """Return the instance state to pickle, with the timer fields blanked.

    The result is a shallow copy of ``self.__dict__`` in which
    ``overdue_timer``, ``overdue_callback`` and ``timeout_duration`` are
    forced to ``None``; the live instance is left untouched.
    """
    # Timer handles and callbacks are not meant to be pickled; overriding
    # the three keys here keeps them out of the serialized state.
    return {
        **self.__dict__,
        'overdue_timer': None,
        'overdue_callback': None,
        'timeout_duration': None,
    }
def __setstate__(self, state):
    """Restore the instance from a pickled ``state`` dict.

    The dict is merged into ``self.__dict__``; afterwards any of the timer
    fields that is still missing is created with a value of ``None``.
    """
    self.__dict__.update(state)
    # State produced before the timer fields existed will not carry them,
    # so fill in whichever ones are absent.
    for field in ('overdue_timer', 'overdue_callback', 'timeout_duration'):
        if not hasattr(self, field):
            setattr(self, field, None)
def registerDns(self):
    """Put this connection's (host name, address) pair on ``Host.dnsQ``."""
    entry = (self.host.name, self.addr)
    Host.dnsQ.put(entry)
@@ -123,7 +149,18 @@ class Connection:
return d
def jsons(self):
    """Serialize this connection's attributes to a JSON string.

    The timer-related fields (``overdue_timer``, ``overdue_callback``,
    ``timeout_duration``) are left out, and the ``host`` backpointer is
    replaced by the host's ``name`` (or ``None`` when it is falsy). All
    other attributes are passed to ``json.dumps`` unchanged.

    Returns:
        The JSON text produced by ``json.dumps``.
    """
    # The earlier unconditional ``return json.dumps(self.__dict__)`` is
    # gone: it made the filtering below unreachable and would choke on
    # timer handles and the host object.
    skipped = ('overdue_timer', 'overdue_callback', 'timeout_duration')
    data = {}
    for key, value in self.__dict__.items():
        if key in skipped:
            continue
        if key == 'host':
            # Emit the host's name instead of the object itself.
            data[key] = value.name if value else None
        else:
            data[key] = value
    return json.dumps(data)
# set new state, return number of secs in previous state
def newstate(self, state, now, when=0):
@@ -151,10 +188,87 @@ class Connection:
except Exception:
pass
self.addr = addr
Connection.htab[addr] = self.host.name
Connection.htab[addr] = self.host.nameconnection_count
if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr))
return r
def reset_overdue_timer(self, timeout_seconds, callback):
    """Restart the overdue timer for this connection.

    Any previously scheduled timer is cancelled, the new parameters are
    stored on the instance, and ``callback(self)`` is scheduled to be
    awaited ``timeout_seconds`` from now on the running asyncio event loop.

    Args:
        timeout_seconds: Delay in seconds before the callback is invoked.
        callback: Callable that takes this connection and returns an
            awaitable.

    When no event loop is running nothing is scheduled and
    ``self.overdue_timer`` is left as ``None``; the parameters are still
    recorded.
    """
    import asyncio

    previous = self.overdue_timer
    if previous and not previous.cancelled():
        previous.cancel()
    # Drop the old handle right away so a failed re-arm never leaves a
    # cancelled timer behind that looks like a live one.
    self.overdue_timer = None

    # Store parameters for later reference
    self.timeout_duration = timeout_seconds
    self.overdue_callback = callback

    async def timer_expired():
        await callback(self)

    try:
        # get_running_loop() raises RuntimeError when no loop is running,
        # which is the case this handler is meant for. get_event_loop() is
        # deprecated for that use and may return a loop that never runs.
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return

    # call_later needs a plain callable, so wrap the coroutine in a task
    # once the delay has elapsed.
    self.overdue_timer = loop.call_later(
        timeout_seconds, lambda: loop.create_task(timer_expired())
    )
def cancel_overdue_timer(self):
    """Stop any pending overdue timer and reset every timer-related field.

    A timer that is present and not yet cancelled is cancelled; errors
    raised while doing so are ignored. Afterwards ``overdue_timer``,
    ``overdue_callback`` and ``timeout_duration`` are all ``None``.
    """
    timer = self.overdue_timer
    if timer:
        try:
            already_cancelled = timer.cancelled()
            if not already_cancelled:
                timer.cancel()
        except Exception:
            # Best effort: the fields are cleared below regardless.
            pass
    self.overdue_timer = self.overdue_callback = self.timeout_duration = None
def get_avg_rtt(self):
    """Return the mean of the positive entries in ``self.rtts``.

    Entries that are zero or negative are ignored. When no positive entry
    exists the result is ``0``.
    """
    positive = [sample for sample in self.rtts if sample > 0]
    if not positive:
        return 0
    return sum(positive) / len(positive)
def get_current_rtt(self):
    """Return the last entry of ``self.rtts``, or ``0`` when it is empty."""
    samples = self.rtts
    if not samples:
        return 0
    return samples[-1]
def check_rtt_threshold(self, warning_threshold=None, critical_threshold=None):
    """Classify the current RTT against optional thresholds.

    Args:
        warning_threshold: RTT above which the level is 'WARNING'.
        critical_threshold: RTT above which the level is 'CRITICAL'.

    Returns:
        A ``(level, rtt)`` tuple. ``level`` is 'CRITICAL', 'WARNING' or
        ``None``; ``rtt`` is whatever ``get_current_rtt()`` returned.
        A non-positive RTT always yields ``None``, and a falsy threshold
        (``None`` or ``0``) disables that level.
    """
    rtt = self.get_current_rtt()

    level = None
    if rtt > 0:
        # The critical check comes first so it wins when both are exceeded.
        if critical_threshold and rtt > critical_threshold:
            level = 'CRITICAL'
        elif warning_threshold and rtt > warning_threshold:
            level = 'WARNING'
    return (level, rtt)
#
@@ -224,14 +338,30 @@ class Host:
def stateinfo(self):
ddict = {}
for d in self.__dict__:
if d in ["alert_states", "plugin_data"]:
continue
if d == "connections":
cl = []
for c in ["IPv4", "IPv6"]:
if c not in self.connections:
continue
# dirty ugly hack: fix conn to host backpointer
cld = copy.deepcopy(self.connections[c].__dict__)
cld["host"] = cld["host"].name
# Create connection dict, excluding non-serializable timer objects
conn = self.connections[c]
cld = {}
for key, value in conn.__dict__.items():
# Skip timer-related fields that can't be serialized
if key in ['overdue_timer', 'overdue_callback', 'timeout_duration']:
continue
# Handle host backpointer by converting to name
if key == 'host':
cld[key] = value.name if value else None
else:
# Safe copy for serializable values
try:
cld[key] = copy.deepcopy(value)
except Exception:
# If deepcopy fails, use shallow copy
cld[key] = value
cl.append(cld)
ddict[d] = cl
else: