display tag fro alterts, cleanup udp

This commit is contained in:
Andreas Wrede
2026-04-01 11:49:55 -04:00
parent dd23d9d163
commit 079e84f729
15 changed files with 277 additions and 540 deletions
+18 -6
View File
@@ -52,7 +52,7 @@ journal_max_backups: 10 # Number of backups to keep
thresholds: thresholds:
cpu_monitor: cpu_monitor:
cpu_percent: cpu_percent:
warning: 1.0 warning: 80.0
critical: 90.0 critical: 90.0
memory_monitor: memory_monitor:
percent: percent:
@@ -62,19 +62,31 @@ thresholds:
partitions: partitions:
/: /:
percent: percent:
warning: 8.0 warning: 85.0
critical: 90.0 critical: 90.0
nagios_runner: nagios_runner:
overall_status_code:
warning: 1
critical: 2
operator: ">="
load_status: load_status:
warning: WARNING warning: WARNING
operator: = "="
critical: CRITICAL critical: CRITICAL
operator: = "=" operator: "=="
UPS_load:
warning: 70
critical: 80
operator: ">="
UPS_status_code: UPS_status_code:
warning: 1 warning: 1
critical: 2 critical: 2
operator: ">=" operator: ">="
nextcloud_apps_status:
display: "{nextcloud_apps_output}"
warning: 1
critical: 2
operator: ">="
rtt: rtt:
y: y:
warning: 0.1 warning: 30
critical: 10.0 critical: 250.0
BIN
View File
Binary file not shown.
+1 -1
View File
@@ -105,7 +105,7 @@ class AsyncConnection:
msg["time"] = time.time() msg["time"] = time.time()
# Encode message # Encode message
data = dicttos(msg_id, msg, compress=True) data = dicttos(msg_id, msg)
# Send # Send
self.transport.sendto(data, (self.addr, self.port)) self.transport.sendto(data, (self.addr, self.port))
+6 -9
View File
@@ -62,7 +62,7 @@ def decode_value(val: str) -> Any:
return val return val
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False): def dicttos(ID: str, d: Dict[str, Any]):
"""Serialize a dict to protocol message bytes. """Serialize a dict to protocol message bytes.
If compress is True, the payload is zlib-compressed and the message is If compress is True, the payload is zlib-compressed and the message is
@@ -75,12 +75,9 @@ def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
encoded_val = encode_value(v) encoded_val = encode_value(v)
s.append(f"{k}={encoded_val}") s.append(f"{k}={encoded_val}")
pk = ";".join(s) pk = ";".join(s)
if compress: zpk = zlib.compress(pk.encode(), 6)
zpk = zlib.compress(pk.encode(), 6) hdr = ("!" + ID + ":").encode()
hdr = ("!" + ID + ":").encode() return hdr + zpk
return hdr + zpk
else:
return (ID + ":" + pk).encode()
def stodict(msg: bytes): def stodict(msg: bytes):
@@ -131,7 +128,7 @@ def oldmtodict(msg: bytes):
return stodict(b"HTB:" + msg) return stodict(b"HTB:" + msg)
def encode_plugin_data(plugin_name: str, data: Dict[str, Any], compress: bool = False) -> bytes: def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
"""Encode plugin data into a PLG message. """Encode plugin data into a PLG message.
Args: Args:
@@ -144,7 +141,7 @@ def encode_plugin_data(plugin_name: str, data: Dict[str, Any], compress: bool =
""" """
# Add plugin name to data # Add plugin name to data
full_data = {"plugin": plugin_name, **data} full_data = {"plugin": plugin_name, **data}
return dicttos("PLG", full_data, compress) return dicttos("PLG", full_data)
def decode_plugin_data(msg: bytes) -> Dict[str, Any]: def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
+2 -38
View File
@@ -175,7 +175,8 @@ class Connection:
def newaddr(self, addr, rtt, now): def newaddr(self, addr, rtt, now):
self.lastbeat = now self.lastbeat = now
self.rtts.append(rtt) if rtt is not None:
self.rtts.append(rtt)
if len(self.rtts) > MAXRTTS: if len(self.rtts) > MAXRTTS:
del self.rtts[0] del self.rtts[0]
@@ -292,7 +293,6 @@ class Host:
self.cmds = [] self.cmds = []
self.cver = 0 self.cver = 0
self.connections = {} self.connections = {}
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
# Plugin data storage: {plugin_name: [(timestamp, data), ...]} # Plugin data storage: {plugin_name: [(timestamp, data), ...]}
self.plugin_data = {} self.plugin_data = {}
self.plugin_retention = 100 # Keep last N samples per plugin self.plugin_retention = 100 # Keep last N samples per plugin
@@ -473,42 +473,6 @@ class Host:
""" """
return self.plugin_data return self.plugin_data
# def dispstate(self):
# if self.state in ["down", "overdue"]:
# state = "<b>%s</b>" % self.state
# elif self.state in ["up", "UP"]:
# state = ""
# for x in list(self.connections.keys()):
# try:
# state += " %5.1f" % (self.connections[x].rtts[-1])
# except:
# state += " %5s" % (self.connections[x].rtts[-1])
# elif self.state in ["unknown", "UNKNOWN"]:
# state = ""
# else:
# state = "%s" % self.state
# return state
def dispstats(self):
if self.doesack != -1:
if self.upcount > 0:
r = ""
for v in range(3):
a, u = self.hdwcounts[v]
if (self.upcount - u) != 0:
vs = "%0.0f" % (
100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
)
if vs == "0":
vs = ""
else:
vs = "-"
r += '<td align="right">%s</td>' % vs
return r
else:
return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
return '<td align="right">N/A</td><td></td<td></td>>'
hostfields_long = [ hostfields_long = [
"name", "name",
"IPv4.addr", "IPv4.addr",
+4 -4
View File
@@ -9,9 +9,11 @@ import logging
from aiohttp import web from aiohttp import web
import jinja2 import jinja2
from . import data from . import data
from . import notify as notify_mod
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
eventlog = notify_mod.eventlog
def _render_template(html_str: str, **context) -> str: def _render_template(html_str: str, **context) -> str:
tmpl = jinja2.Template(html_str) tmpl = jinja2.Template(html_str)
@@ -92,8 +94,7 @@ async def start(
return web.Response(status=400, text="need h= argument") return web.Response(status=400, text="need h= argument")
if uname not in hbdclass.Host.hosts: if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found") return web.Response(status=400, text=f"h={uname} not found")
if log: eventlog(uname, "INFO", "dropped")
log(uname, "dropped")
del hbdclass.Host.hosts[uname] del hbdclass.Host.hosts[uname]
return web.Response(text="Done") return web.Response(text="Done")
@@ -105,8 +106,7 @@ async def start(
if uname not in hbdclass.Host.hosts: if uname not in hbdclass.Host.hosts:
return web.Response(status=400, text=f"h={uname} not found") return web.Response(status=400, text=f"h={uname} not found")
ll = hbdclass.Host.hosts[uname].registerDns() ll = hbdclass.Host.hosts[uname].registerDns()
if log: eventlog(uname, "INFO", ll)
log(uname, ll)
return web.Response(text=str(ll)) return web.Response(text=str(ll))
async def update(request): async def update(request):
+3 -23
View File
@@ -30,6 +30,8 @@ def cleanup_function(config, hbdclass):
# Ensure all timer references are cleared before pickling # Ensure all timer references are cleared before pickling
for hostname, host in list(hbdclass.Host.hosts.items()): for hostname, host in list(hbdclass.Host.hosts.items()):
for conn_type, conn in host.connections.items(): for conn_type, conn in host.connections.items():
if hasattr(conn, 'cancel_overdue_timer'):
conn.cancel_overdue_timer()
if hasattr(conn, 'overdue_timer'): if hasattr(conn, 'overdue_timer'):
conn.overdue_timer = None conn.overdue_timer = None
if hasattr(conn, 'overdue_callback'): if hasattr(conn, 'overdue_callback'):
@@ -65,7 +67,6 @@ async def _run_async(config):
from . import http as http_mod from . import http as http_mod
from . import dns as dns_mod from . import dns as dns_mod
from . import notify as notify_mod from . import notify as notify_mod
from . import monitor as monitor_mod
from . import journal as journal_mod from . import journal as journal_mod
from . import threshold as threshold_mod from . import threshold as threshold_mod
@@ -200,20 +201,6 @@ async def _run_async(config):
except Exception as e: except Exception as e:
logger.exception("websocket server failed to start: %s", e) logger.exception("websocket server failed to start: %s", e)
# Start the monitor thread as a background task
try:
monitor_task = asyncio.create_task(
monitor_mod.start(
config=config,
hbdclass=hbdclass,
pushmsg=pushmsg,
msg_to_websockets=msg_to_websockets,
)
)
logger.info("Monitor task started")
except Exception as e:
logger.exception("monitor task failed to start: %s", e)
try: try:
# run forever until shutdown event is set # run forever until shutdown event is set
await shutdown_event.wait() await shutdown_event.wait()
@@ -221,13 +208,6 @@ async def _run_async(config):
except Exception as e: except Exception as e:
logger.exception("Error in main loop: %s", e) logger.exception("Error in main loop: %s", e)
finally: finally:
# Clean up connection timers
try:
logger.info("Cleaning up connection timers...")
await monitor_mod.cleanup_connections(hbdclass)
except Exception as e:
logger.warning("Error cleaning up connection timers: %s", e)
# Cancel all running tasks # Cancel all running tasks
logger.info("Cancelling tasks...") logger.info("Cancelling tasks...")
try: try:
@@ -235,7 +215,7 @@ async def _run_async(config):
except Exception as e: except Exception as e:
logger.warning("Error closing UDP transport: %s", e) logger.warning("Error closing UDP transport: %s", e)
tasks_to_cancel = [http_task, ws_task, monitor_task] tasks_to_cancel = [http_task, ws_task]
for task in tasks_to_cancel: for task in tasks_to_cancel:
if task: if task:
try: try:
-38
View File
@@ -26,41 +26,3 @@ async def cleanup_connections(hbdclass):
if hasattr(conn, 'cancel_overdue_timer'): if hasattr(conn, 'cancel_overdue_timer'):
conn.cancel_overdue_timer() conn.cancel_overdue_timer()
async def start(
config: dict,
hbdclass: callable,
pushmsg=None,
msg_to_websockets=None,
):
"""Start monitor background tasks.
Note: Reachability monitoring is now timer-based and happens in udp.py
when HTB messages arrive. This function can be used for additional
monitoring tasks.
Currently runs a simple status logger every 5 minutes.
"""
import logging
logger = logging.getLogger(__name__)
logger_interval = 300 # Log status every 5 minutes
while True:
await asyncio.sleep(logger_interval)
# Log monitoring status
total_hosts = len(hbdclass.Host.hosts)
up_count = sum(
1 for h in hbdclass.Host.hosts.values()
for c in h.connections.values()
if c.state == hbdclass.Connection.UP
)
overdue_count = sum(
1 for h in hbdclass.Host.hosts.values()
for c in h.connections.values()
if c.state == hbdclass.Connection.OVERDUE
)
logger.debug(
f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
)
+7 -1
View File
@@ -397,6 +397,12 @@
const level = alert.level.toLowerCase(); const level = alert.level.toLowerCase();
const duration = getDuration(alert.since); const duration = getDuration(alert.since);
// Format value with threshold info if available
let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
}
return ` return `
<div class="alert-item ${level}"> <div class="alert-item ${level}">
<div class="alert-main"> <div class="alert-main">
@@ -406,7 +412,7 @@
</div> </div>
<div class="alert-metric">${alert.metric_path}</div> <div class="alert-metric">${alert.metric_path}</div>
<div class="alert-details"> <div class="alert-details">
<span>Value: <span class="alert-value">${formatValue(alert.last_value)}</span></span> <span>${valueText}</span>
<span class="alert-duration">Active for ${duration}</span> <span class="alert-duration">Active for ${duration}</span>
</div> </div>
</div> </div>
+113
View File
@@ -300,6 +300,60 @@
font-weight: bold; font-weight: bold;
color: #2196f3; color: #2196f3;
} }
/* Simple data table styling (for os_info, cpu_monitor, etc.) */
.simple-data-table {
width: 100%;
border-collapse: collapse;
margin-top: 10px;
font-size: 0.9em;
background: #fff;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
border-radius: 4px;
overflow: hidden;
}
.simple-data-table thead {
background: #2196f3;
color: white;
}
.simple-data-table th {
padding: 10px 15px;
text-align: left;
font-weight: 600;
text-transform: uppercase;
font-size: 0.8em;
letter-spacing: 0.5px;
}
.simple-data-table td {
padding: 8px 15px;
border-top: 1px solid #e0e0e0;
}
.simple-data-table td.name {
font-weight: 500;
color: #555;
width: 40%;
}
.simple-data-table td.value {
color: #333;
font-family: 'Segoe UI', system-ui, sans-serif;
}
.simple-data-table tbody tr:hover {
background: #f5f5f5;
}
.simple-data-table tbody tr:nth-child(even) {
background: #fafafa;
}
.simple-data-table tbody tr:nth-child(even):hover {
background: #f0f0f0;
}
</style> </style>
<body> <body>
@@ -406,6 +460,14 @@
} }
function renderPluginData(data, timestamp) { function renderPluginData(data, timestamp) {
// Check if this should be rendered as a simple table
const pluginName = getCurrentPluginName();
const simplePlugins = ['os_info', 'cpu_monitor', 'memory_monitor', 'nagios_runner'];
if (simplePlugins.includes(pluginName) && isSimpleKeyValueData(data)) {
return renderSimpleDataTable(data, timestamp);
}
let html = '<div class="metric-grid">'; let html = '<div class="metric-grid">';
for (const [key, value] of Object.entries(data)) { for (const [key, value] of Object.entries(data)) {
@@ -478,6 +540,57 @@
return html; return html;
} }
function getCurrentPluginName() {
// Get currently active plugin name from the active plugin content div
const activeContent = document.querySelector('.plugin-content.active');
if (activeContent) {
return activeContent.dataset.plugin;
}
return '';
}
function isSimpleKeyValueData(data) {
// Check if data is simple key-value pairs (no complex nesting)
for (const [key, value] of Object.entries(data)) {
if (typeof value === 'object' && value !== null) {
// Has nested objects - not simple
return false;
}
}
return true;
}
function renderSimpleDataTable(data, timestamp) {
let html = '<table class="simple-data-table">';
// Table header
html += '<thead><tr>';
html += '<th>Name</th>';
html += '<th>Value</th>';
html += '</tr></thead>';
// Table body
html += '<tbody>';
for (const [key, value] of Object.entries(data)) {
const label = formatLabel(key);
const formattedValue = formatValue(key, value);
const unit = getUnit(key);
html += '<tr>';
html += `<td class="name">${label}</td>`;
html += `<td class="value">${formattedValue}${unit ? ' ' + unit : ''}</td>`;
html += '</tr>';
}
html += '</tbody>';
html += '</table>';
const date = new Date(timestamp * 1000);
html += `<div class="timestamp">Last updated: ${date.toLocaleString()}</div>`;
return html;
}
function isInterfaceData(key, value) { function isInterfaceData(key, value) {
// Check if this is interface/network stats data (I/O counters) // Check if this is interface/network stats data (I/O counters)
if (key.toLowerCase().includes('interface') && !key.toLowerCase().includes('interface_stats')) { if (key.toLowerCase().includes('interface') && !key.toLowerCase().includes('interface_stats')) {
+102 -15
View File
@@ -53,14 +53,24 @@ class AlertState:
self.last_check = time.time() self.last_check = time.time()
self.notification_count = 0 self.notification_count = 0
self.last_notification = None self.last_notification = None
self.threshold_value = None # The threshold value that triggered alert
self.operator = None # The comparison operator (>, <, >=, etc.)
def update(self, level: AlertLevel, value: Any) -> bool: def update(
self,
level: AlertLevel,
value: Any,
threshold_value: Optional[float] = None,
operator: Optional[str] = None
) -> bool:
""" """
Update alert state. Update alert state.
Args: Args:
level: New alert level level: New alert level
value: Current metric value value: Current metric value
threshold_value: The threshold value that was exceeded (if applicable)
operator: The comparison operator (>, <, >=, etc.)
Returns: Returns:
True if state changed (notification needed), False otherwise True if state changed (notification needed), False otherwise
@@ -69,6 +79,15 @@ class AlertState:
self.last_check = now self.last_check = now
self.last_value = value self.last_value = value
# Update threshold info when alert is active
if level != AlertLevel.OK:
self.threshold_value = threshold_value
self.operator = operator
else:
# Clear threshold info when returning to OK
self.threshold_value = None
self.operator = None
# Check if state changed # Check if state changed
if level != self.level: if level != self.level:
logger.info( logger.info(
@@ -87,7 +106,7 @@ class AlertState:
def to_dict(self) -> dict: def to_dict(self) -> dict:
"""Convert alert state to dictionary for serialization.""" """Convert alert state to dictionary for serialization."""
return { result = {
"metric_path": self.metric_path, "metric_path": self.metric_path,
"level": self.level.name, "level": self.level.name,
"since": self.since, "since": self.since,
@@ -96,6 +115,14 @@ class AlertState:
"notification_count": self.notification_count, "notification_count": self.notification_count,
} }
# Include threshold info if available
if self.threshold_value is not None:
result["threshold_value"] = self.threshold_value
if self.operator is not None:
result["operator"] = self.operator
return result
def __str__(self): def __str__(self):
return self.to_dict().__str__() return self.to_dict().__str__()
@@ -107,6 +134,7 @@ class ThresholdConfig:
metric_path: str, metric_path: str,
warning: Optional[float] = None, warning: Optional[float] = None,
critical: Optional[float] = None, critical: Optional[float] = None,
display: Optional[str] = None,
operator: str = ">", operator: str = ">",
hysteresis: float = 0.0, hysteresis: float = 0.0,
enabled: bool = True, enabled: bool = True,
@@ -127,6 +155,7 @@ class ThresholdConfig:
self.critical = critical self.critical = critical
self.enabled = enabled self.enabled = enabled
self.hysteresis = hysteresis self.hysteresis = hysteresis
self.display = display
# Parse operator # Parse operator
try: try:
@@ -302,6 +331,7 @@ class ThresholdChecker:
warning = threshold_config.get("warning") warning = threshold_config.get("warning")
critical = threshold_config.get("critical") critical = threshold_config.get("critical")
operator = threshold_config.get("operator", ">") operator = threshold_config.get("operator", ">")
display = threshold_config.get("display")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
@@ -316,6 +346,7 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display
) )
self.thresholds[metric_path] = threshold self.thresholds[metric_path] = threshold
@@ -345,7 +376,7 @@ class ThresholdChecker:
operator = threshold_config.get("operator", ">") operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1) hysteresis = threshold_config.get("hysteresis", 0.1)
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display")
if warning is None and critical is None: if warning is None and critical is None:
continue continue
@@ -356,6 +387,7 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display
) )
self.thresholds[metric_path] = threshold self.thresholds[metric_path] = threshold
@@ -382,6 +414,7 @@ class ThresholdChecker:
operator = threshold_config.get("operator", ">") operator = threshold_config.get("operator", ">")
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
enabled = threshold_config.get("enabled", True) enabled = threshold_config.get("enabled", True)
display = threshold_config.get("display")
if warning is None and critical is None: if warning is None and critical is None:
logger.warning("No RTT thresholds defined for %s, skipping", hostname) logger.warning("No RTT thresholds defined for %s, skipping", hostname)
@@ -394,6 +427,7 @@ class ThresholdChecker:
operator=operator, operator=operator,
hysteresis=hysteresis, hysteresis=hysteresis,
enabled=enabled, enabled=enabled,
display=display
) )
self.thresholds[metric_path] = threshold self.thresholds[metric_path] = threshold
@@ -440,14 +474,21 @@ class ThresholdChecker:
alert_state.level alert_state.level
) )
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Update state and check for changes # Update state and check for changes
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
self._trigger_notification(host_name, metric_path, old_level, new_level, value) self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
return (old_level, new_level) return (old_level, new_level)
elif new_level != AlertLevel.OK: elif new_level != AlertLevel.OK:
# Check if we should re-notify # Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value) self._check_renotify(host_name, alert_state, metric_path, value, threshold)
return None return None
@@ -493,14 +534,21 @@ class ThresholdChecker:
alert_state.level alert_state.level
) )
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Update state and check for changes # Update state and check for changes
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value)) state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification(host_name, metric_path, old_level, new_level, value) self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
elif new_level != AlertLevel.OK: elif new_level != AlertLevel.OK:
# Check if we should re-notify # Check if we should re-notify
self._check_renotify(host_name, alert_state, metric_path, value) self._check_renotify(host_name, alert_state, metric_path, value, threshold)
# Check nested metrics (e.g., partition data in disk_monitor) # Check nested metrics (e.g., partition data in disk_monitor)
self._check_nested_metrics( self._check_nested_metrics(
@@ -550,18 +598,26 @@ class ThresholdChecker:
alert_state.level alert_state.level
) )
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
old_level = alert_state.level old_level = alert_state.level
if alert_state.update(new_level, value): if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
state_changes.append((metric_path, old_level, new_level, value)) state_changes.append((metric_path, old_level, new_level, value))
self._trigger_notification( self._trigger_notification(
host_name, host_name,
metric_path, metric_path,
old_level, old_level,
new_level, new_level,
value value,
threshold
) )
elif new_level != AlertLevel.OK: elif new_level != AlertLevel.OK:
self._check_renotify(host_name, alert_state, metric_path, value) self._check_renotify(host_name, alert_state, metric_path, value, threshold)
def _trigger_notification( def _trigger_notification(
self, self,
@@ -570,18 +626,35 @@ class ThresholdChecker:
old_level: AlertLevel, old_level: AlertLevel,
new_level: AlertLevel, new_level: AlertLevel,
value: Any, value: Any,
threshold: ThresholdConfig,
): ):
"""Trigger a notification for an alert state change.""" """Trigger a notification for an alert state change."""
# Determine which threshold was exceeded
threshold_value = None
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Format operator symbol
op_symbol = threshold.operator.value
# Format message # Format message
if new_level == AlertLevel.OK: if new_level == AlertLevel.OK:
lvl = "RECOVERED" lvl = "RECOVERED"
message = f"{metric_path} = {value} ({old_level.name} -> OK)" message = f"{metric_path} = {value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING: elif new_level == AlertLevel.WARNING:
lvl = "WARNING" lvl = "WARNING"
message = f"{metric_path} = {value}" if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
else:
message = f"{metric_path} = {value}"
elif new_level == AlertLevel.CRITICAL: elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL" lvl = "CRITICAL"
message = f"{metric_path} = {value}" if threshold_value is not None:
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
else:
message = f"{metric_path} = {value}"
else: else:
lvl = "UNKNOWN" lvl = "UNKNOWN"
message = f"{metric_path} = {value}" message = f"{metric_path} = {value}"
@@ -617,6 +690,7 @@ class ThresholdChecker:
alert_state: AlertState, alert_state: AlertState,
metric_path: str, metric_path: str,
value: Any, value: Any,
threshold: ThresholdConfig,
): ):
"""Check if we should send a repeat notification.""" """Check if we should send a repeat notification."""
if alert_state.level == AlertLevel.OK: if alert_state.level == AlertLevel.OK:
@@ -632,8 +706,21 @@ class ThresholdChecker:
return return
if (now - alert_state.last_notification) >= self.renotify_interval: if (now - alert_state.last_notification) >= self.renotify_interval:
# Determine which threshold is active
threshold_value = None
if alert_state.level == AlertLevel.CRITICAL and threshold.critical is not None:
threshold_value = threshold.critical
elif alert_state.level == AlertLevel.WARNING and threshold.warning is not None:
threshold_value = threshold.warning
# Format operator symbol
op_symbol = threshold.operator.value
# Time to re-notify # Time to re-notify
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)" if threshold_value is not None:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
if self.notification_callback: if self.notification_callback:
try: try:
+19 -23
View File
@@ -47,7 +47,7 @@ def parse_message(data: bytes):
return msg return msg
def dicttos(ID, d, compress=False): def dicttos(ID, d):
s = [] s = []
for k in d: for k in d:
if isinstance(d[k], float): if isinstance(d[k], float):
@@ -55,13 +55,9 @@ def dicttos(ID, d, compress=False):
else: else:
s.append("%s=%s" % (k, d[k])) s.append("%s=%s" % (k, d[k]))
pk = ";".join(s) pk = ";".join(s)
if compress: zpk = zlib.compress(pk.encode(), 6)
zpk = zlib.compress(pk.encode(), 6) ID = "!" + ID + ":"
ID = "!" + ID + ":" opk = ID.encode() + zpk
opk = ID.encode() + zpk
else:
zpk = pk
opk = ID + ":" + zpk
return opk return opk
@@ -119,12 +115,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
cid = msg.get("id", 0) cid = msg.get("id", 0)
try: try:
rtt = float(msg.get("rtt", None)) rtt = float(msg.get("rtt"))
except Exception: except TypeError:
rtt = None rtt = None
if msg.get("ID") == "HTB": if msg.get("ID") == "HTB":
host.doesack = msg.get("acks", -1) host.doesack = msg.get("acks", -1)
# send ACK back
rmsg = {"time": __import__("time").time()}
if host.cver < 1:
opkt = b"ACK"
else:
opkt = dicttos("ACK", rmsg)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send ack: %s" % e))
elif msg.get("ID") == "PLG": elif msg.get("ID") == "PLG":
# Handle plugin data message # Handle plugin data message
plugin_name = msg.get("plugin") plugin_name = msg.get("plugin")
@@ -277,18 +285,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
alert_states=host.alert_states alert_states=host.alert_states
) )
# send ACK back
rmsg = {"time": __import__("time").time()}
if host.cver < 1:
opkt = b"ACK"
else:
opkt = dicttos("ACK", rmsg, host.cver > 1)
try:
transport.sendto(opkt, addr)
except Exception as e:
if DEBUG > 0:
print(("cannot send ack: %s" % e))
# send any commands we have queued # send any commands we have queued
while len(host.cmds): while len(host.cmds):
op, rmsg = host.cmds[0] op, rmsg = host.cmds[0]
@@ -311,7 +307,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if isinstance(opkt, str): if isinstance(opkt, str):
opkt = opkt.encode() opkt = opkt.encode()
else: else:
opkt = dicttos(op, rmsg, True) opkt = dicttos(op, rmsg)
try: try:
transport.sendto(opkt, addr) transport.sendto(opkt, addr)
except Exception as e: except Exception as e:
-380
View File
@@ -1,380 +0,0 @@
"""
host and connection class shared between hbd and
the websit's heartbeat.py
"""
import time
import json
import copy
import queue
num = 0
MAXRTTS = 10
DEBUG = 2
def log(host, m):
if DEBUG:
print("class log: %s %s" % (host, m))
class Connection:
# map of addrs to names
htab = {}
UNKNOWN = "unknown"
UP = "up"
DOWN = "down"
OVERDUE = "overdue"
def __init__(self, host, cid, addr, afam):
self.host = host
self.cid = cid
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.addr = addr
self.afam = afam
self.rtts = [0]
self.lastbeat = time.time()
self.statetime = self.lastbeat
self.deltastatetime = "computed"
self.state = Connection.UNKNOWN
if host:
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
log(self.host.name, "dns update %s" % self.addr)
Host.dnsQ.put((self.host.name, self.addr))
def registerDns(self):
Host.dnsQ.put((self.host.name, self.addr))
def clearstate(self):
d = {}
d["addr"] = ""
d["rtt"] = ""
d["lastbeat"] = ""
d["state"] = ""
d["statetime"] = ""
d["deltastatetime"] = ""
d["rttstate"] = ""
return d
def statedict(self, Null=False):
d = self.clearstate()
now = time.time()
if not Null:
d["addr"] = self.addr
if self.rtts[-1]:
d["rtt"] = "%0.1f" % self.rtts[-1]
elif self.state == Connection.UNKNOWN:
d["rtt"] = ""
else:
d["rtt"] = "?"
d["lastbeat"] = self.lastbeat
if self.state == Connection.OVERDUE:
d["state"] = "<b>%s</b>" % self.state
else:
d["state"] = self.state
if self.state == Connection.UP:
d["rttstate"] = d["rtt"]
elif self.state == Connection.OVERDUE:
d["rttstate"] = ""
else:
d["rttstate"] = d["state"]
d["statetime"] = time.strftime(
"%Y-%m-%d %H:%M:%S", time.localtime(self.statetime)
)
delta = now - self.statetime
if self.state == Connection.UNKNOWN:
d["deltastatetime"] = ""
elif delta > 86400:
# d['deltastatetime'] = time.strftime("%d %H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = "%0.1f days" % (delta / 86400.0)
elif delta > 3600:
# d['deltastatetime'] = time.strftime("%H:%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%k:%M hrs", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f hrs" % (delta / 3600.)
elif delta > 60:
# d['deltastatetime'] = time.strftime("%M:%S", time.gmtime(delta))
d["deltastatetime"] = time.strftime("%M:%S mins", time.gmtime(delta))
# d['deltastatetime'] = "%0.1f mins" % (delta / 60.)
else:
# d['deltastatetime'] = time.strftime("%S", time.gmtime(delta))
d["deltastatetime"] = "%i secs" % (delta)
if self.state == Connection.UNKNOWN and now - self.lastbeat > 86400 * 10:
d = self.clearstate()
return d
def headerdict(self, afam):
d = {}
d["addr"] = "%s Addr" % afam
d["rtt"] = "Latencey"
d["lastbeat"] = "Last Contact"
d["state"] = "State"
d["statetime"] = "Last State"
d["rttstate"] = "Reach"
d["deltastatetime"] = "Last State"
return d
def jsons(self):
return json.dumps(self.__dict__)
# set new state, return number of secs in previous state
def newstate(self, state, now, when=0):
self.state = state
delta = now - when
s = delta - self.statetime
self.statetime = delta
return s
def getstate(self):
return self.state
def newaddr(self, addr, rtt, now):
self.lastbeat = now
self.rtts.append(rtt)
if len(self.rtts) > MAXRTTS:
del self.rtts[0]
if self.addr == addr:
r = None
else:
r = "changed from %s to %s" % (self.addr, addr)
try:
del Connection.htab[self.addr]
except:
pass
self.addr = addr
Connection.htab[addr] = self.host.name
if self.host.isDynDns():
Host.dnsQ.put((self.host.name, self.addr))
return r
#
class Host:
# Table of Hosts
hosts = {}
dnsQ = queue.Queue()
def __init__(self, name):
global num
self.name = name
if name:
num += 1
Host.hosts[name] = self
self.num = num
self.dyn = False
self.watched = False
self.upcount = 0
self.interval = 0
self.doesack = -1
self.cmds = []
self.cver = 0
self.connections = {}
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
def statedict(self):
d = {}
d["name"] = self.name
if self.dyn:
d["name"] += "*"
if self.watched:
d["name"] = "<b>%s</b>" % d["name"]
d["dyn"] = str(self.dyn)
d["ver"] = str(self.cver)
d["num"] = self.num
for c in ["IPv4", "IPv6"]:
if c in self.connections:
cs = self.connections[c].statedict()
else:
cs = ubConnection.statedict(True)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def headerdict(self):
d = {}
d["name"] = "Name"
d["dyn"] = "Dyn"
d["ver"] = "Ver"
d["num"] = "??"
for c in ["IPv4", "IPv6"]:
cs = ubConnection.headerdict(c)
for csv in cs:
d["%s.%s" % (c, csv)] = cs[csv]
return d
def registerDns(self):
for af in self.connections:
self.connections[af].registerDns()
def stateinfo(self):
ddict = {}
for d in self.__dict__:
if d == "connections":
cl = []
for c in self.connections:
# dirty ugly hack: fix conn to host backpointer
cld = copy.deepcopy(self.connections[c].__dict__)
cld["host"] = cld["host"].name
cl.append(cld)
ddict[d] = cl
else:
ddict[d] = self.__dict__[d]
return ddict
def jsons(self):
return json.dumps(self.stateinfo())
def setcver(self, cver):
self.cver = cver
def isDynDns(self):
return self.dyn
def isIPv4(self, addr):
if isinstance(addr, tuple):
return addr[0].find(".") > 0
else:
return addr.find(".") > 0
def conndata(self, cid, addr, rtt, now):
if addr[0:7] == "::ffff:":
addr = addr[7:]
if self.isIPv4(addr):
afam = "IPv4"
else:
afam = "IPv6"
if afam not in self.connections:
self.connections[afam] = Connection(self, cid, addr, afam)
conn = self.connections[afam]
res = conn.newaddr(addr, rtt, now)
return conn, res
# called when reloading class from pickle, add new fields here
def fixup(self):
for c in ["IPv4", "IPv6"]:
if c in self.connections:
addr = self.connections[c].addr
if addr[0:7] == "::ffff:":
addr = addr[7:]
self.connections[c].addr = addr
pass
# def dispstate(self):
# if self.state in ["down", "overdue"]:
# state = "<b>%s</b>" % self.state
# elif self.state in ["up", "UP"]:
# state = ""
# for x in list(self.connections.keys()):
# try:
# state += " %5.1f" % (self.connections[x].rtts[-1])
# except:
# state += " %5s" % (self.connections[x].rtts[-1])
# elif self.state in ["unknown", "UNKNOWN"]:
# state = ""
# else:
# state = "%s" % self.state
# return state
def dispstats(self):
if self.doesack != -1:
if self.upcount > 0:
# return "(%0.1f%%) %s %s %s " % ((self.doesack * 100.0) / self.upcount, self.doesack, self.upcount, self.hdwcounts)
r = ""
for v in range(3):
a, u = self.hdwcounts[v]
if (self.upcount - u) != 0:
vs = "%0.0f" % (
100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
)
if vs == "0":
vs = ""
else:
vs = "-"
r += '<td align="right">%s</td>' % vs
return r
else:
return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
return '<td align="right">N/A</td><td></td<td></td>>'
hostfields_long = [
"name",
"IPv4.addr",
"IPv4.state",
("IPv4.rtt", 'style="text-align: right;"'),
("IPv4.statetime", 'style="text-align: right;"'),
"IPv6.addr",
"IPv6.state",
("IPv6.rtt", 'style="text-align: right;"'),
("IPv6.statetime", 'style="text-align: right;"'),
"ver",
]
hostfields_short = [
"name",
("IPv4.rttstate", 'style="text-align: right;"'),
("IPv4.deltastatetime", 'style="text-align: right;"'),
("IPv6.rttstate", 'style="text-align: right;"'),
("IPv6.deltastatetime", 'style="text-align: right;"'),
]
def gene(self, tag, v, attrib=None):
if attrib:
a = " %s" % attrib
else:
a = ""
return "<%s%s>%s</%s>" % (tag, a, v, tag)
def htmltable(self, tag, hd, short):
if short:
hostfields = Host.hostfields_short
else:
hostfields = Host.hostfields_long
h = []
for f in hostfields:
if isinstance(f, tuple):
h.append(self.gene(tag, hd[f[0]], f[1]))
else:
h.append(self.gene(tag, hd[f]))
return self.gene("tr", "\n".join(h))
def buildhosttable(self, short=False):
if DEBUG > 1:
print("DBG buildhosttable: start")
res = []
res.append('<table id="ntable" class="sortable">')
res.append(ubHost.htmltable("th", ubHost.headerdict(), short))
hosts_sorted = list(Host.hosts.keys())
if len(hosts_sorted):
hosts_sorted.sort()
for h in hosts_sorted:
res.append(ubHost.htmltable("td", Host.hosts[h].statedict(), short))
res.append("</table>")
if DEBUG > 1:
print("DBG buildhosttable: %s" % res)
return res
def buildmsgtable(self, msgs):
res = []
le = max(40 - len(Host.hosts), 3)
res.append("<h4>Log of Events</h4>")
for m in msgs[len(msgs) - le:]:
res.append("%s<BR>" % m)
return res
# create fake "unbound objects", remove in Python 3.0
ubHost = Host(None)
ubConnection = Connection(None, "", "", "")
+1 -1
View File
@@ -93,7 +93,7 @@ async def test_plugins():
print(f" Original data: {test_data}") print(f" Original data: {test_data}")
# Encode # Encode
encoded = dicttos("PLG", test_data, compress=True) encoded = dicttos("PLG", test_data)
print(f" Encoded ({len(encoded)} bytes): {encoded[:50]}...") print(f" Encoded ({len(encoded)} bytes): {encoded[:50]}...")
# Decode # Decode
+1 -1
View File
@@ -9,6 +9,6 @@ def test_parse_message_uncompressed():
def test_parse_message_compressed(): def test_parse_message_compressed():
raw = dicttos("ACK", {"time": 1}, compress=True) raw = dicttos("ACK", {"time": 1})
m = parse_message(raw) m = parse_message(raw)
assert "ID" in m assert "ID" in m