Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4349ae217a | |||
| b3aa7b585f | |||
| 88a3c09b51 | |||
| 0504402a8a | |||
| ca58c18802 | |||
| 1ddc4b8132 | |||
| 5e1720ed32 | |||
| 77f127fe60 | |||
| 54fbd8d73d | |||
| 7ab17e26e2 | |||
| 28f5fa951c | |||
| 37f1c58969 | |||
| f006077a71 | |||
| d9fc8d632f | |||
| f640574e4f | |||
| 9a19424279 | |||
| ca8ba84e65 |
@@ -507,6 +507,9 @@ hbc --boot your-server.example.com
|
||||
|
||||
# Verbose output
|
||||
hbc -v your-server.example.com
|
||||
|
||||
# Send 'boot' and 'shutdown' messages on start and exit
|
||||
hbc -b your-server.example.com
|
||||
```
|
||||
|
||||
You can also run it via the module entrypoint:
|
||||
|
||||
@@ -8,6 +8,7 @@ This guide explains how to create custom plugins for the Heartbeat monitoring sy
|
||||
- [Plugin Types](#plugin-types)
|
||||
- [Creating a Plugin](#creating-a-plugin)
|
||||
- [Plugin Lifecycle](#plugin-lifecycle)
|
||||
- [Server-initiated InfoPlugin refresh](#server-initiated-infoplugin-refresh)
|
||||
- [Configuration](#configuration)
|
||||
- [Best Practices](#best-practices)
|
||||
- [Examples](#examples)
|
||||
@@ -250,6 +251,28 @@ Understanding the plugin lifecycle helps you implement plugins correctly:
|
||||
└─> Plugin releases resources, closes connections
|
||||
```
|
||||
|
||||
## Server-initiated InfoPlugin refresh
|
||||
|
||||
When a heartbeat packet arrives from a host for which the server holds no plugin data (e.g. after a server restart), the server sets `request_update = 1` in the ACK reply. The client detects this flag and immediately re-runs all InfoPlugins — clearing their cached results first — and resends the data as PLG messages.
|
||||
|
||||
This means InfoPlugin data will always reach the server as soon as possible without requiring a client restart. No action is needed from plugin authors: the framework handles cache invalidation and re-collection automatically.
|
||||
|
||||
The lifecycle for this case looks like:
|
||||
|
||||
```
|
||||
Server restarts, host reconnects
|
||||
└─> hbd receives HTB with no existing plugin_data for host
|
||||
└─> hbd sets request_update=1 in ACK
|
||||
|
||||
Client receives ACK
|
||||
└─> Detects request_update flag
|
||||
└─> Clears _cache on every registered InfoPlugin
|
||||
└─> Calls collect() on each InfoPlugin
|
||||
└─> Sends fresh PLG messages to server
|
||||
```
|
||||
|
||||
If you write an `InfoPlugin` with side effects in `_collect_info()` (opening connections, writing files, etc.), be aware that it may be called more than once per client session: each server-requested refresh clears the plugin's cache and collects again.
|
||||
|
||||
## Configuration
|
||||
|
||||
### Plugin-Specific Configuration
|
||||
|
||||
@@ -46,6 +46,24 @@ default_owner: andreas # owns hosts with no explicit owner
|
||||
# falls back to the first admin user if omitted
|
||||
```
|
||||
|
||||
### Client-declared host ownership
|
||||
|
||||
A host can declare its own owner directly in the hbc or hbc_mini client configuration. This is useful for hosts that are not listed in the server config, or during initial setup before a server-side config entry has been created.
|
||||
|
||||
**`~/.hbc.yaml`** (hbc):
|
||||
```yaml
|
||||
owner: andreas
|
||||
```
|
||||
|
||||
**`~/.hbc.json`** (hbc_mini):
|
||||
```json
|
||||
{ "owner": "andreas" }
|
||||
```
|
||||
|
||||
When set, the value is included in the `os_info` plugin data sent to the server. The server applies it as `host.owner` the first time `os_info` arrives, provided no owner has been configured server-side for that host. If the client declares no owner either, the server assigns the configured `default_owner` at that point. Server-configured ownership always takes precedence.
|
||||
|
||||
---
|
||||
|
||||
### Assigning roles to hosts
|
||||
|
||||
```yaml
|
||||
|
||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
||||
"""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.2.1"
|
||||
__version__ = "5.2.4"
|
||||
|
||||
@@ -15,12 +15,15 @@ CLIENT_DEFAULTS = {
|
||||
# Network settings
|
||||
"hb_port": 50003, # Port where hbd servers listen
|
||||
"interval": 10, # Heartbeat interval in seconds
|
||||
|
||||
|
||||
# Host identity
|
||||
"owner": None, # Optional username to set as this host's owner on the server
|
||||
|
||||
# Runtime flags
|
||||
"foreground": False,
|
||||
"verbose": False,
|
||||
"debug": 0,
|
||||
|
||||
|
||||
# Plugin configuration
|
||||
"plugins": {}, # Per-plugin configuration
|
||||
"thresholds": {}, # Threshold configuration for monitoring
|
||||
|
||||
+51
-28
@@ -21,6 +21,7 @@ from typing import Dict, List, Optional
|
||||
# Import protocol and config
|
||||
from .config import load_config
|
||||
from ..common.proto import dicttos, stodict
|
||||
from .. import __version__
|
||||
|
||||
# Import plugin system
|
||||
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||
@@ -58,6 +59,7 @@ class AsyncConnection:
|
||||
self._dead = False
|
||||
self._ever_opened = False
|
||||
self._open_fail_count = 0 # consecutive failures before first success
|
||||
self.request_info_event: asyncio.Event = asyncio.Event()
|
||||
|
||||
self.logger = logging.getLogger(f"hbc.conn.{addr}")
|
||||
|
||||
@@ -137,6 +139,9 @@ class AsyncConnection:
|
||||
|
||||
self.ackcount += 1
|
||||
self.logger.debug(f"ACK received, RTT: {rtt:.1f}ms")
|
||||
if msg.get("request_update"):
|
||||
self.logger.info("server requested plugin info refresh")
|
||||
self.request_info_event.set()
|
||||
|
||||
|
||||
class HeartbeatProtocol(asyncio.DatagramProtocol):
|
||||
@@ -172,9 +177,8 @@ class HeartbeatProtocol(asyncio.DatagramProtocol):
|
||||
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
|
||||
|
||||
def error_received(self, exc):
|
||||
"""Handle protocol errors."""
|
||||
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — dropping connection")
|
||||
self.connection._dead = True
|
||||
"""Handle protocol errors — close transport so the heartbeat sender retries."""
|
||||
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — will retry")
|
||||
self.connection.close()
|
||||
|
||||
|
||||
@@ -338,15 +342,35 @@ async def heartbeat_sender(conn: AsyncConnection, interval: int):
|
||||
raise
|
||||
|
||||
|
||||
async def _info_plugin_refresh_loop(conn: AsyncConnection, info_plugins: List):
|
||||
"""Wait for server requests to re-send InfoPlugin data."""
|
||||
logger = logging.getLogger("hbc.plugins")
|
||||
while running:
|
||||
await conn.request_info_event.wait()
|
||||
if not running:
|
||||
break
|
||||
conn.request_info_event.clear()
|
||||
logger.info("refreshing InfoPlugins on server request")
|
||||
for plugin in info_plugins:
|
||||
plugin._cache = None
|
||||
try:
|
||||
data = await plugin.collect()
|
||||
if data:
|
||||
await conn.sendto({"plugin": plugin.name, **data}, "PLG")
|
||||
logger.info(f"Resent {plugin.name} data")
|
||||
except Exception as e:
|
||||
logger.error(f"Error re-collecting {plugin.name}: {e}", exc_info=True)
|
||||
|
||||
|
||||
async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
|
||||
"""Collect and send plugin data.
|
||||
|
||||
|
||||
Args:
|
||||
conn: Connection to send on
|
||||
registry: Plugin registry
|
||||
"""
|
||||
logger = logging.getLogger("hbc.plugins")
|
||||
|
||||
|
||||
# Collect InfoPlugins once at startup
|
||||
info_plugins = registry.get_by_type(InfoPlugin)
|
||||
for plugin in info_plugins:
|
||||
@@ -359,34 +383,31 @@ async def plugin_collector(conn: AsyncConnection, registry: PluginRegistry):
|
||||
logger.info(f"Sent {plugin.name} data")
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting {plugin.name}: {e}", exc_info=True)
|
||||
|
||||
|
||||
# Schedule MonitorPlugins
|
||||
# Group plugins by interval
|
||||
from collections import defaultdict
|
||||
by_interval = defaultdict(list)
|
||||
|
||||
|
||||
monitor_plugins = registry.get_by_type(MonitorPlugin)
|
||||
for plugin in monitor_plugins:
|
||||
by_interval[plugin.interval].append(plugin)
|
||||
|
||||
# Create tasks for each interval
|
||||
tasks = []
|
||||
|
||||
# Create tasks for each interval; always include the info-refresh watcher
|
||||
tasks = [asyncio.create_task(_info_plugin_refresh_loop(conn, info_plugins))]
|
||||
for interval, plugins in by_interval.items():
|
||||
task = asyncio.create_task(
|
||||
tasks.append(asyncio.create_task(
|
||||
plugin_collector_interval(conn, plugins, interval)
|
||||
)
|
||||
tasks.append(task)
|
||||
|
||||
# Wait for all tasks
|
||||
if tasks:
|
||||
try:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
except asyncio.CancelledError:
|
||||
logger.debug("Plugin collector cancelled, cancelling sub-tasks")
|
||||
for task in tasks:
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
raise
|
||||
))
|
||||
|
||||
try:
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
except asyncio.CancelledError:
|
||||
logger.debug("Plugin collector cancelled, cancelling sub-tasks")
|
||||
for task in tasks:
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
raise
|
||||
|
||||
|
||||
async def plugin_collector_interval(
|
||||
@@ -464,7 +485,7 @@ async def cleanup(connections: List[AsyncConnection]):
|
||||
logger.info("Cleaning up connections")
|
||||
|
||||
target = next((c for c in connections if c.transport), connections[0] if connections else None)
|
||||
if target:
|
||||
if target and send_shutdown:
|
||||
try:
|
||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||
except Exception as e:
|
||||
@@ -478,7 +499,7 @@ async def cleanup(connections: List[AsyncConnection]):
|
||||
|
||||
async def async_main(args, config):
|
||||
"""Async main function."""
|
||||
global running, shutdown_event, active_tasks
|
||||
global running, shutdown_event, active_tasks, send_shutdown
|
||||
|
||||
# Create shutdown event
|
||||
shutdown_event = asyncio.Event()
|
||||
@@ -495,8 +516,7 @@ async def async_main(args, config):
|
||||
hb_port = config.get("hb_port", PORT)
|
||||
interval = config.get("interval", INTERVAL)
|
||||
|
||||
logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
|
||||
logger.info(f"Port: {hb_port}, Interval: {interval}s")
|
||||
logger.info(f"hbc {__version__} on {iam} -> {hb_hosts} port={hb_port}, interval={interval}s")
|
||||
|
||||
# Create connections
|
||||
connections = []
|
||||
@@ -526,10 +546,13 @@ async def async_main(args, config):
|
||||
logger.info(f"Created {len(connections)} connections")
|
||||
|
||||
# Send boot/message if requested
|
||||
send_shutdown = False
|
||||
if args.boot or args.message:
|
||||
boot_msg = {}
|
||||
if args.boot:
|
||||
boot_msg["boot"] = 1
|
||||
args.boot = False # Clear boot flag so we don't send it again in main loop
|
||||
send_shutdown = True
|
||||
if args.message:
|
||||
boot_msg["service"] = "service"
|
||||
boot_msg["msg"] = args.message
|
||||
|
||||
@@ -364,7 +364,10 @@ class PluginLoader:
|
||||
|
||||
# Instantiate plugin with config — check plugins subdict first,
|
||||
# then top-level keys (e.g. nagios_runner: ... at root of config).
|
||||
plugin_instance_config = plugins_subconfig.get(obj.name) or raw_config.get(obj.name, {})
|
||||
plugin_instance_config = dict(plugins_subconfig.get(obj.name) or raw_config.get(obj.name) or {})
|
||||
# Propagate top-level owner so os_info (and any future plugin) can report it.
|
||||
if "owner" in raw_config and "owner" not in plugin_instance_config:
|
||||
plugin_instance_config["owner"] = raw_config["owner"]
|
||||
plugin = obj(config=plugin_instance_config)
|
||||
|
||||
# Initialize plugin
|
||||
|
||||
@@ -62,6 +62,9 @@ class OSInfoPlugin(InfoPlugin):
|
||||
"hbc_version": hbc_version,
|
||||
"hbc_type": "full",
|
||||
}
|
||||
if self.config.get("owner"):
|
||||
self.logger.debug(f"Adding owner from config: {self.config['owner']}")
|
||||
data["owner"] = self.config["owner"]
|
||||
|
||||
# Add Linux-specific distribution info
|
||||
if platform.system() == "Linux":
|
||||
|
||||
@@ -13,12 +13,8 @@ plugins:
|
||||
count: 3 # ICMP packets per ping run (default 3)
|
||||
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||
hosts:
|
||||
8.8.8.8:
|
||||
warning: 20.0 # ms
|
||||
critical: 100.0 # ms
|
||||
192.168.1.1:
|
||||
warning: 5.0
|
||||
critical: 20.0
|
||||
- 8.8.8.8
|
||||
- 192.168.1.1
|
||||
```
|
||||
|
||||
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||
|
||||
@@ -309,7 +309,7 @@ def get_host_access(config, hostname) -> dict:
|
||||
"""
|
||||
host_cfg = get_host_config(config, hostname)
|
||||
|
||||
owner = host_cfg.get("owner") or get_default_owner(config)
|
||||
owner = host_cfg.get("owner") # or get_default_owner(config)
|
||||
|
||||
managers = host_cfg.get("managers", [])
|
||||
if isinstance(managers, str):
|
||||
|
||||
+2
-1
@@ -475,7 +475,8 @@ def run(config, config_path=None):
|
||||
if config.get("debug", 0) > 0:
|
||||
log_level = logging.DEBUG
|
||||
logging.basicConfig(level=log_level)
|
||||
logging.getLogger("aiohttp.access").setLevel(logging.DEBUG)
|
||||
if not config.get("debug", 0):
|
||||
logging.getLogger("aiohttp.access").propagate = False
|
||||
load_pickled_hosts(config, hbdclass)
|
||||
|
||||
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
||||
|
||||
+10
-3
@@ -106,11 +106,18 @@ def closelog():
|
||||
|
||||
def eventlog(host, lvl, m, service=None):
|
||||
ts = time.time()
|
||||
msg = {
|
||||
"ts": ts,
|
||||
"host": host or None,
|
||||
"level": lvl,
|
||||
"service": service,
|
||||
"message": m,
|
||||
}
|
||||
data.msgs.append(msg)
|
||||
s = f"{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(ts))} {lvl} "
|
||||
if host:
|
||||
s += f"{host} "
|
||||
s += m
|
||||
data.msgs.append(s)
|
||||
logger.info(s)
|
||||
if logf:
|
||||
try:
|
||||
@@ -118,7 +125,7 @@ def eventlog(host, lvl, m, service=None):
|
||||
logf.flush()
|
||||
except Exception as e:
|
||||
logger.warning("failed to write to logfile: %s", e)
|
||||
msg_to_websockets("message", s)
|
||||
msg_to_websockets("message", msg)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -209,7 +216,7 @@ def _send_mattermost(channel_cfg: dict, notif: Notification) -> bool:
|
||||
return False
|
||||
text = f"**{notif.title}**\n{notif.body}"
|
||||
if notif.url:
|
||||
text += f"\n[Plugin metrics]({notif.url})"
|
||||
text += f"\n[Plugin metrics] {notif.url}"
|
||||
ses = {"url": host, "scheme": "http", "basepath": "/api/v4", "port": 8065}
|
||||
mm = Driver(ses)
|
||||
payload: dict = {"text": text, "channel": channel, "username": channel_cfg.get("username", "hbd")}
|
||||
|
||||
@@ -184,9 +184,9 @@
|
||||
}
|
||||
|
||||
.alert-metric {
|
||||
color: #666;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9em;
|
||||
color: #0066cc;
|
||||
font-size: 1.1em;
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
.alert-details {
|
||||
@@ -438,8 +438,8 @@
|
||||
<div class="alert-header">
|
||||
<span class="alert-level ${level}">${alert.level}</span>
|
||||
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
||||
<span class="alert-metric">${(alert.metric_path.includes('.') ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1) : alert.metric_path).replace(/_status_code$/, '')}</span>
|
||||
</div>
|
||||
<div class="alert-metric">${alert.metric_path}</div>
|
||||
<div class="alert-details">
|
||||
<span>${valueText}</span>
|
||||
<span class="alert-duration">Active for ${duration}</span>
|
||||
|
||||
@@ -214,7 +214,7 @@
|
||||
ctx.restore();
|
||||
}
|
||||
|
||||
hand((m + s / 60) / 60 * Math.PI * 2 - Math.PI / 2,
|
||||
hand((sFrac >= 58.5 ? m + 1 : m) / 60 * Math.PI * 2 - Math.PI / 2,
|
||||
R * 0.88, -R * 0.12, SIZE * 0.027, '#222'); /* minute */
|
||||
hand((h + m / 60) / 12 * Math.PI * 2 - Math.PI / 2,
|
||||
R * 0.58, -R * 0.12, SIZE * 0.039, '#222'); /* hour */
|
||||
|
||||
@@ -183,11 +183,24 @@
|
||||
line-height: 1.0;
|
||||
}
|
||||
|
||||
#messages div {
|
||||
#messages .log-entry {
|
||||
padding: 5px 0;
|
||||
border-bottom: 1px solid #f0f0f0;
|
||||
display: flex;
|
||||
gap: 0.5em;
|
||||
align-items: baseline;
|
||||
}
|
||||
|
||||
.log-ts { color: #888; white-space: nowrap; }
|
||||
.log-level { font-weight: bold; min-width: 6em; }
|
||||
.log-host { font-weight: 600; }
|
||||
.log-service { color: #888; }
|
||||
|
||||
.log-warning .log-level { color: #b8860b; }
|
||||
.log-critical .log-level { color: #c00; }
|
||||
.log-recover .log-level { color: #2a7a2a; }
|
||||
.log-info .log-level { color: #555; }
|
||||
|
||||
/* Modal for connection status messages */
|
||||
.connection-modal {
|
||||
display: none;
|
||||
@@ -460,7 +473,20 @@
|
||||
update_table(state.data);
|
||||
} else if (state.type == "message") {
|
||||
var msgs = document.getElementById("messages");
|
||||
msgs.insertAdjacentHTML("afterbegin", "<div>" + state.data + "</div>");
|
||||
var msg = state.data;
|
||||
var _d = new Date(msg.ts * 1000);
|
||||
function _p(n) { return n < 10 ? '0' + n : '' + n; }
|
||||
var ts_str = _d.getFullYear() + '-' + _p(_d.getMonth()+1) + '-' + _p(_d.getDate())
|
||||
+ ' ' + _p(_d.getHours()) + ':' + _p(_d.getMinutes()) + ':' + _p(_d.getSeconds());
|
||||
var lvl = (msg.level || "INFO").toLowerCase();
|
||||
var html = '<div class="log-entry log-' + lvl + '">';
|
||||
html += '<span class="log-ts">' + ts_str + '</span>';
|
||||
html += '<span class="log-level">' + (msg.level || "") + '</span>';
|
||||
if (msg.host) html += '<span class="log-host">' + msg.host + '</span>';
|
||||
if (msg.service) html += '<span class="log-service">' + msg.service + '</span>';
|
||||
html += '<span class="log-msg">' + msg.message + '</span>';
|
||||
html += '</div>';
|
||||
msgs.insertAdjacentHTML("afterbegin", html);
|
||||
}
|
||||
cnt++;
|
||||
};
|
||||
|
||||
+16
-10
@@ -1044,8 +1044,8 @@ class ThresholdChecker:
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Short metric label: strip the plugin-name prefix for readability
|
||||
short_path = metric_path.partition(".")[2] or metric_path
|
||||
# Short metric label: strip the plugin-name prefix and _status_code suffix
|
||||
short_path = (metric_path.partition(".")[2] or metric_path).removesuffix("_status_code")
|
||||
|
||||
# Use a display-friendly value (inf is the sentinel for "overdue")
|
||||
import math
|
||||
@@ -1109,11 +1109,16 @@ class ThresholdChecker:
|
||||
if host is not None and not host.watched:
|
||||
eventlog(host_name, lvl, message, service="threshold")
|
||||
return
|
||||
short_path = (metric_path.partition(".")[2] or metric_path).removesuffix("_status_code")
|
||||
title = f"[{lvl}] {host_name} {short_path}"
|
||||
# Strip the "metric = " prefix from message so body is just the value/detail
|
||||
prefix = short_path + " = "
|
||||
body = message[len(prefix):] if message.startswith(prefix) else message
|
||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||
host_name,
|
||||
notify_mod.Notification(
|
||||
title=f"[{lvl}] {host_name}",
|
||||
body=message,
|
||||
title=title,
|
||||
body=body,
|
||||
level=lvl,
|
||||
),
|
||||
))
|
||||
@@ -1344,7 +1349,7 @@ class ThresholdChecker:
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
short_path = metric_path.partition(".")[2] or metric_path
|
||||
short_path = (metric_path.partition(".")[2] or metric_path).removesuffix("_status_code")
|
||||
|
||||
# Time to re-notify
|
||||
if threshold_value is not None:
|
||||
@@ -1358,18 +1363,19 @@ class ThresholdChecker:
|
||||
check_name=check_name,
|
||||
metric_name=metric_name,
|
||||
)
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||
body = f"{value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
body = f"{value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {body}"
|
||||
|
||||
from . import hbdclass
|
||||
host = hbdclass.Host.hosts.get(host_name)
|
||||
if host is None or host.watched:
|
||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||
host_name,
|
||||
notify_mod.Notification(
|
||||
title=f"[REMINDER/{alert_state.level.name}] {host_name}",
|
||||
body=message,
|
||||
title=f"[REMINDER/{alert_state.level.name}] {host_name} {short_path}",
|
||||
body=body,
|
||||
level=alert_state.level.name,
|
||||
),
|
||||
))
|
||||
|
||||
+8
-1
@@ -350,8 +350,10 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
if msg.get("ID") == "HTB":
|
||||
host.doesack = msg.get("acks", -1)
|
||||
# send ACK back
|
||||
# send ACK back; ask client to resend plugin info when we have none yet
|
||||
rmsg = {"time": time.time()}
|
||||
if not host.plugin_data:
|
||||
rmsg["request_update"] = 1
|
||||
opkt = dicttos("ACK", rmsg)
|
||||
try:
|
||||
transport.sendto(opkt, addr)
|
||||
@@ -368,6 +370,11 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if k not in ("ID", "plugin", "id", "name")}
|
||||
# Store plugin data with timestamp
|
||||
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
|
||||
|
||||
# If os_info reports an owner and none is configured server-side, apply it
|
||||
if plugin_name == "os_info":
|
||||
if not host.owner:
|
||||
host.owner = plugin_data.get("owner", config_mod.get_default_owner(cfg))
|
||||
if DEBUG > 1:
|
||||
print(f"Stored plugin data for {uname}: {plugin_name}")
|
||||
|
||||
|
||||
+6
-2
@@ -85,11 +85,13 @@ async def handler(request):
|
||||
except Exception as e:
|
||||
logger.error("Error sending initial hosts: %s", e)
|
||||
|
||||
# Send recent messages
|
||||
# Send recent messages, filtered to hosts this user may see
|
||||
if data.msgs:
|
||||
try:
|
||||
for m in data.msgs:
|
||||
await ws.send_str(json.dumps({"type": "message", "data": m}))
|
||||
host_name = m.get("host") if isinstance(m, dict) else None
|
||||
if not host_name or _user_can_see_host(user, host_name):
|
||||
await ws.send_str(json.dumps({"type": "message", "data": m}))
|
||||
except Exception as e:
|
||||
logger.error("Error sending initial messages: %s", e)
|
||||
|
||||
@@ -128,6 +130,8 @@ def broadcast(typ: str, payload) -> bool:
|
||||
host_name: Optional[str] = None
|
||||
if typ in ("host", "plugin"):
|
||||
host_name = payload.get("raw_name") or payload.get("host") or payload.get("name")
|
||||
elif typ == "message" and isinstance(payload, dict):
|
||||
host_name = payload.get("host")
|
||||
|
||||
jmsg = json.dumps({"type": typ, "data": payload})
|
||||
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.2.1"
|
||||
version = "5.2.4"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
+36
-15
@@ -41,7 +41,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# updated by scripts/bumpminor.sh
|
||||
__version__ = "5.2.1"
|
||||
__version__ = "5.2.4"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol (mirrors hbd/common/proto.py)
|
||||
@@ -114,6 +114,7 @@ def _stodict(data: bytes) -> Dict[str, Any]:
|
||||
_DEFAULTS: Dict[str, Any] = {
|
||||
"hb_port": 50003,
|
||||
"interval": 10,
|
||||
"owner": None,
|
||||
"plugins": {},
|
||||
}
|
||||
|
||||
@@ -239,6 +240,8 @@ class OSInfoPlugin(InfoPlugin):
|
||||
"hbc_version": __version__,
|
||||
"hbc_type": "mini",
|
||||
}
|
||||
if self.config.get("owner"):
|
||||
data["owner"] = self.config["owner"]
|
||||
if platform.system() == "Linux":
|
||||
data.update(_linux_distro())
|
||||
elif platform.system() == "Darwin":
|
||||
@@ -716,7 +719,9 @@ async def _load_plugins(cfg: Dict[str, Any]) -> List[Plugin]:
|
||||
plugins_cfg: Dict[str, Any] = cfg.get("plugins", {})
|
||||
loaded: List[Plugin] = []
|
||||
for cls in _ALL_PLUGIN_CLASSES:
|
||||
plugin_cfg = plugins_cfg.get(cls.name) or cfg.get(cls.name, {})
|
||||
plugin_cfg = dict(plugins_cfg.get(cls.name) or cfg.get(cls.name) or {})
|
||||
if "owner" in cfg and "owner" not in plugin_cfg:
|
||||
plugin_cfg["owner"] = cfg["owner"]
|
||||
plugin: Plugin = cls(config=plugin_cfg)
|
||||
try:
|
||||
ok = await plugin.initialize()
|
||||
@@ -786,7 +791,7 @@ class _HeartbeatProtocol(asyncio.DatagramProtocol):
|
||||
msg_id = msg.get("ID")
|
||||
now = time.time()
|
||||
if msg_id == "ACK":
|
||||
self._conn._handle_ack(now)
|
||||
self._conn._handle_ack(msg, now)
|
||||
elif msg_id == "CMD":
|
||||
asyncio.create_task(_handle_command(self._conn, msg))
|
||||
elif msg_id == "UPD":
|
||||
@@ -797,8 +802,7 @@ class _HeartbeatProtocol(asyncio.DatagramProtocol):
|
||||
self._log.error("datagram error: %s", e)
|
||||
|
||||
def error_received(self, exc):
|
||||
self._log.warning("protocol error on %s: %s — dropping connection", self._conn.addr, exc)
|
||||
self._conn._dead = True
|
||||
self._log.warning("protocol error on %s: %s — will retry", self._conn.addr, exc)
|
||||
self._conn.close()
|
||||
|
||||
|
||||
@@ -814,6 +818,7 @@ class AsyncConnection:
|
||||
self.rtts: List[float] = [0.0]
|
||||
self._transport: Optional[asyncio.DatagramTransport] = None
|
||||
self._dead = False
|
||||
self._request_info: asyncio.Event = asyncio.Event()
|
||||
self._log = logging.getLogger(f"hbc.conn.{addr}")
|
||||
|
||||
async def open(self) -> bool:
|
||||
@@ -832,12 +837,14 @@ class AsyncConnection:
|
||||
self._transport.close()
|
||||
self._transport = None
|
||||
|
||||
def _handle_ack(self, now: float):
|
||||
def _handle_ack(self, msg: Dict[str, Any], now: float):
|
||||
rtt = (now - self.lastsend) * 1000.0
|
||||
self.rtts.append(rtt)
|
||||
if len(self.rtts) > 10:
|
||||
self.rtts.pop(0)
|
||||
self.ackcount += 1
|
||||
if msg.get("request_update"):
|
||||
self._request_info.set()
|
||||
|
||||
async def sendto(self, msg: Dict[str, Any], msg_id: str = "HTB"):
|
||||
if self._dead:
|
||||
@@ -970,6 +977,19 @@ async def _run_monitor_group(conn: AsyncConnection, plugins: List[Plugin], inter
|
||||
await _sleep(interval)
|
||||
|
||||
|
||||
async def _info_refresh_loop(conn: AsyncConnection, info: List[Plugin]):
|
||||
log = logging.getLogger("hbc.plugins")
|
||||
while _running:
|
||||
await conn._request_info.wait()
|
||||
if not _running:
|
||||
break
|
||||
conn._request_info.clear()
|
||||
log.info("refreshing InfoPlugins on server request")
|
||||
for plugin in info:
|
||||
plugin._cache = None
|
||||
await _run_info_plugins(conn, info)
|
||||
|
||||
|
||||
async def _plugin_collector(conn: AsyncConnection, plugins: List[Plugin]):
|
||||
info = [p for p in plugins if isinstance(p, InfoPlugin)]
|
||||
monitor = [p for p in plugins if isinstance(p, MonitorPlugin)]
|
||||
@@ -980,12 +1000,10 @@ async def _plugin_collector(conn: AsyncConnection, plugins: List[Plugin]):
|
||||
for p in monitor:
|
||||
by_interval[p.interval].append(p)
|
||||
|
||||
if by_interval:
|
||||
await asyncio.gather(
|
||||
*[asyncio.create_task(_run_monitor_group(conn, grp, iv))
|
||||
for iv, grp in by_interval.items()],
|
||||
return_exceptions=True,
|
||||
)
|
||||
tasks = [asyncio.create_task(_info_refresh_loop(conn, info))]
|
||||
tasks += [asyncio.create_task(_run_monitor_group(conn, grp, iv))
|
||||
for iv, grp in by_interval.items()]
|
||||
await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1029,7 +1047,7 @@ def _reconfigure_syslog(level: int):
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
global _running, _shutdown_event, _active_tasks
|
||||
global _running, _shutdown_event, _active_tasks, send_shutdown
|
||||
_running = True
|
||||
_shutdown_event = asyncio.Event()
|
||||
_active_tasks = []
|
||||
@@ -1039,7 +1057,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
port = cfg.get("hb_port", PORT)
|
||||
interval = cfg.get("interval", INTERVAL)
|
||||
|
||||
log.info("starting: %s -> %s port=%d interval=%ds", iam, args.hosts, port, interval)
|
||||
log.info("hbc_mini %s on %s -> %s port=%d interval=%ds",__version__, iam, args.hosts, port, interval)
|
||||
|
||||
connections: List[AsyncConnection] = []
|
||||
conn_id = 1
|
||||
@@ -1060,10 +1078,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
return 1
|
||||
|
||||
# Boot / one-shot message
|
||||
send_shutdown = False
|
||||
if args.boot or args.message:
|
||||
bmsg: Dict[str, Any] = {"acks": 0}
|
||||
if args.boot:
|
||||
bmsg["boot"] = 1
|
||||
args.boot = False # don't repeat on restart
|
||||
send_shutdown = True
|
||||
if args.message:
|
||||
bmsg["service"] = "service"
|
||||
bmsg["msg"] = args.message
|
||||
@@ -1101,7 +1122,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||
|
||||
log.info("shutting down")
|
||||
target = next((c for c in connections if c._transport), connections[0] if connections else None)
|
||||
if target:
|
||||
if target and send_shutdown:
|
||||
try:
|
||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||
except Exception:
|
||||
|
||||
Reference in New Issue
Block a user