Compare commits

...

11 Commits

Author SHA1 Message Date
andreas 54fbd8d73d version 5.2.3
Release / release (push) Successful in 5s
2026-05-07 10:15:11 -04:00
andreas 7ab17e26e2 hbc/hbc_mini: log name and version at startup; ui: bump alert-metric font size
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 10:15:03 -04:00
andreas 28f5fa951c ui: show metric name inline with hostname in alerts and notifications
Alerts page: move metric name into the header row alongside hostname.
Notifications: include metric name in title (hostname metric) and
strip the metric prefix from the body so it contains only value/detail.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 06:26:27 -04:00
andreas 37f1c58969 docs: remove dead warning/critical keys from ping_monitor config example
These fields were never read by the plugin; thresholds are configured
server-side. Also document the -b flag in README.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 06:12:15 -04:00
andreas f006077a71 send shutdown msg only if we sent a boot msg. Don't send either when restarting. 2026-05-06 11:57:43 -04:00
andreas d9fc8d632f send shutdown msg only if we sent a boot msg. Don't send either when restarting. 2026-05-06 11:54:09 -04:00
andreas f640574e4f version 5.2.2
Release / release (push) Successful in 5s
2026-05-06 09:57:43 -04:00
andreas 9a19424279 fix: retry connection on network error instead of permanently dropping it
error_received() no longer sets _dead=True; it just closes the transport
so the existing retry loop in heartbeat_sender (hbc) and sendto (hbc_mini)
reopens the connection on the next interval. This allows hbc to recover
when it starts before network connectivity is established.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:57:32 -04:00
andreas ca8ba84e65 fix: silence aiohttp.access log and strip plugin prefix in alerts UI
- main: disable aiohttp.access propagation unless --debug is active
- alerts.html: strip plugin-name prefix from metric_path display
  (nagios_runner.check_disk_root_status_code → check_disk_root_status_code)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 07:39:55 -04:00
andreas f3d08d1c9e version 5.2.1
Release / release (push) Successful in 5s
2026-05-06 07:07:01 -04:00
andreas 1e4263b793 fix: threshold and logging improvements
- threshold: fix crash when display is None (_format_display now falls
  back to default format string instead of calling None.format())
- threshold: shorten notification messages by stripping plugin-name prefix
  from metric_path (cpu_percent instead of cpu_monitor.cpu_percent)
- main: demote aiohttp.access log records from INFO to DEBUG
- udp: replace debug print with proper logger.info for new host sign-on

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 07:06:56 -04:00
10 changed files with 60 additions and 41 deletions
+3
View File
@@ -507,6 +507,9 @@ hbc --boot your-server.example.com
# Verbose output # Verbose output
hbc -v your-server.example.com hbc -v your-server.example.com
# Send 'boot' and 'shutdown' messages on start and exit
hbc -b your-server.example.com
``` ```
You can also run it via the module entrypoint: You can also run it via the module entrypoint:
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
""" """
__all__ = ["__version__"] __all__ = ["__version__"]
__version__ = "5.2.0" __version__ = "5.2.3"
+9 -5
View File
@@ -21,6 +21,7 @@ from typing import Dict, List, Optional
# Import protocol and config # Import protocol and config
from .config import load_config from .config import load_config
from ..common.proto import dicttos, stodict from ..common.proto import dicttos, stodict
from .. import __version__
# Import plugin system # Import plugin system
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
@@ -172,9 +173,8 @@ class HeartbeatProtocol(asyncio.DatagramProtocol):
self.logger.error(f"Error processing datagram: {e}", exc_info=True) self.logger.error(f"Error processing datagram: {e}", exc_info=True)
def error_received(self, exc): def error_received(self, exc):
"""Handle protocol errors.""" """Handle protocol errors — close transport so the heartbeat sender retries."""
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc}dropping connection") self.logger.warning(f"Protocol error on {self.connection.addr}: {exc}will retry")
self.connection._dead = True
self.connection.close() self.connection.close()
@@ -464,7 +464,7 @@ async def cleanup(connections: List[AsyncConnection]):
logger.info("Cleaning up connections") logger.info("Cleaning up connections")
target = next((c for c in connections if c.transport), connections[0] if connections else None) target = next((c for c in connections if c.transport), connections[0] if connections else None)
if target: if target and send_shutdown:
try: try:
await target.sendto({"shutdown": 1, "acks": target.ackcount}) await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception as e: except Exception as e:
@@ -478,7 +478,7 @@ async def cleanup(connections: List[AsyncConnection]):
async def async_main(args, config): async def async_main(args, config):
"""Async main function.""" """Async main function."""
global running, shutdown_event, active_tasks global running, shutdown_event, active_tasks, send_shutdown
# Create shutdown event # Create shutdown event
shutdown_event = asyncio.Event() shutdown_event = asyncio.Event()
@@ -495,6 +495,7 @@ async def async_main(args, config):
hb_port = config.get("hb_port", PORT) hb_port = config.get("hb_port", PORT)
interval = config.get("interval", INTERVAL) interval = config.get("interval", INTERVAL)
logger.info(f"hbc {__version__} starting on {iam}")
logger.info(f"Starting hbc for {iam} -> {hb_hosts}") logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
logger.info(f"Port: {hb_port}, Interval: {interval}s") logger.info(f"Port: {hb_port}, Interval: {interval}s")
@@ -526,10 +527,13 @@ async def async_main(args, config):
logger.info(f"Created {len(connections)} connections") logger.info(f"Created {len(connections)} connections")
# Send boot/message if requested # Send boot/message if requested
send_shutdown = False
if args.boot or args.message: if args.boot or args.message:
boot_msg = {} boot_msg = {}
if args.boot: if args.boot:
boot_msg["boot"] = 1 boot_msg["boot"] = 1
args.boot = False # Clear boot flag so we don't send it again in main loop
send_shutdown = True
if args.message: if args.message:
boot_msg["service"] = "service" boot_msg["service"] = "service"
boot_msg["msg"] = args.message boot_msg["msg"] = args.message
+2 -6
View File
@@ -13,12 +13,8 @@ plugins:
count: 3 # ICMP packets per ping run (default 3) count: 3 # ICMP packets per ping run (default 3)
timeout: 5 # seconds before a host is considered unreachable (default 5) timeout: 5 # seconds before a host is considered unreachable (default 5)
hosts: hosts:
8.8.8.8: - 8.8.8.8
warning: 20.0 # ms - 192.168.1.1
critical: 100.0 # ms
192.168.1.1:
warning: 5.0
critical: 20.0
``` ```
Reported metrics per host (metric key uses the hostname with dots/colons replaced Reported metrics per host (metric key uses the hostname with dots/colons replaced
+2
View File
@@ -475,6 +475,8 @@ def run(config, config_path=None):
if config.get("debug", 0) > 0: if config.get("debug", 0) > 0:
log_level = logging.DEBUG log_level = logging.DEBUG
logging.basicConfig(level=log_level) logging.basicConfig(level=log_level)
if not config.get("debug", 0):
logging.getLogger("aiohttp.access").propagate = False
load_pickled_hosts(config, hbdclass) load_pickled_hosts(config, hbdclass)
notify_mod.initlog(logfile=config.get("logfile", "messages.log")) notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
+4 -4
View File
@@ -184,9 +184,9 @@
} }
.alert-metric { .alert-metric {
color: #666; color: #0066cc;
font-family: 'Courier New', monospace; font-size: 1.1em;
font-size: 0.9em; font-weight: normal;
} }
.alert-details { .alert-details {
@@ -438,8 +438,8 @@
<div class="alert-header"> <div class="alert-header">
<span class="alert-level ${level}">${alert.level}</span> <span class="alert-level ${level}">${alert.level}</span>
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a> <a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
<span class="alert-metric">${alert.metric_path.includes('.') ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1) : alert.metric_path}</span>
</div> </div>
<div class="alert-metric">${alert.metric_path}</div>
<div class="alert-details"> <div class="alert-details">
<span>${valueText}</span> <span>${valueText}</span>
<span class="alert-duration">Active for ${duration}</span> <span class="alert-duration">Active for ${duration}</span>
+29 -16
View File
@@ -1043,7 +1043,10 @@ class ThresholdChecker:
# Format operator symbol # Format operator symbol
op_symbol = threshold.operator.value op_symbol = threshold.operator.value
# Short metric label: strip the plugin-name prefix for readability
short_path = metric_path.partition(".")[2] or metric_path
# Use a display-friendly value (inf is the sentinel for "overdue") # Use a display-friendly value (inf is the sentinel for "overdue")
import math import math
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
@@ -1065,25 +1068,25 @@ class ThresholdChecker:
if new_level == AlertLevel.OK: if new_level == AlertLevel.OK:
lvl = "RECOVER" lvl = "RECOVER"
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)" message = f"{short_path} = {display_value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING: elif new_level == AlertLevel.WARNING:
lvl = "WARNING" lvl = "WARNING"
if has_display: if has_display:
message = f"{metric_path} = {display_value} {_fmt()}" message = f"{short_path} = {display_value} {_fmt()}"
else: else:
message = f"{metric_path} = {display_value}" message = f"{short_path} = {display_value}"
elif new_level == AlertLevel.CRITICAL: elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL" lvl = "CRITICAL"
if has_display: if has_display:
message = f"{metric_path} = {display_value} {_fmt()}" message = f"{short_path} = {display_value} {_fmt()}"
else: else:
message = f"{metric_path} = {display_value}" message = f"{short_path} = {display_value}"
else: else:
lvl = "UNKNOWN" lvl = "UNKNOWN"
if has_display: if has_display:
message = f"{metric_path} = {display_value} {_fmt()}" message = f"{short_path} = {display_value} {_fmt()}"
else: else:
message = f"{metric_path} = {display_value}" message = f"{short_path} = {display_value}"
# Formatted threshold info stored on AlertState for the UI # Formatted threshold info stored on AlertState for the UI
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
@@ -1106,11 +1109,16 @@ class ThresholdChecker:
if host is not None and not host.watched: if host is not None and not host.watched:
eventlog(host_name, lvl, message, service="threshold") eventlog(host_name, lvl, message, service="threshold")
return return
short_path = metric_path.partition(".")[2] or metric_path
title = f"[{lvl}] {host_name} {short_path}"
# Strip the "metric = " prefix from message so body is just the value/detail
prefix = short_path + " = "
body = message[len(prefix):] if message.startswith(prefix) else message
asyncio.get_event_loop().create_task(notify_mod.send_notification( asyncio.get_event_loop().create_task(notify_mod.send_notification(
host_name, host_name,
notify_mod.Notification( notify_mod.Notification(
title=f"[{lvl}] {host_name}", title=title,
body=message, body=body,
level=lvl, level=lvl,
), ),
)) ))
@@ -1157,6 +1165,9 @@ class ThresholdChecker:
Returns: Returns:
Formatted display string Formatted display string
""" """
if not display_format:
display_format = "(threshold: {op_symbol} {threshold_value})" if threshold_value is not None else ""
# Build format context with standard variables # Build format context with standard variables
format_context = { format_context = {
'value': value, 'value': value,
@@ -1338,7 +1349,8 @@ class ThresholdChecker:
# Format operator symbol # Format operator symbol
op_symbol = threshold.operator.value op_symbol = threshold.operator.value
short_path = metric_path.partition(".")[2] or metric_path
# Time to re-notify # Time to re-notify
if threshold_value is not None: if threshold_value is not None:
# Use display format string # Use display format string
@@ -1351,18 +1363,19 @@ class ThresholdChecker:
check_name=check_name, check_name=check_name,
metric_name=metric_name, metric_name=metric_name,
) )
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s" body = f"{value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
else: else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)" body = f"{value} (ongoing for {int(now - alert_state.since)}s)"
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {body}"
from . import hbdclass from . import hbdclass
host = hbdclass.Host.hosts.get(host_name) host = hbdclass.Host.hosts.get(host_name)
if host is None or host.watched: if host is None or host.watched:
asyncio.get_event_loop().create_task(notify_mod.send_notification( asyncio.get_event_loop().create_task(notify_mod.send_notification(
host_name, host_name,
notify_mod.Notification( notify_mod.Notification(
title=f"[REMINDER/{alert_state.level.name}] {host_name}", title=f"[REMINDER/{alert_state.level.name}] {host_name} {short_path}",
body=message, body=body,
level=alert_state.level.name, level=alert_state.level.name,
), ),
)) ))
+1 -2
View File
@@ -336,8 +336,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# Apply user-access settings from config # Apply user-access settings from config
access = config_mod.get_host_access(cfg, uname) access = config_mod.get_host_access(cfg, uname)
host.apply_access(access["owner"], access["managers"], access["monitors"]) host.apply_access(access["owner"], access["managers"], access["monitors"])
if verbose: logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
newh = True newh = True
else: else:
host = hbdcls.Host.hosts[uname] host = hbdcls.Host.hosts[uname]
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "hbd" name = "hbd"
version = "5.2.0" version = "5.2.3"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)" description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = ">=3.11"
+8 -6
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh # updated by scripts/bumpminor.sh
__version__ = "5.2.0" __version__ = "5.2.3"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py) # Protocol (mirrors hbd/common/proto.py)
@@ -797,8 +797,7 @@ class _HeartbeatProtocol(asyncio.DatagramProtocol):
self._log.error("datagram error: %s", e) self._log.error("datagram error: %s", e)
def error_received(self, exc): def error_received(self, exc):
self._log.warning("protocol error on %s: %sdropping connection", self._conn.addr, exc) self._log.warning("protocol error on %s: %swill retry", self._conn.addr, exc)
self._conn._dead = True
self._conn.close() self._conn.close()
@@ -1029,7 +1028,7 @@ def _reconfigure_syslog(level: int):
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def _async_main(args, cfg: Dict[str, Any]) -> int: async def _async_main(args, cfg: Dict[str, Any]) -> int:
global _running, _shutdown_event, _active_tasks global _running, _shutdown_event, _active_tasks, send_shutdown
_running = True _running = True
_shutdown_event = asyncio.Event() _shutdown_event = asyncio.Event()
_active_tasks = [] _active_tasks = []
@@ -1039,7 +1038,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
port = cfg.get("hb_port", PORT) port = cfg.get("hb_port", PORT)
interval = cfg.get("interval", INTERVAL) interval = cfg.get("interval", INTERVAL)
log.info("starting: %s -> %s port=%d interval=%ds", iam, args.hosts, port, interval) log.info("starting hbc_mini %s on %s -> %s port=%d interval=%ds",__version__, iam, args.hosts, port, interval)
connections: List[AsyncConnection] = [] connections: List[AsyncConnection] = []
conn_id = 1 conn_id = 1
@@ -1060,10 +1059,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
return 1 return 1
# Boot / one-shot message # Boot / one-shot message
send_shutdown = False
if args.boot or args.message: if args.boot or args.message:
bmsg: Dict[str, Any] = {"acks": 0} bmsg: Dict[str, Any] = {"acks": 0}
if args.boot: if args.boot:
bmsg["boot"] = 1 bmsg["boot"] = 1
args.boot = False # don't repeat on restart
send_shutdown = True
if args.message: if args.message:
bmsg["service"] = "service" bmsg["service"] = "service"
bmsg["msg"] = args.message bmsg["msg"] = args.message
@@ -1101,7 +1103,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
log.info("shutting down") log.info("shutting down")
target = next((c for c in connections if c._transport), connections[0] if connections else None) target = next((c for c in connections if c._transport), connections[0] if connections else None)
if target: if target and send_shutdown:
try: try:
await target.sendto({"shutdown": 1, "acks": target.ackcount}) await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception: except Exception: