Compare commits

..

5 Commits

Author SHA1 Message Date
andreas f640574e4f version 5.2.2
Release / release (push) Successful in 5s
2026-05-06 09:57:43 -04:00
andreas 9a19424279 fix: retry connection on network error instead of permanently dropping it
error_received() no longer sets _dead=True; it just closes the transport
so the existing retry loop in heartbeat_sender (hbc) and sendto (hbc_mini)
reopens the connection on the next interval. This allows hbc to recover
when it starts before network connectivity is established.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 09:57:32 -04:00
andreas ca8ba84e65 fix: silence aiohttp.access log and strip plugin prefix in alerts UI
- main: disable aiohttp.access propagation unless --debug is active
- alerts.html: strip plugin-name prefix from metric_path display
  (nagios_runner.check_disk_root_status_code → check_disk_root_status_code)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 07:39:55 -04:00
andreas f3d08d1c9e version 5.2.1
Release / release (push) Successful in 5s
2026-05-06 07:07:01 -04:00
andreas 1e4263b793 fix: threshold and logging improvements
- threshold: fix crash when display is None (_format_display now falls
  back to default format string instead of calling None.format())
- threshold: shorten notification messages by stripping plugin-name prefix
  from metric_path (cpu_percent instead of cpu_monitor.cpu_percent)
- main: demote aiohttp.access log records from INFO to DEBUG
- udp: replace debug print with proper logger.info for new host sign-on

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 07:06:56 -04:00
8 changed files with 28 additions and 22 deletions
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
"""
__all__ = ["__version__"]
__version__ = "5.2.0"
__version__ = "5.2.2"
+2 -3
View File
@@ -172,9 +172,8 @@ class HeartbeatProtocol(asyncio.DatagramProtocol):
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
def error_received(self, exc):
"""Handle protocol errors."""
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc}dropping connection")
self.connection._dead = True
"""Handle protocol errors — close transport so the heartbeat sender retries."""
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc}will retry")
self.connection.close()
+2
View File
@@ -475,6 +475,8 @@ def run(config, config_path=None):
if config.get("debug", 0) > 0:
log_level = logging.DEBUG
logging.basicConfig(level=log_level)
if not config.get("debug", 0):
logging.getLogger("aiohttp.access").propagate = False
load_pickled_hosts(config, hbdclass)
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
+1 -1
View File
@@ -439,7 +439,7 @@
<span class="alert-level ${level}">${alert.level}</span>
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
</div>
<div class="alert-metric">${alert.metric_path}</div>
<div class="alert-metric">${alert.metric_path.includes('.') ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1) : alert.metric_path}</div>
<div class="alert-details">
<span>${valueText}</span>
<span class="alert-duration">Active for ${duration}</span>
+18 -11
View File
@@ -1043,7 +1043,10 @@ class ThresholdChecker:
# Format operator symbol
op_symbol = threshold.operator.value
# Short metric label: strip the plugin-name prefix for readability
short_path = metric_path.partition(".")[2] or metric_path
# Use a display-friendly value (inf is the sentinel for "overdue")
import math
display_value = "overdue" if isinstance(value, float) and math.isinf(value) else value
@@ -1065,25 +1068,25 @@ class ThresholdChecker:
if new_level == AlertLevel.OK:
lvl = "RECOVER"
message = f"{metric_path} = {display_value} ({old_level.name} -> OK)"
message = f"{short_path} = {display_value} ({old_level.name} -> OK)"
elif new_level == AlertLevel.WARNING:
lvl = "WARNING"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
message = f"{short_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
message = f"{short_path} = {display_value}"
elif new_level == AlertLevel.CRITICAL:
lvl = "CRITICAL"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
message = f"{short_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
message = f"{short_path} = {display_value}"
else:
lvl = "UNKNOWN"
if has_display:
message = f"{metric_path} = {display_value} {_fmt()}"
message = f"{short_path} = {display_value} {_fmt()}"
else:
message = f"{metric_path} = {display_value}"
message = f"{short_path} = {display_value}"
# Formatted threshold info stored on AlertState for the UI
formatted_threshold_msg = _fmt() if has_display and new_level != AlertLevel.OK else None
@@ -1157,6 +1160,9 @@ class ThresholdChecker:
Returns:
Formatted display string
"""
if not display_format:
display_format = "(threshold: {op_symbol} {threshold_value})" if threshold_value is not None else ""
# Build format context with standard variables
format_context = {
'value': value,
@@ -1338,7 +1344,8 @@ class ThresholdChecker:
# Format operator symbol
op_symbol = threshold.operator.value
short_path = metric_path.partition(".")[2] or metric_path
# Time to re-notify
if threshold_value is not None:
# Use display format string
@@ -1351,9 +1358,9 @@ class ThresholdChecker:
check_name=check_name,
metric_name=metric_name,
)
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
else:
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
from . import hbdclass
host = hbdclass.Host.hosts.get(host_name)
+1 -2
View File
@@ -336,8 +336,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
# Apply user-access settings from config
access = config_mod.get_host_access(cfg, uname)
host.apply_access(access["owner"], access["managers"], access["monitors"])
if verbose:
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
logger.info("New host signed on: %s (dyn=%s, access=%s)", uname, host.dyn, access)
newh = True
else:
host = hbdcls.Host.hosts[uname]
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "hbd"
version = "5.2.0"
version = "5.2.2"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md"
requires-python = ">=3.11"
+2 -3
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh
__version__ = "5.2.0"
__version__ = "5.2.2"
# ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py)
@@ -797,8 +797,7 @@ class _HeartbeatProtocol(asyncio.DatagramProtocol):
self._log.error("datagram error: %s", e)
def error_received(self, exc):
self._log.warning("protocol error on %s: %sdropping connection", self._conn.addr, exc)
self._conn._dead = True
self._log.warning("protocol error on %s: %swill retry", self._conn.addr, exc)
self._conn.close()