Compare commits

..

7 Commits

Author SHA1 Message Date
Andreas Wrede 8da3d550eb version 5.1.16
Release / release (push) Successful in 5s
2026-05-03 06:08:14 -04:00
Andreas Wrede a76d0fc840 feat: generic ping_monitor thresholds; round RTT to nearest ms
- threshold.py: add _find_threshold() with suffix fallback so thresholds
  like ping_monitor.rtt_avg match ping_monitor.8_8_8_8_rtt_avg etc.;
  each pinged host keeps its own alert state
- hbdclass.py: format RTT as integer ms (round())
- live.html: JS RTT display rounded to nearest ms (Math.round)

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 06:08:11 -04:00
Andreas Wrede 94cbb31c48 version 5.1.15
Release / release (push) Successful in 6s
2026-05-02 14:37:11 -04:00
Andreas Wrede ae60844a8a feat: link hostnames in Live Dashboard to Host Overview
Hostnames in the live dashboard table are now links to /plugins#hostname,
which expands and scrolls to that host's card in the Host Overview page.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 14:37:08 -04:00
Andreas Wrede 49fa310361 feat: add Threshold Configurations section to settings page
Reads threshold_configs (or legacy thresholds) from config and renders
per-named-config tables showing metric path, operator, warning/critical
values, hysteresis, and count. Disabled entries are dimmed.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 14:30:31 -04:00
Andreas Wrede 28e2180f7b fix: suppress notifications on alert de-escalation (e.g. CRITICAL→WARNING)
Only notify on worsening transitions (OK→WARNING, OK→CRITICAL,
WARNING→CRITICAL) and recovery (any→OK). De-escalation within alert
states no longer sends a duplicate notification since the metric never
recovered.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 14:27:18 -04:00
Andreas Wrede ce0590f015 fix: suppress recover messages for down durations under 4 seconds
Transient blips caused by hbc client restarts no longer generate
eventlog entries or notifications.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 14:18:58 -04:00
10 changed files with 177 additions and 18 deletions
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
"""
__all__ = ["__version__"]
__version__ = "5.1.14"
__version__ = "5.1.16"
+1 -1
View File
@@ -95,7 +95,7 @@ class Connection:
if not Null:
d["addr"] = self.addr
if self.rtts[-1]:
d["rtt"] = "%0.1f" % self.rtts[-1]
d["rtt"] = "%d" % round(self.rtts[-1])
elif self.state == Connection.UNKNOWN:
d["rtt"] = ""
else:
+52
View File
@@ -181,6 +181,48 @@ def get_settings_sections(config: dict) -> list:
"notification_channels": attrs.get("notification_channels", []),
})
# ---- Threshold configurations -----------------------------------------
def _parse_metric_row(metric_path, metric_cfg):
if not isinstance(metric_cfg, dict):
return None
return {
"metric": metric_path,
"operator": metric_cfg.get("operator", ">"),
"warning": metric_cfg.get("warning"),
"critical": metric_cfg.get("critical"),
"hysteresis": metric_cfg.get("hysteresis"),
"count": metric_cfg.get("count", 1),
"enabled": metric_cfg.get("enabled", True),
}
threshold_config_list = []
raw_tconfigs = config.get("threshold_configs") or {}
if raw_tconfigs:
for cfg_name, cfg_data in sorted(raw_tconfigs.items()):
if not isinstance(cfg_data, dict):
continue
metrics = [
r for r in (
_parse_metric_row(mp, mc)
for mp, mc in (cfg_data.get("thresholds") or {}).items()
) if r
]
threshold_config_list.append({
"name": cfg_name,
"metrics": sorted(metrics, key=lambda m: m["metric"]),
})
elif config.get("thresholds"):
metrics = [
r for r in (
_parse_metric_row(mp, mc)
for mp, mc in config["thresholds"].items()
) if r
]
threshold_config_list.append({
"name": "default",
"metrics": sorted(metrics, key=lambda m: m["metric"]),
})
# ---- Hosts summary ----------------------------------------------------
hosts_list = []
for hname, hcfg in (config.get("hosts") or {}).items():
@@ -312,6 +354,16 @@ def get_settings_sections(config: dict) -> list:
"hosts": hosts_list,
"fields": [],
},
{
"id": "thresholds",
"title": "Threshold Configurations",
"description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
"threshold_configs": threshold_config_list,
"fields": [
field("default_threshold_config", "Default config", "text",
"Threshold config used for hosts with no explicit mapping."),
],
},
{
"id": "runtime",
"title": "Runtime",
+7 -3
View File
@@ -236,6 +236,8 @@
color: #ff9800;
font-weight: 700;
}
#ntable a.host-link { color: inherit; text-decoration: none; }
#ntable a.host-link:hover { text-decoration: underline; }
</style>
<script type="text/javascript">
var cnt = 0;
@@ -245,11 +247,13 @@
var HBD_VERSION = "{{ hbd_version }}";
function hostNameHtml(data) {
var rawName = data.raw_name || data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
var nameHtml = data.name;
if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
nameHtml += ' 🥀';
}
return data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
var display = data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
return '<a class="host-link" href="/plugins#' + encodeURIComponent(rawName) + '">' + display + '</a>';
}
function setup() {
@@ -404,7 +408,7 @@
);
if (data.connections[i].state == "up") {
state = '<span class="state-up">up</span>';
latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
latency = String(Math.round(Number.parseFloat(data.connections[i].rtts[0])));
} else {
if (data.connections[i].state == "unknown") {
state = "";
@@ -511,7 +515,7 @@
<tbody id="ntablebody">
{% for host in hosts %}
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
<td data-name="{{ host.name }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</td>
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
<td style="text-align: center; color: #ff9800; font-weight: bold;">
{%- set warning_unacked = host.alert_warning_unacked -%}
{%- set warning_acked = host.alert_warning_acked -%}
+13
View File
@@ -1156,6 +1156,19 @@
// ── Init ────────────────────────────────────────────────────────────────
document.addEventListener('DOMContentLoaded', () => {
// If a host fragment is in the URL, expand and scroll to that host;
// otherwise expand the first host as before.
const hash = window.location.hash;
if (hash) {
const hostname = decodeURIComponent(hash.slice(1));
const card = document.querySelector(`.host-card[data-hostname="${hostname}"]`);
if (card) {
card.classList.remove('collapsed');
fetchHostGlance(hostname);
setTimeout(() => card.scrollIntoView({ behavior: 'smooth', block: 'start' }), 150);
return;
}
}
const first = document.querySelector('.host-card');
if (first) {
first.classList.remove('collapsed');
+54
View File
@@ -254,6 +254,17 @@
.host-bool { text-align: center; }
.dot-yes { color: #2e7d32; font-size: 1.1em; }
.dot-no { color: #ddd; font-size: 1.1em; }
/* ---- Threshold configurations ---- */
.thresh-config { margin: 12px 20px 20px; }
.thresh-config-name {
font-weight: 600; font-size: 0.9em; color: #1a237e;
margin-bottom: 6px;
}
.mini-table .warn { color: #e65100; font-weight: 600; }
.mini-table .crit { color: #b71c1c; font-weight: 600; }
.mini-table .dim { color: #aaa; }
.mini-table .metric-path { font-family: monospace; font-size: 0.88em; }
</style>
<body>
@@ -394,6 +405,49 @@
{% endif %}
{% endif %}
{# ---- Threshold configurations section ---- #}
{% if section.id == "thresholds" %}
{% if section.threshold_configs %}
{% for tc in section.threshold_configs %}
<div class="thresh-config">
<div class="thresh-config-name">{{ tc.name }}</div>
{% if tc.metrics %}
<div style="overflow-x: auto;">
<table class="mini-table">
<thead>
<tr>
<th>Metric</th>
<th>Op</th>
<th>Warning</th>
<th>Critical</th>
<th>Hysteresis</th>
<th>Count</th>
</tr>
</thead>
<tbody>
{% for m in tc.metrics %}
<tr {% if not m.enabled %} style="opacity:0.45"{% endif %}>
<td class="metric-path">{{ m.metric }}</td>
<td>{{ m.operator or '>' }}</td>
<td class="warn">{{ m.warning if m.warning is not none else '—' }}</td>
<td class="crit">{{ m.critical if m.critical is not none else '—' }}</td>
<td class="dim">{{ '%.0f%%' % (m.hysteresis * 100) if m.hysteresis else '—' }}</td>
<td class="dim">{{ m.count }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<span class="val-empty">No thresholds defined.</span>
{% endif %}
</div>
{% endfor %}
{% else %}
<div class="field-row"><span class="val-empty">No threshold configurations defined.</span></div>
{% endif %}
{% endif %}
{# ---- Hosts section ---- #}
{% if section.id == "hosts" %}
{% if section.hosts %}
+37 -5
View File
@@ -803,6 +803,29 @@ class ThresholdChecker:
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None)
return None
def _find_threshold(
self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
) -> Optional["ThresholdConfig"]:
"""Return the threshold for *metric_path*, falling back to suffix matches.
Allows generic thresholds like ``ping_monitor.rtt_avg`` to match
fully-qualified paths like ``ping_monitor.8_8_8_8_rtt_avg``.
The exact match is always tried first; then successive leading
underscore-delimited segments are stripped from the field name until
a match is found or no segments remain.
"""
if metric_path in thresholds:
return thresholds[metric_path]
plugin, sep, field = metric_path.partition(".")
if not sep:
return None
parts = field.split("_")
for i in range(1, len(parts)):
candidate = plugin + "." + "_".join(parts[i:])
if candidate in thresholds:
return thresholds[candidate]
return None
def check_plugin_data(
self,
host_name: str,
@@ -831,11 +854,10 @@ class ThresholdChecker:
for metric_name, value in data.items():
metric_path = f"{plugin_name}.{metric_name}"
if metric_path not in thresholds:
threshold = self._find_threshold(thresholds, metric_path)
if threshold is None:
continue
threshold = thresholds[metric_path]
# Get or create alert state
if metric_path not in alert_states:
alert_states[metric_path] = AlertState(metric_path)
@@ -1114,7 +1136,9 @@ class ThresholdChecker:
) -> None:
"""Handle a state-change transition with grace-period logic.
Transitioning INTO alert: defers the notification for grace_seconds.
Transitioning INTO alert (worsening): defers the notification for grace_seconds.
De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
the metric is still alerting so no RECOVER was sent.
Transitioning TO OK:
- Still in grace window (pending_since set): suppresses both the alert
and the recovery — the spike never warranted a page.
@@ -1134,12 +1158,20 @@ class ThresholdChecker:
alert_state.pending_since = None
else:
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
else:
elif new_level.value > old_level.value:
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification.
alert_state.pending_since = time.time()
logger.debug(
"Alert deferred (%.0fs grace): %s on %s = %s",
self.grace_seconds, metric_path, host_name, value,
)
else:
# De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
# alerting but did not recover, so no new notification.
logger.debug(
"De-escalation %s%s for %s on %s, no notification",
old_level.name, new_level.name, metric_path, host_name,
)
def _check_pending_or_renotify(
self,
+4
View File
@@ -440,8 +440,12 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
if not newh:
if d == 0 or lasts == "unknown":
m = "%s is up" % (conn.afam)
elif d < 4:
# Transient blip (likely client restart) — skip log and notification
m = None
else:
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
if m:
eventlog(uname, "RECOVER", m)
if host.watched:
asyncio.create_task(notify_mod.send_notification(
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "hbd"
version = "5.1.14"
version = "5.1.16"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md"
requires-python = ">=3.11"
+1 -1
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh
__version__ = "5.1.14"
__version__ = "5.1.16"
# ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py)