Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 8da3d550eb | |||
| a76d0fc840 | |||
| 94cbb31c48 | |||
| ae60844a8a | |||
| 49fa310361 | |||
| 28e2180f7b | |||
| ce0590f015 | |||
| f50acca509 | |||
| 72fc82b91f |
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
||||
"""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.1.13"
|
||||
__version__ = "5.1.16"
|
||||
|
||||
@@ -95,7 +95,7 @@ class Connection:
|
||||
if not Null:
|
||||
d["addr"] = self.addr
|
||||
if self.rtts[-1]:
|
||||
d["rtt"] = "%0.1f" % self.rtts[-1]
|
||||
d["rtt"] = "%d" % round(self.rtts[-1])
|
||||
elif self.state == Connection.UNKNOWN:
|
||||
d["rtt"] = ""
|
||||
else:
|
||||
|
||||
@@ -181,6 +181,48 @@ def get_settings_sections(config: dict) -> list:
|
||||
"notification_channels": attrs.get("notification_channels", []),
|
||||
})
|
||||
|
||||
# ---- Threshold configurations -----------------------------------------
|
||||
def _parse_metric_row(metric_path, metric_cfg):
    """Build one display row for a single metric's threshold settings.

    Args:
        metric_path: Metric identifier; stored unchanged under ``"metric"``.
        metric_cfg: Raw per-metric configuration. Anything that is not a
            ``dict`` is treated as malformed.

    Returns:
        A dict with the keys ``metric``, ``operator``, ``warning``,
        ``critical``, ``hysteresis``, ``count`` and ``enabled``, or ``None``
        when *metric_cfg* is not a dict so callers can filter the row out.
    """
    if not isinstance(metric_cfg, dict):
        return None

    # ``dict.get(key, default)`` only applies the default when the key is
    # missing; an explicit null in the config would otherwise surface as
    # ``None`` in the rendered table, so fall back for that case as well.
    operator = metric_cfg.get("operator")
    count = metric_cfg.get("count")

    return {
        "metric": metric_path,
        "operator": ">" if operator is None else operator,
        "warning": metric_cfg.get("warning"),
        "critical": metric_cfg.get("critical"),
        "hysteresis": metric_cfg.get("hysteresis"),
        "count": 1 if count is None else count,
        "enabled": metric_cfg.get("enabled", True),
    }
|
||||
|
||||
threshold_config_list = []
|
||||
raw_tconfigs = config.get("threshold_configs") or {}
|
||||
if raw_tconfigs:
|
||||
for cfg_name, cfg_data in sorted(raw_tconfigs.items()):
|
||||
if not isinstance(cfg_data, dict):
|
||||
continue
|
||||
metrics = [
|
||||
r for r in (
|
||||
_parse_metric_row(mp, mc)
|
||||
for mp, mc in (cfg_data.get("thresholds") or {}).items()
|
||||
) if r
|
||||
]
|
||||
threshold_config_list.append({
|
||||
"name": cfg_name,
|
||||
"metrics": sorted(metrics, key=lambda m: m["metric"]),
|
||||
})
|
||||
elif config.get("thresholds"):
|
||||
metrics = [
|
||||
r for r in (
|
||||
_parse_metric_row(mp, mc)
|
||||
for mp, mc in config["thresholds"].items()
|
||||
) if r
|
||||
]
|
||||
threshold_config_list.append({
|
||||
"name": "default",
|
||||
"metrics": sorted(metrics, key=lambda m: m["metric"]),
|
||||
})
|
||||
|
||||
# ---- Hosts summary ----------------------------------------------------
|
||||
hosts_list = []
|
||||
for hname, hcfg in (config.get("hosts") or {}).items():
|
||||
@@ -312,6 +354,16 @@ def get_settings_sections(config: dict) -> list:
|
||||
"hosts": hosts_list,
|
||||
"fields": [],
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"title": "Threshold Configurations",
|
||||
"description": "Named alert threshold sets. Each defines warning/critical levels per metric.",
|
||||
"threshold_configs": threshold_config_list,
|
||||
"fields": [
|
||||
field("default_threshold_config", "Default config", "text",
|
||||
"Threshold config used for hosts with no explicit mapping."),
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": "runtime",
|
||||
"title": "Runtime",
|
||||
|
||||
@@ -236,6 +236,8 @@
|
||||
color: #ff9800;
|
||||
font-weight: 700;
|
||||
}
|
||||
#ntable a.host-link { color: inherit; text-decoration: none; }
|
||||
#ntable a.host-link:hover { text-decoration: underline; }
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
var cnt = 0;
|
||||
@@ -245,11 +247,13 @@
|
||||
var HBD_VERSION = "{{ hbd_version }}";
|
||||
|
||||
/**
 * Build the HTML for a host's name cell, wrapped in a link to its entry on
 * the plugins page.
 *
 * @param {Object} data  Host record. Reads `name` (may already contain HTML
 *                       markup), and the optional `raw_name`, `hbc_version`
 *                       and `dyn` fields.
 * @returns {string} An `<a class="host-link">` element whose fragment is the
 *                   URL-encoded raw host name.
 */
function hostNameHtml(data) {
    // Prefer the server-supplied raw name; otherwise derive it from the
    // display name by dropping tags and the first '*' character.
    var rawName = data.raw_name || data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
    var nameHtml = data.name;
    // Mark hosts whose client version is missing or differs from the server's.
    if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
        nameHtml += ' 🥀';
    }
    // The pre-change early `return` was removed: it made the link below
    // unreachable.
    var display = data.dyn ? '<b>' + nameHtml + '</b>' : nameHtml;
    return '<a class="host-link" href="/plugins#' + encodeURIComponent(rawName) + '">' + display + '</a>';
}
|
||||
|
||||
function setup() {
|
||||
@@ -404,7 +408,7 @@
|
||||
);
|
||||
if (data.connections[i].state == "up") {
|
||||
state = '<span class="state-up">up</span>';
|
||||
latency = Number.parseFloat(data.connections[i].rtts[0]).toFixed(2);
|
||||
latency = String(Math.round(Number.parseFloat(data.connections[i].rtts[0])));
|
||||
} else {
|
||||
if (data.connections[i].state == "unknown") {
|
||||
state = "";
|
||||
@@ -511,7 +515,7 @@
|
||||
<tbody id="ntablebody">
|
||||
{% for host in hosts %}
|
||||
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||
<td data-name="{{ host.name }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</td>
|
||||
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||
{%- set warning_acked = host.alert_warning_acked -%}
|
||||
|
||||
@@ -383,7 +383,7 @@
|
||||
</div>
|
||||
|
||||
<div class="host-body">
|
||||
{% set plugin_order = ['os_info','cpu_monitor','memory_monitor','disk_monitor','network_monitor','nagios_runner','filesystem_info'] %}
|
||||
{% set plugin_order = ['os_info','cpu_monitor','memory_monitor','disk_monitor','network_monitor','zfs_monitor','nagios_runner','filesystem_info'] %}
|
||||
{% for plugin in plugin_order if plugin in host.plugins %}
|
||||
<div class="plugin-accordion collapsed"
|
||||
data-hostname="{{ host.name }}"
|
||||
@@ -673,6 +673,19 @@
|
||||
text = `${count} filesystem${count !== 1 ? 's' : ''}`;
|
||||
break;
|
||||
}
|
||||
case 'zfs_monitor': {
|
||||
const pools = d.pools || {};
|
||||
const names = Object.keys(pools);
|
||||
if (names.length === 0) { text = 'No pools'; break; }
|
||||
const degraded = names.filter(n => pools[n].health && pools[n].health !== 'ONLINE');
|
||||
text = names.map(n => {
|
||||
const p = pools[n];
|
||||
const cap = p.capacity != null ? ` ${p.capacity.toFixed(0)}%` : '';
|
||||
return `${n}${cap}`;
|
||||
}).join(' · ');
|
||||
if (degraded.length) text += ` ⚠ ${degraded.map(n => pools[n].health).join(',')}`;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
text = 'Loaded';
|
||||
}
|
||||
@@ -694,6 +707,7 @@
|
||||
case 'memory_monitor': html = renderMemoryTable(cached.data); break;
|
||||
case 'disk_monitor': html = renderDiskTables(cached.data); break;
|
||||
case 'network_monitor':html = renderNetworkTables(cached.data); break;
|
||||
case 'zfs_monitor': html = renderZfsTables(cached.data); break;
|
||||
case 'nagios_runner': html = renderNagiosTable(cached.data); break;
|
||||
case 'filesystem_info':html = renderFilesystemTable(cached.data); break;
|
||||
default: html = renderGenericTable(cached.data); break;
|
||||
@@ -1024,6 +1038,66 @@
|
||||
return html;
|
||||
}
|
||||
|
||||
/**
 * Render the zfs_monitor payload as HTML.
 *
 * Always produces a pool summary table (health, size, used, free, capacity,
 * fragmentation, dedup). When at least one pool reports `read_ops`, a second
 * table with I/O counters is rendered beside it.
 *
 * @param {Object} d  Plugin payload; `d.pools` maps pool name to pool stats.
 * @returns {string} HTML markup, or a "no data" placeholder when there are
 *                   no pools.
 */
function renderZfsTables(d) {
    const pools = d.pools || {};
    const names = Object.keys(pools);
    if (names.length === 0) return '<div class="no-data">No ZFS pools found</div>';

    // Coerce a payload value to a finite number, or null when it is absent or
    // not numeric. Calling .toFixed() directly on a numeric string would throw.
    const num = v => {
        if (v == null || v === '') return null;
        const n = Number(v);
        return Number.isFinite(n) ? n : null;
    };
    // Format a numeric payload value with a fixed number of decimals, or an
    // em dash when no usable value is present.
    const fixed = (v, digits, suffix) => {
        const n = num(v);
        return n === null ? '—' : n.toFixed(digits) + suffix;
    };

    const healthCls = h => {
        if (!h || h === 'ONLINE') return 'pct-ok';
        if (h === 'DEGRADED') return 'pct-warn';
        return 'pct-crit';
    };

    let pt = '<table class="data-table"><thead><tr>'
        + '<th>Pool</th><th>Health</th>'
        + '<th class="num">Size</th><th class="num">Used</th>'
        + '<th class="num">Free</th><th class="num">Cap %</th>'
        + '<th class="num">Frag %</th><th class="num">Dedup</th>'
        + '</tr></thead><tbody>';
    for (const name of names) {
        // Tolerate a null pool entry instead of throwing on property access.
        const p = pools[name] || {};
        const capNum = num(p.capacity);
        const cap = capNum === null ? 0 : capNum;
        const capCls = cap > 90 ? 'pct-crit' : cap > 75 ? 'pct-warn' : 'pct-ok';
        pt += `<tr>
            <td class="iface-name">${escHtml(name)}</td>
            <td class="${healthCls(p.health)}">${escHtml(p.health || '—')}</td>
            <td class="num">${formatBytes(p.size || 0)}</td>
            <td class="num">${formatBytes(p.alloc || 0)}</td>
            <td class="num">${formatBytes(p.free || 0)}</td>
            <td class="num ${capCls}">${cap.toFixed(1)}%</td>
            <td class="num">${fixed(p.frag, 1, '%')}</td>
            <td class="num">${fixed(p.dedup, 2, 'x')}</td>
        </tr>`;
    }
    pt += '</tbody></table>';

    // Only show the I/O table when at least one pool reports counters.
    const hasIo = names.some(n => (pools[n] || {}).read_ops != null);
    if (!hasIo) return pt;

    let iot = '<table class="data-table"><thead><tr>'
        + '<th>Pool</th>'
        + '<th class="num">Read ops</th><th class="num">Write ops</th>'
        + '<th class="num">Read BW</th><th class="num">Write BW</th>'
        + '</tr></thead><tbody>';
    for (const name of names) {
        const p = pools[name] || {};
        iot += `<tr>
            <td class="iface-name">${escHtml(name)}</td>
            <td class="num">${p.read_ops != null ? p.read_ops.toLocaleString() : '—'}</td>
            <td class="num">${p.write_ops != null ? p.write_ops.toLocaleString() : '—'}</td>
            <td class="num">${p.read_bw != null ? formatBytes(p.read_bw) : '—'}</td>
            <td class="num">${p.write_bw != null ? formatBytes(p.write_bw) : '—'}</td>
        </tr>`;
    }
    iot += '</tbody></table>';

    return `<div class="flex-tables">
        <div><div class="table-section-label">Pools</div>${pt}</div>
        <div><div class="table-section-label">I/O (cumulative)</div>${iot}</div>
    </div>`;
}
|
||||
|
||||
function renderGenericTable(d) {
|
||||
let html = '<table class="data-table"><thead><tr><th>Field</th><th>Value</th></tr></thead><tbody>';
|
||||
for (const [k, v] of Object.entries(d)) {
|
||||
@@ -1082,6 +1156,19 @@
|
||||
// ── Init ────────────────────────────────────────────────────────────────
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
// If a host fragment is in the URL, expand and scroll to that host;
|
||||
// otherwise expand the first host as before.
|
||||
const hash = window.location.hash;
|
||||
if (hash) {
|
||||
const hostname = decodeURIComponent(hash.slice(1));
|
||||
const card = document.querySelector(`.host-card[data-hostname="${hostname}"]`);
|
||||
if (card) {
|
||||
card.classList.remove('collapsed');
|
||||
fetchHostGlance(hostname);
|
||||
setTimeout(() => card.scrollIntoView({ behavior: 'smooth', block: 'start' }), 150);
|
||||
return;
|
||||
}
|
||||
}
|
||||
const first = document.querySelector('.host-card');
|
||||
if (first) {
|
||||
first.classList.remove('collapsed');
|
||||
|
||||
@@ -254,6 +254,17 @@
|
||||
.host-bool { text-align: center; }
|
||||
.dot-yes { color: #2e7d32; font-size: 1.1em; }
|
||||
.dot-no { color: #ddd; font-size: 1.1em; }
|
||||
|
||||
/* ---- Threshold configurations ---- */
|
||||
.thresh-config { margin: 12px 20px 20px; }
|
||||
.thresh-config-name {
|
||||
font-weight: 600; font-size: 0.9em; color: #1a237e;
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
.mini-table .warn { color: #e65100; font-weight: 600; }
|
||||
.mini-table .crit { color: #b71c1c; font-weight: 600; }
|
||||
.mini-table .dim { color: #aaa; }
|
||||
.mini-table .metric-path { font-family: monospace; font-size: 0.88em; }
|
||||
</style>
|
||||
|
||||
<body>
|
||||
@@ -394,6 +405,49 @@
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{# ---- Threshold configurations section ---- #}
|
||||
{% if section.id == "thresholds" %}
|
||||
{% if section.threshold_configs %}
|
||||
{% for tc in section.threshold_configs %}
|
||||
<div class="thresh-config">
|
||||
<div class="thresh-config-name">{{ tc.name }}</div>
|
||||
{% if tc.metrics %}
|
||||
<div style="overflow-x: auto;">
|
||||
<table class="mini-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Metric</th>
|
||||
<th>Op</th>
|
||||
<th>Warning</th>
|
||||
<th>Critical</th>
|
||||
<th>Hysteresis</th>
|
||||
<th>Count</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for m in tc.metrics %}
|
||||
<tr {% if not m.enabled %} style="opacity:0.45"{% endif %}>
|
||||
<td class="metric-path">{{ m.metric }}</td>
|
||||
<td>{{ m.operator or '>' }}</td>
|
||||
<td class="warn">{{ m.warning if m.warning is not none else '—' }}</td>
|
||||
<td class="crit">{{ m.critical if m.critical is not none else '—' }}</td>
|
||||
<td class="dim">{{ '%.0f%%' % (m.hysteresis * 100) if m.hysteresis else '—' }}</td>
|
||||
<td class="dim">{{ m.count }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% else %}
|
||||
<span class="val-empty">No thresholds defined.</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="field-row"><span class="val-empty">No threshold configurations defined.</span></div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
{# ---- Hosts section ---- #}
|
||||
{% if section.id == "hosts" %}
|
||||
{% if section.hosts %}
|
||||
|
||||
+37
-5
@@ -803,6 +803,29 @@ class ThresholdChecker:
|
||||
self._check_pending_or_renotify(host_name, alert_state, metric_path, value, threshold, None)
|
||||
|
||||
return None
|
||||
def _find_threshold(
    self, thresholds: Dict[str, "ThresholdConfig"], metric_path: str
) -> Optional["ThresholdConfig"]:
    """Return the threshold for *metric_path*, falling back to suffix matches.

    Allows generic thresholds like ``ping_monitor.rtt_avg`` to match
    fully-qualified paths like ``ping_monitor.8_8_8_8_rtt_avg``.
    The exact match is always tried first; then successive leading
    underscore-delimited segments are stripped from the field name until
    a match is found or no segments remain. Because segments are stripped
    one at a time, the longest matching suffix wins.

    Args:
        thresholds: Mapping of metric path to its threshold configuration.
        metric_path: ``"<plugin>.<field>"`` path of the metric being checked.

    Returns:
        The matching threshold, or ``None`` when nothing matches or
        *metric_path* contains no ``"."`` separator.
    """
    if metric_path in thresholds:
        return thresholds[metric_path]

    plugin, sep, field_name = metric_path.partition(".")
    if not sep:
        # No plugin prefix, so there is nothing to strip segments from.
        return None

    parts = field_name.split("_")
    # Start at 1: index 0 would rebuild the exact path that already missed.
    for start in range(1, len(parts)):
        candidate = plugin + "." + "_".join(parts[start:])
        if candidate in thresholds:
            return thresholds[candidate]
    return None
|
||||
|
||||
def check_plugin_data(
|
||||
self,
|
||||
host_name: str,
|
||||
@@ -831,11 +854,10 @@ class ThresholdChecker:
|
||||
for metric_name, value in data.items():
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
|
||||
if metric_path not in thresholds:
|
||||
threshold = self._find_threshold(thresholds, metric_path)
|
||||
if threshold is None:
|
||||
continue
|
||||
|
||||
threshold = thresholds[metric_path]
|
||||
|
||||
# Get or create alert state
|
||||
if metric_path not in alert_states:
|
||||
alert_states[metric_path] = AlertState(metric_path)
|
||||
@@ -1114,7 +1136,9 @@ class ThresholdChecker:
|
||||
) -> None:
|
||||
"""Handle a state-change transition with grace-period logic.
|
||||
|
||||
Transitioning INTO alert: defers the notification for grace_seconds.
|
||||
Transitioning INTO alert (worsening): defers the notification for grace_seconds.
|
||||
De-escalation within alert states (e.g. CRITICAL→WARNING): no new notification;
|
||||
the metric is still alerting so no RECOVER was sent.
|
||||
Transitioning TO OK:
|
||||
- Still in grace window (pending_since set): suppresses both the alert
|
||||
and the recovery — the spike never warranted a page.
|
||||
@@ -1134,12 +1158,20 @@ class ThresholdChecker:
|
||||
alert_state.pending_since = None
|
||||
else:
|
||||
self._send_notification(host_name, lvl, message, metric_path, old_level, new_level, value)
|
||||
else:
|
||||
elif new_level.value > old_level.value:
|
||||
# Worsening (OK→WARNING, OK→CRITICAL, WARNING→CRITICAL): schedule notification.
|
||||
alert_state.pending_since = time.time()
|
||||
logger.debug(
|
||||
"Alert deferred (%.0fs grace): %s on %s = %s",
|
||||
self.grace_seconds, metric_path, host_name, value,
|
||||
)
|
||||
else:
|
||||
# De-escalation within alert states (e.g. CRITICAL→WARNING): metric is still
|
||||
# alerting but did not recover, so no new notification.
|
||||
logger.debug(
|
||||
"De-escalation %s→%s for %s on %s, no notification",
|
||||
old_level.name, new_level.name, metric_path, host_name,
|
||||
)
|
||||
|
||||
def _check_pending_or_renotify(
|
||||
self,
|
||||
|
||||
+10
-6
@@ -440,14 +440,18 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if not newh:
|
||||
if d == 0 or lasts == "unknown":
|
||||
m = "%s is up" % (conn.afam)
|
||||
elif d < 4:
|
||||
# Transient blip (likely client restart) — skip log and notification
|
||||
m = None
|
||||
else:
|
||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
|
||||
))
|
||||
if m:
|
||||
eventlog(uname, "RECOVER", m)
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
uname,
|
||||
notify_mod.Notification(title=f"[RECOVER] {uname}", body=m, level="RECOVER"),
|
||||
))
|
||||
|
||||
if boot or newh:
|
||||
host.upcount = host.doesack
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.1.13"
|
||||
version = "5.1.16"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
+1
-1
@@ -41,7 +41,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# updated by scripts/bumpminor.sh
|
||||
__version__ = "5.1.13"
|
||||
__version__ = "5.1.16"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol (mirrors hbd/common/proto.py)
|
||||
|
||||
Reference in New Issue
Block a user