Compare commits

..

6 Commits

Author SHA1 Message Date
andreas d7b368c7c6 version 5.1.19
Release / release (push) Successful in 5s
2026-05-04 12:10:01 -04:00
andreas e790663f9f feat: exclude ZFS ARC from memory_percent; add uptime_seconds to cpu_monitor
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in
MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size
from /proc/spl/kstat/zfs/arcstats and add it to available memory before
computing memory_percent and memory_used. No-op on systems without ZFS.

cpu_monitor: report uptime_seconds via psutil.boot_time() (full client)
and /proc/uptime (hbc_mini).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 12:09:58 -04:00
andreas 475319e248 fix: send boot/shutdown on first open connection, not blindly first in list
Replace break-after-first-iteration with next(c for c in connections if
c.transport) so the message goes to the first connection that actually
has an open transport. Falls back to connections[0] if none are open
yet (sendto will attempt reopen), avoiding silent message loss when the
leading connection is still connecting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 09:59:30 -04:00
andreas ca5ef384a8 version 5.1.18
Release / release (push) Successful in 5s
2026-05-04 09:13:18 -04:00
andreas c93dbdc0f4 fix: settings thresholds show correct per-config metrics; misc hbc fixes
Settings page: pass threshold_checker to http.start so the Threshold
Configurations section has data. Use threshold_checker's already-parsed
ThresholdConfig objects instead of re-parsing the raw nested YAML.
Named (non-default) configs now display only their explicit overrides
via threshold_raw_configs, not the full merged set with defaults.

hbc/hbc_mini: send boot and shutdown messages on first connection only
to avoid duplicate packets when multiple servers are configured.
Replace print("Daemonizing...") with logging.info so output goes to
syslog in daemon mode.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 09:12:39 -04:00
andreas 3a546a1e5c feat: fetch-based Update/Delete buttons with toast notification on Host Overview
Replace href navigation with fetch() so the server response is captured
and displayed in a slide-up toast at the bottom of the page. Delete also
removes the host card from the DOM on success without a page reload.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 08:16:54 -04:00
10 changed files with 178 additions and 64 deletions
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
"""
__all__ = ["__version__"]
__version__ = "5.1.17"
__version__ = "5.1.19"
+7 -10
View File
@@ -463,16 +463,13 @@ async def cleanup(connections: List[AsyncConnection]):
logger = logging.getLogger("hbc.cleanup")
logger.info("Cleaning up connections")
for conn in connections:
target = next((c for c in connections if c.transport), connections[0] if connections else None)
if target:
try:
msg = {
"shutdown": 1,
"acks": conn.ackcount
}
await conn.sendto(msg)
await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception as e:
logger.error(f"Error sending shutdown: {e}")
for conn in connections:
conn.close()
# Give messages time to send
@@ -538,8 +535,8 @@ async def async_main(args, config):
boot_msg["msg"] = args.message
boot_msg["acks"] = 0
for conn in connections:
await conn.sendto(boot_msg)
target = next((c for c in connections if c.transport), connections[0])
await target.sendto(boot_msg)
if args.message and not args.daemon:
# Message-only mode
@@ -739,7 +736,7 @@ def main(argv=None):
# Daemonize if requested
if args.daemon:
print("Daemonizing...")
logging.info("Daemonizing...")
daemonize()
_reconfigure_logging_for_daemon(log_level)
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
+7
View File
@@ -119,6 +119,13 @@ class CPUMonitorPlugin(MonitorPlugin):
except Exception as e:
self.logger.debug(f"Could not get CPU times: {e}")
# Uptime in seconds
try:
import time
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
except Exception as e:
self.logger.debug(f"Could not get uptime: {e}")
self.logger.debug(
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
)
+31 -3
View File
@@ -14,6 +14,24 @@ except ImportError:
from hbd.client.plugin import MonitorPlugin
def _zfs_arc_bytes(path: str = "/proc/spl/kstat/zfs/arcstats") -> int:
    """Return the current ZFS ARC size in bytes, or 0 if it cannot be read.

    ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
    kernel (it is not in SReclaimable), so it would otherwise be counted as
    used memory.

    Args:
        path: Location of the arcstats kstat file. Defaults to the standard
            Linux path; overridable so the parser can be pointed at another
            file (for example a fixture).

    Returns:
        The integer in the third whitespace-separated column of the first
        line whose first column is ``size``. Returns 0 when the file cannot
        be opened or read, when that value is not an integer, or when no
        such line exists.
    """
    try:
        with open(path) as fh:
            for line in fh:
                parts = line.split()
                # Only rows with at least three columns whose first column
                # is "size" are considered; the value is taken from the third.
                if len(parts) >= 3 and parts[0] == "size":
                    return int(parts[2])
    except (OSError, ValueError):
        # Best effort: a missing or unreadable file, or a malformed value,
        # simply means "no ARC to account for".
        pass
    return 0
logger = logging.getLogger(__name__)
@@ -101,11 +119,21 @@ class MemoryMonitorPlugin(MonitorPlugin):
# Virtual (physical) memory statistics
vmem = psutil.virtual_memory()
# psutil's available already counts reclaimable page cache / file buffers
# as available memory (it uses MemAvailable on Linux). Add ZFS ARC on top
# because the kernel does not include it in SReclaimable / MemAvailable
# even though it is reclaimable.
arc_bytes = _zfs_arc_bytes()
available = min(vmem.available + arc_bytes, vmem.total)
used = vmem.total - available
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
metrics['memory_total'] = vmem.total
metrics['memory_available'] = vmem.available
metrics['memory_used'] = vmem.used
metrics['memory_available'] = available
metrics['memory_used'] = used
metrics['memory_free'] = vmem.free
metrics['memory_percent'] = vmem.percent
metrics['memory_percent'] = percent
# Platform-specific memory details
if hasattr(vmem, 'active'):
+1 -1
View File
@@ -890,7 +890,7 @@ async def start(
tmpl = env.get_template("settings.html")
body = tmpl.render(
title="Settings - Heartbeat",
sections=settings_mod.get_settings_sections(config),
sections=settings_mod.get_settings_sections(config, threshold_checker=threshold_checker),
current_user=current_user.to_dict() if current_user else None,
active_page="settings",
)
+1
View File
@@ -255,6 +255,7 @@ async def _run_async(config, config_path=None):
config=config,
hbdclass=hbdclass,
tcss=None,
threshold_checker=threshold_checker,
verbose=config.get("verbose", False),
get_now=lambda: time.time(),
VER="",
+30 -37
View File
@@ -88,7 +88,7 @@ def _sanitize_channel(name, cfg):
# Public API
# ---------------------------------------------------------------------------
def get_settings_sections(config: dict) -> list:
def get_settings_sections(config: dict, threshold_checker=None) -> list:
"""Return ordered list of setting sections for the settings page.
Each section:
@@ -182,46 +182,39 @@ def get_settings_sections(config: dict) -> list:
})
# ---- Threshold configurations -----------------------------------------
def _parse_metric_row(metric_path, metric_cfg):
if not isinstance(metric_cfg, dict):
return None
def _tc_to_row(tc):
return {
"metric": metric_path,
"operator": metric_cfg.get("operator", ">"),
"warning": metric_cfg.get("warning"),
"critical": metric_cfg.get("critical"),
"hysteresis": metric_cfg.get("hysteresis"),
"count": metric_cfg.get("count", 1),
"enabled": metric_cfg.get("enabled", True),
"metric": tc.metric_path,
"operator": tc.operator.value,
"warning": tc.warning,
"critical": tc.critical,
"hysteresis": tc.hysteresis,
"count": tc.count,
"enabled": tc.enabled,
}
threshold_config_list = []
raw_tconfigs = config.get("threshold_configs") or {}
if raw_tconfigs:
for cfg_name, cfg_data in sorted(raw_tconfigs.items()):
if not isinstance(cfg_data, dict):
continue
metrics = [
r for r in (
_parse_metric_row(mp, mc)
for mp, mc in (cfg_data.get("thresholds") or {}).items()
) if r
]
threshold_config_list.append({
"name": cfg_name,
"metrics": sorted(metrics, key=lambda m: m["metric"]),
})
elif config.get("thresholds"):
metrics = [
r for r in (
_parse_metric_row(mp, mc)
for mp, mc in config["thresholds"].items()
) if r
]
threshold_config_list.append({
"name": "default",
"metrics": sorted(metrics, key=lambda m: m["metric"]),
})
if threshold_checker is not None:
if threshold_checker.threshold_configs:
for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
# For the default config use the merged effective set;
# for named overrides use only the explicitly defined metrics
# (threshold_raw_configs) so inherited defaults are not repeated.
if cfg_name == "default":
display_metrics = cfg_metrics
else:
display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
metrics = sorted(
[_tc_to_row(tc) for tc in display_metrics.values()],
key=lambda m: m["metric"],
)
threshold_config_list.append({"name": cfg_name, "metrics": metrics})
elif threshold_checker.thresholds:
metrics = sorted(
[_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
key=lambda m: m["metric"],
)
threshold_config_list.append({"name": "default", "metrics": metrics})
# ---- Hosts summary ----------------------------------------------------
hosts_list = []
+72 -6
View File
@@ -152,6 +152,31 @@
}
.host-action-btn.delete-btn:hover { background: #ffcdd2; }
/* ── Action result toast ───────────────────────────────────── */
#action-toast {
position: fixed;
bottom: 24px;
left: 50%;
transform: translateX(-50%) translateY(20px);
background: #323232;
color: #fff;
padding: 12px 22px;
border-radius: 6px;
font-size: 0.9em;
max-width: 480px;
text-align: center;
opacity: 0;
pointer-events: none;
transition: opacity 0.25s, transform 0.25s;
z-index: 9000;
white-space: pre-wrap;
}
#action-toast.show {
opacity: 1;
transform: translateX(-50%) translateY(0);
}
#action-toast.error { background: #c62828; }
/* ── Host body ──────────────────────────────────────────────── */
.host-body {
@@ -401,12 +426,10 @@
{% endif %}
<span class="os-label" id="os-label-{{ host.name }}"></span>
{% if host.is_owner %}
<a class="host-action-btn update-btn"
href="/u?h={{ host.name }}"
onclick="event.stopPropagation()">Update</a>
<a class="host-action-btn delete-btn"
href="/d?h={{ host.name }}"
onclick="event.stopPropagation(); return confirm('Delete host {{ host.name }}?')">Delete</a>
<button class="host-action-btn update-btn"
onclick="event.stopPropagation(); hostAction(this, '/u?h={{ host.name }}')">Update</button>
<button class="host-action-btn delete-btn"
onclick="event.stopPropagation(); hostDelete(this, '{{ host.name }}')">Delete</button>
{% endif %}
</div>
</div>
@@ -1204,6 +1227,49 @@
fetchHostGlance(first.dataset.hostname);
}
});
// ── Host action helpers ──────────────────────────────────────
// Handle of the pending auto-hide timeout for the toast, if any.
let _toastTimer = null;

/**
 * Show a message in the #action-toast element and hide it again after 4 s.
 *
 * @param {string} msg      Text to display (assigned via textContent).
 * @param {*}      isError  Truthy to apply the "error" class, falsy to remove it.
 */
function showToast(msg, isError) {
  const toast = document.getElementById('action-toast');
  const classes = toast.classList;

  toast.textContent = msg;
  classes.toggle('error', Boolean(isError));
  classes.add('show');

  // Restart the hide countdown so a new message gets its full display time.
  clearTimeout(_toastTimer);
  _toastTimer = setTimeout(function () {
    classes.remove('show');
  }, 4000);
}
/**
 * Fetch `url` and show the response body in the toast.
 *
 * The button is disabled while the request is in flight and always
 * re-enabled afterwards. A non-OK response is shown with error styling;
 * a thrown error is reported as "Request failed: <message>".
 *
 * @param {HTMLButtonElement} btn  Button that triggered the action.
 * @param {string}            url  URL to request with fetch().
 */
async function hostAction(btn, url) {
  btn.disabled = true;
  try {
    const response = await fetch(url);
    const body = await response.text();
    const failed = !response.ok;
    showToast(body, failed);
  } catch (err) {
    showToast('Request failed: ' + err.message, true);
  } finally {
    btn.disabled = false;
  }
}
/**
 * Confirm with the user, request deletion of a host, and report the result.
 *
 * Requests '/d?h=<hostname>' with fetch(), shows the response body in the
 * toast, and on an OK response removes the matching
 * `.host-card[data-hostname=...]` element from the DOM if one is found.
 *
 * The button is re-enabled whenever the host was NOT deleted, whether the
 * server returned a non-OK status or the request threw, so the user can
 * retry. After a successful delete it stays disabled.
 *
 * @param {HTMLButtonElement} btn       Button that triggered the delete.
 * @param {string}            hostname  Name of the host to delete.
 */
async function hostDelete(btn, hostname) {
  if (!confirm('Delete host ' + hostname + '?')) return;
  btn.disabled = true;
  let deleted = false;
  try {
    const res = await fetch('/d?h=' + encodeURIComponent(hostname));
    const text = await res.text();
    showToast(text, !res.ok);
    deleted = res.ok;
    if (deleted) {
      // Escape the hostname so characters such as quotes or backslashes
      // cannot break (or alter) the attribute selector.
      const card = document.querySelector(
        '.host-card[data-hostname="' + CSS.escape(hostname) + '"]'
      );
      if (card) card.remove();
    }
  } catch (e) {
    showToast('Request failed: ' + e.message, true);
  } finally {
    // Previously the button was only re-enabled on a thrown error, leaving
    // it stuck disabled after a non-OK server response.
    if (!deleted) btn.disabled = false;
  }
}
</script>
<div id="action-toast"></div>
</body>
</html>
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "hbd"
version = "5.1.17"
version = "5.1.19"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md"
requires-python = ">=3.11"
+27 -5
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh
__version__ = "5.1.17"
__version__ = "5.1.19"
# ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py)
@@ -487,6 +487,12 @@ class CPUMonitorPlugin(MonitorPlugin):
except Exception:
pass
try:
with open("/proc/uptime") as fh:
data["uptime_seconds"] = int(float(fh.read().split()[0]))
except Exception:
pass
return data
@@ -535,6 +541,20 @@ class MemoryMonitorPlugin(MonitorPlugin):
total = mi.get("MemTotal", 0)
avail = mi.get("MemAvailable", mi.get("MemFree", 0))
free = mi.get("MemFree", 0)
# ZFS ARC is reclaimable but not included in MemAvailable; add it.
arc_kb = 0
try:
with open("/proc/spl/kstat/zfs/arcstats") as _f:
for _line in _f:
_p = _line.split()
if len(_p) >= 3 and _p[0] == "size":
arc_kb = int(_p[2]) // 1024
break
except (OSError, ValueError):
pass
avail = min(avail + arc_kb, total)
used = total - avail
data: Dict[str, Any] = {
"memory_total": total * 1024,
@@ -1052,8 +1072,8 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
if args.message:
bmsg["service"] = "service"
bmsg["msg"] = args.message
for c in connections:
await c.sendto(bmsg)
target = next((c for c in connections if c._transport), connections[0])
await target.sendto(bmsg)
if args.message and not args.daemon:
await asyncio.sleep(0.3)
for c in connections:
@@ -1085,11 +1105,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
pass
log.info("shutting down")
for conn in connections:
target = next((c for c in connections if c._transport), connections[0] if connections else None)
if target:
try:
await conn.sendto({"shutdown": 1, "acks": conn.ackcount})
await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception:
pass
for conn in connections:
conn.close()
await asyncio.sleep(0.3)
for plugin in plugins: