Compare commits

..

6 Commits

Author SHA1 Message Date
andreas d7b368c7c6 version 5.1.19
Release / release (push) Successful in 5s
2026-05-04 12:10:01 -04:00
andreas e790663f9f feat: exclude ZFS ARC from memory_percent; add uptime_seconds to cpu_monitor
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in
MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size
from /proc/spl/kstat/zfs/arcstats and add it to available memory before
computing memory_percent and memory_used. No-op on systems without ZFS.

cpu_monitor: report uptime_seconds via psutil.boot_time() (full client)
and /proc/uptime (hbc_mini).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 12:09:58 -04:00
andreas 475319e248 fix: send boot/shutdown on first open connection, not blindly first in list
Replace break-after-first-iteration with next(c for c in connections if
c.transport) so the message goes to the first connection that actually
has an open transport. Falls back to connections[0] if none are open
yet (sendto will attempt reopen), avoiding silent message loss when the
leading connection is still connecting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 09:59:30 -04:00
andreas ca5ef384a8 version 5.1.18
Release / release (push) Successful in 5s
2026-05-04 09:13:18 -04:00
andreas c93dbdc0f4 fix: settings thresholds show correct per-config metrics; misc hbc fixes
Settings page: pass threshold_checker to http.start so the Threshold
Configurations section has data. Use threshold_checker's already-parsed
ThresholdConfig objects instead of re-parsing the raw nested YAML.
Named (non-default) configs now display only their explicit overrides
via threshold_raw_configs, not the full merged set with defaults.

hbc/hbc_mini: send boot and shutdown messages on first connection only
to avoid duplicate packets when multiple servers are configured.
Replace print("Daemonizing...") with logging.info so output goes to
syslog in daemon mode.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 09:12:39 -04:00
andreas 3a546a1e5c feat: fetch-based Update/Delete buttons with toast notification on Host Overview
Replace href navigation with fetch() so the server response is captured
and displayed in a slide-up toast at the bottom of the page. Delete also
removes the host card from the DOM on success without a page reload.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 08:16:54 -04:00
10 changed files with 178 additions and 64 deletions
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
""" """
__all__ = ["__version__"] __all__ = ["__version__"]
__version__ = "5.1.17" __version__ = "5.1.19"
+7 -10
View File
@@ -463,16 +463,13 @@ async def cleanup(connections: List[AsyncConnection]):
logger = logging.getLogger("hbc.cleanup") logger = logging.getLogger("hbc.cleanup")
logger.info("Cleaning up connections") logger.info("Cleaning up connections")
for conn in connections: target = next((c for c in connections if c.transport), connections[0] if connections else None)
if target:
try: try:
msg = { await target.sendto({"shutdown": 1, "acks": target.ackcount})
"shutdown": 1,
"acks": conn.ackcount
}
await conn.sendto(msg)
except Exception as e: except Exception as e:
logger.error(f"Error sending shutdown: {e}") logger.error(f"Error sending shutdown: {e}")
for conn in connections:
conn.close() conn.close()
# Give messages time to send # Give messages time to send
@@ -538,8 +535,8 @@ async def async_main(args, config):
boot_msg["msg"] = args.message boot_msg["msg"] = args.message
boot_msg["acks"] = 0 boot_msg["acks"] = 0
for conn in connections: target = next((c for c in connections if c.transport), connections[0])
await conn.sendto(boot_msg) await target.sendto(boot_msg)
if args.message and not args.daemon: if args.message and not args.daemon:
# Message-only mode # Message-only mode
@@ -739,7 +736,7 @@ def main(argv=None):
# Daemonize if requested # Daemonize if requested
if args.daemon: if args.daemon:
print("Daemonizing...") logging.info("Daemonizing...")
daemonize() daemonize()
_reconfigure_logging_for_daemon(log_level) _reconfigure_logging_for_daemon(log_level)
logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}") logging.info(f"hbc starting, sending heartbeat to {', '.join(args.hosts)}")
+7
View File
@@ -118,6 +118,13 @@ class CPUMonitorPlugin(MonitorPlugin):
data["cpu_iowait"] = round(cpu_times.iowait, 1) data["cpu_iowait"] = round(cpu_times.iowait, 1)
except Exception as e: except Exception as e:
self.logger.debug(f"Could not get CPU times: {e}") self.logger.debug(f"Could not get CPU times: {e}")
# Uptime in seconds
try:
import time
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
except Exception as e:
self.logger.debug(f"Could not get uptime: {e}")
self.logger.debug( self.logger.debug(
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage" f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
+31 -3
View File
@@ -14,6 +14,24 @@ except ImportError:
from hbd.client.plugin import MonitorPlugin from hbd.client.plugin import MonitorPlugin
def _zfs_arc_bytes() -> int:
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
kernel (it is not in SReclaimable), so it would otherwise be counted as
used memory.
"""
try:
with open("/proc/spl/kstat/zfs/arcstats") as fh:
for line in fh:
parts = line.split()
if len(parts) >= 3 and parts[0] == "size":
return int(parts[2])
except (OSError, ValueError):
pass
return 0
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -101,11 +119,21 @@ class MemoryMonitorPlugin(MonitorPlugin):
# Virtual (physical) memory statistics # Virtual (physical) memory statistics
vmem = psutil.virtual_memory() vmem = psutil.virtual_memory()
# psutil's available already excludes page cache / file buffers
# (uses MemAvailable on Linux). Add ZFS ARC on top because the kernel
# does not include it in SReclaimable / MemAvailable even though it is
# reclaimable.
arc_bytes = _zfs_arc_bytes()
available = min(vmem.available + arc_bytes, vmem.total)
used = vmem.total - available
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
metrics['memory_total'] = vmem.total metrics['memory_total'] = vmem.total
metrics['memory_available'] = vmem.available metrics['memory_available'] = available
metrics['memory_used'] = vmem.used metrics['memory_used'] = used
metrics['memory_free'] = vmem.free metrics['memory_free'] = vmem.free
metrics['memory_percent'] = vmem.percent metrics['memory_percent'] = percent
# Platform-specific memory details # Platform-specific memory details
if hasattr(vmem, 'active'): if hasattr(vmem, 'active'):
+1 -1
View File
@@ -890,7 +890,7 @@ async def start(
tmpl = env.get_template("settings.html") tmpl = env.get_template("settings.html")
body = tmpl.render( body = tmpl.render(
title="Settings - Heartbeat", title="Settings - Heartbeat",
sections=settings_mod.get_settings_sections(config), sections=settings_mod.get_settings_sections(config, threshold_checker=threshold_checker),
current_user=current_user.to_dict() if current_user else None, current_user=current_user.to_dict() if current_user else None,
active_page="settings", active_page="settings",
) )
+1
View File
@@ -255,6 +255,7 @@ async def _run_async(config, config_path=None):
config=config, config=config,
hbdclass=hbdclass, hbdclass=hbdclass,
tcss=None, tcss=None,
threshold_checker=threshold_checker,
verbose=config.get("verbose", False), verbose=config.get("verbose", False),
get_now=lambda: time.time(), get_now=lambda: time.time(),
VER="", VER="",
+30 -37
View File
@@ -88,7 +88,7 @@ def _sanitize_channel(name, cfg):
# Public API # Public API
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def get_settings_sections(config: dict) -> list: def get_settings_sections(config: dict, threshold_checker=None) -> list:
"""Return ordered list of setting sections for the settings page. """Return ordered list of setting sections for the settings page.
Each section: Each section:
@@ -182,46 +182,39 @@ def get_settings_sections(config: dict) -> list:
}) })
# ---- Threshold configurations ----------------------------------------- # ---- Threshold configurations -----------------------------------------
def _parse_metric_row(metric_path, metric_cfg): def _tc_to_row(tc):
if not isinstance(metric_cfg, dict):
return None
return { return {
"metric": metric_path, "metric": tc.metric_path,
"operator": metric_cfg.get("operator", ">"), "operator": tc.operator.value,
"warning": metric_cfg.get("warning"), "warning": tc.warning,
"critical": metric_cfg.get("critical"), "critical": tc.critical,
"hysteresis": metric_cfg.get("hysteresis"), "hysteresis": tc.hysteresis,
"count": metric_cfg.get("count", 1), "count": tc.count,
"enabled": metric_cfg.get("enabled", True), "enabled": tc.enabled,
} }
threshold_config_list = [] threshold_config_list = []
raw_tconfigs = config.get("threshold_configs") or {} if threshold_checker is not None:
if raw_tconfigs: if threshold_checker.threshold_configs:
for cfg_name, cfg_data in sorted(raw_tconfigs.items()): for cfg_name, cfg_metrics in sorted(threshold_checker.threshold_configs.items()):
if not isinstance(cfg_data, dict): # For the default config use the merged effective set;
continue # for named overrides use only the explicitly defined metrics
metrics = [ # (threshold_raw_configs) so inherited defaults are not repeated.
r for r in ( if cfg_name == "default":
_parse_metric_row(mp, mc) display_metrics = cfg_metrics
for mp, mc in (cfg_data.get("thresholds") or {}).items() else:
) if r display_metrics = threshold_checker.threshold_raw_configs.get(cfg_name, cfg_metrics)
] metrics = sorted(
threshold_config_list.append({ [_tc_to_row(tc) for tc in display_metrics.values()],
"name": cfg_name, key=lambda m: m["metric"],
"metrics": sorted(metrics, key=lambda m: m["metric"]), )
}) threshold_config_list.append({"name": cfg_name, "metrics": metrics})
elif config.get("thresholds"): elif threshold_checker.thresholds:
metrics = [ metrics = sorted(
r for r in ( [_tc_to_row(tc) for tc in threshold_checker.thresholds.values()],
_parse_metric_row(mp, mc) key=lambda m: m["metric"],
for mp, mc in config["thresholds"].items() )
) if r threshold_config_list.append({"name": "default", "metrics": metrics})
]
threshold_config_list.append({
"name": "default",
"metrics": sorted(metrics, key=lambda m: m["metric"]),
})
# ---- Hosts summary ---------------------------------------------------- # ---- Hosts summary ----------------------------------------------------
hosts_list = [] hosts_list = []
+72 -6
View File
@@ -152,6 +152,31 @@
} }
.host-action-btn.delete-btn:hover { background: #ffcdd2; } .host-action-btn.delete-btn:hover { background: #ffcdd2; }
/* ── Action result toast ───────────────────────────────────── */
#action-toast {
position: fixed;
bottom: 24px;
left: 50%;
transform: translateX(-50%) translateY(20px);
background: #323232;
color: #fff;
padding: 12px 22px;
border-radius: 6px;
font-size: 0.9em;
max-width: 480px;
text-align: center;
opacity: 0;
pointer-events: none;
transition: opacity 0.25s, transform 0.25s;
z-index: 9000;
white-space: pre-wrap;
}
#action-toast.show {
opacity: 1;
transform: translateX(-50%) translateY(0);
}
#action-toast.error { background: #c62828; }
/* ── Host body ──────────────────────────────────────────────── */ /* ── Host body ──────────────────────────────────────────────── */
.host-body { .host-body {
@@ -401,12 +426,10 @@
{% endif %} {% endif %}
<span class="os-label" id="os-label-{{ host.name }}"></span> <span class="os-label" id="os-label-{{ host.name }}"></span>
{% if host.is_owner %} {% if host.is_owner %}
<a class="host-action-btn update-btn" <button class="host-action-btn update-btn"
href="/u?h={{ host.name }}" onclick="event.stopPropagation(); hostAction(this, '/u?h={{ host.name }}')">Update</button>
onclick="event.stopPropagation()">Update</a> <button class="host-action-btn delete-btn"
<a class="host-action-btn delete-btn" onclick="event.stopPropagation(); hostDelete(this, '{{ host.name }}')">Delete</button>
href="/d?h={{ host.name }}"
onclick="event.stopPropagation(); return confirm('Delete host {{ host.name }}?')">Delete</a>
{% endif %} {% endif %}
</div> </div>
</div> </div>
@@ -1204,6 +1227,49 @@
fetchHostGlance(first.dataset.hostname); fetchHostGlance(first.dataset.hostname);
} }
}); });
// ── Host action helpers ──────────────────────────────────────
// Handle of the pending auto-hide timeout, shared across showToast calls.
let _toastTimer = null;

/**
 * Show a message in the #action-toast element.
 *
 * The toast is made visible by adding the "show" class and is hidden again
 * after 4000 ms. Calling this while a toast is visible replaces the text and
 * restarts the hide timer.
 *
 * @param {string} msg      Text to display.
 * @param {*}      isError  Truthy to apply the "error" class, falsy to clear it.
 */
function showToast(msg, isError) {
  const toast = document.getElementById('action-toast');
  toast.textContent = msg;
  toast.classList.toggle('error', Boolean(isError));
  toast.classList.add('show');

  // Cancel any earlier hide timer so it cannot cut this message short.
  clearTimeout(_toastTimer);
  _toastTimer = setTimeout(function () {
    toast.classList.remove('show');
  }, 4000);
}
/**
 * Fetch `url` and show the response body in the toast.
 *
 * The button is disabled while the request is in flight and always
 * re-enabled afterwards. A non-OK HTTP status, or a failed request, is shown
 * as an error toast.
 *
 * @param {HTMLButtonElement} btn  Button that triggered the action.
 * @param {string}            url  URL to request.
 */
async function hostAction(btn, url) {
  btn.disabled = true;
  try {
    const response = await fetch(url);
    const body = await response.text();
    const failed = !response.ok;
    showToast(body, failed);
  } catch (err) {
    showToast('Request failed: ' + err.message, true);
  } finally {
    btn.disabled = false;
  }
}
/**
 * Ask for confirmation, then delete a host via `/d?h=<hostname>`.
 *
 * The server's response text is shown in the toast. On an OK response the
 * matching `.host-card[data-hostname=...]` element is removed from the DOM.
 * On a non-OK response or a failed request the button is re-enabled so the
 * user can retry.
 *
 * @param {HTMLButtonElement} btn       Button that triggered the delete.
 * @param {string}            hostname  Name of the host to delete.
 */
async function hostDelete(btn, hostname) {
  if (!confirm('Delete host ' + hostname + '?')) return;
  btn.disabled = true;
  let deleted = false;
  try {
    const res = await fetch('/d?h=' + encodeURIComponent(hostname));
    const text = await res.text();
    showToast(text, !res.ok);
    deleted = res.ok;
    if (deleted) {
      // Escape the hostname so characters such as quotes or backslashes
      // cannot produce an invalid selector (which would throw here).
      const selector = '.host-card[data-hostname="' + CSS.escape(hostname) + '"]';
      const card = document.querySelector(selector);
      if (card) card.remove();
    }
  } catch (e) {
    showToast('Request failed: ' + e.message, true);
  } finally {
    // Re-enable on every unsuccessful outcome, including a non-OK HTTP
    // status, so the button is never left permanently disabled.
    if (!deleted) btn.disabled = false;
  }
}
</script> </script>
<div id="action-toast"></div>
</body> </body>
</html> </html>
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "hbd" name = "hbd"
version = "5.1.17" version = "5.1.19"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)" description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = ">=3.11"
+27 -5
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh # updated by scripts/bumpminor.sh
__version__ = "5.1.17" __version__ = "5.1.19"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py) # Protocol (mirrors hbd/common/proto.py)
@@ -487,6 +487,12 @@ class CPUMonitorPlugin(MonitorPlugin):
except Exception: except Exception:
pass pass
try:
with open("/proc/uptime") as fh:
data["uptime_seconds"] = int(float(fh.read().split()[0]))
except Exception:
pass
return data return data
@@ -535,6 +541,20 @@ class MemoryMonitorPlugin(MonitorPlugin):
total = mi.get("MemTotal", 0) total = mi.get("MemTotal", 0)
avail = mi.get("MemAvailable", mi.get("MemFree", 0)) avail = mi.get("MemAvailable", mi.get("MemFree", 0))
free = mi.get("MemFree", 0) free = mi.get("MemFree", 0)
# ZFS ARC is reclaimable but not included in MemAvailable; add it.
arc_kb = 0
try:
with open("/proc/spl/kstat/zfs/arcstats") as _f:
for _line in _f:
_p = _line.split()
if len(_p) >= 3 and _p[0] == "size":
arc_kb = int(_p[2]) // 1024
break
except (OSError, ValueError):
pass
avail = min(avail + arc_kb, total)
used = total - avail used = total - avail
data: Dict[str, Any] = { data: Dict[str, Any] = {
"memory_total": total * 1024, "memory_total": total * 1024,
@@ -1052,8 +1072,8 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
if args.message: if args.message:
bmsg["service"] = "service" bmsg["service"] = "service"
bmsg["msg"] = args.message bmsg["msg"] = args.message
for c in connections: target = next((c for c in connections if c._transport), connections[0])
await c.sendto(bmsg) await target.sendto(bmsg)
if args.message and not args.daemon: if args.message and not args.daemon:
await asyncio.sleep(0.3) await asyncio.sleep(0.3)
for c in connections: for c in connections:
@@ -1085,11 +1105,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
pass pass
log.info("shutting down") log.info("shutting down")
for conn in connections: target = next((c for c in connections if c._transport), connections[0] if connections else None)
if target:
try: try:
await conn.sendto({"shutdown": 1, "acks": conn.ackcount}) await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception: except Exception:
pass pass
for conn in connections:
conn.close() conn.close()
await asyncio.sleep(0.3) await asyncio.sleep(0.3)
for plugin in plugins: for plugin in plugins: