Compare commits

..

3 Commits

Author SHA1 Message Date
andreas d7b368c7c6 version 5.1.19
Release / release (push) Successful in 5s
2026-05-04 12:10:01 -04:00
andreas e790663f9f feat: exclude ZFS ARC from memory_percent; add uptime_seconds to cpu_monitor
memory_monitor / hbc_mini: ZFS ARC is reclaimable but not reflected in
MemAvailable by the Linux kernel (not in SReclaimable). Read ARC size
from /proc/spl/kstat/zfs/arcstats and add it to available memory before
computing memory_percent and memory_used. No-op on systems without ZFS.

cpu_monitor: report uptime_seconds via psutil.boot_time() (full client)
and /proc/uptime (hbc_mini).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 12:09:58 -04:00
andreas 475319e248 fix: send boot/shutdown on first open connection, not blindly first in list
Replace break-after-first-iteration with next(c for c in connections if
c.transport) so the message goes to the first connection that actually
has an open transport. Falls back to connections[0] if none are open
yet (sendto will attempt reopen), avoiding silent message loss when the
leading connection is still connecting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-04 09:59:30 -04:00
6 changed files with 72 additions and 23 deletions
+1 -1
View File
@@ -14,4 +14,4 @@ Install options:
"""
__all__ = ["__version__"]
__version__ = "5.1.18"
__version__ = "5.1.19"
+6 -11
View File
@@ -463,18 +463,14 @@ async def cleanup(connections: List[AsyncConnection]):
logger = logging.getLogger("hbc.cleanup")
logger.info("Cleaning up connections")
for conn in connections:
target = next((c for c in connections if c.transport), connections[0] if connections else None)
if target:
try:
msg = {
"shutdown": 1,
"acks": conn.ackcount
}
await conn.sendto(msg)
await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception as e:
logger.error(f"Error sending shutdown: {e}")
for conn in connections:
conn.close()
break # Only send shutdown on first connection to avoid duplicates
# Give messages time to send
await asyncio.sleep(0.5)
@@ -539,9 +535,8 @@ async def async_main(args, config):
boot_msg["msg"] = args.message
boot_msg["acks"] = 0
for conn in connections:
await conn.sendto(boot_msg)
break # Only send message on first connection to avoid duplicates
target = next((c for c in connections if c.transport), connections[0])
await target.sendto(boot_msg)
if args.message and not args.daemon:
# Message-only mode
+7
View File
@@ -118,6 +118,13 @@ class CPUMonitorPlugin(MonitorPlugin):
data["cpu_iowait"] = round(cpu_times.iowait, 1)
except Exception as e:
self.logger.debug(f"Could not get CPU times: {e}")
# Uptime in seconds
try:
import time
data["uptime_seconds"] = int(time.time() - self.psutil.boot_time())
except Exception as e:
self.logger.debug(f"Could not get uptime: {e}")
self.logger.debug(
f"Collected CPU metrics: {data.get('cpu_percent', 'N/A')}% usage"
+31 -3
View File
@@ -14,6 +14,24 @@ except ImportError:
from hbd.client.plugin import MonitorPlugin
def _zfs_arc_bytes() -> int:
"""Return current ZFS ARC size in bytes, or 0 if ZFS is not present.
ZFS ARC is reclaimable but is not included in MemAvailable by the Linux
kernel (it is not in SReclaimable), so it would otherwise be counted as
used memory.
"""
try:
with open("/proc/spl/kstat/zfs/arcstats") as fh:
for line in fh:
parts = line.split()
if len(parts) >= 3 and parts[0] == "size":
return int(parts[2])
except (OSError, ValueError):
pass
return 0
logger = logging.getLogger(__name__)
@@ -101,11 +119,21 @@ class MemoryMonitorPlugin(MonitorPlugin):
# Virtual (physical) memory statistics
vmem = psutil.virtual_memory()
# psutil's available already excludes page cache / file buffers
# (uses MemAvailable on Linux). Add ZFS ARC on top because the kernel
# does not include it in SReclaimable / MemAvailable even though it is
# reclaimable.
arc_bytes = _zfs_arc_bytes()
available = min(vmem.available + arc_bytes, vmem.total)
used = vmem.total - available
percent = round(used / vmem.total * 100, 1) if vmem.total else 0.0
metrics['memory_total'] = vmem.total
metrics['memory_available'] = vmem.available
metrics['memory_used'] = vmem.used
metrics['memory_available'] = available
metrics['memory_used'] = used
metrics['memory_free'] = vmem.free
metrics['memory_percent'] = vmem.percent
metrics['memory_percent'] = percent
# Platform-specific memory details
if hasattr(vmem, 'active'):
+1 -1
View File
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "hbd"
version = "5.1.18"
version = "5.1.19"
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
readme = "README.md"
requires-python = ">=3.11"
+26 -7
View File
@@ -41,7 +41,7 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# updated by scripts/bumpminor.sh
__version__ = "5.1.18"
__version__ = "5.1.19"
# ---------------------------------------------------------------------------
# Protocol (mirrors hbd/common/proto.py)
@@ -487,6 +487,12 @@ class CPUMonitorPlugin(MonitorPlugin):
except Exception:
pass
try:
with open("/proc/uptime") as fh:
data["uptime_seconds"] = int(float(fh.read().split()[0]))
except Exception:
pass
return data
@@ -535,6 +541,20 @@ class MemoryMonitorPlugin(MonitorPlugin):
total = mi.get("MemTotal", 0)
avail = mi.get("MemAvailable", mi.get("MemFree", 0))
free = mi.get("MemFree", 0)
# ZFS ARC is reclaimable but not included in MemAvailable; add it.
arc_kb = 0
try:
with open("/proc/spl/kstat/zfs/arcstats") as _f:
for _line in _f:
_p = _line.split()
if len(_p) >= 3 and _p[0] == "size":
arc_kb = int(_p[2]) // 1024
break
except (OSError, ValueError):
pass
avail = min(avail + arc_kb, total)
used = total - avail
data: Dict[str, Any] = {
"memory_total": total * 1024,
@@ -1052,9 +1072,8 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
if args.message:
bmsg["service"] = "service"
bmsg["msg"] = args.message
for c in connections:
await c.sendto(bmsg)
break
target = next((c for c in connections if c._transport), connections[0])
await target.sendto(bmsg)
if args.message and not args.daemon:
await asyncio.sleep(0.3)
for c in connections:
@@ -1086,12 +1105,12 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
pass
log.info("shutting down")
for conn in connections:
target = next((c for c in connections if c._transport), connections[0] if connections else None)
if target:
try:
await conn.sendto({"shutdown": 1, "acks": conn.ackcount})
await target.sendto({"shutdown": 1, "acks": target.ackcount})
except Exception:
pass
break
for conn in connections:
conn.close()
await asyncio.sleep(0.3)