Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6282077fe0 | |||
| ddd857173b | |||
| f46f725d12 | |||
| 3da6976b53 | |||
| 3a0c48e32b | |||
| cf6e19704f | |||
| b0addd7c67 | |||
| 32680d34a4 | |||
| a7abdcb5c5 | |||
| 7bab15ae52 | |||
| e0443293e9 | |||
| 39670f4e63 | |||
| 2e88ee2269 | |||
| 2ef7d473c3 | |||
| 862a9cdea0 | |||
| 9351938b15 | |||
| b6ef2fe065 | |||
| d5d2f066b3 | |||
| d9563392c3 | |||
| 5f090b9d96 | |||
| 3cc1d92eb4 | |||
| 2ddba203df |
@@ -2,6 +2,18 @@
|
||||
|
||||
All notable changes to this project are documented here, organized by release.
|
||||
|
||||
## [5.3.10]
|
||||
|
||||
### Added
|
||||
- clear stale plugin data and persist OAuth users to config
|
||||
- auto-scale CPU history graph Y axis
|
||||
- add CPU usage history graph to CPU Monitor section
|
||||
|
||||
### Fixed
|
||||
- remove bak file in bumpminor.sh
|
||||
|
||||
---
|
||||
|
||||
## [5.3.9]
|
||||
|
||||
### Added
|
||||
|
||||
@@ -20,7 +20,7 @@ A lightweight UDP-based host monitoring system. Monitored hosts run a client (`h
|
||||
└────────────────────┘ └────────────────────────────┘
|
||||
```
|
||||
|
||||
**Package:** `hbd` v5.3.9
|
||||
**Package:** `hbd` v5.3.10
|
||||
**Python:** 3.11+
|
||||
|
||||
### Subpackages
|
||||
|
||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
||||
"""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
__version__ = "5.3.9"
|
||||
__version__ = "5.3.10"
|
||||
|
||||
@@ -127,15 +127,15 @@ class FilesystemInfoPlugin(InfoPlugin):
|
||||
try:
|
||||
# Maximum filename length
|
||||
max_name = os.pathconf(partition.mountpoint, 'PC_NAME_MAX')
|
||||
if max_name:
|
||||
if max_name is not None:
|
||||
fs_info['maxfile'] = max_name
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
# Maximum path length
|
||||
max_path = os.pathconf(partition.mountpoint, 'PC_PATH_MAX')
|
||||
if max_path:
|
||||
if max_path is not None:
|
||||
fs_info['maxpath'] = max_path
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
+33
-1
@@ -297,6 +297,8 @@ class Host:
|
||||
self.plugin_retention = 100 # Keep last N samples per plugin
|
||||
# Alert state tracking: {metric_path: AlertState}
|
||||
self.alert_states = {}
|
||||
# Stale-data timers: {plugin_name: asyncio.TimerHandle}
|
||||
self.plugin_timers = {}
|
||||
# User access control
|
||||
self.owner: str | None = None # username of owner
|
||||
self.managers: list = [] # usernames with manager role
|
||||
@@ -365,7 +367,7 @@ class Host:
|
||||
def stateinfo(self):
|
||||
ddict = {}
|
||||
for d in self.__dict__:
|
||||
if d in ["alert_states", "plugin_data"]:
|
||||
if d in ["alert_states", "plugin_data", "plugin_timers"]:
|
||||
continue
|
||||
if d == "connections":
|
||||
cl = []
|
||||
@@ -483,6 +485,8 @@ class Host:
|
||||
self.managers = []
|
||||
if not hasattr(self, "monitors"):
|
||||
self.monitors = []
|
||||
if not hasattr(self, "plugin_timers"):
|
||||
self.plugin_timers = {}
|
||||
|
||||
pass
|
||||
|
||||
@@ -542,6 +546,34 @@ class Host:
|
||||
"""
|
||||
return self.plugin_data
|
||||
|
||||
def reset_plugin_timer(self, plugin_name, timeout_seconds, callback):
    """Reset the stale-data timer for a plugin.

    Any timer still pending for ``plugin_name`` is cancelled, then a new one
    is scheduled on the running event loop.  If no new PLG data arrives within
    ``timeout_seconds``, ``callback(host, plugin_name)`` is awaited so the
    caller can clear history and alerts.

    Args:
        plugin_name: Key under which the timer is tracked in
            ``self.plugin_timers``.
        timeout_seconds: Delay in seconds before ``callback`` fires.
        callback: Callable returning an awaitable; invoked as
            ``callback(self, plugin_name)``.

    When there is no running event loop the previous timer is still
    cancelled, but no new timer is scheduled (best effort, never raises).
    """
    import asyncio

    existing = self.plugin_timers.get(plugin_name)
    if existing and not existing.cancelled():
        existing.cancel()

    # get_running_loop() instead of get_event_loop(): the latter is deprecated
    # outside a running loop and may hand back a loop that is never run, which
    # would make the stale timer silently never fire.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return

    # The event loop only keeps weak references to tasks.  Hold a strong
    # reference (reachable through the timer handle's callback) until the
    # task finishes so it cannot be garbage-collected mid-flight.
    pending = set()

    async def _fire():
        await callback(self, plugin_name)

    def _on_timeout():
        task = loop.create_task(_fire())
        pending.add(task)
        task.add_done_callback(pending.discard)

    self.plugin_timers[plugin_name] = loop.call_later(timeout_seconds, _on_timeout)

def cancel_plugin_timer(self, plugin_name):
    """Cancel the stale timer for a plugin, if any.

    The handle is removed from ``self.plugin_timers`` whether or not it was
    still pending; an unknown ``plugin_name`` is a no-op.
    """
    handle = self.plugin_timers.pop(plugin_name, None)
    if handle and not handle.cancelled():
        handle.cancel()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# User-role helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
+42
-9
@@ -424,7 +424,7 @@ async def start(
|
||||
# Resolve templates directory relative to the hbd package
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
host = config.get("hb_host", "localhost")
|
||||
extra_scripts = config.get("http_extra_scripts", "")
|
||||
host = request.host # includes port if non-standard
|
||||
@@ -597,8 +597,6 @@ async def start(
|
||||
all_alerts = []
|
||||
|
||||
for hostname, host in hbdclass.Host.hosts.items():
|
||||
if not host.watched:
|
||||
continue
|
||||
if not _can_view_host(user, host):
|
||||
continue
|
||||
if threshold_checker:
|
||||
@@ -692,7 +690,7 @@ async def start(
|
||||
current_user, _ = _require_auth_redirect(request)
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
|
||||
# Collect all hosts with plugin data (filtered by visibility)
|
||||
hosts_with_plugins = []
|
||||
@@ -723,7 +721,7 @@ async def start(
|
||||
current_user, _ = _require_auth_redirect(request)
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
|
||||
tmpl = env.get_template("alerts.html")
|
||||
body = tmpl.render(
|
||||
@@ -780,6 +778,8 @@ async def start(
|
||||
token = users_mod.create_session(username)
|
||||
eventlog("hbd", "INFO", f"Login: {username} via password")
|
||||
redirect_to = request.rel_url.query.get("next", "/")
|
||||
if not redirect_to.startswith("/"):
|
||||
redirect_to = "/"
|
||||
resp = web.HTTPFound(redirect_to)
|
||||
resp.set_cookie(
|
||||
SESSION_COOKIE,
|
||||
@@ -891,6 +891,13 @@ async def start(
|
||||
if not target_user.avatar_is_local():
|
||||
return web.Response(status=404, text="No local avatar configured")
|
||||
path = target_user.avatar
|
||||
avatar_dir = config.get("avatar_dir") or (
|
||||
os.path.dirname(os.path.realpath(_config_path)) if _config_path else None
|
||||
)
|
||||
if not avatar_dir:
|
||||
return web.Response(status=403, text="Local avatars not configured")
|
||||
if not os.path.realpath(path).startswith(os.path.realpath(avatar_dir) + os.sep):
|
||||
return web.Response(status=403, text="Forbidden")
|
||||
if not os.path.isfile(path):
|
||||
return web.Response(status=404, text="Avatar file not found")
|
||||
# Infer content-type from extension
|
||||
@@ -994,7 +1001,7 @@ async def start(
|
||||
current_user, _ = _require_auth_redirect(request)
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
|
||||
# Build host access summary for this user.
|
||||
# Merge live hosts with config-only hosts (not yet seen) so the profile
|
||||
@@ -1078,7 +1085,7 @@ async def start(
|
||||
current_user, _ = _require_auth_redirect(request)
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
from hbd import __version__ as hbd_version
|
||||
|
||||
uptime_secs = int(time.time() - _start_epoch)
|
||||
@@ -1122,7 +1129,7 @@ async def start(
|
||||
raise web.HTTPForbidden(reason="Admin access required")
|
||||
pkg_dir = os.path.dirname(__file__)
|
||||
templates_dir = config.get("templates_dir", os.path.join(pkg_dir, "templates"))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir))
|
||||
env = jinja2.Environment(loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
|
||||
tmpl = env.get_template("settings.html")
|
||||
settings_data = settings_mod.get_settings_data(config, threshold_checker=threshold_checker)
|
||||
body = tmpl.render(
|
||||
@@ -1182,6 +1189,23 @@ async def start(
|
||||
profile["full_name"],
|
||||
profile["avatar_url"],
|
||||
)
|
||||
# Persist new OAuth users to the config file so they survive restarts.
|
||||
# Only write when the user isn't already in the config's users section.
|
||||
if _config_path and not (config.get("users") or {}).get(user.username):
|
||||
try:
|
||||
disk_data = configio_mod.read_roundtrip(_config_path)
|
||||
if not disk_data.get("users"):
|
||||
disk_data["users"] = {}
|
||||
disk_data["users"][user.username] = {
|
||||
k: v for k, v in [
|
||||
("full_name", user.full_name),
|
||||
("avatar", user.avatar),
|
||||
] if v
|
||||
}
|
||||
configio_mod.write_config(_config_path, disk_data)
|
||||
logger.info("Persisted OAuth user %r to config", user.username)
|
||||
except Exception as exc:
|
||||
logger.warning("Failed to persist OAuth user %r to config: %s", user.username, exc)
|
||||
session_token = users_mod.create_session(user.username)
|
||||
eventlog("hbd", "INFO", f"Login: {user.username} via {provider.type}")
|
||||
resp = web.HTTPFound("/")
|
||||
@@ -1644,7 +1668,16 @@ async def start(
|
||||
if "full_name" in body:
|
||||
user_entry["full_name"] = str(body["full_name"])
|
||||
if "avatar" in body:
|
||||
user_entry["avatar"] = str(body["avatar"])
|
||||
avatar_val = str(body["avatar"])
|
||||
if avatar_val.startswith("/"):
|
||||
avatar_dir = config.get("avatar_dir") or (
|
||||
os.path.dirname(os.path.realpath(_config_path)) if _config_path else None
|
||||
)
|
||||
if not avatar_dir:
|
||||
return web.json_response({"error": "Local avatars not configured"}, status=400)
|
||||
if not os.path.realpath(avatar_val).startswith(os.path.realpath(avatar_dir) + os.sep):
|
||||
return web.json_response({"error": "Avatar path outside allowed directory"}, status=400)
|
||||
user_entry["avatar"] = avatar_val
|
||||
if "notification_channels" in body:
|
||||
visible = _visible_channels_for_user(user)
|
||||
user_entry["notification_channels"] = [
|
||||
|
||||
@@ -140,7 +140,9 @@ def _send_pushover(channel_cfg: dict, notif: Notification) -> bool:
|
||||
if not token or not user:
|
||||
logger.warning("pushover: missing token or user")
|
||||
return False
|
||||
params: dict = {"token": token, "user": user, "title": notif.title, "message": notif.body}
|
||||
body = "%s: %s" % (notif.title, notif.body)
|
||||
title = ""
|
||||
params: dict = {"token": token, "user": user, "title": title, "message": body}
|
||||
if channel_cfg.get("sound"):
|
||||
params["sound"] = channel_cfg["sound"]
|
||||
if notif.url:
|
||||
|
||||
@@ -321,9 +321,15 @@
|
||||
var c = 0;
|
||||
var HBD_VERSION = "{{ hbd_version }}";
|
||||
|
||||
function escHtml(s) {
|
||||
var d = document.createElement('div');
|
||||
d.textContent = String(s);
|
||||
return d.innerHTML;
|
||||
}
|
||||
|
||||
function hostNameHtml(data) {
|
||||
var rawName = data.raw_name || data.name.replace(/<[^>]+>/g, '').replace('*', '').trim();
|
||||
var nameHtml = data.name;
|
||||
var nameHtml = escHtml(data.name);
|
||||
if (!data.hbc_version || data.hbc_version !== HBD_VERSION) {
|
||||
nameHtml += ' 🥀';
|
||||
}
|
||||
@@ -410,11 +416,11 @@
|
||||
c_critical.innerHTML = "";
|
||||
}
|
||||
|
||||
c_ipv4addr.innerHTML = data.connections[0].addr;
|
||||
c_ipv4state.innerHTML = data.connections[0].state;
|
||||
c_ipv4addr.innerHTML = escHtml(data.connections[0].addr);
|
||||
c_ipv4state.innerHTML = escHtml(data.connections[0].state);
|
||||
if (data.connections.length > 1) {
|
||||
c_ipv6addr.innerHTML = data.connections[1].addr;
|
||||
c_ipv6state.innerHTML = data.connections[1].state;
|
||||
c_ipv6addr.innerHTML = escHtml(data.connections[1].addr);
|
||||
c_ipv6state.innerHTML = escHtml(data.connections[1].state);
|
||||
}
|
||||
var table = document.getElementById("ntablebody"); // find table to append to
|
||||
table.appendChild(row); // append row to table
|
||||
@@ -477,7 +483,7 @@
|
||||
|
||||
for (var i = 0; i < data.connections.length; i++) {
|
||||
// Offset by 2 for the warning/critical count columns
|
||||
name_idx[data.name].cells[3 + i * 4].innerHTML = data.connections[i].addr;
|
||||
name_idx[data.name].cells[3 + i * 4].innerHTML = escHtml(data.connections[i].addr);
|
||||
name_idx[data.name].cells[6 + i * 4].innerHTML = formatTS(
|
||||
data.connections[i].statetime
|
||||
);
|
||||
@@ -497,7 +503,7 @@
|
||||
state = '<span class="state-overdue">overdue</span>';
|
||||
latency = "-";
|
||||
} else {
|
||||
state = "<b>" + data.connections[i].state + "</b>";
|
||||
state = "<b>" + escHtml(data.connections[i].state) + "</b>";
|
||||
latency = "-";
|
||||
}
|
||||
}
|
||||
@@ -558,12 +564,12 @@
|
||||
+ ' ' + _p(_d.getHours()) + ':' + _p(_d.getMinutes()) + ':' + _p(_d.getSeconds());
|
||||
var lvl = (msg.level || "INFO").toLowerCase();
|
||||
var hostVal = msg.host || '';
|
||||
var html = '<div class="log-entry log-' + lvl + '" data-level="' + lvl + '" data-host="' + hostVal.replace(/"/g, '"') + '">';
|
||||
var html = '<div class="log-entry log-' + escHtml(lvl) + '" data-level="' + escHtml(lvl) + '" data-host="' + escHtml(hostVal) + '">';
|
||||
html += '<span class="log-ts">' + ts_str + '</span>';
|
||||
html += '<span class="log-level">' + (msg.level || "") + '</span>';
|
||||
if (msg.host) html += '<span class="log-host">' + msg.host + '</span>';
|
||||
if (msg.service) html += '<span class="log-service">' + msg.service + '</span>';
|
||||
html += '<span class="log-msg">' + msg.message + '</span>';
|
||||
html += '<span class="log-level">' + escHtml(msg.level || "") + '</span>';
|
||||
if (msg.host) html += '<span class="log-host">' + escHtml(msg.host) + '</span>';
|
||||
if (msg.service) html += '<span class="log-service">' + escHtml(msg.service) + '</span>';
|
||||
html += '<span class="log-msg">' + escHtml(msg.message) + '</span>';
|
||||
html += '</div>';
|
||||
msgs.insertAdjacentHTML(state.history ? "beforeend" : "afterbegin", html);
|
||||
applyLogFilters();
|
||||
@@ -621,7 +627,7 @@
|
||||
<tbody id="ntablebody">
|
||||
{% for host in hosts %}
|
||||
<tr class="{% if host.alert_critical_unacked > 0 or host.alert_critical_acked > 0 %}row-critical{% elif host.alert_warning_unacked > 0 or host.alert_warning_acked > 0 %}row-warning{% endif %}">
|
||||
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.raw_name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||
<td data-name="{{ host.name }}"><a class="host-link" href="/plugins#{{ host.name | urlencode }}">{{ host.name }}{% if not host.hbc_version or host.hbc_version != hbd_version %} 🥀{% endif %}</a></td>
|
||||
<td style="text-align: center; color: #ff9800; font-weight: bold;">
|
||||
{%- set warning_unacked = host.alert_warning_unacked -%}
|
||||
{%- set warning_acked = host.alert_warning_acked -%}
|
||||
|
||||
@@ -914,7 +914,7 @@
|
||||
let html = '';
|
||||
switch (pluginName) {
|
||||
case 'os_info': html = renderOsInfoTable(cached.data); break;
|
||||
case 'cpu_monitor': html = renderCpuTable(cached.data); break;
|
||||
case 'cpu_monitor': html = renderCpuTable(hostname, cached.data); break;
|
||||
case 'memory_monitor': html = renderMemoryTable(cached.data); break;
|
||||
case 'disk_monitor': html = renderDiskTables(cached.data); break;
|
||||
case 'network_monitor':html = renderNetworkTables(cached.data); break;
|
||||
@@ -926,6 +926,10 @@
|
||||
|
||||
html += `<div class="timestamp">Last updated: ${new Date(cached.timestamp * 1000).toLocaleString()}</div>`;
|
||||
body.innerHTML = html;
|
||||
|
||||
if (pluginName === 'cpu_monitor') {
|
||||
fetchCpuHistory(hostname).then(samples => renderCpuChart(hostname, samples)).catch(() => {});
|
||||
}
|
||||
}
|
||||
|
||||
// ── Per-plugin renderers ────────────────────────────────────────────────
|
||||
@@ -948,7 +952,92 @@
|
||||
return html;
|
||||
}
|
||||
|
||||
function renderCpuTable(d) {
|
||||
async function fetchCpuHistory(hostname) {
|
||||
const r = await fetch(`/api/0/hosts/${encodeURIComponent(hostname)}/plugins/cpu_monitor?limit=100`);
|
||||
if (!r.ok) return [];
|
||||
const json = await r.json();
|
||||
return json.samples || [];
|
||||
}
|
||||
|
||||
/**
 * Draw the CPU usage history for one host as an inline SVG sparkline.
 *
 * The chart is written into the element with id `cpu-chart-<hostname>`.
 * Nothing is drawn when that element is missing or `samples` is empty; the
 * element is hidden when fewer than two samples carry a `cpu_percent` value.
 *
 * @param {string} hostname  Host whose chart container should be filled.
 * @param {Array}  samples   Objects with `timestamp` (seconds) and
 *                           `data.cpu_percent`.
 */
function renderCpuChart(hostname, samples) {
  const el = document.getElementById(`cpu-chart-${hostname}`);
  if (!el || !samples.length) return;

  // Skip samples without a data payload instead of throwing on them.
  const pts = samples
    .filter(s => s.data && s.data.cpu_percent != null)
    .map(s => ({ t: s.timestamp, v: s.data.cpu_percent }));
  if (pts.length < 2) { el.style.display = 'none'; return; }

  const W = 600, H = 80, PAD = { top: 6, right: 8, bottom: 18, left: 28 };
  const cW = W - PAD.left - PAD.right;
  const cH = H - PAD.top - PAD.bottom;

  const tMin = pts[0].t, tMax = pts[pts.length - 1].t;
  const tRange = tMax - tMin || 1;
  const x = t => PAD.left + ((t - tMin) / tRange) * cW;

  // Auto-scale Y axis with 10% padding, clamped to [0, 100]
  const vMin = Math.min(...pts.map(p => p.v));
  const vMax = Math.max(...pts.map(p => p.v));
  const vRange = vMax - vMin || 1;
  const vPad = Math.max(vRange * 0.1, 1);
  const yLow = Math.max(0, vMin - vPad);
  const yHigh = Math.min(100, vMax + vPad);
  const yRange = yHigh - yLow || 1;
  const y = v => PAD.top + cH - ((v - yLow) / yRange) * cH;

  // Build polyline points and filled area path
  const linePoints = pts.map(p => `${x(p.t).toFixed(1)},${y(p.v).toFixed(1)}`).join(' ');
  const areaPath = `M${x(pts[0].t).toFixed(1)},${(PAD.top + cH).toFixed(1)} ` +
    pts.map(p => `L${x(p.t).toFixed(1)},${y(p.v).toFixed(1)}`).join(' ') +
    ` L${x(pts[pts.length - 1].t).toFixed(1)},${(PAD.top + cH).toFixed(1)} Z`;

  // Color based on latest absolute CPU %
  const latest = pts[pts.length - 1].v;
  const strokeColor = latest > 90 ? '#e53935' : latest > 70 ? '#fb8c00' : '#43a047';
  const fillColor = latest > 90 ? '#ffcdd2' : latest > 70 ? '#ffe0b2' : '#c8e6c9';

  // Compute nice tick step for ~3-5 grid lines
  const rawStep = yRange / 4;
  const mag = Math.pow(10, Math.floor(Math.log10(rawStep || 1)));
  const niceStep = [1, 2, 5, 10].map(f => f * mag).find(s => yRange / s <= 5) || mag * 10;
  const tickStart = Math.ceil(yLow / niceStep) * niceStep;
  let gridLines = '';
  for (let v = tickStart; v <= yHigh + 0.001; v += niceStep) {
    const yy = y(v).toFixed(1);
    const label = Number.isInteger(v) ? v : v.toFixed(1);
    gridLines += `<line x1="${PAD.left}" y1="${yy}" x2="${PAD.left + cW}" y2="${yy}" stroke="#e0e0e0" stroke-width="1"/>`;
    gridLines += `<text x="${(PAD.left - 3).toFixed(1)}" y="${yy}" text-anchor="end" dominant-baseline="middle" font-size="8" fill="#999">${label}</text>`;
  }

  // X-axis time labels
  const fmt = ts => {
    const d = new Date(ts * 1000);
    return d.toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' });
  };
  const xLabels = `
    <text x="${PAD.left}" y="${H - 2}" text-anchor="start" font-size="8" fill="#999">${fmt(pts[0].t)}</text>
    <text x="${PAD.left + cW}" y="${H - 2}" text-anchor="end" font-size="8" fill="#999">${fmt(pts[pts.length - 1].t)}</text>`;

  // The hostname ends up inside markup assigned to innerHTML, so reduce it to
  // characters that are safe in an attribute value and in a url(#...) reference.
  // Two hostnames mapping to the same id is harmless: every clip rectangle
  // has identical geometry.
  const clipId = 'cpu-clip-' + String(hostname).replace(/[^A-Za-z0-9_-]/g, '_');

  el.innerHTML = `<svg viewBox="0 0 ${W} ${H}" preserveAspectRatio="none"
    style="width:100%;height:${H}px;display:block;">
    <defs>
      <clipPath id="${clipId}">
        <rect x="${PAD.left}" y="${PAD.top}" width="${cW}" height="${cH}"/>
      </clipPath>
    </defs>
    ${gridLines}
    <line x1="${PAD.left}" y1="${PAD.top}" x2="${PAD.left}" y2="${PAD.top + cH}" stroke="#ccc" stroke-width="1"/>
    <line x1="${PAD.left}" y1="${PAD.top + cH}" x2="${PAD.left + cW}" y2="${PAD.top + cH}" stroke="#ccc" stroke-width="1"/>
    <g clip-path="url(#${clipId})">
      <path d="${areaPath}" fill="${fillColor}" opacity="0.6"/>
      <polyline points="${linePoints}" fill="none" stroke="${strokeColor}" stroke-width="1.5" stroke-linejoin="round"/>
    </g>
    ${xLabels}
  </svg>`;
}
|
||||
|
||||
function renderCpuTable(hostname, d) {
|
||||
const KEYS = [
|
||||
['cpu_percent', 'CPU Usage', 'bar'],
|
||||
['load_1min', 'Load (1 min)', 'num'],
|
||||
@@ -966,7 +1055,8 @@
|
||||
];
|
||||
|
||||
const handled = new Set(KEYS.map(r => r[0]));
|
||||
let html = '<table class="data-table"><thead><tr><th>Metric</th><th>Value</th></tr></thead><tbody>';
|
||||
let html = `<div id="cpu-chart-${hostname}" style="margin-bottom:8px;"></div>`;
|
||||
html += '<table class="data-table"><thead><tr><th>Metric</th><th>Value</th></tr></thead><tbody>';
|
||||
for (const [k, label, fmt] of KEYS) {
|
||||
if (!(k in d)) continue;
|
||||
const v = d[k];
|
||||
|
||||
@@ -1554,6 +1554,10 @@ class ThresholdChecker:
|
||||
configured = self.get_thresholds_for_host(hostname)
|
||||
stale = []
|
||||
for mp in host.alert_states:
|
||||
# connectivity.* and rtt are managed by the connection state
|
||||
# machine, not by threshold config — never purge them.
|
||||
if mp == "rtt" or mp.startswith("connectivity"):
|
||||
continue
|
||||
if self._find_threshold(configured, mp)[0] is not None:
|
||||
continue
|
||||
# Also match wildcard pool/partition thresholds (e.g. "zfs_monitor.*.status"
|
||||
|
||||
+63
-2
@@ -232,6 +232,23 @@ def _make_timer_callbacks(uname, host, ctx):
|
||||
return on_overdue, on_unknown
|
||||
|
||||
|
||||
def _make_plugin_stale_callback(uname, ctx):
    """Build the coroutine function fired when a plugin's data goes stale.

    The returned ``on_plugin_stale(host, plugin_name)`` drops the plugin's
    stored samples, deletes every alert state whose key starts with
    ``"<plugin_name>."``, records an INFO event for ``uname`` and, when
    ``ctx`` provides ``msg_to_websockets``, publishes a ``plugin_stale``
    message followed by the host's refreshed state.
    """
    broadcast = ctx.get("msg_to_websockets")

    async def on_plugin_stale(host, plugin_name):
        host.plugin_data.pop(plugin_name, None)

        # Snapshot the matching keys first: the dict is mutated while purging.
        prefix = f"{plugin_name}."
        doomed = [key for key in host.alert_states if key.startswith(prefix)]
        for key in doomed:
            del host.alert_states[key]

        eventlog(uname, "INFO", f"plugin data stale: {plugin_name}")

        if not broadcast:
            return
        broadcast("plugin_stale", {"host": uname, "plugin": plugin_name})
        broadcast("host", host.stateinfo())

    return on_plugin_stale
|
||||
|
||||
|
||||
def restore_connection_timers(hbdclass, ctx):
|
||||
"""Restore overdue timers for all loaded connections after a pickle restore.
|
||||
|
||||
@@ -249,10 +266,15 @@ def restore_connection_timers(hbdclass, ctx):
|
||||
for afam, conn in list(host.connections.items()):
|
||||
state = conn.getstate()
|
||||
if state == hbdclass.Connection.DOWN:
|
||||
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||
continue
|
||||
|
||||
on_overdue, on_unknown = _make_timer_callbacks(uname, host, ctx)
|
||||
|
||||
if state == hbdclass.Connection.UNKNOWN:
|
||||
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||
continue
|
||||
|
||||
if state == hbdclass.Connection.UP and interval > 0:
|
||||
elapsed = now - conn.lastbeat
|
||||
# Give hosts one full (interval + grace) of extra time on startup
|
||||
@@ -283,6 +305,10 @@ def restore_connection_timers(hbdclass, ctx):
|
||||
"Restored OVERDUE timer %s/%s: %.0fs remaining",
|
||||
uname, afam, remaining,
|
||||
)
|
||||
# Ensure the connectivity alert is set — it may be missing if
|
||||
# hbd was shut down before the on_overdue callback had a chance
|
||||
# to record it.
|
||||
_set_connectivity_alert(host, afam, "CRITICAL")
|
||||
restored += 1
|
||||
|
||||
logger.info("Restored timers for %d connection(s)", restored)
|
||||
@@ -372,12 +398,33 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if k not in ("ID", "plugin", "id", "name")}
|
||||
# Store plugin data with timestamp
|
||||
host.add_plugin_data(plugin_name, plugin_data, timestamp=now)
|
||||
# Reset stale timer using the observed send interval for this plugin.
|
||||
# We need two samples to know the real interval; on the first sample
|
||||
# we cancel any leftover timer but don't set a new one, to avoid
|
||||
# false-stale firing for slow plugins (e.g. nagios_runner at 300 s).
|
||||
history = host.plugin_data.get(plugin_name, [])
|
||||
if len(history) >= 2:
|
||||
plugin_interval = max(history[-1][0] - history[-2][0], 1)
|
||||
host.reset_plugin_timer(plugin_name, plugin_interval * 3,
|
||||
_make_plugin_stale_callback(uname, ctx))
|
||||
# Remove alert states for metrics present in the previous sample
|
||||
# but absent now (e.g. a nagios check removed from configuration).
|
||||
prev_keys = set(history[-2][1].keys())
|
||||
curr_keys = set(plugin_data.keys())
|
||||
for metric_name in prev_keys - curr_keys:
|
||||
metric_path = f"{plugin_name}.{metric_name}"
|
||||
if host.alert_states.pop(metric_path, None) is not None:
|
||||
eventlog(uname, "INFO", f"stale check removed: {metric_path}")
|
||||
if (prev_keys - curr_keys) and msg_to_websockets:
|
||||
msg_to_websockets("host", host.stateinfo())
|
||||
else:
|
||||
host.cancel_plugin_timer(plugin_name)
|
||||
|
||||
# If os_info reports an owner and none is configured server-side, apply it
|
||||
if plugin_name == "os_info":
|
||||
config_owner = config_mod.get_host_access(cfg, uname).get("owner")
|
||||
default_owner = config_mod.get_default_owner(cfg)
|
||||
inferred_owner = plugin_data.get("owner", config_owner or default_owner)
|
||||
inferred_owner = config_owner or plugin_data.get("owner") or default_owner
|
||||
host.owner = inferred_owner
|
||||
logger.info(f"owner for {uname} is {host.owner}")
|
||||
if DEBUG > 1:
|
||||
@@ -432,6 +479,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
boot = msg.get("boot", 0)
|
||||
|
||||
if boot:
|
||||
# hbc was started with a -b flag
|
||||
eventlog(uname, "INFO", "booted")
|
||||
if host.watched:
|
||||
asyncio.create_task(notify_mod.send_notification(
|
||||
@@ -439,11 +487,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
notify_mod.Notification(title=f"[INFO] {uname}", body=f"{host.name} booted", level="INFO"),
|
||||
))
|
||||
if message:
|
||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||
eventlog(uname, "INFO", message, service=service)
|
||||
|
||||
if conn.getstate() != hbdcls.Connection.UP:
|
||||
# Transition to UP and log/notify if appropriate
|
||||
lasts = conn.state
|
||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||
# On reboot, pre-boot plugin data and derived alerts are stale.
|
||||
# Cancel all plugin timers and wipe plugin state so timers restart
|
||||
# cleanly from the first two post-boot samples.
|
||||
for pname in list(host.plugin_timers):
|
||||
host.cancel_plugin_timer(pname)
|
||||
host.plugin_data.clear()
|
||||
stale_plugin_keys = [
|
||||
k for k in host.alert_states
|
||||
if k not in ("rtt",) and not k.startswith("connectivity.")
|
||||
]
|
||||
for k in stale_plugin_keys:
|
||||
del host.alert_states[k]
|
||||
# Clear connectivity alert now that the host is back up
|
||||
_set_connectivity_alert(host, conn.afam, "OK")
|
||||
# Don't log/notify RECOVER for a brand-new host seen for the first time —
|
||||
|
||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "hbd"
|
||||
version = "5.3.9"
|
||||
version = "5.3.10"
|
||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
|
||||
@@ -29,3 +29,4 @@ git push --tags
|
||||
|
||||
rm hbd/__init__.py.bak
|
||||
rm scripts/hbc_mini.py.bak
|
||||
rm README.md.bak
|
||||
@@ -789,7 +789,7 @@ static void plugin_cpu_monitor(conn_t *c, const config_t *cfg) {
|
||||
* Plugin: memory_monitor
|
||||
* Linux: /proc/meminfo
|
||||
* FreeBSD: sysctl vm.stats.vm.*
|
||||
* NetBSD: sysctl vm.uvmexp (struct uvmexp)
|
||||
* NetBSD: sysctl vm.uvmexp (struct uvmexp_sysctl)
|
||||
* ============================================================ */
|
||||
|
||||
/* emit the common kvdict fields and send */
|
||||
@@ -896,9 +896,9 @@ static void plugin_memory_monitor(conn_t *c, const config_t *cfg) {
|
||||
|
||||
static void plugin_memory_monitor(conn_t *c, const config_t *cfg) {
|
||||
(void)cfg;
|
||||
struct uvmexp uvm;
|
||||
struct uvmexp_sysctl uvm;
|
||||
size_t len = sizeof(uvm);
|
||||
int mib[2] = {CTL_VM, VM_UVMEXP};
|
||||
int mib[2] = {CTL_VM, VM_UVMEXP2};
|
||||
if (sysctl(mib, 2, &uvm, &len, NULL, 0) != 0) return;
|
||||
|
||||
long long ps = uvm.pagesize;
|
||||
|
||||
+1
-1
@@ -41,7 +41,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
# updated by scripts/bumpminor.sh
|
||||
__version__ = "5.3.9"
|
||||
__version__ = "5.3.10"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Protocol (mirrors hbd/common/proto.py)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,49 @@
|
||||
# PyInstaller spec for hbc_windows.exe
|
||||
# Build with: pyinstaller hbc_windows.spec
|
||||
#
|
||||
# Requirements (on Windows):
|
||||
# pip install pyinstaller
|
||||
|
||||
block_cipher = None
|
||||
|
||||
a = Analysis(
|
||||
['hbc_windows.py'],
|
||||
pathex=[],
|
||||
binaries=[],
|
||||
datas=[],
|
||||
hiddenimports=[],
|
||||
hookspath=[],
|
||||
hooksconfig={},
|
||||
runtime_hooks=[],
|
||||
excludes=['tkinter', 'unittest', 'email', 'html', 'http', 'urllib', 'xml'],
|
||||
win_no_prefer_redirects=False,
|
||||
win_private_assemblies=False,
|
||||
cipher=block_cipher,
|
||||
noarchive=False,
|
||||
)
|
||||
|
||||
pyz = PYZ(a.pure, a.zlib_archive, cipher=block_cipher)
|
||||
|
||||
exe = EXE(
|
||||
pyz,
|
||||
a.scripts,
|
||||
a.binaries,
|
||||
a.zipfiles,
|
||||
a.datas,
|
||||
[],
|
||||
name='hbc_windows',
|
||||
debug=False,
|
||||
bootloader_ignore_signals=False,
|
||||
strip=False,
|
||||
upx=False,
|
||||
upx_exclude=[],
|
||||
runtime_tmpdir=None,
|
||||
console=True,
|
||||
disable_windowed_traceback=False,
|
||||
argv_emulation=False,
|
||||
target_arch=None,
|
||||
codesign_identity=None,
|
||||
entitlements_file=None,
|
||||
icon=None,
|
||||
version=None,
|
||||
)
|
||||
@@ -0,0 +1,126 @@
|
||||
#Requires -RunAsAdministrator
|
||||
<#
|
||||
.SYNOPSIS
|
||||
Install hbc_windows.exe as a Windows Service using NSSM.
|
||||
|
||||
.DESCRIPTION
|
||||
Installs the HeartBeat Client as a Windows Service that starts automatically.
|
||||
Requires NSSM (Non-Sucking Service Manager) in PATH or alongside this script.
|
||||
Requires hbc_windows.exe built via: pyinstaller hbc_windows.spec
|
||||
|
||||
.PARAMETER Server
|
||||
HBD server hostname or IP address (required).
|
||||
|
||||
.PARAMETER ExePath
|
||||
Path to hbc_windows.exe. Defaults to the directory containing this script.
|
||||
|
||||
.PARAMETER ServiceName
|
||||
Windows service name. Default: heartbeat-client
|
||||
|
||||
.PARAMETER ConfigFile
|
||||
Path to hbc.json config file. Optional.
|
||||
|
||||
.PARAMETER LogFile
|
||||
Path to log file. Default: C:\ProgramData\heartbeat\hbc.log
|
||||
|
||||
.PARAMETER Interval
|
||||
Heartbeat interval in seconds. Default: 10
|
||||
|
||||
.EXAMPLE
|
||||
.\install_hbc_windows.ps1 -Server hbd.example.com
|
||||
.\install_hbc_windows.ps1 -Server hbd.example.com -ConfigFile C:\ProgramData\heartbeat\hbc.json
|
||||
#>
|
||||
|
||||
param(
|
||||
[Parameter(Mandatory = $true)]
|
||||
[string]$Server,
|
||||
|
||||
[string]$ExePath = "",
|
||||
[string]$ServiceName = "heartbeat-client",
|
||||
[string]$ConfigFile = "",
|
||||
[string]$LogFile = "C:\ProgramData\heartbeat\hbc.log",
|
||||
[int]$Interval = 10
|
||||
)
|
||||
|
||||
Set-StrictMode -Version Latest
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
# Locate hbc_windows.exe
|
||||
if ($ExePath -eq "") {
|
||||
$ExePath = Join-Path $PSScriptRoot "hbc_windows.exe"
|
||||
}
|
||||
if (-not (Test-Path $ExePath)) {
|
||||
Write-Error "hbc_windows.exe not found at: $ExePath`nBuild it first with: pyinstaller hbc_windows.spec"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Locate NSSM
|
||||
$nssm = Get-Command nssm -ErrorAction SilentlyContinue
|
||||
if (-not $nssm) {
|
||||
$nssmLocal = Join-Path $PSScriptRoot "nssm.exe"
|
||||
if (Test-Path $nssmLocal) {
|
||||
$nssm = $nssmLocal
|
||||
} else {
|
||||
Write-Error "nssm.exe not found in PATH or alongside this script.`nDownload from https://nssm.cc/download"
|
||||
exit 1
|
||||
}
|
||||
} else {
|
||||
$nssm = $nssm.Source
|
||||
}
|
||||
|
||||
# Build argument list
|
||||
$args_list = "--daemon $Server"
|
||||
if ($ConfigFile -ne "") {
|
||||
$args_list = "--daemon -c `"$ConfigFile`" $Server"
|
||||
}
|
||||
if ($LogFile -ne "") {
|
||||
$args_list = "$args_list --log-file `"$LogFile`""
|
||||
}
|
||||
|
||||
# Create data directory
|
||||
$dataDir = "C:\ProgramData\heartbeat"
|
||||
if (-not (Test-Path $dataDir)) {
|
||||
New-Item -ItemType Directory -Path $dataDir | Out-Null
|
||||
Write-Host "Created $dataDir"
|
||||
}
|
||||
|
||||
# Remove existing service if present
|
||||
$existing = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue
|
||||
if ($existing) {
|
||||
Write-Host "Removing existing service '$ServiceName'..."
|
||||
& $nssm stop $ServiceName 2>$null
|
||||
& $nssm remove $ServiceName confirm
|
||||
}
|
||||
|
||||
# Install service
|
||||
Write-Host "Installing service '$ServiceName'..."
|
||||
& $nssm install $ServiceName $ExePath $args_list
|
||||
if ($LASTEXITCODE -ne 0) {
|
||||
Write-Error "nssm install failed (exit $LASTEXITCODE)"
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Configure service
|
||||
& $nssm set $ServiceName DisplayName "HeartBeat Client"
|
||||
& $nssm set $ServiceName Description "Sends heartbeat and plugin metrics to the HBD monitoring server."
|
||||
& $nssm set $ServiceName Start SERVICE_AUTO_START
|
||||
& $nssm set $ServiceName AppStdout (Join-Path $dataDir "nssm_stdout.log")
|
||||
& $nssm set $ServiceName AppStderr (Join-Path $dataDir "nssm_stderr.log")
|
||||
& $nssm set $ServiceName AppRotateFiles 1
|
||||
& $nssm set $ServiceName AppRotateBytes 5242880
|
||||
|
||||
# Start service
|
||||
Write-Host "Starting service '$ServiceName'..."
|
||||
& $nssm start $ServiceName
|
||||
if ($LASTEXITCODE -ne 0) {
|
||||
Write-Warning "Service installed but failed to start — check logs in $dataDir"
|
||||
} else {
|
||||
Write-Host "Service '$ServiceName' started successfully."
|
||||
Write-Host "Log file: $LogFile"
|
||||
Write-Host ""
|
||||
Write-Host "Useful commands:"
|
||||
Write-Host " nssm status $ServiceName"
|
||||
Write-Host " nssm stop $ServiceName"
|
||||
Write-Host " nssm restart $ServiceName"
|
||||
Write-Host " nssm remove $ServiceName confirm"
|
||||
}
|
||||
Reference in New Issue
Block a user