Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 54fbd8d73d | |||
| 7ab17e26e2 | |||
| 28f5fa951c | |||
| 37f1c58969 | |||
| f006077a71 | |||
| d9fc8d632f | |||
| f640574e4f | |||
| 9a19424279 | |||
| ca8ba84e65 |
@@ -507,6 +507,9 @@ hbc --boot your-server.example.com
|
|||||||
|
|
||||||
# Verbose output
|
# Verbose output
|
||||||
hbc -v your-server.example.com
|
hbc -v your-server.example.com
|
||||||
|
|
||||||
|
# Send 'boot' and 'shutdown' messages on start and exit
|
||||||
|
hbc -b your-server.example.com
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also run it via the module entrypoint:
|
You can also run it via the module entrypoint:
|
||||||
|
|||||||
+1
-1
@@ -14,4 +14,4 @@ Install options:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
__all__ = ["__version__"]
|
__all__ = ["__version__"]
|
||||||
__version__ = "5.2.1"
|
__version__ = "5.2.3"
|
||||||
|
|||||||
+9
-5
@@ -21,6 +21,7 @@ from typing import Dict, List, Optional
|
|||||||
# Import protocol and config
|
# Import protocol and config
|
||||||
from .config import load_config
|
from .config import load_config
|
||||||
from ..common.proto import dicttos, stodict
|
from ..common.proto import dicttos, stodict
|
||||||
|
from .. import __version__
|
||||||
|
|
||||||
# Import plugin system
|
# Import plugin system
|
||||||
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
from .plugin import PluginRegistry, PluginLoader, InfoPlugin, MonitorPlugin
|
||||||
@@ -172,9 +173,8 @@ class HeartbeatProtocol(asyncio.DatagramProtocol):
|
|||||||
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
|
self.logger.error(f"Error processing datagram: {e}", exc_info=True)
|
||||||
|
|
||||||
def error_received(self, exc):
|
def error_received(self, exc):
|
||||||
"""Handle protocol errors."""
|
"""Handle protocol errors — close transport so the heartbeat sender retries."""
|
||||||
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — dropping connection")
|
self.logger.warning(f"Protocol error on {self.connection.addr}: {exc} — will retry")
|
||||||
self.connection._dead = True
|
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
|
|
||||||
|
|
||||||
@@ -464,7 +464,7 @@ async def cleanup(connections: List[AsyncConnection]):
|
|||||||
logger.info("Cleaning up connections")
|
logger.info("Cleaning up connections")
|
||||||
|
|
||||||
target = next((c for c in connections if c.transport), connections[0] if connections else None)
|
target = next((c for c in connections if c.transport), connections[0] if connections else None)
|
||||||
if target:
|
if target and send_shutdown:
|
||||||
try:
|
try:
|
||||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -478,7 +478,7 @@ async def cleanup(connections: List[AsyncConnection]):
|
|||||||
|
|
||||||
async def async_main(args, config):
|
async def async_main(args, config):
|
||||||
"""Async main function."""
|
"""Async main function."""
|
||||||
global running, shutdown_event, active_tasks
|
global running, shutdown_event, active_tasks, send_shutdown
|
||||||
|
|
||||||
# Create shutdown event
|
# Create shutdown event
|
||||||
shutdown_event = asyncio.Event()
|
shutdown_event = asyncio.Event()
|
||||||
@@ -495,6 +495,7 @@ async def async_main(args, config):
|
|||||||
hb_port = config.get("hb_port", PORT)
|
hb_port = config.get("hb_port", PORT)
|
||||||
interval = config.get("interval", INTERVAL)
|
interval = config.get("interval", INTERVAL)
|
||||||
|
|
||||||
|
logger.info(f"hbc {__version__} starting on {iam}")
|
||||||
logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
|
logger.info(f"Starting hbc for {iam} -> {hb_hosts}")
|
||||||
logger.info(f"Port: {hb_port}, Interval: {interval}s")
|
logger.info(f"Port: {hb_port}, Interval: {interval}s")
|
||||||
|
|
||||||
@@ -526,10 +527,13 @@ async def async_main(args, config):
|
|||||||
logger.info(f"Created {len(connections)} connections")
|
logger.info(f"Created {len(connections)} connections")
|
||||||
|
|
||||||
# Send boot/message if requested
|
# Send boot/message if requested
|
||||||
|
send_shutdown = False
|
||||||
if args.boot or args.message:
|
if args.boot or args.message:
|
||||||
boot_msg = {}
|
boot_msg = {}
|
||||||
if args.boot:
|
if args.boot:
|
||||||
boot_msg["boot"] = 1
|
boot_msg["boot"] = 1
|
||||||
|
args.boot = False # Clear boot flag so we don't send it again in main loop
|
||||||
|
send_shutdown = True
|
||||||
if args.message:
|
if args.message:
|
||||||
boot_msg["service"] = "service"
|
boot_msg["service"] = "service"
|
||||||
boot_msg["msg"] = args.message
|
boot_msg["msg"] = args.message
|
||||||
|
|||||||
@@ -13,12 +13,8 @@ plugins:
|
|||||||
count: 3 # ICMP packets per ping run (default 3)
|
count: 3 # ICMP packets per ping run (default 3)
|
||||||
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
timeout: 5 # seconds before a host is considered unreachable (default 5)
|
||||||
hosts:
|
hosts:
|
||||||
8.8.8.8:
|
- 8.8.8.8
|
||||||
warning: 20.0 # ms
|
- 192.168.1.1
|
||||||
critical: 100.0 # ms
|
|
||||||
192.168.1.1:
|
|
||||||
warning: 5.0
|
|
||||||
critical: 20.0
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
Reported metrics per host (metric key uses the hostname with dots/colons replaced
|
||||||
|
|||||||
+2
-1
@@ -475,7 +475,8 @@ def run(config, config_path=None):
|
|||||||
if config.get("debug", 0) > 0:
|
if config.get("debug", 0) > 0:
|
||||||
log_level = logging.DEBUG
|
log_level = logging.DEBUG
|
||||||
logging.basicConfig(level=log_level)
|
logging.basicConfig(level=log_level)
|
||||||
logging.getLogger("aiohttp.access").setLevel(logging.DEBUG)
|
if not config.get("debug", 0):
|
||||||
|
logging.getLogger("aiohttp.access").propagate = False
|
||||||
load_pickled_hosts(config, hbdclass)
|
load_pickled_hosts(config, hbdclass)
|
||||||
|
|
||||||
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
notify_mod.initlog(logfile=config.get("logfile", "messages.log"))
|
||||||
|
|||||||
@@ -184,9 +184,9 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.alert-metric {
|
.alert-metric {
|
||||||
color: #666;
|
color: #0066cc;
|
||||||
font-family: 'Courier New', monospace;
|
font-size: 1.1em;
|
||||||
font-size: 0.9em;
|
font-weight: normal;
|
||||||
}
|
}
|
||||||
|
|
||||||
.alert-details {
|
.alert-details {
|
||||||
@@ -438,8 +438,8 @@
|
|||||||
<div class="alert-header">
|
<div class="alert-header">
|
||||||
<span class="alert-level ${level}">${alert.level}</span>
|
<span class="alert-level ${level}">${alert.level}</span>
|
||||||
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
<a class="alert-hostname" href="/plugins#${alert.hostname}">${alert.hostname}</a>
|
||||||
|
<span class="alert-metric">${alert.metric_path.includes('.') ? alert.metric_path.slice(alert.metric_path.indexOf('.') + 1) : alert.metric_path}</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="alert-metric">${alert.metric_path}</div>
|
|
||||||
<div class="alert-details">
|
<div class="alert-details">
|
||||||
<span>${valueText}</span>
|
<span>${valueText}</span>
|
||||||
<span class="alert-duration">Active for ${duration}</span>
|
<span class="alert-duration">Active for ${duration}</span>
|
||||||
|
|||||||
+13
-7
@@ -1109,11 +1109,16 @@ class ThresholdChecker:
|
|||||||
if host is not None and not host.watched:
|
if host is not None and not host.watched:
|
||||||
eventlog(host_name, lvl, message, service="threshold")
|
eventlog(host_name, lvl, message, service="threshold")
|
||||||
return
|
return
|
||||||
|
short_path = metric_path.partition(".")[2] or metric_path
|
||||||
|
title = f"[{lvl}] {host_name} {short_path}"
|
||||||
|
# Strip the "metric = " prefix from message so body is just the value/detail
|
||||||
|
prefix = short_path + " = "
|
||||||
|
body = message[len(prefix):] if message.startswith(prefix) else message
|
||||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||||
host_name,
|
host_name,
|
||||||
notify_mod.Notification(
|
notify_mod.Notification(
|
||||||
title=f"[{lvl}] {host_name}",
|
title=title,
|
||||||
body=message,
|
body=body,
|
||||||
level=lvl,
|
level=lvl,
|
||||||
),
|
),
|
||||||
))
|
))
|
||||||
@@ -1358,18 +1363,19 @@ class ThresholdChecker:
|
|||||||
check_name=check_name,
|
check_name=check_name,
|
||||||
metric_name=metric_name,
|
metric_name=metric_name,
|
||||||
)
|
)
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
body = f"{value} {threshold_info}, ongoing for {int(now - alert_state.since)}s"
|
||||||
else:
|
else:
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
body = f"{value} (ongoing for {int(now - alert_state.since)}s)"
|
||||||
|
message = f"REMINDER ({alert_state.level.name}): {host_name} - {short_path} = {body}"
|
||||||
|
|
||||||
from . import hbdclass
|
from . import hbdclass
|
||||||
host = hbdclass.Host.hosts.get(host_name)
|
host = hbdclass.Host.hosts.get(host_name)
|
||||||
if host is None or host.watched:
|
if host is None or host.watched:
|
||||||
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
asyncio.get_event_loop().create_task(notify_mod.send_notification(
|
||||||
host_name,
|
host_name,
|
||||||
notify_mod.Notification(
|
notify_mod.Notification(
|
||||||
title=f"[REMINDER/{alert_state.level.name}] {host_name}",
|
title=f"[REMINDER/{alert_state.level.name}] {host_name} {short_path}",
|
||||||
body=message,
|
body=body,
|
||||||
level=alert_state.level.name,
|
level=alert_state.level.name,
|
||||||
),
|
),
|
||||||
))
|
))
|
||||||
|
|||||||
+1
-1
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "hbd"
|
name = "hbd"
|
||||||
version = "5.2.1"
|
version = "5.2.3"
|
||||||
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
description = "Heartbeat monitoring system — client (hbc) and server (hbd)"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
|
|||||||
+8
-6
@@ -41,7 +41,7 @@ from pathlib import Path
|
|||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
# updated by scripts/bumpminor.sh
|
# updated by scripts/bumpminor.sh
|
||||||
__version__ = "5.2.1"
|
__version__ = "5.2.3"
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Protocol (mirrors hbd/common/proto.py)
|
# Protocol (mirrors hbd/common/proto.py)
|
||||||
@@ -797,8 +797,7 @@ class _HeartbeatProtocol(asyncio.DatagramProtocol):
|
|||||||
self._log.error("datagram error: %s", e)
|
self._log.error("datagram error: %s", e)
|
||||||
|
|
||||||
def error_received(self, exc):
|
def error_received(self, exc):
|
||||||
self._log.warning("protocol error on %s: %s — dropping connection", self._conn.addr, exc)
|
self._log.warning("protocol error on %s: %s — will retry", self._conn.addr, exc)
|
||||||
self._conn._dead = True
|
|
||||||
self._conn.close()
|
self._conn.close()
|
||||||
|
|
||||||
|
|
||||||
@@ -1029,7 +1028,7 @@ def _reconfigure_syslog(level: int):
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
||||||
global _running, _shutdown_event, _active_tasks
|
global _running, _shutdown_event, _active_tasks, send_shutdown
|
||||||
_running = True
|
_running = True
|
||||||
_shutdown_event = asyncio.Event()
|
_shutdown_event = asyncio.Event()
|
||||||
_active_tasks = []
|
_active_tasks = []
|
||||||
@@ -1039,7 +1038,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
|||||||
port = cfg.get("hb_port", PORT)
|
port = cfg.get("hb_port", PORT)
|
||||||
interval = cfg.get("interval", INTERVAL)
|
interval = cfg.get("interval", INTERVAL)
|
||||||
|
|
||||||
log.info("starting: %s -> %s port=%d interval=%ds", iam, args.hosts, port, interval)
|
log.info("starting hbc_mini %s on %s -> %s port=%d interval=%ds",__version__, iam, args.hosts, port, interval)
|
||||||
|
|
||||||
connections: List[AsyncConnection] = []
|
connections: List[AsyncConnection] = []
|
||||||
conn_id = 1
|
conn_id = 1
|
||||||
@@ -1060,10 +1059,13 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Boot / one-shot message
|
# Boot / one-shot message
|
||||||
|
send_shutdown = False
|
||||||
if args.boot or args.message:
|
if args.boot or args.message:
|
||||||
bmsg: Dict[str, Any] = {"acks": 0}
|
bmsg: Dict[str, Any] = {"acks": 0}
|
||||||
if args.boot:
|
if args.boot:
|
||||||
bmsg["boot"] = 1
|
bmsg["boot"] = 1
|
||||||
|
args.boot = False # don't repeat on restart
|
||||||
|
send_shutdown = True
|
||||||
if args.message:
|
if args.message:
|
||||||
bmsg["service"] = "service"
|
bmsg["service"] = "service"
|
||||||
bmsg["msg"] = args.message
|
bmsg["msg"] = args.message
|
||||||
@@ -1101,7 +1103,7 @@ async def _async_main(args, cfg: Dict[str, Any]) -> int:
|
|||||||
|
|
||||||
log.info("shutting down")
|
log.info("shutting down")
|
||||||
target = next((c for c in connections if c._transport), connections[0] if connections else None)
|
target = next((c for c in connections if c._transport), connections[0] if connections else None)
|
||||||
if target:
|
if target and send_shutdown:
|
||||||
try:
|
try:
|
||||||
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
await target.sendto({"shutdown": 1, "acks": target.ackcount})
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
Reference in New Issue
Block a user