Fix missing alerts for overdue hosts
This commit is contained in:
@@ -171,6 +171,24 @@ def dicttos(ID, d):
|
|||||||
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
DROPOVERDUE = 7 * 24 * 3600 # seconds before an overdue host becomes UNKNOWN
|
||||||
|
|
||||||
|
|
||||||
|
def _set_connectivity_alert(host, afam, level_name):
    """Record or clear the connectivity alert for one address family of a host.

    The alert is stored in ``host.alert_states`` under the key
    ``"connectivity.<afam>"``.

    Args:
        host: Object exposing a mutable ``alert_states`` mapping.
        afam: Address-family identifier used to build the metric key.
        level_name: Name of an ``AlertLevel`` member, e.g. "CRITICAL",
            "WARNING", or "OK". A name that ``AlertLevel`` does not define
            is treated the same as "OK".

    For "OK" the entry is removed (silently, if it is absent) so that
    recovered hosts don't clutter the Alerts Dashboard. For any other level
    the entry is created on first use and then updated with the level and
    its name.
    """
    # NOTE(review): imported at call time rather than module level --
    # presumably to avoid a circular import; confirm before hoisting.
    from .threshold import AlertState, AlertLevel

    key = f"connectivity.{afam}"
    severity = getattr(AlertLevel, level_name, AlertLevel.OK)
    states = host.alert_states

    if severity == AlertLevel.OK:
        # Recovered (or unrecognised level): drop any existing entry.
        states.pop(key, None)
    else:
        if key not in states:
            states[key] = AlertState(key)
        states[key].update(severity, level_name)
|
||||||
|
|
||||||
|
|
||||||
def _make_timer_callbacks(uname, host, ctx):
|
def _make_timer_callbacks(uname, host, ctx):
|
||||||
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
|
"""Return (on_overdue, on_unknown) async callbacks for connection timer logic.
|
||||||
|
|
||||||
@@ -182,6 +200,7 @@ def _make_timer_callbacks(uname, host, ctx):
|
|||||||
|
|
||||||
async def on_unknown(connection):
|
async def on_unknown(connection):
|
||||||
connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
|
connection.newstate(connection.__class__.UNKNOWN, connection.lastbeat)
|
||||||
|
# Keep connectivity alert active when host transitions to unknown
|
||||||
if msg_to_websockets:
|
if msg_to_websockets:
|
||||||
msg_to_websockets("host", host.stateinfo())
|
msg_to_websockets("host", host.stateinfo())
|
||||||
|
|
||||||
@@ -196,6 +215,8 @@ def _make_timer_callbacks(uname, host, ctx):
|
|||||||
uname,
|
uname,
|
||||||
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
|
notify_mod.Notification(title=f"[CRITICAL] {uname}", body=msg, level="CRITICAL"),
|
||||||
)
|
)
|
||||||
|
# Track in alert_states so the Alerts Dashboard shows this
|
||||||
|
_set_connectivity_alert(host, connection.afam, "CRITICAL")
|
||||||
if threshold_checker:
|
if threshold_checker:
|
||||||
threshold_checker.check_value(
|
threshold_checker.check_value(
|
||||||
host_name=uname,
|
host_name=uname,
|
||||||
@@ -410,6 +431,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
if conn.getstate() != hbdcls.Connection.UP:
|
if conn.getstate() != hbdcls.Connection.UP:
|
||||||
lasts = conn.state
|
lasts = conn.state
|
||||||
d = conn.newstate(hbdcls.Connection.UP, now)
|
d = conn.newstate(hbdcls.Connection.UP, now)
|
||||||
|
# Clear connectivity alert now that the host is back up
|
||||||
|
_set_connectivity_alert(host, conn.afam, "OK")
|
||||||
# Don't log/notify RECOVER for a brand-new host seen for the first time —
|
# Don't log/notify RECOVER for a brand-new host seen for the first time —
|
||||||
# it was never down, it just hasn't been seen before.
|
# it was never down, it just hasn't been seen before.
|
||||||
if not newh:
|
if not newh:
|
||||||
@@ -436,6 +459,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
|
notify_mod.Notification(title=f"[INFO] {uname}", body=m, level="INFO"),
|
||||||
)
|
)
|
||||||
conn.newstate(hbdcls.Connection.DOWN, now)
|
conn.newstate(hbdcls.Connection.DOWN, now)
|
||||||
|
_set_connectivity_alert(host, conn.afam, "CRITICAL")
|
||||||
|
|
||||||
if interval > 0:
|
if interval > 0:
|
||||||
host.interval = interval
|
host.interval = interval
|
||||||
|
|||||||
Reference in New Issue
Block a user