display tag fro alterts, cleanup udp
This commit is contained in:
+1
-1
@@ -105,7 +105,7 @@ class AsyncConnection:
|
||||
msg["time"] = time.time()
|
||||
|
||||
# Encode message
|
||||
data = dicttos(msg_id, msg, compress=True)
|
||||
data = dicttos(msg_id, msg)
|
||||
|
||||
# Send
|
||||
self.transport.sendto(data, (self.addr, self.port))
|
||||
|
||||
+6
-9
@@ -62,7 +62,7 @@ def decode_value(val: str) -> Any:
|
||||
return val
|
||||
|
||||
|
||||
def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
|
||||
def dicttos(ID: str, d: Dict[str, Any]):
|
||||
"""Serialize a dict to protocol message bytes.
|
||||
|
||||
If compress is True, the payload is zlib-compressed and the message is
|
||||
@@ -75,12 +75,9 @@ def dicttos(ID: str, d: Dict[str, Any], compress: bool = False):
|
||||
encoded_val = encode_value(v)
|
||||
s.append(f"{k}={encoded_val}")
|
||||
pk = ";".join(s)
|
||||
if compress:
|
||||
zpk = zlib.compress(pk.encode(), 6)
|
||||
hdr = ("!" + ID + ":").encode()
|
||||
return hdr + zpk
|
||||
else:
|
||||
return (ID + ":" + pk).encode()
|
||||
zpk = zlib.compress(pk.encode(), 6)
|
||||
hdr = ("!" + ID + ":").encode()
|
||||
return hdr + zpk
|
||||
|
||||
|
||||
def stodict(msg: bytes):
|
||||
@@ -131,7 +128,7 @@ def oldmtodict(msg: bytes):
|
||||
return stodict(b"HTB:" + msg)
|
||||
|
||||
|
||||
def encode_plugin_data(plugin_name: str, data: Dict[str, Any], compress: bool = False) -> bytes:
|
||||
def encode_plugin_data(plugin_name: str, data: Dict[str, Any]) -> bytes:
|
||||
"""Encode plugin data into a PLG message.
|
||||
|
||||
Args:
|
||||
@@ -144,7 +141,7 @@ def encode_plugin_data(plugin_name: str, data: Dict[str, Any], compress: bool =
|
||||
"""
|
||||
# Add plugin name to data
|
||||
full_data = {"plugin": plugin_name, **data}
|
||||
return dicttos("PLG", full_data, compress)
|
||||
return dicttos("PLG", full_data)
|
||||
|
||||
|
||||
def decode_plugin_data(msg: bytes) -> Dict[str, Any]:
|
||||
|
||||
+2
-38
@@ -175,7 +175,8 @@ class Connection:
|
||||
|
||||
def newaddr(self, addr, rtt, now):
|
||||
self.lastbeat = now
|
||||
self.rtts.append(rtt)
|
||||
if rtt is not None:
|
||||
self.rtts.append(rtt)
|
||||
if len(self.rtts) > MAXRTTS:
|
||||
del self.rtts[0]
|
||||
|
||||
@@ -292,7 +293,6 @@ class Host:
|
||||
self.cmds = []
|
||||
self.cver = 0
|
||||
self.connections = {}
|
||||
self.hdwcounts = [[0, 0], [0, 0], [0, 0]]
|
||||
# Plugin data storage: {plugin_name: [(timestamp, data), ...]}
|
||||
self.plugin_data = {}
|
||||
self.plugin_retention = 100 # Keep last N samples per plugin
|
||||
@@ -473,42 +473,6 @@ class Host:
|
||||
"""
|
||||
return self.plugin_data
|
||||
|
||||
# def dispstate(self):
|
||||
# if self.state in ["down", "overdue"]:
|
||||
# state = "<b>%s</b>" % self.state
|
||||
# elif self.state in ["up", "UP"]:
|
||||
# state = ""
|
||||
# for x in list(self.connections.keys()):
|
||||
# try:
|
||||
# state += " %5.1f" % (self.connections[x].rtts[-1])
|
||||
# except:
|
||||
# state += " %5s" % (self.connections[x].rtts[-1])
|
||||
# elif self.state in ["unknown", "UNKNOWN"]:
|
||||
# state = ""
|
||||
# else:
|
||||
# state = "%s" % self.state
|
||||
# return state
|
||||
|
||||
def dispstats(self):
|
||||
if self.doesack != -1:
|
||||
if self.upcount > 0:
|
||||
r = ""
|
||||
for v in range(3):
|
||||
a, u = self.hdwcounts[v]
|
||||
if (self.upcount - u) != 0:
|
||||
vs = "%0.0f" % (
|
||||
100.0 - (((self.doesack - a) * 100.0) / (self.upcount - u))
|
||||
)
|
||||
if vs == "0":
|
||||
vs = ""
|
||||
else:
|
||||
vs = "-"
|
||||
r += '<td align="right">%s</td>' % vs
|
||||
return r
|
||||
else:
|
||||
return "<td>(%s)</td><td></td><td></td>" % (self.doesack)
|
||||
return '<td align="right">N/A</td><td></td<td></td>>'
|
||||
|
||||
hostfields_long = [
|
||||
"name",
|
||||
"IPv4.addr",
|
||||
|
||||
+4
-4
@@ -9,9 +9,11 @@ import logging
|
||||
from aiohttp import web
|
||||
import jinja2
|
||||
from . import data
|
||||
from . import notify as notify_mod
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
eventlog = notify_mod.eventlog
|
||||
|
||||
def _render_template(html_str: str, **context) -> str:
|
||||
tmpl = jinja2.Template(html_str)
|
||||
@@ -92,8 +94,7 @@ async def start(
|
||||
return web.Response(status=400, text="need h= argument")
|
||||
if uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
if log:
|
||||
log(uname, "dropped")
|
||||
eventlog(uname, "INFO", "dropped")
|
||||
del hbdclass.Host.hosts[uname]
|
||||
return web.Response(text="Done")
|
||||
|
||||
@@ -105,8 +106,7 @@ async def start(
|
||||
if uname not in hbdclass.Host.hosts:
|
||||
return web.Response(status=400, text=f"h={uname} not found")
|
||||
ll = hbdclass.Host.hosts[uname].registerDns()
|
||||
if log:
|
||||
log(uname, ll)
|
||||
eventlog(uname, "INFO", ll)
|
||||
return web.Response(text=str(ll))
|
||||
|
||||
async def update(request):
|
||||
|
||||
+3
-23
@@ -30,6 +30,8 @@ def cleanup_function(config, hbdclass):
|
||||
# Ensure all timer references are cleared before pickling
|
||||
for hostname, host in list(hbdclass.Host.hosts.items()):
|
||||
for conn_type, conn in host.connections.items():
|
||||
if hasattr(conn, 'cancel_overdue_timer'):
|
||||
conn.cancel_overdue_timer()
|
||||
if hasattr(conn, 'overdue_timer'):
|
||||
conn.overdue_timer = None
|
||||
if hasattr(conn, 'overdue_callback'):
|
||||
@@ -65,7 +67,6 @@ async def _run_async(config):
|
||||
from . import http as http_mod
|
||||
from . import dns as dns_mod
|
||||
from . import notify as notify_mod
|
||||
from . import monitor as monitor_mod
|
||||
from . import journal as journal_mod
|
||||
from . import threshold as threshold_mod
|
||||
|
||||
@@ -200,20 +201,6 @@ async def _run_async(config):
|
||||
except Exception as e:
|
||||
logger.exception("websocket server failed to start: %s", e)
|
||||
|
||||
# Start the monitor thread as a background task
|
||||
try:
|
||||
monitor_task = asyncio.create_task(
|
||||
monitor_mod.start(
|
||||
config=config,
|
||||
hbdclass=hbdclass,
|
||||
pushmsg=pushmsg,
|
||||
msg_to_websockets=msg_to_websockets,
|
||||
)
|
||||
)
|
||||
logger.info("Monitor task started")
|
||||
except Exception as e:
|
||||
logger.exception("monitor task failed to start: %s", e)
|
||||
|
||||
try:
|
||||
# run forever until shutdown event is set
|
||||
await shutdown_event.wait()
|
||||
@@ -221,13 +208,6 @@ async def _run_async(config):
|
||||
except Exception as e:
|
||||
logger.exception("Error in main loop: %s", e)
|
||||
finally:
|
||||
# Clean up connection timers
|
||||
try:
|
||||
logger.info("Cleaning up connection timers...")
|
||||
await monitor_mod.cleanup_connections(hbdclass)
|
||||
except Exception as e:
|
||||
logger.warning("Error cleaning up connection timers: %s", e)
|
||||
|
||||
# Cancel all running tasks
|
||||
logger.info("Cancelling tasks...")
|
||||
try:
|
||||
@@ -235,7 +215,7 @@ async def _run_async(config):
|
||||
except Exception as e:
|
||||
logger.warning("Error closing UDP transport: %s", e)
|
||||
|
||||
tasks_to_cancel = [http_task, ws_task, monitor_task]
|
||||
tasks_to_cancel = [http_task, ws_task]
|
||||
for task in tasks_to_cancel:
|
||||
if task:
|
||||
try:
|
||||
|
||||
@@ -26,41 +26,3 @@ async def cleanup_connections(hbdclass):
|
||||
if hasattr(conn, 'cancel_overdue_timer'):
|
||||
conn.cancel_overdue_timer()
|
||||
|
||||
|
||||
async def start(
|
||||
config: dict,
|
||||
hbdclass: callable,
|
||||
pushmsg=None,
|
||||
msg_to_websockets=None,
|
||||
):
|
||||
"""Start monitor background tasks.
|
||||
|
||||
Note: Reachability monitoring is now timer-based and happens in udp.py
|
||||
when HTB messages arrive. This function can be used for additional
|
||||
monitoring tasks.
|
||||
|
||||
Currently runs a simple status logger every 5 minutes.
|
||||
"""
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger_interval = 300 # Log status every 5 minutes
|
||||
|
||||
while True:
|
||||
await asyncio.sleep(logger_interval)
|
||||
|
||||
# Log monitoring status
|
||||
total_hosts = len(hbdclass.Host.hosts)
|
||||
up_count = sum(
|
||||
1 for h in hbdclass.Host.hosts.values()
|
||||
for c in h.connections.values()
|
||||
if c.state == hbdclass.Connection.UP
|
||||
)
|
||||
overdue_count = sum(
|
||||
1 for h in hbdclass.Host.hosts.values()
|
||||
for c in h.connections.values()
|
||||
if c.state == hbdclass.Connection.OVERDUE
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
f"Monitor status: {total_hosts} hosts, {up_count} UP, {overdue_count} OVERDUE"
|
||||
)
|
||||
|
||||
@@ -397,6 +397,12 @@
|
||||
const level = alert.level.toLowerCase();
|
||||
const duration = getDuration(alert.since);
|
||||
|
||||
// Format value with threshold info if available
|
||||
let valueText = `Value: <span class="alert-value">${formatValue(alert.last_value)}</span>`;
|
||||
if (alert.threshold_value !== undefined && alert.threshold_value !== null && alert.operator) {
|
||||
valueText += ` <span class="threshold-info">(threshold: ${alert.operator} ${formatValue(alert.threshold_value)})</span>`;
|
||||
}
|
||||
|
||||
return `
|
||||
<div class="alert-item ${level}">
|
||||
<div class="alert-main">
|
||||
@@ -406,7 +412,7 @@
|
||||
</div>
|
||||
<div class="alert-metric">${alert.metric_path}</div>
|
||||
<div class="alert-details">
|
||||
<span>Value: <span class="alert-value">${formatValue(alert.last_value)}</span></span>
|
||||
<span>${valueText}</span>
|
||||
<span class="alert-duration">Active for ${duration}</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -300,6 +300,60 @@
|
||||
font-weight: bold;
|
||||
color: #2196f3;
|
||||
}
|
||||
|
||||
/* Simple data table styling (for os_info, cpu_monitor, etc.) */
|
||||
.simple-data-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-top: 10px;
|
||||
font-size: 0.9em;
|
||||
background: #fff;
|
||||
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.simple-data-table thead {
|
||||
background: #2196f3;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.simple-data-table th {
|
||||
padding: 10px 15px;
|
||||
text-align: left;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
font-size: 0.8em;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.simple-data-table td {
|
||||
padding: 8px 15px;
|
||||
border-top: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.simple-data-table td.name {
|
||||
font-weight: 500;
|
||||
color: #555;
|
||||
width: 40%;
|
||||
}
|
||||
|
||||
.simple-data-table td.value {
|
||||
color: #333;
|
||||
font-family: 'Segoe UI', system-ui, sans-serif;
|
||||
}
|
||||
|
||||
.simple-data-table tbody tr:hover {
|
||||
background: #f5f5f5;
|
||||
}
|
||||
|
||||
.simple-data-table tbody tr:nth-child(even) {
|
||||
background: #fafafa;
|
||||
}
|
||||
|
||||
.simple-data-table tbody tr:nth-child(even):hover {
|
||||
background: #f0f0f0;
|
||||
}
|
||||
</style>
|
||||
|
||||
<body>
|
||||
@@ -406,6 +460,14 @@
|
||||
}
|
||||
|
||||
function renderPluginData(data, timestamp) {
|
||||
// Check if this should be rendered as a simple table
|
||||
const pluginName = getCurrentPluginName();
|
||||
const simplePlugins = ['os_info', 'cpu_monitor', 'memory_monitor', 'nagios_runner'];
|
||||
|
||||
if (simplePlugins.includes(pluginName) && isSimpleKeyValueData(data)) {
|
||||
return renderSimpleDataTable(data, timestamp);
|
||||
}
|
||||
|
||||
let html = '<div class="metric-grid">';
|
||||
|
||||
for (const [key, value] of Object.entries(data)) {
|
||||
@@ -478,6 +540,57 @@
|
||||
return html;
|
||||
}
|
||||
|
||||
function getCurrentPluginName() {
|
||||
// Get currently active plugin name from the active plugin content div
|
||||
const activeContent = document.querySelector('.plugin-content.active');
|
||||
if (activeContent) {
|
||||
return activeContent.dataset.plugin;
|
||||
}
|
||||
return '';
|
||||
}
|
||||
|
||||
function isSimpleKeyValueData(data) {
|
||||
// Check if data is simple key-value pairs (no complex nesting)
|
||||
for (const [key, value] of Object.entries(data)) {
|
||||
if (typeof value === 'object' && value !== null) {
|
||||
// Has nested objects - not simple
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function renderSimpleDataTable(data, timestamp) {
|
||||
let html = '<table class="simple-data-table">';
|
||||
|
||||
// Table header
|
||||
html += '<thead><tr>';
|
||||
html += '<th>Name</th>';
|
||||
html += '<th>Value</th>';
|
||||
html += '</tr></thead>';
|
||||
|
||||
// Table body
|
||||
html += '<tbody>';
|
||||
for (const [key, value] of Object.entries(data)) {
|
||||
const label = formatLabel(key);
|
||||
const formattedValue = formatValue(key, value);
|
||||
const unit = getUnit(key);
|
||||
|
||||
html += '<tr>';
|
||||
html += `<td class="name">${label}</td>`;
|
||||
html += `<td class="value">${formattedValue}${unit ? ' ' + unit : ''}</td>`;
|
||||
html += '</tr>';
|
||||
}
|
||||
html += '</tbody>';
|
||||
|
||||
html += '</table>';
|
||||
|
||||
const date = new Date(timestamp * 1000);
|
||||
html += `<div class="timestamp">Last updated: ${date.toLocaleString()}</div>`;
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
function isInterfaceData(key, value) {
|
||||
// Check if this is interface/network stats data (I/O counters)
|
||||
if (key.toLowerCase().includes('interface') && !key.toLowerCase().includes('interface_stats')) {
|
||||
|
||||
+102
-15
@@ -53,14 +53,24 @@ class AlertState:
|
||||
self.last_check = time.time()
|
||||
self.notification_count = 0
|
||||
self.last_notification = None
|
||||
self.threshold_value = None # The threshold value that triggered alert
|
||||
self.operator = None # The comparison operator (>, <, >=, etc.)
|
||||
|
||||
def update(self, level: AlertLevel, value: Any) -> bool:
|
||||
def update(
|
||||
self,
|
||||
level: AlertLevel,
|
||||
value: Any,
|
||||
threshold_value: Optional[float] = None,
|
||||
operator: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Update alert state.
|
||||
|
||||
Args:
|
||||
level: New alert level
|
||||
value: Current metric value
|
||||
threshold_value: The threshold value that was exceeded (if applicable)
|
||||
operator: The comparison operator (>, <, >=, etc.)
|
||||
|
||||
Returns:
|
||||
True if state changed (notification needed), False otherwise
|
||||
@@ -69,6 +79,15 @@ class AlertState:
|
||||
self.last_check = now
|
||||
self.last_value = value
|
||||
|
||||
# Update threshold info when alert is active
|
||||
if level != AlertLevel.OK:
|
||||
self.threshold_value = threshold_value
|
||||
self.operator = operator
|
||||
else:
|
||||
# Clear threshold info when returning to OK
|
||||
self.threshold_value = None
|
||||
self.operator = None
|
||||
|
||||
# Check if state changed
|
||||
if level != self.level:
|
||||
logger.info(
|
||||
@@ -87,7 +106,7 @@ class AlertState:
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""Convert alert state to dictionary for serialization."""
|
||||
return {
|
||||
result = {
|
||||
"metric_path": self.metric_path,
|
||||
"level": self.level.name,
|
||||
"since": self.since,
|
||||
@@ -95,6 +114,14 @@ class AlertState:
|
||||
"last_check": self.last_check,
|
||||
"notification_count": self.notification_count,
|
||||
}
|
||||
|
||||
# Include threshold info if available
|
||||
if self.threshold_value is not None:
|
||||
result["threshold_value"] = self.threshold_value
|
||||
if self.operator is not None:
|
||||
result["operator"] = self.operator
|
||||
|
||||
return result
|
||||
|
||||
def __str__(self):
|
||||
return self.to_dict().__str__()
|
||||
@@ -107,6 +134,7 @@ class ThresholdConfig:
|
||||
metric_path: str,
|
||||
warning: Optional[float] = None,
|
||||
critical: Optional[float] = None,
|
||||
display: Optional[str] = None,
|
||||
operator: str = ">",
|
||||
hysteresis: float = 0.0,
|
||||
enabled: bool = True,
|
||||
@@ -127,6 +155,7 @@ class ThresholdConfig:
|
||||
self.critical = critical
|
||||
self.enabled = enabled
|
||||
self.hysteresis = hysteresis
|
||||
self.display = display
|
||||
|
||||
# Parse operator
|
||||
try:
|
||||
@@ -302,6 +331,7 @@ class ThresholdChecker:
|
||||
warning = threshold_config.get("warning")
|
||||
critical = threshold_config.get("critical")
|
||||
operator = threshold_config.get("operator", ">")
|
||||
display = threshold_config.get("display")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
@@ -316,6 +346,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -345,7 +376,7 @@ class ThresholdChecker:
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1)
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
|
||||
display = threshold_config.get("display")
|
||||
if warning is None and critical is None:
|
||||
continue
|
||||
|
||||
@@ -356,6 +387,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -382,6 +414,7 @@ class ThresholdChecker:
|
||||
operator = threshold_config.get("operator", ">")
|
||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
||||
enabled = threshold_config.get("enabled", True)
|
||||
display = threshold_config.get("display")
|
||||
|
||||
if warning is None and critical is None:
|
||||
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
||||
@@ -394,6 +427,7 @@ class ThresholdChecker:
|
||||
operator=operator,
|
||||
hysteresis=hysteresis,
|
||||
enabled=enabled,
|
||||
display=display
|
||||
)
|
||||
|
||||
self.thresholds[metric_path] = threshold
|
||||
@@ -440,14 +474,21 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
||||
return (old_level, new_level)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
return None
|
||||
|
||||
@@ -493,14 +534,21 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Update state and check for changes
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value)
|
||||
self._trigger_notification(host_name, metric_path, old_level, new_level, value, threshold)
|
||||
elif new_level != AlertLevel.OK:
|
||||
# Check if we should re-notify
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
# Check nested metrics (e.g., partition data in disk_monitor)
|
||||
self._check_nested_metrics(
|
||||
@@ -550,18 +598,26 @@ class ThresholdChecker:
|
||||
alert_state.level
|
||||
)
|
||||
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
old_level = alert_state.level
|
||||
if alert_state.update(new_level, value):
|
||||
if alert_state.update(new_level, value, threshold_value, threshold.operator.value):
|
||||
state_changes.append((metric_path, old_level, new_level, value))
|
||||
self._trigger_notification(
|
||||
host_name,
|
||||
metric_path,
|
||||
old_level,
|
||||
new_level,
|
||||
value
|
||||
value,
|
||||
threshold
|
||||
)
|
||||
elif new_level != AlertLevel.OK:
|
||||
self._check_renotify(host_name, alert_state, metric_path, value)
|
||||
self._check_renotify(host_name, alert_state, metric_path, value, threshold)
|
||||
|
||||
def _trigger_notification(
|
||||
self,
|
||||
@@ -570,18 +626,35 @@ class ThresholdChecker:
|
||||
old_level: AlertLevel,
|
||||
new_level: AlertLevel,
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
):
|
||||
"""Trigger a notification for an alert state change."""
|
||||
# Determine which threshold was exceeded
|
||||
threshold_value = None
|
||||
if new_level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif new_level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Format message
|
||||
if new_level == AlertLevel.OK:
|
||||
lvl = "RECOVERED"
|
||||
message = f"{metric_path} = {value} ({old_level.name} -> OK)"
|
||||
elif new_level == AlertLevel.WARNING:
|
||||
lvl = "WARNING"
|
||||
message = f"{metric_path} = {value}"
|
||||
if threshold_value is not None:
|
||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
elif new_level == AlertLevel.CRITICAL:
|
||||
lvl = "CRITICAL"
|
||||
message = f"{metric_path} = {value}"
|
||||
if threshold_value is not None:
|
||||
message = f"{metric_path} = {value} (threshold: {op_symbol} {threshold_value})"
|
||||
else:
|
||||
message = f"{metric_path} = {value}"
|
||||
else:
|
||||
lvl = "UNKNOWN"
|
||||
message = f"{metric_path} = {value}"
|
||||
@@ -617,6 +690,7 @@ class ThresholdChecker:
|
||||
alert_state: AlertState,
|
||||
metric_path: str,
|
||||
value: Any,
|
||||
threshold: ThresholdConfig,
|
||||
):
|
||||
"""Check if we should send a repeat notification."""
|
||||
if alert_state.level == AlertLevel.OK:
|
||||
@@ -632,8 +706,21 @@ class ThresholdChecker:
|
||||
return
|
||||
|
||||
if (now - alert_state.last_notification) >= self.renotify_interval:
|
||||
# Determine which threshold is active
|
||||
threshold_value = None
|
||||
if alert_state.level == AlertLevel.CRITICAL and threshold.critical is not None:
|
||||
threshold_value = threshold.critical
|
||||
elif alert_state.level == AlertLevel.WARNING and threshold.warning is not None:
|
||||
threshold_value = threshold.warning
|
||||
|
||||
# Format operator symbol
|
||||
op_symbol = threshold.operator.value
|
||||
|
||||
# Time to re-notify
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
if threshold_value is not None:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (threshold: {op_symbol} {threshold_value}, ongoing for {int(now - alert_state.since)}s)"
|
||||
else:
|
||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||
|
||||
if self.notification_callback:
|
||||
try:
|
||||
|
||||
+19
-23
@@ -47,7 +47,7 @@ def parse_message(data: bytes):
|
||||
return msg
|
||||
|
||||
|
||||
def dicttos(ID, d, compress=False):
|
||||
def dicttos(ID, d):
|
||||
s = []
|
||||
for k in d:
|
||||
if isinstance(d[k], float):
|
||||
@@ -55,13 +55,9 @@ def dicttos(ID, d, compress=False):
|
||||
else:
|
||||
s.append("%s=%s" % (k, d[k]))
|
||||
pk = ";".join(s)
|
||||
if compress:
|
||||
zpk = zlib.compress(pk.encode(), 6)
|
||||
ID = "!" + ID + ":"
|
||||
opk = ID.encode() + zpk
|
||||
else:
|
||||
zpk = pk
|
||||
opk = ID + ":" + zpk
|
||||
zpk = zlib.compress(pk.encode(), 6)
|
||||
ID = "!" + ID + ":"
|
||||
opk = ID.encode() + zpk
|
||||
return opk
|
||||
|
||||
|
||||
@@ -119,12 +115,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
|
||||
cid = msg.get("id", 0)
|
||||
try:
|
||||
rtt = float(msg.get("rtt", None))
|
||||
except Exception:
|
||||
rtt = float(msg.get("rtt"))
|
||||
except TypeError:
|
||||
rtt = None
|
||||
|
||||
if msg.get("ID") == "HTB":
|
||||
host.doesack = msg.get("acks", -1)
|
||||
# send ACK back
|
||||
rmsg = {"time": __import__("time").time()}
|
||||
if host.cver < 1:
|
||||
opkt = b"ACK"
|
||||
else:
|
||||
opkt = dicttos("ACK", rmsg)
|
||||
try:
|
||||
transport.sendto(opkt, addr)
|
||||
except Exception as e:
|
||||
if DEBUG > 0:
|
||||
print(("cannot send ack: %s" % e))
|
||||
|
||||
elif msg.get("ID") == "PLG":
|
||||
# Handle plugin data message
|
||||
plugin_name = msg.get("plugin")
|
||||
@@ -277,18 +285,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
alert_states=host.alert_states
|
||||
)
|
||||
|
||||
# send ACK back
|
||||
rmsg = {"time": __import__("time").time()}
|
||||
if host.cver < 1:
|
||||
opkt = b"ACK"
|
||||
else:
|
||||
opkt = dicttos("ACK", rmsg, host.cver > 1)
|
||||
try:
|
||||
transport.sendto(opkt, addr)
|
||||
except Exception as e:
|
||||
if DEBUG > 0:
|
||||
print(("cannot send ack: %s" % e))
|
||||
|
||||
# send any commands we have queued
|
||||
while len(host.cmds):
|
||||
op, rmsg = host.cmds[0]
|
||||
@@ -311,7 +307,7 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
||||
if isinstance(opkt, str):
|
||||
opkt = opkt.encode()
|
||||
else:
|
||||
opkt = dicttos(op, rmsg, True)
|
||||
opkt = dicttos(op, rmsg)
|
||||
try:
|
||||
transport.sendto(opkt, addr)
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user