Fix rtt, including bug in time compute
This commit is contained in:
@@ -7,33 +7,125 @@ logfile: "/home/andreas/logs/heartbeat/heartbeat.log"
|
|||||||
logfmt: "msg"
|
logfmt: "msg"
|
||||||
grace: 40
|
grace: 40
|
||||||
interval: 10
|
interval: 10
|
||||||
watchhosts:
|
|
||||||
# "localhost":
|
# Notification Channels - Define notification providers centrally
|
||||||
# "haschloss" :
|
# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration
|
||||||
# "cotgate":
|
notification_channels:
|
||||||
"wentworth":
|
|
||||||
notify: +4915123456789
|
pushover_standard:
|
||||||
src: "signal"
|
type: pushover
|
||||||
"y":
|
token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf
|
||||||
notify: +4915123456789
|
user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK
|
||||||
src: "signal"
|
|
||||||
"winter":
|
signal_andreas:
|
||||||
notify: +14168226179
|
type: signal
|
||||||
src: "signal"
|
cli_path: /usr/local/bin/signal-cli
|
||||||
dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"}
|
user: +14168226179
|
||||||
|
recipient: +14168226179
|
||||||
|
|
||||||
|
email_andreas:
|
||||||
|
type: email
|
||||||
|
recipients: [aew.hbd.notify@wrede.ca]
|
||||||
|
sender: aew.hbd@wrede.ca
|
||||||
|
smtp_server: smtp.fastmail.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: andreas@wrede.ca
|
||||||
|
smtp_password: pvtvefyp5gbhnch2
|
||||||
|
|
||||||
|
# Example additional channels (pushover_urgent is commented out; mattermost_devops below is active and still uses placeholder values)
|
||||||
|
# pushover_urgent:
|
||||||
|
# type: pushover
|
||||||
|
# token: your-app-token
|
||||||
|
# user: your-user-key
|
||||||
|
#
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
|
||||||
|
# Default notification channels (used if host doesn't specify channels)
|
||||||
|
default_notification_channels: [pushover_standard]
|
||||||
|
|
||||||
|
# Host definitions - each entry combines threshold mapping, watch status, DNS updates, and notifications
|
||||||
|
hosts:
|
||||||
|
wentworth:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
y:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
winter:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
wally:
|
||||||
|
threshold_config: freebsd_server
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
eris:
|
||||||
|
threshold_config: truenas_server
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
haschloss:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
wayback:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
wertvoll:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
weekend:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
cotgate:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
rvgate:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
draper:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
notification_channels: [pushover_standard]
|
||||||
|
dyndns: true
|
||||||
|
|
||||||
|
# Hosts to drop/ignore
|
||||||
drophosts: {"unknown", "wookie15", "wort"}
|
drophosts: {"unknown", "wookie15", "wort"}
|
||||||
|
|
||||||
nsupdate_bin: "/usr/local/bin/nsupdate"
|
nsupdate_bin: "/usr/local/bin/nsupdate"
|
||||||
pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf"
|
|
||||||
pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK"
|
|
||||||
pushsrv: "pushover"
|
|
||||||
|
|
||||||
dyndomains: {"wrede.org"}
|
dyndomains: {"wrede.org"}
|
||||||
toemail: ["aew.hbd.notify@wrede.ca"]
|
|
||||||
fromemail: "aew.hbd@wrede.ca"
|
|
||||||
smtpserver: "smtp.fastmail.com"
|
|
||||||
smtpuser: "andreas@wrede.ca"
|
|
||||||
smtppassword: "r8psra6wj6gcakkp"
|
|
||||||
smtpport: 587
|
|
||||||
|
|
||||||
ws_port: 50005
|
ws_port: 50005
|
||||||
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
|
# wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS
|
||||||
@@ -49,120 +141,114 @@ journal_file: messages.journal # Base filename
|
|||||||
journal_max_size: 104857600 # Max size (100MB default)
|
journal_max_size: 104857600 # Max size (100MB default)
|
||||||
journal_max_backups: 10 # Number of backups to keep
|
journal_max_backups: 10 # Number of backups to keep
|
||||||
|
|
||||||
thresholds:
|
threshold_configs:
|
||||||
default:
|
default:
|
||||||
cpu_monitor:
|
thresholds:
|
||||||
cpu_percent:
|
cpu_monitor:
|
||||||
warning: 80.0
|
cpu_percent:
|
||||||
critical: 90.0
|
warning: 80.0
|
||||||
memory_monitor:
|
critical: 90.0
|
||||||
percent:
|
memory_monitor:
|
||||||
warning: 3.0
|
percent:
|
||||||
critical: 95.0
|
warning: 85.0
|
||||||
disk_monitor:
|
critical: 95.0
|
||||||
partitions:
|
disk_monitor:
|
||||||
/:
|
partitions:
|
||||||
percent:
|
/:
|
||||||
warning: 85.0
|
percent:
|
||||||
critical: 90.0
|
warning: 85.0
|
||||||
rtt:
|
critical: 90.0
|
||||||
y:
|
rtt:
|
||||||
warning: 30
|
warning: 30
|
||||||
critical: 250.0
|
critical: 250.0
|
||||||
|
|
||||||
|
|
||||||
freebsd_server:
|
freebsd_server:
|
||||||
cpu_monitor:
|
thresholds:
|
||||||
cpu_percent:
|
cpu_monitor:
|
||||||
warning: 80.0
|
cpu_percent:
|
||||||
critical: 90.0
|
warning: 80.0
|
||||||
memory_monitor:
|
critical: 90.0
|
||||||
percent:
|
memory_monitor:
|
||||||
warning: 3.0
|
percent:
|
||||||
critical: 95.0
|
warning: 3.0
|
||||||
disk_monitor:
|
critical: 95.0
|
||||||
partitions:
|
disk_monitor:
|
||||||
/:
|
partitions:
|
||||||
percent:
|
/:
|
||||||
warning: 85.0
|
percent:
|
||||||
critical: 90.0
|
warning: 85.0
|
||||||
nagios_runner:
|
critical: 90.0
|
||||||
# overall_status_code:
|
nagios_runner:
|
||||||
# warning: 1
|
# overall_status_code:
|
||||||
# critical: 2
|
# warning: 1
|
||||||
# operator: ">="
|
# critical: 2
|
||||||
load_status:
|
# operator: ">="
|
||||||
warning: WARNING
|
load_status:
|
||||||
critical: CRITICAL
|
warning: WARNING
|
||||||
operator: "=="
|
critical: CRITICAL
|
||||||
UPS_load:
|
operator: "=="
|
||||||
display: "{ups_output}"
|
UPS_load:
|
||||||
warning: 70
|
display: "{ups_output}"
|
||||||
critical: 80
|
warning: 70
|
||||||
operator: ">="
|
critical: 80
|
||||||
UPS_status_code:
|
operator: ">="
|
||||||
display: "{ups_output}"
|
UPS_status_code:
|
||||||
warning: 1
|
display: "{ups_output}"
|
||||||
critical: 2
|
warning: 1
|
||||||
operator: ">="
|
critical: 2
|
||||||
nextcloud_apps_status_code:
|
operator: ">="
|
||||||
display: "{nextcloud_apps_output}"
|
nextcloud_apps_status_code:
|
||||||
warning: 1
|
display: "{nextcloud_apps_output}"
|
||||||
critical: 2
|
warning: 1
|
||||||
operator: ">="
|
critical: 2
|
||||||
rtt:
|
operator: ">="
|
||||||
y:
|
rtt:
|
||||||
warning: 30
|
warning: 30
|
||||||
critical: 250.0
|
critical: 250.0
|
||||||
|
|
||||||
truenas_server:
|
truenas_server:
|
||||||
cpu_monitor:
|
thresholds:
|
||||||
cpu_percent:
|
cpu_monitor:
|
||||||
warning: 80.0
|
cpu_percent:
|
||||||
critical: 90.0
|
warning: 80.0
|
||||||
memory_monitor:
|
critical: 90.0
|
||||||
percent:
|
memory_monitor:
|
||||||
warning: 3.0
|
percent:
|
||||||
critical: 95.0
|
warning: 3.0
|
||||||
disk_monitor:
|
critical: 95.0
|
||||||
partitions:
|
disk_monitor:
|
||||||
/:
|
partitions:
|
||||||
percent:
|
/:
|
||||||
warning: 85.0
|
percent:
|
||||||
critical: 90.0
|
warning: 85.0
|
||||||
nagios_runner:
|
critical: 90.0
|
||||||
# overall_status_code:
|
nagios_runner:
|
||||||
# warning: 1
|
# overall_status_code:
|
||||||
# critical: 2
|
# warning: 1
|
||||||
# operator: ">="
|
# critical: 2
|
||||||
load_status:
|
# operator: ">="
|
||||||
warning: WARNING
|
load_status:
|
||||||
critical: CRITICAL
|
warning: WARNING
|
||||||
operator: "=="
|
critical: CRITICAL
|
||||||
UPS_load:
|
operator: "=="
|
||||||
display: "{ups_output}"
|
UPS_load:
|
||||||
warning: 70
|
display: "{ups_output}"
|
||||||
critical: 80
|
warning: 70
|
||||||
operator: ">="
|
critical: 80
|
||||||
UPS_status_code:
|
operator: ">="
|
||||||
display: "{ups_output}"
|
UPS_status_code:
|
||||||
warning: 1
|
display: "{ups_output}"
|
||||||
critical: 2
|
warning: 1
|
||||||
operator: ">="
|
critical: 2
|
||||||
nextcloud_apps_status_code:
|
operator: ">="
|
||||||
display: "{nextcloud_apps_output}"
|
nextcloud_apps_status_code:
|
||||||
warning: 1
|
display: "{nextcloud_apps_output}"
|
||||||
critical: 2
|
warning: 1
|
||||||
operator: ">="
|
critical: 2
|
||||||
rtt:
|
operator: ">="
|
||||||
y:
|
rtt:
|
||||||
warning: 30
|
warning: 30
|
||||||
critical: 250.0
|
critical: 250.0
|
||||||
|
|
||||||
|
|
||||||
host_threshold_mapping:
|
|
||||||
# Critical production servers
|
|
||||||
|
|
||||||
wally: freebsd_server
|
|
||||||
eris: truenas_server
|
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -0,0 +1,533 @@
|
|||||||
|
# Notification System
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Heartbeat Monitoring System includes a flexible notification system that can send alerts through multiple channels including Email, Pushover, Signal, and Mattermost. The system supports centralized channel definitions with per-host routing, allowing fine-grained control over notification delivery.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
1. **Notification Channels** (`notification_channels` in config)
|
||||||
|
- Centralized definitions of notification providers
|
||||||
|
- Each channel has a type and type-specific credentials
|
||||||
|
- Reusable across multiple hosts
|
||||||
|
|
||||||
|
2. **Channel Dispatcher** (`hbd/server/notify.py`)
|
||||||
|
- `pushmsg_for_host(hostname, message)`: Main entry point for host-specific notifications
|
||||||
|
- `_dispatch_to_channel(channel_name, channel_config, message)`: Routes to specific provider
|
||||||
|
- Provider functions: `pushover()`, `pushsignal()`, `pushmattermost()`, `send_email()`
|
||||||
|
|
||||||
|
3. **Configuration Utilities** (`hbd/server/config.py`)
|
||||||
|
- `get_notification_channels_for_host(config, hostname)`: Retrieves channel names for a host
|
||||||
|
- `get_notification_channels_config(config, hostname)`: Retrieves full channel configurations
|
||||||
|
- `get_channel_config(config, channel_name)`: Gets configuration for a specific channel
|
||||||
|
|
||||||
|
4. **Integration Points**
|
||||||
|
- **Threshold alerts**: `threshold.py` calls `notify_mod.pushmsg_for_host()`
|
||||||
|
- **Heartbeat events**: `udp.py` calls `notify_mod.pushmsg_for_host()` for boot/shutdown/overdue
|
||||||
|
- **Custom alerts**: Any code can call `notify_mod.pushmsg_for_host(hostname, message)`
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Centralized Channel Definitions
|
||||||
|
|
||||||
|
Define notification channels once in your configuration file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
# Signal notifications
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +1234567890 # Your Signal number
|
||||||
|
recipient: +1234567890 # Recipient number
|
||||||
|
|
||||||
|
signal_oncall:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +1234567890
|
||||||
|
recipient: +0987654321 # Different recipient
|
||||||
|
|
||||||
|
# Email notifications
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients:
|
||||||
|
- ops@example.com
|
||||||
|
- alerts@example.com
|
||||||
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
|
email_devteam:
|
||||||
|
type: email
|
||||||
|
recipients: [dev-alerts@example.com]
|
||||||
|
sender: heartbeat-dev@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat-dev@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
|
# Pushover notifications
|
||||||
|
pushover_urgent:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
|
||||||
|
pushover_normal:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: another-user-key
|
||||||
|
|
||||||
|
# Mattermost notifications
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
```
|
||||||
|
|
||||||
|
### Default Notification Channels
|
||||||
|
|
||||||
|
Specify default channels for hosts that don't have specific channel assignments:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
default_notification_channels:
|
||||||
|
- email_ops
|
||||||
|
- mattermost_devops
|
||||||
|
```
|
||||||
|
|
||||||
|
Hosts without `notification_channels` defined will use these defaults.
|
||||||
|
|
||||||
|
### Per-Host Channel Assignment
|
||||||
|
|
||||||
|
Assign specific channels to each host in the `hosts` section:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
# Critical production web server - multiple channels for redundancy
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels:
|
||||||
|
- signal_oncall # Immediate mobile notification
|
||||||
|
- pushover_urgent # Secondary mobile notification
|
||||||
|
- email_ops # Email for record keeping
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Database server - ops team notifications only
|
||||||
|
prod-db-01:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels:
|
||||||
|
- signal_ops
|
||||||
|
- email_ops
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Development server - email only, no urgent notifications
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels:
|
||||||
|
- email_devteam
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Test server - uses default_notification_channels
|
||||||
|
test-server-01:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
dyndns: false
|
||||||
|
# No notification_channels specified = uses default_notification_channels
|
||||||
|
```
|
||||||
|
|
||||||
|
## Channel Types
|
||||||
|
|
||||||
|
### Email
|
||||||
|
|
||||||
|
Sends notifications via SMTP.
|
||||||
|
|
||||||
|
**Configuration fields:**
|
||||||
|
```yaml
|
||||||
|
type: email
|
||||||
|
recipients: [email1@example.com, email2@example.com] # Required: List of recipients
|
||||||
|
sender: heartbeat@example.com # Required: From address
|
||||||
|
smtp_server: smtp.example.com # Required: SMTP server hostname
|
||||||
|
smtp_port: 587 # Optional: Default 587
|
||||||
|
smtp_user: heartbeat@example.com # Optional: For authenticated SMTP
|
||||||
|
smtp_password: your-password # Optional: For authenticated SMTP
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Supports multiple recipients
|
||||||
|
- TLS/STARTTLS support on port 587
|
||||||
|
- Authenticated and unauthenticated SMTP
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
email_critical:
|
||||||
|
type: email
|
||||||
|
recipients: [admin@example.com, oncall@example.com]
|
||||||
|
sender: alerts@example.com
|
||||||
|
smtp_server: smtp.fastmail.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: alerts@example.com
|
||||||
|
smtp_password: app-specific-password
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pushover
|
||||||
|
|
||||||
|
Sends push notifications to mobile devices via Pushover API.
|
||||||
|
|
||||||
|
**Configuration fields:**
|
||||||
|
```yaml
|
||||||
|
type: pushover
|
||||||
|
token: your-application-token # Required: Your Pushover app token
|
||||||
|
user: your-user-key # Required: Recipient's user key
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Instant mobile push notifications
|
||||||
|
- Works on iOS and Android
|
||||||
|
- Supports delivery confirmations
|
||||||
|
|
||||||
|
**Setup:**
|
||||||
|
1. Create a Pushover account at https://pushover.net
|
||||||
|
2. Create an application to get your app token
|
||||||
|
3. Note your user key from your account dashboard
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
pushover_admin:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
```
|
||||||
|
|
||||||
|
### Signal
|
||||||
|
|
||||||
|
Sends notifications via Signal messenger using signal-cli.
|
||||||
|
|
||||||
|
**Configuration fields:**
|
||||||
|
```yaml
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli # Optional: Path to signal-cli binary
|
||||||
|
user: +1234567890 # Required: Your Signal phone number
|
||||||
|
recipient: +0987654321 # Required: Recipient phone number
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisites:**
|
||||||
|
1. Install signal-cli: https://github.com/AsamK/signal-cli
|
||||||
|
2. Register signal-cli with your phone number:
|
||||||
|
```bash
|
||||||
|
signal-cli -u +1234567890 register
|
||||||
|
signal-cli -u +1234567890 verify CODE
|
||||||
|
```
|
||||||
|
3. Ensure signal-cli is in PATH or specify full path in config
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- End-to-end encrypted messaging
|
||||||
|
- Works without the phone being online
|
||||||
|
- No API fees or rate limits
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
signal_admin:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +12025551234
|
||||||
|
recipient: +12025559999
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mattermost
|
||||||
|
|
||||||
|
Sends notifications to Mattermost team chat via incoming webhooks.
|
||||||
|
|
||||||
|
**Configuration fields:**
|
||||||
|
```yaml
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com # Required: Mattermost server hostname
|
||||||
|
token: your-webhook-token # Required: Incoming webhook token
|
||||||
|
channel: channel-name # Required: Target channel name
|
||||||
|
username: heartbeat-bot # Optional: Bot display name
|
||||||
|
icon: https://example.com/icon.png # Optional: Bot icon URL
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prerequisites:**
|
||||||
|
1. Enable incoming webhooks in Mattermost
|
||||||
|
2. Create an incoming webhook for your team
|
||||||
|
3. Note the webhook token from the webhook URL
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- Team-wide visibility
|
||||||
|
- Rich formatting support
|
||||||
|
- Message threading
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
mattermost_ops:
|
||||||
|
type: mattermost
|
||||||
|
host: chat.example.com
|
||||||
|
token: abc123def456ghi789
|
||||||
|
channel: infrastructure-alerts
|
||||||
|
username: heartbeat-monitor
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notification Events
|
||||||
|
|
||||||
|
The system sends notifications for various events:
|
||||||
|
|
||||||
|
### Threshold Alerts
|
||||||
|
|
||||||
|
When monitored metrics exceed configured thresholds:
|
||||||
|
|
||||||
|
- **State changes**: OK → WARNING, WARNING → CRITICAL, CRITICAL → OK
|
||||||
|
- **Format**: `{LEVEL}: {hostname} - {metric_path} = {value} {threshold_info}`
|
||||||
|
- **Example**: `CRITICAL: prod-web-01 - cpu_monitor.cpu_percent = 95.2 (threshold: > 90.0)`
|
||||||
|
- **Re-notifications**: Periodic reminders for ongoing alerts (default: hourly)
|
||||||
|
|
||||||
|
### Heartbeat Events
|
||||||
|
|
||||||
|
Host lifecycle events:
|
||||||
|
|
||||||
|
- **Host boot**: `{hostname} booted`
|
||||||
|
- **Host shutdown**: `{hostname} {connection_type} shutdown`
|
||||||
|
- **Host recovery**: `{hostname} {connection_type} is back`
|
||||||
|
- **Connection issues**: `{hostname} {message}`
|
||||||
|
- **Host overdue**: `{hostname} {connection_type} overdue`
|
||||||
|
|
||||||
|
Only hosts with `watch: true` send heartbeat event notifications.
|
||||||
|
|
||||||
|
### Custom Alerts
|
||||||
|
|
||||||
|
Application code can send custom notifications:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from hbd.server import notify as notify_mod
|
||||||
|
|
||||||
|
# Send to host-specific channels
|
||||||
|
notify_mod.pushmsg_for_host("prod-web-01", "Custom alert message")
|
||||||
|
|
||||||
|
# Send using global config
|
||||||
|
notify_mod.pushmsg_from_config("Global notification")
|
||||||
|
|
||||||
|
# Send using an explicitly supplied config dict
|
||||||
|
notify_mod.pushmsg(custom_config_dict, "Targeted notification")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Design Principles
|
||||||
|
|
||||||
|
The notification system follows these core principles:
|
||||||
|
|
||||||
|
- **Centralization**: Define notification providers once, reference them by name
|
||||||
|
- **Flexibility**: Each host can use different channels for different notification needs
|
||||||
|
- **Redundancy**: Critical hosts can list multiple channels; each notification is dispatched to every listed channel, so a single failing provider does not silence the alert
|
||||||
|
- **Clarity**: Clean separation between channel definition and channel assignment
|
||||||
|
- **Type Safety**: Provider-specific validation at configuration time
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### Channel Organization
|
||||||
|
|
||||||
|
- **Create purpose-specific channels**: `email_ops`, `signal_oncall`, `pushover_urgent`
|
||||||
|
- **Separate by team/role**: `email_devteam`, `signal_dbateam`, `mattermost_security`
|
||||||
|
- **Use descriptive names**: Channel names appear in logs and debugging
|
||||||
|
|
||||||
|
### Redundancy
|
||||||
|
|
||||||
|
For critical hosts, use multiple notification channels:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
critical-db:
|
||||||
|
notification_channels:
|
||||||
|
- signal_oncall # Primary: Mobile alert
|
||||||
|
- pushover_urgent # Backup: Different mobile platform
|
||||||
|
- email_ops # Tertiary: Email for record-keeping
|
||||||
|
```
|
||||||
|
|
||||||
|
### Notification Fatigue Prevention
|
||||||
|
|
||||||
|
- **Use `watch: false`** for non-critical hosts
|
||||||
|
- **Configure appropriate thresholds** to avoid false positives
|
||||||
|
- **Set different channels for different severities**
|
||||||
|
- **Use `default_notification_channels`** for baseline, add more for critical systems
|
||||||
|
|
||||||
|
### Security
|
||||||
|
|
||||||
|
- **Protect credentials**: Use file permissions to protect config files with passwords/tokens
|
||||||
|
- **Rotate tokens**: Periodically rotate API tokens and passwords
|
||||||
|
- **Use app-specific passwords**: For email, use app-specific passwords instead of main account password
|
||||||
|
- **Separate accounts**: Consider separate notification accounts for different environments (prod vs dev)
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
Test notification channels before relying on them:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test signal-cli directly
|
||||||
|
signal-cli -u +1234567890 send -m "Test message" +0987654321
|
||||||
|
|
||||||
|
# Test SMTP
|
||||||
|
echo "Test" | mail -s "Test Subject" admin@example.com
|
||||||
|
|
||||||
|
# Test through the heartbeat system (the lines below are Python - run them in a Python REPL, not in the shell)
|
||||||
|
from hbd.server import notify as notify_mod, config as config_mod
|
||||||
|
cfg = config_mod.load_config(".hb.yaml")
|
||||||
|
notify_mod.setup(cfg)
|
||||||
|
notify_mod.pushmsg_for_host("test-host", "Test notification")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Notifications Not Sending
|
||||||
|
|
||||||
|
1. **Check logs**: Look for "Failed to send notification" errors
|
||||||
|
2. **Verify host is watched**: Heartbeat event notifications (boot, shutdown, overdue) are only sent for hosts with `watch: true` in the host definition
|
||||||
|
3. **Check channel configuration**: Verify credentials and settings
|
||||||
|
4. **Test channel directly**: Use command-line tools to test provider
|
||||||
|
5. **Check network**: Ensure server can reach notification endpoints
|
||||||
|
|
||||||
|
### Signal Issues
|
||||||
|
|
||||||
|
- **signal-cli not found**: Specify full path in `cli_path`
|
||||||
|
- **Not registered**: Run `signal-cli -u +NUMBER register` and verify
|
||||||
|
- **Trust issues**: Run `signal-cli -u +NUMBER receive` to sync trust store
|
||||||
|
- **Recipient not found**: Ensure recipient is in your Signal contacts
|
||||||
|
|
||||||
|
### Email Issues
|
||||||
|
|
||||||
|
- **Authentication failed**: Check SMTP username/password
|
||||||
|
- **TLS errors**: Verify SMTP port (587 for STARTTLS, 465 for SSL)
|
||||||
|
- **Relay denied**: Ensure SMTP server allows relay from your IP
|
||||||
|
- **Timeout**: Check firewall rules for SMTP ports
|
||||||
|
|
||||||
|
### Pushover Issues
|
||||||
|
|
||||||
|
- **Invalid token/user**: Verify token and user key from Pushover dashboard
|
||||||
|
- **API rate limits**: Pushover has monthly message limits on free tier
|
||||||
|
- **HTTP errors**: Check Pushover API status page
|
||||||
|
|
||||||
|
### Mattermost Issues
|
||||||
|
|
||||||
|
- **Webhook not found**: Verify webhook token and ensure webhook is enabled
|
||||||
|
- **Channel not found**: Check channel name spelling and permissions
|
||||||
|
- **Driver import error**: Install mattermostdriver: `pip install mattermostdriver`
|
||||||
|
|
||||||
|
## API Reference
|
||||||
|
|
||||||
|
### Main Functions
|
||||||
|
|
||||||
|
#### `pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict`
|
||||||
|
|
||||||
|
Send notification to host-specific channels.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `hostname`: Name of the host (used to look up notification channels)
|
||||||
|
- `msg`: Message to send
|
||||||
|
- `debug`: Debug level (0=no debug, 1+=debug output)
|
||||||
|
|
||||||
|
**Returns:** Dictionary of results per channel: `{"signal_ops": True, "email_ops": False}`
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
from hbd.server import notify as notify_mod
|
||||||
|
|
||||||
|
notify_mod.pushmsg_for_host("prod-web-01", "Server CPU at 95%")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Behavior:**
|
||||||
|
1. Looks up notification channels configured for the host
|
||||||
|
2. If no host-specific channels, uses `default_notification_channels`
|
||||||
|
3. Dispatches to each channel in parallel
|
||||||
|
4. Returns dict of results keyed by channel name
|
||||||
|
5. Logs success/failure for each channel
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### Complete Configuration Example
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Notification channel definitions
|
||||||
|
notification_channels:
|
||||||
|
signal_oncall:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +12025551234
|
||||||
|
recipient: +12025555678
|
||||||
|
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, alerts@example.com]
|
||||||
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.fastmail.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: app-password-here
|
||||||
|
|
||||||
|
# Default channels
|
||||||
|
default_notification_channels: [email_ops]
|
||||||
|
|
||||||
|
# Host definitions with channel assignments
|
||||||
|
hosts:
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multiple Environments Example
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
notification_channels:
|
||||||
|
# Production channels
|
||||||
|
signal_prod_oncall:
|
||||||
|
type: signal
|
||||||
|
user: +12025551234
|
||||||
|
recipient: +12025551111 # On-call phone
|
||||||
|
|
||||||
|
email_prod_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [prod-ops@example.com]
|
||||||
|
sender: prod-heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
|
||||||
|
# Staging channels
|
||||||
|
email_staging:
|
||||||
|
type: email
|
||||||
|
recipients: [staging-alerts@example.com]
|
||||||
|
sender: staging-heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
|
||||||
|
# Development channels
|
||||||
|
mattermost_dev:
|
||||||
|
type: mattermost
|
||||||
|
host: chat.example.com
|
||||||
|
token: dev-webhook-token
|
||||||
|
channel: dev-alerts
|
||||||
|
|
||||||
|
hosts:
|
||||||
|
prod-api-01:
|
||||||
|
notification_channels: [signal_prod_oncall, email_prod_ops]
|
||||||
|
|
||||||
|
staging-api-01:
|
||||||
|
notification_channels: [email_staging]
|
||||||
|
|
||||||
|
dev-api-01:
|
||||||
|
notification_channels: [mattermost_dev]
|
||||||
|
```
|
||||||
+90
-22
@@ -335,43 +335,111 @@ threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts
|
|||||||
|
|
||||||
### Notification Channels
|
### Notification Channels
|
||||||
|
|
||||||
Thresholds use the same notification infrastructure as heartbeat monitoring:
|
The system supports centralized notification channel definitions, allowing different hosts to use different notification providers and credentials. This provides fine-grained control over who gets notified about what.
|
||||||
|
|
||||||
|
#### Supported Channel Types
|
||||||
|
|
||||||
- **Email** (via SMTP)
|
- **Email** (via SMTP)
|
||||||
- **Pushover** (mobile notifications)
|
- **Pushover** (mobile notifications)
|
||||||
- **Mattermost** (team chat)
|
- **Signal** (via signal-cli)
|
||||||
- **Custom webhooks**
|
- **Mattermost** (team chat webhooks)
|
||||||
|
|
||||||
Configuration:
|
#### Centralized Channel Configuration
|
||||||
|
|
||||||
|
Define notification channels once in the configuration file:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# Email
|
notification_channels:
|
||||||
toemail:
|
# Signal notifications
|
||||||
- admin@example.com
|
signal_ops:
|
||||||
- oncall@example.com
|
type: signal
|
||||||
fromemail: heartbeat@example.com
|
cli_path: /usr/local/bin/signal-cli
|
||||||
smtpserver: smtp.example.com
|
user: +1234567890
|
||||||
smtpport: 587
|
recipient: +1234567890
|
||||||
smtpuser: heartbeat@example.com
|
|
||||||
smtppassword: your-password
|
# Email notifications
|
||||||
|
email_ops:
|
||||||
|
type: email
|
||||||
|
recipients: [ops@example.com, alerts@example.com]
|
||||||
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
|
# Pushover notifications
|
||||||
|
pushover_urgent:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost notifications
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
|
||||||
# Pushover
|
# Default channels for hosts that don't specify channels
|
||||||
pushover_token: your-app-token
|
default_notification_channels: [email_ops]
|
||||||
pushover_user: your-user-key
|
```
|
||||||
|
|
||||||
|
#### Per-Host Channel Assignment
|
||||||
|
|
||||||
|
Assign notification channels to specific hosts in the `hosts` section:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
hosts:
|
||||||
|
# Critical server - multiple notification channels
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Database server - ops team only
|
||||||
|
prod-db-01:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Development server - email only
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Uses default_notification_channels if not specified
|
||||||
|
test-server-01:
|
||||||
|
threshold_config: default
|
||||||
|
watch: false
|
||||||
|
dyndns: false
|
||||||
```
|
```
|
||||||
|
|
||||||
### Watched Hosts
|
### Watched Hosts
|
||||||
|
|
||||||
Only hosts in the `watchhosts` list will trigger notifications:
|
Only hosts with `watch: true` in the `hosts` section will trigger notifications:
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
watchhosts:
|
hosts:
|
||||||
- webserver01
|
webserver01:
|
||||||
- database01
|
watch: true
|
||||||
- mailserver
|
notification_channels: [email_ops]
|
||||||
|
|
||||||
|
database01:
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
|
||||||
|
mailserver:
|
||||||
|
watch: true
|
||||||
|
notification_channels: [pushover_urgent]
|
||||||
```
|
```
|
||||||
|
|
||||||
Hosts not in this list will still have thresholds checked and alert states tracked, but won't send notifications.
|
Hosts not marked for watching will still have thresholds checked and alert states tracked, but won't send notifications.
|
||||||
|
|
||||||
## Alert State Tracking
|
## Alert State Tracking
|
||||||
|
|
||||||
|
|||||||
+8
-7
@@ -115,13 +115,14 @@ class AsyncConnection:
|
|||||||
self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")
|
self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)")
|
||||||
|
|
||||||
def handle_ack(self, msg: dict, now: float):
|
def handle_ack(self, msg: dict, now: float):
|
||||||
"""Handle ACK message from server."""
|
"""Handle ACK message from server.
|
||||||
try:
|
|
||||||
self.lastack = msg.get("time", now)
|
RTT is calculated as: (time ACK received) - (time HTB sent)
|
||||||
rtt = (self.lastack - self.lastsend) * 2000.0 # Convert to ms
|
"""
|
||||||
except Exception:
|
self.lastack = now
|
||||||
self.lastack = now
|
|
||||||
rtt = (self.lastack - self.lastsend) * 1000.0
|
# Calculate RTT: time ACK received minus time HTB sent
|
||||||
|
rtt = (now - self.lastsend) * 1000.0 # Convert to ms
|
||||||
|
|
||||||
self.rtts.append(rtt)
|
self.rtts.append(rtt)
|
||||||
if len(self.rtts) > 10:
|
if len(self.rtts) > 10:
|
||||||
|
|||||||
@@ -51,13 +51,9 @@ threshold_configs:
|
|||||||
operator: ">"
|
operator: ">"
|
||||||
|
|
||||||
rtt:
|
rtt:
|
||||||
# RTT thresholds per remote host
|
# RTT thresholds (applies to all hosts)
|
||||||
router:
|
warning: 50.0 # ms
|
||||||
warning: 50.0 # ms
|
critical: 200.0
|
||||||
critical: 200.0
|
|
||||||
server1:
|
|
||||||
warning: 100.0
|
|
||||||
critical: 500.0
|
|
||||||
|
|
||||||
# High sensitivity configuration - lower thresholds for critical systems
|
# High sensitivity configuration - lower thresholds for critical systems
|
||||||
high_sensitivity:
|
high_sensitivity:
|
||||||
@@ -94,12 +90,8 @@ threshold_configs:
|
|||||||
operator: ">"
|
operator: ">"
|
||||||
|
|
||||||
rtt:
|
rtt:
|
||||||
router:
|
warning: 30.0
|
||||||
warning: 30.0
|
critical: 100.0
|
||||||
critical: 100.0
|
|
||||||
server1:
|
|
||||||
warning: 50.0
|
|
||||||
critical: 200.0
|
|
||||||
|
|
||||||
# Low sensitivity configuration - higher thresholds for development/test systems
|
# Low sensitivity configuration - higher thresholds for development/test systems
|
||||||
low_sensitivity:
|
low_sensitivity:
|
||||||
@@ -125,9 +117,8 @@ threshold_configs:
|
|||||||
operator: ">"
|
operator: ">"
|
||||||
|
|
||||||
rtt:
|
rtt:
|
||||||
router:
|
warning: 100.0
|
||||||
warning: 100.0
|
critical: 500.0
|
||||||
critical: 500.0
|
|
||||||
|
|
||||||
# Production database servers - specialized thresholds
|
# Production database servers - specialized thresholds
|
||||||
database:
|
database:
|
||||||
@@ -159,44 +150,147 @@ threshold_configs:
|
|||||||
operator: ">"
|
operator: ">"
|
||||||
|
|
||||||
rtt:
|
rtt:
|
||||||
router:
|
warning: 20.0 # Stricter latency requirements
|
||||||
warning: 20.0 # Stricter latency requirements
|
critical: 50.0
|
||||||
critical: 50.0
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
# Host to Threshold Configuration Mapping
|
# Host to Threshold Configuration Mapping
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
# Map specific hosts to specific threshold configurations
|
# Map specific hosts to specific threshold configurations
|
||||||
# Hosts not listed here will use the default_threshold_config
|
# ----------------------------------------------------------------------------
|
||||||
host_threshold_mapping:
|
# Notification Channels
|
||||||
# Critical production servers
|
# ----------------------------------------------------------------------------
|
||||||
prod-web-01: high_sensitivity
|
# Define notification providers centrally with their credentials
|
||||||
prod-web-02: high_sensitivity
|
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
|
||||||
prod-api-01: high_sensitivity
|
notification_channels:
|
||||||
|
# Signal notifications
|
||||||
|
signal_ops:
|
||||||
|
type: signal
|
||||||
|
cli_path: /usr/local/bin/signal-cli
|
||||||
|
user: +1234567890
|
||||||
|
recipient: +1234567890
|
||||||
|
|
||||||
# Database servers
|
signal_oncall:
|
||||||
prod-db-01: database
|
type: signal
|
||||||
prod-db-02: database
|
cli_path: /usr/local/bin/signal-cli
|
||||||
prod-db-replica: database
|
user: +1234567890
|
||||||
|
recipient: +0987654321
|
||||||
|
|
||||||
# Development and test systems
|
# Email notifications
|
||||||
dev-server-01: low_sensitivity
|
email_ops:
|
||||||
dev-server-02: low_sensitivity
|
type: email
|
||||||
test-server-01: low_sensitivity
|
recipients: [ops@example.com, alerts@example.com]
|
||||||
test-server-02: low_sensitivity
|
sender: heartbeat@example.com
|
||||||
|
smtp_server: smtp.example.com
|
||||||
|
smtp_port: 587
|
||||||
|
smtp_user: heartbeat@example.com
|
||||||
|
smtp_password: your-smtp-password
|
||||||
|
|
||||||
# Everything else uses 'default' (no need to list explicitly)
|
# Pushover notifications
|
||||||
|
pushover_urgent:
|
||||||
|
type: pushover
|
||||||
|
token: your-pushover-app-token
|
||||||
|
user: your-pushover-user-key
|
||||||
|
|
||||||
|
# Mattermost notifications
|
||||||
|
mattermost_devops:
|
||||||
|
type: mattermost
|
||||||
|
host: mattermost.example.com
|
||||||
|
token: your-webhook-token
|
||||||
|
channel: devops-alerts
|
||||||
|
username: heartbeat-bot
|
||||||
|
icon: https://example.com/heartbeat-icon.png
|
||||||
|
|
||||||
|
# Default notification channels (used if host doesn't specify channels)
|
||||||
|
default_notification_channels: [email_ops]
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
# Backward Compatibility Example
|
# Host Definitions (New Unified Format)
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
# The old single threshold format is still supported:
|
# Define hosts with threshold configs, monitoring, DNS, and notification settings
|
||||||
# Just use 'thresholds:' directly without 'threshold_configs:'
|
hosts:
|
||||||
|
# Critical production servers - high sensitivity, multiple notification channels
|
||||||
|
prod-web-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-web-02:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, pushover_urgent, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-api-01:
|
||||||
|
threshold_config: high_sensitivity
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_oncall, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Database servers - database-specific thresholds
|
||||||
|
prod-db-01:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-02:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops, email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
prod-db-replica:
|
||||||
|
threshold_config: database
|
||||||
|
watch: true
|
||||||
|
notification_channels: [email_ops] # Replica gets email only
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Development servers - low sensitivity, minimal notifications
|
||||||
|
dev-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false # No notifications for dev servers (thresholds are still checked)
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
dev-server-02:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
notification_channels: [email_ops]
|
||||||
|
dyndns: false
|
||||||
|
|
||||||
|
# Test servers
|
||||||
|
test-server-01:
|
||||||
|
threshold_config: low_sensitivity
|
||||||
|
watch: false
|
||||||
|
dyndns: false
|
||||||
|
# No notification channels - uses default_notification_channels
|
||||||
|
|
||||||
|
# Home server with dynamic DNS
|
||||||
|
home-server:
|
||||||
|
threshold_config: default
|
||||||
|
watch: true
|
||||||
|
notification_channels: [signal_ops]
|
||||||
|
dyndns: true # Update DNS when IP changes
|
||||||
|
|
||||||
|
# Hosts not listed in the hosts section will use:
|
||||||
|
# - default_threshold_config for thresholds (falls back to "default")
|
||||||
|
# - default_notification_channels for notifications
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
# Notes on Configuration Structure
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# All per-host configuration is centralized in the hosts section. Each host can specify:
|
||||||
|
# - threshold_config: Name of threshold configuration to use
|
||||||
|
# - watch: Whether to monitor this host actively (send notifications)
|
||||||
|
# - notification_channels: List of channels to use for this host
|
||||||
|
# - dyndns: Whether to update DNS when IP address changes
|
||||||
#
|
#
|
||||||
# thresholds:
|
# Notification channels are defined once at the top level and referenced
|
||||||
# cpu_monitor:
|
# by name in host definitions, allowing easy reuse and updates.
|
||||||
# cpu_percent:
|
|
||||||
# warning: 80.0
|
|
||||||
# critical: 90.0
|
|
||||||
#
|
#
|
||||||
# This will apply the same thresholds to all hosts.
|
# For hosts not explicitly listed, the system will still accept heartbeats
|
||||||
|
# and track their state, but won't apply thresholds or send notifications
|
||||||
|
# unless default settings are configured.
|
||||||
|
|||||||
+165
-14
@@ -21,10 +21,9 @@ SERVER_DEFAULTS = {
|
|||||||
"logfile": "/var/log/heartbeat.log",
|
"logfile": "/var/log/heartbeat.log",
|
||||||
"logfmt": "text", # text or msg or json
|
"logfmt": "text", # text or msg or json
|
||||||
|
|
||||||
# Notification settings
|
# Notification channels
|
||||||
"pushsrv": "pushover", # pushover, mattermost, or all
|
"notification_channels": {}, # Named channels with type and credentials
|
||||||
"pushover_token": "",
|
"default_notification_channels": [], # Default channels if host doesn't specify
|
||||||
"pushover_user": "",
|
|
||||||
|
|
||||||
# Monitoring settings
|
# Monitoring settings
|
||||||
"interval": 20, # Expected heartbeat interval (for server checks)
|
"interval": 20, # Expected heartbeat interval (for server checks)
|
||||||
@@ -32,22 +31,15 @@ SERVER_DEFAULTS = {
|
|||||||
"threshold_renotify_interval": 3600, # Seconds between threshold re-notifications
|
"threshold_renotify_interval": 3600, # Seconds between threshold re-notifications
|
||||||
|
|
||||||
# Host management
|
# Host management
|
||||||
"watchhosts": [], # Hosts to monitor and notify about
|
"hosts": {}, # New unified host definitions (optional)
|
||||||
"dyndnshosts": [], # Hosts with dynamic DNS
|
"watchhosts": [], # Hosts to monitor and notify about (legacy)
|
||||||
|
"dyndnshosts": [], # Hosts with dynamic DNS (legacy)
|
||||||
"drophosts": [], # Hosts to ignore
|
"drophosts": [], # Hosts to ignore
|
||||||
"dyndomains": ["wrede.org"],
|
"dyndomains": ["wrede.org"],
|
||||||
|
|
||||||
# DNS updates
|
# DNS updates
|
||||||
"nsupdate_bin": "/usr/bin/nsupdate",
|
"nsupdate_bin": "/usr/bin/nsupdate",
|
||||||
|
|
||||||
# Email settings
|
|
||||||
"smtpserver": "smtp.fastmail.com",
|
|
||||||
"smtpuser": "andreas@wrede.ca",
|
|
||||||
"smtppassword": "pvtvefyp5gbhnch2",
|
|
||||||
"smtpport": 587,
|
|
||||||
"toemail": ["aew.hbd.notify@wrede.ca"],
|
|
||||||
"fromemail": "aew.hbd@wrede.ca",
|
|
||||||
|
|
||||||
# WebSocket settings
|
# WebSocket settings
|
||||||
"ws_port": 50005,
|
"ws_port": 50005,
|
||||||
"wss_port": None,
|
"wss_port": None,
|
||||||
@@ -101,3 +93,162 @@ def load_config(path=None):
|
|||||||
# yaml not installed: do not attempt to parse; user must ensure defaults
|
# yaml not installed: do not attempt to parse; user must ensure defaults
|
||||||
pass
|
pass
|
||||||
return cfg
|
return cfg
|
||||||
|
|
||||||
|
|
||||||
|
def get_watchhosts(config):
|
||||||
|
"""Extract watchhosts from config, supporting both new and legacy formats.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of hostnames to watch
|
||||||
|
"""
|
||||||
|
watchhosts = []
|
||||||
|
|
||||||
|
# New format: hosts section with watch attribute
|
||||||
|
if "hosts" in config:
|
||||||
|
hosts_config = config["hosts"]
|
||||||
|
if isinstance(hosts_config, dict):
|
||||||
|
for host_name, host_attrs in hosts_config.items():
|
||||||
|
if isinstance(host_attrs, dict) and host_attrs.get("watch", False):
|
||||||
|
watchhosts.append(host_name)
|
||||||
|
|
||||||
|
# Legacy format: watchhosts list
|
||||||
|
if "watchhosts" in config:
|
||||||
|
legacy_watchhosts = config.get("watchhosts", [])
|
||||||
|
if isinstance(legacy_watchhosts, (list, set)):
|
||||||
|
watchhosts.extend(legacy_watchhosts)
|
||||||
|
elif isinstance(legacy_watchhosts, dict):
|
||||||
|
# Old dict format: {"host1": {attrs}, "host2": {attrs}}
|
||||||
|
watchhosts.extend(legacy_watchhosts.keys())
|
||||||
|
|
||||||
|
return list(set(watchhosts)) # Remove duplicates
|
||||||
|
|
||||||
|
|
||||||
|
def get_dyndnshosts(config):
|
||||||
|
"""Extract dyndnshosts from config, supporting both new and legacy formats.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of hostnames with dynamic DNS
|
||||||
|
"""
|
||||||
|
dyndnshosts = []
|
||||||
|
|
||||||
|
# New format: hosts section with dyndns attribute
|
||||||
|
if "hosts" in config:
|
||||||
|
hosts_config = config["hosts"]
|
||||||
|
if isinstance(hosts_config, dict):
|
||||||
|
for host_name, host_attrs in hosts_config.items():
|
||||||
|
if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False):
|
||||||
|
dyndnshosts.append(host_name)
|
||||||
|
|
||||||
|
# Legacy format: dyndnshosts list/set
|
||||||
|
if "dyndnshosts" in config:
|
||||||
|
legacy_dyndnshosts = config.get("dyndnshosts", [])
|
||||||
|
if isinstance(legacy_dyndnshosts, (list, set)):
|
||||||
|
dyndnshosts.extend(legacy_dyndnshosts)
|
||||||
|
|
||||||
|
return list(set(dyndnshosts)) # Remove duplicates
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_config(config, hostname):
|
||||||
|
"""Get configuration for a specific host.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
hostname: Host name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with host attributes or empty dict
|
||||||
|
"""
|
||||||
|
if "hosts" in config:
|
||||||
|
hosts_config = config.get("hosts", {})
|
||||||
|
if isinstance(hosts_config, dict) and hostname in hosts_config:
|
||||||
|
return hosts_config[hostname] if isinstance(hosts_config[hostname], dict) else {}
|
||||||
|
|
||||||
|
# Check legacy watchhosts for notification settings
|
||||||
|
if "watchhosts" in config:
|
||||||
|
watchhosts = config.get("watchhosts", {})
|
||||||
|
if isinstance(watchhosts, dict) and hostname in watchhosts:
|
||||||
|
legacy_attrs = watchhosts[hostname]
|
||||||
|
if isinstance(legacy_attrs, dict):
|
||||||
|
# Convert legacy format to new format
|
||||||
|
return {
|
||||||
|
"watch": True,
|
||||||
|
"notify": legacy_attrs.get("notify"),
|
||||||
|
"notify_src": legacy_attrs.get("src"),
|
||||||
|
}
|
||||||
|
|
||||||
|
return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_notification_channels_for_host(config, hostname):
|
||||||
|
"""Get notification channels configured for a specific host.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
hostname: Host name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of channel names to use for this host
|
||||||
|
"""
|
||||||
|
host_config = get_host_config(config, hostname)
|
||||||
|
|
||||||
|
# Check if host specifies notification channels
|
||||||
|
channels = host_config.get("notification_channels", [])
|
||||||
|
if channels:
|
||||||
|
if isinstance(channels, str):
|
||||||
|
return [channels]
|
||||||
|
elif isinstance(channels, list):
|
||||||
|
return channels
|
||||||
|
|
||||||
|
# Fall back to default channels
|
||||||
|
default_channels = config.get("default_notification_channels", [])
|
||||||
|
if default_channels:
|
||||||
|
if isinstance(default_channels, str):
|
||||||
|
return [default_channels]
|
||||||
|
elif isinstance(default_channels, list):
|
||||||
|
return default_channels
|
||||||
|
|
||||||
|
# No channels configured: return an empty list (the caller then sends nothing; pushmsg_for_host logs a warning)
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def get_channel_config(config, channel_name):
|
||||||
|
"""Get configuration for a specific notification channel.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
channel_name: Name of the notification channel
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with channel configuration or None if not found
|
||||||
|
"""
|
||||||
|
channels = config.get("notification_channels", {})
|
||||||
|
if isinstance(channels, dict) and channel_name in channels:
|
||||||
|
return channels[channel_name]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_notification_channels_config(config, hostname):
|
||||||
|
"""Get list of notification channel configurations for a host.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Configuration dictionary
|
||||||
|
hostname: Host name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of (channel_name, channel_config) tuples
|
||||||
|
"""
|
||||||
|
channel_names = get_notification_channels_for_host(config, hostname)
|
||||||
|
|
||||||
|
channels = []
|
||||||
|
for channel_name in channel_names:
|
||||||
|
channel_config = get_channel_config(config, channel_name)
|
||||||
|
if channel_config and channel_config.get("type"):
|
||||||
|
channels.append((channel_name, channel_config))
|
||||||
|
|
||||||
|
return channels
|
||||||
|
|||||||
+2
-12
@@ -136,16 +136,7 @@ async def dns_update_worker(
|
|||||||
)
|
)
|
||||||
if err:
|
if err:
|
||||||
m += f", DNS update failed: {err}"
|
m += f", DNS update failed: {err}"
|
||||||
if pushmsg:
|
logger.error("DNS update failed for %s: %s", name, err)
|
||||||
try:
|
|
||||||
await loop.run_in_executor(
|
|
||||||
None,
|
|
||||||
pushmsg,
|
|
||||||
"error: nsupdate failed",
|
|
||||||
f"{name}.dy.{dyndomain}: {m}",
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
else:
|
else:
|
||||||
m += ", DNS updated."
|
m += ", DNS updated."
|
||||||
|
|
||||||
@@ -171,7 +162,6 @@ def start_dns_worker(
|
|||||||
hbdclass,
|
hbdclass,
|
||||||
cfg: dict,
|
cfg: dict,
|
||||||
log: Optional[callable] = None,
|
log: Optional[callable] = None,
|
||||||
pushmsg: Optional[callable] = None,
|
|
||||||
loop: Optional[asyncio.AbstractEventLoop] = None,
|
loop: Optional[asyncio.AbstractEventLoop] = None,
|
||||||
):
|
):
|
||||||
"""Start the async DNS worker and return the Task.
|
"""Start the async DNS worker and return the Task.
|
||||||
@@ -218,7 +208,7 @@ def start_dns_worker(
|
|||||||
|
|
||||||
task = loop.create_task(
|
task = loop.create_task(
|
||||||
dns_update_worker(
|
dns_update_worker(
|
||||||
hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop
|
hbdclass, cfg, async_queue=async_q, log=log, loop=loop
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return task
|
return task
|
||||||
|
|||||||
@@ -25,12 +25,7 @@ async def start(
|
|||||||
port: int,
|
port: int,
|
||||||
config,
|
config,
|
||||||
hbdclass,
|
hbdclass,
|
||||||
log=None,
|
|
||||||
email=None,
|
|
||||||
pushmsg=None,
|
|
||||||
msg_to_websockets=None,
|
|
||||||
tcss=None,
|
tcss=None,
|
||||||
DEBUG=0,
|
|
||||||
verbose=False,
|
verbose=False,
|
||||||
get_now=None,
|
get_now=None,
|
||||||
VER="",
|
VER="",
|
||||||
|
|||||||
+4
-11
@@ -79,14 +79,11 @@ async def _run_async(config):
|
|||||||
# Initialize threshold checker
|
# Initialize threshold checker
|
||||||
threshold_checker = threshold_mod.ThresholdChecker(
|
threshold_checker = threshold_mod.ThresholdChecker(
|
||||||
config=config,
|
config=config,
|
||||||
notification_callback=notify_mod.pushmsg_from_config,
|
|
||||||
renotify_interval=config.get("threshold_renotify_interval", 3600),
|
renotify_interval=config.get("threshold_renotify_interval", 3600),
|
||||||
journal=msg_journal,
|
journal=msg_journal,
|
||||||
)
|
)
|
||||||
logger.info("Threshold checker initialized")
|
logger.info("Threshold checker initialized")
|
||||||
|
|
||||||
pushmsg = notify_mod.pushmsg_from_config
|
|
||||||
|
|
||||||
sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
|
sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
|
||||||
# Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
|
# Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well)
|
||||||
# This option is system-dependent; on many systems, setting it to False enables
|
# This option is system-dependent; on many systems, setting it to False enables
|
||||||
@@ -110,7 +107,6 @@ async def _run_async(config):
|
|||||||
config=config,
|
config=config,
|
||||||
hbdclass=hbdclass,
|
hbdclass=hbdclass,
|
||||||
log=eventlog,
|
log=eventlog,
|
||||||
pushmsg=pushmsg,
|
|
||||||
msg_to_websockets=msg_to_websockets,
|
msg_to_websockets=msg_to_websockets,
|
||||||
msg_journal=msg_journal,
|
msg_journal=msg_journal,
|
||||||
threshold_checker=threshold_checker,
|
threshold_checker=threshold_checker,
|
||||||
@@ -132,12 +128,8 @@ async def _run_async(config):
|
|||||||
port=config.get("hbd_port", 50004),
|
port=config.get("hbd_port", 50004),
|
||||||
config=config,
|
config=config,
|
||||||
hbdclass=hbdclass,
|
hbdclass=hbdclass,
|
||||||
log=eventlog,
|
|
||||||
pushmsg=pushmsg,
|
|
||||||
msg_to_websockets=msg_to_websockets,
|
|
||||||
threshold_checker=threshold_checker,
|
threshold_checker=threshold_checker,
|
||||||
tcss=None,
|
tcss=None,
|
||||||
DEBUG=config.get("debug", 0),
|
|
||||||
verbose=config.get("verbose", False),
|
verbose=config.get("verbose", False),
|
||||||
get_now=lambda: time.time(),
|
get_now=lambda: time.time(),
|
||||||
VER="",
|
VER="",
|
||||||
@@ -155,7 +147,7 @@ async def _run_async(config):
|
|||||||
dns_task = None
|
dns_task = None
|
||||||
try:
|
try:
|
||||||
dns_task = dns_mod.start_dns_worker(
|
dns_task = dns_mod.start_dns_worker(
|
||||||
hbdclass, config, log=eventlog, pushmsg=pushmsg, loop=loop
|
hbdclass, config, log=eventlog, loop=loop
|
||||||
)
|
)
|
||||||
logger.info("dns update worker started")
|
logger.info("dns update worker started")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -273,10 +265,11 @@ def load_pickled_hosts(config, hbdclass):
|
|||||||
"""Load pickled hosts from file, if available."""
|
"""Load pickled hosts from file, if available."""
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
|
from . import config as config_mod
|
||||||
|
|
||||||
pickfile = config.get("pickfile", "hbd.pickle")
|
pickfile = config.get("pickfile", "hbd.pickle")
|
||||||
dyndnshosts = config.get("dyndnshosts", [])
|
dyndnshosts = config_mod.get_dyndnshosts(config)
|
||||||
watchhosts = config.get("watchhosts", [])
|
watchhosts = config_mod.get_watchhosts(config)
|
||||||
drophosts = config.get("drophosts", [])
|
drophosts = config.get("drophosts", [])
|
||||||
if 1 and os.path.exists(pickfile):
|
if 1 and os.path.exists(pickfile):
|
||||||
if config.get("verbose", False):
|
if config.get("verbose", False):
|
||||||
|
|||||||
+117
-49
@@ -190,55 +190,123 @@ def pushsignal(
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def pushmsg(cfg: dict, msg: str, debug: int = 0):
|
def _dispatch_to_channel(channel_name: str, channel_config: dict, msg: str, debug: int = 0) -> bool:
|
||||||
"""Dispatch push notifications according to `cfg['pushsrv']`.
|
"""Dispatch a message to a specific notification channel.
|
||||||
|
|
||||||
cfg is expected to contain keys for different services when needed, e.g.
|
|
||||||
- cfg['pushsrv'] : one of 'all', 'pushover', 'mattermost', 'signal'
|
|
||||||
- cfg['pushover_token'], cfg['pushover_user']
|
|
||||||
- cfg['matter_host'], cfg['matter_token'], cfg['matter_channel']
|
|
||||||
- cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient']
|
|
||||||
|
|
||||||
Returns a dict of results per provider.
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channel_name: Name of the channel (for logging)
|
||||||
|
channel_config: Channel configuration dictionary with 'type' and type-specific fields
|
||||||
|
msg: Message to send
|
||||||
|
debug: Debug level
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if notification sent successfully, False otherwise
|
||||||
|
"""
|
||||||
|
channel_type = channel_config.get("type")
|
||||||
|
|
||||||
|
if channel_type == "pushover":
|
||||||
|
return pushover(
|
||||||
|
channel_config.get("token", ""),
|
||||||
|
channel_config.get("user", ""),
|
||||||
|
msg,
|
||||||
|
debug=debug
|
||||||
|
)
|
||||||
|
|
||||||
|
elif channel_type == "email":
|
||||||
|
# Build email from channel config
|
||||||
|
recipients = channel_config.get("recipients", [])
|
||||||
|
sender = channel_config.get("sender", "")
|
||||||
|
smtp_server = channel_config.get("smtp_server", "")
|
||||||
|
smtp_port = channel_config.get("smtp_port", 587)
|
||||||
|
smtp_user = channel_config.get("smtp_user")
|
||||||
|
smtp_password = channel_config.get("smtp_password")
|
||||||
|
|
||||||
|
if not recipients or not sender or not smtp_server:
|
||||||
|
logger.warning(
|
||||||
|
"Email channel '%s' missing required fields: recipients=%s, sender=%s, smtp_server=%s",
|
||||||
|
channel_name, recipients, sender, smtp_server
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Temporarily update _config for email() function
|
||||||
|
old_config = dict(_config)
|
||||||
|
_config["toemail"] = recipients
|
||||||
|
_config["fromemail"] = sender
|
||||||
|
_config["smtpserver"] = smtp_server
|
||||||
|
_config["smtpport"] = smtp_port
|
||||||
|
if smtp_user:
|
||||||
|
_config["smtpuser"] = smtp_user
|
||||||
|
if smtp_password:
|
||||||
|
_config["smtppassword"] = smtp_password
|
||||||
|
|
||||||
|
result = email("Heartbeat notification", msg, debug=debug)
|
||||||
|
|
||||||
|
# Restore config
|
||||||
|
_config.clear()
|
||||||
|
_config.update(old_config)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
elif channel_type == "signal":
|
||||||
|
return pushsignal(
|
||||||
|
channel_config.get("cli_path", "/usr/local/bin/signal-cli"),
|
||||||
|
channel_config.get("user", ""),
|
||||||
|
channel_config.get("recipient", ""),
|
||||||
|
msg,
|
||||||
|
debug=debug
|
||||||
|
)
|
||||||
|
|
||||||
|
elif channel_type == "mattermost":
|
||||||
|
return pushmattermost(
|
||||||
|
channel_config.get("host", ""),
|
||||||
|
channel_config.get("token", ""),
|
||||||
|
channel_config.get("channel", ""),
|
||||||
|
msg,
|
||||||
|
username=channel_config.get("username", "hbd"),
|
||||||
|
icon=channel_config.get("icon"),
|
||||||
|
debug=debug
|
||||||
|
)
|
||||||
|
|
||||||
|
else:
|
||||||
|
logger.warning("Unknown channel type '%s' for channel '%s'", channel_type, channel_name)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict:
|
||||||
|
"""Send notification for a specific host using its configured channels.
|
||||||
|
|
||||||
|
This function looks up the host's notification channels from the config
|
||||||
|
and sends the message to those channels.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hostname: Name of the host to send notification for
|
||||||
|
msg: Message to send
|
||||||
|
debug: Debug level
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of results per channel: {"channel_name": True/False}
|
||||||
|
"""
|
||||||
|
from . import config as config_mod
|
||||||
|
|
||||||
|
# Get notification channels for this host
|
||||||
|
channels = config_mod.get_notification_channels_config(_config, hostname)
|
||||||
|
|
||||||
|
if not channels:
|
||||||
|
logger.warning("No notification channels configured for host '%s'", hostname)
|
||||||
|
return {}
|
||||||
|
|
||||||
|
# Dispatch to each channel
|
||||||
results = {}
|
results = {}
|
||||||
p = cfg.get("pushsrv", "pushover")
|
for channel_name, channel_config in channels:
|
||||||
if p in ("all", "pushover"):
|
try:
|
||||||
ok = pushover(
|
success = _dispatch_to_channel(channel_name, channel_config, msg, debug=debug)
|
||||||
cfg.get("pushover_token", ""),
|
results[channel_name] = success
|
||||||
cfg.get("pushover_user", ""),
|
if success:
|
||||||
msg,
|
logger.info("Notification sent to channel '%s': %s", channel_name, msg)
|
||||||
debug=debug,
|
else:
|
||||||
)
|
logger.warning("Failed to send notification to channel '%s'", channel_name)
|
||||||
results["pushover"] = ok
|
except Exception as e:
|
||||||
if p in ("all", "mattermost"):
|
logger.error("Error sending to channel '%s': %s", channel_name, e)
|
||||||
ok = pushmattermost(
|
results[channel_name] = False
|
||||||
cfg.get("matter_host", ""),
|
|
||||||
cfg.get("matter_token", ""),
|
|
||||||
cfg.get("matter_channel", ""),
|
|
||||||
msg,
|
|
||||||
username=cfg.get("matter_username", "hbd"),
|
|
||||||
icon=cfg.get("matter_icon"),
|
|
||||||
debug=debug,
|
|
||||||
)
|
|
||||||
results["mattermost"] = ok
|
|
||||||
if p in ("all", "signal"):
|
|
||||||
ok = pushsignal(
|
|
||||||
cfg.get("signal_cli", "/usr/local/bin/signal-cli"),
|
|
||||||
cfg.get("signal_user", ""),
|
|
||||||
cfg.get("signal_recipient", ""),
|
|
||||||
msg,
|
|
||||||
debug=debug,
|
|
||||||
)
|
|
||||||
results["signal"] = ok
|
|
||||||
if p in ("all", "email"):
|
|
||||||
ok = email("Heartbeat notification", msg, debug=debug)
|
|
||||||
results["email"] = ok
|
|
||||||
logger.debug("push results: %s", results)
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def pushmsg_from_config(msg: str, debug: int = 0) -> dict:
|
|
||||||
"""Use the module-level configuration dict to dispatch a push message."""
|
|
||||||
return pushmsg(_config, msg, debug=debug)
|
|
||||||
|
|||||||
+64
-58
@@ -275,7 +275,6 @@ class ThresholdChecker:
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
config: Dict[str, Any],
|
config: Dict[str, Any],
|
||||||
notification_callback: Optional[Callable] = None,
|
|
||||||
renotify_interval: int = 3600,
|
renotify_interval: int = 3600,
|
||||||
journal: Optional[Any] = None,
|
journal: Optional[Any] = None,
|
||||||
):
|
):
|
||||||
@@ -284,7 +283,6 @@ class ThresholdChecker:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: Threshold configuration dictionary from YAML
|
config: Threshold configuration dictionary from YAML
|
||||||
notification_callback: Function to call for notifications
|
|
||||||
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
renotify_interval: Seconds between repeat notifications (default: 1 hour)
|
||||||
journal: Optional MessageJournal instance for logging threshold events
|
journal: Optional MessageJournal instance for logging threshold events
|
||||||
"""
|
"""
|
||||||
@@ -300,7 +298,6 @@ class ThresholdChecker:
|
|||||||
# Default config name to use when no mapping exists
|
# Default config name to use when no mapping exists
|
||||||
self.default_config = "default"
|
self.default_config = "default"
|
||||||
|
|
||||||
self.notification_callback = notification_callback
|
|
||||||
self.renotify_interval = renotify_interval
|
self.renotify_interval = renotify_interval
|
||||||
self.journal = journal
|
self.journal = journal
|
||||||
|
|
||||||
@@ -367,8 +364,20 @@ class ThresholdChecker:
|
|||||||
target_dict=self.threshold_configs[config_name]
|
target_dict=self.threshold_configs[config_name]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Parse host to config mapping
|
# Parse host to config mapping from two possible sources
|
||||||
self.host_config_mapping = config.get("host_threshold_mapping", {})
|
# 1. New format: hosts section with threshold_config attribute
|
||||||
|
if "hosts" in config:
|
||||||
|
hosts_config = config["hosts"]
|
||||||
|
if isinstance(hosts_config, dict):
|
||||||
|
for host_name, host_attrs in hosts_config.items():
|
||||||
|
if isinstance(host_attrs, dict) and "threshold_config" in host_attrs:
|
||||||
|
self.host_config_mapping[host_name] = host_attrs["threshold_config"]
|
||||||
|
|
||||||
|
# 2. Legacy format: host_threshold_mapping section (for backward compatibility)
|
||||||
|
if "host_threshold_mapping" in config:
|
||||||
|
legacy_mapping = config.get("host_threshold_mapping", {})
|
||||||
|
if isinstance(legacy_mapping, dict):
|
||||||
|
self.host_config_mapping.update(legacy_mapping)
|
||||||
|
|
||||||
# Set default config (first one alphabetically or explicitly set)
|
# Set default config (first one alphabetically or explicitly set)
|
||||||
self.default_config = config.get("default_threshold_config", "default")
|
self.default_config = config.get("default_threshold_config", "default")
|
||||||
@@ -513,14 +522,13 @@ class ThresholdChecker:
|
|||||||
rtt_thresholds: Dict[str, Any],
|
rtt_thresholds: Dict[str, Any],
|
||||||
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
target_dict: Optional[Dict[str, ThresholdConfig]] = None
|
||||||
):
|
):
|
||||||
"""Parse RTT thresholds (per-host network latency thresholds).
|
"""Parse RTT thresholds (network latency thresholds).
|
||||||
|
|
||||||
RTT thresholds are configured as:
|
RTT thresholds are configured as:
|
||||||
thresholds:
|
thresholds:
|
||||||
rtt:
|
rtt:
|
||||||
hostname1:
|
warning: 100.0 # ms
|
||||||
warning: 100.0 # ms
|
critical: 500.0 # ms
|
||||||
critical: 500.0 # ms
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
rtt_thresholds: RTT threshold configuration
|
rtt_thresholds: RTT threshold configuration
|
||||||
@@ -529,41 +537,39 @@ class ThresholdChecker:
|
|||||||
if target_dict is None:
|
if target_dict is None:
|
||||||
target_dict = self.thresholds
|
target_dict = self.thresholds
|
||||||
|
|
||||||
for hostname, threshold_config in rtt_thresholds.items():
|
if not isinstance(rtt_thresholds, dict):
|
||||||
if not isinstance(threshold_config, dict):
|
return
|
||||||
continue
|
|
||||||
|
# Metric path is simply "rtt" (not per-host)
|
||||||
# Metric path is "rtt.<hostname>"
|
metric_path = "rtt"
|
||||||
metric_path = f"rtt.{hostname}"
|
|
||||||
|
warning = rtt_thresholds.get("warning")
|
||||||
warning = threshold_config.get("warning")
|
critical = rtt_thresholds.get("critical")
|
||||||
critical = threshold_config.get("critical")
|
operator = rtt_thresholds.get("operator", ">")
|
||||||
operator = threshold_config.get("operator", ">")
|
hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default
|
||||||
hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default
|
enabled = rtt_thresholds.get("enabled", True)
|
||||||
enabled = threshold_config.get("enabled", True)
|
display = rtt_thresholds.get("display")
|
||||||
display = threshold_config.get("display")
|
|
||||||
|
if warning is None and critical is None:
|
||||||
if warning is None and critical is None:
|
logger.warning("No RTT thresholds defined, skipping")
|
||||||
logger.warning("No RTT thresholds defined for %s, skipping", hostname)
|
return
|
||||||
continue
|
|
||||||
|
threshold = ThresholdConfig(
|
||||||
threshold = ThresholdConfig(
|
metric_path=metric_path,
|
||||||
metric_path=metric_path,
|
warning=warning,
|
||||||
warning=warning,
|
critical=critical,
|
||||||
critical=critical,
|
operator=operator,
|
||||||
operator=operator,
|
hysteresis=hysteresis,
|
||||||
hysteresis=hysteresis,
|
enabled=enabled,
|
||||||
enabled=enabled,
|
display=display
|
||||||
display=display
|
)
|
||||||
)
|
|
||||||
|
target_dict[metric_path] = threshold
|
||||||
target_dict[metric_path] = threshold
|
logger.debug(
|
||||||
logger.debug(
|
"Registered RTT threshold: warn=%s ms, crit=%s ms",
|
||||||
"Registered RTT threshold for %s: warn=%s ms, crit=%s ms",
|
warning,
|
||||||
hostname,
|
critical
|
||||||
warning,
|
)
|
||||||
critical
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]:
|
||||||
"""Get the appropriate threshold configuration for a host.
|
"""Get the appropriate threshold configuration for a host.
|
||||||
@@ -887,12 +893,12 @@ class ThresholdChecker:
|
|||||||
value: Any,
|
value: Any,
|
||||||
):
|
):
|
||||||
"""Send notification and log to journal/eventlog."""
|
"""Send notification and log to journal/eventlog."""
|
||||||
if self.notification_callback is not None:
|
# Send notification using host-specific channels
|
||||||
try:
|
try:
|
||||||
self.notification_callback(f"{lvl}: {host_name} - {message}")
|
notify_mod.pushmsg_for_host(host_name, f"{lvl}: {host_name} - {message}")
|
||||||
logger.info("Notification sent: %s", message)
|
logger.info("Notification sent: %s", message)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to send notification: %s", e)
|
logger.error("Failed to send notification: %s", e)
|
||||||
|
|
||||||
# Log to journal
|
# Log to journal
|
||||||
if self.journal is not None:
|
if self.journal is not None:
|
||||||
@@ -1017,14 +1023,14 @@ class ThresholdChecker:
|
|||||||
else:
|
else:
|
||||||
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)"
|
||||||
|
|
||||||
if self.notification_callback:
|
# Send re-notification using host-specific channels
|
||||||
try:
|
try:
|
||||||
self.notification_callback(message)
|
notify_mod.pushmsg_for_host(host_name, message)
|
||||||
alert_state.last_notification = now
|
alert_state.last_notification = now
|
||||||
alert_state.notification_count += 1
|
alert_state.notification_count += 1
|
||||||
logger.info("Re-notification sent: %s", message)
|
logger.info("Re-notification sent: %s", message)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("Failed to send re-notification: %s", e)
|
logger.error("Failed to send re-notification: %s", e)
|
||||||
|
|
||||||
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list:
|
||||||
"""
|
"""
|
||||||
|
|||||||
+33
-24
@@ -68,7 +68,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
- config: dict of configuration
|
- config: dict of configuration
|
||||||
- hbdclass: module providing Host/Connection classes
|
- hbdclass: module providing Host/Connection classes
|
||||||
- log: callable(loghost, message)
|
- log: callable(loghost, message)
|
||||||
- pushmsg: callable(message)
|
|
||||||
- msg_to_websockets: callable(typ, data)
|
- msg_to_websockets: callable(typ, data)
|
||||||
- msg_journal: MessageJournal instance for logging all messages
|
- msg_journal: MessageJournal instance for logging all messages
|
||||||
- DEBUG, verbose
|
- DEBUG, verbose
|
||||||
@@ -91,7 +90,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
cfg = ctx.get("config", {})
|
cfg = ctx.get("config", {})
|
||||||
hbdcls = ctx.get("hbdclass")
|
hbdcls = ctx.get("hbdclass")
|
||||||
log = ctx.get("log")
|
log = ctx.get("log")
|
||||||
pushmsg = ctx.get("pushmsg")
|
|
||||||
msg_to_websockets = ctx.get("msg_to_websockets")
|
msg_to_websockets = ctx.get("msg_to_websockets")
|
||||||
DEBUG = ctx.get("DEBUG", 0)
|
DEBUG = ctx.get("DEBUG", 0)
|
||||||
verbose = ctx.get("verbose", False)
|
verbose = ctx.get("verbose", False)
|
||||||
@@ -100,18 +98,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
|
ip = addr[0] if isinstance(addr, (list, tuple)) else addr
|
||||||
name = msg.get("name", "unknown")
|
name = msg.get("name", "unknown")
|
||||||
from ..common.utils import shortname
|
from ..common.utils import shortname
|
||||||
|
from . import config as config_mod
|
||||||
|
|
||||||
uname = shortname(name)
|
uname = shortname(name)
|
||||||
|
|
||||||
if uname not in hbdcls.Host.hosts:
|
if uname not in hbdcls.Host.hosts:
|
||||||
host = hbdcls.Host(uname)
|
host = hbdcls.Host(uname)
|
||||||
host.dyn = uname in cfg.get("dyndnshosts", [])
|
# Use new config function to check dyndns
|
||||||
|
dyndnshosts = config_mod.get_dyndnshosts(cfg)
|
||||||
|
host.dyn = uname in dyndnshosts
|
||||||
if verbose:
|
if verbose:
|
||||||
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
|
print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts))))
|
||||||
newh = True
|
newh = True
|
||||||
else:
|
else:
|
||||||
host = hbdcls.Host.hosts[uname]
|
host = hbdcls.Host.hosts[uname]
|
||||||
newh = False
|
newh = False
|
||||||
|
|
||||||
|
# Get watchhosts once for use throughout message handling
|
||||||
|
watchhosts = config_mod.get_watchhosts(cfg)
|
||||||
|
|
||||||
cid = msg.get("id", 0)
|
cid = msg.get("id", 0)
|
||||||
try:
|
try:
|
||||||
@@ -181,9 +185,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
|
|
||||||
if res:
|
if res:
|
||||||
eventlog(uname, "WARNING", res)
|
eventlog(uname, "WARNING", res)
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res))
|
||||||
pushmsg("%s %s" % (host.name, res))
|
|
||||||
|
|
||||||
interval = int(msg.get("interval", 0) or 0)
|
interval = int(msg.get("interval", 0) or 0)
|
||||||
shutdown = msg.get("shutdown", 0)
|
shutdown = msg.get("shutdown", 0)
|
||||||
@@ -193,15 +196,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
|
|
||||||
if boot:
|
if boot:
|
||||||
eventlog(uname, "INFO", "booted")
|
eventlog(uname, "INFO", "booted")
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
m = "%s booted" % (host.name)
|
m = "%s booted" % (host.name)
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, m)
|
||||||
pushmsg(m)
|
|
||||||
if message:
|
if message:
|
||||||
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
eventlog(uname, "INFO", "msg: %s" % message, service=service)
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, message)
|
||||||
pushmsg(message)
|
|
||||||
|
|
||||||
if conn.getstate() != hbdcls.Connection.UP:
|
if conn.getstate() != hbdcls.Connection.UP:
|
||||||
lasts = conn.state
|
lasts = conn.state
|
||||||
@@ -211,9 +212,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
else:
|
else:
|
||||||
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d))
|
||||||
eventlog(uname, "RECOVER", m)
|
eventlog(uname, "RECOVER", m)
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam))
|
||||||
pushmsg("%s %s is back" % (uname, conn.afam))
|
|
||||||
|
|
||||||
if boot or newh:
|
if boot or newh:
|
||||||
host.upcount = host.doesack
|
host.upcount = host.doesack
|
||||||
@@ -222,9 +222,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
|
|
||||||
if shutdown:
|
if shutdown:
|
||||||
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
eventlog(uname, "INFO", "%s shutdown" % conn.afam)
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam))
|
||||||
pushmsg("%s %s shutdown" % (uname, conn.afam))
|
|
||||||
conn.newstate(hbdcls.Connection.DOWN, now)
|
conn.newstate(hbdcls.Connection.DOWN, now)
|
||||||
|
|
||||||
if interval > 0:
|
if interval > 0:
|
||||||
@@ -247,11 +246,21 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2))
|
||||||
|
|
||||||
msg = f"{connection.afam} overdue"
|
msg = f"{connection.afam} overdue"
|
||||||
eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg)
|
eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg)
|
||||||
|
|
||||||
if uname in cfg.get("watchhosts", []):
|
if uname in watchhosts:
|
||||||
if pushmsg:
|
notify_mod.pushmsg_for_host(uname, f"{uname} {msg}")
|
||||||
pushmsg(f"{uname} {msg}")
|
|
||||||
|
# Check RTT thresholds with infinite RTT for overdue hosts
|
||||||
|
threshold_checker = ctx.get("threshold_checker")
|
||||||
|
if threshold_checker:
|
||||||
|
metric_path = "rtt"
|
||||||
|
threshold_checker.check_value(
|
||||||
|
host_name=uname,
|
||||||
|
metric_path=metric_path,
|
||||||
|
value=float('inf'),
|
||||||
|
alert_states=host.alert_states
|
||||||
|
)
|
||||||
|
|
||||||
# Notify websockets
|
# Notify websockets
|
||||||
if msg_to_websockets:
|
if msg_to_websockets:
|
||||||
@@ -274,8 +283,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict):
|
|||||||
# Check RTT thresholds using the threshold checker
|
# Check RTT thresholds using the threshold checker
|
||||||
threshold_checker = ctx.get("threshold_checker")
|
threshold_checker = ctx.get("threshold_checker")
|
||||||
if threshold_checker and rtt and rtt > 0:
|
if threshold_checker and rtt and rtt > 0:
|
||||||
# Metric path for RTT is "rtt.<hostname>"
|
# Metric path for RTT is simply "rtt"
|
||||||
metric_path = f"rtt.{uname}"
|
metric_path = "rtt"
|
||||||
|
|
||||||
# Check against configured thresholds (handles alerts, notifications, etc.)
|
# Check against configured thresholds (handles alerts, notifications, etc.)
|
||||||
threshold_checker.check_value(
|
threshold_checker.check_value(
|
||||||
|
|||||||
Reference in New Issue
Block a user