Fix RTT, including a bug in the time computation

This commit is contained in:
Andreas Wrede
2026-04-01 19:41:53 -04:00
parent 090d341244
commit 460d2be9e9
13 changed files with 1366 additions and 372 deletions
+138 -44
View File
@@ -51,13 +51,9 @@ threshold_configs:
operator: ">"
rtt:
# RTT thresholds per remote host
router:
warning: 50.0 # ms
critical: 200.0
server1:
warning: 100.0
critical: 500.0
# RTT thresholds (applies to all hosts)
warning: 50.0 # ms
critical: 200.0
# High sensitivity configuration - lower thresholds for critical systems
high_sensitivity:
@@ -94,12 +90,8 @@ threshold_configs:
operator: ">"
rtt:
router:
warning: 30.0
critical: 100.0
server1:
warning: 50.0
critical: 200.0
warning: 30.0
critical: 100.0
# Low sensitivity configuration - higher thresholds for development/test systems
low_sensitivity:
@@ -125,9 +117,8 @@ threshold_configs:
operator: ">"
rtt:
router:
warning: 100.0
critical: 500.0
warning: 100.0
critical: 500.0
# Production database servers - specialized thresholds
database:
@@ -159,44 +150,147 @@ threshold_configs:
operator: ">"
rtt:
router:
warning: 20.0 # Stricter latency requirements
critical: 50.0
warning: 20.0 # Stricter latency requirements
critical: 50.0
# ----------------------------------------------------------------------------
# Host to Threshold Configuration Mapping
# ----------------------------------------------------------------------------
# Map specific hosts to specific threshold configurations
# Hosts not listed here will use the default_threshold_config
host_threshold_mapping:
# Critical production servers
prod-web-01: high_sensitivity
prod-web-02: high_sensitivity
prod-api-01: high_sensitivity
# ----------------------------------------------------------------------------
# Notification Channels
# ----------------------------------------------------------------------------
# Define notification providers centrally with their credentials
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
notification_channels:
# Signal notifications
signal_ops:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890
recipient: +1234567890
# Database servers
prod-db-01: database
prod-db-02: database
prod-db-replica: database
signal_oncall:
type: signal
cli_path: /usr/local/bin/signal-cli
user: +1234567890
recipient: +0987654321
# Development and test systems
dev-server-01: low_sensitivity
dev-server-02: low_sensitivity
test-server-01: low_sensitivity
test-server-02: low_sensitivity
# Email notifications
email_ops:
type: email
recipients: [ops@example.com, alerts@example.com]
sender: heartbeat@example.com
smtp_server: smtp.example.com
smtp_port: 587
smtp_user: heartbeat@example.com
smtp_password: your-smtp-password
# Everything else uses 'default' (no need to list explicitly)
# Pushover notifications
pushover_urgent:
type: pushover
token: your-pushover-app-token
user: your-pushover-user-key
# Mattermost notifications
mattermost_devops:
type: mattermost
host: mattermost.example.com
token: your-webhook-token
channel: devops-alerts
username: heartbeat-bot
icon: https://example.com/heartbeat-icon.png
# Default notification channels (used if host doesn't specify channels)
default_notification_channels: [email_ops]
# ----------------------------------------------------------------------------
# Backward Compatibility Example
# Host Definitions (New Unified Format)
# ----------------------------------------------------------------------------
# The old single threshold format is still supported:
# Just use 'thresholds:' directly without 'threshold_configs:'
# Define hosts with threshold configs, monitoring, DNS, and notification settings
hosts:
# Critical production servers - high sensitivity, multiple notification channels
prod-web-01:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, pushover_urgent, email_ops]
dyndns: false
prod-web-02:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, pushover_urgent, email_ops]
dyndns: false
prod-api-01:
threshold_config: high_sensitivity
watch: true
notification_channels: [signal_oncall, email_ops]
dyndns: false
# Database servers - database-specific thresholds
prod-db-01:
threshold_config: database
watch: true
notification_channels: [signal_ops, email_ops]
dyndns: false
prod-db-02:
threshold_config: database
watch: true
notification_channels: [signal_ops, email_ops]
dyndns: false
prod-db-replica:
threshold_config: database
watch: true
notification_channels: [email_ops] # Replica gets email only
dyndns: false
# Development servers - low sensitivity, minimal notifications
dev-server-01:
threshold_config: low_sensitivity
watch: false # Don't monitor dev servers closely
notification_channels: [email_ops]
dyndns: false
dev-server-02:
threshold_config: low_sensitivity
watch: false
notification_channels: [email_ops]
dyndns: false
# Test servers
test-server-01:
threshold_config: low_sensitivity
watch: false
dyndns: false
# No notification channels - uses default_notification_channels
# Home server with dynamic DNS
home-server:
threshold_config: default
watch: true
notification_channels: [signal_ops]
dyndns: true # Update DNS when IP changes
# Hosts not listed in the hosts section will use:
# - default_threshold_config for thresholds (falls back to "default")
# - default_notification_channels for notifications
# ----------------------------------------------------------------------------
# Notes on Configuration Structure
# ----------------------------------------------------------------------------
#
# All per-host configuration is centralized in the hosts section. Each host can specify:
# - threshold_config: Name of threshold configuration to use
# - watch: Whether to monitor this host actively (send notifications)
# - notification_channels: List of channels to use for this host
# - dyndns: Whether to update DNS when IP address changes
#
# thresholds:
# cpu_monitor:
# cpu_percent:
# warning: 80.0
# critical: 90.0
# Notification channels are defined once at the top level and referenced
# by name in host definitions, allowing easy reuse and updates.
#
# This will apply the same thresholds to all hosts.
# For hosts not explicitly listed, the system will still accept heartbeats
# and track their state, but won't apply thresholds or send notifications
# unless the defaults (default_threshold_config and
# default_notification_channels) are configured.