Files
heartbeat/hbd/config_multi_threshold_example.yaml
T
2026-04-01 19:41:53 -04:00

297 lines
9.0 KiB
YAML

# ==============================================================================
# Heartbeat Daemon Multi-Threshold Configuration Example
# ==============================================================================
# This file demonstrates the new multi-threshold configuration feature that allows
# different threshold settings for different hosts/clients.
#
# Features:
# - Define multiple named threshold configurations
# - Map specific hosts to specific threshold configurations
# - Set a default configuration for unmapped hosts
# - Backward compatible with single threshold configuration
# ==============================================================================
# Global threshold settings
# Seconds between repeat notifications while an alert is still ongoing
# (3600 = once per hour).
threshold_renotify_interval: 3600
# Optional: name of the threshold config to use as the default. Falls back to
# "default" when this key is not specified.
default_threshold_config: 'default'
# ----------------------------------------------------------------------------
# Multiple Named Threshold Configurations
# ----------------------------------------------------------------------------
# Define multiple threshold configurations with different sensitivity levels
threshold_configs:
  # Baseline profile: moderate limits intended for most servers.
  default:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 80.0
          critical: 90.0
          operator: '>'
        load_1min:
          warning: 4.0
          critical: 8.0
          operator: '>'
      memory_monitor:
        percent:
          warning: 85.0
          critical: 95.0
          operator: '>'
      disk_monitor:
        partitions:
          /:
            percent:
              warning: 85.0
              critical: 95.0
              operator: '>'
      # NOTE(review): nesting of `rtt` was reconstructed; confirm it belongs
      # under `thresholds` rather than beside it.
      rtt:
        # RTT thresholds (applies to all hosts)
        warning: 50.0  # ms
        critical: 200.0

  # High-sensitivity profile: lower limits for critical systems.
  high_sensitivity:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 60.0  # Alert earlier
          critical: 75.0
          operator: '>'
          hysteresis: 0.15  # More hysteresis to reduce flapping
        load_1min:
          warning: 2.0
          critical: 4.0
          operator: '>'
      memory_monitor:
        percent:
          warning: 75.0  # Alert at lower memory usage
          critical: 85.0
          operator: '>'
          display: '(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)'
      disk_monitor:
        partitions:
          /:
            percent:
              warning: 75.0
              critical: 85.0
              operator: '>'
          /var:
            percent:
              warning: 80.0
              critical: 90.0
              operator: '>'
      rtt:
        warning: 30.0
        critical: 100.0

  # Low-sensitivity profile: higher limits for development/test systems.
  low_sensitivity:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 90.0  # Only alert at very high usage
          critical: 95.0
          operator: '>'
      memory_monitor:
        percent:
          warning: 90.0
          critical: 98.0
          operator: '>'
      disk_monitor:
        partitions:
          /:
            percent:
              warning: 90.0
              critical: 95.0
              operator: '>'
      rtt:
        warning: 100.0
        critical: 500.0

  # Database profile: specialized limits for production database servers.
  database:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 70.0
          critical: 85.0
          operator: '>'
      memory_monitor:
        percent:
          warning: 90.0  # Databases can use high memory
          critical: 97.0
          operator: '>'
          display: '(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)'
      disk_monitor:
        partitions:
          /:
            percent:
              warning: 80.0
              critical: 90.0
              operator: '>'
          # Database data partition
          /var/lib/mysql:
            percent:
              warning: 75.0  # Alert earlier for DB partition
              critical: 85.0
              operator: '>'
      rtt:
        warning: 20.0  # Stricter latency requirements
        critical: 50.0
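# ----------------------------------------------------------------------------
# Host to Threshold Configuration Mapping
# ----------------------------------------------------------------------------
# Map specific hosts to specific threshold configurations.
# The mapping is expressed per host: each entry in the `hosts` section names
# its configuration through the `threshold_config` key.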
# ----------------------------------------------------------------------------
# Notification Channels
# ----------------------------------------------------------------------------
# Define notification providers centrally with their credentials
# Each channel has a type (pushover, email, signal, mattermost) and type-specific config
notification_channels:
  # Signal notifications.
  # Phone numbers are quoted on purpose: a plain scalar such as +1234567890
  # resolves to an integer, which drops the leading "+" (and any leading
  # zero) before the value reaches the consumer.
  signal_ops:
    type: signal
    cli_path: /usr/local/bin/signal-cli
    user: '+1234567890'
    recipient: '+1234567890'
  signal_oncall:
    type: signal
    cli_path: /usr/local/bin/signal-cli
    user: '+1234567890'
    recipient: '+0987654321'

  # Email notifications.
  # The SMTP password below is a placeholder; keep real credentials out of
  # version control.
  email_ops:
    type: email
    recipients: [ops@example.com, alerts@example.com]
    sender: heartbeat@example.com
    smtp_server: smtp.example.com
    smtp_port: 587
    smtp_user: heartbeat@example.com
    smtp_password: your-smtp-password

  # Pushover notifications (token and user key are placeholders).
  pushover_urgent:
    type: pushover
    token: your-pushover-app-token
    user: your-pushover-user-key

  # Mattermost notifications (webhook token is a placeholder).
  mattermost_devops:
    type: mattermost
    host: mattermost.example.com
    token: your-webhook-token
    channel: devops-alerts
    username: heartbeat-bot
    icon: https://example.com/heartbeat-icon.png
# Fallback channels, used when a host does not list notification_channels.
default_notification_channels:
  - email_ops
# ----------------------------------------------------------------------------
# Host Definitions (New Unified Format)
# ----------------------------------------------------------------------------
# Define hosts with threshold configs, monitoring, DNS, and notification settings
hosts:
  # Critical production servers: high-sensitivity thresholds and multiple
  # notification channels.
  prod-web-01:
    threshold_config: high_sensitivity
    watch: true
    notification_channels: [signal_oncall, pushover_urgent, email_ops]
    dyndns: false
  prod-web-02:
    threshold_config: high_sensitivity
    watch: true
    notification_channels: [signal_oncall, pushover_urgent, email_ops]
    dyndns: false
  prod-api-01:
    threshold_config: high_sensitivity
    watch: true
    notification_channels: [signal_oncall, email_ops]
    dyndns: false

  # Database servers: use the database-specific threshold profile.
  prod-db-01:
    threshold_config: database
    watch: true
    notification_channels: [signal_ops, email_ops]
    dyndns: false
  prod-db-02:
    threshold_config: database
    watch: true
    notification_channels: [signal_ops, email_ops]
    dyndns: false
  prod-db-replica:
    threshold_config: database
    watch: true
    notification_channels: [email_ops]  # Replica gets email only
    dyndns: false

  # Development servers: low-sensitivity thresholds, minimal notifications.
  dev-server-01:
    threshold_config: low_sensitivity
    watch: false  # Don't monitor dev servers closely
    notification_channels: [email_ops]
    dyndns: false
  dev-server-02:
    threshold_config: low_sensitivity
    watch: false
    notification_channels: [email_ops]
    dyndns: false

  # Test servers
  test-server-01:
    threshold_config: low_sensitivity
    watch: false
    dyndns: false
    # No notification_channels key here, so default_notification_channels
    # is used.

  # Home server with dynamic DNS
  home-server:
    threshold_config: default
    watch: true
    notification_channels: [signal_ops]
    dyndns: true  # Update DNS when IP changes
# Hosts not listed in the hosts section will use:
# - default_threshold_config for thresholds (falls back to "default")
# - default_notification_channels for notifications
# ----------------------------------------------------------------------------
# Notes on Configuration Structure
# ----------------------------------------------------------------------------
#
# All configuration is centralized in the hosts section. Each host can specify:
# - threshold_config: Name of threshold configuration to use
# - watch: Whether to monitor this host actively (send notifications)
# - notification_channels: List of channels to use for this host
# - dyndns: Whether to update DNS when IP address changes
#
# Notification channels are defined once at the top level and referenced
# by name in host definitions, allowing easy reuse and updates.
#
# For hosts not explicitly listed, the system will still accept heartbeats
# and track their state, but won't apply thresholds or send notifications
# unless the default settings (default_threshold_config and
# default_notification_channels) are configured.