# ==============================================================================
# Heartbeat Daemon Multi-Threshold Configuration Example
# ==============================================================================
# This file demonstrates the multi-threshold configuration feature, which
# allows different threshold settings for different hosts/clients.
#
# Features:
# - Define multiple named threshold configurations
# - Map specific hosts to specific threshold configurations
# - Set a default configuration for unmapped hosts
# - Backward compatible with single threshold configuration
# ==============================================================================
# Global threshold settings.
# Interval, in seconds, between repeat notifications for an alert that is
# still ongoing (3600 = re-notify every hour).
threshold_renotify_interval: 3600

# Optional: name of the default threshold config
# (defaults to "default" if not specified).
default_threshold_config: 'default'
# ----------------------------------------------------------------------------
# Multiple Named Threshold Configurations
# ----------------------------------------------------------------------------
# Each entry under `threshold_configs` is a named set of thresholds. The
# entries below differ in their sensitivity level.
threshold_configs:

  # default: moderate thresholds for most servers.
  default:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 80.0
          critical: 90.0
          operator: '>'
        load_1min:
          warning: 4.0
          critical: 8.0
          operator: '>'

      memory_monitor:
        percent:
          warning: 85.0
          critical: 95.0
          operator: '>'

      disk_monitor:
        partitions:
          /:
            percent:
              warning: 85.0
              critical: 95.0
              operator: '>'

      # NOTE(review): `rtt` is placed alongside the monitors under
      # `thresholds` in every config of this file; the nesting level was
      # reconstructed - confirm against the daemon's schema.
      rtt:
        # RTT thresholds (applies to all hosts)
        warning: 50.0  # ms
        critical: 200.0

  # high_sensitivity: lower thresholds for critical systems.
  high_sensitivity:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 60.0  # Alert earlier
          critical: 75.0
          operator: '>'
          hysteresis: 0.15  # More hysteresis to reduce flapping
        load_1min:
          warning: 2.0
          critical: 4.0
          operator: '>'

      memory_monitor:
        percent:
          warning: 75.0  # Alert at lower memory usage
          critical: 85.0
          operator: '>'
          display: '(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB)'

      disk_monitor:
        partitions:
          /:
            percent:
              warning: 75.0
              critical: 85.0
              operator: '>'
          /var:
            percent:
              warning: 80.0
              critical: 90.0
              operator: '>'

      rtt:
        warning: 30.0
        critical: 100.0

  # low_sensitivity: higher thresholds for development/test systems.
  low_sensitivity:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 90.0  # Only alert at very high usage
          critical: 95.0
          operator: '>'

      memory_monitor:
        percent:
          warning: 90.0
          critical: 98.0
          operator: '>'

      disk_monitor:
        partitions:
          /:
            percent:
              warning: 90.0
              critical: 95.0
              operator: '>'

      rtt:
        warning: 100.0
        critical: 500.0

  # database: specialized thresholds for production database servers.
  database:
    thresholds:
      cpu_monitor:
        cpu_percent:
          warning: 70.0
          critical: 85.0
          operator: '>'

      memory_monitor:
        percent:
          warning: 90.0  # Databases can use high memory
          critical: 97.0
          operator: '>'
          display: '(threshold: {op_symbol} {threshold_value}%, total: {total_gb} GB, available: {available_gb} GB)'

      disk_monitor:
        partitions:
          /:
            percent:
              warning: 80.0
              critical: 90.0
              operator: '>'
          /var/lib/mysql:  # Database data partition
            percent:
              warning: 75.0  # Alert earlier for DB partition
              critical: 85.0
              operator: '>'

      rtt:
        warning: 20.0  # Stricter latency requirements
        critical: 50.0
# ----------------------------------------------------------------------------
# Host to Threshold Configuration Mapping
# ----------------------------------------------------------------------------
# Hosts are mapped to a threshold configuration through the `threshold_config`
# key of their entry in the `hosts` section below.
# ----------------------------------------------------------------------------
# Notification Channels
# ----------------------------------------------------------------------------
# Define notification providers centrally with their credentials.
# Each channel has a type (pushover, email, signal, mattermost) and
# type-specific config. The credentials below are placeholders.
notification_channels:
  # Signal notifications.
  # Phone numbers are quoted so they stay strings: an unquoted +1234567890
  # can be read as an integer, which drops the leading "+" (and any leading
  # zero, as in +0987654321).
  signal_ops:
    type: signal
    cli_path: /usr/local/bin/signal-cli
    user: '+1234567890'
    recipient: '+1234567890'

  signal_oncall:
    type: signal
    cli_path: /usr/local/bin/signal-cli
    user: '+1234567890'
    recipient: '+0987654321'

  # Email notifications
  email_ops:
    type: email
    recipients: [ops@example.com, alerts@example.com]
    sender: heartbeat@example.com
    smtp_server: smtp.example.com
    smtp_port: 587
    smtp_user: heartbeat@example.com
    smtp_password: your-smtp-password

  # Pushover notifications
  pushover_urgent:
    type: pushover
    token: your-pushover-app-token
    user: your-pushover-user-key

  # Mattermost notifications
  mattermost_devops:
    type: mattermost
    host: mattermost.example.com
    token: your-webhook-token
    channel: devops-alerts
    username: heartbeat-bot
    icon: https://example.com/heartbeat-icon.png
# Channels used when a host does not specify its own notification_channels.
default_notification_channels:
  - email_ops
# ----------------------------------------------------------------------------
# Host Definitions (New Unified Format)
# ----------------------------------------------------------------------------
# Each host entry carries its threshold config, monitoring (watch), DNS
# (dyndns), and notification settings.
hosts:
  # Critical production servers: high sensitivity, multiple notification
  # channels.
  prod-web-01:
    threshold_config: high_sensitivity
    watch: true
    notification_channels:
      - signal_oncall
      - pushover_urgent
      - email_ops
    dyndns: false

  prod-web-02:
    threshold_config: high_sensitivity
    watch: true
    notification_channels:
      - signal_oncall
      - pushover_urgent
      - email_ops
    dyndns: false

  prod-api-01:
    threshold_config: high_sensitivity
    watch: true
    notification_channels:
      - signal_oncall
      - email_ops
    dyndns: false

  # Database servers: database-specific thresholds.
  prod-db-01:
    threshold_config: database
    watch: true
    notification_channels:
      - signal_ops
      - email_ops
    dyndns: false

  prod-db-02:
    threshold_config: database
    watch: true
    notification_channels:
      - signal_ops
      - email_ops
    dyndns: false

  prod-db-replica:
    threshold_config: database
    watch: true
    # Replica gets email only
    notification_channels:
      - email_ops
    dyndns: false

  # Development servers: low sensitivity, minimal notifications.
  dev-server-01:
    threshold_config: low_sensitivity
    watch: false  # Don't monitor dev servers closely
    notification_channels:
      - email_ops
    dyndns: false

  dev-server-02:
    threshold_config: low_sensitivity
    watch: false
    notification_channels:
      - email_ops
    dyndns: false

  # Test servers
  test-server-01:
    threshold_config: low_sensitivity
    watch: false
    dyndns: false
    # No notification channels - uses default_notification_channels

  # Home server with dynamic DNS
  home-server:
    threshold_config: default
    watch: true
    notification_channels:
      - signal_ops
    dyndns: true  # Update DNS when IP changes
# Hosts not listed in the hosts section will use:
# - default_threshold_config for thresholds (falls back to "default")
# - default_notification_channels for notifications

# ----------------------------------------------------------------------------
# Notes on Configuration Structure
# ----------------------------------------------------------------------------
#
# All per-host configuration is centralized in the hosts section. Each host
# can specify:
# - threshold_config: Name of the threshold configuration to use
# - watch: Whether to monitor this host actively (send notifications)
# - notification_channels: List of channels to use for this host
# - dyndns: Whether to update DNS when the host's IP address changes
#
# Notification channels are defined once at the top level and referenced
# by name in host definitions, allowing easy reuse and updates.
#
# For hosts not explicitly listed, the system will still accept heartbeats
# and track their state, but won't apply thresholds or send notifications
# unless the default settings (default_threshold_config and
# default_notification_channels) are configured.