Files
heartbeat/hbd/config_thresholds_example.yaml
T

279 lines
9.6 KiB
YAML

# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
# Thresholds can be defined for any metric collected by monitoring plugins.
#
# Threshold levels:
# - WARNING: First level of concern, typically for early notification
# - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
# - A metric crosses from OK to WARNING or CRITICAL
# - A metric crosses from WARNING to CRITICAL
# - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
# ==============================================================================
# Global threshold settings
threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts (seconds)
# Threshold definitions per plugin
thresholds:
# ----------------------------------------------------------------------------
# CPU Monitor Thresholds
# ----------------------------------------------------------------------------
cpu_monitor:
# Overall CPU usage percentage (0-100)
cpu_percent:
warning: 80.0 # Warn when CPU usage exceeds 80%
critical: 90.0 # Critical when CPU usage exceeds 90%
operator: ">" # Alert when value is GREATER than threshold
hysteresis: 0.1 # 10% hysteresis to prevent flapping
enabled: true
# 1-minute load average
load_1min:
warning: 4.0 # Warn when 1-min load exceeds 4.0
critical: 8.0 # Critical when 1-min load exceeds 8.0
operator: ">"
hysteresis: 0.15 # 15% hysteresis
enabled: true
# 5-minute load average
load_5min:
warning: 3.0
critical: 6.0
operator: ">"
hysteresis: 0.15
enabled: true
# 15-minute load average
load_15min:
warning: 2.0
critical: 4.0
operator: ">"
hysteresis: 0.15
enabled: true
# ----------------------------------------------------------------------------
# Memory Monitor Thresholds
# ----------------------------------------------------------------------------
memory_monitor:
# Memory usage percentage
percent:
warning: 85.0 # Warn at 85% memory usage
critical: 95.0 # Critical at 95% memory usage
operator: ">"
hysteresis: 0.1
enabled: true
# Available memory in MB (inverse threshold - alert when LOW)
available_mb:
warning: 1000 # Warn when less than 1GB available
critical: 500 # Critical when less than 500MB available
operator: "<" # Alert when value is LESS than threshold
hysteresis: 0.1
enabled: true
# Swap usage percentage
swap_percent:
warning: 50.0 # Warn at 50% swap usage
critical: 80.0 # Critical at 80% swap usage
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Disk Monitor Thresholds
# ----------------------------------------------------------------------------
disk_monitor:
# Partition-specific thresholds
# Use the mount point as the key
partitions:
# Root filesystem
/:
percent:
warning: 80.0 # Warn at 80% disk usage
critical: 90.0 # Critical at 90% disk usage
operator: ">"
hysteresis: 0.05 # 5% hysteresis for disk (more stable)
enabled: true
free_gb:
warning: 10.0 # Warn when less than 10GB free
critical: 5.0 # Critical when less than 5GB free
operator: "<"
hysteresis: 0.1
enabled: true
# Home filesystem (if separate partition)
/home:
percent:
warning: 85.0
critical: 95.0
operator: ">"
hysteresis: 0.05
enabled: true
# Var filesystem (logs, etc.)
/var:
percent:
warning: 80.0
critical: 90.0
operator: ">"
hysteresis: 0.05
enabled: true
free_gb:
warning: 5.0 # Var needs space for logs
critical: 2.0
operator: "<"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# ZFS Monitor Thresholds
# ----------------------------------------------------------------------------
zfs_monitor:
# Pool health check — built-in default; shown here for reference/override.
# status is 0 (ONLINE) or 1 (DEGRADED) or 2 (SUSPENDED, FAULTED, UNAVAIL…).
# Use '*' to apply the same rule to every pool, or name a specific pool.
pools:
'*':
status:
warning: 1 # Alert WARNING when pool is DEGRADED
critical: 2 # Alert CRITICAL when pool is SUSPENDED/FAULTED/UNAVAIL
operator: ">"
hysteresis: 0.0 # No hysteresis — a degraded pool is always critical
display: "ZFS pool {pool_name} is {health}"
# Per-pool capacity thresholds (optional; add pools you care about)
# tank:
# capacity:
# warning: 75.0 # Warn at 75% used
# critical: 90.0 # Critical at 90% used
# operator: ">"
# hysteresis: 0.05
# ----------------------------------------------------------------------------
# Network Monitor Thresholds
# ----------------------------------------------------------------------------
network_monitor:
# Total error count across all interfaces
errors_total:
warning: 100 # Warn at 100 errors
critical: 1000 # Critical at 1000 errors
operator: ">"
hysteresis: 0.2 # 20% hysteresis for counters
enabled: true
# Total dropped packets
dropin_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
dropout_total:
warning: 50
critical: 200
operator: ">"
hysteresis: 0.2
enabled: true
# TCP connections in TIME_WAIT state
connections_TIME_WAIT:
warning: 1000 # Warn at 1000 TIME_WAIT connections
critical: 5000 # Critical at 5000 TIME_WAIT connections
operator: ">"
hysteresis: 0.2
enabled: true
# Total established connections
connections_ESTABLISHED:
warning: 500
critical: 1000
operator: ">"
hysteresis: 0.1
enabled: true
# ----------------------------------------------------------------------------
# Nagios Plugin Thresholds (if using nagios_runner)
# ----------------------------------------------------------------------------
nagios_runner:
# Nagios plugins report exit codes:
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
# We can threshold on the exit_code directly
exit_code:
warning: 1 # Map Nagios WARNING to our WARNING
critical: 2 # Map Nagios CRITICAL to our CRITICAL
operator: ">=" # Alert when exit code >= threshold
hysteresis: 0.0 # No hysteresis for exit codes
enabled: true
# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur
# Email notifications
toemail:
- admin@example.com
- oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
smtppassword: your-password-here
# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key
# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
# - Heartbeat timeouts/overdue
# - Threshold violations
# - Boot messages
watchhosts:
- webserver01
- database01
- mailserver
- critical-app
# ==============================================================================
# Additional Server Settings
# ==============================================================================
hb_port: 50003 # UDP port for heartbeat messages
hbd_port: 50004 # HTTP port for web interface
grace: 10 # Grace period for overdue detection (seconds)
debug: 0 # Debug level (0-3)
verbose: false # Verbose output
# Journal settings (message logging)
journal_enabled: true
journal_path: /var/log/heartbeat/messages.journal
journal_max_size: 104857600 # 100MB before rotation
journal_max_backups: 10
# ==============================================================================
# Example: Production Configuration with Conservative Thresholds
# ==============================================================================
# For production systems, consider:
# - Higher warning thresholds to reduce alert fatigue
# - Appropriate hysteresis values (5-15% typical)
# - Re-notification intervals matching on-call rotation
# - Multiple escalation contacts
# - Integration with incident management systems
# ==============================================================================