---
# ==============================================================================
# Heartbeat Daemon Threshold Configuration Example
# ==============================================================================
# This file demonstrates threshold configuration for the Heartbeat monitoring
# system. Thresholds can be defined for any metric collected by monitoring
# plugins.
#
# Threshold levels:
#   - WARNING:  First level of concern, typically for early notification
#   - CRITICAL: Severe condition requiring immediate attention
#
# Alert notifications are sent when:
#   - A metric crosses from OK to WARNING or CRITICAL
#   - A metric crosses from WARNING to CRITICAL
#   - A metric recovers (returns to a lower severity level)
#
# Re-notifications are sent for ongoing alerts based on
# threshold_renotify_interval.
#
# Layout used below: thresholds -> <plugin> -> <metric> -> fields, where the
# fields appearing in this example are:
#   - warning / critical: threshold values for the two levels
#   - operator:   comparison applied to the metric (">", "<" or ">=" here)
#   - hysteresis: fraction (0.1 = 10%) used to prevent flapping
#   - enabled:    true to evaluate the threshold
# ==============================================================================

# Global threshold settings
threshold_renotify_interval: 3600  # Re-notify every hour for ongoing alerts (seconds)

# Threshold definitions per plugin
thresholds:
  # ----------------------------------------------------------------------------
  # CPU Monitor Thresholds
  # ----------------------------------------------------------------------------
  cpu_monitor:
    # Overall CPU usage percentage (0-100)
    cpu_percent:
      warning: 80.0     # Warn when CPU usage exceeds 80%
      critical: 90.0    # Critical when CPU usage exceeds 90%
      operator: ">"     # Alert when value is GREATER than threshold
      hysteresis: 0.1   # 10% hysteresis to prevent flapping
      enabled: true

    # 1-minute load average
    load_1min:
      warning: 4.0      # Warn when 1-min load exceeds 4.0
      critical: 8.0     # Critical when 1-min load exceeds 8.0
      operator: ">"
      hysteresis: 0.15  # 15% hysteresis
      enabled: true

    # 5-minute load average
    load_5min:
      warning: 3.0
      critical: 6.0
      operator: ">"
      hysteresis: 0.15
      enabled: true

    # 15-minute load average
    load_15min:
      warning: 2.0
      critical: 4.0
      operator: ">"
      hysteresis: 0.15
      enabled: true

  # ----------------------------------------------------------------------------
  # Memory Monitor Thresholds
  # ----------------------------------------------------------------------------
  memory_monitor:
    # Memory usage percentage
    percent:
      warning: 85.0     # Warn at 85% memory usage
      critical: 95.0    # Critical at 95% memory usage
      operator: ">"
      hysteresis: 0.1
      enabled: true

    # Available memory in MB (inverse threshold - alert when LOW)
    available_mb:
      warning: 1000     # Warn when less than 1GB available
      critical: 500     # Critical when less than 500MB available
      operator: "<"     # Alert when value is LESS than threshold
      hysteresis: 0.1
      enabled: true

    # Swap usage percentage
    swap_percent:
      warning: 50.0     # Warn at 50% swap usage
      critical: 80.0    # Critical at 80% swap usage
      operator: ">"
      hysteresis: 0.1
      enabled: true

  # ----------------------------------------------------------------------------
  # Disk Monitor Thresholds
  # ----------------------------------------------------------------------------
  disk_monitor:
    # Partition-specific thresholds
    # Use the mount point as the key
    partitions:
      # Root filesystem
      /:
        percent:
          warning: 80.0     # Warn at 80% disk usage
          critical: 90.0    # Critical at 90% disk usage
          operator: ">"
          hysteresis: 0.05  # 5% hysteresis for disk (more stable)
          enabled: true
        free_gb:
          warning: 10.0     # Warn when less than 10GB free
          critical: 5.0     # Critical when less than 5GB free
          operator: "<"
          hysteresis: 0.1
          enabled: true

      # Home filesystem (if separate partition)
      /home:
        percent:
          warning: 85.0
          critical: 95.0
          operator: ">"
          hysteresis: 0.05
          enabled: true

      # Var filesystem (logs, etc.)
      /var:
        percent:
          warning: 80.0
          critical: 90.0
          operator: ">"
          hysteresis: 0.05
          enabled: true
        free_gb:
          warning: 5.0      # Var needs space for logs
          critical: 2.0
          operator: "<"
          hysteresis: 0.1
          enabled: true

  # ----------------------------------------------------------------------------
  # ZFS Monitor Thresholds
  # ----------------------------------------------------------------------------
  zfs_monitor:
    # Pool health check — built-in default; shown here for reference/override.
    # health_ok is 1 (ONLINE) or 0 (DEGRADED, SUSPENDED, FAULTED, UNAVAIL…).
    # Use '*' to apply the same rule to every pool, or name a specific pool.
    # The '*' key must stay quoted: a bare * starts a YAML alias.
    pools:
      '*':
        health_ok:
          critical: 1       # Alert CRITICAL when pool is not ONLINE
          operator: "<"
          hysteresis: 0.0   # No hysteresis — a degraded pool is always critical
          display: "ZFS pool {pool_name} is {health}"

      # Per-pool capacity thresholds (optional; add pools you care about)
      # tank:
      #   capacity:
      #     warning: 75.0   # Warn at 75% used
      #     critical: 90.0  # Critical at 90% used
      #     operator: ">"
      #     hysteresis: 0.05

  # ----------------------------------------------------------------------------
  # Network Monitor Thresholds
  # ----------------------------------------------------------------------------
  network_monitor:
    # Total error count across all interfaces
    errors_total:
      warning: 100      # Warn at 100 errors
      critical: 1000    # Critical at 1000 errors
      operator: ">"
      hysteresis: 0.2   # 20% hysteresis for counters
      enabled: true

    # Total dropped packets
    dropin_total:
      warning: 50
      critical: 200
      operator: ">"
      hysteresis: 0.2
      enabled: true
    dropout_total:
      warning: 50
      critical: 200
      operator: ">"
      hysteresis: 0.2
      enabled: true

    # TCP connections in TIME_WAIT state
    connections_TIME_WAIT:
      warning: 1000     # Warn at 1000 TIME_WAIT connections
      critical: 5000    # Critical at 5000 TIME_WAIT connections
      operator: ">"
      hysteresis: 0.2
      enabled: true

    # Total established connections
    connections_ESTABLISHED:
      warning: 500
      critical: 1000
      operator: ">"
      hysteresis: 0.1
      enabled: true

  # ----------------------------------------------------------------------------
  # Nagios Plugin Thresholds (if using nagios_runner)
  # ----------------------------------------------------------------------------
  nagios_runner:
    # Nagios plugins report exit codes:
    #   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN
    # We can threshold on the exit_code directly.
    # NOTE(review): with operator ">=" and critical: 2, exit code 3 (UNKNOWN)
    # also satisfies the critical comparison — confirm that is intended.
    exit_code:
      warning: 1        # Map Nagios WARNING to our WARNING
      critical: 2       # Map Nagios CRITICAL to our CRITICAL
      operator: ">="    # Alert when exit code >= threshold
      hysteresis: 0.0   # No hysteresis for exit codes
      enabled: true

# ==============================================================================
# Notification Configuration
# ==============================================================================
# Configure notification methods (email, pushover, etc.)
# These are used when threshold violations occur

# Email notifications
toemail:
  - admin@example.com
  - oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
# Placeholder only: replace it locally and never commit a real password.
# Keep the value quoted so characters such as '#' or ': ' in a real password
# are not interpreted as YAML syntax.
smtppassword: 'your-password-here'

# Pushover notifications (optional)
# pushover_token: your-pushover-app-token
# pushover_user: your-pushover-user-key

# Mattermost webhook (optional)
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id

# ==============================================================================
# Watched Hosts
# ==============================================================================
# Hosts in this list will trigger notifications for:
#   - Heartbeat timeouts/overdue
#   - Threshold violations
#   - Boot messages
watchhosts:
  - webserver01
  - database01
  - mailserver
  - critical-app

# ==============================================================================
# Additional Server Settings
# ==============================================================================
hb_port: 50003   # UDP port for heartbeat messages
hbd_port: 50004  # HTTP port for web interface
grace: 10        # Grace period for overdue detection (seconds)
debug: 0         # Debug level (0-3)
verbose: false   # Verbose output

# Journal settings (message logging)
journal_enabled: true
journal_path: /var/log/heartbeat/messages.journal
journal_max_size: 104857600  # 100MB (100 * 1024 * 1024) before rotation
journal_max_backups: 10

# ==============================================================================
# Example: Production Configuration with Conservative Thresholds
# ==============================================================================
# For production systems, consider:
#   - Higher warning thresholds to reduce alert fatigue
#   - Appropriate hysteresis values (5-15% typical)
#   - Re-notification intervals matching on-call rotation
#   - Multiple escalation contacts
#   - Integration with incident management systems
# ==============================================================================