0f90be659e
The default zfs_monitor.*.status threshold used operator '>' with warning=1, so a DEGRADED pool (status=1) never alerted (1 > 1 is false) and a FAULTED pool (status=2) only triggered WARNING instead of CRITICAL. Fix the operator to '>=' in THRESHOLD_DEFAULTS and the example config. Also adds a per-metric grace period override (ThresholdConfig.grace) so individual thresholds can bypass or shorten the global grace delay. Alerts with grace=0 fire immediately on state change rather than waiting for a second collection cycle. Sets grace=0 on zfs_monitor.*.status so pool degradation alerts fire on the first data report after the event. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
280 lines
9.7 KiB
YAML
280 lines
9.7 KiB
YAML
# ==============================================================================
|
|
# Heartbeat Daemon Threshold Configuration Example
|
|
# ==============================================================================
|
|
# This file demonstrates threshold configuration for the Heartbeat monitoring system.
|
|
# Thresholds can be defined for any metric collected by monitoring plugins.
|
|
#
|
|
# Threshold levels:
|
|
# - WARNING: First level of concern, typically for early notification
|
|
# - CRITICAL: Severe condition requiring immediate attention
|
|
#
|
|
# Alert notifications are sent when:
|
|
# - A metric crosses from OK to WARNING or CRITICAL
|
|
# - A metric crosses from WARNING to CRITICAL
|
|
# - A metric recovers (returns to a lower severity level)
|
|
#
|
|
# Re-notifications are sent for ongoing alerts based on threshold_renotify_interval.
|
|
# ==============================================================================
|
|
|
|
# Global threshold settings
|
|
threshold_renotify_interval: 3600 # Seconds between repeat notifications while an alert is ongoing (3600 = one hour)
|
|
|
|
# Threshold definitions per plugin
|
|
thresholds:
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# CPU Monitor Thresholds
|
|
# ----------------------------------------------------------------------------
|
|
cpu_monitor:
  # Overall CPU usage as a percentage (0-100).
  cpu_percent:
    warning: 80.0     # WARNING once usage exceeds 80%
    critical: 90.0    # CRITICAL once usage exceeds 90%
    operator: '>'     # Alert when the value is GREATER than the threshold
    hysteresis: 0.1   # 10% hysteresis to prevent flapping
    enabled: true

  # Load average over 1 minute.
  load_1min:
    warning: 4.0      # WARNING once the 1-minute load exceeds 4.0
    critical: 8.0     # CRITICAL once the 1-minute load exceeds 8.0
    operator: '>'
    hysteresis: 0.15  # 15% hysteresis
    enabled: true

  # Load average over 5 minutes.
  load_5min:
    warning: 3.0      # WARNING once the 5-minute load exceeds 3.0
    critical: 6.0     # CRITICAL once the 5-minute load exceeds 6.0
    operator: '>'
    hysteresis: 0.15
    enabled: true

  # Load average over 15 minutes.
  load_15min:
    warning: 2.0      # WARNING once the 15-minute load exceeds 2.0
    critical: 4.0     # CRITICAL once the 15-minute load exceeds 4.0
    operator: '>'
    hysteresis: 0.15
    enabled: true
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Memory Monitor Thresholds
|
|
# ----------------------------------------------------------------------------
|
|
memory_monitor:
  # Memory in use, as a percentage.
  percent:
    warning: 85.0     # WARNING above 85% memory usage
    critical: 95.0    # CRITICAL above 95% memory usage
    operator: '>'
    hysteresis: 0.1
    enabled: true

  # Available memory in MB. Inverse threshold: the alert fires when the value is LOW.
  available_mb:
    warning: 1000     # WARNING when less than 1000 MB (about 1 GB) is available
    critical: 500     # CRITICAL when less than 500 MB is available
    operator: '<'     # Alert when the value is LESS than the threshold
    hysteresis: 0.1
    enabled: true

  # Swap in use, as a percentage.
  swap_percent:
    warning: 50.0     # WARNING above 50% swap usage
    critical: 80.0    # CRITICAL above 80% swap usage
    operator: '>'
    hysteresis: 0.1
    enabled: true
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Disk Monitor Thresholds
|
|
# ----------------------------------------------------------------------------
|
|
disk_monitor:
  # Thresholds are declared per partition; each key below is a mount point.
  partitions:
    # Root filesystem
    /:
      percent:
        warning: 80.0     # WARNING above 80% disk usage
        critical: 90.0    # CRITICAL above 90% disk usage
        operator: '>'
        hysteresis: 0.05  # 5% hysteresis; disk usage is more stable
        enabled: true

      free_gb:
        warning: 10.0     # WARNING when less than 10 GB is free
        critical: 5.0     # CRITICAL when less than 5 GB is free
        operator: '<'
        hysteresis: 0.1
        enabled: true

    # Home filesystem (only relevant when /home is a separate partition)
    /home:
      percent:
        warning: 85.0
        critical: 95.0
        operator: '>'
        hysteresis: 0.05
        enabled: true

    # Var filesystem (logs, etc.)
    /var:
      percent:
        warning: 80.0
        critical: 90.0
        operator: '>'
        hysteresis: 0.05
        enabled: true

      free_gb:
        warning: 5.0      # /var needs free space for logs
        critical: 2.0
        operator: '<'
        hysteresis: 0.1
        enabled: true
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# ZFS Monitor Thresholds
|
|
# ----------------------------------------------------------------------------
|
|
zfs_monitor:
  # Pool health check. This rule is a built-in default; it is shown here for
  # reference and as a starting point for overrides.
  # status values: 0 = ONLINE, 1 = DEGRADED, 2 = SUSPENDED / FAULTED / UNAVAIL / ...
  # Use '*' to apply the same rule to every pool, or name a specific pool.
  pools:
    '*':
      status:
        warning: 1        # WARNING when the pool is DEGRADED (status 1)
        critical: 2       # CRITICAL when the pool is SUSPENDED/FAULTED/UNAVAIL (status 2)
        operator: '>='    # Must be inclusive: with '>' a status of 1 never satisfies warning: 1
        hysteresis: 0.0   # No hysteresis — a degraded pool is always alerting
        grace: 0          # Fire immediately — don't wait for a second collection
        display: "ZFS pool {pool_name} is {health}"
        enabled: true     # Stated explicitly, as for every other threshold in this file

    # Per-pool capacity thresholds (optional; add pools you care about)
    # tank:
    #   capacity:
    #     warning: 75.0     # WARNING above 75% used
    #     critical: 90.0    # CRITICAL above 90% used
    #     operator: '>'
    #     hysteresis: 0.05
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Network Monitor Thresholds
|
|
# ----------------------------------------------------------------------------
|
|
network_monitor:
  # Error count summed across all interfaces.
  errors_total:
    warning: 100      # WARNING above 100 errors
    critical: 1000    # CRITICAL above 1000 errors
    operator: '>'
    hysteresis: 0.2   # 20% hysteresis for counters
    enabled: true

  # Total dropped packets (dropin_total and dropout_total use the same limits).
  dropin_total:
    warning: 50
    critical: 200
    operator: '>'
    hysteresis: 0.2
    enabled: true

  dropout_total:
    warning: 50
    critical: 200
    operator: '>'
    hysteresis: 0.2
    enabled: true

  # TCP connections in the TIME_WAIT state.
  connections_TIME_WAIT:
    warning: 1000     # WARNING above 1000 TIME_WAIT connections
    critical: 5000    # CRITICAL above 5000 TIME_WAIT connections
    operator: '>'
    hysteresis: 0.2
    enabled: true

  # Total established connections.
  connections_ESTABLISHED:
    warning: 500
    critical: 1000
    operator: '>'
    hysteresis: 0.1
    enabled: true
|
|
|
|
# ----------------------------------------------------------------------------
|
|
# Nagios Plugin Thresholds (if using nagios_runner)
|
|
# ----------------------------------------------------------------------------
|
|
nagios_runner:
  # Nagios plugins report exit codes 0 = OK, 1 = WARNING, 2 = CRITICAL,
  # 3 = UNKNOWN, so the exit code can be thresholded directly.
  exit_code:
    warning: 1        # Nagios WARNING (1) maps to our WARNING
    critical: 2       # Nagios CRITICAL (2) maps to our CRITICAL; UNKNOWN (3) also satisfies >= 2
    operator: '>='    # Alert when exit code >= threshold
    hysteresis: 0.0   # No hysteresis for exit codes
    enabled: true
|
|
|
|
# ==============================================================================
|
|
# Notification Configuration
|
|
# ==============================================================================
|
|
# Configure notification methods (email, pushover, etc.)
|
|
# These are used when threshold violations occur
|
|
|
|
# E-mail notifications: recipient list, sender address and SMTP settings
toemail:
  - admin@example.com
  - oncall@example.com
fromemail: heartbeat@example.com
smtpserver: smtp.example.com
smtpport: 587
smtpuser: heartbeat@example.com
# Placeholder value — replace it locally and never commit a real password.
smtppassword: 'your-password-here'
|
|
|
|
# Pushover notifications (optional)
|
|
# pushover_token: your-pushover-app-token
|
|
# pushover_user: your-pushover-user-key
|
|
|
|
# Mattermost webhook (optional)
|
|
# mattermost_url: https://mattermost.example.com/hooks/your-webhook-id
|
|
|
|
# ==============================================================================
|
|
# Watched Hosts
|
|
# ==============================================================================
|
|
# Notifications are triggered for the hosts listed here on:
#   - heartbeat timeouts / overdue heartbeats
#   - threshold violations
#   - boot messages
watchhosts:
  - webserver01
  - database01
  - mailserver
  - critical-app
|
|
|
|
# ==============================================================================
|
|
# Additional Server Settings
|
|
# ==============================================================================
|
|
hb_port: 50003    # UDP port for heartbeat messages
hbd_port: 50004   # HTTP port for the web interface
grace: 10         # Grace period, in seconds, for overdue detection
debug: 0          # Debug level (0-3)
verbose: false    # Verbose output

# Journal settings (message logging)
journal_enabled: true
journal_path: /var/log/heartbeat/messages.journal
journal_max_size: 104857600   # Rotate once the journal reaches 100 MiB (104857600 bytes)
journal_max_backups: 10
|
|
|
|
# ==============================================================================
|
|
# Example: Production Configuration with Conservative Thresholds
|
|
# ==============================================================================
|
|
# For production systems, consider:
|
|
# - Higher warning thresholds to reduce alert fatigue
|
|
# - Appropriate hysteresis values (5-15% typical)
|
|
# - Re-notification intervals matching on-call rotation
|
|
# - Multiple escalation contacts
|
|
# - Integration with incident management systems
|
|
# ==============================================================================
|