From 460d2be9e9a53d6f8d6fba62fee4f811b51b3a9b Mon Sep 17 00:00:00 2001 From: Andreas Wrede Date: Wed, 1 Apr 2026 19:41:53 -0400 Subject: [PATCH] Fix rtt, including bug in time compute --- .hb.yaml | 338 +++++++++------ .hb.yaml.swp | Bin 20480 -> 20480 bytes docs/NOTIFICATIONS.md | 533 ++++++++++++++++++++++++ docs/THRESHOLD_ALERTING.md | 112 ++++- hbd/client/main.py | 15 +- hbd/config_multi_threshold_example.yaml | 182 ++++++-- hbd/server/config.py | 179 +++++++- hbd/server/dns.py | 14 +- hbd/server/http.py | 5 - hbd/server/main.py | 15 +- hbd/server/notify.py | 166 +++++--- hbd/server/threshold.py | 122 +++--- hbd/server/udp.py | 57 +-- 13 files changed, 1366 insertions(+), 372 deletions(-) create mode 100644 docs/NOTIFICATIONS.md diff --git a/.hb.yaml b/.hb.yaml index ec54df5..0e7b974 100644 --- a/.hb.yaml +++ b/.hb.yaml @@ -7,33 +7,125 @@ logfile: "/home/andreas/logs/heartbeat/heartbeat.log" logfmt: "msg" grace: 40 interval: 10 -watchhosts: -# "localhost": -# "haschloss" : -# "cotgate": - "wentworth": - notify: +4915123456789 - src: "signal" - "y": - notify: +4915123456789 - src: "signal" - "winter": - notify: +14168226179 - src: "signal" -dyndnshosts: {"haschloss", "wayback", "wertvoll", "weekend", "cotgate", "rvgate", "draper", "eris"} + +# Notification Channels - Define notification providers centrally +# Each channel has a type (pushover, email, signal, mattermost) and type-specific configuration +notification_channels: + + pushover_standard: + type: pushover + token: ac7NLX2rPjXFareeDgLpXNoDf4iFmf + user: uDhH33UjQQDYtNzJb1ThRiWb9ingGK + + signal_andreas: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +14168226179 + recipient: +14168226179 + + email_andreas: + type: email + recipients: [aew.hbd.notify@wrede.ca] + sender: aew.hbd@wrede.ca + smtp_server: smtp.fastmail.com + smtp_port: 587 + smtp_user: andreas@wrede.ca + smtp_password: pvtvefyp5gbhnch2 + + # Example additional channels (commented out) + # pushover_urgent: + # type: pushover + # token: your-app-token + # user: your-user-key + # + mattermost_devops: + type: mattermost + host: mattermost.example.com + token: webhook-token + channel: devops-alerts + username: heartbeat-bot + icon: https://example.com/heartbeat-icon.png + +# Default notification channels (used if host doesn't specify channels) +default_notification_channels: [pushover_standard] + +# Host definitions - combines threshold mapping, watch status, DNS updates, and notifications +hosts: + wentworth: + threshold_config: default + watch: true + notification_channels: [pushover_standard] + dyndns: false + + y: + threshold_config: default + watch: true + notification_channels: [pushover_standard] + dyndns: false + + winter: + threshold_config: default + watch: true + notification_channels: [pushover_standard] + dyndns: false + + wally: + threshold_config: freebsd_server + watch: false + notification_channels: [pushover_standard] + dyndns: false + + eris: + threshold_config: truenas_server + watch: false + notification_channels: [pushover_standard] + dyndns: false + + haschloss: + threshold_config: default + watch: false + dyndns: true + + wayback: + threshold_config: default + watch: false + notification_channels: [pushover_standard] + dyndns: true + + wertvoll: + threshold_config: default + watch: false + notification_channels: [pushover_standard] + dyndns: true + + weekend: + threshold_config: default + watch: false + notification_channels: [pushover_standard] + dyndns: true + + cotgate: + threshold_config: default + watch: false + dyndns: true + + rvgate: + threshold_config: default + watch: false + dyndns: true + + draper: + threshold_config: default + watch: false + notification_channels: [pushover_standard] + dyndns: true + +# Hosts to drop/ignore drophosts: {"unknown", "wookie15", "wort"} + nsupdate_bin: "/usr/local/bin/nsupdate" -pushover_token: "ac7NLX2rPjXFareeDgLpXNoDf4iFmf" -pushover_user: "uDhH33UjQQDYtNzJb1ThRiWb9ingGK" -pushsrv: "pushover" dyndomains: {"wrede.org"} -toemail: ["aew.hbd.notify@wrede.ca"] -fromemail: "aew.hbd@wrede.ca" -smtpserver: "smtp.fastmail.com" -smtpuser: "andreas@wrede.ca" -smtppassword: "r8psra6wj6gcakkp" -smtpport: 587 ws_port: 50005 # wss_port: 50006 # Commented out - use plain WebSocket instead of secure WSS @@ -49,120 +141,114 @@ journal_file: messages.journal # Base filename journal_max_size: 104857600 # Max size (100MB default) journal_max_backups: 10 # Number of backups to keep -thresholds: +threshold_configs: default: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 - memory_monitor: - percent: - warning: 3.0 - critical: 95.0 - disk_monitor: - partitions: - /: - percent: - warning: 85.0 - critical: 90.0 - rtt: - y: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 85.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + rtt: warning: 30 critical: 250.0 freebsd_server: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 - memory_monitor: - percent: - warning: 3.0 - critical: 95.0 - disk_monitor: - partitions: - /: - percent: - warning: 85.0 - critical: 90.0 - nagios_runner: - # overall_status_code: - # warning: 1 - # critical: 2 - # operator: ">=" - load_status: - warning: WARNING - critical: CRITICAL - operator: "==" - UPS_load: - display: "{ups_output}" - warning: 70 - critical: 80 - operator: ">=" - UPS_status_code: - display: "{ups_output}" - warning: 1 - critical: 2 - operator: ">=" - nextcloud_apps_status_code: - display: "{nextcloud_apps_output}" - warning: 1 - critical: 2 - operator: ">=" - rtt: - y: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: warning: 30 critical: 250.0 truenas_server: - cpu_monitor: - cpu_percent: - warning: 80.0 - critical: 90.0 - memory_monitor: - percent: - warning: 3.0 - critical: 95.0 - disk_monitor: - partitions: - /: - percent: - warning: 85.0 - critical: 90.0 - nagios_runner: - # overall_status_code: - # warning: 1 - # critical: 2 - # operator: ">=" - load_status: - warning: WARNING - critical: CRITICAL - operator: "==" - UPS_load: - display: "{ups_output}" - warning: 70 - critical: 80 - operator: ">=" - UPS_status_code: - display: "{ups_output}" - warning: 1 - critical: 2 - operator: ">=" - nextcloud_apps_status_code: - display: "{nextcloud_apps_output}" - warning: 1 - critical: 2 - operator: ">=" - rtt: - y: + thresholds: + cpu_monitor: + cpu_percent: + warning: 80.0 + critical: 90.0 + memory_monitor: + percent: + warning: 3.0 + critical: 95.0 + disk_monitor: + partitions: + /: + percent: + warning: 85.0 + critical: 90.0 + nagios_runner: + # overall_status_code: + # warning: 1 + # critical: 2 + # operator: ">=" + load_status: + warning: WARNING + critical: CRITICAL + operator: "==" + UPS_load: + display: "{ups_output}" + warning: 70 + critical: 80 + operator: ">=" + UPS_status_code: + display: "{ups_output}" + warning: 1 + critical: 2 + operator: ">=" + nextcloud_apps_status_code: + display: "{nextcloud_apps_output}" + warning: 1 + critical: 2 + operator: ">=" + rtt: warning: 30 critical: 250.0 -host_threshold_mapping: - # Critical production servers - - wally: freebsd_server - eris: truenas_server - diff --git a/.hb.yaml.swp b/.hb.yaml.swp index 10da5a262ec771698a93c2f5e7500da252e900e1..1b2ceb37ede524a29de7a37f4da7185b81d969b2 100644 GIT binary patch literal 20480 zcmeI4d#oH)8NdhRVNsw}6Ntok*h{6wd-rkQU2GM4+uK5W(~cF|55|KbEfhcdVXfq`+d>( z8GYYpMc19^`<}l1cSP6ki@xvd%a1OzuS^4(1~Ls~8pt$|X&}=;rh!ZYnFcZqWE#jc zkZIs;(12P~l#R4|zdQiI{~y2qzjC3X{0@E!_d^2;a4B2_=fM&uO`96FL_CpJ1p$yAmDZKhFMfoNC3=YD>@OhYp z%}{_h&r_5);3>ErZiCC<1F#hSx9RmV^Cp z4+uWNANfNFTbZ@nuBDn4-BPQjUMmaVZR=rX@81EjE+|r{8J;XEqMC-)z|}1}o>!UE z7s?{%xW+uc@{ZohMH&1gquw@6t*IJToV9k{wjGr%SMAs!R*PJ|;Om}WkX1C@^(&6* zH>tE}VxjH11=H44v!MH0!PI?Ew={Rb@e8IguNRutTE64zH9fDXfkfFm$9Da)7%LWw z<3bFGwRWqe6I4?ZcH0+2qV4IzG3lMyrB}Dx+MMnS!}5Gxr9@qLy4H4ev1|KwrIVI5 z#{F*jP9%JCBD_*PuVcGtGv`h^o~w>`X2%#FCsRKv_`Hu|i)c1xF+vbNqKuBVo&?b}h_4RL1PvVEhz&@FSLQd_WUHX>Uo~n-;>s$4lVtH++ruurNYRKZL zdrLwg&dn)G&9$AT?fF5!+t!?AcdXo?$aQRc&d^I^;cwSxBm-iDEGT?i$XW%XVcD)8 zPJcOmId(bG6*Z<9534YVn8NOMoXtq^18f8(_>uTHdDPD6`hnvGM{&9Iq z!rb`=13yi|G}}Lo(mJ}%?W>)pzIAw_=WWwGUEPKn?# zGBnjUY^#D1vn<_Y4&C4}v#_4ggM4E!Cr)5K$+>5#UL}0gsinKdf3f*Gk73oTb(`;~ zrnzwP!w;hqdq$r;UB`Qv!g?p)?qX^`p@vUktWleTA~3ZT+wo3f+$=n-4=iv>XwV{q zCGCu18FG?&EJ;{=OrVY#iw%-!sg8r89Sn_@@O;&8dxK(XX1fTjC>Qilq~|NqrIG!r zq?3;~jR(Y3Xz_XkiFo3R6)XldVbq1}zNp!{XI<_K&(Sp&o21xEX>bRyk_L9PESkRW zc;!Mt-=nr1leO7ybwgB#WSP9fW+JR2R~t)h=~RoOScj^%w7^ne)DdZz=eYwzs)>Q| zqReRF>FCuaE5lH58D+`tmQkWlFl~$9`BY!x+6xYic8dhb-OEbYqMWk>QH^k6Two(L zB!3RE`H5wPOJI=2jqB-(AG8y%vq6RUqSAI7%#ffBu`bk>P;0edVyI?3dQYrCh22fe zN`&xF2)S=y{}k^*oOypdC(F=kw_|w50^$uE;!pkr$kH)ikt9({GuDmhO3KLr*euft09PlTC6`3vpi8 z&UAm}$>?pZr`quwW8Yl~jmhPgE6r-^uYUI9#G;D zlWr<6IKiwWCzO@wgtE-|c@qvkT2#9%hR2Heq7o%@hUr=yW|qZBoEDtR3gP;};a6DB z#^0j)w)B^+S)EHNr-UYHiNI2lgtro}p7cxJZ0UP^&9vLK3TJU%1-GBm_HT(L_T`2r zt2lsafq&*yx+35Ir}4>Ni7!~b|HtwD&+=Zs7uLY*yu-_P{FmTYa2TF}ufv_-!fv=0 zM&S}T7hb`y{}LR5r{T+R7u*IWY=f<^1>#tMOZa^b9L3-N92|rv;9j^DWK6(yun|V# zVt50;|4Z-#coMz}d%=XQFbbEz5_p*yfFtlMJP3C{9k#-ikb{5X_df#PhKJxTxD7g> z!ZfUhQMe42!JGL0e-DS@Y4`@*1wMQdHi3*4kZ}SRz+dq7{~8X!!>}JZP=#r@0+zwQ z@az8u4#79!Ubq!@gNy+vf%N^q%KbY8PlELK-wu1A1-oE1d<5iuE<^UoG>~cF6g3bU zUimj=@OxGf>A>$_M}&x0lD@Tsh?m7zlOFL(x6wp&cSXVJf1+!O5GS&#oc2z&k6qn_ z*tunUWzj|Lgqoj>S7@o4L@5*L)Rb2KV+TxI4MGZH1*h)D>eSkvz8h<|ZQQYO?NytU zxK675T~}?J**G(uWJ&SSAf>M;a23jGsb-=rgmfaxPij&rT*k@C2!q=mrph1~+!%`~ z(NY_R?N!`1VF5vGh!h=vx=pM~Z+^n;q@l^N)W%5WF;AuEBp62|s?fb~_@8}3KXIIf z`sXSw8}Bh&B_*oCSE~8mCX%K{QZ4tjO01FXF1&@<-ccoDC)4Sg)2{T%mPtMRP4CdP}sat6d^bq|3JHxchpDi$|i9S3c$OhklH z1##p+J_=w?^=3d^tr7+-%Si`*Qn6-m`cUWz59p05NMsC%Yr=qPHi<-uuof>U{|2|c BWO@Jq literal 20480 zcmeI2dyHL09mmJkM_c@Mxn?b>#CyM1VG&OK-Dy{Grw zGl!Wud-s+g36Ox1fPaXU7(ruV_`{GU)*uP7QL7=1CYTVhiH#xn2ZmUU{82zj{La1S z+{d=r+WH4o=Omw;IrI3<%=es`Gp|x+|MYG;mdX-b*AOztf8(8d`jg}*x%I>jT)}Lq zLsq>SUZ9v*HA*Y+E3v_J#Z&1jvxO=#TcuOgQmSd!{Yx7#F^mF6fh$uWQsvw8o5;{$ zHlqy;SU1v7e0<@`Y?)L>0i%FXz$jo8FbWt2i~>dhqriWq0xDcf9ze<0#HF8(?;luk z|6u&QE51*y2wxjNUmM@Qwjw-!G&iGwQNSo*6fg=H1&jhl0i%FXz$jo8FbWt2-UkJ2 zmyoR=A!K14GROXZGXH;T3n8z8SHO2b1*E}^;5x7YytkQ{f%p5F33(O#4!j7CfhITz?gZC^O<*H<@1uk~2kr%50+V1HxCU$i zr#BGtFxUmwfm7=V`2~0${1AK_JPHIj40eD#7yui=yU^eba2A{f$H9}}G4K%JU=i#G zTCaKVF0^|c{1%)9C&2f>Bftj-!8q6kt_A=6Fd=^iFM}V0r@`YO0EfU7NP+9X`418D zC-5?O0UQH9I0&Xd3fv4n1=fMTVUgyq;4JtxcmaGLJO-A52j;;zxD9Lto54S@c=H`f^TnXnZXNcZ{iBsQgv@RaA+eu3sQowyBK1YS@n&|KH= z(%Jglb}yRW_(k)Z*f%{tJ+Xb}!dw@(@0*>T-4O?9vvHMReSo&FmDNf7yY(6DM()%n zoA~i1sNwq;N)^}@kISNnf`Ex`ZFHtx(_@)bR~Q}C$sj+{7Xr;47O1YBbm9zgNC(RE zK(>7#Z>?D8fu~;)(KhwflVz{rPIg0`-1>5=vpN^eRn7I*pkprI^>xCi*k@CF^psDD zhy}JRN+uv>fsj^t`-(g`(lys3(NH2?vt3W2dh}WOT4SqS9q?A;t5a&c)cET8Mgz92 z9G^#S(GD+cB`ZR&tik-dMKP_k)9IL2+S4Cgek;XDBKg#NTErDJGi{}7!E|TTWo0|^ z(GjVtV6w`67yU9h!Xpvbez9&Z7fZHNLrX5uY$kaJo%;Ztjp`*PC@<4E9#x#y7z_Js zNbe}q28Qz^xzS9&odLSrUZy$*{cJXq*?m`1rfnS)WzR?QI+N0_Fqw)=*C_?X&+#r> zGOFVRcAa$+x*l3!y2|S;otzZ)c|~@Qigs)a(0kf-rbxpa{EV6%E3jGenOmT`@9(7O zaSYI@R+KdQVI7smCB#qyV?o-GvRGq!;1UXN39eMg>Q)>8Cj!cm?>kl73zDFTseSXs zJ=^DZ(s62~(uzry)=3;DR57$w6^SnDnU17L`y3;iG*eF6XG*fb5zSDgeQ$}St0gzp z5X@yM$8K5Fr3<-G1v-++WJa}^6TDu>-KS!s?tHKwQ+NX2Z# zC{xLtNHDrMH%A)1wnhk%brr_O^SSnuEoFlXsA-9OD1|-RI5Jvs>{>0fS`m<$?l$B1 z^@XIY?ZEi2OD%0URko$7+n%3tIP@$Fyl4HWQ?mvR0lPh^6_3LXCxNMeE3e$_#`!XF zn}N$=0)rEJpViKqi;8tWAz>t|QY^u?cIjc1)$-G>VqE5Evbu9;elB`SswQ8u9qzm$S8De z#fl|QNAG#J@JJ`27I6g+t6Zv9v7?|C@J3*Lj#>@QYaYvvwB16XBD#tJDiBAm+R~}| zT*_7yyQy1o%Zcx7iTi%r%}`udI~}enD35lErNre58zUy|$Al-5N76K;a8PRLRnV7o zN8WAO3-q?^aCS65I5?WkjkS_k?PK3ocNlmH1H)t4k?i2m@W^N`ADd|YN2we5>hW=^ zDy+1A*_{d}&t1J};?-(wT2Lfv*vhHK;*!=O_9Z>VzzNxkupM+_!x@4es9Z;Lv+A;f zCaoV^{hO6uz~?PHJyjT$e9tLXRozb~eP0r2wXDR+R_WimJzWa<^yk)>Z1v30l#04m z1$xUZq+06fFNW~0$$;*3QK3d=&?0*N{~6579|wB=pX}fN1@rqe;OF4GAOa;Y3vwU@ zZU!F%?;!2l;5_&vI0=3VegK{TE!*$I+J?!}C}0#Y3K#{90!9I&fKk9GU=%P47zM6c z1t`^v6799S?!FDy8yeUK!dhqkvJsDDc0gKr)2w yIZtE`@t8wAr2Sp=9gDmCLp<%1mFB43<(y45NA1jk*cFQkV^`UM*!~5S0{Iumk*CZ6 diff --git a/docs/NOTIFICATIONS.md b/docs/NOTIFICATIONS.md new file mode 100644 index 0000000..408d261 --- /dev/null +++ b/docs/NOTIFICATIONS.md @@ -0,0 +1,533 @@ +# Notification System + +## Overview + +The Heartbeat Monitoring System includes a flexible notification system that can send alerts through multiple channels including Email, Pushover, Signal, and Mattermost. The system supports centralized channel definitions with per-host routing, allowing fine-grained control over notification delivery. + +## Architecture + +### Components + +1. **Notification Channels** (`notification_channels` in config) + - Centralized definitions of notification providers + - Each channel has a type and type-specific credentials + - Reusable across multiple hosts + +2. **Channel Dispatcher** (`hbd/server/notify.py`) + - `pushmsg_for_host(hostname, message)`: Main entry point for host-specific notifications + - `_dispatch_to_channel(channel_name, channel_config, message)`: Routes to specific provider + - Provider functions: `pushover()`, `pushsignal()`, `pushmattermost()`, `send_email()` + +3. **Configuration Utilities** (`hbd/server/config.py`) + - `get_notification_channels_for_host(config, hostname)`: Retrieves channel names for a host + - `get_notification_channels_config(config, hostname)`: Retrieves full channel configurations + - `get_channel_config(config, channel_name)`: Gets configuration for a specific channel + +4. **Integration Points** + - **Threshold alerts**: `threshold.py` calls `notify_mod.pushmsg_for_host()` + - **Heartbeat events**: `udp.py` calls `notify_mod.pushmsg_for_host()` for boot/shutdown/overdue + - **Custom alerts**: Any code can call `notify_mod.pushmsg_for_host(hostname, message)` + +## Configuration + +### Centralized Channel Definitions + +Define notification channels once in your configuration file: + +```yaml +notification_channels: + # Signal notifications + signal_ops: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +1234567890 # Your Signal number + recipient: +1234567890 # Recipient number + + signal_oncall: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +1234567890 + recipient: +0987654321 # Different recipient + + # Email notifications + email_ops: + type: email + recipients: + - ops@example.com + - alerts@example.com + sender: heartbeat@example.com + smtp_server: smtp.example.com + smtp_port: 587 + smtp_user: heartbeat@example.com + smtp_password: your-smtp-password + + email_devteam: + type: email + recipients: [dev-alerts@example.com] + sender: heartbeat-dev@example.com + smtp_server: smtp.example.com + smtp_port: 587 + smtp_user: heartbeat-dev@example.com + smtp_password: your-smtp-password + + # Pushover notifications + pushover_urgent: + type: pushover + token: your-pushover-app-token + user: your-pushover-user-key + + pushover_normal: + type: pushover + token: your-pushover-app-token + user: another-user-key + + # Mattermost notifications + mattermost_devops: + type: mattermost + host: mattermost.example.com + token: your-webhook-token + channel: devops-alerts + username: heartbeat-bot + icon: https://example.com/heartbeat-icon.png +``` + +### Default Notification Channels + +Specify default channels for hosts that don't have specific channel assignments: + +```yaml +default_notification_channels: + - email_ops + - mattermost_devops +``` + +Hosts without `notification_channels` defined will use these defaults. + +### Per-Host Channel Assignment + +Assign specific channels to each host in the `hosts` section: + +```yaml +hosts: + # Critical production web server - multiple channels for redundancy + prod-web-01: + threshold_config: high_sensitivity + watch: true + notification_channels: + - signal_oncall # Immediate mobile notification + - pushover_urgent # Secondary mobile notification + - email_ops # Email for record keeping + dyndns: false + + # Database server - ops team notifications only + prod-db-01: + threshold_config: database + watch: true + notification_channels: + - signal_ops + - email_ops + dyndns: false + + # Development server - email only, no urgent notifications + dev-server-01: + threshold_config: low_sensitivity + watch: false + notification_channels: + - email_devteam + dyndns: false + + # Test server - uses default_notification_channels + test-server-01: + threshold_config: default + watch: false + dyndns: false + # No notification_channels specified = uses default_notification_channels +``` + +## Channel Types + +### Email + +Sends notifications via SMTP. + +**Configuration fields:** +```yaml +type: email +recipients: [email1@example.com, email2@example.com] # Required: List of recipients +sender: heartbeat@example.com # Required: From address +smtp_server: smtp.example.com # Required: SMTP server hostname +smtp_port: 587 # Optional: Default 587 +smtp_user: heartbeat@example.com # Optional: For authenticated SMTP +smtp_password: your-password # Optional: For authenticated SMTP +``` + +**Features:** +- Supports multiple recipients +- TLS/STARTTLS support on port 587 +- Authenticated and unauthenticated SMTP + +**Example:** +```yaml +notification_channels: + email_critical: + type: email + recipients: [admin@example.com, oncall@example.com] + sender: alerts@example.com + smtp_server: smtp.fastmail.com + smtp_port: 587 + smtp_user: alerts@example.com + smtp_password: app-specific-password +``` + +### Pushover + +Sends push notifications to mobile devices via Pushover API. + +**Configuration fields:** +```yaml +type: pushover +token: your-application-token # Required: Your Pushover app token +user: your-user-key # Required: Recipient's user key +``` + +**Features:** +- Instant mobile push notifications +- Works on iOS and Android +- Supports delivery confirmations + +**Setup:** +1. Create a Pushover account at https://pushover.net +2. Create an application to get your app token +3. Note your user key from your account dashboard + +**Example:** +```yaml +notification_channels: + pushover_admin: + type: pushover + token: azGDORePK8gMaC0QOYAMyEEuzJnyUi + user: uQiRzpo4DXghDmr9QzzfQu27cmVRsG +``` + +### Signal + +Sends notifications via Signal messenger using signal-cli. + +**Configuration fields:** +```yaml +type: signal +cli_path: /usr/local/bin/signal-cli # Optional: Path to signal-cli binary +user: +1234567890 # Required: Your Signal phone number +recipient: +0987654321 # Required: Recipient phone number +``` + +**Prerequisites:** +1. Install signal-cli: https://github.com/AsamK/signal-cli +2. Register signal-cli with your phone number: + ```bash + signal-cli -u +1234567890 register + signal-cli -u +1234567890 verify CODE + ``` +3. Ensure signal-cli is in PATH or specify full path in config + +**Features:** +- End-to-end encrypted messaging +- Works without phone being online +- No API fees or rate limits + +**Example:** +```yaml +notification_channels: + signal_admin: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +12025551234 + recipient: +12025559999 +``` + +### Mattermost + +Sends notifications to Mattermost team chat via incoming webhooks. + +**Configuration fields:** +```yaml +type: mattermost +host: mattermost.example.com # Required: Mattermost server hostname +token: your-webhook-token # Required: Incoming webhook token +channel: channel-name # Required: Target channel name +username: heartbeat-bot # Optional: Bot display name +icon: https://example.com/icon.png # Optional: Bot icon URL +``` + +**Prerequisites:** +1. Enable incoming webhooks in Mattermost +2. Create an incoming webhook for your team +3. Note the webhook token from the webhook URL + +**Features:** +- Team-wide visibility +- Rich formatting support +- Message threading + +**Example:** +```yaml +notification_channels: + mattermost_ops: + type: mattermost + host: chat.example.com + token: abc123def456ghi789 + channel: infrastructure-alerts + username: heartbeat-monitor + icon: https://example.com/heartbeat-icon.png +``` + +## Notification Events + +The system sends notifications for various events: + +### Threshold Alerts + +When monitored metrics exceed configured thresholds: + +- **State changes**: OK → WARNING, WARNING → CRITICAL, CRITICAL → OK +- **Format**: `{LEVEL}: {hostname} - {metric_path} = {value} {threshold_info}` +- **Example**: `CRITICAL: prod-web-01 - cpu_monitor.cpu_percent = 95.2 (threshold: > 90.0)` +- **Re-notifications**: Periodic reminders for ongoing alerts (default: hourly) + +### Heartbeat Events + +Host lifecycle events: + +- **Host boot**: `{hostname} booted` +- **Host shutdown**: `{hostname} {connection_type} shutdown` +- **Host recovery**: `{hostname} {connection_type} is back` +- **Connection issues**: `{hostname} {message}` +- **Host overdue**: `{hostname} {connection_type} overdue` + +Only hosts with `watch: true` send heartbeat event notifications. + +### Custom Alerts + +Application code can send custom notifications: + +```python +from hbd.server import notify as notify_mod + +# Send to host-specific channels +notify_mod.pushmsg_for_host("prod-web-01", "Custom alert message") + +# Send using global config +notify_mod.pushmsg_from_config("Global notification") + +# Send to specific config +notify_mod.pushmsg(custom_config_dict, "Targeted notification") +``` + +## Design Principles + +The notification system follows these core principles: + +- **Centralization**: Define notification providers once, reference them by name +- **Flexibility**: Each host can use different channels for different notification needs +- **Redundancy**: Critical hosts can specify multiple channels for failover +- **Clarity**: Clean separation between channel definition and channel assignment +- **Type Safety**: Provider-specific validation at configuration time + +## Best Practices + +### Channel Organization + +- **Create purpose-specific channels**: `email_ops`, `signal_oncall`, `pushover_urgent` +- **Separate by team/role**: `email_devteam`, `signal_dbateam`, `mattermost_security` +- **Use descriptive names**: Channel names appear in logs and debugging + +### Redundancy + +For critical hosts, use multiple notification channels: + +```yaml +hosts: + critical-db: + notification_channels: + - signal_oncall # Primary: Mobile alert + - pushover_urgent # Backup: Different mobile platform + - email_ops # Tertiary: Email for record-keeping +``` + +### Notification Fatigue Prevention + +- **Use `watch: false`** for non-critical hosts +- **Configure appropriate thresholds** to avoid false positives +- **Set different channels for different severities** +- **Use `default_notification_channels`** for baseline, add more for critical systems + +### Security + +- **Protect credentials**: Use file permissions to protect config files with passwords/tokens +- **Rotate tokens**: Periodically rotate API tokens and passwords +- **Use app-specific passwords**: For email, use app-specific passwords instead of main account password +- **Separate accounts**: Consider separate notification accounts for different environments (prod vs dev) + +### Testing + +Test notification channels before relying on them: + +```bash +# Test signal-cli directly +signal-cli -u +1234567890 send -m "Test message" +0987654321 + +# Test SMTP +echo "Test" | mail -s "Test Subject" admin@example.com + +# Test through heartbeat system (Python REPL) +from hbd.server import notify as notify_mod, config as config_mod +cfg = config_mod.load_config(".hb.yaml") +notify_mod.setup(cfg) +notify_mod.pushmsg_for_host("test-host", "Test notification") +``` + +## Troubleshooting + +### Notifications Not Sending + +1. **Check logs**: Look for "Failed to send notification" errors +2. **Verify host is watched**: Ensure `watch: true` in host definition +3. **Check channel configuration**: Verify credentials and settings +4. **Test channel directly**: Use command-line tools to test provider +5. **Check network**: Ensure server can reach notification endpoints + +### Signal Issues + +- **signal-cli not found**: Specify full path in `cli_path` +- **Not registered**: Run `signal-cli -u +NUMBER register` and verify +- **Trust issues**: Run `signal-cli -u +NUMBER receive` to sync trust store +- **Recipient not found**: Ensure recipient is in your Signal contacts + +### Email Issues + +- **Authentication failed**: Check SMTP username/password +- **TLS errors**: Verify SMTP port (587 for STARTTLS, 465 for SSL) +- **Relay denied**: Ensure SMTP server allows relay from your IP +- **Timeout**: Check firewall rules for SMTP ports + +### Pushover Issues + +- **Invalid token/user**: Verify token and user key from Pushover dashboard +- **API rate limits**: Pushover has monthly message limits on free tier +- **HTTP errors**: Check Pushover API status page + +### Mattermost Issues + +- **Webhook not found**: Verify webhook token and ensure webhook is enabled +- **Channel not found**: Check channel name spelling and permissions +- **Driver import error**: Install mattermostdriver: `pip install mattermostdriver` + +## API Reference + +### Main Functions + +#### `pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict` + +Send notification to host-specific channels. + +**Parameters:** +- `hostname`: Name of the host (used to look up notification channels) +- `msg`: Message to send +- `debug`: Debug level (0=no debug, 1+=debug output) + +**Returns:** Dictionary of results per channel: `{"signal_ops": True, "email_ops": False}` + +**Example:** +```python +from hbd.server import notify as notify_mod + +notify_mod.pushmsg_for_host("prod-web-01", "Server CPU at 95%") +``` + +**Behavior:** +1. Looks up notification channels configured for the host +2. If no host-specific channels, uses `default_notification_channels` +3. Dispatches to each channel in parallel +4. Returns dict of results keyed by channel name +5. Logs success/failure for each channel + +## Examples + +### Complete Configuration Example + +```yaml +# Notification channel definitions +notification_channels: + signal_oncall: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +12025551234 + recipient: +12025555678 + + email_ops: + type: email + recipients: [ops@example.com, alerts@example.com] + sender: heartbeat@example.com + smtp_server: smtp.fastmail.com + smtp_port: 587 + smtp_user: heartbeat@example.com + smtp_password: app-password-here + +# Default channels +default_notification_channels: [email_ops] + +# Host definitions with channel assignments +hosts: + prod-web-01: + threshold_config: high_sensitivity + watch: true + notification_channels: [signal_oncall, email_ops] + dyndns: false + + dev-server-01: + threshold_config: low_sensitivity + watch: false + notification_channels: [email_ops] + dyndns: false +``` + +### Multiple Environments Example + +```yaml +notification_channels: + # Production channels + signal_prod_oncall: + type: signal + user: +12025551234 + recipient: +12025551111 # On-call phone + + email_prod_ops: + type: email + recipients: [prod-ops@example.com] + sender: prod-heartbeat@example.com + smtp_server: smtp.example.com + + # Staging channels + email_staging: + type: email + recipients: [staging-alerts@example.com] + sender: staging-heartbeat@example.com + smtp_server: smtp.example.com + + # Development channels + mattermost_dev: + type: mattermost + host: chat.example.com + token: dev-webhook-token + channel: dev-alerts + +hosts: + prod-api-01: + notification_channels: [signal_prod_oncall, email_prod_ops] + + staging-api-01: + notification_channels: [email_staging] + + dev-api-01: + notification_channels: [mattermost_dev] +``` diff --git a/docs/THRESHOLD_ALERTING.md b/docs/THRESHOLD_ALERTING.md index 49b5be5..a3eed90 100644 --- a/docs/THRESHOLD_ALERTING.md +++ b/docs/THRESHOLD_ALERTING.md @@ -335,43 +335,111 @@ threshold_renotify_interval: 3600 # Re-notify every hour for ongoing alerts ### Notification Channels -Thresholds use the same notification infrastructure as heartbeat monitoring: +The system supports centralized notification channel definitions, allowing different hosts to use different notification providers and credentials. This provides fine-grained control over who gets notified about what. + +#### Supported Channel Types - **Email** (via SMTP) - **Pushover** (mobile notifications) -- **Mattermost** (team chat) -- **Custom webhooks** +- **Signal** (via signal-cli) +- **Mattermost** (team chat webhooks) -Configuration: +#### Centralized Channel Configuration + +Define notification channels once in the configuration file: ```yaml -# Email -toemail: - - admin@example.com - - oncall@example.com -fromemail: heartbeat@example.com -smtpserver: smtp.example.com -smtpport: 587 -smtpuser: heartbeat@example.com -smtppassword: your-password +notification_channels: + # Signal notifications + signal_ops: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +1234567890 + recipient: +1234567890 + + # Email notifications + email_ops: + type: email + recipients: [ops@example.com, alerts@example.com] + sender: heartbeat@example.com + smtp_server: smtp.example.com + smtp_port: 587 + smtp_user: heartbeat@example.com + smtp_password: your-smtp-password + + # Pushover notifications + pushover_urgent: + type: pushover + token: your-pushover-app-token + user: your-pushover-user-key + + # Mattermost notifications + mattermost_devops: + type: mattermost + host: mattermost.example.com + token: your-webhook-token + channel: devops-alerts + username: heartbeat-bot + icon: https://example.com/heartbeat-icon.png -# Pushover -pushover_token: your-app-token -pushover_user: your-user-key +# Default channels for hosts that don't specify channels +default_notification_channels: [email_ops] +``` + +#### Per-Host Channel Assignment + +Assign notification channels to specific hosts in the `hosts` section: + +```yaml +hosts: + # Critical server - multiple notification channels + prod-web-01: + threshold_config: high_sensitivity + watch: true + notification_channels: [signal_ops, pushover_urgent, email_ops] + dyndns: false + + # Database server - ops team only + prod-db-01: + threshold_config: database + watch: true + notification_channels: [signal_ops, email_ops] + dyndns: false + + # Development server - email only + dev-server-01: + threshold_config: low_sensitivity + watch: false + notification_channels: [email_ops] + dyndns: false + + # Uses default_notification_channels if not specified + test-server-01: + threshold_config: default + watch: false + dyndns: false ``` ### Watched Hosts -Only hosts in the `watchhosts` list will trigger notifications: +Only hosts with `watch: true` in the `hosts` section will trigger notifications: ```yaml -watchhosts: - - webserver01 - - database01 - - mailserver +hosts: + webserver01: + watch: true + notification_channels: [email_ops] + + database01: + watch: true + notification_channels: [signal_ops, email_ops] + + mailserver: + watch: true + notification_channels: [pushover_urgent] ``` -Hosts not in this list will still have thresholds checked and alert states tracked, but won't send notifications. +Hosts not marked for watching will still have thresholds checked and alert states tracked, but won't send notifications. ## Alert State Tracking diff --git a/hbd/client/main.py b/hbd/client/main.py index 0552f09..1356bb3 100644 --- a/hbd/client/main.py +++ b/hbd/client/main.py @@ -115,13 +115,14 @@ class AsyncConnection: self.logger.debug(f"Sent {msg_id} message ({len(data)} bytes)") def handle_ack(self, msg: dict, now: float): - """Handle ACK message from server.""" - try: - self.lastack = msg.get("time", now) - rtt = (self.lastack - self.lastsend) * 2000.0 # Convert to ms - except Exception: - self.lastack = now - rtt = (self.lastack - self.lastsend) * 1000.0 + """Handle ACK message from server. + + RTT is calculated as: (time ACK received) - (time HTB sent) + """ + self.lastack = now + + # Calculate RTT: time ACK received minus time HTB sent + rtt = (now - self.lastsend) * 1000.0 # Convert to ms self.rtts.append(rtt) if len(self.rtts) > 10: diff --git a/hbd/config_multi_threshold_example.yaml b/hbd/config_multi_threshold_example.yaml index c14839c..0514e97 100644 --- a/hbd/config_multi_threshold_example.yaml +++ b/hbd/config_multi_threshold_example.yaml @@ -51,13 +51,9 @@ threshold_configs: operator: ">" rtt: - # RTT thresholds per remote host - router: - warning: 50.0 # ms - critical: 200.0 - server1: - warning: 100.0 - critical: 500.0 + # RTT thresholds (applies to all hosts) + warning: 50.0 # ms + critical: 200.0 # High sensitivity configuration - lower thresholds for critical systems high_sensitivity: @@ -94,12 +90,8 @@ threshold_configs: operator: ">" rtt: - router: - warning: 30.0 - critical: 100.0 - server1: - warning: 50.0 - critical: 200.0 + warning: 30.0 + critical: 100.0 # Low sensitivity configuration - higher thresholds for development/test systems low_sensitivity: @@ -125,9 +117,8 @@ threshold_configs: operator: ">" rtt: - router: - warning: 100.0 - critical: 500.0 + warning: 100.0 + critical: 500.0 # Production database servers - specialized thresholds database: @@ -159,44 +150,147 @@ threshold_configs: operator: ">" rtt: - router: - warning: 20.0 # Stricter latency requirements - critical: 50.0 + warning: 20.0 # Stricter latency requirements + critical: 50.0 # ---------------------------------------------------------------------------- # Host to Threshold Configuration Mapping # ---------------------------------------------------------------------------- # Map specific hosts to specific threshold configurations -# Hosts not listed here will use the default_threshold_config -host_threshold_mapping: - # Critical production servers - prod-web-01: high_sensitivity - prod-web-02: high_sensitivity - prod-api-01: high_sensitivity +# ---------------------------------------------------------------------------- +# Notification Channels +# ---------------------------------------------------------------------------- +# Define notification providers centrally with their credentials +# Each channel has a type (pushover, email, signal, mattermost) and type-specific config +notification_channels: + # Signal notifications + signal_ops: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +1234567890 + recipient: +1234567890 - # Database servers - prod-db-01: database - prod-db-02: database - prod-db-replica: database + signal_oncall: + type: signal + cli_path: /usr/local/bin/signal-cli + user: +1234567890 + recipient: +0987654321 - # Development and test systems - dev-server-01: low_sensitivity - dev-server-02: low_sensitivity - test-server-01: low_sensitivity - test-server-02: low_sensitivity + # Email notifications + email_ops: + type: email + recipients: [ops@example.com, alerts@example.com] + sender: heartbeat@example.com + smtp_server: smtp.example.com + smtp_port: 587 + smtp_user: heartbeat@example.com + smtp_password: your-smtp-password - # Everything else uses 'default' (no need to list explicitly) + # Pushover notifications + pushover_urgent: + type: pushover + token: your-pushover-app-token + user: your-pushover-user-key + + # Mattermost notifications + mattermost_devops: + type: mattermost + host: mattermost.example.com + token: your-webhook-token + channel: devops-alerts + username: heartbeat-bot + icon: https://example.com/heartbeat-icon.png + +# Default notification channels (used if host doesn't specify channels) +default_notification_channels: [email_ops] # ---------------------------------------------------------------------------- -# Backward Compatibility Example +# Host Definitions (New Unified Format) # ---------------------------------------------------------------------------- -# The old single threshold format is still supported: -# Just use 'thresholds:' directly without 'threshold_configs:' +# Define hosts with threshold configs, monitoring, DNS, and notification settings +hosts: + # Critical production servers - high sensitivity, multiple notification channels + prod-web-01: + threshold_config: high_sensitivity + watch: true + notification_channels: [signal_oncall, pushover_urgent, email_ops] + dyndns: false + + prod-web-02: + threshold_config: high_sensitivity + watch: true + notification_channels: [signal_oncall, pushover_urgent, email_ops] + dyndns: false + + prod-api-01: + threshold_config: high_sensitivity + watch: true + notification_channels: [signal_oncall, email_ops] + dyndns: false + + # Database servers - database-specific thresholds + prod-db-01: + threshold_config: database + watch: true + notification_channels: [signal_ops, email_ops] + dyndns: false + + prod-db-02: + threshold_config: database + watch: true + notification_channels: [signal_ops, email_ops] + dyndns: false + + prod-db-replica: + threshold_config: database + watch: true + notification_channels: [email_ops] # Replica gets email only + dyndns: false + + # Development servers - low sensitivity, minimal notifications + dev-server-01: + threshold_config: low_sensitivity + watch: false # Don't monitor dev servers closely + notification_channels: [email_ops] + dyndns: false + + dev-server-02: + threshold_config: low_sensitivity + watch: false + notification_channels: [email_ops] + dyndns: false + + # Test servers + test-server-01: + threshold_config: low_sensitivity + watch: false + dyndns: false + # No notification channels - uses default_notification_channels + + # Home server with dynamic DNS + home-server: + threshold_config: default + watch: true + notification_channels: [signal_ops] + dyndns: true # Update DNS when IP changes + +# Hosts not listed in the hosts section will use: +# - default_threshold_config for thresholds (falls back to "default") +# - default_notification_channels for notifications + +# ---------------------------------------------------------------------------- +# Notes on Configuration Structure +# ---------------------------------------------------------------------------- +# +# All configuration is centralized in the hosts section. Each host can specify: +# - threshold_config: Name of threshold configuration to use +# - watch: Whether to monitor this host actively (send notifications) +# - notification_channels: List of channels to use for this host +# - dyndns: Whether to update DNS when IP address changes # -# thresholds: -# cpu_monitor: -# cpu_percent: -# warning: 80.0 -# critical: 90.0 +# Notification channels are defined once at the top level and referenced +# by name in host definitions, allowing easy reuse and updates. # -# This will apply the same thresholds to all hosts. +# For hosts not explicitly listed, the system will still accept heartbeats +# and track their state, but won't apply thresholds or send notifications +# unless default settings are configured. diff --git a/hbd/server/config.py b/hbd/server/config.py index 7782164..1c945a6 100644 --- a/hbd/server/config.py +++ b/hbd/server/config.py @@ -21,10 +21,9 @@ SERVER_DEFAULTS = { "logfile": "/var/log/heartbeat.log", "logfmt": "text", # text or msg or json - # Notification settings - "pushsrv": "pushover", # pushover, mattermost, or all - "pushover_token": "", - "pushover_user": "", + # Notification channels + "notification_channels": {}, # Named channels with type and credentials + "default_notification_channels": [], # Default channels if host doesn't specify # Monitoring settings "interval": 20, # Expected heartbeat interval (for server checks) @@ -32,22 +31,15 @@ SERVER_DEFAULTS = { "threshold_renotify_interval": 3600, # Seconds between threshold re-notifications # Host management - "watchhosts": [], # Hosts to monitor and notify about - "dyndnshosts": [], # Hosts with dynamic DNS + "hosts": {}, # New unified host definitions (optional) + "watchhosts": [], # Hosts to monitor and notify about (legacy) + "dyndnshosts": [], # Hosts with dynamic DNS (legacy) "drophosts": [], # Hosts to ignore "dyndomains": ["wrede.org"], # DNS updates "nsupdate_bin": "/usr/bin/nsupdate", - # Email settings - "smtpserver": "smtp.fastmail.com", - "smtpuser": "andreas@wrede.ca", - "smtppassword": "pvtvefyp5gbhnch2", - "smtpport": 587, - "toemail": ["aew.hbd.notify@wrede.ca"], - "fromemail": "aew.hbd@wrede.ca", - # WebSocket settings "ws_port": 50005, "wss_port": None, @@ -101,3 +93,162 @@ def load_config(path=None): # yaml not installed: do not attempt to parse; user must ensure defaults pass return cfg + + +def get_watchhosts(config): + """Extract watchhosts from config, supporting both new and legacy formats. + + Args: + config: Configuration dictionary + + Returns: + List of hostnames to watch + """ + watchhosts = [] + + # New format: hosts section with watch attribute + if "hosts" in config: + hosts_config = config["hosts"] + if isinstance(hosts_config, dict): + for host_name, host_attrs in hosts_config.items(): + if isinstance(host_attrs, dict) and host_attrs.get("watch", False): + watchhosts.append(host_name) + + # Legacy format: watchhosts list + if "watchhosts" in config: + legacy_watchhosts = config.get("watchhosts", []) + if isinstance(legacy_watchhosts, (list, set)): + watchhosts.extend(legacy_watchhosts) + elif isinstance(legacy_watchhosts, dict): + # Old dict format: {"host1": {attrs}, "host2": {attrs}} + watchhosts.extend(legacy_watchhosts.keys()) + + return list(set(watchhosts)) # Remove duplicates + + +def get_dyndnshosts(config): + """Extract dyndnshosts from config, supporting both new and legacy formats. + + Args: + config: Configuration dictionary + + Returns: + List of hostnames with dynamic DNS + """ + dyndnshosts = [] + + # New format: hosts section with dyndns attribute + if "hosts" in config: + hosts_config = config["hosts"] + if isinstance(hosts_config, dict): + for host_name, host_attrs in hosts_config.items(): + if isinstance(host_attrs, dict) and host_attrs.get("dyndns", False): + dyndnshosts.append(host_name) + + # Legacy format: dyndnshosts list/set + if "dyndnshosts" in config: + legacy_dyndnshosts = config.get("dyndnshosts", []) + if isinstance(legacy_dyndnshosts, (list, set)): + dyndnshosts.extend(legacy_dyndnshosts) + + return list(set(dyndnshosts)) # Remove duplicates + + +def get_host_config(config, hostname): + """Get configuration for a specific host. + + Args: + config: Configuration dictionary + hostname: Host name + + Returns: + Dictionary with host attributes or empty dict + """ + if "hosts" in config: + hosts_config = config.get("hosts", {}) + if isinstance(hosts_config, dict) and hostname in hosts_config: + return hosts_config[hostname] if isinstance(hosts_config[hostname], dict) else {} + + # Check legacy watchhosts for notification settings + if "watchhosts" in config: + watchhosts = config.get("watchhosts", {}) + if isinstance(watchhosts, dict) and hostname in watchhosts: + legacy_attrs = watchhosts[hostname] + if isinstance(legacy_attrs, dict): + # Convert legacy format to new format + return { + "watch": True, + "notify": legacy_attrs.get("notify"), + "notify_src": legacy_attrs.get("src"), + } + + return {} + + +def get_notification_channels_for_host(config, hostname): + """Get notification channels configured for a specific host. + + Args: + config: Configuration dictionary + hostname: Host name + + Returns: + List of channel names to use for this host + """ + host_config = get_host_config(config, hostname) + + # Check if host specifies notification channels + channels = host_config.get("notification_channels", []) + if channels: + if isinstance(channels, str): + return [channels] + elif isinstance(channels, list): + return channels + + # Fall back to default channels + default_channels = config.get("default_notification_channels", []) + if default_channels: + if isinstance(default_channels, str): + return [default_channels] + elif isinstance(default_channels, list): + return default_channels + + # No channels configured, return empty list (will use legacy global config) + return [] + + +def get_channel_config(config, channel_name): + """Get configuration for a specific notification channel. + + Args: + config: Configuration dictionary + channel_name: Name of the notification channel + + Returns: + Dictionary with channel configuration or None if not found + """ + channels = config.get("notification_channels", {}) + if isinstance(channels, dict) and channel_name in channels: + return channels[channel_name] + return None + + +def get_notification_channels_config(config, hostname): + """Get list of notification channel configurations for a host. + + Args: + config: Configuration dictionary + hostname: Host name + + Returns: + List of (channel_name, channel_config) tuples + """ + channel_names = get_notification_channels_for_host(config, hostname) + + channels = [] + for channel_name in channel_names: + channel_config = get_channel_config(config, channel_name) + if channel_config and channel_config.get("type"): + channels.append((channel_name, channel_config)) + + return channels diff --git a/hbd/server/dns.py b/hbd/server/dns.py index af15835..d52396f 100644 --- a/hbd/server/dns.py +++ b/hbd/server/dns.py @@ -136,16 +136,7 @@ async def dns_update_worker( ) if err: m += f", DNS update failed: {err}" - if pushmsg: - try: - await loop.run_in_executor( - None, - pushmsg, - "error: nsupdate failed", - f"{name}.dy.{dyndomain}: {m}", - ) - except Exception: - pass + logger.error("DNS update failed for %s: %s", name, err) else: m += ", DNS updated." @@ -171,7 +162,6 @@ def start_dns_worker( hbdclass, cfg: dict, log: Optional[callable] = None, - pushmsg: Optional[callable] = None, loop: Optional[asyncio.AbstractEventLoop] = None, ): """Start the async DNS worker and return the Task. @@ -218,7 +208,7 @@ def start_dns_worker( task = loop.create_task( dns_update_worker( - hbdclass, cfg, async_queue=async_q, log=log, pushmsg=pushmsg, loop=loop + hbdclass, cfg, async_queue=async_q, log=log, loop=loop ) ) return task diff --git a/hbd/server/http.py b/hbd/server/http.py index d6d293d..2a3c4ca 100644 --- a/hbd/server/http.py +++ b/hbd/server/http.py @@ -25,12 +25,7 @@ async def start( port: int, config, hbdclass, - log=None, - email=None, - pushmsg=None, - msg_to_websockets=None, tcss=None, - DEBUG=0, verbose=False, get_now=None, VER="", diff --git a/hbd/server/main.py b/hbd/server/main.py index 95f5820..5bd5fe2 100644 --- a/hbd/server/main.py +++ b/hbd/server/main.py @@ -79,14 +79,11 @@ async def _run_async(config): # Initialize threshold checker threshold_checker = threshold_mod.ThresholdChecker( config=config, - notification_callback=notify_mod.pushmsg_from_config, renotify_interval=config.get("threshold_renotify_interval", 3600), journal=msg_journal, ) logger.info("Threshold checker initialized") - pushmsg = notify_mod.pushmsg_from_config - sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) # Disable IPV6_V6ONLY option to enable dual-stack (listen on IPv4 as well) # This option is system-dependent; on many systems, setting it to False enables @@ -110,7 +107,6 @@ async def _run_async(config): config=config, hbdclass=hbdclass, log=eventlog, - pushmsg=pushmsg, msg_to_websockets=msg_to_websockets, msg_journal=msg_journal, threshold_checker=threshold_checker, @@ -132,12 +128,8 @@ async def _run_async(config): port=config.get("hbd_port", 50004), config=config, hbdclass=hbdclass, - log=eventlog, - pushmsg=pushmsg, - msg_to_websockets=msg_to_websockets, threshold_checker=threshold_checker, tcss=None, - DEBUG=config.get("debug", 0), verbose=config.get("verbose", False), get_now=lambda: time.time(), VER="", @@ -155,7 +147,7 @@ async def _run_async(config): dns_task = None try: dns_task = dns_mod.start_dns_worker( - hbdclass, config, log=eventlog, pushmsg=pushmsg, loop=loop + hbdclass, config, log=eventlog, loop=loop ) logger.info("dns update worker started") except Exception as e: @@ -273,10 +265,11 @@ def load_pickled_hosts(config, hbdclass): """Load pickled hosts from file, if available.""" import os import pickle + from . import config as config_mod pickfile = config.get("pickfile", "hbd.pickle") - dyndnshosts = config.get("dyndnshosts", []) - watchhosts = config.get("watchhosts", []) + dyndnshosts = config_mod.get_dyndnshosts(config) + watchhosts = config_mod.get_watchhosts(config) drophosts = config.get("drophosts", []) if 1 and os.path.exists(pickfile): if config.get("verbose", False): diff --git a/hbd/server/notify.py b/hbd/server/notify.py index feaedb3..0c897c0 100644 --- a/hbd/server/notify.py +++ b/hbd/server/notify.py @@ -190,55 +190,123 @@ def pushsignal( return False -def pushmsg(cfg: dict, msg: str, debug: int = 0): - """Dispatch push notifications according to `cfg['pushsrv']`. - - cfg is expected to contain keys for different services when needed, e.g. - - cfg['pushsrv'] : one of 'all', 'pushover', 'mattermost', 'signal' - - cfg['pushover_token'], cfg['pushover_user'] - - cfg['matter_host'], cfg['matter_token'], cfg['matter_channel'] - - cfg['signal_cli'], cfg['signal_user'], cfg['signal_recipient'] - - Returns a dict of results per provider. - """ +def _dispatch_to_channel(channel_name: str, channel_config: dict, msg: str, debug: int = 0) -> bool: + """Dispatch a message to a specific notification channel. + Args: + channel_name: Name of the channel (for logging) + channel_config: Channel configuration dictionary with 'type' and type-specific fields + msg: Message to send + debug: Debug level + + Returns: + True if notification sent successfully, False otherwise + """ + channel_type = channel_config.get("type") + + if channel_type == "pushover": + return pushover( + channel_config.get("token", ""), + channel_config.get("user", ""), + msg, + debug=debug + ) + + elif channel_type == "email": + # Build email from channel config + recipients = channel_config.get("recipients", []) + sender = channel_config.get("sender", "") + smtp_server = channel_config.get("smtp_server", "") + smtp_port = channel_config.get("smtp_port", 587) + smtp_user = channel_config.get("smtp_user") + smtp_password = channel_config.get("smtp_password") + + if not recipients or not sender or not smtp_server: + logger.warning( + "Email channel '%s' missing required fields: recipients=%s, sender=%s, smtp_server=%s", + channel_name, recipients, sender, smtp_server + ) + return False + + # Temporarily update _config for email() function + old_config = dict(_config) + _config["toemail"] = recipients + _config["fromemail"] = sender + _config["smtpserver"] = smtp_server + _config["smtpport"] = smtp_port + if smtp_user: + _config["smtpuser"] = smtp_user + if smtp_password: + _config["smtppassword"] = smtp_password + + result = email("Heartbeat notification", msg, debug=debug) + + # Restore config + _config.clear() + _config.update(old_config) + + return result + + elif channel_type == "signal": + return pushsignal( + channel_config.get("cli_path", "/usr/local/bin/signal-cli"), + channel_config.get("user", ""), + channel_config.get("recipient", ""), + msg, + debug=debug + ) + + elif channel_type == "mattermost": + return pushmattermost( + channel_config.get("host", ""), + channel_config.get("token", ""), + channel_config.get("channel", ""), + msg, + username=channel_config.get("username", "hbd"), + icon=channel_config.get("icon"), + debug=debug + ) + + else: + logger.warning("Unknown channel type '%s' for channel '%s'", channel_type, channel_name) + return False + + +def pushmsg_for_host(hostname: str, msg: str, debug: int = 0) -> dict: + """Send notification for a specific host using its configured channels. + + This function looks up the host's notification channels from the config + and sends the message to those channels. + + Args: + hostname: Name of the host to send notification for + msg: Message to send + debug: Debug level + + Returns: + Dictionary of results per channel: {"channel_name": True/False} + """ + from . import config as config_mod + + # Get notification channels for this host + channels = config_mod.get_notification_channels_config(_config, hostname) + + if not channels: + logger.warning("No notification channels configured for host '%s'", hostname) + return {} + + # Dispatch to each channel results = {} - p = cfg.get("pushsrv", "pushover") - if p in ("all", "pushover"): - ok = pushover( - cfg.get("pushover_token", ""), - cfg.get("pushover_user", ""), - msg, - debug=debug, - ) - results["pushover"] = ok - if p in ("all", "mattermost"): - ok = pushmattermost( - cfg.get("matter_host", ""), - cfg.get("matter_token", ""), - cfg.get("matter_channel", ""), - msg, - username=cfg.get("matter_username", "hbd"), - icon=cfg.get("matter_icon"), - debug=debug, - ) - results["mattermost"] = ok - if p in ("all", "signal"): - ok = pushsignal( - cfg.get("signal_cli", "/usr/local/bin/signal-cli"), - cfg.get("signal_user", ""), - cfg.get("signal_recipient", ""), - msg, - debug=debug, - ) - results["signal"] = ok - if p in ("all", "email"): - ok = email("Heartbeat notification", msg, debug=debug) - results["email"] = ok - logger.debug("push results: %s", results) + for channel_name, channel_config in channels: + try: + success = _dispatch_to_channel(channel_name, channel_config, msg, debug=debug) + results[channel_name] = success + if success: + logger.info("Notification sent to channel '%s': %s", channel_name, msg) + else: + logger.warning("Failed to send notification to channel '%s'", channel_name) + except Exception as e: + logger.error("Error sending to channel '%s': %s", channel_name, e) + results[channel_name] = False + return results - - -def pushmsg_from_config(msg: str, debug: int = 0) -> dict: - """Use the module-level configuration dict to dispatch a push message.""" - return pushmsg(_config, msg, debug=debug) diff --git a/hbd/server/threshold.py b/hbd/server/threshold.py index 2c88b72..1217814 100644 --- a/hbd/server/threshold.py +++ b/hbd/server/threshold.py @@ -275,7 +275,6 @@ class ThresholdChecker: def __init__( self, config: Dict[str, Any], - notification_callback: Optional[Callable] = None, renotify_interval: int = 3600, journal: Optional[Any] = None, ): @@ -284,7 +283,6 @@ class ThresholdChecker: Args: config: Threshold configuration dictionary from YAML - notification_callback: Function to call for notifications renotify_interval: Seconds between repeat notifications (default: 1 hour) journal: Optional MessageJournal instance for logging threshold events """ @@ -300,7 +298,6 @@ class ThresholdChecker: # Default config name to use when no mapping exists self.default_config = "default" - self.notification_callback = notification_callback self.renotify_interval = renotify_interval self.journal = journal @@ -367,8 +364,20 @@ class ThresholdChecker: target_dict=self.threshold_configs[config_name] ) - # Parse host to config mapping - self.host_config_mapping = config.get("host_threshold_mapping", {}) + # Parse host to config mapping from two possible sources + # 1. New format: hosts section with threshold_config attribute + if "hosts" in config: + hosts_config = config["hosts"] + if isinstance(hosts_config, dict): + for host_name, host_attrs in hosts_config.items(): + if isinstance(host_attrs, dict) and "threshold_config" in host_attrs: + self.host_config_mapping[host_name] = host_attrs["threshold_config"] + + # 2. Legacy format: host_threshold_mapping section (for backward compatibility) + if "host_threshold_mapping" in config: + legacy_mapping = config.get("host_threshold_mapping", {}) + if isinstance(legacy_mapping, dict): + self.host_config_mapping.update(legacy_mapping) # Set default config (first one alphabetically or explicitly set) self.default_config = config.get("default_threshold_config", "default") @@ -513,14 +522,13 @@ class ThresholdChecker: rtt_thresholds: Dict[str, Any], target_dict: Optional[Dict[str, ThresholdConfig]] = None ): - """Parse RTT thresholds (per-host network latency thresholds). + """Parse RTT thresholds (network latency thresholds). RTT thresholds are configured as: thresholds: rtt: - hostname1: - warning: 100.0 # ms - critical: 500.0 # ms + warning: 100.0 # ms + critical: 500.0 # ms Args: rtt_thresholds: RTT threshold configuration @@ -529,41 +537,39 @@ class ThresholdChecker: if target_dict is None: target_dict = self.thresholds - for hostname, threshold_config in rtt_thresholds.items(): - if not isinstance(threshold_config, dict): - continue - - # Metric path is "rtt." - metric_path = f"rtt.{hostname}" - - warning = threshold_config.get("warning") - critical = threshold_config.get("critical") - operator = threshold_config.get("operator", ">") - hysteresis = threshold_config.get("hysteresis", 0.1) # 10% default - enabled = threshold_config.get("enabled", True) - display = threshold_config.get("display") - - if warning is None and critical is None: - logger.warning("No RTT thresholds defined for %s, skipping", hostname) - continue - - threshold = ThresholdConfig( - metric_path=metric_path, - warning=warning, - critical=critical, - operator=operator, - hysteresis=hysteresis, - enabled=enabled, - display=display - ) - - target_dict[metric_path] = threshold - logger.debug( - "Registered RTT threshold for %s: warn=%s ms, crit=%s ms", - hostname, - warning, - critical - ) + if not isinstance(rtt_thresholds, dict): + return + + # Metric path is simply "rtt" (not per-host) + metric_path = "rtt" + + warning = rtt_thresholds.get("warning") + critical = rtt_thresholds.get("critical") + operator = rtt_thresholds.get("operator", ">") + hysteresis = rtt_thresholds.get("hysteresis", 0.1) # 10% default + enabled = rtt_thresholds.get("enabled", True) + display = rtt_thresholds.get("display") + + if warning is None and critical is None: + logger.warning("No RTT thresholds defined, skipping") + return + + threshold = ThresholdConfig( + metric_path=metric_path, + warning=warning, + critical=critical, + operator=operator, + hysteresis=hysteresis, + enabled=enabled, + display=display + ) + + target_dict[metric_path] = threshold + logger.debug( + "Registered RTT threshold: warn=%s ms, crit=%s ms", + warning, + critical + ) def get_thresholds_for_host(self, host_name: str) -> Dict[str, ThresholdConfig]: """Get the appropriate threshold configuration for a host. @@ -887,12 +893,12 @@ class ThresholdChecker: value: Any, ): """Send notification and log to journal/eventlog.""" - if self.notification_callback is not None: - try: - self.notification_callback(f"{lvl}: {host_name} - {message}") - logger.info("Notification sent: %s", message) - except Exception as e: - logger.error("Failed to send notification: %s", e) + # Send notification using host-specific channels + try: + notify_mod.pushmsg_for_host(host_name, f"{lvl}: {host_name} - {message}") + logger.info("Notification sent: %s", message) + except Exception as e: + logger.error("Failed to send notification: %s", e) # Log to journal if self.journal is not None: @@ -1017,14 +1023,14 @@ class ThresholdChecker: else: message = f"REMINDER ({alert_state.level.name}): {host_name} - {metric_path} = {value} (ongoing for {int(now - alert_state.since)}s)" - if self.notification_callback: - try: - self.notification_callback(message) - alert_state.last_notification = now - alert_state.notification_count += 1 - logger.info("Re-notification sent: %s", message) - except Exception as e: - logger.error("Failed to send re-notification: %s", e) + # Send re-notification using host-specific channels + try: + notify_mod.pushmsg_for_host(host_name, message) + alert_state.last_notification = now + alert_state.notification_count += 1 + logger.info("Re-notification sent: %s", message) + except Exception as e: + logger.error("Failed to send re-notification: %s", e) def get_active_alerts(self, alert_states: Dict[str, AlertState]) -> list: """ diff --git a/hbd/server/udp.py b/hbd/server/udp.py index 84b59a2..4f26bf7 100644 --- a/hbd/server/udp.py +++ b/hbd/server/udp.py @@ -68,7 +68,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): - config: dict of configuration - hbdclass: module providing Host/Connection classes - log: callable(loghost, message) - - pushmsg: callable(message) - msg_to_websockets: callable(typ, data) - msg_journal: MessageJournal instance for logging all messages - DEBUG, verbose @@ -91,7 +90,6 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): cfg = ctx.get("config", {}) hbdcls = ctx.get("hbdclass") log = ctx.get("log") - pushmsg = ctx.get("pushmsg") msg_to_websockets = ctx.get("msg_to_websockets") DEBUG = ctx.get("DEBUG", 0) verbose = ctx.get("verbose", False) @@ -100,18 +98,24 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): ip = addr[0] if isinstance(addr, (list, tuple)) else addr name = msg.get("name", "unknown") from ..common.utils import shortname + from . import config as config_mod uname = shortname(name) if uname not in hbdcls.Host.hosts: host = hbdcls.Host(uname) - host.dyn = uname in cfg.get("dyndnshosts", []) + # Use new config function to check dyndns + dyndnshosts = config_mod.get_dyndnshosts(cfg) + host.dyn = uname in dyndnshosts if verbose: print(("XX: New host, num now %s" % (len(hbdcls.Host.hosts)))) newh = True else: host = hbdcls.Host.hosts[uname] newh = False + + # Get watchhosts once for use throughout message handling + watchhosts = config_mod.get_watchhosts(cfg) cid = msg.get("id", 0) try: @@ -181,9 +185,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): if res: eventlog(uname, "WARNING", res) - if uname in cfg.get("watchhosts", []): - if pushmsg: - pushmsg("%s %s" % (host.name, res)) + if uname in watchhosts: + notify_mod.pushmsg_for_host(uname, "%s %s" % (host.name, res)) interval = int(msg.get("interval", 0) or 0) shutdown = msg.get("shutdown", 0) @@ -193,15 +196,13 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): if boot: eventlog(uname, "INFO", "booted") - if uname in cfg.get("watchhosts", []): + if uname in watchhosts: m = "%s booted" % (host.name) - if pushmsg: - pushmsg(m) + notify_mod.pushmsg_for_host(uname, m) if message: eventlog(uname, "INFO", "msg: %s" % message, service=service) - if uname in cfg.get("watchhosts", []): - if pushmsg: - pushmsg(message) + if uname in watchhosts: + notify_mod.pushmsg_for_host(uname, message) if conn.getstate() != hbdcls.Connection.UP: lasts = conn.state @@ -211,9 +212,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): else: m = "%s back after being %s for %s" % (conn.afam, lasts, dur(d)) eventlog(uname, "RECOVER", m) - if uname in cfg.get("watchhosts", []): - if pushmsg: - pushmsg("%s %s is back" % (uname, conn.afam)) + if uname in watchhosts: + notify_mod.pushmsg_for_host(uname, "%s %s is back" % (uname, conn.afam)) if boot or newh: host.upcount = host.doesack @@ -222,9 +222,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): if shutdown: eventlog(uname, "INFO", "%s shutdown" % conn.afam) - if uname in cfg.get("watchhosts", []): - if pushmsg: - pushmsg("%s %s shutdown" % (uname, conn.afam)) + if uname in watchhosts: + notify_mod.pushmsg_for_host(uname, "%s %s shutdown" % (uname, conn.afam)) conn.newstate(hbdcls.Connection.DOWN, now) if interval > 0: @@ -247,11 +246,21 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): connection.newstate(hbdcls.Connection.OVERDUE, now, cfg.get("grace", 2)) msg = f"{connection.afam} overdue" - eventlog(uname, "CRITICAL" if uname in cfg.get("watchhosts", []) else "WARNING", msg) + eventlog(uname, "CRITICAL" if uname in watchhosts else "WARNING", msg) - if uname in cfg.get("watchhosts", []): - if pushmsg: - pushmsg(f"{uname} {msg}") + if uname in watchhosts: + notify_mod.pushmsg_for_host(uname, f"{uname} {msg}") + + # Check RTT thresholds with infinite RTT for overdue hosts + threshold_checker = ctx.get("threshold_checker") + if threshold_checker: + metric_path = "rtt" + threshold_checker.check_value( + host_name=uname, + metric_path=metric_path, + value=float('inf'), + alert_states=host.alert_states + ) # Notify websockets if msg_to_websockets: @@ -274,8 +283,8 @@ def handle_datagram(msg: dict, addr, transport, ctx: dict): # Check RTT thresholds using the threshold checker threshold_checker = ctx.get("threshold_checker") if threshold_checker and rtt and rtt > 0: - # Metric path for RTT is "rtt." - metric_path = f"rtt.{uname}" + # Metric path for RTT is simply "rtt" + metric_path = "rtt" # Check against configured thresholds (handles alerts, notifications, etc.) threshold_checker.check_value(