From ddbc8929e468b82984fbfb4c25e678edb0d8e608 Mon Sep 17 00:00:00 2001 From: rope Date: Sat, 13 Jun 2026 17:54:37 +0100 Subject: [PATCH] alerting: silence per-ban crowdsec pushes; ntfy alert on service down/recovery - crowdsec.nix: drop the ntfy notifications (one push per ban was constant noise on the WAN-exposed box); bans still happen silently - service-health.nix: OnFailure=notify-failure@%n on 16 core units sends an ntfy 'down' push when a unit truly fails (after exhausting Restart=), then a 'recovered' push when it comes back. Shares /var/secrets/ntfy-url. Co-Authored-By: Claude Fable 5 --- common.nix | 1 + services/crowdsec.nix | 64 +++--------------------------- services/service-health.nix | 77 +++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 58 deletions(-) create mode 100644 services/service-health.nix diff --git a/common.nix b/common.nix index 48cb9c8..3bd1a75 100644 --- a/common.nix +++ b/common.nix @@ -37,6 +37,7 @@ ./services/adguard.nix ./services/router.nix ./services/crowdsec.nix + ./services/service-health.nix ./services/sabnzbd.nix ./services/forgejo-runner.nix ./services/code-server.nix diff --git a/services/crowdsec.nix b/services/crowdsec.nix index 355f9d9..d35905c 100644 --- a/services/crowdsec.nix +++ b/services/crowdsec.nix @@ -9,37 +9,10 @@ # 2. Delete ../modules/crowdsec/ and the disabledModules + imports lines below # 3. The settings/option API is the same as the PR's, so config below is forward-compatible # -# Before first deploy, create /var/secrets/ntfy-url with your topic URL: -# echo 'https://ntfy.sh/nordhammer-' | sudo tee /var/secrets/ntfy-url -# sudo chmod 600 /var/secrets/ntfy-url +# CrowdSec bans silently — no ntfy pushes (they were constant noise). +# The /var/secrets/ntfy-url topic is used by services/service-health.nix instead. { config, lib, pkgs, ... 
}: let - # The real URL is injected at service start (see ExecStartPre below) — - # eval-time builtins.readFile can't see /var/secrets under pure flake - # evaluation, which is how the `update` alias builds. - ntfyUrlPlaceholder = "@NTFY_URL@"; - - # The module renders settings.notifications into /etc/crowdsec/notifications/ - # as a symlink into /etc/static (the store). Re-render it from the static - # source with the secret substituted on every service start; nixos-rebuild - # restores the symlink on activation, so this never goes stale. - injectNtfyUrl = pkgs.writeShellScript "crowdsec-inject-ntfy-url" '' - set -euo pipefail - src=/etc/static/crowdsec/notifications/0-nixos-generated.yaml - dst=/etc/crowdsec/notifications/0-nixos-generated.yaml - secret=/var/secrets/ntfy-url - if [ ! -f "$secret" ]; then - echo "WARNING: $secret not found; ntfy notifications will not work" >&2 - exit 0 - fi - url=$(${pkgs.coreutils}/bin/tr -d '\n' < "$secret") - tmp=$(${pkgs.coreutils}/bin/mktemp "$dst.XXXXXX") - ${pkgs.gnused}/bin/sed "s|${ntfyUrlPlaceholder}|$url|g" "$src" > "$tmp" - ${pkgs.coreutils}/bin/chmod 600 "$tmp" - ${pkgs.coreutils}/bin/chown crowdsec:crowdsec "$tmp" - ${pkgs.coreutils}/bin/mv "$tmp" "$dst" - ''; - # nixpkgs only builds the agent + cscli; the new module also expects # notification plugins at $out/libexec/crowdsec/plugins/. Compile them # from the same source tree (cmd/notification-*) and move them there. @@ -142,52 +115,27 @@ in } ]; - # Push notifications via ntfy.sh - notifications = [ - { - name = "ntfy_http"; - type = "http"; - log_level = "info"; - url = ntfyUrlPlaceholder; - method = "POST"; - headers = { - Title = "CrowdSec alert"; - Priority = "high"; - Tags = "rotating_light"; - }; - format = '' - {{range . -}} - {{.Scenario}} from {{.Source.IP}} ({{.Source.Cn}}) — {{len .Decisions}} decision(s) taken - {{end -}} - ''; - } - ]; - - # Override default profiles to attach the ntfy notifier + # Profiles set ban duration to 4h. 
No ntfy notifications: a push per + # ban was constant noise on a WAN-exposed box. ntfy is now reserved + # for service-down alerts (see services/service-health.nix); CrowdSec + # still bans silently. profiles = [ { name = "default_ip_remediation"; filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Ip'" ]; decisions = [{ type = "ban"; duration = "4h"; }]; - notifications = [ "ntfy_http" ]; on_success = "break"; } { name = "default_range_remediation"; filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Range'" ]; decisions = [{ type = "ban"; duration = "4h"; }]; - notifications = [ "ntfy_http" ]; on_success = "break"; } ]; }; }; - # Inject the ntfy topic URL into the rendered notification config before - # every start. "+" runs the script with full privileges (it reads the - # root-owned secret and replaces a root-owned /etc symlink). - systemd.services.crowdsec.serviceConfig.ExecStartPre = [ "+${injectNtfyUrl}" ]; - # Firewall bouncer enforces decisions via nftables; auto-registers with LAPI services.crowdsec-firewall-bouncer = { enable = true; diff --git a/services/service-health.nix b/services/service-health.nix new file mode 100644 index 0000000..9dcd112 --- /dev/null +++ b/services/service-health.nix @@ -0,0 +1,77 @@ +# services/service-health.nix — ntfy alert when a watched systemd unit fails, +# and again when it recovers. Replaces the noisy per-ban CrowdSec pushes +# (silenced in services/crowdsec.nix); both share the /var/secrets/ntfy-url topic. +# +# Detection is event-driven: each watched unit gets OnFailure=notify-failure@%n. +# OnFailure fires only once a unit truly enters "failed" state — i.e. after it +# has exhausted its Restart= attempts — so transient restarts stay silent and +# you're only paged when a service has genuinely given up. The handler sends a +# "down" push, then waits for the unit to come back and sends "recovered". 
+#
+# Requires /var/secrets/ntfy-url (the same topic file CrowdSec used):
+#   echo 'https://ntfy.sh/your-topic' | sudo tee /var/secrets/ntfy-url
+#   sudo chmod 600 /var/secrets/ntfy-url
+{ config, lib, pkgs, ... }:
+let
+  # Core media + infra units to page on. All verified to exist on the box;
+  # adding a name that isn't a real unit would create a stray stub service.
+  watched = [
+    "jellyfin" "sonarr" "radarr" "prowlarr" "bazarr"
+    "qbittorrent-nox" "sabnzbd" "authelia-main" "nginx"
+    "adguardhome" "crowdsec" "frigate" "go2rtc"
+    "homepage-dashboard" "cloudflare-dyndns" "gitea-runner-default"
+  ];
+
+  # Handler script; $1 = the failed unit's full name, e.g. "jellyfin.service".
+  # Reads the topic at runtime (pure flake eval can't see /var/secrets).
+  notify = pkgs.writeShellScript "service-health-notify" ''
+    set -uo pipefail
+    unit="$1"
+    name="''${unit%.service}"
+    host="${config.networking.hostName}"
+    secret=/var/secrets/ntfy-url
+    # Empty if the file is missing; all whitespace (newline, stray CR) is stripped.
+    url=$([ -f "$secret" ] && ${pkgs.coreutils}/bin/tr -d '[:space:]' < "$secret")
+    if [ -z "$url" ]; then
+      echo "service-health: $secret missing or empty; cannot notify" >&2
+      exit 0
+    fi
+
+    # Best-effort push: curl output stays discarded (may name the secret URL); log exit code only.
+    post() { # title priority tags body
+      ${pkgs.curl}/bin/curl -fsS --max-time 10 \
+        -H "Title: $1" -H "Priority: $2" -H "Tags: $3" \
+        -d "$4" "$url" >/dev/null 2>&1 \
+        || echo "service-health: ntfy push ($1) failed, curl exit $?" >&2
+    }
+
+    post "Service down" high rotating_light "$name failed on $host"
+    # Wait for recovery: up to 2h (360 polls, 20s apart).
+    for _ in $(${pkgs.coreutils}/bin/seq 1 360); do
+      ${pkgs.coreutils}/bin/sleep 20
+      if ${pkgs.systemd}/bin/systemctl is-active --quiet "$unit"; then
+        post "Service recovered" default white_check_mark "$name is running again on $host"
+        exit 0
+      fi
+    done
+    echo "service-health: $unit still not active after 2h; giving up" >&2
+  '';
+in
+{
+  config = lib.mkIf (config.networking.hostName == "FredOS-Mediaserver") {
+    systemd.services = lib.mkMerge [
+      # Templated handler. %i (raw instance) = failed unit's full name; %I would unescape "-" to "/".
+      {
+        "notify-failure@" = {
+          description = "ntfy alert: %i failed";
+          serviceConfig.Type = "simple";
+          serviceConfig.ExecStart = "${notify} %i";
+        };
+      }
+      # Wire OnFailure onto each watched unit (merges with its existing config).
+      (lib.genAttrs watched (_: {
+        unitConfig.OnFailure = [ "notify-failure@%n.service" ];
+      }))
+    ];
+  };
+}