alerting: silence per-ban crowdsec pushes; ntfy alert on service down/recovery

- crowdsec.nix: drop the ntfy notifications (one push per ban was constant
  noise on the WAN-exposed box); bans still happen silently.
- service-health.nix: OnFailure=notify-failure@%n.service on 16 core units
  sends an ntfy 'down' push when a unit truly fails (after exhausting
  Restart=), then a 'recovered' push when it comes back. Shares
  /var/secrets/ntfy-url.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-13 17:54:37 +01:00 · 2026-06-13 17:54:37 +01:00 · ddbc8929e4
commit ddbc8929e4
parent 3047ea547c
3 changed files with 84 additions and 58 deletions
--- a/services/service-health.nix
+++ b/services/service-health.nix
@ -0,0 +1,77 @@
+# services/service-health.nix — ntfy alert when a watched systemd unit fails,
+# and again when it recovers. Replaces the noisy per-ban CrowdSec pushes
+# (silenced in services/crowdsec.nix); both share the /var/secrets/ntfy-url topic.
+#
+# Detection is event-driven: each watched unit gets
+# OnFailure=notify-failure@%n.service. OnFailure fires only once a unit truly
+# enters the "failed" state — i.e. after it has exhausted its Restart=
+# attempts — so transient restarts stay silent and you're only paged when a
+# service has genuinely given up. The handler sends a "down" push, then polls
+# the unit every 20s for up to 2h and sends "recovered" once it is active again.
+#
+# Requires /var/secrets/ntfy-url (the same topic file CrowdSec used):
+#   echo 'https://ntfy.sh/your-topic' | sudo tee /var/secrets/ntfy-url
+#   sudo chmod 600 /var/secrets/ntfy-url
+{ config, lib, pkgs, ... }:
+let
+  # Units that page on failure (core media + infra). Every name listed here
+  # was verified to exist on the box; a name with no real unit behind it
+  # would create a stray stub service.
+  watched = [
+    "jellyfin"
+    "sonarr"
+    "radarr"
+    "prowlarr"
+    "bazarr"
+    "qbittorrent-nox"
+    "sabnzbd"
+    "authelia-main"
+    "nginx"
+    "adguardhome"
+    "crowdsec"
+    "frigate"
+    "go2rtc"
+    "homepage-dashboard"
+    "cloudflare-dyndns"
+    "gitea-runner-default"
+  ];
+
+  # Failure handler script, run as `notify <unit>` by notify-failure@.service.
+  #   $1 = the failed unit's full name, e.g. "jellyfin.service".
+  # Sends a "down" push to the ntfy topic, then polls the unit and sends a
+  # "recovered" push once it is active again. Everything is best-effort: the
+  # script always exits 0 so a broken alert path never marks the handler
+  # itself as failed, but every problem is logged to stderr (the journal).
+  # The topic URL is read at runtime (pure flake eval can't see /var/secrets).
+  notify = pkgs.writeShellScript "service-health-notify" ''
+    # No -e on purpose: a failed push must not abort the recovery wait.
+    set -uo pipefail
+    unit="$1"
+    name="''${unit%.service}"
+    host="${config.networking.hostName}"
+    secret=/var/secrets/ntfy-url
+    if [ ! -f "$secret" ]; then
+      echo "service-health: $secret missing; cannot notify" >&2
+      exit 0
+    fi
+    # Strip all whitespace, not just the trailing newline, so a stray CR or
+    # space in the secret file does not corrupt the URL.
+    url=$(${pkgs.coreutils}/bin/tr -d '[:space:]' < "$secret")
+    if [ -z "$url" ]; then
+      echo "service-health: $secret is empty; cannot notify" >&2
+      exit 0
+    fi
+
+    # post TITLE PRIORITY TAGS BODY: one ntfy push, never fatal.
+    # curl output stays suppressed so the secret topic URL cannot end up in
+    # the journal; only the exit code is logged when the push fails.
+    post() {
+      local rc=0
+      ${pkgs.curl}/bin/curl -fsS --max-time 10 \
+        -H "Title: $1" -H "Priority: $2" -H "Tags: $3" \
+        -d "$4" "$url" >/dev/null 2>&1 || rc=$?
+      if [ "$rc" -ne 0 ]; then
+        echo "service-health: ntfy push \"$1\" for $unit failed (curl exit $rc)" >&2
+      fi
+      return 0
+    }
+
+    post "Service down" high rotating_light "$name failed on $host"
+
+    # Wait for recovery: up to 2h, polling every 20s (360 polls).
+    # NOTE(review): is-active is only true while a unit is running; a watched
+    # unit that runs to completion and exits (e.g. timer-driven) would likely
+    # never report recovery here. Confirm for such units.
+    for _ in $(${pkgs.coreutils}/bin/seq 1 360); do
+      ${pkgs.coreutils}/bin/sleep 20
+      if ${pkgs.systemd}/bin/systemctl is-active --quiet "$unit"; then
+        post "Service recovered" default white_check_mark "$name is running again on $host"
+        exit 0
+      fi
+    done
+
+    # Window elapsed without recovery: leave a trace instead of ending silently.
+    echo "service-health: $unit still not active after 2h; no recovery push sent" >&2
+    exit 0
+  '';
+in
+{
+  # Only applied on the media server host; on any other host this module
+  # contributes nothing.
+  config = lib.mkIf (config.networking.hostName == "FredOS-Mediaserver") {
+
+    systemd.services =
+      let
+        # Template unit for the failure handler. The instance name %i is the
+        # failed unit's full name (e.g. jellyfin.service) and is passed
+        # straight through to the notify script as its only argument.
+        handler."notify-failure@" = {
+          description = "ntfy alert: %i failed";
+          serviceConfig.Type = "simple";
+          serviceConfig.ExecStart = "${notify} %i";
+        };
+
+        # One OnFailure hook per watched unit; %n expands to that unit's own
+        # full name, so each unit starts its own handler instance. These
+        # definitions merge with each unit's existing config.
+        hooks = lib.genAttrs watched (_unit: {
+          unitConfig.OnFailure = [ "notify-failure@%n.service" ];
+        });
+      in
+      lib.mkMerge [ handler hooks ];
+  };
+}