# services/service-health.nix — ntfy alert when a watched systemd unit fails,
# and again when it recovers. Replaces the noisy per-ban CrowdSec pushes
# (silenced in services/crowdsec.nix); both share the /var/secrets/ntfy-url topic.
#
# Detection is event-driven: each watched unit gets OnFailure=notify-failure@%n.
# OnFailure fires only once a unit truly enters "failed" state — i.e. after it
# has exhausted its Restart= attempts — so transient restarts stay silent and
# you're only paged when a service has genuinely given up. The handler sends a
# "down" push, then waits for the unit to come back and sends "recovered".
#
# Requires /var/secrets/ntfy-url (the same topic file CrowdSec used):
#   echo 'https://ntfy.sh/your-topic' | sudo tee /var/secrets/ntfy-url
#   sudo chmod 600 /var/secrets/ntfy-url
{ config, lib, pkgs, ... }:

let
  # Core media + infra units to page on. All verified to exist on the box;
  # adding a name that isn't a real unit would create a stray stub service.
  watched = [
    "jellyfin"
    "sonarr"
    "radarr"
    "prowlarr"
    "bazarr"
    "qbittorrent-nox"
    "sabnzbd"
    "authelia-main"
    "nginx"
    "adguardhome"
    "crowdsec"
    "frigate"
    "go2rtc"
    "homepage-dashboard"
    "cloudflare-dyndns"
    "gitea-runner-default"
  ];

  # Handler script run by notify-failure@<unit>.service.
  #
  # Reads the topic at runtime (pure flake eval can't see /var/secrets).
  # $1 = the failed unit's full name, e.g. "jellyfin.service".
  #
  # Always best-effort: a missing/empty secret or a failed push is logged to
  # stderr and never makes the handler itself fail.
  #
  # NOTE(review): recovery is detected with `systemctl is-active`, which only
  # succeeds while the unit is running. A watched unit that exits after each run
  # (e.g. one driven by a timer) would presumably never report "recovered" —
  # confirm for cloudflare-dyndns.
  notify = pkgs.writeShellScript "service-health-notify" ''
    set -uo pipefail
    unit="$1"
    name="''${unit%.service}"
    host="${config.networking.hostName}"
    secret=/var/secrets/ntfy-url

    if [ ! -f "$secret" ]; then
      echo "service-health: $secret missing; cannot notify" >&2
      exit 0
    fi

    # Strip line endings (LF and CR) so a file written with `echo` or edited
    # with CRLF endings still yields a clean URL.
    url=$(${pkgs.coreutils}/bin/tr -d '\r\n' < "$secret")

    # An empty topic can never be posted to; bail out now instead of polling
    # for two hours with no way to deliver either push.
    if [ -z "$url" ]; then
      echo "service-health: $secret is empty; cannot notify" >&2
      exit 0
    fi

    post() { # title priority tags body
      # curl's own output is discarded as before; on failure only a generic
      # line is logged, so the topic URL never reaches the journal.
      ${pkgs.curl}/bin/curl -fsS --max-time 10 \
        -H "Title: $1" -H "Priority: $2" -H "Tags: $3" \
        -d "$4" "$url" >/dev/null 2>&1 \
        || echo "service-health: ntfy push failed ($1: $name)" >&2
    }

    post "Service down" high rotating_light "$name failed on $host"

    # Wait for recovery: up to 2h, polling every 20s.
    for _ in $(${pkgs.coreutils}/bin/seq 1 360); do
      ${pkgs.coreutils}/bin/sleep 20
      if ${pkgs.systemd}/bin/systemctl is-active --quiet "$unit"; then
        post "Service recovered" default white_check_mark "$name is running again on $host"
        exit 0
      fi
    done
  '';
in
{
  config = lib.mkIf (config.networking.hostName == "FredOS-Mediaserver") {
    systemd.services = lib.mkMerge [
      # Templated handler: %i is the failed unit's full name (jellyfin.service).
      {
        "notify-failure@" = {
          description = "ntfy alert: %i failed";
          serviceConfig = {
            Type = "simple";
            ExecStart = "${notify} %i";
          };
        };
      }

      # Wire OnFailure onto each watched unit (merges with its existing config).
      # %n expands to the full unit name, so the handler instance becomes
      # notify-failure@<unit>.service.service and its %i is "<unit>.service".
      (lib.genAttrs watched (_: {
        unitConfig.OnFailure = [ "notify-failure@%n.service" ];
      }))
    ];
  };
}