nixos/services/service-health.nix

78 lines
2.9 KiB
Nix
Raw Normal View History

# services/service-health.nix — ntfy alert when a watched systemd unit fails,
# and again when it recovers. Replaces the noisy per-ban CrowdSec pushes
# (silenced in services/crowdsec.nix); both share the /var/secrets/ntfy-url topic.
#
# Detection is event-driven: each watched unit gets OnFailure=notify-failure@%n.
# OnFailure fires only once a unit truly enters "failed" state — i.e. after it
# has exhausted its Restart= attempts — so transient restarts stay silent and
# you're only paged when a service has genuinely given up. The handler sends a
# "down" push, then polls for up to 2h for the unit to become active again and
# sends "recovered" if it does.
#
# Requires /var/secrets/ntfy-url (the same topic file CrowdSec used):
#   echo 'https://ntfy.sh/your-topic' | sudo tee /var/secrets/ntfy-url
#   sudo chmod 600 /var/secrets/ntfy-url
{ config, lib, pkgs, ... }:
let
# Units that trigger a page: the core media and infrastructure services.
# Every entry was verified to exist on the box. Only list real units here —
# a name without an existing unit would create a stray stub service.
watched = [
  "jellyfin"
  "sonarr"
  "radarr"
  "prowlarr"
  "bazarr"
  "qbittorrent-nox"
  "sabnzbd"
  "authelia-main"
  "nginx"
  "adguardhome"
  "crowdsec"
  "go2rtc"
  "homepage-dashboard"
  "cloudflare-dyndns"
  "gitea-runner-default"
];
# Failure handler script, started by systemd as notify-failure@<unit>.
#
# Reads the ntfy topic URL at runtime (pure flake eval can't see /var/secrets).
# $1 = the failed unit's full name, e.g. "jellyfin.service".
#
# Behaviour:
#   1. Sends a high-priority "Service down" push.
#   2. Polls `systemctl is-active` every 20s for up to 2h and sends a
#      "Service recovered" push the first time the unit is active again.
#   3. Always exits 0: a missing/empty secret or a failed push is logged to
#      stderr (the journal) but never marks the handler itself as failed.
#
# Only curl's exit code is logged, never its output, so the topic URL (which
# acts as the credential) does not end up in the journal.
notify = pkgs.writeShellScript "service-health-notify" ''
  set -uo pipefail
  unit="$1"
  name="''${unit%.service}"
  host="${config.networking.hostName}"
  secret=/var/secrets/ntfy-url

  if [ ! -f "$secret" ]; then
    echo "service-health: $secret missing; cannot notify" >&2
    exit 0
  fi

  # Strip all whitespace, not just newlines: a stray CR (file written with
  # CRLF line endings) or trailing blank would otherwise corrupt the URL.
  url=$(${pkgs.coreutils}/bin/tr -d '[:space:]' < "$secret")
  if [ -z "$url" ]; then
    echo "service-health: $secret is empty; cannot notify" >&2
    exit 0
  fi

  # post TITLE PRIORITY TAGS BODY
  # Best-effort push: never aborts the script, but reports curl's exit code
  # so a broken topic or network outage is visible in the journal.
  post() {
    local rc=0
    ${pkgs.curl}/bin/curl -fsS --max-time 10 \
      -H "Title: $1" -H "Priority: $2" -H "Tags: $3" \
      -d "$4" "$url" >/dev/null 2>&1 || rc=$?
    if [ "$rc" -ne 0 ]; then
      echo "service-health: ntfy push ($1) for $unit failed (curl exit $rc)" >&2
    fi
    return 0
  }

  post "Service down" high rotating_light "$name failed on $host"

  # Wait for recovery: up to 2h, polling every 20s.
  # NOTE(review): is-active is only true while the unit is running, so units
  # that run to completion (e.g. timer-triggered jobs) may recover without
  # this loop ever observing it — confirm for each watched unit.
  for _ in $(${pkgs.coreutils}/bin/seq 1 360); do
    ${pkgs.coreutils}/bin/sleep 20
    if ${pkgs.systemd}/bin/systemctl is-active --quiet "$unit"; then
      post "Service recovered" default white_check_mark "$name is running again on $host"
      exit 0
    fi
  done

  echo "service-health: $unit did not become active within 2h; giving up" >&2
  exit 0
'';
in
let
  # Everything below is applied only on the FredOS-Mediaserver host.
  onMediaServer = config.networking.hostName == "FredOS-Mediaserver";

  # Template unit that runs the notify script. Inside the template, %i is the
  # failed unit's full name (e.g. jellyfin.service), passed on as $1.
  handlerUnit = {
    "notify-failure@" = {
      description = "ntfy alert: %i failed";
      serviceConfig.Type = "simple";
      serviceConfig.ExecStart = "${notify} %i";
    };
  };

  # Fragment merged into each watched unit's existing definition; %n expands
  # to the watched unit's own full name, which becomes the handler instance.
  failureHook = {
    unitConfig.OnFailure = [ "notify-failure@%n.service" ];
  };

  # One attribute per watched unit name, each carrying the OnFailure hook.
  hookedUnits =
    lib.listToAttrs (map (unit: lib.nameValuePair unit failureHook) watched);
in
{
  config = lib.mkIf onMediaServer {
    systemd.services = lib.mkMerge [
      handlerUnit
      hookedUnits
    ];
  };
}