alerting: silence per-ban crowdsec pushes; ntfy alert on service down/recovery
- crowdsec.nix: drop the ntfy notifications (one push per ban was constant noise on the WAN-exposed box); bans still happen silently
- service-health.nix: OnFailure=notify-failure@%n on 16 core units sends an ntfy 'down' push when a unit truly fails (after exhausting Restart=), then a 'recovered' push when it comes back. Shares /var/secrets/ntfy-url.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
3047ea547c
commit
ddbc8929e4
3 changed files with 84 additions and 58 deletions
|
|
@ -37,6 +37,7 @@
|
||||||
./services/adguard.nix
|
./services/adguard.nix
|
||||||
./services/router.nix
|
./services/router.nix
|
||||||
./services/crowdsec.nix
|
./services/crowdsec.nix
|
||||||
|
./services/service-health.nix
|
||||||
./services/sabnzbd.nix
|
./services/sabnzbd.nix
|
||||||
./services/forgejo-runner.nix
|
./services/forgejo-runner.nix
|
||||||
./services/code-server.nix
|
./services/code-server.nix
|
||||||
|
|
|
||||||
|
|
@ -9,37 +9,10 @@
|
||||||
# 2. Delete ../modules/crowdsec/ and the disabledModules + imports lines below
|
# 2. Delete ../modules/crowdsec/ and the disabledModules + imports lines below
|
||||||
# 3. The settings/option API is the same as the PR's, so config below is forward-compatible
|
# 3. The settings/option API is the same as the PR's, so config below is forward-compatible
|
||||||
#
|
#
|
||||||
# Before first deploy, create /var/secrets/ntfy-url with your topic URL:
|
# CrowdSec bans silently — no ntfy pushes (they were constant noise).
|
||||||
# echo 'https://ntfy.sh/nordhammer-<random>' | sudo tee /var/secrets/ntfy-url
|
# The /var/secrets/ntfy-url topic is used by services/service-health.nix instead.
|
||||||
# sudo chmod 600 /var/secrets/ntfy-url
|
|
||||||
{ config, lib, pkgs, ... }:
|
{ config, lib, pkgs, ... }:
|
||||||
let
|
let
|
||||||
# The real URL is injected at service start (see ExecStartPre below) —
|
|
||||||
# eval-time builtins.readFile can't see /var/secrets under pure flake
|
|
||||||
# evaluation, which is how the `update` alias builds.
|
|
||||||
ntfyUrlPlaceholder = "@NTFY_URL@";
|
|
||||||
|
|
||||||
# The module renders settings.notifications into /etc/crowdsec/notifications/
|
|
||||||
# as a symlink into /etc/static (the store). Re-render it from the static
|
|
||||||
# source with the secret substituted on every service start; nixos-rebuild
|
|
||||||
# restores the symlink on activation, so this never goes stale.
|
|
||||||
injectNtfyUrl = pkgs.writeShellScript "crowdsec-inject-ntfy-url" ''
|
|
||||||
set -euo pipefail
|
|
||||||
src=/etc/static/crowdsec/notifications/0-nixos-generated.yaml
|
|
||||||
dst=/etc/crowdsec/notifications/0-nixos-generated.yaml
|
|
||||||
secret=/var/secrets/ntfy-url
|
|
||||||
if [ ! -f "$secret" ]; then
|
|
||||||
echo "WARNING: $secret not found; ntfy notifications will not work" >&2
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
url=$(${pkgs.coreutils}/bin/tr -d '\n' < "$secret")
|
|
||||||
tmp=$(${pkgs.coreutils}/bin/mktemp "$dst.XXXXXX")
|
|
||||||
${pkgs.gnused}/bin/sed "s|${ntfyUrlPlaceholder}|$url|g" "$src" > "$tmp"
|
|
||||||
${pkgs.coreutils}/bin/chmod 600 "$tmp"
|
|
||||||
${pkgs.coreutils}/bin/chown crowdsec:crowdsec "$tmp"
|
|
||||||
${pkgs.coreutils}/bin/mv "$tmp" "$dst"
|
|
||||||
'';
|
|
||||||
|
|
||||||
# nixpkgs only builds the agent + cscli; the new module also expects
|
# nixpkgs only builds the agent + cscli; the new module also expects
|
||||||
# notification plugins at $out/libexec/crowdsec/plugins/. Compile them
|
# notification plugins at $out/libexec/crowdsec/plugins/. Compile them
|
||||||
# from the same source tree (cmd/notification-*) and move them there.
|
# from the same source tree (cmd/notification-*) and move them there.
|
||||||
|
|
@ -142,52 +115,27 @@ in
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
|
||||||
# Push notifications via ntfy.sh
|
# Profiles set ban duration to 4h. No ntfy notifications: a push per
|
||||||
notifications = [
|
# ban was constant noise on a WAN-exposed box. ntfy is now reserved
|
||||||
{
|
# for service-down alerts (see services/service-health.nix); CrowdSec
|
||||||
name = "ntfy_http";
|
# still bans silently.
|
||||||
type = "http";
|
|
||||||
log_level = "info";
|
|
||||||
url = ntfyUrlPlaceholder;
|
|
||||||
method = "POST";
|
|
||||||
headers = {
|
|
||||||
Title = "CrowdSec alert";
|
|
||||||
Priority = "high";
|
|
||||||
Tags = "rotating_light";
|
|
||||||
};
|
|
||||||
format = ''
|
|
||||||
{{range . -}}
|
|
||||||
{{.Scenario}} from {{.Source.IP}} ({{.Source.Cn}}) — {{len .Decisions}} decision(s) taken
|
|
||||||
{{end -}}
|
|
||||||
'';
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
# Override default profiles to attach the ntfy notifier
|
|
||||||
profiles = [
|
profiles = [
|
||||||
{
|
{
|
||||||
name = "default_ip_remediation";
|
name = "default_ip_remediation";
|
||||||
filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Ip'" ];
|
filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Ip'" ];
|
||||||
decisions = [{ type = "ban"; duration = "4h"; }];
|
decisions = [{ type = "ban"; duration = "4h"; }];
|
||||||
notifications = [ "ntfy_http" ];
|
|
||||||
on_success = "break";
|
on_success = "break";
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
name = "default_range_remediation";
|
name = "default_range_remediation";
|
||||||
filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Range'" ];
|
filters = [ "Alert.Remediation == true && Alert.GetScope() == 'Range'" ];
|
||||||
decisions = [{ type = "ban"; duration = "4h"; }];
|
decisions = [{ type = "ban"; duration = "4h"; }];
|
||||||
notifications = [ "ntfy_http" ];
|
|
||||||
on_success = "break";
|
on_success = "break";
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
# Inject the ntfy topic URL into the rendered notification config before
|
|
||||||
# every start. "+" runs the script with full privileges (it reads the
|
|
||||||
# root-owned secret and replaces a root-owned /etc symlink).
|
|
||||||
systemd.services.crowdsec.serviceConfig.ExecStartPre = [ "+${injectNtfyUrl}" ];
|
|
||||||
|
|
||||||
# Firewall bouncer enforces decisions via nftables; auto-registers with LAPI
|
# Firewall bouncer enforces decisions via nftables; auto-registers with LAPI
|
||||||
services.crowdsec-firewall-bouncer = {
|
services.crowdsec-firewall-bouncer = {
|
||||||
enable = true;
|
enable = true;
|
||||||
|
|
|
||||||
77
services/service-health.nix
Normal file
77
services/service-health.nix
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
# services/service-health.nix — ntfy alert when a watched systemd unit fails,
|
||||||
|
# and again when it recovers. Replaces the noisy per-ban CrowdSec pushes
|
||||||
|
# (silenced in services/crowdsec.nix); both share the /var/secrets/ntfy-url topic.
|
||||||
|
#
|
||||||
|
# Detection is event-driven: each watched unit gets OnFailure=notify-failure@%n.
|
||||||
|
# OnFailure fires only once a unit truly enters "failed" state — i.e. after it
|
||||||
|
# has exhausted its Restart= attempts — so transient restarts stay silent and
|
||||||
|
# you're only paged when a service has genuinely given up. The handler sends a
|
||||||
|
# "down" push, then polls for up to 2h and sends "recovered" if the unit comes back.
|
||||||
|
#
|
||||||
|
# Requires /var/secrets/ntfy-url (the same topic file CrowdSec used):
|
||||||
|
# echo 'https://ntfy.sh/your-topic' | sudo tee /var/secrets/ntfy-url
|
||||||
|
# sudo chmod 600 /var/secrets/ntfy-url
|
||||||
|
{ config, lib, pkgs, ... }:
|
||||||
|
let
|
||||||
|
# Core media + infra units to page on. All verified to exist on the box;
|
||||||
|
# adding a name that isn't a real unit would create a stray stub service.
|
||||||
|
watched = [
|
||||||
|
"jellyfin" "sonarr" "radarr" "prowlarr" "bazarr"
|
||||||
|
"qbittorrent-nox" "sabnzbd" "authelia-main" "nginx"
|
||||||
|
"adguardhome" "crowdsec" "frigate" "go2rtc"
|
||||||
|
"homepage-dashboard" "cloudflare-dyndns" "gitea-runner-default"
|
||||||
|
];
|
||||||
|
|
||||||
|
# Reads the topic at runtime (pure flake eval can't see /var/secrets).
|
||||||
|
# $1 = the failed unit's full name, e.g. "jellyfin.service".
|
||||||
|
notify = pkgs.writeShellScript "service-health-notify" ''
|
||||||
|
set -uo pipefail
|
||||||
|
unit="$1"
|
||||||
|
name="''${unit%.service}"
|
||||||
|
host="${config.networking.hostName}"
|
||||||
|
secret=/var/secrets/ntfy-url
|
||||||
|
if [ ! -f "$secret" ]; then
|
||||||
|
echo "service-health: $secret missing; cannot notify" >&2
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
url=$(${pkgs.coreutils}/bin/tr -d '\n' < "$secret")
|
||||||
|
|
||||||
|
post() { # title priority tags body
|
||||||
|
${pkgs.curl}/bin/curl -fsS --max-time 10 \
|
||||||
|
-H "Title: $1" -H "Priority: $2" -H "Tags: $3" \
|
||||||
|
-d "$4" "$url" >/dev/null 2>&1 || true
|
||||||
|
}
|
||||||
|
|
||||||
|
post "Service down" high rotating_light "$name failed on $host"
|
||||||
|
|
||||||
|
# Wait for recovery: up to 2h, polling every 20s.
|
||||||
|
for _ in $(${pkgs.coreutils}/bin/seq 1 360); do
|
||||||
|
${pkgs.coreutils}/bin/sleep 20
|
||||||
|
if ${pkgs.systemd}/bin/systemctl is-active --quiet "$unit"; then
|
||||||
|
post "Service recovered" default white_check_mark "$name is running again on $host"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
'';
|
||||||
|
in
|
||||||
|
{
|
||||||
|
config = lib.mkIf (config.networking.hostName == "FredOS-Mediaserver") {
|
||||||
|
|
||||||
|
systemd.services = lib.mkMerge [
|
||||||
|
# Templated handler: %i is the failed unit's full name (jellyfin.service).
|
||||||
|
{
|
||||||
|
"notify-failure@" = {
|
||||||
|
description = "ntfy alert: %i failed";
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "simple";
|
||||||
|
ExecStart = "${notify} %i";
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
# Wire OnFailure onto each watched unit (merges with its existing config).
|
||||||
|
(lib.genAttrs watched (_: {
|
||||||
|
unitConfig.OnFailure = [ "notify-failure@%n.service" ];
|
||||||
|
}))
|
||||||
|
];
|
||||||
|
};
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue