diff --git a/common.nix b/common.nix
index b745c30..bac046e 100644
--- a/common.nix
+++ b/common.nix
@@ -44,6 +44,7 @@
     ./services/memos.nix
     # ./services/neko.nix # superseded by selkies.nix (Neko can't handle GW's mouse grab)
     ./services/selkies.nix
+    ./services/hardware-health.nix
   ];
 
   ### Make build time quicker
diff --git a/services/hardware-health.nix b/services/hardware-health.nix
new file mode 100644
index 0000000..6d2d525
--- /dev/null
+++ b/services/hardware-health.nix
@@ -0,0 +1,35 @@
+# services/hardware-health.nix — name the failing part, recover from hangs unattended
+#
+# Background: in Jun 2026 both sockets of the dual Xeon E5-2697 v3 started
+# logging *corrected* Machine Check Exceptions (Bank 5 / Bank 20) at a rate
+# of ~18k events in 36h, and the box eventually hung. This host is also the
+# router, so a hang means no LAN until someone power-cycles it by hand.
+#
+# Two measures, both applied only on the FredOS-Mediaserver host:
+#   1. rasdaemon decodes each MCE down to DIMM/channel/socket and keeps a
+#      per-component error database, so the faulty part can be named for
+#      the seller's warranty claim. Inspect it with
+#      `ras-mc-ctl --error-count` and `ras-mc-ctl --summary`.
+#   2. Hardware watchdog: systemd keeps petting /dev/watchdog0 while it is
+#      alive; after a hang the petting stops and the chipset watchdog
+#      resets the machine (~30s), bringing the LAN back without a visit.
+
+{ config, lib, pkgs, ... }:
+let
+  onMediaserver = config.networking.hostName == "FredOS-Mediaserver";
+in
+{
+  config = lib.mkIf onMediaserver {
+    hardware.rasdaemon.enable = true; # decode, log and persist MCE / memory errors
+
+    # Puts ras-mc-ctl on PATH for manual inspection.
+    environment.systemPackages = [ pkgs.rasdaemon ];
+
+    # Hardware watchdog: systemd pets /dev/watchdog0 at half of
+    # RuntimeWatchdogSec; if the pets stop (hang) the chipset resets the box
+    # once that interval has elapsed, so a hang costs a reboot rather than a
+    # dead LAN.
+    systemd.settings.Manager.RuntimeWatchdogSec = "30s";
+    systemd.settings.Manager.RebootWatchdogSec = "10min";
+  };
+}