# services/hardware-health.nix — RAS error attribution + watchdog auto-recovery
#
# Context: Jun 2026 the dual Xeon E5-2697 v3 began throwing a storm of
# *corrected* Machine Check Exceptions on both sockets (Bank 5 / Bank 20),
# ~18k events in 36h, eventually hanging the box. Since this host is the
# router, a hang takes the whole LAN offline until a manual power-cycle.
#
# This module:
#   - rasdaemon: decodes every MCE to a specific DIMM/channel/socket and
#     persists a per-component error DB, so a failing part can be named
#     (needed for the seller's warranty claim). Query with
#     `ras-mc-ctl --error-count` and `ras-mc-ctl --summary`.
#   - hardware watchdog: if userspace hangs again, systemd stops petting
#     /dev/watchdog0 and the chipset watchdog reboots the box (~30s),
#     restoring the LAN without physical access.
#
# NOTE: every `#` comment in Nix runs to the end of its line, so this file
# must keep its line breaks; collapsed onto one line the whole module turns
# into a single comment and no longer parses.
{ config, lib, pkgs, ... }:

{
  # Only applied on the media-server/router host; on any other hostName this
  # module contributes nothing.
  config = lib.mkIf (config.networking.hostName == "FredOS-Mediaserver") {

    # Decode + log + persist machine-check / memory errors per component.
    hardware.rasdaemon.enable = true;

    # ras-mc-ctl on PATH for manual inspection.
    environment.systemPackages = [ pkgs.rasdaemon ];

    # fwupd: lets us check whether Lenovo publishes a P700 BIOS/microcode
    # update to LVFS that can be flashed in-place (UEFI capsule, applied on
    # reboot). The dual-Xeon QPI fault is intermittent; a microcode bump may
    # improve link tolerance. If LVFS has no payload for this 2014 board,
    # this is harmless and can be removed.
    services.fwupd.enable = true;

    # Hardware watchdog: auto-reboot a hung box instead of a dead LAN.
    # systemd pets /dev/watchdog0 at half the runtime interval; if it stops
    # (hang), the chipset resets after RuntimeWatchdogSec.
    systemd.settings.Manager = {
      RuntimeWatchdogSec = "30s";
      # Separate timeout set for the reboot phase.
      RebootWatchdogSec = "10min";
    };
  };
}