36 lines
1.5 KiB
Nix
36 lines
1.5 KiB
Nix
|
|
# services/hardware-health.nix — RAS error attribution + watchdog auto-recovery
|
||
|
|
#
|
||
|
|
# Context: in Jun 2026 the dual Xeon E5-2697 v3 began throwing a storm of
# *corrected* Machine Check Exceptions on both sockets (Bank 5 / Bank 20),
# ~18k events in 36h, eventually hanging the box. Since this host is the
# router, a hang takes the whole LAN offline until a manual power-cycle.
|
||
|
|
#
|
||
|
|
# This module:
|
||
|
|
# - rasdaemon: decodes every MCE to a specific DIMM/channel/socket and
|
||
|
|
# persists a per-component error DB, so a failing part can be named
|
||
|
|
# (needed for the seller's warranty claim). Query with `ras-mc-ctl
|
||
|
|
# --error-count` and `ras-mc-ctl --summary`.
|
||
|
|
# - hardware watchdog: if userspace hangs again, systemd stops petting
|
||
|
|
# /dev/watchdog0 and the chipset watchdog reboots the box (~30s),
|
||
|
|
# restoring the LAN without physical access.
|
||
|
|
|
||
|
|
{ config, lib, pkgs, ... }:

let
  # Everything below is gated on this one hostname; on any other host the
  # module contributes nothing.
  onMediaserver = config.networking.hostName == "FredOS-Mediaserver";
in
{
  config = lib.mkIf onMediaserver {

    # rasdaemon decodes, logs and persists machine-check / memory errors
    # per hardware component.
    hardware.rasdaemon = {
      enable = true;
    };

    # Keep the rasdaemon tools (ras-mc-ctl) on PATH for manual inspection.
    environment = {
      systemPackages = [ pkgs.rasdaemon ];
    };

    # Hardware watchdog: a hung box gets rebooted instead of leaving the LAN
    # dead. systemd pets /dev/watchdog0 at half the runtime interval; once it
    # stops (hang), the chipset resets after RuntimeWatchdogSec.
    systemd.settings.Manager.RuntimeWatchdogSec = "30s";

    # Separate watchdog timeout applied while rebooting — see
    # systemd-system.conf(5) for the exact semantics.
    systemd.settings.Manager.RebootWatchdogSec = "10min";
  };
}
|