From 84d6cfe7b3937cd37e3a65713e29992fd8aadb07 Mon Sep 17 00:00:00 2001
From: Dalton Hubble
Date: Tue, 10 Jul 2018 00:20:30 -0700
Subject: [PATCH] Add Prometheus alert rule for inactive md devices

* node-exporter exposes metrics to Prometheus about total and
  active md devices (e.g. disks in mdadm RAID arrays)
* Add alert that fires when a RAID disk fails or becomes inactive
  for another reason
---
Reviewer note (text between the "---" marker and the diff is not part of
the commit message):
  - The added InactiveRAIDDisk rule evaluates
    `node_md_disks - node_md_disks_active > 0`, i.e. it matches when the
    total md disk count exceeds the active md disk count.
  - `for: 10m` requires the condition to hold for 10 minutes before the
    alert fires; the alert is labelled `severity: warning`.
  - The description annotation reports the difference via {{$value}} and
    the affected node via {{$labels.instance}}.

 addons/prometheus/rules.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml
index c76c00d2..99dbf7c7 100644
--- a/addons/prometheus/rules.yaml
+++ b/addons/prometheus/rules.yaml
@@ -496,6 +496,13 @@ data:
         annotations:
           description: device {{$labels.device}} on node {{$labels.instance}} is running
             full within the next 2 hours (mounted at {{$labels.mountpoint}})
+      - alert: InactiveRAIDDisk
+        expr: node_md_disks - node_md_disks_active > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$value}} RAID disk(s) on node {{$labels.instance}} are inactive'
   prometheus.rules.yaml: |
     groups:
     - name: prometheus.rules