Update node-exporter from v0.15.2 to v0.17.0

* node-exporter renamed multiple metrics; the Prometheus rules and
Grafana dashboard expressions are updated to use the new metric names
This commit is contained in:
Dalton Hubble 2019-01-22 00:19:55 -08:00
parent d697dd46dc
commit f5ff003d0e
4 changed files with 54 additions and 44 deletions

View File

@ -19,6 +19,7 @@ Notable changes between versions.
* Update Prometheus from v2.6.0 to v2.6.1
* Update kube-state-metrics from v1.4.0 to v1.5.0
* Fix ClusterRole to collect and export PodDisruptionBudget metrics ([#383](https://github.com/poseidon/typhoon/pull/383))
* Update node-exporter from v0.15.2 to v0.17.0
* Update Grafana from v5.4.2 to v5.4.3
## v1.13.2

View File

@ -1963,7 +1963,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100",
"expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100",
"hide": false,
"intervalFactor": 10,
"legendFormat": "",
@ -2138,7 +2138,7 @@ data:
"renderer": "flot",
"seriesOverrides": [
{
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"yaxis": 2
}
],
@ -2148,7 +2148,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)",
"expr": "sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)",
"intervalFactor": 2,
"legendFormat": "memory usage",
"metric": "memo",
@ -2157,7 +2157,7 @@ data:
"target": ""
},
{
"expr": "sum(node_memory_Buffers)",
"expr": "sum(node_memory_Buffers_bytes)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory buffers",
@ -2167,7 +2167,7 @@ data:
"target": ""
},
{
"expr": "sum(node_memory_Cached)",
"expr": "sum(node_memory_Cached_bytes)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory cached",
@ -2177,7 +2177,7 @@ data:
"target": ""
},
{
"expr": "sum(node_memory_MemFree)",
"expr": "sum(node_memory_MemFree_bytes)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory free",
@ -2268,7 +2268,7 @@ data:
},
"targets": [
{
"expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100",
"expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100",
"intervalFactor": 2,
"metric": "",
"refId": "A",
@ -2355,7 +2355,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_disk_bytes_read[5m]))",
"expr": "max(rate(node_disk_read_bytes_total[5m]))",
"hide": false,
"intervalFactor": 4,
"legendFormat": "read",
@ -2364,14 +2364,14 @@ data:
"target": ""
},
{
"expr": "sum(rate(node_disk_bytes_written[5m]))",
"expr": "max(rate(node_disk_written_bytes_total[5m]))",
"intervalFactor": 4,
"legendFormat": "written",
"refId": "B",
"step": 20
},
{
"expr": "sum(rate(node_disk_io_time_ms[5m]))",
"expr": "max(rate(node_disk_io_time_seconds_total[5m]))",
"intervalFactor": 4,
"legendFormat": "io time",
"refId": "C",
@ -2458,7 +2458,7 @@ data:
},
"targets": [
{
"expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})",
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})",
"intervalFactor": 2,
"refId": "A",
"step": 60,
@ -2536,7 +2536,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))",
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
@ -2618,7 +2618,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))",
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))",
"hide": false,
"intervalFactor": 2,
"legendFormat": "",
@ -4093,7 +4093,7 @@ data:
},
"targets": [
{
"expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})",
"expr": "sum(100 - (avg by (instance) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"})",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
@ -4165,7 +4165,7 @@ data:
},
"targets": [
{
"expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100",
"expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
@ -4237,7 +4237,7 @@ data:
},
"targets": [
{
"expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})",
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
@ -5476,7 +5476,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)",
"expr": "100 - (avg by (cpu) (irate(node_cpu_seconds_total{mode=\"idle\", instance=\"$server\"}[5m])) * 100)",
"hide": false,
"intervalFactor": 10,
"legendFormat": "{{cpu}}",
@ -5652,7 +5652,7 @@ data:
"renderer": "flot",
"seriesOverrides": [
{
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
"yaxis": 2
}
],
@ -5662,7 +5662,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}",
"expr": "node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}",
"hide": false,
"interval": "",
"intervalFactor": 2,
@ -5672,7 +5672,7 @@ data:
"step": 10
},
{
"expr": "node_memory_Buffers{instance=\"$server\"}",
"expr": "node_memory_Buffers_bytes{instance=\"$server\"}",
"interval": "",
"intervalFactor": 2,
"legendFormat": "memory buffers",
@ -5689,7 +5689,7 @@ data:
"step": 10
},
{
"expr": "node_memory_MemFree{instance=\"$server\"}",
"expr": "node_memory_MemFree_bytes{instance=\"$server\"}",
"intervalFactor": 2,
"legendFormat": "memory free",
"metric": "",
@ -5778,7 +5778,7 @@ data:
},
"targets": [
{
"expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100",
"expr": "((node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}) / node_memory_MemTotal_bytes{instance=\"$server\"}) * 100",
"intervalFactor": 2,
"refId": "A",
"step": 60,
@ -5864,7 +5864,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))",
"expr": "sum by (instance) (rate(node_disk_read_bytes_total{instance=\"$server\"}[2m]))",
"hide": false,
"intervalFactor": 4,
"legendFormat": "read",
@ -5873,14 +5873,14 @@ data:
"target": ""
},
{
"expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))",
"expr": "sum by (instance) (rate(node_disk_written_bytes_total{instance=\"$server\"}[2m]))",
"intervalFactor": 4,
"legendFormat": "written",
"refId": "B",
"step": 20
},
{
"expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))",
"expr": "sum by (instance) (rate(node_disk_io_time_seconds_total{instance=\"$server\"}[2m]))",
"intervalFactor": 4,
"legendFormat": "io time",
"refId": "C",
@ -5967,7 +5967,7 @@ data:
},
"targets": [
{
"expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})",
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"})",
"intervalFactor": 2,
"refId": "A",
"step": 60,
@ -6045,7 +6045,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
"expr": "rate(node_network_receive_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{device}}",
@ -6127,7 +6127,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
"expr": "rate(node_network_transmit_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])",
"hide": false,
"intervalFactor": 2,
"legendFormat": "{{device}}",
@ -6184,7 +6184,7 @@ data:
"multi": false,
"name": "server",
"options": [],
"query": "label_values(node_boot_time, instance)",
"query": "label_values(node_boot_time_seconds, instance)",
"refresh": 1,
"regex": "",
"sort": 0,

View File

@ -28,21 +28,24 @@ spec:
hostPID: true
containers:
- name: node-exporter
image: quay.io/prometheus/node-exporter:v0.15.2
image: quay.io/prometheus/node-exporter:v0.17.0
args:
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
ports:
- name: metrics
containerPort: 9100
hostPort: 9100
resources:
requests:
memory: 30Mi
cpu: 100m
limits:
memory: 50Mi
limits:
cpu: 200m
memory: 100Mi
volumeMounts:
- name: proc
mountPath: /host/proc
@ -50,6 +53,9 @@ spec:
- name: sys
mountPath: /host/sys
readOnly: true
- name: root
mountPath: /host/root
readOnly: true
tolerations:
- effect: NoSchedule
operator: Exists
@ -60,3 +66,6 @@ spec:
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /

View File

@ -456,22 +456,22 @@ data:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
@ -481,7 +481,7 @@ data:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
expr: predict_linear(node_filesystem_free_bytes[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
@ -489,7 +489,7 @@ data:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 24 hours (mounted at {{$labels.mountpoint}})
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
expr: predict_linear(node_filesystem_free_bytes[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical