Update node-exporter from v0.15.2 to v0.17.0
* node-exporter renamed multiple metrics that are reflected in changes to Prometheus rules and Grafana dashboard expressions
This commit is contained in:
parent
d697dd46dc
commit
f5ff003d0e
|
@ -19,6 +19,7 @@ Notable changes between versions.
|
|||
* Update Prometheus from v2.6.0 to v2.6.1
|
||||
* Update kube-state-metrics from v1.4.0 to v1.5.0
|
||||
* Fix ClusterRole to collect and export PodDisruptionBudget metrics ([#383](https://github.com/poseidon/typhoon/pull/383))
|
||||
* Update node-exporter from v0.15.2 to v0.17.0
|
||||
* Update Grafana from v5.4.2 to v5.4.3
|
||||
|
||||
## v1.13.2
|
||||
|
|
|
@ -1963,7 +1963,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100",
|
||||
"expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100",
|
||||
"hide": false,
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "",
|
||||
|
@ -2138,7 +2138,7 @@ data:
|
|||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
|
||||
"alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
|
||||
"yaxis": 2
|
||||
}
|
||||
],
|
||||
|
@ -2148,7 +2148,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)",
|
||||
"expr": "sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory usage",
|
||||
"metric": "memo",
|
||||
|
@ -2157,7 +2157,7 @@ data:
|
|||
"target": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_memory_Buffers)",
|
||||
"expr": "sum(node_memory_Buffers_bytes)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory buffers",
|
||||
|
@ -2167,7 +2167,7 @@ data:
|
|||
"target": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_memory_Cached)",
|
||||
"expr": "sum(node_memory_Cached_bytes)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory cached",
|
||||
|
@ -2177,7 +2177,7 @@ data:
|
|||
"target": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum(node_memory_MemFree)",
|
||||
"expr": "sum(node_memory_MemFree_bytes)",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory free",
|
||||
|
@ -2268,7 +2268,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100",
|
||||
"expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100",
|
||||
"intervalFactor": 2,
|
||||
"metric": "",
|
||||
"refId": "A",
|
||||
|
@ -2355,7 +2355,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_disk_bytes_read[5m]))",
|
||||
"expr": "max(rate(node_disk_read_bytes_total[5m]))",
|
||||
"hide": false,
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "read",
|
||||
|
@ -2364,14 +2364,14 @@ data:
|
|||
"target": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_disk_bytes_written[5m]))",
|
||||
"expr": "max(rate(node_disk_written_bytes_total[5m]))",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "written",
|
||||
"refId": "B",
|
||||
"step": 20
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(node_disk_io_time_ms[5m]))",
|
||||
"expr": "max(rate(node_disk_io_time_seconds_total[5m]))",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "io time",
|
||||
"refId": "C",
|
||||
|
@ -2458,7 +2458,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})",
|
||||
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
"step": 60,
|
||||
|
@ -2536,7 +2536,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))",
|
||||
"expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]))",
|
||||
"hide": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
|
@ -2618,7 +2618,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))",
|
||||
"expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))",
|
||||
"hide": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
|
@ -4093,7 +4093,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})",
|
||||
"expr": "sum(100 - (avg by (instance) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
|
@ -4165,7 +4165,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100",
|
||||
"expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
|
@ -4237,7 +4237,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})",
|
||||
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
|
@ -5476,7 +5476,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)",
|
||||
"expr": "100 - (avg by (cpu) (irate(node_cpu_seconds_total{mode=\"idle\", instance=\"$server\"}[5m])) * 100)",
|
||||
"hide": false,
|
||||
"intervalFactor": 10,
|
||||
"legendFormat": "{{cpu}}",
|
||||
|
@ -5652,7 +5652,7 @@ data:
|
|||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
|
||||
"alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}",
|
||||
"yaxis": 2
|
||||
}
|
||||
],
|
||||
|
@ -5662,7 +5662,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}",
|
||||
"expr": "node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}",
|
||||
"hide": false,
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
|
@ -5672,7 +5672,7 @@ data:
|
|||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers{instance=\"$server\"}",
|
||||
"expr": "node_memory_Buffers_bytes{instance=\"$server\"}",
|
||||
"interval": "",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory buffers",
|
||||
|
@ -5689,7 +5689,7 @@ data:
|
|||
"step": 10
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree{instance=\"$server\"}",
|
||||
"expr": "node_memory_MemFree_bytes{instance=\"$server\"}",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "memory free",
|
||||
"metric": "",
|
||||
|
@ -5778,7 +5778,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100",
|
||||
"expr": "((node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}) / node_memory_MemTotal_bytes{instance=\"$server\"}) * 100",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
"step": 60,
|
||||
|
@ -5864,7 +5864,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))",
|
||||
"expr": "sum by (instance) (rate(node_disk_read_bytes_total{instance=\"$server\"}[2m]))",
|
||||
"hide": false,
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "read",
|
||||
|
@ -5873,14 +5873,14 @@ data:
|
|||
"target": ""
|
||||
},
|
||||
{
|
||||
"expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))",
|
||||
"expr": "sum by (instance) (rate(node_disk_written_bytes_total{instance=\"$server\"}[2m]))",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "written",
|
||||
"refId": "B",
|
||||
"step": 20
|
||||
},
|
||||
{
|
||||
"expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))",
|
||||
"expr": "sum by (instance) (rate(node_disk_io_time_seconds_total{instance=\"$server\"}[2m]))",
|
||||
"intervalFactor": 4,
|
||||
"legendFormat": "io time",
|
||||
"refId": "C",
|
||||
|
@ -5967,7 +5967,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})",
|
||||
"expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"})",
|
||||
"intervalFactor": 2,
|
||||
"refId": "A",
|
||||
"step": 60,
|
||||
|
@ -6045,7 +6045,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
|
||||
"expr": "rate(node_network_receive_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])",
|
||||
"hide": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
|
@ -6127,7 +6127,7 @@ data:
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])",
|
||||
"expr": "rate(node_network_transmit_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])",
|
||||
"hide": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{device}}",
|
||||
|
@ -6184,7 +6184,7 @@ data:
|
|||
"multi": false,
|
||||
"name": "server",
|
||||
"options": [],
|
||||
"query": "label_values(node_boot_time, instance)",
|
||||
"query": "label_values(node_boot_time_seconds, instance)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
|
|
|
@ -28,21 +28,24 @@ spec:
|
|||
hostPID: true
|
||||
containers:
|
||||
- name: node-exporter
|
||||
image: quay.io/prometheus/node-exporter:v0.15.2
|
||||
image: quay.io/prometheus/node-exporter:v0.17.0
|
||||
args:
|
||||
- "--path.procfs=/host/proc"
|
||||
- "--path.sysfs=/host/sys"
|
||||
- --path.procfs=/host/proc
|
||||
- --path.sysfs=/host/sys
|
||||
- --path.rootfs=/host/root
|
||||
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
|
||||
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9100
|
||||
hostPort: 9100
|
||||
resources:
|
||||
requests:
|
||||
memory: 30Mi
|
||||
cpu: 100m
|
||||
limits:
|
||||
memory: 50Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 100Mi
|
||||
volumeMounts:
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
|
@ -50,6 +53,9 @@ spec:
|
|||
- name: sys
|
||||
mountPath: /host/sys
|
||||
readOnly: true
|
||||
- name: root
|
||||
mountPath: /host/root
|
||||
readOnly: true
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
|
@ -60,3 +66,6 @@ spec:
|
|||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: root
|
||||
hostPath:
|
||||
path: /
|
||||
|
|
|
@ -456,22 +456,22 @@ data:
|
|||
- name: node.rules
|
||||
rules:
|
||||
- record: instance:node_cpu:rate:sum
|
||||
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
|
||||
BY (instance)
|
||||
- record: instance:node_filesystem_usage:sum
|
||||
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
|
||||
expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
|
||||
BY (instance)
|
||||
- record: instance:node_network_receive_bytes:rate:sum
|
||||
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
|
||||
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
- record: instance:node_network_transmit_bytes:rate:sum
|
||||
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
|
||||
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
- record: instance:node_cpu:ratio
|
||||
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
|
||||
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
|
||||
GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
- record: cluster:node_cpu:sum_rate5m
|
||||
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m]))
|
||||
- record: cluster:node_cpu:ratio
|
||||
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
|
||||
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
- alert: NodeExporterDown
|
||||
expr: absent(up{job="node-exporter"} == 1)
|
||||
for: 10m
|
||||
|
@ -481,7 +481,7 @@ data:
|
|||
description: Prometheus could not scrape a node-exporter for more than 10m,
|
||||
or node-exporters have disappeared from discovery
|
||||
- alert: NodeDiskRunningFull
|
||||
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
|
||||
expr: predict_linear(node_filesystem_free_bytes[6h], 3600 * 24) < 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
|
@ -489,7 +489,7 @@ data:
|
|||
description: device {{$labels.device}} on node {{$labels.instance}} is running
|
||||
full within the next 24 hours (mounted at {{$labels.mountpoint}})
|
||||
- alert: NodeDiskRunningFull
|
||||
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
|
||||
expr: predict_linear(node_filesystem_free_bytes[30m], 3600 * 2) < 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
Loading…
Reference in New Issue