diff --git a/CHANGES.md b/CHANGES.md index c5f543e6..f26c5dae 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,6 +19,7 @@ Notable changes between versions. * Update Prometheus from v2.6.0 to v2.6.1 * Update kube-state-metrics from v1.4.0 to v1.5.0 * Fix ClusterRole to collect and export PodDisruptionBudget metrics ([#383](https://github.com/poseidon/typhoon/pull/383)) +* Update node-exporter from v0.15.2 to v0.17.0 * Update Grafana from v5.4.2 to v5.4.3 ## v1.13.2 diff --git a/addons/grafana/dashboards.yaml b/addons/grafana/dashboards.yaml index 4212ddf5..56a03ceb 100644 --- a/addons/grafana/dashboards.yaml +++ b/addons/grafana/dashboards.yaml @@ -1963,7 +1963,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[2m])) * 100", "hide": false, "intervalFactor": 10, "legendFormat": "", @@ -2138,7 +2138,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", "yaxis": 2 } ], @@ -2148,7 +2148,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "expr": "sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)", "intervalFactor": 2, "legendFormat": "memory usage", "metric": "memo", @@ -2157,7 +2157,7 @@ data: "target": "" }, { - "expr": "sum(node_memory_Buffers)", + "expr": "sum(node_memory_Buffers_bytes)", "interval": "", "intervalFactor": 2, "legendFormat": "memory buffers", @@ -2167,7 +2167,7 @@ data: "target": "" }, { - "expr": "sum(node_memory_Cached)", + "expr": "sum(node_memory_Cached_bytes)", "interval": "", "intervalFactor": 2, "legendFormat": "memory cached", @@ -2177,7 +2177,7 @@ data: "target": "" }, { - "expr": "sum(node_memory_MemFree)", + "expr": "sum(node_memory_MemFree_bytes)", "interval": "", "intervalFactor": 2, "legendFormat": "memory free", @@ -2268,7 +2268,7 @@ data: }, "targets": [ { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100", "intervalFactor": 2, "metric": "", "refId": "A", @@ -2355,7 +2355,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(node_disk_bytes_read[5m]))", + "expr": "max(rate(node_disk_read_bytes_total[5m]))", "hide": false, "intervalFactor": 4, "legendFormat": "read", @@ -2364,14 +2364,14 @@ data: "target": "" }, { - "expr": "sum(rate(node_disk_bytes_written[5m]))", + "expr": "max(rate(node_disk_written_bytes_total[5m]))", "intervalFactor": 4, "legendFormat": "written", "refId": "B", "step": 20 }, { - "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "expr": "max(rate(node_disk_io_time_seconds_total[5m]))", "intervalFactor": 4, "legendFormat": "io time", "refId": "C", @@ -2458,7 +2458,7 @@ data: }, "targets": [ { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})", "intervalFactor": 2, "refId": "A", "step": 60, @@ -2536,7 +2536,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[5m]))", "hide": false, "intervalFactor": 2, "legendFormat": "", @@ -2618,7 +2618,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[5m]))", "hide": false, "intervalFactor": 2, "legendFormat": "", @@ -4093,7 +4093,7 @@ data: }, "targets": [ { - "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "expr": "sum(100 - (avg by (instance) (rate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", @@ -4165,7 +4165,7 @@ data: }, "targets": [ { - "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "expr": "((sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes) - sum(node_memory_Buffers_bytes) - sum(node_memory_Cached_bytes)) / sum(node_memory_MemTotal_bytes)) * 100", "format": "time_series", "intervalFactor": 2, "refId": "A", @@ -4237,7 +4237,7 @@ data: }, "targets": [ { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", @@ -5476,7 +5476,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "expr": "100 - (avg by (cpu) (irate(node_cpu_seconds_total{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", "hide": false, "intervalFactor": 10, "legendFormat": "{{cpu}}", @@ -5652,7 +5652,7 @@ data: "renderer": "flot", "seriesOverrides": [ { - "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "alias": "node_memory_SwapFree_bytes{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", "yaxis": 2 } ], @@ -5662,7 +5662,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "expr": "node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}", "hide": false, "interval": "", "intervalFactor": 2, @@ -5672,7 +5672,7 @@ data: "step": 10 }, { - "expr": "node_memory_Buffers{instance=\"$server\"}", + "expr": "node_memory_Buffers_bytes{instance=\"$server\"}", "interval": "", "intervalFactor": 2, "legendFormat": "memory buffers", @@ -5689,7 +5689,7 @@ data: "step": 10 }, { - "expr": "node_memory_MemFree{instance=\"$server\"}", + "expr": "node_memory_MemFree_bytes{instance=\"$server\"}", "intervalFactor": 2, "legendFormat": "memory free", "metric": "", @@ -5778,7 +5778,7 @@ data: }, "targets": [ { - "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "expr": "((node_memory_MemTotal_bytes{instance=\"$server\"} - node_memory_MemFree_bytes{instance=\"$server\"} - node_memory_Buffers_bytes{instance=\"$server\"} - node_memory_Cached_bytes{instance=\"$server\"}) / node_memory_MemTotal_bytes{instance=\"$server\"}) * 100", "intervalFactor": 2, "refId": "A", "step": 60, @@ -5864,7 +5864,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_read_bytes_total{instance=\"$server\"}[2m]))", "hide": false, "intervalFactor": 4, "legendFormat": "read", @@ -5873,14 +5873,14 @@ data: "target": "" }, { - "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_written_bytes_total{instance=\"$server\"}[2m]))", "intervalFactor": 4, "legendFormat": "written", "refId": "B", "step": 20 }, { - "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "expr": "sum by (instance) (rate(node_disk_io_time_seconds_total{instance=\"$server\"}[2m]))", "intervalFactor": 4, "legendFormat": "io time", "refId": "C", @@ -5967,7 +5967,7 @@ data: }, "targets": [ { - "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "expr": "(sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free_bytes{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size_bytes{device!=\"rootfs\",instance=\"$server\"})", "intervalFactor": 2, "refId": "A", "step": 60, @@ -6045,7 +6045,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "expr": "rate(node_network_receive_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])", "hide": false, "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6127,7 +6127,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$server\",device!~\"lo\"}[5m])", "hide": false, "intervalFactor": 2, "legendFormat": "{{device}}", @@ -6184,7 +6184,7 @@ data: "multi": false, "name": "server", "options": [], - "query": "label_values(node_boot_time, instance)", + "query": "label_values(node_boot_time_seconds, instance)", "refresh": 1, "regex": "", "sort": 0, diff --git a/addons/prometheus/exporters/node-exporter/daemonset.yaml b/addons/prometheus/exporters/node-exporter/daemonset.yaml index 4164bd51..2b631c93 100644 --- a/addons/prometheus/exporters/node-exporter/daemonset.yaml +++ b/addons/prometheus/exporters/node-exporter/daemonset.yaml @@ -28,21 +28,24 @@ spec: hostPID: true containers: - name: node-exporter - image: quay.io/prometheus/node-exporter:v0.15.2 + image: quay.io/prometheus/node-exporter:v0.17.0 args: - - "--path.procfs=/host/proc" - - "--path.sysfs=/host/sys" + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ ports: - name: metrics containerPort: 9100 hostPort: 9100 resources: requests: - memory: 30Mi cpu: 100m - limits: memory: 50Mi + limits: cpu: 200m + memory: 100Mi volumeMounts: - name: proc mountPath: /host/proc @@ -50,6 +53,9 @@ spec: - name: sys mountPath: /host/sys readOnly: true + - name: root + mountPath: /host/root + readOnly: true tolerations: - effect: NoSchedule operator: Exists @@ -60,3 +66,6 @@ spec: - name: sys hostPath: path: /sys + - name: root + hostPath: + path: / diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml index ed61ecda..618cd428 100644 --- a/addons/prometheus/rules.yaml +++ b/addons/prometheus/rules.yaml @@ -456,22 +456,22 @@ data: - name: node.rules rules: - record: instance:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) BY (instance) - record: instance:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) - GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle"}[5m])) + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - alert: NodeExporterDown expr: absent(up{job="node-exporter"} == 1) for: 10m @@ -481,7 +481,7 @@ data: description: Prometheus could not scrape a node-exporter for more than 10m, or node-exporters have disappeared from discovery - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + expr: predict_linear(node_filesystem_free_bytes[6h], 3600 * 24) < 0 for: 30m labels: severity: warning @@ -489,7 +489,7 @@ data: description: device {{$labels.device}} on node {{$labels.instance}} is running full within the next 24 hours (mounted at {{$labels.mountpoint}}) - alert: NodeDiskRunningFull - expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + expr: predict_linear(node_filesystem_free_bytes[30m], 3600 * 2) < 0 for: 10m labels: severity: critical