Refresh Prometheus rules/alerts and Grafana dashboards

* Refresh upstream Prometheus rules and alerts and Grafana
dashboards
* All Loki recording rules for convenience
This commit is contained in:
Dalton Hubble 2020-03-31 00:50:16 -07:00
parent 3c1be7b0e0
commit d47d40b517
4 changed files with 267 additions and 302 deletions

View File

@ -15,6 +15,10 @@ Notable changes between versions.
* Fix Azure worker UDP outbound connections ([#691](https://github.com/poseidon/typhoon/pull/691))
* Fix Azure worker clock sync timeouts
#### Addons
* Refresh Prometheus rules and alerts
## v1.18.0
* Kubernetes [v1.18.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.18.md#v1180)

View File

@ -59,7 +59,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[1m]))",
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__interval]))",
"format": "time_series",
"instant": true,
"intervalFactor": 2,
@ -1561,7 +1561,7 @@ data:
],
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1570,7 +1570,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1579,7 +1579,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1588,7 +1588,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1597,7 +1597,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1606,7 +1606,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -1706,7 +1706,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1804,7 +1804,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -1902,7 +1902,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2000,7 +2000,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2098,7 +2098,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2196,7 +2196,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2294,7 +2294,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2392,7 +2392,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$interval])) by (namespace)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{namespace}}",
@ -2499,41 +2499,6 @@ data:
"type": "query",
"useTags": false
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"current": {
@ -4033,7 +3998,7 @@ data:
],
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4042,7 +4007,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4051,7 +4016,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4060,7 +4025,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4069,7 +4034,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4078,7 +4043,7 @@ data:
"step": 10
},
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4178,7 +4143,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4276,7 +4241,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4374,7 +4339,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4472,7 +4437,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4570,7 +4535,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4668,7 +4633,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -4748,41 +4713,6 @@ data:
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"current": {
@ -5724,41 +5654,6 @@ data:
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"current": {

View File

@ -1042,7 +1042,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1140,7 +1140,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1238,7 +1238,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1336,7 +1336,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1434,7 +1434,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1532,7 +1532,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$interval])) by (pod)",
"expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -1612,41 +1612,6 @@ data:
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"current": {
@ -2701,7 +2666,7 @@ data:
],
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2710,7 +2675,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2719,7 +2684,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2728,7 +2693,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2737,7 +2702,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2746,7 +2711,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -2846,7 +2811,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -2944,7 +2909,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3042,7 +3007,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3140,7 +3105,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3238,7 +3203,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3336,7 +3301,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3434,7 +3399,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3532,7 +3497,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{pod}}",
@ -3612,41 +3577,6 @@ data:
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"current": {
@ -3708,7 +3638,7 @@ data:
"value": ""
},
"datasource": "$datasource",
"hide": 2,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
@ -4906,7 +4836,7 @@ data:
],
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4915,7 +4845,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4924,7 +4854,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4933,7 +4863,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4942,7 +4872,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -4951,7 +4881,7 @@ data:
"step": 10
},
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n",
"format": "table",
"instant": true,
"intervalFactor": 2,
@ -5051,7 +4981,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5149,7 +5079,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5247,7 +5177,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5345,7 +5275,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5443,7 +5373,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5541,7 +5471,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5639,7 +5569,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5737,7 +5667,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod) \ngroup_left(workload,workload_type) mixin_pod_workload{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{workload}}",
@ -5817,41 +5747,6 @@ data:
"regex": "",
"type": "datasource"
},
{
"allValue": null,
"auto": false,
"auto_count": 30,
"auto_min": "10s",
"current": {
"text": "5m",
"value": "5m"
},
"datasource": "$datasource",
"hide": 2,
"includeAll": false,
"label": null,
"multi": false,
"name": "interval",
"options": [
{
"selected": true,
"text": "4h",
"value": "4h"
}
],
"query": "4h",
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [
],
"tagsQuery": "",
"type": "interval",
"useTags": false
},
{
"allValue": null,
"auto": false,

View File

@ -252,25 +252,25 @@ data:
"name": "kube-apiserver.rules",
"rules": [
{
"expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod)\n",
"expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n",
"record": "cluster:apiserver_request_duration_seconds:mean5m"
},
{
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.99"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.9"
},
"record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile"
},
{
"expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"expr": "histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n",
"labels": {
"quantile": "0.5"
},
@ -805,6 +805,7 @@ data:
{
"alert": "ErrorBudgetBurn",
"annotations": {
"message": "High requests error budget burn for job=apiserver (current value: {{ $value }})",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn"
},
"expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1h{job=\"apiserver\"} > (14.4*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate5m{job=\"apiserver\"} > (14.4*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (6*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate30m{job=\"apiserver\"} > (6*0.010000)\n)\n",
@ -816,6 +817,7 @@ data:
{
"alert": "ErrorBudgetBurn",
"annotations": {
"message": "High requests error budget burn for job=apiserver (current value: {{ $value }})",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn"
},
"expr": "(\n status_class_5xx:apiserver_request_total:ratio_rate1d{job=\"apiserver\"} > (3*0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate2h{job=\"apiserver\"} > (3*0.010000)\n)\nor\n(\n status_class_5xx:apiserver_request_total:ratio_rate3d{job=\"apiserver\"} > (0.010000)\n and\n status_class_5xx:apiserver_request_total:ratio_rate6h{job=\"apiserver\"} > (0.010000)\n)\n",
@ -853,30 +855,6 @@ data:
"severity": "critical"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
"message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
},
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.03\n",
"for": "10m",
"labels": {
"severity": "critical"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
"message": "API server is returning errors for {{ $value | humanizePercentage }} of requests.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
},
"expr": "sum(rate(apiserver_request_total{job=\"apiserver\",code=~\"5..\"}[5m]))\n /\nsum(rate(apiserver_request_total{job=\"apiserver\"}[5m])) > 0.01\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{
"alert": "KubeAPIErrorsHigh",
"annotations": {
@ -993,7 +971,7 @@ data:
"message": "Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods"
},
"expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"}) by(node) > 0.95\n",
"expr": "max(max(kubelet_running_pod_count{job=\"kubelet\"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job=\"kubelet\"}) by(node) / max(kube_node_status_capacity_pods{job=\"kube-state-metrics\"} != 1) by(node) > 0.95\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -1029,7 +1007,7 @@ data:
"message": "Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}.",
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh"
},
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 5\n",
"expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job=\"kubelet\"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60\n",
"for": "15m",
"labels": {
"severity": "warning"
@ -1085,9 +1063,167 @@ data:
}
]
}
loki.yaml: |-
{
"groups": [
{
"name": "loki_rules",
"rules": [
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))",
"record": "job:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))",
"record": "job:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) by (job)",
"record": "job:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)",
"record": "job:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job)",
"record": "job:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job)",
"record": "job:loki_request_duration_seconds_count:sum_rate"
},
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))",
"record": "job_route:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))",
"record": "job_route:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)",
"record": "job_route:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)",
"record": "job_route:loki_request_duration_seconds_count:sum_rate"
},
{
"expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))",
"record": "namespace_job_route:loki_request_duration_seconds:99quantile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))",
"record": "namespace_job_route:loki_request_duration_seconds:50quantile"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds:avg"
},
{
"expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_bucket:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_sum:sum_rate"
},
{
"expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)",
"record": "namespace_job_route:loki_request_duration_seconds_count:sum_rate"
}
]
},
{
"name": "loki_alerts",
"rules": [
{
"alert": "LokiRequestErrors",
"annotations": {
"message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n"
},
"expr": "100 * sum(rate(loki_request_duration_seconds_count{status_code=~\"5..\"}[1m])) by (namespace, job, route)\n /\nsum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)\n > 10\n",
"for": "15m",
"labels": {
"severity": "critical"
}
},
{
"alert": "LokiRequestLatency",
"annotations": {
"message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n"
},
"expr": "namespace_job_route:loki_request_duration_seconds:99quantile{route!~\"(?i).*tail.*\"} > 1\n",
"for": "15m",
"labels": {
"severity": "critical"
}
}
]
}
]
}
node-exporter.yaml: |-
{
"groups": [
{
"name": "node-exporter.rules",
"rules": [
{
"expr": "count without (cpu) (\n count without (mode) (\n node_cpu_seconds_total{job=\"node-exporter\"}\n )\n)\n",
"record": "instance:node_num_cpu:sum"
},
{
"expr": "1 - avg without (cpu, mode) (\n rate(node_cpu_seconds_total{job=\"node-exporter\", mode=\"idle\"}[1m])\n)\n",
"record": "instance:node_cpu_utilisation:rate1m"
},
{
"expr": "(\n node_load1{job=\"node-exporter\"}\n/\n instance:node_num_cpu:sum{job=\"node-exporter\"}\n)\n",
"record": "instance:node_load1_per_cpu:ratio"
},
{
"expr": "1 - (\n node_memory_MemAvailable_bytes{job=\"node-exporter\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\"}\n)\n",
"record": "instance:node_memory_utilisation:ratio"
},
{
"expr": "rate(node_vmstat_pgmajfault{job=\"node-exporter\"}[1m])\n",
"record": "instance:node_vmstat_pgmajfault:rate1m"
},
{
"expr": "rate(node_disk_io_time_seconds_total{job=\"node-exporter\", device!~\"dm.*\"}[1m])\n",
"record": "instance_device:node_disk_io_time_seconds:rate1m"
},
{
"expr": "rate(node_disk_io_time_weighted_seconds_total{job=\"node-exporter\", device!~\"dm.*\"}[1m])\n",
"record": "instance_device:node_disk_io_time_weighted_seconds:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_receive_bytes_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_receive_bytes_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_transmit_bytes_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_transmit_bytes_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_receive_drop_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_receive_drop_excluding_lo:rate1m"
},
{
"expr": "sum without (device) (\n rate(node_network_transmit_drop_total{job=\"node-exporter\", device!=\"lo\"}[1m])\n)\n",
"record": "instance:node_network_transmit_drop_excluding_lo:rate1m"
}
]
},
{
"name": "node-exporter",
"rules": [
@ -1210,6 +1346,41 @@ data:
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeHighNumberConntrackEntriesUsed",
"annotations": {
"description": "{{ $value | humanizePercentage }} of conntrack entries are used",
"summary": "Number of conntrack are getting close to the limit"
},
"expr": "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n",
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeClockSkewDetected",
"annotations": {
"message": "Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.",
"summary": "Clock skew detected."
},
"expr": "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n",
"for": "10m",
"labels": {
"severity": "warning"
}
},
{
"alert": "NodeClockNotSynchronising",
"annotations": {
"message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.",
"summary": "Clock not synchronising."
},
"expr": "min_over_time(node_timex_sync_status[5m]) == 0\n",
"for": "10m",
"labels": {
"severity": "warning"
}
}
]
}