diff --git a/CHANGES.md b/CHANGES.md
index 594dbeb6..1f2f4605 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -11,6 +11,10 @@ Notable changes between versions.
 * Fedora CoreOS 33 has stronger crypto defaults ([**notice**](https://docs.fedoraproject.org/en-US/fedora-coreos/faq/#_why_does_ssh_stop_working_after_upgrading_to_fedora_33), [#915](https://github.com/poseidon/typhoon/issues/915))
   * Use a non-RSA SSH key or add the workaround provided in upstream [Fedora docs](https://docs.fedoraproject.org/en-US/fedora-coreos/faq/#_why_does_ssh_stop_working_after_upgrading_to_fedora_33) as a [snippet](https://typhoon.psdn.io/advanced/customization/#fedora-coreos) (**action required**)
 
+### Addons
+
+* Update Grafana from v7.3.5 to [v7.3.6](https://github.com/grafana/grafana/releases/tag/v7.3.6)
+
 ## v1.20.0
 
 * Kubernetes [v1.20.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG/CHANGELOG-1.20.md#v1200)
diff --git a/addons/grafana/dashboards-coredns.yaml b/addons/grafana/dashboards-coredns.yaml
index 60994376..01f0c582 100644
--- a/addons/grafana/dashboards-coredns.yaml
+++ b/addons/grafana/dashboards-coredns.yaml
@@ -37,6 +37,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -129,6 +130,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -221,6 +223,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -326,6 +329,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -432,6 +436,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -537,6 +542,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -643,6 +649,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -762,6 +769,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -854,6 +862,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
diff --git a/addons/grafana/dashboards-k8s-nodes.yaml b/addons/grafana/dashboards-k8s-nodes.yaml
index 000293ba..1f34a098 100644
--- a/addons/grafana/dashboards-k8s-nodes.yaml
+++ b/addons/grafana/dashboards-k8s-nodes.yaml
@@ -172,7 +172,7 @@ data:
             "tableColumn": "",
             "targets": [
                 {
-                    "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
+                    "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{instance}}",
@@ -256,7 +256,7 @@ data:
             "tableColumn": "",
             "targets": [
                 {
-                    "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
+                    "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", instance=~\"$instance\"})",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{instance}}",
@@ -553,6 +553,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -645,6 +646,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -750,6 +752,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -855,6 +858,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -954,6 +958,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1066,6 +1071,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1160,6 +1166,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1267,6 +1274,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1374,6 +1382,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1466,6 +1475,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1572,6 +1582,7 @@ data:
             "datasource": "$datasource",
             "description": "Pod lifecycle event generator",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1664,6 +1675,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1769,6 +1781,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1874,6 +1887,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2000,6 +2014,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2105,6 +2120,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2197,6 +2213,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2289,6 +2306,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2613,6 +2631,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2705,6 +2724,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2810,6 +2830,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -2902,6 +2923,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3007,6 +3029,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3120,6 +3143,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3225,6 +3249,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3330,6 +3355,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3422,6 +3448,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -3514,6 +3541,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
diff --git a/addons/grafana/dashboards-k8s-resources-1.yaml b/addons/grafana/dashboards-k8s-resources-1.yaml
index d88629a9..860de6bb 100644
--- a/addons/grafana/dashboards-k8s-resources-1.yaml
+++ b/addons/grafana/dashboards-k8s-resources-1.yaml
@@ -60,7 +60,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "1 - 
avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__interval]))", + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))", "format": "time_series", "instant": true, "intervalFactor": 2, @@ -1586,7 +1586,7 @@ data: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1595,7 +1595,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1604,7 +1604,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1613,7 +1613,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1622,7 +1622,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1631,7 +1631,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "table", "instant": true, "intervalFactor": 2, @@ -1731,7 +1731,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -1829,7 +1829,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -1927,7 +1927,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by 
(namespace)", + "expr": "avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2025,7 +2025,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2123,7 +2123,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2221,7 +2221,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2319,7 +2319,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -2417,7 +2417,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__interval])) by (namespace)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\".+\"}[$__rate_interval])) by (namespace)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{namespace}}", @@ -4019,7 +4019,7 @@ data: ], "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4028,7 +4028,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4037,7 +4037,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4046,7 +4046,7 @@ data: "step": 10 }, { - "expr": 
"sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4055,7 +4055,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4064,7 +4064,7 @@ data: "step": 10 }, { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "table", "instant": true, "intervalFactor": 2, @@ -4164,7 +4164,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4262,7 +4262,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4360,7 +4360,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4458,7 +4458,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4556,7 +4556,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -4654,7 +4654,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) by 
(pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", diff --git a/addons/grafana/dashboards-k8s-resources-2.yaml b/addons/grafana/dashboards-k8s-resources-2.yaml index aa45bd88..78a34d73 100644 --- a/addons/grafana/dashboards-k8s-resources-2.yaml +++ b/addons/grafana/dashboards-k8s-resources-2.yaml @@ -1058,7 +1058,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1157,7 +1157,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1256,7 +1256,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1355,7 +1355,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1454,7 +1454,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -1553,7 +1553,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__interval])) by (pod)", + "expr": "sum(irate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"$pod\"}[$__rate_interval])) by (pod)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -2707,7 +2707,7 @@ data: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, 
@@ -2716,7 +2716,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2725,7 +2725,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2734,7 +2734,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2743,7 +2743,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2752,7 +2752,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", 
workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -2852,7 +2852,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -2950,7 +2950,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3048,7 +3048,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3146,7 +3146,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3244,7 +3244,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3342,7 +3342,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3440,7 +3440,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{pod}}", @@ -3538,7 +3538,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\"$workload\", workload_type=\"$type\"}) by (pod))\n", "format": "time_series", "intervalFactor": 
2, "legendFormat": "{{pod}}", @@ -4902,7 +4902,7 @@ data: ], "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4911,7 +4911,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4920,7 +4920,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4929,7 +4929,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4938,7 +4938,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on 
(namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -4947,7 +4947,7 @@ data: "step": 10 }, { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload_type=\"$type\"}) by (workload))\n", "format": "table", "instant": true, "intervalFactor": 2, @@ -5047,7 +5047,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5145,7 +5145,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5243,7 +5243,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5341,7 +5341,7 @@ data: "steppedLine": false, 
"targets": [ { - "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(avg(irate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5439,7 +5439,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5537,7 +5537,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5635,7 +5635,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_receive_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", @@ -5733,7 +5733,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) 
namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", + "expr": "(sum(irate(container_network_transmit_packets_dropped_total{cluster=\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])\n* on (namespace,pod)\ngroup_left(workload,workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\", namespace=~\"$namespace\", workload=~\".+\", workload_type=\"$type\"}) by (workload))\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{workload}}", diff --git a/addons/grafana/dashboards-k8s.yaml b/addons/grafana/dashboards-k8s.yaml index 481f039d..e5714fde 100644 --- a/addons/grafana/dashboards-k8s.yaml +++ b/addons/grafana/dashboards-k8s.yaml @@ -140,8 +140,9 @@ data: "dashes": false, "datasource": "$datasource", "decimals": 3, - "description": "How much error budget is left looking at our 0.990% availability gurantees?", + "description": "How much error budget is left looking at our 0.990% availability guarantees?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -336,6 +337,7 @@ data: "datasource": "$datasource", "description": "How many read requests (LIST,GET) per second do the apiservers get by code?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -444,6 +446,7 @@ data: "datasource": "$datasource", "description": "How many percent of read requests (LIST,GET) per second are returned with errors (5xx)?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -537,6 +540,7 @@ data: "datasource": "$datasource", "description": "How many seconds is the 99th percentile for reading (LIST|GET) a given resource?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -729,6 +733,7 @@ data: "datasource": "$datasource", "description": "How many write requests (POST|PUT|PATCH|DELETE) per second do the apiservers get by code?", "fill": 10, + "fillGradient": 0, "gridPos": { }, @@ -837,6 +842,7 @@ data: "datasource": "$datasource", "description": "How many percent of write requests (POST|PUT|PATCH|DELETE) per second are returned with errors (5xx)?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -930,6 +936,7 @@ data: "datasource": "$datasource", "description": "How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1035,6 +1042,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1127,6 +1135,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1219,6 +1228,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1324,6 +1334,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1416,6 +1427,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1508,6 +1520,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1832,6 +1845,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1937,6 +1951,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2042,6 +2057,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2147,6 +2163,7 @@ data: "dashes": false, "datasource": "$datasource", 
"fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2260,6 +2277,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2365,6 +2383,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2470,6 +2489,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2562,6 +2582,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2654,6 +2675,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -2868,6 +2890,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3019,7 +3042,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\n(\n kubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n -\n kubelet_volume_stats_available_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n)\n/\nkubelet_volume_stats_capacity_bytes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -3064,6 +3087,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3215,7 +3239,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "kubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100\n", + "expr": "max without(instance,node) (\nkubelet_volume_stats_inodes_used{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n/\nkubelet_volume_stats_inodes{cluster=\"$cluster\", job=\"kubelet\", namespace=\"$namespace\", persistentvolumeclaim=\"$volume\"}\n* 100)\n", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -3505,6 +3529,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3618,6 +3643,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3744,6 +3770,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3857,6 +3884,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -3962,6 +3990,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4067,6 +4096,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4159,6 +4189,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4251,6 
+4282,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -4516,7 +4548,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", container!=\"\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -4599,7 +4631,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", + "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", container!=\"\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}) / 1024^3", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -4682,7 +4714,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{job=\"kubernetes-cadvisor\", cluster=\"$cluster\", namespace=\"$namespace\",pod=~\"$statefulset.*\"}[3m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -5077,6 +5109,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, diff --git a/addons/grafana/dashboards-nginx-ingress.yaml b/addons/grafana/dashboards-nginx-ingress.yaml index 6228e760..71f9c722 100644 --- a/addons/grafana/dashboards-nginx-ingress.yaml +++ b/addons/grafana/dashboards-nginx-ingress.yaml @@ -172,7 +172,7 @@ data: "tableColumn": "", "targets": [ { - "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{cluster=~\"$cluster\", controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\"}[2m]))", + "expr": "sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{cluster=~\"$cluster\", controller_pod=~\"$controller\",controller_class=~\"$controller_class\",controller_namespace=~\"$namespace\",state=\"active\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -296,6 +296,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -388,6 +389,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -493,6 +495,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -612,6 +615,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -711,6 +715,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -803,6 +808,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, diff --git a/addons/grafana/dashboards-node-exporter.yaml 
b/addons/grafana/dashboards-node-exporter.yaml
index 682ed3ad..d5aac1bf 100644
--- a/addons/grafana/dashboards-node-exporter.yaml
+++ b/addons/grafana/dashboards-node-exporter.yaml
@@ -36,6 +36,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -129,6 +130,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 0,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -255,6 +257,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -420,7 +423,7 @@ data:
             "tableColumn": "",
             "targets": [
                 {
-                    "expr": "100 -\n(\n node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"}\n/\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n* 100\n)\n",
+                    "expr": "100 -\n(\n avg(node_memory_MemAvailable_bytes{job=\"node-exporter\", instance=\"$instance\"})\n/\n avg(node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"})\n* 100\n)\n",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "",
@@ -462,6 +465,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 0,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -578,6 +582,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -697,6 +702,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 0,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -790,6 +796,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 0,
+            "fillGradient": 0,
             "gridPos": {
 
             },
diff --git a/addons/grafana/dashboards-prom.yaml b/addons/grafana/dashboards-prom.yaml
index b9856418..94638cb4 100644
--- a/addons/grafana/dashboards-prom.yaml
+++ b/addons/grafana/dashboards-prom.yaml
@@ -21,7 +21,7 @@ data:
         "links": [
 
         ],
-        "refresh": "",
+        "refresh": "60s",
         "rows": [
             {
                 "collapse": false,
@@ -36,6 +36,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -72,7 +73,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}\n)\n",
+                    "expr": "(\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} \n- \n ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"} != 0)\n)\n",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@@ -128,6 +129,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -164,7 +166,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n)\n",
+                    "expr": "clamp_min(\n rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) \n- \n ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n, 0)\n",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@@ -233,6 +235,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -269,7 +272,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n",
+                    "expr": "rate(\n prometheus_remote_storage_samples_in_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])\n- \n ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n- \n (rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]))\n",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@@ -338,6 +341,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -431,6 +435,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -523,6 +528,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -615,6 +621,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -720,6 +727,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -812,6 +820,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -848,7 +857,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"}",
+                    "expr": "prometheus_remote_storage_pending_samples{cluster=~\"$cluster\", instance=~\"$instance\"} or prometheus_remote_storage_samples_pending{cluster=~\"$cluster\", instance=~\"$instance\"}",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@@ -917,6 +926,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1009,6 +1019,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1114,6 +1125,7 @@ data:
             "dashes": false,
             "datasource": "$datasource",
             "fill": 1,
+            "fillGradient": 0,
             "gridPos": {
 
             },
@@ -1150,7 +1162,7 @@ data:
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
+                    "expr": "rate(prometheus_remote_storage_dropped_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}",
@@ -1206,6 +1218,7 @@ data:
             "dashes": false,
"datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1242,7 +1255,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_failed_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -1298,6 +1311,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1334,7 +1348,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", + "expr": "rate(prometheus_remote_storage_retried_samples_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~\"$cluster\", instance=~\"$instance\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cluster}}:{{instance}} {{remote_name}}:{{url}}", @@ -1390,6 +1404,7 @@ data: "dashes": false, "datasource": "$datasource", "fill": 1, + "fillGradient": 0, "gridPos": { }, @@ -1486,7 +1501,7 @@ data: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -1630,7 +1645,7 @@ data: ] }, "timezone": "browser", - "title": "Prometheus Remote Write", + "title": "Prometheus / Remote Write", "version": 0 } prometheus.json: |- @@ -1647,7 +1662,7 @@ data: "links": [ ], - "refresh": "10s", + "refresh": "60s", "rows": [ { "collapse": false, @@ -2726,7 +2741,7 @@ data: "schemaVersion": 14, "style": "dark", "tags": [ - + "prometheus-mixin" ], "templating": { "list": [ @@ -2834,7 +2849,7 @@ data: ] }, "timezone": "utc", - "title": "Prometheus Overview", + "title": "Prometheus / Overview", "uid": "", "version": 0 } diff --git a/addons/grafana/deployment.yaml b/addons/grafana/deployment.yaml index 67366363..1d43e24c 100644 --- a/addons/grafana/deployment.yaml +++ b/addons/grafana/deployment.yaml @@ -24,7 +24,7 @@ spec: type: RuntimeDefault containers: - name: grafana - image: docker.io/grafana/grafana:7.3.5 + image: docker.io/grafana/grafana:7.3.6 env: - name: GF_PATHS_CONFIG value: "/etc/grafana/custom.ini" diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml index c7179aae..51504668 100644 --- a/addons/prometheus/rules.yaml +++ b/addons/prometheus/rules.yaml @@ -9,7 +9,8 @@ data: { "alert": "etcdMembersDown", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }})." + "description": "etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }}).", + "summary": "etcd cluster members are down." }, "expr": "max without (endpoint) (\n sum without (instance) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n", "for": "10m", @@ -20,7 +21,8 @@ data: { "alert": "etcdInsufficientMembers", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }})." + "description": "etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).", + "summary": "etcd cluster has insufficient number of members." 
}, "expr": "sum(up{job=~\".*etcd.*\"} == bool 1) without (instance) < ((count(up{job=~\".*etcd.*\"}) without (instance) + 1) / 2)\n", "for": "3m", @@ -31,7 +33,8 @@ data: { "alert": "etcdNoLeader", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader." + "description": "etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.", + "summary": "etcd cluster has no leader." }, "expr": "etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n", "for": "1m", @@ -42,7 +45,8 @@ data: { "alert": "etcdHighNumberOfLeaderChanges", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated." + "description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.", + "summary": "etcd cluster has high number of leader changes." }, "expr": "increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n", "for": "5m", @@ -53,7 +57,8 @@ data: { "alert": "etcdGRPCRequestsSlow", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}." + "description": "etcd cluster \"{{ $labels.job }}\": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd grpc requests are slow" }, "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n", "for": "10m", @@ -64,7 +69,8 @@ data: { "alert": "etcdMemberCommunicationSlow", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}." + "description": "etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster member communication is slow." }, "expr": "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n", "for": "10m", @@ -75,7 +81,8 @@ data: { "alert": "etcdHighNumberOfFailedProposals", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}." + "description": "etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster has high number of proposal failures." }, "expr": "rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n", "for": "15m", @@ -86,7 +93,8 @@ data: { "alert": "etcdHighFsyncDurations", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." 
+ "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster 99th percentile fsync durations are too high." }, "expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n", "for": "10m", @@ -94,10 +102,22 @@ data: "severity": "warning" } }, + { + "alert": "etcdHighFsyncDurations", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}." + }, + "expr": "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 1\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, { "alert": "etcdHighCommitDurations", "annotations": { - "message": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}." + "description": "etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.", + "summary": "etcd cluster 99th percentile commit durations are too high." }, "expr": "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n", "for": "10m", @@ -108,7 +128,8 @@ data: { "alert": "etcdHighNumberOfFailedHTTPRequests", "annotations": { - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}" + "description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}", + "summary": "etcd has high number of failed HTTP requests." }, "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nwithout (code) > 0.01\n", "for": "10m", @@ -119,7 +140,8 @@ data: { "alert": "etcdHighNumberOfFailedHTTPRequests", "annotations": { - "message": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}." + "description": "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.", + "summary": "etcd has high number of failed HTTP requests." }, "expr": "sum(rate(etcd_http_failed_total{job=~\".*etcd.*\", code!=\"404\"}[5m])) without (code) / sum(rate(etcd_http_received_total{job=~\".*etcd.*\"}[5m]))\nwithout (code) > 0.05\n", "for": "10m", @@ -130,13 +152,36 @@ data: { "alert": "etcdHTTPRequestsSlow", "annotations": { - "message": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow." + "description": "etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.", + "summary": "etcd instance HTTP requests are slow." }, "expr": "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))\n> 0.15\n", "for": "10m", "labels": { "severity": "warning" } + }, + { + "alert": "etcdBackendQuotaLowSpace", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full." 
+ }, + "expr": "(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95\n", + "for": "10m", + "labels": { + "severity": "critical" + } + }, + { + "alert": "etcdExcessiveDatabaseGrowth", + "annotations": { + "message": "etcd cluster \"{{ $labels.job }}\": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive." + }, + "expr": "increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50\n", + "for": "10m", + "labels": { + "severity": "warning" + } } ] } @@ -276,10 +321,6 @@ data: }, "record": "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile" }, - { - "expr": "sum(rate(apiserver_request_duration_seconds_sum{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n/\nsum(rate(apiserver_request_duration_seconds_count{subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod)\n", - "record": "cluster:apiserver_request_duration_seconds:mean5m" - }, { "expr": "histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job=\"apiserver\",subresource!=\"log\",verb!~\"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT\"}[5m])) without(instance, pod))\n", "labels": { @@ -443,10 +484,6 @@ data: { "name": "k8s.rules", "rules": [ - { - "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])) by (namespace)\n", - "record": "namespace:container_cpu_usage_seconds_total:sum_rate" - }, { "expr": "sum by (cluster, namespace, pod, container) (\n rate(container_cpu_usage_seconds_total{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}[5m])\n) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (\n 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate" @@ -467,10 +504,6 @@ data: "expr": "container_memory_swap{job=\"kubernetes-cadvisor\", image!=\"\"}\n* on (namespace, pod) group_left(node) topk by(namespace, pod) (1,\n max by(namespace, pod, node) (kube_pod_info{node!=\"\"})\n)\n", "record": "node_namespace_pod_container:container_memory_swap" }, - { - "expr": "sum(container_memory_usage_bytes{job=\"kubernetes-cadvisor\", image!=\"\", container!=\"POD\"}) by (namespace)\n", - "record": "namespace:container_memory_usage_bytes:sum" - }, { "expr": "sum by (namespace) (\n sum by (namespace, pod) (\n max by (namespace, pod, container) (\n kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\"}\n ) * on(namespace, pod) group_left() max by (namespace, pod) (\n kube_pod_status_phase{phase=~\"Pending|Running\"} == 1\n )\n )\n)\n", "record": "namespace:kube_pod_container_resource_requests_memory_bytes:sum" @@ -573,10 +606,6 @@ data: { "name": "node.rules", "rules": [ - { - "expr": "sum(min(kube_pod_info{node!=\"\"}) by (cluster, node))\n", - "record": ":kube_pod_info_node_count:" - }, { "expr": "topk by(namespace, pod) (1,\n max by (node, namespace, pod) (\n label_replace(kube_pod_info{job=\"kube-state-metrics\",node!=\"\"}, \"pod\", \"$1\", \"pod\", \"(.*)\")\n))\n", "record": "node_namespace_pod:kube_pod_info:" @@ -779,7 +808,7 @@ data: { "alert": "KubeJobFailed", "annotations": { - "description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.", + 
"description": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed", "summary": "Job failed to complete." }, @@ -796,7 +825,7 @@ data: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch", "summary": "HPA has not matched descired number of replicas." }, - "expr": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_hpa_status_current_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_hpa_status_current_replicas[15m]) == 0\n", + "expr": "(kube_hpa_status_desired_replicas{job=\"kube-state-metrics\"}\n !=\nkube_hpa_status_current_replicas{job=\"kube-state-metrics\"})\n and\n(kube_hpa_status_current_replicas{job=\"kube-state-metrics\"}\n >\nkube_hpa_spec_min_replicas{job=\"kube-state-metrics\"})\n and\n(kube_hpa_status_current_replicas{job=\"kube-state-metrics\"}\n <\nkube_hpa_spec_max_replicas{job=\"kube-state-metrics\"})\n and\nchanges(kube_hpa_status_current_replicas[15m]) == 0\n", "for": "15m", "labels": { "severity": "warning" @@ -866,7 +895,7 @@ data: "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit", "summary": "Cluster has overcommitted memory resource requests." }, - "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable_memory_bytes{job=\"node-exporter\"})\n > 1.5\n", + "expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"memory\"})\n /\nsum(kube_node_status_allocatable_memory_bytes{job=\"kube-state-metrics\"})\n > 1.5\n", "for": "5m", "labels": { "severity": "warning" @@ -1096,11 +1125,11 @@ data: { "alert": "AggregatedAPIErrors", "annotations": { - "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often.", + "description": "An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m.", "runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors", "summary": "An aggregated API has reported errors." 
}, - "expr": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2\n", + "expr": "sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[10m])) > 4\n", "labels": { "severity": "warning" } @@ -1341,115 +1370,6 @@ data: } ] } - loki.yaml: |- - { - "groups": [ - { - "name": "loki_rules", - "rules": [ - { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))", - "record": "job:loki_request_duration_seconds:99quantile" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job))", - "record": "job:loki_request_duration_seconds:50quantile" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job) / sum(rate(loki_request_duration_seconds_count[1m])) by (job)", - "record": "job:loki_request_duration_seconds:avg" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job)", - "record": "job:loki_request_duration_seconds_bucket:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job)", - "record": "job:loki_request_duration_seconds_sum:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job)", - "record": "job:loki_request_duration_seconds_count:sum_rate" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))", - "record": "job_route:loki_request_duration_seconds:99quantile" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route))", - "record": "job_route:loki_request_duration_seconds:50quantile" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)", - "record": "job_route:loki_request_duration_seconds:avg" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route)", - "record": "job_route:loki_request_duration_seconds_bucket:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)", - "record": "job_route:loki_request_duration_seconds_sum:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)", - "record": "job_route:loki_request_duration_seconds_count:sum_rate" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))", - "record": "namespace_job_route:loki_request_duration_seconds:99quantile" - }, - { - "expr": "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route))", - "record": "namespace_job_route:loki_request_duration_seconds:50quantile" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)", - "record": "namespace_job_route:loki_request_duration_seconds:avg" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, namespace, job, route)", - "record": "namespace_job_route:loki_request_duration_seconds_bucket:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)", - "record": "namespace_job_route:loki_request_duration_seconds_sum:sum_rate" - }, - { - "expr": "sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)", - "record": 
"namespace_job_route:loki_request_duration_seconds_count:sum_rate" - } - ] - }, - { - "name": "loki_alerts", - "rules": [ - { - "alert": "LokiRequestErrors", - "annotations": { - "message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n" - }, - "expr": "100 * sum(rate(loki_request_duration_seconds_count{status_code=~\"5..\"}[1m])) by (namespace, job, route)\n /\nsum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)\n > 10\n", - "for": "15m", - "labels": { - "severity": "critical" - } - }, - { - "alert": "LokiRequestLatency", - "annotations": { - "message": "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n" - }, - "expr": "namespace_job_route:loki_request_duration_seconds:99quantile{route!~\"(?i).*tail.*\"} > 1\n", - "for": "15m", - "labels": { - "severity": "critical" - } - } - ] - } - ] - } node-exporter.yaml: |- { "groups": [ @@ -1607,7 +1527,7 @@ data: "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.", "summary": "Network interface is reporting many receive errors." }, - "expr": "increase(node_network_receive_errs_total[2m]) > 10\n", + "expr": "rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01\n", "for": "1h", "labels": { "severity": "warning" @@ -1619,7 +1539,7 @@ data: "description": "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.", "summary": "Network interface is reporting many transmit errors." }, - "expr": "increase(node_network_transmit_errs_total[2m]) > 10\n", + "expr": "rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01\n", "for": "1h", "labels": { "severity": "warning" @@ -1665,7 +1585,7 @@ data: "message": "Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.", "summary": "Clock not synchronising." }, - "expr": "min_over_time(node_timex_sync_status[5m]) == 0\n", + "expr": "min_over_time(node_timex_sync_status[5m]) == 0\nand\nnode_timex_maxerror_seconds >= 16\n", "for": "10m", "labels": { "severity": "warning" @@ -1740,18 +1660,6 @@ data: "severity": "warning" } }, - { - "alert": "PrometheusErrorSendingAlertsToAnyAlertmanager", - "annotations": { - "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.", - "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." - }, - "expr": "min without(alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus\"}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus\"}[5m])\n)\n* 100\n> 3\n", - "for": "15m", - "labels": { - "severity": "critical" - } - }, { "alert": "PrometheusNotConnectedToAlertmanagers", "annotations": { @@ -1794,7 +1702,7 @@ data: "description": "Prometheus {{$labels.instance}} is not ingesting samples.", "summary": "Prometheus is not ingesting samples." 
}, - "expr": "rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]) <= 0\n", + "expr": "(\n rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]) <= 0\nand\n (\n sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=\"prometheus\"}) > 0\n or\n sum without(rule_group) (prometheus_rule_group_rules{job=\"prometheus\"}) > 0\n )\n)\n", "for": "10m", "labels": { "severity": "warning" @@ -1842,7 +1750,7 @@ data: "description": "Prometheus {{$labels.instance}} remote write is {{ printf \"%.1f\" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.", "summary": "Prometheus remote write is behind." }, - "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus\"}[5m])\n- on(job, instance) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus\"}[5m])\n)\n> 120\n", + "expr": "# Without max_over_time, failed scrapes could create false negatives, see\n# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.\n(\n max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=\"prometheus\"}[5m])\n- ignoring(remote_name, url) group_right\n max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=\"prometheus\"}[5m])\n)\n> 120\n", "for": "15m", "labels": { "severity": "critical" @@ -1895,6 +1803,18 @@ data: "labels": { "severity": "warning" } + }, + { + "alert": "PrometheusErrorSendingAlertsToAnyAlertmanager", + "annotations": { + "description": "{{ printf \"%.1f\" $value }}% minimum errors while sending alerts from Prometheus {{$labels.instance}} to any Alertmanager.", + "summary": "Prometheus encounters more than 3% errors sending alerts to any Alertmanager." + }, + "expr": "min without (alertmanager) (\n rate(prometheus_notifications_errors_total{job=\"prometheus\",alertmanager!~``}[5m])\n/\n rate(prometheus_notifications_sent_total{job=\"prometheus\",alertmanager!~``}[5m])\n)\n* 100\n> 3\n", + "for": "15m", + "labels": { + "severity": "critical" + } } ] }