addons: Sync prometheus alerts to upstream

* https://github.com/coreos/prometheus-operator/pull/774
Dalton Hubble 2017-12-01 23:24:08 -08:00
parent 8d3817e0ae
commit 65f006e6cc
1 changed file with 262 additions and 215 deletions


@@ -7,7 +7,7 @@ data:
 # Rules adapted from those provided by coreos/prometheus-operator and SoundCloud
 alertmanager.rules.yaml: |+
 groups:
-- name: ./alertmanager.rules
+- name: alertmanager.rules
 rules:
 - alert: AlertmanagerConfigInconsistent
 expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
@@ -19,7 +19,6 @@ data:
 annotations:
 description: The configuration of the instances of the Alertmanager cluster
 `{{$labels.service}}` are out of sync.
-summary: Alertmanager configurations are inconsistent
 - alert: AlertmanagerDownOrMissing
 expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
 "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -29,8 +28,7 @@ data:
 annotations:
 description: An unexpected number of Alertmanagers are scraped or Alertmanagers
 disappeared from discovery.
-summary: Alertmanager down or not discovered
-- alert: FailedReload
+- alert: AlertmanagerFailedReload
 expr: alertmanager_config_last_reload_successful == 0
 for: 10m
 labels:
@@ -38,7 +36,6 @@ data:
 annotations:
 description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
 }}/{{ $labels.pod}}.
-summary: Alertmanager configuration reload has failed
 etcd3.rules.yaml: |+
 groups:
 - name: ./etcd3.rules
@@ -165,7 +162,7 @@ data:
 summary: high commit durations
 general.rules.yaml: |+
 groups:
-- name: ./general.rules
+- name: general.rules
 rules:
 - alert: TargetDown
 expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
@@ -173,63 +170,31 @@ data:
 labels:
 severity: warning
 annotations:
-description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
+description: '{{ $value }}% of {{ $labels.job }} targets are down.'
 summary: Targets are down
-- alert: TooManyOpenFileDescriptors
-expr: 100 * (process_open_fds / process_max_fds) > 95
-for: 10m
-labels:
-severity: critical
-annotations:
-description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-$labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
-summary: too many open file descriptors
-- record: instance:fd_utilization
+- record: fd_utilization
 expr: process_open_fds / process_max_fds
 - alert: FdExhaustionClose
-expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
 for: 10m
 labels:
 severity: warning
 annotations:
-description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-$labels.instance }}) instance will exhaust in file/socket descriptors soon'
+description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+will exhaust in file/socket descriptors within the next 4 hours'
 summary: file descriptors soon exhausted
 - alert: FdExhaustionClose
-expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+expr: predict_linear(fd_utilization[10m], 3600) > 1
 for: 10m
 labels:
 severity: critical
 annotations:
-description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-$labels.instance }}) instance will exhaust in file/socket descriptors soon'
+description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+will exhaust in file/socket descriptors within the next hour'
 summary: file descriptors soon exhausted
-kube-apiserver.rules.yaml: |+
-groups:
-- name: ./kube-apiserver.rules
-rules:
-- alert: K8SApiserverDown
-expr: absent(up{job="kubernetes-apiservers"} == 1)
-for: 5m
-labels:
-severity: critical
-annotations:
-description: Prometheus failed to scrape API server(s), or all API servers have
-disappeared from service discovery.
-summary: API server unreachable
-- alert: K8SApiServerLatency
-expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m]))
-by (le)) / 1e+06 > 1
-for: 10m
-labels:
-severity: warning
-annotations:
-description: 99th percentile Latency for {{ $labels.verb }} requests to the
-kube-apiserver is higher than 1s.
-summary: Kubernetes apiserver latency is high
 kube-controller-manager.rules.yaml: |+
 groups:
-- name: ./kube-controller-manager.rules
+- name: kube-controller-manager.rules
 rules:
 - alert: K8SControllerManagerDown
 expr: absent(up{kubernetes_name="kube-controller-manager"} == 1)
@@ -242,8 +207,53 @@ data:
 summary: Controller manager is down
 kube-scheduler.rules.yaml: |+
 groups:
-- name: ./kube-scheduler.rules
+- name: kube-scheduler.rules
 rules:
+- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.99"
+- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.9"
+- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.5"
+- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.99"
+- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.9"
+- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.5"
+- record: cluster:scheduler_binding_latency_seconds:quantile
+expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.99"
+- record: cluster:scheduler_binding_latency_seconds:quantile
+expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.9"
+- record: cluster:scheduler_binding_latency_seconds:quantile
+expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+BY (le, cluster)) / 1e+06
+labels:
+quantile: "0.5"
 - alert: K8SSchedulerDown
 expr: absent(up{kubernetes_name="kube-scheduler"} == 1)
 for: 5m
@@ -253,9 +263,65 @@ data:
 description: There is no running K8S scheduler. New pods are not being assigned
 to nodes.
 summary: Scheduler is down
+kube-state-metrics.rules.yaml: |+
+groups:
+- name: kube-state-metrics.rules
+rules:
+- alert: DeploymentGenerationMismatch
+expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+for: 15m
+labels:
+severity: warning
+annotations:
+description: Observed deployment generation does not match expected one for
+deployment {{$labels.namespaces}}{{$labels.deployment}}
+- alert: DeploymentReplicasNotUpdated
+expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+unless (kube_deployment_spec_paused == 1)
+for: 15m
+labels:
+severity: warning
+annotations:
+description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+- alert: DaemonSetRolloutStuck
+expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+* 100 < 100
+for: 15m
+labels:
+severity: warning
+annotations:
+description: Only {{$value}}% of desired pods scheduled and ready for daemon
+set {{$labels.namespaces}}/{{$labels.daemonset}}
+- alert: K8SDaemonSetsNotScheduled
+expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+> 0
+for: 10m
+labels:
+severity: warning
+annotations:
+description: A number of daemonsets are not scheduled.
+summary: Daemonsets are not scheduled correctly
+- alert: DaemonSetsMissScheduled
+expr: kube_daemonset_status_number_misscheduled > 0
+for: 10m
+labels:
+severity: warning
+annotations:
+description: A number of daemonsets are running where they are not supposed
+to run.
+summary: Daemonsets are not scheduled correctly
+- alert: PodFrequentlyRestarting
+expr: increase(kube_pod_container_status_restarts[1h]) > 5
+for: 10m
+labels:
+severity: warning
+annotations:
+description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
+times within the last hour
 kubelet.rules.yaml: |+
 groups:
-- name: ./kubelet.rules
+- name: kubelet.rules
 rules:
 - alert: K8SNodeNotReady
 expr: kube_node_status_condition{condition="Ready",status="true"} == 0
@@ -274,20 +340,17 @@ data:
 labels:
 severity: critical
 annotations:
-description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
-state).'
-summary: Many Kubernetes nodes are Not Ready
+description: '{{ $value }}% of Kubernetes nodes are not ready'
 - alert: K8SKubeletDown
-expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) > 0.03
+expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) * 100 > 3
 for: 1h
 labels:
 severity: warning
 annotations:
 description: Prometheus failed to scrape {{ $value }}% of kubelets.
-summary: Many Kubelets cannot be scraped
 - alert: K8SKubeletDown
-expr: absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"})
-> 0.1
+expr: (absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}))
+* 100 > 1
 for: 1h
 labels:
 severity: critical
@@ -297,159 +360,121 @@ data:
 summary: Many Kubelets cannot be scraped
 - alert: K8SKubeletTooManyPods
 expr: kubelet_running_pod_count > 100
-for: 10m
 labels:
 severity: warning
 annotations:
 description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
 to the limit of 110
 summary: Kubelet is close to pod limit
-- alert: K8SDaemonSetsNotScheduled
-expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
-> 0
-for: 10m
-labels:
-severity: warning
-annotations:
-description: A number of daemonsets are not scheduled.
-summary: Daemonsets are not scheduled correctly
-- alert: K8SDaemonSetsNotRunning
-expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready
-> 0
-for: 10m
-labels:
-severity: warning
-annotations:
-description: A number of daemonsets are not ready.
-summary: Daemonsets are not ready
-- alert: K8SDaemonSetsMissScheduled
-expr: kube_daemonset_status_number_misscheduled > 0
-for: 10m
-labels:
-severity: warning
-annotations:
-description: A number of daemonsets are running where they are not supposed
-to run.
-summary: Daemonsets are not scheduled correctly
 kubernetes.rules.yaml: |+
 groups:
-- name: ./kubernetes.rules
+- name: kubernetes.rules
 rules:
-- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
-expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
-"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-controller, pod_name, container_name)
-- record: cluster_namespace_controller_pod_container:spec_cpu_shares
-expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
-"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-container_name)
-- record: cluster_namespace_controller_pod_container:cpu_usage:rate
-expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
-"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-controller, pod_name, container_name)
-- record: cluster_namespace_controller_pod_container:memory_usage:bytes
-expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
-"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-container_name)
-- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
-expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
-"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-controller, pod_name, container_name)
-- record: cluster_namespace_controller_pod_container:memory_rss:bytes
-expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
-"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-container_name)
-- record: cluster_namespace_controller_pod_container:memory_cache:bytes
-expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
-"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-container_name)
-- record: cluster_namespace_controller_pod_container:disk_usage:bytes
-expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
-"$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-container_name)
-- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
-expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
-"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-controller, pod_name, container_name, scope, type)
-- record: cluster_namespace_controller_pod_container:memory_oom:rate
-expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
-"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-controller, pod_name, container_name, scope, type)
-- record: cluster:memory_allocation:percent
-expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
-/ sum(machine_memory_bytes) BY (cluster)
-- record: cluster:memory_used:percent
-expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
-BY (cluster)
-- record: cluster:cpu_allocation:percent
-expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
-* ON(cluster, instance) machine_cpu_cores) BY (cluster)
-- record: cluster:node_cpu_use:percent
-expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
-BY (cluster)
-- record: cluster_resource_verb:apiserver_latency:quantile_seconds
-expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
-cluster, job, resource, verb)) / 1e+06
+- record: pod_name:container_memory_usage_bytes:sum
+expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+(pod_name)
+- record: pod_name:container_spec_cpu_shares:sum
+expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+- record: pod_name:container_cpu_usage:sum
+expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+BY (pod_name)
+- record: pod_name:container_fs_usage_bytes:sum
+expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+- record: namespace:container_memory_usage_bytes:sum
+expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+- record: namespace:container_spec_cpu_shares:sum
+expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+- record: namespace:container_cpu_usage:sum
+expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+BY (namespace)
+- record: cluster:memory_usage:ratio
+expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+(cluster) / sum(machine_memory_bytes) BY (cluster)
+- record: cluster:container_spec_cpu_shares:ratio
+expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+/ sum(machine_cpu_cores)
+- record: cluster:container_cpu_usage:ratio
+expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+/ sum(machine_cpu_cores)
+- record: apiserver_latency_seconds:quantile
+expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+1e+06
 labels:
 quantile: "0.99"
-- record: cluster_resource_verb:apiserver_latency:quantile_seconds
-expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
-cluster, job, resource, verb)) / 1e+06
+- record: apiserver_latency:quantile_seconds
+expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+1e+06
 labels:
 quantile: "0.9"
-- record: cluster_resource_verb:apiserver_latency:quantile_seconds
-expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
-cluster, job, resource, verb)) / 1e+06
+- record: apiserver_latency_seconds:quantile
+expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+1e+06
 labels:
 quantile: "0.5"
-- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
+- alert: APIServerLatencyHigh
+expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+> 1
+for: 10m
 labels:
-quantile: "0.99"
-- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
+severity: warning
+annotations:
+description: the API server has a 99th percentile latency of {{ $value }} seconds
+for {{$labels.verb}} {{$labels.resource}}
+- alert: APIServerLatencyHigh
+expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+> 4
+for: 10m
 labels:
-quantile: "0.9"
-- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
+severity: critical
+annotations:
+description: the API server has a 99th percentile latency of {{ $value }} seconds
+for {{$labels.verb}} {{$labels.resource}}
+- alert: APIServerErrorsHigh
+expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+* 100 > 2
+for: 10m
 labels:
-quantile: "0.5"
-- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
+severity: warning
+annotations:
+description: API server returns errors for {{ $value }}% of requests
+- alert: APIServerErrorsHigh
+expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+* 100 > 5
+for: 10m
 labels:
-quantile: "0.99"
-- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
+severity: critical
+annotations:
+description: API server returns errors for {{ $value }}% of requests
+- alert: K8SApiserverDown
+expr: absent(up{job="kubernetes-apiservers"} == 1)
+for: 20m
 labels:
-quantile: "0.9"
-- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
-labels:
-quantile: "0.5"
-- record: cluster:scheduler_binding_latency:quantile_seconds
-expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
-labels:
-quantile: "0.99"
-- record: cluster:scheduler_binding_latency:quantile_seconds
-expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
-labels:
-quantile: "0.9"
-- record: cluster:scheduler_binding_latency:quantile_seconds
-expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-BY (le, cluster)) / 1e+06
-labels:
-quantile: "0.5"
+severity: critical
+annotations:
+description: No API servers are reachable or all have disappeared from service
+discovery
 node.rules.yaml: |+
 groups:
-- name: ./node.rules
+- name: node.rules
 rules:
+- record: instance:node_cpu:rate:sum
+expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+BY (instance)
+- record: instance:node_filesystem_usage:sum
+expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+BY (instance)
+- record: instance:node_network_receive_bytes:rate:sum
+expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+- record: instance:node_network_transmit_bytes:rate:sum
+expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+- record: instance:node_cpu:ratio
+expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+- record: cluster:node_cpu:sum_rate5m
+expr: sum(rate(node_cpu{mode!="idle"}[5m]))
+- record: cluster:node_cpu:ratio
+expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
 - alert: NodeExporterDown
 expr: absent(up{kubernetes_name="node-exporter"} == 1)
 for: 10m
@@ -457,43 +482,65 @@ data:
 severity: warning
 annotations:
 description: Prometheus could not scrape a node-exporter for more than 10m,
-or node-exporters have disappeared from discovery.
-summary: node-exporter cannot be scraped
-- alert: K8SNodeOutOfDisk
-expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+or node-exporters have disappeared from discovery
+- alert: NodeDiskRunningFull
+expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
+for: 30m
+labels:
+severity: warning
+annotations:
+description: device {{$labels.device}} on node {{$labels.instance}} is running
+full within the next 24 hours (mounted at {{$labels.mountpoint}})
+- alert: NodeDiskRunningFull
+expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
+for: 10m
 labels:
-service: k8s
 severity: critical
 annotations:
-description: '{{ $labels.node }} has run out of disk space.'
-summary: Node ran out of disk space.
-- alert: K8SNodeMemoryPressure
-expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
-1
-labels:
-service: k8s
-severity: warning
-annotations:
-description: '{{ $labels.node }} is under memory pressure.'
-summary: Node is under memory pressure.
-- alert: K8SNodeDiskPressure
-expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
-labels:
-service: k8s
-severity: warning
-annotations:
-description: '{{ $labels.node }} is under disk pressure.'
-summary: Node is under disk pressure.
+description: device {{$labels.device}} on node {{$labels.instance}} is running
+full within the next 2 hours (mounted at {{$labels.mountpoint}})
 prometheus.rules.yaml: |+
 groups:
-- name: ./prometheus.rules
+- name: prometheus.rules
 rules:
-- alert: FailedReload
+- alert: PrometheusConfigReloadFailed
 expr: prometheus_config_last_reload_successful == 0
 for: 10m
 labels:
 severity: warning
 annotations:
-description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
-}}/{{ $labels.pod}}.
-summary: Prometheus configuration reload has failed
+description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+- alert: PrometheusNotificationQueueRunningFull
+expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
+for: 10m
+labels:
+severity: warning
+annotations:
+description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+$labels.pod}}
+- alert: PrometheusErrorSendingAlerts
+expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+> 0.01
+for: 10m
+labels:
+severity: warning
+annotations:
+description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+- alert: PrometheusErrorSendingAlerts
+expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+> 0.03
+for: 10m
+labels:
+severity: critical
+annotations:
+description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+- alert: PrometheusNotConnectedToAlertmanagers
+expr: prometheus_notifications_alertmanagers_discovered < 1
+for: 10m
+labels:
+severity: warning
+annotations:
+description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+to any Alertmanagers
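
The synced rule files above use the Prometheus 2.x `groups:` format. As a minimal sketch of how a synced group could be sanity-checked before the ConfigMap is applied (the `monitoring` namespace, the `prometheus-rules` ConfigMap name, and the local filename are assumptions for illustration, not taken from this commit):

# Pull one rules file out of the ConfigMap data and write it locally.
kubectl -n monitoring get configmap prometheus-rules \
  -o go-template='{{index .data "general.rules.yaml"}}' > general.rules.yaml

# promtool ships with Prometheus 2.x; "check rules" validates the group
# structure and parses every expr as PromQL.
promtool check rules general.rules.yaml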