addons: Sync prometheus alerts to upstream
* https://github.com/coreos/prometheus-operator/pull/774
parent 8d3817e0ae
commit 65f006e6cc
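Several of the alerts introduced by this sync (FdExhaustionClose, NodeDiskRunningFull, PrometheusNotificationQueueRunningFull) follow one pattern: extrapolate a gauge with PromQL's predict_linear() and fire before the resource is actually exhausted. A minimal sketch of that pattern, using the node-exporter metric node_filesystem_free that the NodeDiskRunningFull rules below also query; the alert name, window, and lead time here are illustrative, not part of the commit:

    groups:
    - name: example.rules
      rules:
      - alert: RootDiskWillFillSoon
        # Fit a linear trend to the last hour of samples and project it
        # four hours (3600 * 4 seconds) ahead; fire if the projection
        # reaches zero free bytes.
        expr: predict_linear(node_filesystem_free{mountpoint="/"}[1h], 3600 * 4) <= 0
        # Require the prediction to hold for 10 minutes to avoid flapping.
        for: 10m
        labels:
          severity: warning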
@@ -7,7 +7,7 @@ data:
   # Rules adapted from those provided by coreos/prometheus-operator and SoundCloud
   alertmanager.rules.yaml: |+
     groups:
-    - name: ./alertmanager.rules
+    - name: alertmanager.rules
       rules:
       - alert: AlertmanagerConfigInconsistent
         expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
@@ -19,7 +19,6 @@ data:
         annotations:
           description: The configuration of the instances of the Alertmanager cluster
             `{{$labels.service}}` are out of sync.
-          summary: Alertmanager configurations are inconsistent
       - alert: AlertmanagerDownOrMissing
         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
           "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
@@ -29,8 +28,7 @@ data:
         annotations:
           description: An unexpected number of Alertmanagers are scraped or Alertmanagers
             disappeared from discovery.
-          summary: Alertmanager down or not discovered
-      - alert: FailedReload
+      - alert: AlertmanagerFailedReload
         expr: alertmanager_config_last_reload_successful == 0
         for: 10m
         labels:
@@ -38,7 +36,6 @@ data:
         annotations:
           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
             }}/{{ $labels.pod}}.
-          summary: Alertmanager configuration reload has failed
   etcd3.rules.yaml: |+
     groups:
     - name: ./etcd3.rules
@@ -165,7 +162,7 @@ data:
           summary: high commit durations
   general.rules.yaml: |+
     groups:
-    - name: ./general.rules
+    - name: general.rules
       rules:
       - alert: TargetDown
         expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
@@ -173,63 +170,31 @@ data:
         labels:
           severity: warning
         annotations:
-          description: '{{ $value }}% or more of {{ $labels.job }} targets are down.'
+          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
           summary: Targets are down
-      - alert: TooManyOpenFileDescriptors
-        expr: 100 * (process_open_fds / process_max_fds) > 95
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) is using {{ $value }}% of the available file/socket descriptors.'
-          summary: too many open file descriptors
-      - record: instance:fd_utilization
+      - record: fd_utilization
         expr: process_open_fds / process_max_fds
       - alert: FdExhaustionClose
-        expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
+        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
         for: 10m
         labels:
           severity: warning
         annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust in file/socket descriptors within the next 4 hours'
           summary: file descriptors soon exhausted
       - alert: FdExhaustionClose
-        expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
+        expr: predict_linear(fd_utilization[10m], 3600) > 1
         for: 10m
         labels:
          severity: critical
         annotations:
-          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} ({{
-            $labels.instance }}) instance will exhaust in file/socket descriptors soon'
+          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
+            will exhaust in file/socket descriptors within the next hour'
           summary: file descriptors soon exhausted
-  kube-apiserver.rules.yaml: |+
-    groups:
-    - name: ./kube-apiserver.rules
-      rules:
-      - alert: K8SApiserverDown
-        expr: absent(up{job="kubernetes-apiservers"} == 1)
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          description: Prometheus failed to scrape API server(s), or all API servers have
-            disappeared from service discovery.
-          summary: API server unreachable
-      - alert: K8SApiServerLatency
-        expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"}[10m]))
-          by (le)) / 1e+06 > 1
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: 99th percentile Latency for {{ $labels.verb }} requests to the
-            kube-apiserver is higher than 1s.
-          summary: Kubernetes apiserver latency is high
   kube-controller-manager.rules.yaml: |+
     groups:
-    - name: ./kube-controller-manager.rules
+    - name: kube-controller-manager.rules
       rules:
       - alert: K8SControllerManagerDown
         expr: absent(up{kubernetes_name="kube-controller-manager"} == 1)
@@ -242,8 +207,53 @@ data:
           summary: Controller manager is down
   kube-scheduler.rules.yaml: |+
     groups:
-    - name: ./kube-scheduler.rules
+    - name: kube-scheduler.rules
       rules:
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.99"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.9"
+      - record: cluster:scheduler_binding_latency_seconds:quantile
+        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
+          BY (le, cluster)) / 1e+06
+        labels:
+          quantile: "0.5"
       - alert: K8SSchedulerDown
         expr: absent(up{kubernetes_name="kube-scheduler"} == 1)
         for: 5m
@@ -253,9 +263,65 @@ data:
           description: There is no running K8S scheduler. New pods are not being assigned
             to nodes.
           summary: Scheduler is down
+  kube-state-metrics.rules.yaml: |+
+    groups:
+    - name: kube-state-metrics.rules
+      rules:
+      - alert: DeploymentGenerationMismatch
+        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Observed deployment generation does not match expected one for
+            deployment {{$labels.namespaces}}/{{$labels.deployment}}
+      - alert: DeploymentReplicasNotUpdated
+        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
+          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
+          unless (kube_deployment_spec_paused == 1)
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+      - alert: DaemonSetRolloutStuck
+        expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+          * 100 < 100
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          description: Only {{$value}}% of desired pods scheduled and ready for daemon
+            set {{$labels.namespaces}}/{{$labels.daemonset}}
+      - alert: K8SDaemonSetsNotScheduled
+        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
+          > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonsets are not scheduled.
+          summary: Daemonsets are not scheduled correctly
+      - alert: DaemonSetsMissScheduled
+        expr: kube_daemonset_status_number_misscheduled > 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: A number of daemonsets are running where they are not supposed
+            to run.
+          summary: Daemonsets are not scheduled correctly
+      - alert: PodFrequentlyRestarting
+        expr: increase(kube_pod_container_status_restarts[1h]) > 5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
+            times within the last hour
   kubelet.rules.yaml: |+
     groups:
-    - name: ./kubelet.rules
+    - name: kubelet.rules
       rules:
       - alert: K8SNodeNotReady
         expr: kube_node_status_condition{condition="Ready",status="true"} == 0
@@ -274,20 +340,17 @@ data:
         labels:
           severity: critical
         annotations:
-          description: '{{ $value }} Kubernetes nodes (more than 10% are in the NotReady
-            state).'
-          summary: Many Kubernetes nodes are Not Ready
+          description: '{{ $value }}% of Kubernetes nodes are not ready'
       - alert: K8SKubeletDown
-        expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) > 0.03
+        expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) * 100 > 3
         for: 1h
         labels:
           severity: warning
         annotations:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
-          summary: Many Kubelets cannot be scraped
       - alert: K8SKubeletDown
-        expr: absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"})
-          > 0.1
+        expr: (absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}))
+          * 100 > 1
         for: 1h
         labels:
           severity: critical
@@ -297,159 +360,121 @@ data:
           summary: Many Kubelets cannot be scraped
       - alert: K8SKubeletTooManyPods
         expr: kubelet_running_pod_count > 100
         for: 10m
         labels:
           severity: warning
         annotations:
           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
             to the limit of 110
           summary: Kubelet is close to pod limit
-      - alert: K8SDaemonSetsNotScheduled
-        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
-          > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are not scheduled.
-          summary: Daemonsets are not scheduled correctly
-      - alert: K8SDaemonSetsNotRunning
-        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_number_ready
-          > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are not ready.
-          summary: Daemonsets are not ready
-      - alert: K8SDaemonSetsMissScheduled
-        expr: kube_daemonset_status_number_misscheduled > 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: A number of daemonsets are running where they are not supposed
-            to run.
-          summary: Daemonsets are not scheduled correctly
   kubernetes.rules.yaml: |+
     groups:
-    - name: ./kubernetes.rules
+    - name: kubernetes.rules
       rules:
-      - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
-        expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:spec_cpu_shares
-        expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:cpu_usage:rate
-        expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:memory_usage:bytes
-        expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
-        expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""},
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name)
-      - record: cluster_namespace_controller_pod_container:memory_rss:bytes
-        expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_cache:bytes
-        expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:disk_usage:bytes
-        expr: sum(label_replace(container_disk_usage_bytes{container_name!=""}, "controller",
-          "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace, controller, pod_name,
-          container_name)
-      - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
-        expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name, scope, type)
-      - record: cluster_namespace_controller_pod_container:memory_oom:rate
-        expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]),
-          "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) BY (cluster, namespace,
-          controller, pod_name, container_name, scope, type)
-      - record: cluster:memory_allocation:percent
-        expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) BY (cluster)
-          / sum(machine_memory_bytes) BY (cluster)
-      - record: cluster:memory_used:percent
-        expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) BY (cluster) / sum(machine_memory_bytes)
-          BY (cluster)
-      - record: cluster:cpu_allocation:percent
-        expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) BY (cluster) / sum(container_spec_cpu_shares{id="/"}
-          * ON(cluster, instance) machine_cpu_cores) BY (cluster)
-      - record: cluster:node_cpu_use:percent
-        expr: 100 * sum(rate(node_cpu{mode!="idle"}[5m])) BY (cluster) / sum(machine_cpu_cores)
-          BY (cluster)
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: pod_name:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (pod_name)
+      - record: pod_name:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: pod_name:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          BY (pod_name)
+      - record: pod_name:container_fs_usage_bytes:sum
+        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
+      - record: namespace:container_memory_usage_bytes:sum
+        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
+      - record: namespace:container_spec_cpu_shares:sum
+        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
+      - record: namespace:container_cpu_usage:sum
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
+          BY (namespace)
+      - record: cluster:memory_usage:ratio
+        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
+          (cluster) / sum(machine_memory_bytes) BY (cluster)
+      - record: cluster:container_spec_cpu_shares:ratio
+        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
+          / sum(machine_cpu_cores)
+      - record: cluster:container_cpu_usage:ratio
+        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
+          / sum(machine_cpu_cores)
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.99"
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: apiserver_latency:quantile_seconds
+        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.9"
-      - record: cluster_resource_verb:apiserver_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le,
-          cluster, job, resource, verb)) / 1e+06
+      - record: apiserver_latency_seconds:quantile
+        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
+          1e+06
         labels:
           quantile: "0.5"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.5"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.5"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.99"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.9"
-      - record: cluster:scheduler_binding_latency:quantile_seconds
-        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
-          BY (le, cluster)) / 1e+06
-        labels:
-          quantile: "0.5"
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+      - alert: APIServerLatencyHigh
+        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
+          > 4
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: the API server has a 99th percentile latency of {{ $value }} seconds
+            for {{$labels.verb}} {{$labels.resource}}
+      - alert: APIServerErrorsHigh
+        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+          * 100 > 2
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+      - alert: APIServerErrorsHigh
+        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
+          * 100 > 5
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: API server returns errors for {{ $value }}% of requests
+      - alert: K8SApiserverDown
+        expr: absent(up{job="kubernetes-apiservers"} == 1)
+        for: 20m
+        labels:
+          severity: critical
+        annotations:
+          description: No API servers are reachable or all have disappeared from service
+            discovery
   node.rules.yaml: |+
     groups:
-    - name: ./node.rules
+    - name: node.rules
       rules:
+      - record: instance:node_cpu:rate:sum
+        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
+          BY (instance)
+      - record: instance:node_filesystem_usage:sum
+        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+          BY (instance)
+      - record: instance:node_network_receive_bytes:rate:sum
+        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+      - record: instance:node_network_transmit_bytes:rate:sum
+        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+      - record: instance:node_cpu:ratio
+        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
+          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+      - record: cluster:node_cpu:sum_rate5m
+        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
+      - record: cluster:node_cpu:ratio
+        expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
       - alert: NodeExporterDown
         expr: absent(up{kubernetes_name="node-exporter"} == 1)
         for: 10m
@@ -457,43 +482,65 @@ data:
           severity: warning
         annotations:
           description: Prometheus could not scrape a node-exporter for more than 10m,
-            or node-exporters have disappeared from discovery.
-          summary: node-exporter cannot be scraped
-      - alert: K8SNodeOutOfDisk
-        expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
-        labels:
-          service: k8s
-          severity: critical
-        annotations:
-          description: '{{ $labels.node }} has run out of disk space.'
-          summary: Node ran out of disk space.
-      - alert: K8SNodeMemoryPressure
-        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
-          1
-        labels:
-          service: k8s
-          severity: warning
-        annotations:
-          description: '{{ $labels.node }} is under memory pressure.'
-          summary: Node is under memory pressure.
-      - alert: K8SNodeDiskPressure
-        expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
-        labels:
-          service: k8s
-          severity: warning
-        annotations:
-          description: '{{ $labels.node }} is under disk pressure.'
-          summary: Node is under disk pressure.
+            or node-exporters have disappeared from discovery
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
+        for: 30m
+        labels:
+          severity: warning
+        annotations:
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 24 hours (mounted at {{$labels.mountpoint}})
+      - alert: NodeDiskRunningFull
+        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: device {{$labels.device}} on node {{$labels.instance}} is running
+            full within the next 2 hours (mounted at {{$labels.mountpoint}})
   prometheus.rules.yaml: |+
     groups:
-    - name: ./prometheus.rules
+    - name: prometheus.rules
       rules:
-      - alert: FailedReload
+      - alert: PrometheusConfigReloadFailed
         expr: prometheus_config_last_reload_successful == 0
         for: 10m
         labels:
           severity: warning
         annotations:
-          description: Reloading Prometheus' configuration has failed for {{ $labels.namespace
-            }}/{{ $labels.pod}}.
-          summary: Prometheus configuration reload has failed
+          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
+      - alert: PrometheusNotificationQueueRunningFull
+        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
+            $labels.pod}}
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.01
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+      - alert: PrometheusErrorSendingAlerts
+        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
+          > 0.03
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
+            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
+      - alert: PrometheusNotConnectedToAlertmanagers
+        expr: prometheus_notifications_alertmanagers_discovered < 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
+            to any Alertmanagers
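For reference, each *.rules.yaml key in the ConfigMap above holds a self-contained Prometheus rule file in the group-based YAML format; the renames from ./foo.rules to foo.rules simply align the group names with the upstream files. A minimal sketch of that schema, with hypothetical group, record, and alert names:

    groups:
    - name: example.rules
      rules:
      # Recording rule: precompute an expression under a new series name.
      - record: job:up:ratio
        expr: sum(up) BY (job) / count(up) BY (job)
      # Alerting rule: fire once expr has held for 10 minutes; labels are
      # routed on by Alertmanager, annotations are templated per alert.
      - alert: ExampleTargetsDown
        expr: job:up:ratio * 100 < 90
        for: 10m
        labels:
          severity: warning
        annotations:
          description: 'Only {{ $value }}% of {{ $labels.job }} targets are up.'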