addons: Update Prometheus to v2.1.0
* Change service discovery to relabel jobs to align with rule expressions in upstream examples (see the sketch below)
* Use a separate service account for prometheus instead of granting roles to the namespace's default
* Use a separate service account for node-exporter
* Update node-exporter and kube-state-metrics exporters
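The relabeling in the first bullet pins the `job` label at scrape time so targets match the `job="..."` selectors used by upstream rule expressions (e.g. `absent(up{job="apiserver"} == 1)` rather than matching on `kubernetes_name`). A minimal sketch of the pattern, condensed from the apiserver scrape config in the diff below:

    relabel_configs:
    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https
    - replacement: apiserver   # static relabel: force job="apiserver" on the kept targets
      action: replace
      target_label: job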
commit 064ce83f25 (parent c3b0cdddf3)
@@ -39,7 +39,7 @@ data:
       tls_config:
         ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         # Using endpoints to discover kube-apiserver targets finds the pod IP
-        # (host IP since apiserver is uses host network) which is not used in
+        # (host IP since apiserver uses host network) which is not used in
         # the server certificate.
         insecure_skip_verify: true
       bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
@@ -51,6 +51,9 @@ data:
       - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
         action: keep
         regex: default;kubernetes;https
+      - replacement: apiserver
+        action: replace
+        target_label: job
 
     # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
     # metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
@@ -59,7 +62,7 @@ data:
     # Kubernetes apiserver. This means it will work if Prometheus is running out of
     # cluster, or can't connect to nodes for some other reason (e.g. because of
     # firewalling).
-    - job_name: 'kubernetes-nodes'
+    - job_name: 'kubelet'
       kubernetes_sd_configs:
       - role: node
 
@@ -149,7 +152,7 @@ data:
         target_label: kubernetes_namespace
       - source_labels: [__meta_kubernetes_service_name]
         action: replace
-        target_label: kubernetes_name
+        target_label: job
 
     # Example scrape config for probing services via the Blackbox Exporter.
     #
@@ -181,7 +184,7 @@ data:
       - source_labels: [__meta_kubernetes_namespace]
         target_label: kubernetes_namespace
       - source_labels: [__meta_kubernetes_service_name]
-        target_label: kubernetes_name
+        target_label: job
 
     # Example scrape config for pods
     #
@@ -14,9 +14,10 @@ spec:
         name: prometheus
         phase: prod
     spec:
+      serviceAccountName: prometheus
       containers:
         - name: prometheus
-          image: quay.io/prometheus/prometheus:v2.0.0
+          image: quay.io/prometheus/prometheus:v2.1.0
           args:
             - '--config.file=/etc/prometheus/prometheus.yaml'
           ports:
@@ -35,4 +35,3 @@ rules:
   resources:
   - horizontalpodautoscalers
   verbs: ["list", "watch"]
-
@@ -54,8 +54,8 @@ spec:
             - /pod_nanny
             - --container=kube-state-metrics
             - --cpu=100m
-            - --extra-cpu=1m
-            - --memory=100Mi
-            - --extra-memory=2Mi
+            - --extra-cpu=2m
+            - --memory=150Mi
+            - --extra-memory=30Mi
             - --threshold=5
             - --deployment=kube-state-metrics
@@ -18,11 +18,15 @@ spec:
         name: node-exporter
         phase: prod
     spec:
+      serviceAccountName: node-exporter
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534
       hostNetwork: true
       hostPID: true
       containers:
         - name: node-exporter
-          image: quay.io/prometheus/node-exporter:v0.15.0
+          image: quay.io/prometheus/node-exporter:v0.15.2
           args:
             - "--path.procfs=/host/proc"
             - "--path.sysfs=/host/sys"
@@ -45,9 +49,8 @@ spec:
               mountPath: /host/sys
               readOnly: true
       tolerations:
-        - key: node-role.kubernetes.io/master
+        - effect: NoSchedule
           operator: Exists
-          effect: NoSchedule
       volumes:
         - name: proc
           hostPath:
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-exporter
+  namespace: monitoring
@@ -8,5 +8,5 @@ roleRef:
   name: prometheus
 subjects:
   - kind: ServiceAccount
-    name: default
+    name: prometheus
     namespace: monitoring
@@ -4,8 +4,7 @@ metadata:
   name: prometheus-rules
   namespace: monitoring
 data:
-  # Rules adapted from those provided by coreos/prometheus-operator and SoundCloud
-  alertmanager.rules.yaml: |+
+  alertmanager.rules.yaml: |
     groups:
     - name: alertmanager.rules
       rules:
@@ -36,7 +35,7 @@ data:
         annotations:
           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
             }}/{{ $labels.pod}}.
-  etcd3.rules.yaml: |+
+  etcd3.rules.yaml: |
     groups:
     - name: ./etcd3.rules
       rules:
@@ -65,8 +64,8 @@ data:
             changes within the last hour
           summary: a high number of leader changes within the etcd cluster are happening
       - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-          / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
         for: 10m
         labels:
           severity: warning
@@ -75,8 +74,8 @@ data:
             on etcd instance {{ $labels.instance }}'
           summary: a high number of gRPC requests are failing
       - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-          / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
         for: 5m
         labels:
           severity: critical
@@ -85,7 +84,7 @@ data:
             on etcd instance {{ $labels.instance }}'
           summary: a high number of gRPC requests are failing
       - alert: GRPCRequestsSlow
-        expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
+        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
           > 0.15
         for: 10m
         labels:
@@ -125,7 +124,7 @@ data:
             }} are slow
           summary: slow HTTP requests
       - alert: EtcdMemberCommunicationSlow
-        expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
           > 0.15
         for: 10m
         labels:
@@ -160,7 +159,7 @@ data:
         annotations:
           description: etcd instance {{ $labels.instance }} commit durations are high
           summary: high commit durations
-  general.rules.yaml: |+
+  general.rules.yaml: |
     groups:
     - name: general.rules
       rules:
@@ -192,12 +191,12 @@ data:
           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
             will exhaust in file/socket descriptors within the next hour'
           summary: file descriptors soon exhausted
-  kube-controller-manager.rules.yaml: |+
+  kube-controller-manager.rules.yaml: |
     groups:
     - name: kube-controller-manager.rules
       rules:
       - alert: K8SControllerManagerDown
-        expr: absent(up{kubernetes_name="kube-controller-manager"} == 1)
+        expr: absent(up{job="kube-controller-manager"} == 1)
         for: 5m
         labels:
           severity: critical
@@ -205,7 +204,7 @@ data:
           description: There is no running K8S controller manager. Deployments and replication
             controllers are not making progress.
           summary: Controller manager is down
-  kube-scheduler.rules.yaml: |+
+  kube-scheduler.rules.yaml: |
     groups:
     - name: kube-scheduler.rules
       rules:
@@ -255,7 +254,7 @@ data:
         labels:
           quantile: "0.5"
       - alert: K8SSchedulerDown
-        expr: absent(up{kubernetes_name="kube-scheduler"} == 1)
+        expr: absent(up{job="kube-scheduler"} == 1)
         for: 5m
         labels:
           severity: critical
@@ -263,7 +262,7 @@ data:
           description: There is no running K8S scheduler. New pods are not being assigned
             to nodes.
           summary: Scheduler is down
-  kube-state-metrics.rules.yaml: |+
+  kube-state-metrics.rules.yaml: |
     groups:
     - name: kube-state-metrics.rules
       rules:
@@ -274,7 +273,8 @@ data:
           severity: warning
         annotations:
           description: Observed deployment generation does not match expected one for
-            deployment {{$labels.namespaces}}{{$labels.deployment}}
+            deployment {{$labels.namespaces}}/{{$labels.deployment}}
+          summary: Deployment is outdated
       - alert: DeploymentReplicasNotUpdated
         expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
           or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
@@ -284,8 +284,9 @@ data:
           severity: warning
         annotations:
           description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+          summary: Deployment replicas are outdated
       - alert: DaemonSetRolloutStuck
-        expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
           * 100 < 100
         for: 15m
         labels:
@@ -293,6 +294,7 @@ data:
         annotations:
           description: Only {{$value}}% of desired pods scheduled and ready for daemon
             set {{$labels.namespaces}}/{{$labels.daemonset}}
+          summary: DaemonSet is missing pods
       - alert: K8SDaemonSetsNotScheduled
         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
           > 0
@@ -312,14 +314,15 @@ data:
             to run.
           summary: Daemonsets are not scheduled correctly
       - alert: PodFrequentlyRestarting
-        expr: increase(kube_pod_container_status_restarts[1h]) > 5
+        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
         for: 10m
         labels:
           severity: warning
         annotations:
           description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}}
             times within the last hour
-  kubelet.rules.yaml: |+
+          summary: Pod is restarting frequently
+  kubelet.rules.yaml: |
     groups:
     - name: kubelet.rules
       rules:
@@ -342,14 +345,14 @@ data:
         annotations:
           description: '{{ $value }}% of Kubernetes nodes are not ready'
       - alert: K8SKubeletDown
-        expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) * 100 > 3
+        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
         for: 1h
         labels:
           severity: warning
         annotations:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
       - alert: K8SKubeletDown
-        expr: (absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}))
+        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
           * 100 > 1
         for: 1h
         labels:
@@ -367,7 +370,7 @@ data:
           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
             to the limit of 110
           summary: Kubelet is close to pod limit
-  kubernetes.rules.yaml: |+
+  kubernetes.rules.yaml: |
     groups:
     - name: kubernetes.rules
       rules:
@@ -447,14 +450,28 @@ data:
         annotations:
           description: API server returns errors for {{ $value }}% of requests
       - alert: K8SApiserverDown
-        expr: absent(up{job="kubernetes-apiservers"} == 1)
+        expr: absent(up{job="apiserver"} == 1)
         for: 20m
         labels:
           severity: critical
         annotations:
           description: No API servers are reachable or all have disappeared from service
             discovery
-  node.rules.yaml: |+
+
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: warning
+        annotations:
+          description: Kubernetes API Certificate is expiring soon (less than 7 days)
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
+
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: critical
+        annotations:
+          description: Kubernetes API Certificate is expiring in less than 1 day
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
+
+  node.rules.yaml: |
     groups:
     - name: node.rules
       rules:
@@ -476,7 +493,7 @@ data:
       - record: cluster:node_cpu:ratio
         expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
       - alert: NodeExporterDown
-        expr: absent(up{kubernetes_name="node-exporter"} == 1)
+        expr: absent(up{job="node-exporter"} == 1)
         for: 10m
         labels:
           severity: warning
@@ -499,7 +516,7 @@ data:
         annotations:
           description: device {{$labels.device}} on node {{$labels.instance}} is running
             full within the next 2 hours (mounted at {{$labels.mountpoint}})
-  prometheus.rules.yaml: |+
+  prometheus.rules.yaml: |
     groups:
     - name: prometheus.rules
       rules:
@@ -544,3 +561,30 @@ data:
         annotations:
           description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
             to any Alertmanagers
+      - alert: PrometheusTSDBReloadsFailing
+        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            reload failures over the last four hours.'
+          summary: Prometheus has issues reloading data blocks from disk
+      - alert: PrometheusTSDBCompactionsFailing
+        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            compaction failures over the last four hours.'
+          summary: Prometheus has issues compacting sample blocks
+      - alert: PrometheusTSDBWALCorruptions
+        expr: tsdb_wal_corruptions_total > 0
+        for: 4h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
+            log (WAL).'
+          summary: Prometheus write-ahead log is corrupted
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: monitoring