addons: Update Prometheus to v2.1.0

* Change service discovery to relabel jobs to align with
rule expressions in upstream examples
* Use a separate service account for prometheus instead
of granting roles to the namespace's default
* Use a separate service account for node-exporter
* Update node-exporter and kube-state-metrics exporters
Dalton Hubble 2018-01-27 20:56:49 -08:00
parent c3b0cdddf3
commit 064ce83f25
9 changed files with 99 additions and 39 deletions
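The job relabeling described in the commit message follows the standard Prometheus pattern: a relabel_config overwrites the discovered target's job label so that rule expressions such as up{job="apiserver"} or up{job="kubelet"} match the upstream examples. A minimal sketch of that pattern, assuming the usual endpoints-based apiserver scrape config (the job_name and surrounding options are illustrative; only the relabel steps mirror the hunks below):

    scrape_configs:
    - job_name: 'kubernetes-apiservers'   # illustrative; the scrape job name itself is not shown in this diff
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Keep only the default/kubernetes https endpoint, i.e. the apiserver itself.
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
      # Rewrite the job label so alerts can select up{job="apiserver"}.
      - replacement: apiserver
        action: replace
        target_label: job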

View File

@@ -39,7 +39,7 @@ data:
       tls_config:
         ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
         # Using endpoints to discover kube-apiserver targets finds the pod IP
-        # (host IP since apiserver is uses host network) which is not used in
+        # (host IP since apiserver uses host network) which is not used in
         # the server certificate.
         insecure_skip_verify: true
       bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
@ -51,6 +51,9 @@ data:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep action: keep
regex: default;kubernetes;https regex: default;kubernetes;https
- replacement: apiserver
action: replace
target_label: job
# Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
# metrics from a node by scraping kubelet (127.0.0.1:10255/metrics). # metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
@@ -59,7 +62,7 @@ data:
     # Kubernetes apiserver. This means it will work if Prometheus is running out of
     # cluster, or can't connect to nodes for some other reason (e.g. because of
     # firewalling).
-    - job_name: 'kubernetes-nodes'
+    - job_name: 'kubelet'
       kubernetes_sd_configs:
       - role: node
@@ -149,7 +152,7 @@ data:
         target_label: kubernetes_namespace
       - source_labels: [__meta_kubernetes_service_name]
         action: replace
-        target_label: kubernetes_name
+        target_label: job

     # Example scrape config for probing services via the Blackbox Exporter.
     #
@@ -181,7 +184,7 @@ data:
       - source_labels: [__meta_kubernetes_namespace]
         target_label: kubernetes_namespace
       - source_labels: [__meta_kubernetes_service_name]
-        target_label: kubernetes_name
+        target_label: job

     # Example scrape config for pods
     #

View File

@@ -14,9 +14,10 @@ spec:
         name: prometheus
         phase: prod
     spec:
+      serviceAccountName: prometheus
       containers:
       - name: prometheus
-        image: quay.io/prometheus/prometheus:v2.0.0
+        image: quay.io/prometheus/prometheus:v2.1.0
         args:
         - '--config.file=/etc/prometheus/prometheus.yaml'
         ports:

View File

@@ -35,4 +35,3 @@ rules:
   resources:
   - horizontalpodautoscalers
   verbs: ["list", "watch"]
-

View File

@@ -54,8 +54,8 @@ spec:
         - /pod_nanny
         - --container=kube-state-metrics
         - --cpu=100m
-        - --extra-cpu=1m
-        - --memory=100Mi
-        - --extra-memory=2Mi
+        - --extra-cpu=2m
+        - --memory=150Mi
+        - --extra-memory=30Mi
         - --threshold=5
         - --deployment=kube-state-metrics

View File

@@ -18,11 +18,15 @@ spec:
         name: node-exporter
         phase: prod
     spec:
+      serviceAccountName: node-exporter
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 65534
       hostNetwork: true
       hostPID: true
       containers:
       - name: node-exporter
-        image: quay.io/prometheus/node-exporter:v0.15.0
+        image: quay.io/prometheus/node-exporter:v0.15.2
         args:
         - "--path.procfs=/host/proc"
         - "--path.sysfs=/host/sys"
@@ -45,9 +49,8 @@ spec:
           mountPath: /host/sys
           readOnly: true
       tolerations:
-      - key: node-role.kubernetes.io/master
+      - effect: NoSchedule
         operator: Exists
-        effect: NoSchedule
       volumes:
       - name: proc
         hostPath:

View File

@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-exporter
+  namespace: monitoring

View File

@@ -8,5 +8,5 @@ roleRef:
   name: prometheus
 subjects:
 - kind: ServiceAccount
-  name: default
+  name: prometheus
   namespace: monitoring
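For context, the hunk above only swaps the subject of an existing ClusterRoleBinding from the namespace's default service account to the new prometheus one (added at the end of this commit). A sketch of what the full binding presumably looks like after the change; apiVersion, kind, metadata.name, and the roleRef apiGroup/kind are assumptions, since the diff only shows roleRef.name and the subject:

    apiVersion: rbac.authorization.k8s.io/v1beta1   # assumed; not shown in the hunk
    kind: ClusterRoleBinding                         # assumed from the roleRef/subjects shape
    metadata:
      name: prometheus                               # assumed
    roleRef:
      apiGroup: rbac.authorization.k8s.io            # assumed
      kind: ClusterRole                              # assumed
      name: prometheus
    subjects:
    - kind: ServiceAccount
      name: prometheus
      namespace: monitoring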

View File

@@ -4,8 +4,7 @@ metadata:
   name: prometheus-rules
   namespace: monitoring
 data:
-  # Rules adapted from those provided by coreos/prometheus-operator and SoundCloud
-  alertmanager.rules.yaml: |+
+  alertmanager.rules.yaml: |
     groups:
     - name: alertmanager.rules
       rules:
@@ -36,7 +35,7 @@ data:
         annotations:
           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
             }}/{{ $labels.pod}}.
-  etcd3.rules.yaml: |+
+  etcd3.rules.yaml: |
     groups:
     - name: ./etcd3.rules
       rules:
@@ -65,8 +64,8 @@ data:
             changes within the last hour
           summary: a high number of leader changes within the etcd cluster are happening
       - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-          / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
         for: 10m
         labels:
           severity: warning
@@ -75,8 +74,8 @@ data:
             on etcd instance {{ $labels.instance }}'
           summary: a high number of gRPC requests are failing
       - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method)
-          / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05
+        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
         for: 5m
         labels:
           severity: critical
@@ -85,7 +84,7 @@ data:
             on etcd instance {{ $labels.instance }}'
           summary: a high number of gRPC requests are failing
       - alert: GRPCRequestsSlow
-        expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m]))
+        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
           > 0.15
         for: 10m
         labels:
@@ -125,7 +124,7 @@ data:
             }} are slow
           summary: slow HTTP requests
       - alert: EtcdMemberCommunicationSlow
-        expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m]))
+        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
           > 0.15
         for: 10m
         labels:
@@ -160,7 +159,7 @@ data:
         annotations:
           description: etcd instance {{ $labels.instance }} commit durations are high
           summary: high commit durations
-  general.rules.yaml: |+
+  general.rules.yaml: |
     groups:
     - name: general.rules
       rules:
@@ -192,12 +191,12 @@ data:
           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
             will exhaust in file/socket descriptors within the next hour'
           summary: file descriptors soon exhausted
-  kube-controller-manager.rules.yaml: |+
+  kube-controller-manager.rules.yaml: |
     groups:
     - name: kube-controller-manager.rules
       rules:
       - alert: K8SControllerManagerDown
-        expr: absent(up{kubernetes_name="kube-controller-manager"} == 1)
+        expr: absent(up{job="kube-controller-manager"} == 1)
         for: 5m
         labels:
           severity: critical
@@ -205,7 +204,7 @@ data:
           description: There is no running K8S controller manager. Deployments and replication
             controllers are not making progress.
           summary: Controller manager is down
-  kube-scheduler.rules.yaml: |+
+  kube-scheduler.rules.yaml: |
     groups:
     - name: kube-scheduler.rules
       rules:
@@ -255,7 +254,7 @@ data:
         labels:
           quantile: "0.5"
       - alert: K8SSchedulerDown
-        expr: absent(up{kubernetes_name="kube-scheduler"} == 1)
+        expr: absent(up{job="kube-scheduler"} == 1)
         for: 5m
         labels:
           severity: critical
@@ -263,7 +262,7 @@ data:
           description: There is no running K8S scheduler. New pods are not being assigned
             to nodes.
           summary: Scheduler is down
-  kube-state-metrics.rules.yaml: |+
+  kube-state-metrics.rules.yaml: |
     groups:
     - name: kube-state-metrics.rules
       rules:
@@ -274,7 +273,8 @@ data:
           severity: warning
         annotations:
           description: Observed deployment generation does not match expected one for
-            deployment {{$labels.namespaces}}{{$labels.deployment}}
+            deployment {{$labels.namespaces}}/{{$labels.deployment}}
+          summary: Deployment is outdated
       - alert: DeploymentReplicasNotUpdated
         expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
           or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
@@ -284,8 +284,9 @@ data:
           severity: warning
         annotations:
           description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
+          summary: Deployment replicas are outdated
       - alert: DaemonSetRolloutStuck
-        expr: kube_daemonset_status_current_number_ready / kube_daemonset_status_desired_number_scheduled
+        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
           * 100 < 100
         for: 15m
         labels:
@@ -293,6 +294,7 @@ data:
         annotations:
           description: Only {{$value}}% of desired pods scheduled and ready for daemon
             set {{$labels.namespaces}}/{{$labels.daemonset}}
+          summary: DaemonSet is missing pods
       - alert: K8SDaemonSetsNotScheduled
         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
           > 0
@@ -312,14 +314,15 @@ data:
             to run.
           summary: Daemonsets are not scheduled correctly
       - alert: PodFrequentlyRestarting
-        expr: increase(kube_pod_container_status_restarts[1h]) > 5
+        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
         for: 10m
         labels:
           severity: warning
         annotations:
           description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}}
             times within the last hour
-  kubelet.rules.yaml: |+
+          summary: Pod is restarting frequently
+  kubelet.rules.yaml: |
     groups:
     - name: kubelet.rules
       rules:
@@ -342,14 +345,14 @@ data:
         annotations:
           description: '{{ $value }}% of Kubernetes nodes are not ready'
       - alert: K8SKubeletDown
-        expr: count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}) * 100 > 3
+        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
         for: 1h
         labels:
           severity: warning
         annotations:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
       - alert: K8SKubeletDown
-        expr: (absent(up{job="kubernetes-nodes"} == 1) or count(up{job="kubernetes-nodes"} == 0) / count(up{job="kubernetes-nodes"}))
+        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
           * 100 > 1
         for: 1h
         labels:
@@ -367,7 +370,7 @@ data:
           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
             to the limit of 110
           summary: Kubelet is close to pod limit
-  kubernetes.rules.yaml: |+
+  kubernetes.rules.yaml: |
     groups:
     - name: kubernetes.rules
       rules:
@@ -447,14 +450,28 @@ data:
         annotations:
           description: API server returns errors for {{ $value }}% of requests
       - alert: K8SApiserverDown
-        expr: absent(up{job="kubernetes-apiservers"} == 1)
+        expr: absent(up{job="apiserver"} == 1)
         for: 20m
         labels:
           severity: critical
         annotations:
           description: No API servers are reachable or all have disappeared from service
             discovery
-  node.rules.yaml: |+
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: warning
+        annotations:
+          description: Kubernetes API Certificate is expiring soon (less than 7 days)
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
+      - alert: K8sCertificateExpirationNotice
+        labels:
+          severity: critical
+        annotations:
+          description: Kubernetes API Certificate is expiring in less than 1 day
+        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
+  node.rules.yaml: |
     groups:
     - name: node.rules
       rules:
@@ -476,7 +493,7 @@ data:
       - record: cluster:node_cpu:ratio
         expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
       - alert: NodeExporterDown
-        expr: absent(up{kubernetes_name="node-exporter"} == 1)
+        expr: absent(up{job="node-exporter"} == 1)
         for: 10m
         labels:
           severity: warning
@@ -499,7 +516,7 @@ data:
         annotations:
           description: device {{$labels.device}} on node {{$labels.instance}} is running
             full within the next 2 hours (mounted at {{$labels.mountpoint}})
-  prometheus.rules.yaml: |+
+  prometheus.rules.yaml: |
     groups:
     - name: prometheus.rules
       rules:
@@ -544,3 +561,30 @@ data:
         annotations:
           description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
             to any Alertmanagers
+      - alert: PrometheusTSDBReloadsFailing
+        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            reload failures over the last four hours.'
+          summary: Prometheus has issues reloading data blocks from disk
+      - alert: PrometheusTSDBCompactionsFailing
+        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
+        for: 12h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
+            compaction failures over the last four hours.'
+          summary: Prometheus has issues compacting sample blocks
+      - alert: PrometheusTSDBWALCorruptions
+        expr: tsdb_wal_corruptions_total > 0
+        for: 4h
+        labels:
+          severity: warning
+        annotations:
+          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
+            log (WAL).'
+          summary: Prometheus write-ahead log is corrupted

View File

@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: monitoring