addons: Update Prometheus from v2.1.0 to v2.2.0-rc.1

* Annotate Prometheus service to scrape metrics from
Prometheus itself (enables Prometheus* alerts; see the relabel sketch after this list)
* Update kube-state-metrics addon-resizer to 1.7
* Use port 8080 for kube-state-metrics
* Add PrometheusNotIngestingSamples alert rule
* Change K8SKubeletDown alert rule to fire when 10%
of kubelets are down, not 1%
  * https://github.com/coreos/prometheus-operator/pull/1032
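
For context on the first bullet: the annotation only has an effect because the scrape config keeps Kubernetes endpoints whose Service carries prometheus.io/scrape: 'true'. A minimal sketch of that relabel rule, assuming the conventional annotation-based job shape (not copied verbatim from this repo's prometheus.yaml):

    # Assumed shape of the annotation-honoring scrape job (illustrative only).
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
        - role: endpoints
      relabel_configs:
        # Keep only endpoints whose Service is annotated prometheus.io/scrape: 'true';
        # with this commit that set now includes the Prometheus service itself.
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
          action: keep
          regex: true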
Dalton Hubble 2018-03-02 18:47:37 -08:00
parent c112ee3829
commit 9307e97c46
5 changed files with 17 additions and 7 deletions

@@ -18,7 +18,7 @@ spec:
       serviceAccountName: prometheus
       containers:
         - name: prometheus
-          image: quay.io/prometheus/prometheus:v2.1.0
+          image: quay.io/prometheus/prometheus:v2.2.0-rc.1
           args:
             - '--config.file=/etc/prometheus/prometheus.yaml'
           ports:

@@ -33,7 +33,7 @@ spec:
             initialDelaySeconds: 5
             timeoutSeconds: 5
         - name: addon-resizer
-          image: gcr.io/google_containers/addon-resizer:1.0
+          image: gcr.io/google_containers/addon-resizer:1.7
           resources:
             limits:
               cpu: 100m
@@ -54,8 +54,8 @@ spec:
             - /pod_nanny
             - --container=kube-state-metrics
             - --cpu=100m
-            - --extra-cpu=2m
-            - --memory=150Mi
-            - --extra-memory=30Mi
+            - --extra-cpu=1m
+            - --memory=100Mi
+            - --extra-memory=2Mi
             - --threshold=5
             - --deployment=kube-state-metrics
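
For reference, the pod nanny sizes the kube-state-metrics container roughly as base + extra-per-node * nodes, so the new flags imply (illustrative arithmetic only):

    cpu    ≈ 100m  + 1m  * nodes    # e.g. about 150m  on a 50-node cluster
    memory ≈ 100Mi + 2Mi * nodes    # e.g. about 200Mi on a 50-node cluster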

@@ -15,5 +15,5 @@ spec:
   ports:
     - name: metrics
       protocol: TCP
-      port: 80
+      port: 8080
       targetPort: 8080

@@ -353,7 +353,7 @@ data:
           description: Prometheus failed to scrape {{ $value }}% of kubelets.
       - alert: K8SKubeletDown
         expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
-          * 100 > 1
+          * 100 > 10
         for: 1h
         labels:
           severity: critical
@@ -588,3 +588,11 @@ data:
           description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
             log (WAL).'
           summary: Prometheus write-ahead log is corrupted
+      - alert: PrometheusNotIngestingSamples
+        expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
+          summary: "Prometheus isn't ingesting samples"

@@ -3,6 +3,8 @@ kind: Service
 metadata:
   name: prometheus
   namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
 spec:
   type: ClusterIP
   selector:
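
Once the annotated service has been rediscovered, Prometheus should list itself as a healthy target. A hedged check from the expression browser (the kubernetes_namespace/kubernetes_name labels assume the conventional relabeling in prometheus.yaml):

    up{kubernetes_namespace="monitoring", kubernetes_name="prometheus"} == 1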