addons: Include Prometheus and node-exporter manifests

2025-09-14 16:49:43 +02:00 · 2017-10-22 17:00:41 -07:00
parent a73f57fe4e
commit d046d45769
14 changed files with 453 additions and 5 deletions
--- a/addons/prometheus/config.yaml
+++ b/addons/prometheus/config.yaml
@@ -0,0 +1,228 @@
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: prometheus-config
  namespace: monitoring
 data:
  prometheus.yaml: |-
    # Global config
    global:
      scrape_interval: 15s
    # Scrape configs for running Prometheus on a Kubernetes cluster.
    # This uses separate scrape configs for cluster components (i.e. API server, node)
    # and services to allow each to use different authentication configs.
    #
    # Kubernetes labels will be added as Prometheus labels on metrics via the
    # `labelmap` relabeling action.
    scrape_configs:
    # Scrape config for API servers.
    #
    # Kubernetes exposes API servers as endpoints to the default/kubernetes
    # service so this uses `endpoints` role and uses relabelling to only keep
    # the endpoints associated with the default/kubernetes service using the
    # default named port `https`. This works for single API server deployments as
    # well as HA API server deployments.
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        # Using endpoints to discover kube-apiserver targets finds the pod IP
        # (the host IP, since the apiserver uses the host network), which is not
        # included in the server certificate, so verification is skipped.
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      # Keep only the default/kubernetes service endpoints for the https port.
      # This yields one target per API server that Kubernetes registers as an
      # endpoint of the default/kubernetes service.
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
    # metrics from a node by scraping kubelet (127.0.0.1:10255/metrics).
    #
    # Rather than connecting directly to the node, the scrape is proxied through the
    # Kubernetes apiserver.  This means it will work if Prometheus is running out of
    # cluster, or can't connect to nodes for some other reason (e.g. because of
    # firewalling).
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      # Copy every Kubernetes node label onto the target as a Prometheus label.
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      # Point every target at the in-cluster apiserver instead of the node itself.
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      # Use the node name to build the apiserver proxy path to that node's
      # metrics endpoint.
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
    # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
    # scraping kubelet (127.0.0.1:10255/metrics/cadvisor).
    #
    # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
    # (those whose names begin with 'container_') have been removed from the
    # Kubelet metrics endpoint.  This job scrapes the cAdvisor endpoint to
    # retrieve those metrics.
    #
    # Rather than connecting directly to the node, the scrape is proxied through the
    # Kubernetes apiserver.  This means it will work if Prometheus is running out of
    # cluster, or can't connect to nodes for some other reason (e.g. because of
    # firewalling).
    - job_name: 'kubernetes-cadvisor'
      kubernetes_sd_configs:
      - role: node
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      # Copy every Kubernetes node label onto the target as a Prometheus label.
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      # Point every target at the in-cluster apiserver instead of the node itself.
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      # Use the node name to build the apiserver proxy path to that node's
      # cAdvisor metrics endpoint.
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    # Scrape config for node-exporter endpoints (e.g. 'node_'). Explore metrics
    # from a host running a node-exporter by scraping (127.0.0.1:9100/metrics).
    - job_name: 'node-exporters'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Keep only the endpoints of the node-exporter service in the monitoring
      # namespace that are exposed through the port named `metrics`.
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: monitoring;node-exporter;metrics
    # Scrape config for service endpoints.
    #
    # The relabeling allows the actual service scrape endpoint to be configured
    # via the following annotations:
    #
    # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
    # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
    # to set this to `https` & most likely set the `tls_config` of the scrape config.
    # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
    # * `prometheus.io/port`: If the metrics are exposed on a different port to the
    # service then set this appropriately.
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # Keep only endpoints whose service is annotated `prometheus.io/scrape: true`.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the `prometheus.io/scheme` annotation (http or https) as the scheme.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      # Use the `prometheus.io/path` annotation, when non-empty, as the metrics path.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # When `prometheus.io/port` is set, replace the port of the discovered
      # address with it (any existing port is dropped by the regex).
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      # Copy every Kubernetes service label onto the target as a Prometheus label.
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      # Record the namespace and service name as labels on the target.
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    # Example scrape config for probing services via the Blackbox Exporter.
    #
    # The relabeling allows the actual service scrape endpoint to be configured
    # via the following annotations:
    #
    # * `prometheus.io/probe`: Only probe services that have a value of `true`
    - job_name: 'kubernetes-services'
      metrics_path: /probe
      params:
        module: [http_2xx]
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      # Keep only services annotated `prometheus.io/probe: true`.
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      # Pass the discovered service address as the `target` URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Send the scrape itself to the host named `blackbox`.
      # NOTE(review): presumably a Blackbox Exporter service; none is defined
      # alongside this config - confirm before relying on this job.
      - target_label: __address__
        replacement: blackbox
      # Report the probed address, not the exporter, as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Copy every Kubernetes service label onto the target as a Prometheus label.
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      # Record the namespace and service name as labels on the target.
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name
    # Example scrape config for pods
    #
    # The relabeling allows the actual pod scrape endpoint to be configured via the
    # following annotations:
    #
    # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
    # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
    # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
    # pod's declared ports (default is a port-free target if none are declared).
    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      # Keep only pods annotated `prometheus.io/scrape: true`.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the `prometheus.io/path` annotation, when non-empty, as the metrics path.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # When `prometheus.io/port` is set, replace the port of the discovered
      # address with it (any existing port is dropped by the regex).
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      # Copy every Kubernetes pod label onto the target as a Prometheus label.
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      # Record the namespace and pod name as labels on the target.
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
    # Rule files: load every file under /etc/prometheus/rules whose name ends
    # in `.rules`; files with any other suffix are not matched by this glob.
    rule_files:
      - "/etc/prometheus/rules/*.rules"
--- a/addons/prometheus/deployment.yaml
+++ b/addons/prometheus/deployment.yaml
@@ -0,0 +1,45 @@
 # Runs a single Prometheus server whose configuration and rules are mounted
 # from the prometheus-config and prometheus-rules ConfigMaps.
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata:
  name: prometheus
  namespace: monitoring
 spec:
  replicas: 1
  strategy:
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        name: prometheus
        phase: prod
    spec:
      containers:
      - name: prometheus
        image: quay.io/prometheus/prometheus:v1.8.0
        args:
          - '-config.file=/etc/prometheus/prometheus.yaml'
          # Kubernetes `args` replaces the image's default arguments, so the
          # storage path must be set explicitly; otherwise the `data` volume
          # mounted at /var/lib/prometheus below is never written to.
          - '-storage.local.path=/var/lib/prometheus'
          - '-storage.local.retention=12h'
          - '-storage.local.memory-chunks=500000'
        ports:
        - name: web
          containerPort: 9090
        volumeMounts:
        # prometheus.yaml from the prometheus-config ConfigMap.
        - name: config
          mountPath: /etc/prometheus
        # Rule files from the prometheus-rules ConfigMap.
        - name: rules
          mountPath: /etc/prometheus/rules
        # Time series storage (see -storage.local.path above).
        - name: data
          mountPath: /var/lib/prometheus
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      volumes:
      - name: config
        configMap:
          name: prometheus-config
      - name: rules
        configMap:
          name: prometheus-rules
      # emptyDir: metrics data lives only as long as the pod stays on its node.
      - name: data
        emptyDir: {}
--- a/addons/prometheus/exporters/node-exporter/daemonset.yaml
+++ b/addons/prometheus/exporters/node-exporter/daemonset.yaml
@@ -0,0 +1,53 @@
 # Runs node-exporter as a DaemonSet, exposing host metrics on port 9100.
 apiVersion: extensions/v1beta1
 kind: DaemonSet
 metadata:
  name: node-exporter
  namespace: monitoring
 spec:
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  template:
    metadata:
      labels:
        name: node-exporter
        phase: prod
    spec:
      # Share the host's network and PID namespaces with the exporter.
      hostNetwork: true
      hostPID: true
      containers:
      - name: node-exporter
        image: quay.io/prometheus/node-exporter:v0.15.0
        args:
          # Read procfs and sysfs from the host paths mounted below.
          - "--path.procfs=/host/proc"
          - "--path.sysfs=/host/sys"
        ports:
          - name: metrics
            containerPort: 9100
            hostPort: 9100
        resources:
          requests:
            memory: 30Mi
            cpu: 100m
          limits:
            memory: 50Mi
            cpu: 200m
        volumeMounts:
          - name: proc
            mountPath: /host/proc
            readOnly:  true
          - name: sys
            mountPath: /host/sys
            readOnly: true
      tolerations:
        # Tolerate the master NoSchedule taint so master nodes run the exporter too.
        - key: node-role.kubernetes.io/master
          operator: Exists
          effect: NoSchedule
      volumes:
        # Host /proc and /sys, mounted read-only into the container above.
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
--- a/addons/prometheus/exporters/node-exporter/service.yaml
+++ b/addons/prometheus/exporters/node-exporter/service.yaml
@@ -0,0 +1,17 @@
 apiVersion: v1
 kind: Service
 metadata:
  name: node-exporter
  namespace: monitoring
 spec:
  type: ClusterIP
  # Headless service (no cluster IP): it exists so that Prometheus can
  # discover and scrape the node-exporter endpoints.
  clusterIP: None
  selector:
    name: node-exporter
    phase: prod
  ports:
    # The port name `metrics` is what the Prometheus `node-exporters` job
    # matches on when selecting endpoints.
    - name: metrics
      protocol: TCP
      port: 80
      targetPort: 9100
--- a/addons/prometheus/namespace.yaml
+++ b/addons/prometheus/namespace.yaml
@@ -0,0 +1,4 @@
 # Namespace that holds the Prometheus and node-exporter addon resources.
 apiVersion: v1
 kind: Namespace
 metadata:
  name: monitoring
--- a/addons/prometheus/rbac/cluster-role-binding.yaml
+++ b/addons/prometheus/rbac/cluster-role-binding.yaml
@@ -0,0 +1,12 @@
 # Grants the `prometheus` ClusterRole to the `default` ServiceAccount of the
 # `monitoring` namespace.
 apiVersion: rbac.authorization.k8s.io/v1beta1
 kind: ClusterRoleBinding
 metadata:
  name: prometheus
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
 subjects:
 - kind: ServiceAccount
  name: default
  namespace: monitoring
--- a/addons/prometheus/rbac/cluster-role.yaml
+++ b/addons/prometheus/rbac/cluster-role.yaml
@@ -0,0 +1,15 @@
 # Read-only access needed for Prometheus service discovery and for scraping
 # nodes through the apiserver proxy.
 apiVersion: rbac.authorization.k8s.io/v1beta1
 kind: ClusterRole
 metadata:
  name: prometheus
 rules:
 # Core API objects used by the node, service, endpoints, and pod discovery
 # roles; nodes/proxy covers the /api/v1/nodes/<name>/proxy/... scrape paths.
 - apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
 # Allow GET on the non-resource /metrics URL.
 - nonResourceURLs: ["/metrics"]
  verbs: ["get"]
--- a/addons/prometheus/rules.yaml
+++ b/addons/prometheus/rules.yaml
@@ -0,0 +1,8 @@
 # Prometheus rule files, mounted into the Prometheus pod at /etc/prometheus/rules.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: prometheus-rules
  namespace: monitoring
 data:
  # Each key becomes a file name under /etc/prometheus/rules. It must end in
  # `.rules` to match the `/etc/prometheus/rules/*.rules` glob in `rule_files`,
  # otherwise the rule is silently never loaded.
  example.rules: |
    job_service:rpc_durations_seconds_count:avg_rate5m = avg(rate(rpc_durations_seconds_count[5m])) by (job, service)
--- a/addons/prometheus/service.yaml
+++ b/addons/prometheus/service.yaml
@@ -0,0 +1,15 @@
 # Exposes the Prometheus web port inside the cluster on port 80.
 apiVersion: v1
 kind: Service
 metadata:
  name: prometheus
  namespace: monitoring
 spec:
  type: ClusterIP
  # Selects the pods created by the prometheus Deployment's template labels.
  selector:
    name: prometheus
    phase: prod
  ports:
    # Forward service port 80 to the container's web port 9090.
    - name: web
      protocol: TCP
      port: 80
      targetPort: 9090
--- a/docs/addons/overview.md
+++ b/docs/addons/overview.md
@@ -2,10 +2,10 @@
 Every Typhoon cluster is verified to work well with several post-install addons.
 * [CLUO](cluo.md) (Container Linux only)
 * Nginx [Ingress Controller](ingress.md)
 * [Heapster](heapster.md)
 * Kubernetes [Dashboard](dashboard.md)
-* [CLUO](cluo.md) (Container Linux only)
+* [Prometheus](prometheus.md)
 * Prometheus
 * Grafana
--- a/docs/addons/prometheus.md
+++ b/docs/addons/prometheus.md
@@ -0,0 +1,50 @@
 # Prometheus
 Prometheus collects metrics (e.g. `node_memory_usage_bytes`) from *targets* by scraping their HTTP metrics endpoints. Targets are organized into *jobs*, defined in the Prometheus config. Targets may expose counter, gauge, histogram, or summary metrics.
 Here's a simple config from the Prometheus [tutorial](https://prometheus.io/docs/introduction/getting_started/).
 ```
 global:
  scrape_interval: 15s
 scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: ['localhost:9090']
 ```
 On Kubernetes clusters, Prometheus is run as a Deployment, configured with a ConfigMap, and accessed via a Service or Ingress.
 ```
 kubectl apply -f addons/prometheus -R
 ```
 The ConfigMap configures Prometheus to target apiserver endpoints, node metrics, cAdvisor metrics, and exporters. By default, data is kept in an `emptyDir` so it is persisted until the pod is rescheduled.
 ### Exporters
 Exporters expose metrics for 3rd-party systems that don't natively expose Prometheus metrics.
 * [node_exporter](https://github.com/prometheus/node_exporter) - DaemonSet that exposes a machine's hardware and OS metrics
 * [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) - Deployment that exposes Kubernetes object metrics
 * [blackbox_exporter](https://github.com/prometheus/blackbox_exporter) - Scrapes HTTP, HTTPS, DNS, TCP, or ICMP endpoints and exposes availability as metrics
 ### Queries and Graphs
 Prometheus provides a simplistic UI for querying and graphing metrics. Use `kubectl` to authenticate to the apiserver and create a local port-forward to the Prometheus pod.
 ```
 kubectl get pods -n monitoring
 kubectl port-forward prometheus-POD-ID 9090 -n monitoring
 ```
 Visit [127.0.0.1:9090](http://127.0.0.1:9090) to query [expressions](http://127.0.0.1:9090/graph), view [targets](http://127.0.0.1:9090/targets), or check [alerts](http://127.0.0.1:9090/alerts).
 ![Prometheus Graph](/img/prometheus-graph.png)
 <br/>
 ![Prometheus Targets](/img/prometheus-targets.png)
 ### Visualization
 Grafana can be used to build dashboards and rich visualizations that use Prometheus as the datasource. Favor Grafana for these use cases and use the Prometheus UI for debugging or quickly checking available metrics.
--- a/docs/img/prometheus-graph.png
+++ b/docs/img/prometheus-graph.png
--- a/docs/img/prometheus-targets.png
+++ b/docs/img/prometheus-targets.png
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -46,10 +46,11 @@ pages:
  - 'Google Cloud': 'google-cloud.md'
  - 'Addons':
    - 'Overview': 'addons/overview.md'
    - 'Nginx Ingress': 'addons/ingress.md'
    - 'Heapster': 'addons/heapster.md'
    - 'Dashboard': 'addons/dashboard.md'
    - 'CLUO': 'addons/cluo.md'
    - 'Heapster': 'addons/heapster.md'
    - 'Nginx Ingress': 'addons/ingress.md'
    - 'Prometheus': 'addons/prometheus.md'
    - 'Dashboard': 'addons/dashboard.md'
  - 'Topics':
    - 'Hardware': 'topics/hardware.md'
    - 'Security': 'topics/security.md'