Mirror of https://github.com/puppetmaster/typhoon.git, synced 2025-08-02 13:21:34 +02:00.
Compare commits: 197 commits
.github/ISSUE_TEMPLATE.md (10 changed lines)

@@ -4,11 +4,11 @@
 ### Environment
 
-* Platform: aws, bare-metal, google-cloud, digital-ocean
-* OS: container-linux, fedora-atomic
-* Terraform: `terraform version`
-* Plugins: Provider plugin versions
-* Ref: Git SHA (if applicable)
+* Platform: aws, azure, bare-metal, google-cloud, digital-ocean
+* OS: container-linux, flatcar-linux, or fedora-atomic
+* Release: Typhoon version or Git SHA (reporting latest is **not** helpful)
+* Terraform: `terraform version` (reporting latest is **not** helpful)
+* Plugins: Provider plugin versions (reporting latest is **not** helpful)
 
 ### Problem
 
CHANGES.md (367 changed lines)

@@ -4,6 +4,373 @@ Notable changes between versions.

## Latest

## v1.13.4

* Kubernetes [v1.13.4](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.13.md#v1134)
* Update etcd from v3.3.11 to [v3.3.12](https://github.com/etcd-io/etcd/releases/tag/v3.3.12)
* Update Calico from v3.5.0 to [v3.5.2](https://docs.projectcalico.org/v3.5/releases/)
* Assign priorityClassNames to critical cluster and node components ([#406](https://github.com/poseidon/typhoon/pull/406)); a quick check is sketched below
* Inform node out-of-resource eviction and scheduler preemption and ordering
* Add CoreDNS readiness probe ([#410](https://github.com/poseidon/typhoon/pull/410))
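
The priorityClassName assignments can be verified on a running cluster with ordinary kubectl queries (a generic inspection sketch, not a Typhoon-specific command):

```sh
# List built-in and cluster-defined priority classes.
kubectl get priorityclasses
# Show which kube-system pods were assigned a priorityClassName.
kubectl -n kube-system get pods \
  -o custom-columns=NAME:.metadata.name,PRIORITY:.spec.priorityClassName
```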

#### Bare-Metal

* Recommend updating [terraform-provider-matchbox](https://github.com/coreos/terraform-provider-matchbox) plugin from v0.2.2 to [v0.2.3](https://github.com/coreos/terraform-provider-matchbox/releases/tag/v0.2.3) ([#402](https://github.com/poseidon/typhoon/pull/402))
* Improve docs on using Ubiquiti EdgeOS with bare-metal clusters ([#413](https://github.com/poseidon/typhoon/pull/413))

#### Google Cloud

* Support `terraform-provider-google` v2.0+ ([#407](https://github.com/poseidon/typhoon/pull/407))
* Require `terraform-provider-google` v1.19+ (**action required**)
* Set the minimum CPU platform to Intel Haswell ([#405](https://github.com/poseidon/typhoon/pull/405))
* Haswell or better is available in every zone (no price change)
* A few zones still default to Sandy/Ivy Bridge (shifts in April 2019)

#### Addons

* Modernize Prometheus rules and alerts ([#404](https://github.com/poseidon/typhoon/pull/404))
* Drop extraneous metrics ([#397](https://github.com/poseidon/typhoon/pull/397))
* Add `pod` name label to metrics discovered via service endpoints
* Rename `kubernetes_namespace` label to `namespace`
* Modernize Grafana and dashboards, see [docs](https://typhoon.psdn.io/addons/grafana/) ([#403](https://github.com/poseidon/typhoon/pull/403), [#404](https://github.com/poseidon/typhoon/pull/404))
* Upgrade Grafana from v5.4.3 to [v6.0.0](https://github.com/grafana/grafana/releases/tag/v6.0.0)!
* Enable Grafana [Explore](http://docs.grafana.org/guides/whats-new-in-v6-0/#explore) UI as a Viewer (inspect/edit without saving)
* Update nginx-ingress from v0.22.0 to v0.23.0
* Raise nginx-ingress liveness/readiness timeout to 5 seconds
* Remove nginx-ingress default-backend ([#401](https://github.com/poseidon/typhoon/pull/401))

#### Fedora Atomic

* Build Kubelet [system container](https://github.com/poseidon/system-containers) with buildah. The image is an OCI format and slightly larger.

## v1.13.3

* Kubernetes [v1.13.3](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.13.md#v1133)
* Update etcd from v3.3.10 to [v3.3.11](https://github.com/etcd-io/etcd/blob/master/CHANGELOG-3.3.md#v3311-2019-1-11)
* Update CoreDNS from v1.3.0 to [v1.3.1](https://coredns.io/2019/01/13/coredns-1.3.1-release/)
* Switch from the `proxy` plugin to the faster `forward` plugin for upstream resolvers
* Update Calico from v3.4.0 to [v3.5.0](https://docs.projectcalico.org/v3.5/releases/)
* Update flannel from v0.10.0 to [v0.11.0](https://github.com/coreos/flannel/releases/tag/v0.11.0)
* Reduce pod eviction timeout for deleting pods on unready nodes to 1 minute
* Respond more quickly to node preemption (previously 5 minutes)
* Fix automatic worker deletion on shutdown for cloud platforms
* Lowering Kubelet privileges in [#372](https://github.com/poseidon/typhoon/pull/372) dropped a needed node deletion authorization. Scale-in due to manual terraform apply (any cloud), AWS spot termination, or Azure low priority deletion left old nodes registered, requiring manual deletion (`kubectl delete node name`)

#### AWS

* Add `ingress_zone_id` output with the NLB DNS name's Route53 zone for use in alias records ([#380](https://github.com/poseidon/typhoon/pull/380))

#### Azure

* Fix azure provider warning, `public_ip` `allocation_method` replaces `public_ip_address_allocation`
* Require `terraform-provider-azurerm` v1.21+ (action required)

#### Addons

* Update nginx-ingress from v0.21.0 to v0.22.0
* Update Prometheus from v2.6.0 to v2.7.1
* Update kube-state-metrics from v1.4.0 to v1.5.0
* Fix ClusterRole to collect and export PodDisruptionBudget metrics ([#383](https://github.com/poseidon/typhoon/pull/383))
* Update node-exporter from v0.15.2 to v0.17.0
* Update Grafana from v5.4.2 to v5.4.3

## v1.13.2

* Kubernetes [v1.13.2](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.13.md#v1132)
* Add ServiceAccounts for `kube-apiserver` and `kube-scheduler` ([#370](https://github.com/poseidon/typhoon/pull/370))
* Use lower-privilege TLS client certificates for Kubelets ([#372](https://github.com/poseidon/typhoon/pull/372))
* Use HTTPS liveness probes for `kube-scheduler` and `kube-controller-manager` ([#377](https://github.com/poseidon/typhoon/pull/377))
* Update CoreDNS from v1.2.6 to [v1.3.0](https://coredns.io/2018/12/15/coredns-1.3.0-release/)
* Allow the `certificates.k8s.io` API to issue certificates signed by the cluster CA ([#376](https://github.com/poseidon/typhoon/pull/376))
* Configure controller manager to sign CSRs that are manually [approved](https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster) by an administrator (an approval flow is sketched below)
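
For reference, the manual approval flow uses standard kubectl commands (the CSR name here is illustrative):

```sh
# List pending certificate signing requests.
kubectl get csr
# Approve one by name; the controller manager then signs it with the cluster CA.
kubectl certificate approve node-csr-example
```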

#### AWS

* Change `controller_type` and `worker_type` default from t2.small to t3.small ([#365](https://github.com/poseidon/typhoon/pull/365))
* t3.small is cheaper, provides 2 vCPU (instead of 1), and 5 Gbps of pod-to-pod bandwidth!

#### Bare-Metal

* Remove the `kubeconfig` output variable

#### Addons

* Update Prometheus from v2.5.0 to v2.6.0

## v1.13.1

* Kubernetes [v1.13.1](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.13.md#v1131)
* Update Calico from v3.3.2 to [v3.4.0](https://docs.projectcalico.org/v3.4/releases/) ([#362](https://github.com/poseidon/typhoon/pull/362))
* Install CNI plugins with an init container rather than a sidecar
* Improve the `calico-node` ClusterRole
* Recommend updating `terraform-provider-ct` plugin from v0.2.1 to v0.3.0 ([#363](https://github.com/poseidon/typhoon/pull/363))
* [Migration](https://typhoon.psdn.io/topics/maintenance/#upgrade-terraform-provider-ct) instructions for upgrading `terraform-provider-ct` in-place for v1.12.2+ clusters (**action required**)
* [Require](https://typhoon.psdn.io/topics/maintenance/#terraform-plugins-directory) switching from `~/.terraformrc` to the Terraform [third-party plugins](https://www.terraform.io/docs/configuration/providers.html#third-party-plugins) directory `~/.terraform.d/plugins/` (see the sketch below)
* Require Container Linux 1688.5.3 or newer
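
A rough sketch of that plugin-directory migration (the download path and version are illustrative; binaries in `~/.terraform.d/plugins/` must be named `terraform-provider-NAME_vX.Y.Z`):

```sh
# Move the provider binary into the third-party plugins directory.
mkdir -p ~/.terraform.d/plugins
mv ~/Downloads/terraform-provider-ct ~/.terraform.d/plugins/terraform-provider-ct_v0.3.0
# Remove the providers entry from ~/.terraformrc, then re-initialize each cluster directory.
terraform init
```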

#### Google Cloud

* Increase TCP proxy apiserver backend service timeout from 1 minute to 5 minutes ([#361](https://github.com/poseidon/typhoon/pull/361))
* Align `port-forward` behavior closer to AWS/Azure (no timeout)

#### Addons

* Update Grafana from v5.4.0 to v5.4.2

## v1.13.0

* Kubernetes [v1.13.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.13.md#v1130)
* Update Calico from v3.3.1 to [v3.3.2](https://docs.projectcalico.org/v3.3/releases/)

#### Addons

* Update Grafana from v5.3.4 to v5.4.0
* Disable Grafana login form, since admin user can't be disabled ([#352](https://github.com/poseidon/typhoon/pull/352))
* Example manifests aim to provide a read-only dashboard view

## v1.12.3

* Kubernetes [v1.12.3](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.12.md#v1123)
* Add `enable_reporting` variable (default "false") to provide upstreams with usage data ([#345](https://github.com/poseidon/typhoon/pull/345))
* Change kube-apiserver `--kubelet-preferred-address-types` to InternalIP,ExternalIP,Hostname
* Update Calico from v3.3.0 to [v3.3.1](https://docs.projectcalico.org/v3.3/releases/)
* Disable Felix usage reporting by default ([#345](https://github.com/poseidon/typhoon/pull/345))
* Improve flannel manifests
* [Rename](https://github.com/poseidon/terraform-render-bootkube/commit/d045a8e6b8eccfbb9d69bb51953b5a93d23f67f7) `kube-flannel` DaemonSet to `flannel` and `kube-flannel-cfg` ConfigMap to `flannel-config`
* [Drop](https://github.com/poseidon/terraform-render-bootkube/commit/39f9afb3360ec642e5b98457c8bd07eda35b6c96) unused mounts and add a CPU resource request
* Update CoreDNS from v1.2.4 to [v1.2.6](https://coredns.io/2018/11/05/coredns-1.2.6-release/)
* Enable CoreDNS `loop` and `loadbalance` plugins ([#340](https://github.com/poseidon/typhoon/pull/340))
* Fix pod-checkpointer log noise and checkpointable pods detection ([#346](https://github.com/poseidon/typhoon/pull/346))
* Use kubernetes-incubator/bootkube v0.14.0
* [Recommend](https://typhoon.psdn.io/topics/maintenance/#terraform-plugins-directory) switching from `~/.terraformrc` to the Terraform [third-party plugins](https://www.terraform.io/docs/configuration/providers.html#third-party-plugins) directory `~/.terraform.d/plugins/`.
* Allows pinning `terraform-provider-ct` and `terraform-provider-matchbox` versions
* Improves safety of later plugin version migrations

#### Azure

* Use eviction policy `Delete` for `Low` priority virtual machine scale set workers ([#343](https://github.com/poseidon/typhoon/pull/343))
* Fix issue where Azure defaults to `Deallocate` eviction policy, which required manually restarting deallocated instances. `Delete` policy aligns Azure with AWS and GCP behavior.
* Require `terraform-provider-azurerm` v1.19+ (action required)

#### Bare-Metal

* Add Kubelet `/etc/iscsi` and `iscsiadm` mounts on bare-metal for iSCSI ([#103](https://github.com/poseidon/typhoon/pull/103))

#### Addons

* Update nginx-ingress from v0.20.0 to v0.21.0
* Update Prometheus from v2.4.3 to v2.5.0
* Update Grafana from v5.3.2 to v5.3.4

## v1.12.2

* Kubernetes [v1.12.2](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.12.md#v1122)
* Update CoreDNS from 1.2.2 to [1.2.4](https://github.com/coredns/coredns/releases/tag/v1.2.4)
* Update Calico from v3.2.3 to [v3.3.0](https://docs.projectcalico.org/v3.3/releases/)
* Disable Kubelet read-only port ([#324](https://github.com/poseidon/typhoon/pull/324))
* Fix CoreDNS AntiAffinity spec to prefer spreading replicas
* Ignore controller node user-data changes ([#335](https://github.com/poseidon/typhoon/pull/335))
* Once all managed clusters use v1.12.2, it is possible to update `terraform-provider-ct`

#### AWS

* Add `disk_iops` variable for EBS volume IOPS ([#314](https://github.com/poseidon/typhoon/pull/314))

#### Azure

* Use new `azurerm_network_interface_backend_address_pool_association` ([#332](https://github.com/poseidon/typhoon/pull/332))
* Require `terraform-provider-azurerm` v1.17+ (action required)
* Add `primary` field to `ip_configuration` needed by v1.17+ ([#331](https://github.com/poseidon/typhoon/pull/331))

#### DigitalOcean

* Add AAAA DNS records resolving to worker nodes ([#333](https://github.com/poseidon/typhoon/pull/333))
* Hosting IPv6 apps requires editing nginx-ingress with `hostNetwork: true` (a minimal patch is sketched below)
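
A minimal sketch of that edit, assuming the addon's `nginx-ingress-controller` Deployment in the `ingress` namespace:

```sh
# Move the ingress controller onto the host network so it can serve IPv6 traffic directly.
kubectl -n ingress patch deployment nginx-ingress-controller \
  --type merge -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}'
```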

#### Google Cloud

* Add an IPv6 address and IPv6 forwarding rules for load balancing IPv6 Ingress ([#334](https://github.com/poseidon/typhoon/pull/334))
* Add `ingress_static_ipv6` output variable for use in AAAA DNS records
* Allow serving IPv6 applications via Kubernetes Ingress

#### Addons

* Configure Heapster to scrape Kubelets with bearer token auth ([#323](https://github.com/poseidon/typhoon/pull/323))
* Update Grafana from v5.3.1 to v5.3.2

## v1.12.1

* Kubernetes [v1.12.1](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.12.md#v1121)
* Update etcd from v3.3.9 to [v3.3.10](https://github.com/etcd-io/etcd/blob/master/CHANGELOG-3.3.md#v3310-2018-10-10)
* Update CoreDNS from 1.1.3 to [1.2.2](https://github.com/coredns/coredns/releases/tag/v1.2.2)
* Update Calico from v3.2.1 to [v3.2.3](https://docs.projectcalico.org/v3.2/releases/)
* Raise scheduler and controller-manager replicas to the larger of 2 or the number of controller nodes ([#312](https://github.com/poseidon/typhoon/pull/312))
* Single-controller clusters continue to run 2 replicas as before
* Raise default CoreDNS replicas to the larger of 2 or the number of controller nodes ([#313](https://github.com/poseidon/typhoon/pull/313))
* Add AntiAffinity preferred rule to favor spreading CoreDNS pods
* Annotate control plane and addon containers to use the Docker runtime seccomp profile ([#319](https://github.com/poseidon/typhoon/pull/319))
* Override Kubernetes default behavior that starts containers with `seccomp=unconfined`

#### Azure

* Remove `admin_password` field (disabled) since it is now optional
* Require `terraform-provider-azurerm` v1.16+ (action required)

#### Bare-Metal

* Add support for `cached_install` mode with Flatcar Linux ([#315](https://github.com/poseidon/typhoon/pull/315))

#### DigitalOcean

* Require `terraform-provider-digitalocean` v1.0+ (action required)

#### Addons

* Update nginx-ingress from v0.19.0 to v0.20.0
* Update Prometheus from v2.3.2 to v2.4.3
* Update Grafana from v5.2.4 to v5.3.1

## v1.11.3

* Kubernetes [v1.11.3](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.11.md#v1113)
* Introduce Typhoon for Azure as alpha ([#288](https://github.com/poseidon/typhoon/pull/288))
* Special thanks @justaugustus for an earlier variant
* Update Calico from v3.1.3 to v3.2.1 ([#278](https://github.com/poseidon/typhoon/pull/278))

#### AWS

* Remove firewall rule allowing ICMP packets to nodes ([#285](https://github.com/poseidon/typhoon/pull/285))

#### Bare-Metal

* Remove `controller_networkds` and `worker_networkds` variables. Use Container Linux Config snippets [#277](https://github.com/poseidon/typhoon/pull/277)

#### Google Cloud

* Fix firewall to allow etcd client port 2379 traffic between controller nodes ([#287](https://github.com/poseidon/typhoon/pull/287))
* kube-apiservers were only able to connect to their node's local etcd peer. While master node outages were tolerated, reaching a healthy peer took longer than necessary in some cases
* Reduce time needed to bootstrap the cluster
* Remove firewall rule allowing workers to access Nginx Ingress health check ([#284](https://github.com/poseidon/typhoon/pull/284))
* Nginx Ingress addon no longer uses hostNetwork, Prometheus scrapes via CNI network

#### Addons

* Update nginx-ingress from 0.17.1 to 0.19.0
* Update kube-state-metrics from v1.3.1 to v1.4.0
* Update Grafana from 5.2.2 to 5.2.4

## v1.11.2

* Kubernetes [v1.11.2](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.11.md#v1112)
* Update etcd from v3.3.8 to [v3.3.9](https://github.com/coreos/etcd/blob/master/CHANGELOG-3.3.md#v339-2018-07-24)
* Use kubernetes-incubator/bootkube v0.13.0
* Fix Fedora Atomic modules' Kubelet version ([#270](https://github.com/poseidon/typhoon/issues/270))

#### Bare-Metal

* Introduce [Container Linux Config snippets](https://typhoon.psdn.io/advanced/customization/#container-linux) on bare-metal
* Validate and additively merge custom Container Linux Configs during terraform plan
* Define files, systemd units, dropins, networkd configs, mounts, users, and more
* [Require](https://typhoon.psdn.io/cl/bare-metal/#terraform-setup) `terraform-provider-ct` plugin v0.2.1 (**action required!**)

#### Addons

* Update nginx-ingress from 0.16.2 to 0.17.1
* Add nginx-ingress manifests for bare-metal
* Update Grafana from 5.2.1 to 5.2.2
* Update heapster from v1.5.3 to v1.5.4

## v1.11.1

* Kubernetes [v1.11.1](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.11.md#v1111)

#### Addons

* Update Prometheus from v2.3.1 to v2.3.2

#### Errata

* Fedora Atomic modules shipped with Kubelet v1.11.0, instead of v1.11.1. Fixed in [#270](https://github.com/poseidon/typhoon/issues/270).

## v1.11.0

* Kubernetes [v1.11.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.11.md#v1110)
* Force apiserver to stop listening on `127.0.0.1:8080`
* Replace `kube-dns` with [CoreDNS](https://coredns.io/) ([#261](https://github.com/poseidon/typhoon/pull/261))
* Edit the `coredns` ConfigMap to [customize](https://coredns.io/plugins/) (see the example below)
* CoreDNS doesn't use a resizer. For large clusters, scaling may be required.
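
For example, plugin customization happens by editing the ConfigMap in place; the running coredns pods pick up the change:

```sh
# Open the CoreDNS Corefile for editing in the kube-system namespace.
kubectl -n kube-system edit configmap coredns
```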

#### AWS

* Update from Fedora Atomic 27 to 28 ([#258](https://github.com/poseidon/typhoon/pull/258))

#### Bare-Metal

* Update from Fedora Atomic 27 to 28 ([#263](https://github.com/poseidon/typhoon/pull/263))

#### Google

* Promote Google Cloud to stable
* Update from Fedora Atomic 27 to 28 ([#259](https://github.com/poseidon/typhoon/pull/259))
* Remove `ingress_static_ip` module output. Use `ingress_static_ipv4`.
* Remove `controllers_ipv4_public` module output.

#### Addons

* Update nginx-ingress from 0.15.0 to 0.16.2
* Update Grafana from 5.1.4 to [5.2.1](http://docs.grafana.org/guides/whats-new-in-v5-2/)
* Update heapster from v1.5.2 to v1.5.3

## v1.10.5

* Kubernetes [v1.10.5](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1105)
* Update etcd from v3.3.6 to v3.3.8 ([#243](https://github.com/poseidon/typhoon/pull/243), [#247](https://github.com/poseidon/typhoon/pull/247))

#### AWS

* Switch `kube-apiserver` port from 443 to 6443 ([#248](https://github.com/poseidon/typhoon/pull/248))
* Combine apiserver and ingress NLBs ([#249](https://github.com/poseidon/typhoon/pull/249))
* Reduce cost by ~$18/month per cluster. Typhoon AWS clusters now use one network load balancer.
* Ingress addon users may keep using CNAME records to the `ingress_dns_name` module output (few million RPS)
* Ingress users with heavy traffic (many million RPS) should create a separate NLB(s)
* Worker pools no longer include an extraneous load balancer. Remove worker module's `ingress_dns_name` output
* Disable detailed (paid) monitoring on worker nodes ([#251](https://github.com/poseidon/typhoon/pull/251))
* Favor Prometheus for cloud-agnostic metrics, aggregation, and alerting
* Add `worker_target_group_http` and `worker_target_group_https` module outputs to allow custom load balancing
* Add `target_group_http` and `target_group_https` worker module outputs to allow custom load balancing

#### Bare-Metal

* Switch `kube-apiserver` port from 443 to 6443 ([#248](https://github.com/poseidon/typhoon/pull/248))
* Users who exposed kube-apiserver on a WAN via their router/load-balancer will need to adjust its configuration (e.g. DNAT 6443). Most apiservers are on a LAN (internal, VPN-only, etc) so if you didn't specially configure network gear for 443, no change is needed. (possible action required; an illustrative rule is sketched below)
* Fix possible deadlock when provisioning clusters larger than 10 nodes ([#244](https://github.com/poseidon/typhoon/pull/244))
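
For illustration only (router syntax varies; this is a generic iptables DNAT rule, not Typhoon configuration, and the interface and address are placeholders):

```sh
# Forward WAN port 6443 to a controller's kube-apiserver.
iptables -t nat -A PREROUTING -i "$WAN_IF" -p tcp --dport 6443 \
  -j DNAT --to-destination "$CONTROLLER_IP:6443"
```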

#### DigitalOcean

* Switch `kube-apiserver` port from 443 to 6443 ([#248](https://github.com/poseidon/typhoon/pull/248))
* Update firewall rules and generated kubeconfigs

#### Google Cloud

* Use global HTTP and TCP proxy load balancing for Kubernetes Ingress ([#252](https://github.com/poseidon/typhoon/pull/252))
* Switch Ingress from regional network load balancers to global HTTP/TCP Proxy load balancing
* Reduce cost by ~$19/month per cluster. Google bills the first 5 global and regional forwarding rules separately. Typhoon clusters now use 3 global and 0 regional forwarding rules.
* Worker pools no longer include an extraneous load balancer. Remove worker module's `ingress_static_ip` output
* Allow using nginx-ingress addon on Fedora Atomic clusters ([#200](https://github.com/poseidon/typhoon/issues/200))
* Add `worker_instance_group` module output to allow custom global load balancing
* Add `instance_group` worker module output to allow custom global load balancing
* Deprecate `ingress_static_ip` module output. Add `ingress_static_ipv4` module output instead.
* Deprecate `controllers_ipv4_public` module output

#### Addons

* Update CLUO from v0.6.0 to v0.7.0 ([#242](https://github.com/poseidon/typhoon/pull/242))
* Update Prometheus from v2.3.0 to v2.3.1
* Update Grafana from 5.1.3 to 5.1.4
* Drop `hostNetwork` from nginx-ingress addon
* Both flannel and Calico support host port via `portmap`
* Allows writing NetworkPolicies that reference ingress pods in `from` or `to`. HostNetwork pods were difficult to write network policy for since they could circumvent the CNI network to communicate with pods on the same node.

## v1.10.4

* Kubernetes [v1.10.4](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1104)

README.md (49 changed lines)

@@ -11,34 +11,38 @@ Typhoon distributes upstream Kubernetes, architectural conventions, and cluster
 ## Features <a href="https://www.cncf.io/certification/software-conformance/"><img align="right" src="https://storage.googleapis.com/poseidon/certified-kubernetes.png"></a>
 
-* Kubernetes v1.10.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
-* Single or multi-master, workloads isolated on workers, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
+* Kubernetes v1.13.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
+* Single or multi-master, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
 * On-cluster etcd with TLS, [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/)-enabled, [network policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
-* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/) and [preemption](https://typhoon.psdn.io/google-cloud/#preemption) (varies by platform)
-* Ready for Ingress, Prometheus, Grafana, and other optional [addons](https://typhoon.psdn.io/addons/overview/)
+* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/), [preemptible](https://typhoon.psdn.io/cl/google-cloud/#preemption) workers, and [snippets](https://typhoon.psdn.io/advanced/customization/#container-linux) customization
+* Ready for Ingress, Prometheus, Grafana, CSI, or other [addons](https://typhoon.psdn.io/addons/overview/)
 
 ## Modules
 
-Typhoon provides a Terraform Module for each supported operating system and platform.
+Typhoon provides a Terraform Module for each supported operating system and platform. Container Linux is a mature and reliable choice. Also, Kinvolk's Flatcar Linux fork is selectable on AWS and bare-metal.
 
 | Platform | Operating System | Terraform Module | Status |
 |---------------|------------------|------------------|--------|
 | AWS | Container Linux | [aws/container-linux/kubernetes](aws/container-linux/kubernetes) | stable |
-| AWS | Fedora Atomic | [aws/fedora-atomic/kubernetes](aws/fedora-atomic/kubernetes) | alpha |
+| Azure | Container Linux | [azure/container-linux/kubernetes](cl/azure.md) | alpha |
 | Bare-Metal | Container Linux | [bare-metal/container-linux/kubernetes](bare-metal/container-linux/kubernetes) | stable |
-| Bare-Metal | Fedora Atomic | [bare-metal/fedora-atomic/kubernetes](bare-metal/fedora-atomic/kubernetes) | alpha |
 | Digital Ocean | Container Linux | [digital-ocean/container-linux/kubernetes](digital-ocean/container-linux/kubernetes) | beta |
-| Digital Ocean | Fedora Atomic | [digital-ocean/fedora-atomic/kubernetes](digital-ocean/fedora-atomic/kubernetes) | alpha |
-| Google Cloud | Container Linux | [google-cloud/container-linux/kubernetes](google-cloud/container-linux/kubernetes) | beta |
-| Google Cloud | Fedora Atomic | [google-cloud/fedora-atomic/kubernetes](google-cloud/fedora-atomic/kubernetes) | alpha |
+| Google Cloud | Container Linux | [google-cloud/container-linux/kubernetes](google-cloud/container-linux/kubernetes) | stable |
 
-The AWS and bare-metal `container-linux` modules allow picking Red Hat Container Linux (formerly CoreOS Container Linux) or Kinvolk's Flatcar Linux friendly fork.
+Fedora Atomic support is alpha and will evolve as Fedora Atomic is replaced by Fedora CoreOS.
+
+| Platform | Operating System | Terraform Module | Status |
+|---------------|------------------|------------------|--------|
+| AWS | Fedora Atomic | [aws/fedora-atomic/kubernetes](aws/fedora-atomic/kubernetes) | alpha |
+| Bare-Metal | Fedora Atomic | [bare-metal/fedora-atomic/kubernetes](bare-metal/fedora-atomic/kubernetes) | alpha |
+| Digital Ocean | Fedora Atomic | [digital-ocean/fedora-atomic/kubernetes](digital-ocean/fedora-atomic/kubernetes) | alpha |
+| Google Cloud | Fedora Atomic | [google-cloud/fedora-atomic/kubernetes](google-cloud/fedora-atomic/kubernetes) | alpha |
 
 ## Documentation
 
 * [Docs](https://typhoon.psdn.io)
 * Architecture [concepts](https://typhoon.psdn.io/architecture/concepts/) and [operating systems](https://typhoon.psdn.io/architecture/operating-systems/)
-* Tutorials for [AWS](https://typhoon.psdn.io/cl/aws/), [Bare-Metal](https://typhoon.psdn.io/cl/bare-metal/), [Digital Ocean](https://typhoon.psdn.io/cl/digital-ocean/), and [Google-Cloud](https://typhoon.psdn.io/cl/google-cloud/)
+* Tutorials for [AWS](docs/cl/aws.md), [Azure](docs/cl/azure.md), [Bare-Metal](docs/cl/bare-metal.md), [Digital Ocean](docs/cl/digital-ocean.md), and [Google-Cloud](docs/cl/google-cloud.md)
 
 ## Usage
 
@@ -46,7 +50,7 @@ Define a Kubernetes cluster by using the Terraform module for your chosen platfo
 ```tf
 module "google-cloud-yavin" {
-  source = "git::https://github.com/poseidon/typhoon//google-cloud/container-linux/kubernetes?ref=v1.10.4"
+  source = "git::https://github.com/poseidon/typhoon//google-cloud/container-linux/kubernetes?ref=v1.13.4"
 
   providers = {
     google = "google.default"
@@ -71,15 +75,14 @@ module "google-cloud-yavin" {
 }
 ```
 
-Fetch modules, plan the changes to be made, and apply the changes.
+Initialize modules, plan the changes to be made, and apply the changes.
 
 ```sh
 $ terraform init
-$ terraform get --update
 $ terraform plan
-Plan: 37 to add, 0 to change, 0 to destroy.
+Plan: 64 to add, 0 to change, 0 to destroy.
 $ terraform apply
-Apply complete! Resources: 37 added, 0 changed, 0 destroyed.
+Apply complete! Resources: 64 added, 0 changed, 0 destroyed.
 ```
 
 In 4-8 minutes (varies by platform), the cluster will be ready. This Google Cloud example creates a `yavin.example.com` DNS record to resolve to a network load balancer across controller nodes.
@@ -87,10 +90,10 @@ In 4-8 minutes (varies by platform), the cluster will be ready. This Google Clou
 ```sh
 $ export KUBECONFIG=/home/user/.secrets/clusters/yavin/auth/kubeconfig
 $ kubectl get nodes
-NAME                                          STATUS  AGE  VERSION
-yavin-controller-0.c.example-com.internal     Ready   6m   v1.10.4
-yavin-worker-jrbf.c.example-com.internal      Ready   5m   v1.10.4
-yavin-worker-mzdm.c.example-com.internal      Ready   5m   v1.10.4
+NAME                                          ROLES              STATUS  AGE  VERSION
+yavin-controller-0.c.example-com.internal     controller,master  Ready   6m   v1.13.4
+yavin-worker-jrbf.c.example-com.internal      node               Ready   5m   v1.13.4
+yavin-worker-mzdm.c.example-com.internal      node               Ready   5m   v1.13.4
 ```
 
 List the pods.
@@ -101,16 +104,18 @@ NAMESPACE NAME READY STATUS RESTART
 kube-system   calico-node-1cs8z                         2/2   Running   0   6m
 kube-system   calico-node-d1l5b                         2/2   Running   0   6m
 kube-system   calico-node-sp9ps                         2/2   Running   0   6m
+kube-system   coredns-1187388186-zj5dl                  1/1   Running   0   6m
+kube-system   coredns-1187388186-dkh3o                  1/1   Running   0   6m
 kube-system   kube-apiserver-zppls                      1/1   Running   0   6m
 kube-system   kube-controller-manager-3271970485-gh9kt  1/1   Running   0   6m
 kube-system   kube-controller-manager-3271970485-h90v8  1/1   Running   1   6m
-kube-system   kube-dns-1187388186-zj5dl                 3/3   Running   0   6m
 kube-system   kube-proxy-117v6                          1/1   Running   0   6m
 kube-system   kube-proxy-9886n                          1/1   Running   0   6m
 kube-system   kube-proxy-njn47                          1/1   Running   0   6m
 kube-system   kube-scheduler-3895335239-5x87r           1/1   Running   0   6m
 kube-system   kube-scheduler-3895335239-bzrrt           1/1   Running   1   6m
 kube-system   pod-checkpointer-l6lrt                    1/1   Running   0   6m
+kube-system   pod-checkpointer-l6lrt-controller-0       1/1   Running   0   6m
 ```
 
 ## Non-Goals

@@ -15,10 +15,12 @@ spec:
     metadata:
       labels:
         app: container-linux-update-agent
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       containers:
       - name: update-agent
-        image: quay.io/coreos/container-linux-update-operator:v0.6.0
+        image: quay.io/coreos/container-linux-update-operator:v0.7.0
         command:
         - "/bin/update-agent"
         volumeMounts:

@@ -12,10 +12,12 @@ spec:
     metadata:
       labels:
         app: container-linux-update-operator
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       containers:
       - name: update-operator
-        image: quay.io/coreos/container-linux-update-operator:v0.6.0
+        image: quay.io/coreos/container-linux-update-operator:v0.7.0
         command:
         - "/bin/update-operator"
         env:

addons/grafana/config.yaml (new file, 36 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  custom.ini: |+
    [server]
    http_port = 8080

    [paths]
    data = /var/lib/grafana
    plugins = /var/lib/grafana/plugins
    provisioning = /etc/grafana/provisioning

    [users]
    allow_sign_up = false
    allow_org_create = false
    # viewers can edit/inspect, but not save
    viewers_can_edit = true

    # Disable login form, since Grafana always creates an admin user
    [auth]
    disable_login_form = true

    # Disable the user/pass login system
    [auth.basic]
    enabled = false

    # Allow anonymous authentication with view-only authorization
    [auth.anonymous]
    enabled = true
    org_role = Viewer

    [analytics]
    reporting_enabled = false
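
With anonymous, view-only access configured above, a common way to reach the UI locally is a plain port-forward (assumes the Grafana Deployment in the `monitoring` namespace and the 8080 port set in `custom.ini`):

```sh
# Forward local port 8080 to the Grafana pod, then browse http://127.0.0.1:8080
kubectl -n monitoring port-forward deployment/grafana 8080
```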

File diff suppressed because it is too large.

@@ -10,7 +10,15 @@ data:
     - name: prometheus
       type: prometheus
       access: proxy
-      orgId: 1
       url: http://prometheus.monitoring.svc.cluster.local
       version: 1
       editable: false
+  loki.yaml: |+
+    apiVersion: 1
+    datasources:
+    - name: loki
+      type: loki
+      access: proxy
+      url: http://loki.monitoring.svc.cluster.local
+      version: 1
+      editable: false

@@ -18,21 +18,15 @@ spec:
       labels:
         name: grafana
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       containers:
       - name: grafana
-        image: grafana/grafana:5.1.3
+        image: grafana/grafana:6.0.0
         env:
-        - name: GF_SERVER_HTTP_PORT
-          value: "8080"
-        - name: GF_AUTH_BASIC_ENABLED
-          value: "false"
-        - name: GF_AUTH_ANONYMOUS_ENABLED
-          value: "true"
-        - name: GF_AUTH_ANONYMOUS_ORG_ROLE
-          value: Viewer
-        - name: GF_ANALYTICS_REPORTING_ENABLED
-          value: "false"
+        - name: GF_PATHS_CONFIG
+          value: "/etc/grafana/custom.ini"
         ports:
         - name: http
           containerPort: 8080

@@ -44,19 +38,24 @@ spec:
             memory: 200Mi
             cpu: 200m
         volumeMounts:
+        - name: config
+          mountPath: /etc/grafana
         - name: datasources
           mountPath: /etc/grafana/provisioning/datasources
-        - name: dashboard-providers
+        - name: providers
           mountPath: /etc/grafana/provisioning/dashboards
         - name: dashboards
-          mountPath: /var/lib/grafana/dashboards
+          mountPath: /etc/grafana/dashboards
       volumes:
+      - name: config
+        configMap:
+          name: grafana-config
       - name: datasources
         configMap:
           name: grafana-datasources
-      - name: dashboard-providers
+      - name: providers
         configMap:
-          name: grafana-dashboard-providers
+          name: grafana-providers
       - name: dashboards
         configMap:
           name: grafana-dashboards

@@ -1,10 +1,10 @@
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: grafana-dashboard-providers
+  name: grafana-providers
   namespace: monitoring
 data:
-  dashboard-providers.yaml: |+
+  providers.yaml: |+
     apiVersion: 1
     providers:
     - name: 'default'

@@ -12,4 +12,4 @@ data:
       folder: ''
       type: file
       options:
-        path: /var/lib/grafana/dashboards
+        path: /etc/grafana/dashboards

@@ -5,7 +5,7 @@ metadata:
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: system:heapster
+  name: heapster
 subjects:
 - kind: ServiceAccount
   name: heapster

addons/heapster/cluster-role.yaml (new file, 30 lines)

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: heapster
rules:
  - apiGroups:
      - ""
    resources:
      - events
      - namespaces
      - nodes
      - pods
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
    resources:
      - deployments
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes/stats
    verbs:
      - get

@@ -14,14 +14,16 @@ spec:
       labels:
         name: heapster
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       serviceAccountName: heapster
       containers:
       - name: heapster
-        image: k8s.gcr.io/heapster-amd64:v1.5.2
+        image: k8s.gcr.io/heapster-amd64:v1.5.4
         command:
         - /heapster
-        - --source=kubernetes.summary_api:''
+        - --source=kubernetes.summary_api:''?useServiceAccount=true&kubeletHttps=true&kubeletPort=10250&insecure=true
         livenessProbe:
           httpGet:
             path: /healthz

@@ -1,40 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      name: default-backend
-      phase: prod
-  template:
-    metadata:
-      labels:
-        name: default-backend
-        phase: prod
-    spec:
-      containers:
-      - name: default-backend
-        # Any image is permissable as long as:
-        # 1. It serves a 404 page at /
-        # 2. It serves 200 on a /healthz endpoint
-        image: k8s.gcr.io/defaultbackend:1.4
-        ports:
-        - containerPort: 8080
-        resources:
-          limits:
-            cpu: 10m
-            memory: 20Mi
-          requests:
-            cpu: 10m
-            memory: 20Mi
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 8080
-            scheme: HTTP
-          initialDelaySeconds: 30
-          timeoutSeconds: 5
-      terminationGracePeriodSeconds: 60

@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  type: ClusterIP
-  selector:
-    name: default-backend
-    phase: prod
-  ports:
-  - name: http
-    protocol: TCP
-    port: 80
-    targetPort: 8080

@@ -17,16 +17,16 @@ spec:
       labels:
         name: nginx-ingress-controller
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       nodeSelector:
         node-role.kubernetes.io/node: ""
-      hostNetwork: true
       containers:
       - name: nginx-ingress-controller
-        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.15.0
+        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.23.0
         args:
         - /nginx-ingress-controller
-        - --default-backend-service=$(POD_NAMESPACE)/default-backend
         - --ingress-class=public
         # use downward API
         env:

@@ -57,7 +57,7 @@ spec:
           initialDelaySeconds: 10
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         readinessProbe:
           failureThreshold: 3
           httpGet:

@@ -66,8 +66,13 @@ spec:
             scheme: HTTP
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         securityContext:
-          runAsNonRoot: false
+          capabilities:
+            add:
+            - NET_BIND_SERVICE
+            drop:
+            - ALL
+          runAsUser: 33 # www-data
       restartPolicy: Always
       terminationGracePeriodSeconds: 60

addons/nginx-ingress/azure/0-namespace.yaml (new file, 6 lines)

apiVersion: v1
kind: Namespace
metadata:
  name: ingress
  labels:
    name: ingress

addons/nginx-ingress/azure/deployment.yaml (new file, 78 lines)

apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-ingress-controller
  namespace: ingress
spec:
  replicas: 2
  strategy:
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      name: nginx-ingress-controller
      phase: prod
  template:
    metadata:
      labels:
        name: nginx-ingress-controller
        phase: prod
      annotations:
        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
    spec:
      nodeSelector:
        node-role.kubernetes.io/node: ""
      containers:
      - name: nginx-ingress-controller
        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.23.0
        args:
        - /nginx-ingress-controller
        - --ingress-class=public
        # use downward API
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        ports:
        - name: http
          containerPort: 80
          hostPort: 80
        - name: https
          containerPort: 443
          hostPort: 443
        - name: health
          containerPort: 10254
          hostPort: 10254
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 5
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 5
        securityContext:
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - ALL
          runAsUser: 33 # www-data
      restartPolicy: Always
      terminationGracePeriodSeconds: 60

addons/nginx-ingress/azure/rbac/cluster-role-binding.yaml (new file, 12 lines)

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ingress
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ingress
subjects:
- kind: ServiceAccount
  namespace: ingress
  name: default

addons/nginx-ingress/azure/rbac/cluster-role.yaml (new file, 51 lines)

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: ingress
rules:
  - apiGroups:
      - ""
    resources:
      - configmaps
      - endpoints
      - nodes
      - pods
      - secrets
    verbs:
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
  - apiGroups:
      - ""
    resources:
      - services
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - "extensions"
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
  - apiGroups:
      - "extensions"
    resources:
      - ingresses/status
    verbs:
      - update

addons/nginx-ingress/azure/rbac/role-binding.yaml (new file, 13 lines)

apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ingress
  namespace: ingress
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ingress
subjects:
- kind: ServiceAccount
  namespace: ingress
  name: default
41  addons/nginx-ingress/azure/rbac/role.yaml  Normal file
@@ -0,0 +1,41 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: ingress
  namespace: ingress
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - pods
  - secrets
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - configmaps
  resourceNames:
  # Defaults to "<election-id>-<ingress-class>"
  # Here: "<ingress-controller-leader>-<nginx>"
  # This has to be adapted if you change either parameter
  # when launching the nginx-ingress-controller.
  - "ingress-controller-leader-public"
  verbs:
  - get
  - update
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - endpoints
  verbs:
  - get
  - create
  - update
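The quoted configmap is the leader-election lock that controller replicas update, so this RBAC entry has to track the controller flags. A rough sketch of how the name is derived, assuming the upstream controller's --election-id flag and its default value (neither appears in this diff):

    # hypothetical controller args, only to illustrate "<election-id>-<ingress-class>"
    args:
    - /nginx-ingress-controller
    - --ingress-class=public                    # supplies the "-public" suffix
    - --election-id=ingress-controller-leader   # assumed default election id
    # resulting lock configmap: ingress-controller-leader-public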
22  addons/nginx-ingress/azure/service.yaml  Normal file
@@ -0,0 +1,22 @@
apiVersion: v1
kind: Service
metadata:
  name: nginx-ingress-controller
  namespace: ingress
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '10254'
spec:
  type: ClusterIP
  selector:
    name: nginx-ingress-controller
    phase: prod
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 80
    - name: https
      protocol: TCP
      port: 443
      targetPort: 443
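The prometheus.io/scrape and prometheus.io/port annotations are what the Prometheus addon's kubernetes-service-endpoints job keys on (see the prometheus.yaml changes further down): annotated Services are kept and scraped on the given port. A minimal sketch of the relabeling that honors them, assuming the conventional annotation-driven job definition:

    relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      action: keep
      regex: 'true'
    - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__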
6  addons/nginx-ingress/bare-metal/0-namespace.yaml  Normal file
@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
  name: ingress
  labels:
    name: ingress
74  addons/nginx-ingress/bare-metal/deployment.yaml  Normal file
@@ -0,0 +1,74 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ingress-controller-public
  namespace: ingress
spec:
  replicas: 2
  strategy:
    rollingUpdate:
      maxUnavailable: 1
  selector:
    matchLabels:
      name: ingress-controller-public
      phase: prod
  template:
    metadata:
      labels:
        name: ingress-controller-public
        phase: prod
      annotations:
        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
    spec:
      containers:
      - name: nginx-ingress-controller
        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.23.0
        args:
        - /nginx-ingress-controller
        - --ingress-class=public
        # use downward API
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        ports:
        - name: http
          containerPort: 80
        - name: https
          containerPort: 443
        - name: health
          containerPort: 10254
        livenessProbe:
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
          timeoutSeconds: 5
        readinessProbe:
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
          timeoutSeconds: 5
        securityContext:
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - ALL
          runAsUser: 33 # www-data
      restartPolicy: Always
      terminationGracePeriodSeconds: 60
12  addons/nginx-ingress/bare-metal/rbac/cluster-role-binding.yaml  Normal file
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ingress
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ingress
subjects:
- kind: ServiceAccount
  namespace: ingress
  name: default
51  addons/nginx-ingress/bare-metal/rbac/cluster-role.yaml  Normal file
@@ -0,0 +1,51 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: ingress
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - endpoints
  - nodes
  - pods
  - secrets
  verbs:
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - services
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - "extensions"
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - "extensions"
  resources:
  - ingresses/status
  verbs:
  - update
13  addons/nginx-ingress/bare-metal/rbac/role-binding.yaml  Normal file
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: ingress
  namespace: ingress
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ingress
subjects:
- kind: ServiceAccount
  namespace: ingress
  name: default
41  addons/nginx-ingress/bare-metal/rbac/role.yaml  Normal file
@@ -0,0 +1,41 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: ingress
  namespace: ingress
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - pods
  - secrets
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - configmaps
  resourceNames:
  # Defaults to "<election-id>-<ingress-class>"
  # Here: "<ingress-controller-leader>-<nginx>"
  # This has to be adapted if you change either parameter
  # when launching the nginx-ingress-controller.
  - "ingress-controller-leader-public"
  verbs:
  - get
  - update
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - endpoints
  verbs:
  - get
  - create
  - update
23  addons/nginx-ingress/bare-metal/service.yaml  Normal file
@@ -0,0 +1,23 @@
apiVersion: v1
kind: Service
metadata:
  name: ingress-controller-public
  namespace: ingress
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '10254'
spec:
  type: ClusterIP
  clusterIP: 10.3.0.12
  selector:
    name: ingress-controller-public
    phase: prod
  ports:
    - name: http
      protocol: TCP
      port: 80
      targetPort: 80
    - name: https
      protocol: TCP
      port: 443
      targetPort: 443
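Unlike the Azure variant, the bare-metal Service pins clusterIP: 10.3.0.12, a fixed address in the cluster's service CIDR (10.3.0.0/16 by default in Typhoon), so external DNS or load balancers can target a stable IP. The controller only serves Ingress objects of its class (--ingress-class=public); a minimal hypothetical Ingress it would pick up, with the name and host invented for illustration:

    apiVersion: extensions/v1beta1
    kind: Ingress
    metadata:
      name: example
      namespace: default
      annotations:
        kubernetes.io/ingress.class: "public"
    spec:
      rules:
      - host: app.example.com
        http:
          paths:
          - backend:
              serviceName: example
              servicePort: 80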
@@ -17,16 +17,16 @@ spec:
       labels:
         name: nginx-ingress-controller
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       nodeSelector:
         node-role.kubernetes.io/node: ""
-      hostNetwork: true
       containers:
       - name: nginx-ingress-controller
-        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.15.0
+        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.23.0
         args:
         - /nginx-ingress-controller
-        - --default-backend-service=$(POD_NAMESPACE)/default-backend
         - --ingress-class=public
         # use downward API
         env:
@@ -57,7 +57,7 @@ spec:
           initialDelaySeconds: 10
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         readinessProbe:
           failureThreshold: 3
           httpGet:
@@ -66,8 +66,13 @@ spec:
             scheme: HTTP
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         securityContext:
-          runAsNonRoot: false
+          capabilities:
+            add:
+            - NET_BIND_SERVICE
+            drop:
+            - ALL
+          runAsUser: 33 # www-data
       restartPolicy: Always
       terminationGracePeriodSeconds: 60
@@ -1,40 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      name: default-backend
-      phase: prod
-  template:
-    metadata:
-      labels:
-        name: default-backend
-        phase: prod
-    spec:
-      containers:
-      - name: default-backend
-        # Any image is permissable as long as:
-        # 1. It serves a 404 page at /
-        # 2. It serves 200 on a /healthz endpoint
-        image: k8s.gcr.io/defaultbackend:1.4
-        ports:
-        - containerPort: 8080
-        resources:
-          limits:
-            cpu: 10m
-            memory: 20Mi
-          requests:
-            cpu: 10m
-            memory: 20Mi
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 8080
-            scheme: HTTP
-          initialDelaySeconds: 30
-          timeoutSeconds: 5
-      terminationGracePeriodSeconds: 60
@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  type: ClusterIP
-  selector:
-    name: default-backend
-    phase: prod
-  ports:
-  - name: http
-    protocol: TCP
-    port: 80
-    targetPort: 8080
@@ -1,40 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      name: default-backend
-      phase: prod
-  template:
-    metadata:
-      labels:
-        name: default-backend
-        phase: prod
-    spec:
-      containers:
-      - name: default-backend
-        # Any image is permissable as long as:
-        # 1. It serves a 404 page at /
-        # 2. It serves 200 on a /healthz endpoint
-        image: k8s.gcr.io/defaultbackend:1.4
-        ports:
-        - containerPort: 8080
-        resources:
-          limits:
-            cpu: 10m
-            memory: 20Mi
-          requests:
-            cpu: 10m
-            memory: 20Mi
-        livenessProbe:
-          httpGet:
-            path: /healthz
-            port: 8080
-            scheme: HTTP
-          initialDelaySeconds: 30
-          timeoutSeconds: 5
-      terminationGracePeriodSeconds: 60
@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: default-backend
-  namespace: ingress
-spec:
-  type: ClusterIP
-  selector:
-    name: default-backend
-    phase: prod
-  ports:
-  - name: http
-    protocol: TCP
-    port: 80
-    targetPort: 8080
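With the separate default-backend Deployments and Services deleted above and the --default-backend-service flag dropped from the controller args, 404 and /healthz handling falls back to the default backend built into the newer controller releases this change upgrades to. Restoring the previous behavior would mean re-adding both pieces, roughly (flag shown as it appeared before this change):

    args:
    - /nginx-ingress-controller
    - --ingress-class=public
    - --default-backend-service=$(POD_NAMESPACE)/default-backend   # requires the deleted Deployment/Service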
@@ -17,16 +17,16 @@ spec:
       labels:
         name: nginx-ingress-controller
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       nodeSelector:
         node-role.kubernetes.io/node: ""
-      hostNetwork: true
       containers:
       - name: nginx-ingress-controller
-        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.15.0
+        image: quay.io/kubernetes-ingress-controller/nginx-ingress-controller:0.23.0
         args:
         - /nginx-ingress-controller
-        - --default-backend-service=$(POD_NAMESPACE)/default-backend
         - --ingress-class=public
         # use downward API
         env:
@@ -57,7 +57,7 @@ spec:
           initialDelaySeconds: 10
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         readinessProbe:
           failureThreshold: 3
           httpGet:
@@ -66,8 +66,13 @@ spec:
             scheme: HTTP
           periodSeconds: 10
           successThreshold: 1
-          timeoutSeconds: 1
+          timeoutSeconds: 5
         securityContext:
-          runAsNonRoot: false
+          capabilities:
+            add:
+            - NET_BIND_SERVICE
+            drop:
+            - ALL
+          runAsUser: 33 # www-data
       restartPolicy: Always
       terminationGracePeriodSeconds: 60
@@ -55,6 +55,17 @@ data:
           action: replace
           target_label: job
 
+        metric_relabel_configs:
+        - source_labels: [__name__]
+          action: drop
+          regex: etcd_(debugging|disk|request|server).*
+        - source_labels: [__name__]
+          action: drop
+          regex: apiserver_admission_controller_admission_latencies_seconds_.*
+        - source_labels: [__name__]
+          action: drop
+          regex: apiserver_admission_step_admission_latencies_seconds_.*
+
       # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
       # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
       - job_name: 'kubelet'
@@ -89,6 +100,13 @@ data:
         relabel_configs:
         - action: labelmap
           regex: __meta_kubernetes_node_label_(.+)
+        metric_relabel_configs:
+        - source_labels: [__name__, image]
+          action: drop
+          regex: container_([a-z_]+);
+        - source_labels: [__name__]
+          action: drop
+          regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
 
 
       # Scrap etcd metrics from controllers via listen-metrics-urls
@@ -102,7 +120,7 @@ data:
           regex: 'true'
         - action: labelmap
           regex: __meta_kubernetes_node_label_(.+)
-        - source_labels: [__meta_kubernetes_node_name]
+        - source_labels: [__meta_kubernetes_node_address_InternalIP]
           action: replace
           target_label: __address__
           replacement: '${1}:2381'
@@ -119,10 +137,10 @@ data:
       # * `prometheus.io/port`: If the metrics are exposed on a different port to the
       # service then set this appropriately.
      - job_name: 'kubernetes-service-endpoints'
 
        kubernetes_sd_configs:
        - role: endpoints
-
+        honor_labels: true
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
          action: keep
@@ -144,10 +162,18 @@ data:
           regex: __meta_kubernetes_service_label_(.+)
         - source_labels: [__meta_kubernetes_namespace]
           action: replace
-          target_label: kubernetes_namespace
+          target_label: namespace
+        - source_labels: [__meta_kubernetes_pod_name]
+          action: replace
+          target_label: pod
         - source_labels: [__meta_kubernetes_service_name]
           action: replace
           target_label: job
 
+        metric_relabel_configs:
+        - source_labels: [__name__]
+          action: drop
+          regex: etcd_(debugging|disk|request|server).*
+
       # Example scrape config for probing services via the Blackbox Exporter.
       #
@@ -177,7 +203,7 @@ data:
         - action: labelmap
           regex: __meta_kubernetes_service_label_(.+)
         - source_labels: [__meta_kubernetes_namespace]
-          target_label: kubernetes_namespace
+          target_label: namespace
         - source_labels: [__meta_kubernetes_service_name]
           target_label: job
 
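The new metric_relabel_configs prune noisy series before storage: etcd debugging/disk/request/server histograms, apiserver admission latency buckets, and cAdvisor container_* series that lack an image label. The same pattern extends to any other high-cardinality family; a hypothetical additional rule in the same style:

    metric_relabel_configs:
    - source_labels: [__name__]
      action: drop
      regex: go_gc_duration_seconds.*   # hypothetical: drop Go GC summaries from this job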
@@ -14,11 +14,13 @@ spec:
       labels:
         name: prometheus
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       serviceAccountName: prometheus
       containers:
       - name: prometheus
-        image: quay.io/prometheus/prometheus:v2.3.0
+        image: quay.io/prometheus/prometheus:v2.7.1
         args:
         - --web.listen-address=0.0.0.0:9090
         - --config.file=/etc/prometheus/prometheus.yaml
@@ -3,7 +3,8 @@ kind: ClusterRole
 metadata:
   name: kube-state-metrics
 rules:
-- apiGroups: [""]
+- apiGroups:
+  - ""
   resources:
   - configmaps
   - secrets
@@ -17,23 +18,47 @@
   - persistentvolumes
   - namespaces
   - endpoints
-  verbs: ["list", "watch"]
-- apiGroups: ["extensions"]
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - extensions
   resources:
   - daemonsets
   - deployments
   - replicasets
-  verbs: ["list", "watch"]
-- apiGroups: ["apps"]
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - apps
   resources:
   - statefulsets
-  verbs: ["list", "watch"]
-- apiGroups: ["batch"]
+  - daemonsets
+  - deployments
+  - replicasets
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - batch
   resources:
   - cronjobs
   - jobs
-  verbs: ["list", "watch"]
-- apiGroups: ["autoscaling"]
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - autoscaling
   resources:
   - horizontalpodautoscalers
-  verbs: ["list", "watch"]
+  verbs:
+  - list
+  - watch
+- apiGroups:
+  - policy
+  resources:
+  - poddisruptionbudgets
+  verbs:
+  - list
+  - watch
@@ -18,11 +18,13 @@ spec:
       labels:
         name: kube-state-metrics
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       serviceAccountName: kube-state-metrics
       containers:
       - name: kube-state-metrics
-        image: quay.io/coreos/kube-state-metrics:v1.3.1
+        image: quay.io/coreos/kube-state-metrics:v1.5.0
         ports:
         - name: metrics
           containerPort: 8080
@@ -33,7 +35,7 @@ spec:
           initialDelaySeconds: 5
           timeoutSeconds: 5
       - name: addon-resizer
-        image: k8s.gcr.io/addon-resizer:1.7
+        image: k8s.gcr.io/addon-resizer:1.8.4
         resources:
           limits:
             cpu: 100m
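The addon-resizer sidecar (pod_nanny) scales kube-state-metrics' resource requests and limits with the cluster's node count, which is why the RBAC changes below grant get/update on the kube-state-metrics Deployment in both the extensions and apps groups. Its usual invocation looks roughly like the following; the flag values are upstream defaults, not taken from this diff:

    command:
    - /pod_nanny
    - --container=kube-state-metrics
    - --cpu=100m
    - --extra-cpu=1m
    - --memory=100Mi
    - --extra-memory=2Mi
    - --deployment=kube-state-metrics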
@@ -6,7 +6,7 @@ metadata:
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: Role
-  name: kube-state-metrics-resizer
+  name: kube-state-metrics
 subjects:
 - kind: ServiceAccount
   name: kube-state-metrics
@@ -1,15 +1,31 @@
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
-  name: kube-state-metrics-resizer
+  name: kube-state-metrics
   namespace: monitoring
 rules:
-- apiGroups: [""]
+- apiGroups:
+  - ""
   resources:
   - pods
-  verbs: ["get"]
-- apiGroups: ["extensions"]
+  verbs:
+  - get
+- apiGroups:
+  - extensions
   resources:
   - deployments
-  resourceNames: ["kube-state-metrics"]
-  verbs: ["get", "update"]
+  resourceNames:
+  - kube-state-metrics
+  verbs:
+  - get
+  - update
+- apiGroups:
+  - apps
+  resources:
+  - deployments
+  resourceNames:
+  - kube-state-metrics
+  verbs:
+  - get
+  - update
+
@@ -17,6 +17,8 @@ spec:
       labels:
         name: node-exporter
         phase: prod
+      annotations:
+        seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
     spec:
       serviceAccountName: node-exporter
       securityContext:
@@ -26,21 +28,24 @@ spec:
       hostPID: true
       containers:
       - name: node-exporter
-        image: quay.io/prometheus/node-exporter:v0.15.2
+        image: quay.io/prometheus/node-exporter:v0.17.0
         args:
-        - "--path.procfs=/host/proc"
-        - "--path.sysfs=/host/sys"
+        - --path.procfs=/host/proc
+        - --path.sysfs=/host/sys
+        - --path.rootfs=/host/root
+        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
+        - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
         ports:
         - name: metrics
           containerPort: 9100
           hostPort: 9100
         resources:
           requests:
-            memory: 30Mi
             cpu: 100m
-          limits:
             memory: 50Mi
+          limits:
             cpu: 200m
+            memory: 100Mi
         volumeMounts:
         - name: proc
           mountPath: /host/proc
@@ -48,6 +53,9 @@ spec:
         - name: sys
           mountPath: /host/sys
           readOnly: true
+        - name: root
+          mountPath: /host/root
+          readOnly: true
       tolerations:
       - effect: NoSchedule
         operator: Exists
@@ -58,3 +66,6 @@ spec:
       - name: sys
         hostPath:
           path: /sys
+      - name: root
+        hostPath:
+          path: /
@@ -4,575 +4,1089 @@ metadata:
   name: prometheus-rules
   namespace: monitoring
 data:

The rules ConfigMap is rewritten wholesale in this change. The previous YAML-format rule files are removed:

- alertmanager.rules.yaml: AlertmanagerConfigInconsistent, AlertmanagerDownOrMissing, AlertmanagerFailedReload
- etcd3.rules.yaml: InsufficientMembers, NoLeader, HighNumberOfLeaderChanges, GRPCRequestsSlow, HighNumberOfFailedHTTPRequests (warning and critical thresholds), HTTPRequestsSlow, EtcdMemberCommunicationSlow, HighNumberOfFailedProposals, HighFsyncDurations, HighCommitDurations
- general.rules.yaml: TargetDown, the fd_utilization record, FdExhaustionClose (4 hour and 1 hour horizons)
- kube-controller-manager.rules.yaml: K8SControllerManagerDown
- kube-scheduler.rules.yaml: scheduler e2e/algorithm/binding latency quantile records, K8SSchedulerDown
- kube-state-metrics.rules.yaml: DeploymentGenerationMismatch, DeploymentReplicasNotUpdated, DaemonSetRolloutStuck, K8SDaemonSetsNotScheduled, DaemonSetsMissScheduled, PodFrequentlyRestarting
- kubelet.rules.yaml: K8SNodeNotReady, K8SManyNodesNotReady, K8SKubeletDown (warning and critical thresholds), K8SKubeletTooManyPods
- kubernetes.rules.yaml: pod/namespace/cluster memory, CPU-share and filesystem usage records, apiserver latency quantile records, APIServerLatencyHigh

They are replaced with JSON-format rule groups:

- etcd.yaml: etcdInsufficientMembers, etcdNoLeader, etcdHighNumberOfLeaderChanges, etcdGRPCRequestsSlow, etcdMemberCommunicationSlow, etcdHighNumberOfFailedProposals, etcdHighFsyncDurations, etcdHighCommitDurations, etcdHighNumberOfFailedHTTPRequests (warning and critical thresholds), etcdHTTPRequestsSlow
- extra.yaml: an extra.rules group with an InactiveRAIDDisk alert (node_md_disks - node_md_disks_active > 0)
- kube.yaml: k8s.rules recording rules for namespace/pod/container CPU and memory usage, kube-scheduler.rules and kube-apiserver.rules latency quantile records (0.99/0.9/0.5), and node.rules records for node CPU, memory, disk, filesystem and network utilisation and saturation
|
||||||
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
|
},
|
||||||
> 1
|
{
|
||||||
for: 10m
|
"expr": "sum(irate(node_network_receive_drop_total{job=\"node-exporter\",device!~\"veth.+\"}[1m])) +\nsum(irate(node_network_transmit_drop_total{job=\"node-exporter\",device!~\"veth.+\"}[1m]))\n",
|
||||||
labels:
|
"record": ":node_net_saturation:sum_irate"
|
||||||
severity: warning
|
},
|
||||||
annotations:
|
{
|
||||||
description: the API server has a 99th percentile latency of {{ $value }} seconds
|
"expr": "sum by (node) (\n (irate(node_network_receive_drop_total{job=\"node-exporter\",device!~\"veth.+\"}[1m]) +\n irate(node_network_transmit_drop_total{job=\"node-exporter\",device!~\"veth.+\"}[1m]))\n* on (namespace, pod) group_left(node)\n node_namespace_pod:kube_pod_info:\n)\n",
|
||||||
for {{$labels.verb}} {{$labels.resource}}
|
"record": "node:node_net_saturation:sum_irate"
|
||||||
- alert: APIServerLatencyHigh
|
},
|
||||||
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
|
{
|
||||||
> 4
|
"expr": "max(\n max(\n kube_pod_info{job=\"kube-state-metrics\", host_ip!=\"\"}\n ) by (node, host_ip)\n * on (host_ip) group_right (node)\n label_replace(\n (max(node_filesystem_files{job=\"node-exporter\", mountpoint=\"/\"}) by (instance)), \"host_ip\", \"$1\", \"instance\", \"(.*):.*\"\n )\n) by (node)\n",
|
||||||
for: 10m
|
"record": "node:node_inodes_total:"
|
||||||
labels:
|
},
|
||||||
severity: critical
|
{
|
||||||
annotations:
|
"expr": "max(\n max(\n kube_pod_info{job=\"kube-state-metrics\", host_ip!=\"\"}\n ) by (node, host_ip)\n * on (host_ip) group_right (node)\n label_replace(\n (max(node_filesystem_files_free{job=\"node-exporter\", mountpoint=\"/\"}) by (instance)), \"host_ip\", \"$1\", \"instance\", \"(.*):.*\"\n )\n) by (node)\n",
|
||||||
description: the API server has a 99th percentile latency of {{ $value }} seconds
|
"record": "node:node_inodes_free:"
|
||||||
for {{$labels.verb}} {{$labels.resource}}
|
}
|
||||||
- alert: APIServerErrorsHigh
|
]
|
||||||
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
|
},
|
||||||
* 100 > 2
|
{
|
||||||
for: 10m
|
"name": "kubernetes-absent",
|
||||||
labels:
|
"rules": [
|
||||||
severity: warning
|
{
|
||||||
annotations:
|
"alert": "KubeAPIDown",
|
||||||
description: API server returns errors for {{ $value }}% of requests
|
"annotations": {
|
||||||
- alert: APIServerErrorsHigh
|
"message": "KubeAPI has disappeared from Prometheus target discovery.",
|
||||||
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown"
|
||||||
* 100 > 5
|
},
|
||||||
for: 10m
|
"expr": "absent(up{job=\"apiserver\"} == 1)\n",
|
||||||
labels:
|
"for": "15m",
|
||||||
severity: critical
|
"labels": {
|
||||||
annotations:
|
"severity": "critical"
|
||||||
description: API server returns errors for {{ $value }}% of requests
|
}
|
||||||
- alert: K8SApiserverDown
|
},
|
||||||
expr: absent(up{job="apiserver"} == 1)
|
{
|
||||||
for: 20m
|
"alert": "KubeControllerManagerDown",
|
||||||
labels:
|
"annotations": {
|
||||||
severity: critical
|
"message": "KubeControllerManager has disappeared from Prometheus target discovery.",
|
||||||
annotations:
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown"
|
||||||
description: No API servers are reachable or all have disappeared from service
|
},
|
||||||
discovery
|
"expr": "absent(up{job=\"kube-controller-manager\"} == 1)\n",
|
||||||
|
"for": "15m",
|
||||||
- alert: K8sCertificateExpirationNotice
|
"labels": {
|
||||||
labels:
|
"severity": "critical"
|
||||||
severity: warning
|
}
|
||||||
annotations:
|
},
|
||||||
description: Kubernetes API Certificate is expiring soon (less than 7 days)
|
{
|
||||||
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
|
"alert": "KubeSchedulerDown",
|
||||||
|
"annotations": {
|
||||||
- alert: K8sCertificateExpirationNotice
|
"message": "KubeScheduler has disappeared from Prometheus target discovery.",
|
||||||
labels:
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown"
|
||||||
severity: critical
|
},
|
||||||
annotations:
|
"expr": "absent(up{job=\"kube-scheduler\"} == 1)\n",
|
||||||
description: Kubernetes API Certificate is expiring in less than 1 day
|
"for": "15m",
|
||||||
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
|
"labels": {
|
||||||
node.rules.yaml: |
|
"severity": "critical"
|
||||||
groups:
|
}
|
||||||
- name: node.rules
|
},
|
||||||
rules:
|
{
|
||||||
- record: instance:node_cpu:rate:sum
|
"alert": "KubeletDown",
|
||||||
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
|
"annotations": {
|
||||||
BY (instance)
|
"message": "Kubelet has disappeared from Prometheus target discovery.",
|
||||||
- record: instance:node_filesystem_usage:sum
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown"
|
||||||
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
|
},
|
||||||
BY (instance)
|
"expr": "absent(up{job=\"kubelet\"} == 1)\n",
|
||||||
- record: instance:node_network_receive_bytes:rate:sum
|
"for": "15m",
|
||||||
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
|
"labels": {
|
||||||
- record: instance:node_network_transmit_bytes:rate:sum
|
"severity": "critical"
|
||||||
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
|
}
|
||||||
- record: instance:node_cpu:ratio
|
}
|
||||||
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
|
]
|
||||||
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
|
},
|
||||||
- record: cluster:node_cpu:sum_rate5m
|
{
|
||||||
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
|
"name": "kubernetes-apps",
|
||||||
- record: cluster:node_cpu:ratio
|
"rules": [
|
||||||
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
|
{
|
||||||
- alert: NodeExporterDown
|
"alert": "KubePodCrashLooping",
|
||||||
expr: absent(up{job="node-exporter"} == 1)
|
"annotations": {
|
||||||
for: 10m
|
"message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf \"%.2f\" $value }} times / 5 minutes.",
|
||||||
labels:
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping"
|
||||||
severity: warning
|
},
|
||||||
annotations:
|
"expr": "rate(kube_pod_container_status_restarts_total{job=\"kube-state-metrics\"}[15m]) * 60 * 5 > 0\n",
|
||||||
description: Prometheus could not scrape a node-exporter for more than 10m,
|
"for": "1h",
|
||||||
or node-exporters have disappeared from discovery
|
"labels": {
|
||||||
- alert: NodeDiskRunningFull
|
"severity": "critical"
|
||||||
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
|
}
|
||||||
for: 30m
|
},
|
||||||
labels:
|
{
|
||||||
severity: warning
|
"alert": "KubePodNotReady",
|
||||||
annotations:
|
"annotations": {
|
||||||
description: device {{$labels.device}} on node {{$labels.instance}} is running
|
"message": "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.",
|
||||||
full within the next 24 hours (mounted at {{$labels.mountpoint}})
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready"
|
||||||
- alert: NodeDiskRunningFull
|
},
|
||||||
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
|
"expr": "sum by (namespace, pod) (kube_pod_status_phase{job=\"kube-state-metrics\", phase=~\"Pending|Unknown\"}) > 0\n",
|
||||||
for: 10m
|
"for": "1h",
|
||||||
labels:
|
"labels": {
|
||||||
severity: critical
|
"severity": "critical"
|
||||||
annotations:
|
}
|
||||||
description: device {{$labels.device}} on node {{$labels.instance}} is running
|
},
|
||||||
full within the next 2 hours (mounted at {{$labels.mountpoint}})
|
{
|
||||||
prometheus.rules.yaml: |
|
"alert": "KubeDeploymentGenerationMismatch",
|
||||||
groups:
|
"annotations": {
|
||||||
- name: prometheus.rules
|
"message": "Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.",
|
||||||
rules:
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch"
|
||||||
- alert: PrometheusConfigReloadFailed
|
},
|
||||||
expr: prometheus_config_last_reload_successful == 0
|
"expr": "kube_deployment_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_deployment_metadata_generation{job=\"kube-state-metrics\"}\n",
|
||||||
for: 10m
|
"for": "15m",
|
||||||
labels:
|
"labels": {
|
||||||
severity: warning
|
"severity": "critical"
|
||||||
annotations:
|
}
|
||||||
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
},
|
||||||
- alert: PrometheusNotificationQueueRunningFull
|
{
|
||||||
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
|
"alert": "KubeDeploymentReplicasMismatch",
|
||||||
for: 10m
|
"annotations": {
|
||||||
labels:
|
"message": "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.",
|
||||||
severity: warning
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch"
|
||||||
annotations:
|
},
|
||||||
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
|
"expr": "kube_deployment_spec_replicas{job=\"kube-state-metrics\"}\n !=\nkube_deployment_status_replicas_available{job=\"kube-state-metrics\"}\n",
|
||||||
$labels.pod}}
|
"for": "1h",
|
||||||
- alert: PrometheusErrorSendingAlerts
|
"labels": {
|
||||||
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
|
"severity": "critical"
|
||||||
> 0.01
|
}
|
||||||
for: 10m
|
},
|
||||||
labels:
|
{
|
||||||
severity: warning
|
"alert": "KubeStatefulSetReplicasMismatch",
|
||||||
annotations:
|
"annotations": {
|
||||||
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
"message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.",
|
||||||
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch"
|
||||||
- alert: PrometheusErrorSendingAlerts
|
},
|
||||||
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
|
"expr": "kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_status_replicas{job=\"kube-state-metrics\"}\n",
|
||||||
> 0.03
|
"for": "15m",
|
||||||
for: 10m
|
"labels": {
|
||||||
labels:
|
"severity": "critical"
|
||||||
severity: critical
|
}
|
||||||
annotations:
|
},
|
||||||
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
|
{
|
||||||
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
"alert": "KubeStatefulSetGenerationMismatch",
|
||||||
- alert: PrometheusNotConnectedToAlertmanagers
|
"annotations": {
|
||||||
expr: prometheus_notifications_alertmanagers_discovered < 1
|
"message": "StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.",
|
||||||
for: 10m
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch"
|
||||||
labels:
|
},
|
||||||
severity: warning
|
"expr": "kube_statefulset_status_observed_generation{job=\"kube-state-metrics\"}\n !=\nkube_statefulset_metadata_generation{job=\"kube-state-metrics\"}\n",
|
||||||
annotations:
|
"for": "15m",
|
||||||
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
|
"labels": {
|
||||||
to any Alertmanagers
|
"severity": "critical"
|
||||||
- alert: PrometheusTSDBReloadsFailing
|
}
|
||||||
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
|
},
|
||||||
for: 12h
|
{
|
||||||
labels:
|
"alert": "KubeStatefulSetUpdateNotRolledOut",
|
||||||
severity: warning
|
"annotations": {
|
||||||
annotations:
|
"message": "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.",
|
||||||
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout"
|
||||||
reload failures over the last four hours.'
|
},
|
||||||
summary: Prometheus has issues reloading data blocks from disk
|
"expr": "max without (revision) (\n kube_statefulset_status_current_revision{job=\"kube-state-metrics\"}\n unless\n kube_statefulset_status_update_revision{job=\"kube-state-metrics\"}\n)\n *\n(\n kube_statefulset_replicas{job=\"kube-state-metrics\"}\n !=\n kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\"}\n)\n",
|
||||||
- alert: PrometheusTSDBCompactionsFailing
|
"for": "15m",
|
||||||
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
|
"labels": {
|
||||||
for: 12h
|
"severity": "critical"
|
||||||
labels:
|
}
|
||||||
severity: warning
|
},
|
||||||
annotations:
|
{
|
||||||
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
|
"alert": "KubeDaemonSetRolloutStuck",
|
||||||
compaction failures over the last four hours.'
|
"annotations": {
|
||||||
summary: Prometheus has issues compacting sample blocks
|
"message": "Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.",
|
||||||
- alert: PrometheusTSDBWALCorruptions
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck"
|
||||||
expr: tsdb_wal_corruptions_total > 0
|
},
|
||||||
for: 4h
|
"expr": "kube_daemonset_status_number_ready{job=\"kube-state-metrics\"}\n /\nkube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"} * 100 < 100\n",
|
||||||
labels:
|
"for": "15m",
|
||||||
severity: warning
|
"labels": {
|
||||||
annotations:
|
"severity": "critical"
|
||||||
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
|
}
|
||||||
log (WAL).'
|
},
|
||||||
summary: Prometheus write-ahead log is corrupted
|
{
|
||||||
- alert: PrometheusNotIngestingSamples
|
"alert": "KubeDaemonSetNotScheduled",
|
||||||
expr: rate(prometheus_tsdb_head_samples_appended_total[5m]) <= 0
|
"annotations": {
|
||||||
for: 10m
|
"message": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.",
|
||||||
labels:
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled"
|
||||||
severity: warning
|
},
|
||||||
annotations:
|
"expr": "kube_daemonset_status_desired_number_scheduled{job=\"kube-state-metrics\"}\n -\nkube_daemonset_status_current_number_scheduled{job=\"kube-state-metrics\"} > 0\n",
|
||||||
description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples."
|
"for": "10m",
|
||||||
summary: "Prometheus isn't ingesting samples"
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeDaemonSetMisScheduled",
|
||||||
|
"annotations": {
|
||||||
|
"message": "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled"
|
||||||
|
},
|
||||||
|
"expr": "kube_daemonset_status_number_misscheduled{job=\"kube-state-metrics\"} > 0\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeCronJobRunning",
|
||||||
|
"annotations": {
|
||||||
|
"message": "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning"
|
||||||
|
},
|
||||||
|
"expr": "time() - kube_cronjob_next_schedule_time{job=\"kube-state-metrics\"} > 3600\n",
|
||||||
|
"for": "1h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeJobCompletion",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion"
|
||||||
|
},
|
||||||
|
"expr": "kube_job_spec_completions{job=\"kube-state-metrics\"} - kube_job_status_succeeded{job=\"kube-state-metrics\"} > 0\n",
|
||||||
|
"for": "1h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeJobFailed",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed"
|
||||||
|
},
|
||||||
|
"expr": "kube_job_status_failed{job=\"kube-state-metrics\"} > 0\n",
|
||||||
|
"for": "1h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "kubernetes-resources",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "KubeCPUOvercommit",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit"
|
||||||
|
},
|
||||||
|
"expr": "sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)\n /\nsum(node:node_num_cpu:sum)\n >\n(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeMemOvercommit",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit"
|
||||||
|
},
|
||||||
|
"expr": "sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)\n /\nsum(node_memory_MemTotal_bytes)\n >\n(count(node:node_num_cpu:sum)-1)\n /\ncount(node:node_num_cpu:sum)\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeCPUOvercommit",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Cluster has overcommitted CPU resource requests for Namespaces.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit"
|
||||||
|
},
|
||||||
|
"expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.cpu\"})\n /\nsum(node:node_num_cpu:sum)\n > 1.5\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeMemOvercommit",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Cluster has overcommitted memory resource requests for Namespaces.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit"
|
||||||
|
},
|
||||||
|
"expr": "sum(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\", resource=\"requests.memory\"})\n /\nsum(node_memory_MemTotal_bytes{job=\"node-exporter\"})\n > 1.5\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeQuotaExceeded",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Namespace {{ $labels.namespace }} is using {{ printf \"%0.0f\" $value }}% of its {{ $labels.resource }} quota.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded"
|
||||||
|
},
|
||||||
|
"expr": "100 * kube_resourcequota{job=\"kube-state-metrics\", type=\"used\"}\n / ignoring(instance, job, type)\n(kube_resourcequota{job=\"kube-state-metrics\", type=\"hard\"} > 0)\n > 90\n",
|
||||||
|
"for": "15m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "CPUThrottlingHigh",
|
||||||
|
"annotations": {
|
||||||
|
"message": "{{ printf \"%0.0f\" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh"
|
||||||
|
},
|
||||||
|
"expr": "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\", }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 100 \n",
|
||||||
|
"for": "15m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "kubernetes-storage",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "KubePersistentVolumeUsageCritical",
|
||||||
|
"annotations": {
|
||||||
|
"message": "The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf \"%0.2f\" $value }}% free.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical"
|
||||||
|
},
|
||||||
|
"expr": "100 * kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\nkubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n < 3\n",
|
||||||
|
"for": "1m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubePersistentVolumeFullInFourDays",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf \"%0.2f\" $value }}% is available.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays"
|
||||||
|
},
|
||||||
|
"expr": "100 * (\n kubelet_volume_stats_available_bytes{job=\"kubelet\"}\n /\n kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}\n) < 15\nand\npredict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\"}[6h], 4 * 24 * 3600) < 0\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubePersistentVolumeErrors",
|
||||||
|
"annotations": {
|
||||||
|
"message": "The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors"
|
||||||
|
},
|
||||||
|
"expr": "kube_persistentvolume_status_phase{phase=~\"Failed|Pending\",job=\"kube-state-metrics\"} > 0\n",
|
||||||
|
"for": "5m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "kubernetes-system",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "KubeNodeNotReady",
|
||||||
|
"annotations": {
|
||||||
|
"message": "{{ $labels.node }} has been unready for more than an hour.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready"
|
||||||
|
},
|
||||||
|
"expr": "kube_node_status_condition{job=\"kube-state-metrics\",condition=\"Ready\",status=\"true\"} == 0\n",
|
||||||
|
"for": "1h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeVersionMismatch",
|
||||||
|
"annotations": {
|
||||||
|
"message": "There are {{ $value }} different semantic versions of Kubernetes components running.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch"
|
||||||
|
},
|
||||||
|
"expr": "count(count by (gitVersion) (label_replace(kubernetes_build_info{job!=\"coredns\"},\"gitVersion\",\"$1\",\"gitVersion\",\"(v[0-9]*.[0-9]*.[0-9]*).*\"))) > 1\n",
|
||||||
|
"for": "1h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeClientErrors",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }}% errors.'",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors"
|
||||||
|
},
|
||||||
|
"expr": "(sum(rate(rest_client_requests_total{code=~\"5..\"}[5m])) by (instance, job)\n /\nsum(rate(rest_client_requests_total[5m])) by (instance, job))\n* 100 > 1\n",
|
||||||
|
"for": "15m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeClientErrors",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }} errors / second.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors"
|
||||||
|
},
|
||||||
|
"expr": "sum(rate(ksm_scrape_error_total{job=\"kube-state-metrics\"}[5m])) by (instance, job) > 0.1\n",
|
||||||
|
"for": "15m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeletTooManyPods",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods"
|
||||||
|
},
|
||||||
|
"expr": "kubelet_running_pod_count{job=\"kubelet\"} > 110 * 0.9\n",
|
||||||
|
"for": "15m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeAPILatencyHigh",
|
||||||
|
"annotations": {
|
||||||
|
"message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh"
|
||||||
|
},
|
||||||
|
"expr": "cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 1\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeAPILatencyHigh",
|
||||||
|
"annotations": {
|
||||||
|
"message": "The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh"
|
||||||
|
},
|
||||||
|
"expr": "cluster_quantile:apiserver_request_latencies:histogram_quantile{job=\"apiserver\",quantile=\"0.99\",subresource!=\"log\",verb!~\"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$\"} > 4\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeAPIErrorsHigh",
|
||||||
|
"annotations": {
|
||||||
|
"message": "API server is returning errors for {{ $value }}% of requests.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
|
||||||
|
},
|
||||||
|
"expr": "sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) without(instance, pod)\n /\nsum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 10\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeAPIErrorsHigh",
|
||||||
|
"annotations": {
|
||||||
|
"message": "API server is returning errors for {{ $value }}% of requests.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh"
|
||||||
|
},
|
||||||
|
"expr": "sum(rate(apiserver_request_count{job=\"apiserver\",code=~\"^(?:5..)$\"}[5m])) without(instance, pod)\n /\nsum(rate(apiserver_request_count{job=\"apiserver\"}[5m])) without(instance, pod) * 100 > 5\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeClientCertificateExpiration",
|
||||||
|
"annotations": {
|
||||||
|
"message": "A client certificate used to authenticate to the apiserver is expiring in less than 7 days.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration"
|
||||||
|
},
|
||||||
|
"expr": "histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 604800\n",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "KubeClientCertificateExpiration",
|
||||||
|
"annotations": {
|
||||||
|
"message": "A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.",
|
||||||
|
"runbook_url": "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration"
|
||||||
|
},
|
||||||
|
"expr": "histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job=\"apiserver\"}[5m]))) < 86400\n",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
kubeprom.yaml: |-
|
||||||
|
{
|
||||||
|
"groups": [
|
||||||
|
{
|
||||||
|
"name": "kube-prometheus-node-recording.rules",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\"}[3m])) BY (instance)",
|
||||||
|
"record": "instance:node_cpu:rate:sum"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum((node_filesystem_size_bytes{mountpoint=\"/\"} - node_filesystem_free_bytes{mountpoint=\"/\"})) BY (instance)",
|
||||||
|
"record": "instance:node_filesystem_usage:sum"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_receive_bytes_total[3m])) BY (instance)",
|
||||||
|
"record": "instance:node_network_receive_bytes:rate:sum"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)",
|
||||||
|
"record": "instance:node_network_transmit_bytes:rate:sum"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)",
|
||||||
|
"record": "instance:node_cpu:ratio"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",mode!=\"iowait\"}[5m]))",
|
||||||
|
"record": "cluster:node_cpu:sum_rate5m"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))",
|
||||||
|
"record": "cluster:node_cpu:ratio"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "kube-prometheus-node-alerting.rules",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "NodeDiskRunningFull",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 24 hours."
|
||||||
|
},
|
||||||
|
"expr": "(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)\n",
|
||||||
|
"for": "30m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "NodeDiskRunningFull",
|
||||||
|
"annotations": {
|
||||||
|
"message": "Device {{ $labels.device }} of node-exporter {{ $labels.namespace }}/{{ $labels.pod }} will be full within the next 2 hours."
|
||||||
|
},
|
||||||
|
"expr": "(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "prometheus.rules",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "PrometheusConfigReloadFailed",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}",
|
||||||
|
"summary": "Reloading Prometheus' configuration failed"
|
||||||
|
},
|
||||||
|
"expr": "prometheus_config_last_reload_successful{job=\"prometheus\"} == 0\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusNotificationQueueRunningFull",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
|
||||||
|
"summary": "Prometheus' alert notification queue is running full"
|
||||||
|
},
|
||||||
|
"expr": "predict_linear(prometheus_notifications_queue_length{job=\"prometheus\"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job=\"prometheus\"}\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusErrorSendingAlerts",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}",
|
||||||
|
"summary": "Errors while sending alert from Prometheus"
|
||||||
|
},
|
||||||
|
"expr": "rate(prometheus_notifications_errors_total{job=\"prometheus\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus\"}[5m]) > 0.01\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusErrorSendingAlerts",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}",
|
||||||
|
"summary": "Errors while sending alerts from Prometheus"
|
||||||
|
},
|
||||||
|
"expr": "rate(prometheus_notifications_errors_total{job=\"prometheus\"}[5m]) / rate(prometheus_notifications_sent_total{job=\"prometheus\"}[5m]) > 0.03\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "critical"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusNotConnectedToAlertmanagers",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers",
|
||||||
|
"summary": "Prometheus is not connected to any Alertmanagers"
|
||||||
|
},
|
||||||
|
"expr": "prometheus_notifications_alertmanagers_discovered{job=\"prometheus\"} < 1\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusTSDBReloadsFailing",
|
||||||
|
"annotations": {
|
||||||
|
"description": "{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.",
|
||||||
|
"summary": "Prometheus has issues reloading data blocks from disk"
|
||||||
|
},
|
||||||
|
"expr": "increase(prometheus_tsdb_reloads_failures_total{job=\"prometheus\"}[2h]) > 0\n",
|
||||||
|
"for": "12h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusTSDBCompactionsFailing",
|
||||||
|
"annotations": {
|
||||||
|
"description": "{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.",
|
||||||
|
"summary": "Prometheus has issues compacting sample blocks"
|
||||||
|
},
|
||||||
|
"expr": "increase(prometheus_tsdb_compactions_failed_total{job=\"prometheus\"}[2h]) > 0\n",
|
||||||
|
"for": "12h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusTSDBWALCorruptions",
|
||||||
|
"annotations": {
|
||||||
|
"description": "{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).",
|
||||||
|
"summary": "Prometheus write-ahead log is corrupted"
|
||||||
|
},
|
||||||
|
"expr": "tsdb_wal_corruptions_total{job=\"prometheus\"} > 0\n",
|
||||||
|
"for": "4h",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusNotIngestingSamples",
|
||||||
|
"annotations": {
|
||||||
|
"description": "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
|
||||||
|
"summary": "Prometheus isn't ingesting samples"
|
||||||
|
},
|
||||||
|
"expr": "rate(prometheus_tsdb_head_samples_appended_total{job=\"prometheus\"}[5m]) <= 0\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"alert": "PrometheusTargetScrapesDuplicate",
|
||||||
|
"annotations": {
|
||||||
|
"description": "{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values",
|
||||||
|
"summary": "Prometheus has many samples rejected"
|
||||||
|
},
|
||||||
|
"expr": "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=\"prometheus\"}[5m]) > 0\n",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "general.rules",
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"alert": "TargetDown",
|
||||||
|
"annotations": {
|
||||||
|
"message": "{{ $value }}% of the {{ $labels.job }} targets are down."
|
||||||
|
},
|
||||||
|
"expr": "100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10",
|
||||||
|
"for": "10m",
|
||||||
|
"labels": {
|
||||||
|
"severity": "warning"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
@@ -11,10 +11,10 @@ Typhoon distributes upstream Kubernetes, architectural conventions, and cluster
 
 ## Features <a href="https://www.cncf.io/certification/software-conformance/"><img align="right" src="https://storage.googleapis.com/poseidon/certified-kubernetes.png"></a>
 
-* Kubernetes v1.10.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
-* Single or multi-master, workloads isolated on workers, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
+* Kubernetes v1.13.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
+* Single or multi-master, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
 * On-cluster etcd with TLS, [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/)-enabled, [network policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
-* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/)
+* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/), [spot](https://typhoon.psdn.io/cl/aws/#spot) workers, and [snippets](https://typhoon.psdn.io/advanced/customization/#container-linux) customization
 * Ready for Ingress, Prometheus, Grafana, and other optional [addons](https://typhoon.psdn.io/addons/overview/)
 
 ## Docs
@@ -1,6 +1,6 @@
 # Self-hosted Kubernetes assets (kubeconfig, manifests)
 module "bootkube" {
-  source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=0e98e89e14a074768db13c4e050ed0c13319a0c1"
+  source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=953521dbba49eb6a39204f30a3978730eac01e11"
 
   cluster_name = "${var.cluster_name}"
   api_servers  = ["${format("%s.%s", var.cluster_name, var.dns_zone)}"]
@@ -11,4 +11,5 @@ module "bootkube" {
   pod_cidr              = "${var.pod_cidr}"
   service_cidr          = "${var.service_cidr}"
   cluster_domain_suffix = "${var.cluster_domain_suffix}"
+  enable_reporting      = "${var.enable_reporting}"
 }
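The new `enable_reporting` argument implies a matching input variable on the module. A minimal sketch of how such a declaration might look; only the variable name comes from the hunk above, while the type, default, and description wording are assumptions:

# Assumed declaration backing the new bootkube argument
variable "enable_reporting" {
  type        = "string"
  default     = "false"
  description = "Enable usage or analytics reporting to upstream components"
}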
@@ -7,7 +7,7 @@ systemd:
       - name: 40-etcd-cluster.conf
         contents: |
           [Service]
-          Environment="ETCD_IMAGE_TAG=v3.3.6"
+          Environment="ETCD_IMAGE_TAG=v3.3.12"
           Environment="ETCD_NAME=${etcd_name}"
           Environment="ETCD_ADVERTISE_CLIENT_URLS=https://${etcd_domain}:2379"
           Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
@@ -74,12 +74,11 @@ systemd:
         ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
         ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
         ExecStart=/usr/lib/coreos/kubelet-wrapper \
-          --allow-privileged \
           --anonymous-auth=false \
           --authentication-token-webhook \
           --authorization-mode=Webhook \
           --client-ca-file=/etc/kubernetes/ca.crt \
-          --cluster_dns=${k8s_dns_service_ip} \
+          --cluster_dns=${cluster_dns_service_ip} \
           --cluster_domain=${cluster_domain_suffix} \
           --cni-conf-dir=/etc/kubernetes/cni/net.d \
           --exit-on-lock-contention \
@@ -89,6 +88,7 @@ systemd:
           --node-labels=node-role.kubernetes.io/master \
           --node-labels=node-role.kubernetes.io/controller="true" \
           --pod-manifest-path=/etc/kubernetes/manifests \
+          --read-only-port=0 \
           --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
           --volume-plugin-dir=/var/lib/kubelet/volumeplugins
         ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
@@ -123,7 +123,7 @@ storage:
       contents:
         inline: |
           KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
-          KUBELET_IMAGE_TAG=v1.10.4
+          KUBELET_IMAGE_TAG=v1.13.4
     - path: /etc/sysctl.d/max-user-watches.conf
       filesystem: root
      contents:
@@ -143,17 +143,14 @@ storage:
          set -e
          # Move experimental manifests
          [ -n "$(ls /opt/bootkube/assets/manifests-*/* 2>/dev/null)" ] && mv /opt/bootkube/assets/manifests-*/* /opt/bootkube/assets/manifests && rm -rf /opt/bootkube/assets/manifests-*
-          BOOTKUBE_ACI="$${BOOTKUBE_ACI:-quay.io/coreos/bootkube}"
-          BOOTKUBE_VERSION="$${BOOTKUBE_VERSION:-v0.12.0}"
-          BOOTKUBE_ASSETS="$${BOOTKUBE_ASSETS:-/opt/bootkube/assets}"
          exec /usr/bin/rkt run \
            --trust-keys-from-https \
-            --volume assets,kind=host,source=$${BOOTKUBE_ASSETS} \
+            --volume assets,kind=host,source=/opt/bootkube/assets \
            --mount volume=assets,target=/assets \
            --volume bootstrap,kind=host,source=/etc/kubernetes \
            --mount volume=bootstrap,target=/etc/kubernetes \
            $${RKT_OPTS} \
-            $${BOOTKUBE_ACI}:$${BOOTKUBE_VERSION} \
+            quay.io/coreos/bootkube:v0.14.0 \
            --net=host \
            --dns=host \
            --exec=/bootkube -- start --asset-dir=/assets "$@"
@@ -24,12 +24,13 @@ resource "aws_instance" "controllers" {
   instance_type = "${var.controller_type}"
 
   ami       = "${local.ami_id}"
-  user_data = "${element(data.ct_config.controller_ign.*.rendered, count.index)}"
+  user_data = "${element(data.ct_config.controller-ignitions.*.rendered, count.index)}"
 
   # storage
   root_block_device {
     volume_type = "${var.disk_type}"
     volume_size = "${var.disk_size}"
+    iops        = "${var.disk_iops}"
   }
 
   # network
@@ -38,12 +39,23 @@ resource "aws_instance" "controllers" {
   vpc_security_group_ids = ["${aws_security_group.controller.id}"]
 
   lifecycle {
-    ignore_changes = ["ami"]
+    ignore_changes = [
+      "ami",
+      "user_data",
+    ]
   }
 }
 
-# Controller Container Linux Config
-data "template_file" "controller_config" {
+# Controller Ignition configs
+data "ct_config" "controller-ignitions" {
+  count        = "${var.controller_count}"
+  content      = "${element(data.template_file.controller-configs.*.rendered, count.index)}"
+  pretty_print = false
+  snippets     = ["${var.controller_clc_snippets}"]
+}
+
+# Controller Container Linux configs
+data "template_file" "controller-configs" {
   count = "${var.controller_count}"
 
   template = "${file("${path.module}/cl/controller.yaml.tmpl")}"
@@ -56,10 +68,10 @@ data "template_file" "controller_config" {
     # etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,...
     etcd_initial_cluster = "${join(",", data.template_file.etcds.*.rendered)}"
 
-    kubeconfig             = "${indent(10, module.bootkube.kubeconfig)}"
+    kubeconfig             = "${indent(10, module.bootkube.kubeconfig-kubelet)}"
     ssh_authorized_key     = "${var.ssh_authorized_key}"
-    k8s_dns_service_ip     = "${cidrhost(var.service_cidr, 10)}"
+    cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
     cluster_domain_suffix  = "${var.cluster_domain_suffix}"
   }
 }
@@ -68,15 +80,8 @@ data "template_file" "etcds" {
   template = "etcd$${index}=https://$${cluster_name}-etcd$${index}.$${dns_zone}:2380"
 
   vars {
     index        = "${count.index}"
     cluster_name = "${var.cluster_name}"
     dns_zone     = "${var.dns_zone}"
   }
 }
-
-data "ct_config" "controller_ign" {
-  count        = "${var.controller_count}"
-  content      = "${element(data.template_file.controller_config.*.rendered, count.index)}"
-  pretty_print = false
-  snippets     = ["${var.controller_clc_snippets}"]
-}
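The `snippets = ["${var.controller_clc_snippets}"]` argument on the new `ct_config` data source is what merges user-provided Container Linux Config snippets into controller Ignition. A hedged sketch of passing one such snippet when calling a cluster module; the module name, source ref, and the example systemd unit are illustrative assumptions, and only the `controller_clc_snippets` variable name comes from the diff:

module "aws-cluster" {
  # assumed module path and release ref
  source = "git::https://github.com/poseidon/typhoon//aws/container-linux/kubernetes?ref=<release>"

  # ...other required cluster arguments omitted...

  # Container Linux Config snippet rendered into controller Ignition via data "ct_config"
  controller_clc_snippets = [
    <<EOF
systemd:
  units:
    - name: hello.service
      enable: true
      contents: |
        [Unit]
        Description=Hello World example
        [Service]
        Type=oneshot
        ExecStart=/usr/bin/echo Hello World!
        [Install]
        WantedBy=multi-user.target
EOF
  ]
}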
@@ -5,17 +5,17 @@ resource "aws_route53_record" "apiserver" {
   name = "${format("%s.%s.", var.cluster_name, var.dns_zone)}"
   type = "A"
 
-  # AWS recommends their special "alias" records for ELBs
+  # AWS recommends their special "alias" records for NLBs
   alias {
-    name                   = "${aws_lb.apiserver.dns_name}"
-    zone_id                = "${aws_lb.apiserver.zone_id}"
+    name                   = "${aws_lb.nlb.dns_name}"
+    zone_id                = "${aws_lb.nlb.zone_id}"
     evaluate_target_health = true
   }
 }
 
-# Network Load Balancer for apiservers
-resource "aws_lb" "apiserver" {
-  name               = "${var.cluster_name}-apiserver"
+# Network Load Balancer for apiservers and ingress
+resource "aws_lb" "nlb" {
+  name               = "${var.cluster_name}-nlb"
   load_balancer_type = "network"
   internal           = false
 
@@ -24,11 +24,11 @@ resource "aws_lb" "apiserver" {
   enable_cross_zone_load_balancing = true
 }
 
-# Forward TCP traffic to controllers
+# Forward TCP apiserver traffic to controllers
 resource "aws_lb_listener" "apiserver-https" {
-  load_balancer_arn = "${aws_lb.apiserver.arn}"
+  load_balancer_arn = "${aws_lb.nlb.arn}"
   protocol          = "TCP"
-  port              = "443"
+  port              = "6443"
 
   default_action {
     type = "forward"
@@ -36,6 +36,30 @@ resource "aws_lb_listener" "apiserver-https" {
   }
 }
 
+# Forward HTTP ingress traffic to workers
+resource "aws_lb_listener" "ingress-http" {
+  load_balancer_arn = "${aws_lb.nlb.arn}"
+  protocol          = "TCP"
+  port              = 80
+
+  default_action {
+    type             = "forward"
+    target_group_arn = "${module.workers.target_group_http}"
+  }
+}
+
+# Forward HTTPS ingress traffic to workers
+resource "aws_lb_listener" "ingress-https" {
+  load_balancer_arn = "${aws_lb.nlb.arn}"
+  protocol          = "TCP"
+  port              = 443
+
+  default_action {
+    type             = "forward"
+    target_group_arn = "${module.workers.target_group_https}"
+  }
+}
+
 # Target group of controllers
 resource "aws_lb_target_group" "controllers" {
   name        = "${var.cluster_name}-controllers"
@@ -43,12 +67,12 @@ resource "aws_lb_target_group" "controllers" {
   target_type = "instance"
 
   protocol = "TCP"
-  port     = 443
+  port     = 6443
 
   # TCP health check for apiserver
   health_check {
     protocol = "TCP"
-    port     = 443
+    port     = 6443
 
     # NLBs required to use same healthy and unhealthy thresholds
     healthy_threshold   = 3
@@ -65,5 +89,5 @@ resource "aws_lb_target_group_attachment" "controllers" {
 
   target_group_arn = "${aws_lb_target_group.controllers.arn}"
   target_id        = "${element(aws_instance.controllers.*.id, count.index)}"
-  port             = 443
+  port             = 6443
 }
@@ -1,8 +1,19 @@
+output "kubeconfig-admin" {
+  value = "${module.bootkube.kubeconfig-admin}"
+}
+
+# Outputs for Kubernetes Ingress
+
 output "ingress_dns_name" {
-  value = "${module.workers.ingress_dns_name}"
+  value = "${aws_lb.nlb.dns_name}"
   description = "DNS name of the network load balancer for distributing traffic to Ingress controllers"
 }

+output "ingress_zone_id" {
+  value = "${aws_lb.nlb.zone_id}"
+  description = "Route53 zone id of the network load balancer DNS name that can be used in Route53 alias records"
+}
+
 # Outputs for worker pools

 output "vpc_id" {

@@ -21,5 +32,17 @@ output "worker_security_groups" {
 }

 output "kubeconfig" {
-  value = "${module.bootkube.kubeconfig}"
+  value = "${module.bootkube.kubeconfig-kubelet}"
+}
+
+# Outputs for custom load balancing
+
+output "worker_target_group_http" {
+  description = "ARN of a target group of workers for HTTP traffic"
+  value = "${module.workers.target_group_http}"
+}
+
+output "worker_target_group_https" {
+  description = "ARN of a target group of workers for HTTPS traffic"
+  value = "${module.workers.target_group_https}"
 }
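The new `ingress_dns_name` and `ingress_zone_id` outputs let DNS records alias the shared NLB directly. A minimal sketch of how they could be consumed, assuming a cluster module named `tempest` and a Route53 zone for `example.com` (both names are illustrative, not part of this change):

```hcl
data "aws_route53_zone" "zone" {
  name = "example.com."
}

# Alias an application record at the cluster's network load balancer,
# which now fronts both the apiserver and worker Ingress.
resource "aws_route53_record" "app" {
  zone_id = "${data.aws_route53_zone.zone.zone_id}"
  name    = "app.example.com"
  type    = "A"

  alias {
    name                   = "${module.tempest.ingress_dns_name}"
    zone_id                = "${module.tempest.ingress_zone_id}"
    evaluate_target_health = false
  }
}
```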
@ -11,16 +11,6 @@ resource "aws_security_group" "controller" {
|
|||||||
tags = "${map("Name", "${var.cluster_name}-controller")}"
|
tags = "${map("Name", "${var.cluster_name}-controller")}"
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-icmp" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "icmp"
|
|
||||||
from_port = 0
|
|
||||||
to_port = 0
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-ssh" {
|
resource "aws_security_group_rule" "controller-ssh" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -31,16 +21,6 @@ resource "aws_security_group_rule" "controller-ssh" {
|
|||||||
cidr_blocks = ["0.0.0.0/0"]
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-apiserver" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 443
|
|
||||||
to_port = 443
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-etcd" {
|
resource "aws_security_group_rule" "controller-etcd" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -51,6 +31,7 @@ resource "aws_security_group_rule" "controller-etcd" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape etcd metrics
|
||||||
resource "aws_security_group_rule" "controller-etcd-metrics" {
|
resource "aws_security_group_rule" "controller-etcd-metrics" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -61,6 +42,16 @@ resource "aws_security_group_rule" "controller-etcd-metrics" {
|
|||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
source_security_group_id = "${aws_security_group.worker.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resource "aws_security_group_rule" "controller-apiserver" {
|
||||||
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
|
type = "ingress"
|
||||||
|
protocol = "tcp"
|
||||||
|
from_port = 6443
|
||||||
|
to_port = 6443
|
||||||
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-flannel" {
|
resource "aws_security_group_rule" "controller-flannel" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -81,6 +72,7 @@ resource "aws_security_group_rule" "controller-flannel-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
resource "aws_security_group_rule" "controller-node-exporter" {
|
resource "aws_security_group_rule" "controller-node-exporter" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -91,6 +83,7 @@ resource "aws_security_group_rule" "controller-node-exporter" {
|
|||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
source_security_group_id = "${aws_security_group.worker.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelets for exec, log, port-forward
|
||||||
resource "aws_security_group_rule" "controller-kubelet" {
|
resource "aws_security_group_rule" "controller-kubelet" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -111,26 +104,6 @@ resource "aws_security_group_rule" "controller-kubelet-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-kubelet-read" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-kubelet-read-self" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
self = true
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-bgp" {
|
resource "aws_security_group_rule" "controller-bgp" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -213,16 +186,6 @@ resource "aws_security_group" "worker" {
|
|||||||
tags = "${map("Name", "${var.cluster_name}-worker")}"
|
tags = "${map("Name", "${var.cluster_name}-worker")}"
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-icmp" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "icmp"
|
|
||||||
from_port = 0
|
|
||||||
to_port = 0
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-ssh" {
|
resource "aws_security_group_rule" "worker-ssh" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -273,6 +236,7 @@ resource "aws_security_group_rule" "worker-flannel-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
resource "aws_security_group_rule" "worker-node-exporter" {
|
resource "aws_security_group_rule" "worker-node-exporter" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -293,6 +257,7 @@ resource "aws_security_group_rule" "ingress-health" {
|
|||||||
cidr_blocks = ["0.0.0.0/0"]
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelets for exec, log, port-forward
|
||||||
resource "aws_security_group_rule" "worker-kubelet" {
|
resource "aws_security_group_rule" "worker-kubelet" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -303,6 +268,7 @@ resource "aws_security_group_rule" "worker-kubelet" {
|
|||||||
source_security_group_id = "${aws_security_group.controller.id}"
|
source_security_group_id = "${aws_security_group.controller.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape kubelet metrics
|
||||||
resource "aws_security_group_rule" "worker-kubelet-self" {
|
resource "aws_security_group_rule" "worker-kubelet-self" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -313,26 +279,6 @@ resource "aws_security_group_rule" "worker-kubelet-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-kubelet-read" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
source_security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-kubelet-read-self" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
self = true
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-bgp" {
|
resource "aws_security_group_rule" "worker-bgp" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
|
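The ICMP and kubelet read-only-port (10255) rules are removed outright rather than narrowed. Operators who still want ping to reach workers can re-add an equivalent rule outside the module; a sketch, assuming a cluster module named `tempest` that exports `worker_security_groups` (as the outputs above do):

```hcl
# Hypothetical: restore ingress ICMP to workers, which this change removes.
resource "aws_security_group_rule" "worker-icmp" {
  security_group_id = "${element(module.tempest.worker_security_groups, 0)}"

  type        = "ingress"
  protocol    = "icmp"
  from_port   = 0
  to_port     = 0
  cidr_blocks = ["0.0.0.0/0"]
}
```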
@ -31,13 +31,13 @@ variable "worker_count" {
|
|||||||
|
|
||||||
variable "controller_type" {
|
variable "controller_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type for controllers"
|
description = "EC2 instance type for controllers"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_type" {
|
variable "worker_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type for workers"
|
description = "EC2 instance type for workers"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -59,6 +59,12 @@ variable "disk_type" {
|
|||||||
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "disk_iops" {
|
||||||
|
type = "string"
|
||||||
|
default = "0"
|
||||||
|
description = "IOPS of the EBS volume (e.g. 100)"
|
||||||
|
}
|
||||||
|
|
||||||
variable "worker_price" {
|
variable "worker_price" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = ""
|
default = ""
|
||||||
@ -116,7 +122,7 @@ variable "pod_cidr" {
|
|||||||
variable "service_cidr" {
|
variable "service_cidr" {
|
||||||
description = <<EOD
|
description = <<EOD
|
||||||
CIDR IPv4 range to assign Kubernetes services.
|
CIDR IPv4 range to assign Kubernetes services.
|
||||||
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for kube-dns.
|
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
|
||||||
EOD
|
EOD
|
||||||
|
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -124,7 +130,13 @@ EOD
|
|||||||
}
|
}
|
||||||
|
|
||||||
variable "cluster_domain_suffix" {
|
variable "cluster_domain_suffix" {
|
||||||
description = "Queries for domains with the suffix will be answered by kube-dns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "cluster.local"
|
default = "cluster.local"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "enable_reporting" {
|
||||||
|
type = "string"
|
||||||
|
description = "Enable usage or analytics reporting to upstreams (Calico)"
|
||||||
|
default = "false"
|
||||||
|
}
|
||||||
|
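Taken together, the new defaults and variables change what a typical cluster definition passes in. A sketch of a module invocation exercising them; the module name, source path, and values are placeholders, and other required inputs (DNS zone, SSH key, and so on) are omitted for brevity:

```hcl
module "tempest" {
  source = "git::https://github.com/poseidon/typhoon//aws/container-linux/kubernetes"

  cluster_name = "tempest"

  # t3 instances are the new default class
  controller_type = "t3.small"
  worker_type     = "t3.small"

  # disk_iops only takes effect with provisioned-IOPS (io1) volumes
  disk_type = "io1"
  disk_iops = "400"

  # Calico usage reporting stays off unless explicitly enabled
  enable_reporting = "false"
}
```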
@ -13,7 +13,7 @@ module "workers" {
|
|||||||
spot_price = "${var.worker_price}"
|
spot_price = "${var.worker_price}"
|
||||||
|
|
||||||
# configuration
|
# configuration
|
||||||
kubeconfig = "${module.bootkube.kubeconfig}"
|
kubeconfig = "${module.bootkube.kubeconfig-kubelet}"
|
||||||
ssh_authorized_key = "${var.ssh_authorized_key}"
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
service_cidr = "${var.service_cidr}"
|
service_cidr = "${var.service_cidr}"
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
@@ -47,12 +47,11 @@ systemd:
         ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
         ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
         ExecStart=/usr/lib/coreos/kubelet-wrapper \
-          --allow-privileged \
           --anonymous-auth=false \
           --authentication-token-webhook \
           --authorization-mode=Webhook \
           --client-ca-file=/etc/kubernetes/ca.crt \
-          --cluster_dns=${k8s_dns_service_ip} \
+          --cluster_dns=${cluster_dns_service_ip} \
           --cluster_domain=${cluster_domain_suffix} \
           --cni-conf-dir=/etc/kubernetes/cni/net.d \
           --exit-on-lock-contention \

@@ -61,6 +60,7 @@ systemd:
           --network-plugin=cni \
           --node-labels=node-role.kubernetes.io/node \
           --pod-manifest-path=/etc/kubernetes/manifests \
+          --read-only-port=0 \
           --volume-plugin-dir=/var/lib/kubelet/volumeplugins
         ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
         Restart=always

@@ -93,7 +93,7 @@ storage:
       contents:
         inline: |
           KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
-          KUBELET_IMAGE_TAG=v1.10.4
+          KUBELET_IMAGE_TAG=v1.13.4
   - path: /etc/sysctl.d/max-user-watches.conf
     filesystem: root
     contents:

@@ -111,7 +111,7 @@ storage:
             --volume config,kind=host,source=/etc/kubernetes \
             --mount volume=config,target=/etc/kubernetes \
             --insecure-options=image \
-            docker://k8s.gcr.io/hyperkube:v1.10.4 \
+            docker://k8s.gcr.io/hyperkube:v1.13.4 \
             --net=host \
             --dns=host \
             --exec=/kubectl -- --kubeconfig=/etc/kubernetes/kubeconfig delete node $(hostname)
@@ -1,39 +1,4 @@
-# Network Load Balancer for Ingress
+# Target groups of instances for use with load balancers
-resource "aws_lb" "ingress" {
-  name = "${var.name}-ingress"
-  load_balancer_type = "network"
-  internal = false
-
-  subnets = ["${var.subnet_ids}"]
-
-  enable_cross_zone_load_balancing = true
-}
-
-# Forward HTTP traffic to workers
-resource "aws_lb_listener" "ingress-http" {
-  load_balancer_arn = "${aws_lb.ingress.arn}"
-  protocol = "TCP"
-  port = 80
-
-  default_action {
-    type = "forward"
-    target_group_arn = "${aws_lb_target_group.workers-http.arn}"
-  }
-}
-
-# Forward HTTPS traffic to workers
-resource "aws_lb_listener" "ingress-https" {
-  load_balancer_arn = "${aws_lb.ingress.arn}"
-  protocol = "TCP"
-  port = 443
-
-  default_action {
-    type = "forward"
-    target_group_arn = "${aws_lb_target_group.workers-https.arn}"
-  }
-}
-
-# Network Load Balancer target groups of instances
-
 resource "aws_lb_target_group" "workers-http" {
   name = "${var.name}-workers-http"
@@ -1,4 +1,9 @@
-output "ingress_dns_name" {
+output "target_group_http" {
-  value = "${aws_lb.ingress.dns_name}"
+  description = "ARN of a target group of workers for HTTP traffic"
-  description = "DNS name of the network load balancer for distributing traffic to Ingress controllers"
+  value = "${aws_lb_target_group.workers-http.arn}"
+}
+
+output "target_group_https" {
+  description = "ARN of a target group of workers for HTTPS traffic"
+  value = "${aws_lb_target_group.workers-https.arn}"
 }
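Exporting the worker target groups (instead of managing a dedicated ingress NLB inside the worker pool) means extra listeners can be attached from outside the module. A sketch, assuming an NLB `aws_lb.custom` defined elsewhere and a worker pool module named `tempest-pool`:

```hcl
# Hypothetical: forward additional TCP traffic from a separately managed NLB
# to the worker instances via the exported target group ARN.
resource "aws_lb_listener" "custom-http" {
  load_balancer_arn = "${aws_lb.custom.arn}"
  protocol          = "TCP"
  port              = 8080

  default_action {
    type             = "forward"
    target_group_arn = "${module.tempest-pool.target_group_http}"
  }
}
```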
@ -30,7 +30,7 @@ variable "count" {
|
|||||||
|
|
||||||
variable "instance_type" {
|
variable "instance_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type"
|
description = "EC2 instance type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -52,6 +52,12 @@ variable "disk_type" {
|
|||||||
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "disk_iops" {
|
||||||
|
type = "string"
|
||||||
|
default = "0"
|
||||||
|
description = "IOPS of the EBS volume (required for io1)"
|
||||||
|
}
|
||||||
|
|
||||||
variable "spot_price" {
|
variable "spot_price" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = ""
|
default = ""
|
||||||
@ -79,7 +85,7 @@ variable "ssh_authorized_key" {
|
|||||||
variable "service_cidr" {
|
variable "service_cidr" {
|
||||||
description = <<EOD
|
description = <<EOD
|
||||||
CIDR IPv4 range to assign Kubernetes services.
|
CIDR IPv4 range to assign Kubernetes services.
|
||||||
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for kube-dns.
|
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
|
||||||
EOD
|
EOD
|
||||||
|
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -87,7 +93,7 @@ EOD
|
|||||||
}
|
}
|
||||||
|
|
||||||
variable "cluster_domain_suffix" {
|
variable "cluster_domain_suffix" {
|
||||||
description = "Queries for domains with the suffix will be answered by kube-dns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "cluster.local"
|
default = "cluster.local"
|
||||||
}
|
}
|
||||||
|
@ -41,16 +41,18 @@ resource "aws_autoscaling_group" "workers" {
|
|||||||
|
|
||||||
# Worker template
|
# Worker template
|
||||||
resource "aws_launch_configuration" "worker" {
|
resource "aws_launch_configuration" "worker" {
|
||||||
image_id = "${local.ami_id}"
|
image_id = "${local.ami_id}"
|
||||||
instance_type = "${var.instance_type}"
|
instance_type = "${var.instance_type}"
|
||||||
spot_price = "${var.spot_price}"
|
spot_price = "${var.spot_price}"
|
||||||
|
enable_monitoring = false
|
||||||
|
|
||||||
user_data = "${data.ct_config.worker_ign.rendered}"
|
user_data = "${data.ct_config.worker-ignition.rendered}"
|
||||||
|
|
||||||
# storage
|
# storage
|
||||||
root_block_device {
|
root_block_device {
|
||||||
volume_type = "${var.disk_type}"
|
volume_type = "${var.disk_type}"
|
||||||
volume_size = "${var.disk_size}"
|
volume_size = "${var.disk_size}"
|
||||||
|
iops = "${var.disk_iops}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# network
|
# network
|
||||||
@ -63,20 +65,21 @@ resource "aws_launch_configuration" "worker" {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Worker Container Linux Config
|
# Worker Ignition config
|
||||||
data "template_file" "worker_config" {
|
data "ct_config" "worker-ignition" {
|
||||||
template = "${file("${path.module}/cl/worker.yaml.tmpl")}"
|
content = "${data.template_file.worker-config.rendered}"
|
||||||
|
|
||||||
vars = {
|
|
||||||
kubeconfig = "${indent(10, var.kubeconfig)}"
|
|
||||||
ssh_authorized_key = "${var.ssh_authorized_key}"
|
|
||||||
k8s_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data "ct_config" "worker_ign" {
|
|
||||||
content = "${data.template_file.worker_config.rendered}"
|
|
||||||
pretty_print = false
|
pretty_print = false
|
||||||
snippets = ["${var.clc_snippets}"]
|
snippets = ["${var.clc_snippets}"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Worker Container Linux config
|
||||||
|
data "template_file" "worker-config" {
|
||||||
|
template = "${file("${path.module}/cl/worker.yaml.tmpl")}"
|
||||||
|
|
||||||
|
vars = {
|
||||||
|
kubeconfig = "${indent(10, var.kubeconfig)}"
|
||||||
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
|
cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
||||||
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
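Worker Ignition is now produced the same way as the controllers': render the Container Linux Config template first, then transpile it with `ct_config`. A standalone sketch of that two-step pattern with a deliberately trivial config (the names and contents are illustrative only):

```hcl
# Render a minimal Container Linux Config, then transpile it to Ignition.
data "template_file" "example-config" {
  template = <<EOF
passwd:
  users:
    - name: core
      ssh_authorized_keys:
        - $${ssh_key}
EOF

  vars {
    ssh_key = "ssh-ed25519 AAAA...example"
  }
}

data "ct_config" "example-ignition" {
  content      = "${data.template_file.example-config.rendered}"
  pretty_print = false
}
```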
@@ -11,10 +11,10 @@ Typhoon distributes upstream Kubernetes, architectural conventions, and cluster

 ## Features <a href="https://www.cncf.io/certification/software-conformance/"><img align="right" src="https://storage.googleapis.com/poseidon/certified-kubernetes.png"></a>

-* Kubernetes v1.10.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
+* Kubernetes v1.13.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
-* Single or multi-master, workloads isolated on workers, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
+* Single or multi-master, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
 * On-cluster etcd with TLS, [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/)-enabled, [network policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
-* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/)
+* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/) and [spot](https://typhoon.psdn.io/cl/aws/#spot) workers
 * Ready for Ingress, Prometheus, Grafana, and other optional [addons](https://typhoon.psdn.io/addons/overview/)

 ## Docs
@ -14,6 +14,6 @@ data "aws_ami" "fedora" {
|
|||||||
|
|
||||||
filter {
|
filter {
|
||||||
name = "name"
|
name = "name"
|
||||||
values = ["Fedora-Atomic-27-20180419.0.x86_64-*-gp2-*"]
|
values = ["Fedora-AtomicHost-28-20180625.1.x86_64-*-gp2-*"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# Self-hosted Kubernetes assets (kubeconfig, manifests)
|
# Self-hosted Kubernetes assets (kubeconfig, manifests)
|
||||||
module "bootkube" {
|
module "bootkube" {
|
||||||
source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=0e98e89e14a074768db13c4e050ed0c13319a0c1"
|
source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=953521dbba49eb6a39204f30a3978730eac01e11"
|
||||||
|
|
||||||
cluster_name = "${var.cluster_name}"
|
cluster_name = "${var.cluster_name}"
|
||||||
api_servers = ["${format("%s.%s", var.cluster_name, var.dns_zone)}"]
|
api_servers = ["${format("%s.%s", var.cluster_name, var.dns_zone)}"]
|
||||||
@ -11,6 +11,7 @@ module "bootkube" {
|
|||||||
pod_cidr = "${var.pod_cidr}"
|
pod_cidr = "${var.pod_cidr}"
|
||||||
service_cidr = "${var.service_cidr}"
|
service_cidr = "${var.service_cidr}"
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
enable_reporting = "${var.enable_reporting}"
|
||||||
|
|
||||||
# Fedora
|
# Fedora
|
||||||
trusted_certs_dir = "/etc/pki/tls/certs"
|
trusted_certs_dir = "/etc/pki/tls/certs"
|
||||||
|
@ -19,24 +19,9 @@ write_files:
|
|||||||
ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt
|
ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt
|
||||||
ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key
|
ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key
|
||||||
ETCD_PEER_CLIENT_CERT_AUTH=true
|
ETCD_PEER_CLIENT_CERT_AUTH=true
|
||||||
- path: /etc/systemd/system/cloud-metadata.service
|
|
||||||
content: |
|
|
||||||
[Unit]
|
|
||||||
Description=Cloud metadata agent
|
|
||||||
[Service]
|
|
||||||
Type=oneshot
|
|
||||||
Environment=OUTPUT=/run/metadata/cloud
|
|
||||||
ExecStart=/usr/bin/mkdir -p /run/metadata
|
|
||||||
ExecStart=/usr/bin/bash -c 'echo "HOSTNAME_OVERRIDE=$(curl\
|
|
||||||
--url http://169.254.169.254/latest/meta-data/local-ipv4\
|
|
||||||
--retry 10)" > $${OUTPUT}'
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
- path: /etc/systemd/system/kubelet.service.d/10-typhoon.conf
|
- path: /etc/systemd/system/kubelet.service.d/10-typhoon.conf
|
||||||
content: |
|
content: |
|
||||||
[Unit]
|
[Unit]
|
||||||
Requires=cloud-metadata.service
|
|
||||||
After=cloud-metadata.service
|
|
||||||
Wants=rpc-statd.service
|
Wants=rpc-statd.service
|
||||||
[Service]
|
[Service]
|
||||||
ExecStartPre=/bin/mkdir -p /opt/cni/bin
|
ExecStartPre=/bin/mkdir -p /opt/cni/bin
|
||||||
@ -51,12 +36,11 @@ write_files:
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
- path: /etc/kubernetes/kubelet.conf
|
- path: /etc/kubernetes/kubelet.conf
|
||||||
content: |
|
content: |
|
||||||
ARGS="--allow-privileged \
|
ARGS="--anonymous-auth=false \
|
||||||
--anonymous-auth=false \
|
|
||||||
--authentication-token-webhook \
|
--authentication-token-webhook \
|
||||||
--authorization-mode=Webhook \
|
--authorization-mode=Webhook \
|
||||||
--client-ca-file=/etc/kubernetes/ca.crt \
|
--client-ca-file=/etc/kubernetes/ca.crt \
|
||||||
--cluster_dns=${k8s_dns_service_ip} \
|
--cluster_dns=${cluster_dns_service_ip} \
|
||||||
--cluster_domain=${cluster_domain_suffix} \
|
--cluster_domain=${cluster_domain_suffix} \
|
||||||
--cni-conf-dir=/etc/kubernetes/cni/net.d \
|
--cni-conf-dir=/etc/kubernetes/cni/net.d \
|
||||||
--exit-on-lock-contention \
|
--exit-on-lock-contention \
|
||||||
@ -66,6 +50,7 @@ write_files:
|
|||||||
--node-labels=node-role.kubernetes.io/master \
|
--node-labels=node-role.kubernetes.io/master \
|
||||||
--node-labels=node-role.kubernetes.io/controller="true" \
|
--node-labels=node-role.kubernetes.io/controller="true" \
|
||||||
--pod-manifest-path=/etc/kubernetes/manifests \
|
--pod-manifest-path=/etc/kubernetes/manifests \
|
||||||
|
--read-only-port=0 \
|
||||||
--register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
|
--register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
|
||||||
--volume-plugin-dir=/var/lib/kubelet/volumeplugins"
|
--volume-plugin-dir=/var/lib/kubelet/volumeplugins"
|
||||||
- path: /etc/kubernetes/kubeconfig
|
- path: /etc/kubernetes/kubeconfig
|
||||||
@ -93,11 +78,10 @@ bootcmd:
|
|||||||
runcmd:
|
runcmd:
|
||||||
- [systemctl, daemon-reload]
|
- [systemctl, daemon-reload]
|
||||||
- [systemctl, restart, NetworkManager]
|
- [systemctl, restart, NetworkManager]
|
||||||
- "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.6"
|
- "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.12"
|
||||||
- "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.4"
|
- "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.13.4"
|
||||||
- "atomic install --system --name=bootkube quay.io/poseidon/bootkube:v0.12.0"
|
- "atomic install --system --name=bootkube quay.io/poseidon/bootkube:v0.14.0"
|
||||||
- [systemctl, start, --no-block, etcd.service]
|
- [systemctl, start, --no-block, etcd.service]
|
||||||
- [systemctl, enable, cloud-metadata.service]
|
|
||||||
- [systemctl, start, --no-block, kubelet.service]
|
- [systemctl, start, --no-block, kubelet.service]
|
||||||
users:
|
users:
|
||||||
- default
|
- default
|
||||||
|
@ -30,6 +30,7 @@ resource "aws_instance" "controllers" {
|
|||||||
root_block_device {
|
root_block_device {
|
||||||
volume_type = "${var.disk_type}"
|
volume_type = "${var.disk_type}"
|
||||||
volume_size = "${var.disk_size}"
|
volume_size = "${var.disk_size}"
|
||||||
|
iops = "${var.disk_iops}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# network
|
# network
|
||||||
@ -38,7 +39,10 @@ resource "aws_instance" "controllers" {
|
|||||||
vpc_security_group_ids = ["${aws_security_group.controller.id}"]
|
vpc_security_group_ids = ["${aws_security_group.controller.id}"]
|
||||||
|
|
||||||
lifecycle {
|
lifecycle {
|
||||||
ignore_changes = ["ami"]
|
ignore_changes = [
|
||||||
|
"ami",
|
||||||
|
"user_data",
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -56,10 +60,10 @@ data "template_file" "controller-cloudinit" {
|
|||||||
# etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,...
|
# etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,...
|
||||||
etcd_initial_cluster = "${join(",", data.template_file.etcds.*.rendered)}"
|
etcd_initial_cluster = "${join(",", data.template_file.etcds.*.rendered)}"
|
||||||
|
|
||||||
kubeconfig = "${indent(6, module.bootkube.kubeconfig)}"
|
kubeconfig = "${indent(6, module.bootkube.kubeconfig-kubelet)}"
|
||||||
ssh_authorized_key = "${var.ssh_authorized_key}"
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
k8s_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,8 +72,8 @@ data "template_file" "etcds" {
|
|||||||
template = "etcd$${index}=https://$${cluster_name}-etcd$${index}.$${dns_zone}:2380"
|
template = "etcd$${index}=https://$${cluster_name}-etcd$${index}.$${dns_zone}:2380"
|
||||||
|
|
||||||
vars {
|
vars {
|
||||||
index = "${count.index}"
|
index = "${count.index}"
|
||||||
cluster_name = "${var.cluster_name}"
|
cluster_name = "${var.cluster_name}"
|
||||||
dns_zone = "${var.dns_zone}"
|
dns_zone = "${var.dns_zone}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,17 +5,17 @@ resource "aws_route53_record" "apiserver" {
|
|||||||
name = "${format("%s.%s.", var.cluster_name, var.dns_zone)}"
|
name = "${format("%s.%s.", var.cluster_name, var.dns_zone)}"
|
||||||
type = "A"
|
type = "A"
|
||||||
|
|
||||||
# AWS recommends their special "alias" records for ELBs
|
# AWS recommends their special "alias" records for NLBs
|
||||||
alias {
|
alias {
|
||||||
name = "${aws_lb.apiserver.dns_name}"
|
name = "${aws_lb.nlb.dns_name}"
|
||||||
zone_id = "${aws_lb.apiserver.zone_id}"
|
zone_id = "${aws_lb.nlb.zone_id}"
|
||||||
evaluate_target_health = true
|
evaluate_target_health = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Network Load Balancer for apiservers
|
# Network Load Balancer for apiservers and ingress
|
||||||
resource "aws_lb" "apiserver" {
|
resource "aws_lb" "nlb" {
|
||||||
name = "${var.cluster_name}-apiserver"
|
name = "${var.cluster_name}-nlb"
|
||||||
load_balancer_type = "network"
|
load_balancer_type = "network"
|
||||||
internal = false
|
internal = false
|
||||||
|
|
||||||
@ -24,11 +24,11 @@ resource "aws_lb" "apiserver" {
|
|||||||
enable_cross_zone_load_balancing = true
|
enable_cross_zone_load_balancing = true
|
||||||
}
|
}
|
||||||
|
|
||||||
# Forward TCP traffic to controllers
|
# Forward TCP apiserver traffic to controllers
|
||||||
resource "aws_lb_listener" "apiserver-https" {
|
resource "aws_lb_listener" "apiserver-https" {
|
||||||
load_balancer_arn = "${aws_lb.apiserver.arn}"
|
load_balancer_arn = "${aws_lb.nlb.arn}"
|
||||||
protocol = "TCP"
|
protocol = "TCP"
|
||||||
port = "443"
|
port = "6443"
|
||||||
|
|
||||||
default_action {
|
default_action {
|
||||||
type = "forward"
|
type = "forward"
|
||||||
@ -36,6 +36,30 @@ resource "aws_lb_listener" "apiserver-https" {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Forward HTTP ingress traffic to workers
|
||||||
|
resource "aws_lb_listener" "ingress-http" {
|
||||||
|
load_balancer_arn = "${aws_lb.nlb.arn}"
|
||||||
|
protocol = "TCP"
|
||||||
|
port = 80
|
||||||
|
|
||||||
|
default_action {
|
||||||
|
type = "forward"
|
||||||
|
target_group_arn = "${module.workers.target_group_http}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Forward HTTPS ingress traffic to workers
|
||||||
|
resource "aws_lb_listener" "ingress-https" {
|
||||||
|
load_balancer_arn = "${aws_lb.nlb.arn}"
|
||||||
|
protocol = "TCP"
|
||||||
|
port = 443
|
||||||
|
|
||||||
|
default_action {
|
||||||
|
type = "forward"
|
||||||
|
target_group_arn = "${module.workers.target_group_https}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# Target group of controllers
|
# Target group of controllers
|
||||||
resource "aws_lb_target_group" "controllers" {
|
resource "aws_lb_target_group" "controllers" {
|
||||||
name = "${var.cluster_name}-controllers"
|
name = "${var.cluster_name}-controllers"
|
||||||
@ -43,12 +67,12 @@ resource "aws_lb_target_group" "controllers" {
|
|||||||
target_type = "instance"
|
target_type = "instance"
|
||||||
|
|
||||||
protocol = "TCP"
|
protocol = "TCP"
|
||||||
port = 443
|
port = 6443
|
||||||
|
|
||||||
# TCP health check for apiserver
|
# TCP health check for apiserver
|
||||||
health_check {
|
health_check {
|
||||||
protocol = "TCP"
|
protocol = "TCP"
|
||||||
port = 443
|
port = 6443
|
||||||
|
|
||||||
# NLBs required to use same healthy and unhealthy thresholds
|
# NLBs required to use same healthy and unhealthy thresholds
|
||||||
healthy_threshold = 3
|
healthy_threshold = 3
|
||||||
@ -65,5 +89,5 @@ resource "aws_lb_target_group_attachment" "controllers" {
|
|||||||
|
|
||||||
target_group_arn = "${aws_lb_target_group.controllers.arn}"
|
target_group_arn = "${aws_lb_target_group.controllers.arn}"
|
||||||
target_id = "${element(aws_instance.controllers.*.id, count.index)}"
|
target_id = "${element(aws_instance.controllers.*.id, count.index)}"
|
||||||
port = 443
|
port = 6443
|
||||||
}
|
}
|
@ -1,8 +1,19 @@
|
|||||||
|
output "kubeconfig-admin" {
|
||||||
|
value = "${module.bootkube.kubeconfig-admin}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Outputs for Kubernetes Ingress
|
||||||
|
|
||||||
output "ingress_dns_name" {
|
output "ingress_dns_name" {
|
||||||
value = "${module.workers.ingress_dns_name}"
|
value = "${aws_lb.nlb.dns_name}"
|
||||||
description = "DNS name of the network load balancer for distributing traffic to Ingress controllers"
|
description = "DNS name of the network load balancer for distributing traffic to Ingress controllers"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
output "ingress_zone_id" {
|
||||||
|
value = "${aws_lb.nlb.zone_id}"
|
||||||
|
description = "Route53 zone id of the network load balancer DNS name that can be used in Route53 alias records"
|
||||||
|
}
|
||||||
|
|
||||||
# Outputs for worker pools
|
# Outputs for worker pools
|
||||||
|
|
||||||
output "vpc_id" {
|
output "vpc_id" {
|
||||||
@ -21,5 +32,17 @@ output "worker_security_groups" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
output "kubeconfig" {
|
output "kubeconfig" {
|
||||||
value = "${module.bootkube.kubeconfig}"
|
value = "${module.bootkube.kubeconfig-kubelet}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Outputs for custom load balancing
|
||||||
|
|
||||||
|
output "worker_target_group_http" {
|
||||||
|
description = "ARN of a target group of workers for HTTP traffic"
|
||||||
|
value = "${module.workers.target_group_http}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "worker_target_group_https" {
|
||||||
|
description = "ARN of a target group of workers for HTTPS traffic"
|
||||||
|
value = "${module.workers.target_group_https}"
|
||||||
}
|
}
|
||||||
|
@ -11,16 +11,6 @@ resource "aws_security_group" "controller" {
|
|||||||
tags = "${map("Name", "${var.cluster_name}-controller")}"
|
tags = "${map("Name", "${var.cluster_name}-controller")}"
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-icmp" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "icmp"
|
|
||||||
from_port = 0
|
|
||||||
to_port = 0
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-ssh" {
|
resource "aws_security_group_rule" "controller-ssh" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -31,16 +21,6 @@ resource "aws_security_group_rule" "controller-ssh" {
|
|||||||
cidr_blocks = ["0.0.0.0/0"]
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-apiserver" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 443
|
|
||||||
to_port = 443
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-etcd" {
|
resource "aws_security_group_rule" "controller-etcd" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -51,6 +31,7 @@ resource "aws_security_group_rule" "controller-etcd" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape etcd metrics
|
||||||
resource "aws_security_group_rule" "controller-etcd-metrics" {
|
resource "aws_security_group_rule" "controller-etcd-metrics" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -61,6 +42,16 @@ resource "aws_security_group_rule" "controller-etcd-metrics" {
|
|||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
source_security_group_id = "${aws_security_group.worker.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resource "aws_security_group_rule" "controller-apiserver" {
|
||||||
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
|
type = "ingress"
|
||||||
|
protocol = "tcp"
|
||||||
|
from_port = 6443
|
||||||
|
to_port = 6443
|
||||||
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-flannel" {
|
resource "aws_security_group_rule" "controller-flannel" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -81,6 +72,7 @@ resource "aws_security_group_rule" "controller-flannel-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
resource "aws_security_group_rule" "controller-node-exporter" {
|
resource "aws_security_group_rule" "controller-node-exporter" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -91,6 +83,7 @@ resource "aws_security_group_rule" "controller-node-exporter" {
|
|||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
source_security_group_id = "${aws_security_group.worker.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelets for exec, log, port-forward
|
||||||
resource "aws_security_group_rule" "controller-kubelet" {
|
resource "aws_security_group_rule" "controller-kubelet" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -111,26 +104,6 @@ resource "aws_security_group_rule" "controller-kubelet-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-kubelet-read" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
source_security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-kubelet-read-self" {
|
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
self = true
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "controller-bgp" {
|
resource "aws_security_group_rule" "controller-bgp" {
|
||||||
security_group_id = "${aws_security_group.controller.id}"
|
security_group_id = "${aws_security_group.controller.id}"
|
||||||
|
|
||||||
@ -213,16 +186,6 @@ resource "aws_security_group" "worker" {
|
|||||||
tags = "${map("Name", "${var.cluster_name}-worker")}"
|
tags = "${map("Name", "${var.cluster_name}-worker")}"
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-icmp" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "icmp"
|
|
||||||
from_port = 0
|
|
||||||
to_port = 0
|
|
||||||
cidr_blocks = ["0.0.0.0/0"]
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-ssh" {
|
resource "aws_security_group_rule" "worker-ssh" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -273,6 +236,7 @@ resource "aws_security_group_rule" "worker-flannel-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
resource "aws_security_group_rule" "worker-node-exporter" {
|
resource "aws_security_group_rule" "worker-node-exporter" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -293,6 +257,7 @@ resource "aws_security_group_rule" "ingress-health" {
|
|||||||
cidr_blocks = ["0.0.0.0/0"]
|
cidr_blocks = ["0.0.0.0/0"]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelets for exec, log, port-forward
|
||||||
resource "aws_security_group_rule" "worker-kubelet" {
|
resource "aws_security_group_rule" "worker-kubelet" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -303,6 +268,7 @@ resource "aws_security_group_rule" "worker-kubelet" {
|
|||||||
source_security_group_id = "${aws_security_group.controller.id}"
|
source_security_group_id = "${aws_security_group.controller.id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape kubelet metrics
|
||||||
resource "aws_security_group_rule" "worker-kubelet-self" {
|
resource "aws_security_group_rule" "worker-kubelet-self" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
@ -313,26 +279,6 @@ resource "aws_security_group_rule" "worker-kubelet-self" {
|
|||||||
self = true
|
self = true
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-kubelet-read" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
source_security_group_id = "${aws_security_group.controller.id}"
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-kubelet-read-self" {
|
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
|
||||||
|
|
||||||
type = "ingress"
|
|
||||||
protocol = "tcp"
|
|
||||||
from_port = 10255
|
|
||||||
to_port = 10255
|
|
||||||
self = true
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "aws_security_group_rule" "worker-bgp" {
|
resource "aws_security_group_rule" "worker-bgp" {
|
||||||
security_group_id = "${aws_security_group.worker.id}"
|
security_group_id = "${aws_security_group.worker.id}"
|
||||||
|
|
||||||
|
@ -31,13 +31,13 @@ variable "worker_count" {
|
|||||||
|
|
||||||
variable "controller_type" {
|
variable "controller_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type for controllers"
|
description = "EC2 instance type for controllers"
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_type" {
|
variable "worker_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type for workers"
|
description = "EC2 instance type for workers"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,6 +53,12 @@ variable "disk_type" {
|
|||||||
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "disk_iops" {
|
||||||
|
type = "string"
|
||||||
|
default = "0"
|
||||||
|
description = "IOPS of the EBS volume (e.g. 100)"
|
||||||
|
}
|
||||||
|
|
||||||
variable "worker_price" {
|
variable "worker_price" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = ""
|
default = ""
|
||||||
@ -98,7 +104,7 @@ variable "pod_cidr" {
|
|||||||
variable "service_cidr" {
|
variable "service_cidr" {
|
||||||
description = <<EOD
|
description = <<EOD
|
||||||
CIDR IPv4 range to assign Kubernetes services.
|
CIDR IPv4 range to assign Kubernetes services.
|
||||||
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for kube-dns.
|
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
|
||||||
EOD
|
EOD
|
||||||
|
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -106,7 +112,13 @@ EOD
|
|||||||
}
|
}
|
||||||
|
|
||||||
variable "cluster_domain_suffix" {
|
variable "cluster_domain_suffix" {
|
||||||
description = "Queries for domains with the suffix will be answered by kube-dns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "cluster.local"
|
default = "cluster.local"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "enable_reporting" {
|
||||||
|
type = "string"
|
||||||
|
description = "Enable usage or analytics reporting to upstreams (Calico)"
|
||||||
|
default = "false"
|
||||||
|
}
|
||||||
|
@ -12,7 +12,7 @@ module "workers" {
|
|||||||
spot_price = "${var.worker_price}"
|
spot_price = "${var.worker_price}"
|
||||||
|
|
||||||
# configuration
|
# configuration
|
||||||
kubeconfig = "${module.bootkube.kubeconfig}"
|
kubeconfig = "${module.bootkube.kubeconfig-kubelet}"
|
||||||
ssh_authorized_key = "${var.ssh_authorized_key}"
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
service_cidr = "${var.service_cidr}"
|
service_cidr = "${var.service_cidr}"
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
@ -14,6 +14,6 @@ data "aws_ami" "fedora" {
|
|||||||
|
|
||||||
filter {
|
filter {
|
||||||
name = "name"
|
name = "name"
|
||||||
values = ["Fedora-Atomic-27-20180419.0.x86_64-*-gp2-*"]
|
values = ["Fedora-AtomicHost-28-20180625.1.x86_64-*-gp2-*"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1,23 +1,8 @@
 #cloud-config
 write_files:
-  - path: /etc/systemd/system/cloud-metadata.service
-    content: |
-      [Unit]
-      Description=Cloud metadata agent
-      [Service]
-      Type=oneshot
-      Environment=OUTPUT=/run/metadata/cloud
-      ExecStart=/usr/bin/mkdir -p /run/metadata
-      ExecStart=/usr/bin/bash -c 'echo "HOSTNAME_OVERRIDE=$(curl\
-        --url http://169.254.169.254/latest/meta-data/local-ipv4\
-        --retry 10)" > $${OUTPUT}'
-      [Install]
-      WantedBy=multi-user.target
   - path: /etc/systemd/system/kubelet.service.d/10-typhoon.conf
     content: |
       [Unit]
-      Requires=cloud-metadata.service
-      After=cloud-metadata.service
       Wants=rpc-statd.service
       [Service]
       ExecStartPre=/bin/mkdir -p /opt/cni/bin

@@ -30,12 +15,11 @@ write_files:
       RestartSec=10
   - path: /etc/kubernetes/kubelet.conf
     content: |
-      ARGS="--allow-privileged \
+      ARGS="--anonymous-auth=false \
-        --anonymous-auth=false \
        --authentication-token-webhook \
        --authorization-mode=Webhook \
        --client-ca-file=/etc/kubernetes/ca.crt \
-        --cluster_dns=${k8s_dns_service_ip} \
+        --cluster_dns=${cluster_dns_service_ip} \
        --cluster_domain=${cluster_domain_suffix} \
        --cni-conf-dir=/etc/kubernetes/cni/net.d \
        --exit-on-lock-contention \

@@ -44,6 +28,7 @@ write_files:
        --network-plugin=cni \
        --node-labels=node-role.kubernetes.io/node \
        --pod-manifest-path=/etc/kubernetes/manifests \
+        --read-only-port=0 \
        --volume-plugin-dir=/var/lib/kubelet/volumeplugins"
   - path: /etc/kubernetes/kubeconfig
     permissions: '0644'

@@ -69,8 +54,7 @@ bootcmd:
 runcmd:
   - [systemctl, daemon-reload]
   - [systemctl, restart, NetworkManager]
-  - [systemctl, enable, cloud-metadata.service]
-  - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.4"
+  - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.13.4"
   - [systemctl, start, --no-block, kubelet.service]
 users:
   - default
@@ -1,39 +1,4 @@
-# Network Load Balancer for Ingress
+# Target groups of instances for use with load balancers
-resource "aws_lb" "ingress" {
-  name = "${var.name}-ingress"
-  load_balancer_type = "network"
-  internal = false
-
-  subnets = ["${var.subnet_ids}"]
-
-  enable_cross_zone_load_balancing = true
-}
-
-# Forward HTTP traffic to workers
-resource "aws_lb_listener" "ingress-http" {
-  load_balancer_arn = "${aws_lb.ingress.arn}"
-  protocol = "TCP"
-  port = 80
-
-  default_action {
-    type = "forward"
-    target_group_arn = "${aws_lb_target_group.workers-http.arn}"
-  }
-}
-
-# Forward HTTPS traffic to workers
-resource "aws_lb_listener" "ingress-https" {
-  load_balancer_arn = "${aws_lb.ingress.arn}"
-  protocol = "TCP"
-  port = 443
-
-  default_action {
-    type = "forward"
-    target_group_arn = "${aws_lb_target_group.workers-https.arn}"
-  }
-}
-
-# Network Load Balancer target groups of instances
-
 resource "aws_lb_target_group" "workers-http" {
   name = "${var.name}-workers-http"
@@ -1,4 +1,9 @@
-output "ingress_dns_name" {
-  value       = "${aws_lb.ingress.dns_name}"
-  description = "DNS name of the network load balancer for distributing traffic to Ingress controllers"
+output "target_group_http" {
+  description = "ARN of a target group of workers for HTTP traffic"
+  value       = "${aws_lb_target_group.workers-http.arn}"
+}
+
+output "target_group_https" {
+  description = "ARN of a target group of workers for HTTPS traffic"
+  value       = "${aws_lb_target_group.workers-https.arn}"
 }
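Since the worker pool module now exports target group ARNs instead of its own NLB DNS name, a cluster-level listener can forward to those groups. A minimal sketch, assuming an existing cluster NLB named "aws_lb.nlb" and a pool instantiated as module "pool" (both names are illustrative, not part of this change):

# Sketch: forward TCP/80 from a cluster-managed NLB to a worker pool's
# HTTP target group, consumed via the new target_group_http output.
resource "aws_lb_listener" "pool-ingress-http" {
  load_balancer_arn = "${aws_lb.nlb.arn}" # assumed cluster NLB
  protocol          = "TCP"
  port              = 80

  default_action {
    type             = "forward"
    target_group_arn = "${module.pool.target_group_http}" # output added above
  }
}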
@ -30,7 +30,7 @@ variable "count" {
|
|||||||
|
|
||||||
variable "instance_type" {
|
variable "instance_type" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "t2.small"
|
default = "t3.small"
|
||||||
description = "EC2 instance type"
|
description = "EC2 instance type"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -46,6 +46,12 @@ variable "disk_type" {
|
|||||||
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
description = "Type of the EBS volume (e.g. standard, gp2, io1)"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "disk_iops" {
|
||||||
|
type = "string"
|
||||||
|
default = "0"
|
||||||
|
description = "IOPS of the EBS volume (required for io1)"
|
||||||
|
}
|
||||||
|
|
||||||
variable "spot_price" {
|
variable "spot_price" {
|
||||||
type = "string"
|
type = "string"
|
||||||
default = ""
|
default = ""
|
||||||
@ -67,7 +73,7 @@ variable "ssh_authorized_key" {
|
|||||||
variable "service_cidr" {
|
variable "service_cidr" {
|
||||||
description = <<EOD
|
description = <<EOD
|
||||||
CIDR IPv4 range to assign Kubernetes services.
|
CIDR IPv4 range to assign Kubernetes services.
|
||||||
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for kube-dns.
|
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
|
||||||
EOD
|
EOD
|
||||||
|
|
||||||
type = "string"
|
type = "string"
|
||||||
@ -75,7 +81,7 @@ EOD
|
|||||||
}
|
}
|
||||||
|
|
||||||
variable "cluster_domain_suffix" {
|
variable "cluster_domain_suffix" {
|
||||||
description = "Queries for domains with the suffix will be answered by kube-dns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
||||||
type = "string"
|
type = "string"
|
||||||
default = "cluster.local"
|
default = "cluster.local"
|
||||||
}
|
}
|
||||||
|
@ -41,9 +41,10 @@ resource "aws_autoscaling_group" "workers" {
|
|||||||
|
|
||||||
# Worker template
|
# Worker template
|
||||||
resource "aws_launch_configuration" "worker" {
|
resource "aws_launch_configuration" "worker" {
|
||||||
image_id = "${data.aws_ami.fedora.image_id}"
|
image_id = "${data.aws_ami.fedora.image_id}"
|
||||||
instance_type = "${var.instance_type}"
|
instance_type = "${var.instance_type}"
|
||||||
spot_price = "${var.spot_price}"
|
spot_price = "${var.spot_price}"
|
||||||
|
enable_monitoring = false
|
||||||
|
|
||||||
user_data = "${data.template_file.worker-cloudinit.rendered}"
|
user_data = "${data.template_file.worker-cloudinit.rendered}"
|
||||||
|
|
||||||
@ -51,6 +52,7 @@ resource "aws_launch_configuration" "worker" {
|
|||||||
root_block_device {
|
root_block_device {
|
||||||
volume_type = "${var.disk_type}"
|
volume_type = "${var.disk_type}"
|
||||||
volume_size = "${var.disk_size}"
|
volume_size = "${var.disk_size}"
|
||||||
|
iops = "${var.disk_iops}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# network
|
# network
|
||||||
@ -68,9 +70,9 @@ data "template_file" "worker-cloudinit" {
|
|||||||
template = "${file("${path.module}/cloudinit/worker.yaml.tmpl")}"
|
template = "${file("${path.module}/cloudinit/worker.yaml.tmpl")}"
|
||||||
|
|
||||||
vars = {
|
vars = {
|
||||||
kubeconfig = "${indent(6, var.kubeconfig)}"
|
kubeconfig = "${indent(6, var.kubeconfig)}"
|
||||||
ssh_authorized_key = "${var.ssh_authorized_key}"
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
k8s_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
||||||
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
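For context, the new iops attribute only applies when the root volume type is io1; with the default disk_iops of "0" and a gp2/standard volume it is effectively ignored. A hedged sketch of a launch configuration using an io1 volume (the AMI id and numbers are placeholders, not values from this change):

# Sketch: io1 root volumes must declare provisioned IOPS explicitly.
resource "aws_launch_configuration" "io1-worker" {
  image_id          = "ami-0123456789abcdef0" # placeholder AMI
  instance_type     = "t3.small"
  enable_monitoring = false

  root_block_device {
    volume_type = "io1"
    volume_size = "100"
    iops        = "5000" # required for io1, ignored for gp2/standard
  }
}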
azure/container-linux/kubernetes/LICENSE (new file, 23 lines)
@@ -0,0 +1,23 @@
The MIT License (MIT)

Copyright (c) 2017 Typhoon Authors
Copyright (c) 2017 Dalton Hubble

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
azure/container-linux/kubernetes/README.md (new file, 23 lines)
@@ -0,0 +1,23 @@
# Typhoon <img align="right" src="https://storage.googleapis.com/poseidon/typhoon-logo.png">

Typhoon is a minimal and free Kubernetes distribution.

* Minimal, stable base Kubernetes distribution
* Declarative infrastructure and configuration
* Free (freedom and cost) and privacy-respecting
* Practical for labs, datacenters, and clouds

Typhoon distributes upstream Kubernetes, architectural conventions, and cluster addons, much like a GNU/Linux distribution provides the Linux kernel and userspace components.

## Features <a href="https://www.cncf.io/certification/software-conformance/"><img align="right" src="https://storage.googleapis.com/poseidon/certified-kubernetes.png"></a>

* Kubernetes v1.13.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
* Single or multi-master, [flannel](https://github.com/coreos/flannel) networking
* On-cluster etcd with TLS, [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/)-enabled
* Advanced features like [worker pools](https://typhoon.psdn.io/advanced/worker-pools/), [low-priority](https://typhoon.psdn.io/cl/azure/#low-priority) workers, and [snippets](https://typhoon.psdn.io/advanced/customization/#container-linux) customization
* Ready for Ingress, Prometheus, Grafana, and other optional [addons](https://typhoon.psdn.io/addons/overview/)

## Docs

Please see the [official docs](https://typhoon.psdn.io) and the Azure [tutorial](https://typhoon.psdn.io/cl/azure/).
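For orientation (not part of this change), a cluster built from the new module would be declared roughly as below; the source ref, names, and paths are placeholder assumptions, and the arguments correspond to the variables defined in variables.tf further down:

module "azure-ramius" {
  source = "git::https://github.com/poseidon/typhoon//azure/container-linux/kubernetes?ref=<release-tag>"

  # Azure
  cluster_name   = "ramius"            # placeholder
  region         = "centralus"
  dns_zone       = "azure.example.com" # placeholder zone
  dns_zone_group = "example-group"

  # configuration
  ssh_authorized_key = "ssh-rsa AAAAB3Nz..." # your public key
  asset_dir          = "/home/user/.secrets/clusters/ramius"

  # optional
  worker_count = 2
}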
azure/container-linux/kubernetes/bootkube.tf (new file, 14 lines)
@@ -0,0 +1,14 @@
# Self-hosted Kubernetes assets (kubeconfig, manifests)
module "bootkube" {
  source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=953521dbba49eb6a39204f30a3978730eac01e11"

  cluster_name          = "${var.cluster_name}"
  api_servers           = ["${format("%s.%s", var.cluster_name, var.dns_zone)}"]
  etcd_servers          = ["${formatlist("%s.%s", azurerm_dns_a_record.etcds.*.name, var.dns_zone)}"]
  asset_dir             = "${var.asset_dir}"
  networking            = "flannel"
  pod_cidr              = "${var.pod_cidr}"
  service_cidr          = "${var.service_cidr}"
  cluster_domain_suffix = "${var.cluster_domain_suffix}"
  enable_reporting      = "${var.enable_reporting}"
}
azure/container-linux/kubernetes/cl/controller.yaml.tmpl (new file, 161 lines)
@@ -0,0 +1,161 @@
---
|
||||||
|
systemd:
|
||||||
|
units:
|
||||||
|
- name: etcd-member.service
|
||||||
|
enable: true
|
||||||
|
dropins:
|
||||||
|
- name: 40-etcd-cluster.conf
|
||||||
|
contents: |
|
||||||
|
[Service]
|
||||||
|
Environment="ETCD_IMAGE_TAG=v3.3.12"
|
||||||
|
Environment="ETCD_NAME=${etcd_name}"
|
||||||
|
Environment="ETCD_ADVERTISE_CLIENT_URLS=https://${etcd_domain}:2379"
|
||||||
|
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
|
||||||
|
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
|
||||||
|
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
|
||||||
|
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
|
||||||
|
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
|
||||||
|
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
|
||||||
|
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
|
||||||
|
Environment="ETCD_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/server-ca.crt"
|
||||||
|
Environment="ETCD_CERT_FILE=/etc/ssl/certs/etcd/server.crt"
|
||||||
|
Environment="ETCD_KEY_FILE=/etc/ssl/certs/etcd/server.key"
|
||||||
|
Environment="ETCD_CLIENT_CERT_AUTH=true"
|
||||||
|
Environment="ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/peer-ca.crt"
|
||||||
|
Environment="ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt"
|
||||||
|
Environment="ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key"
|
||||||
|
Environment="ETCD_PEER_CLIENT_CERT_AUTH=true"
|
||||||
|
- name: docker.service
|
||||||
|
enable: true
|
||||||
|
- name: locksmithd.service
|
||||||
|
mask: true
|
||||||
|
- name: wait-for-dns.service
|
||||||
|
enable: true
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Wait for DNS entries
|
||||||
|
Wants=systemd-resolved.service
|
||||||
|
Before=kubelet.service
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=true
|
||||||
|
ExecStart=/bin/sh -c 'while ! /usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done'
|
||||||
|
[Install]
|
||||||
|
RequiredBy=kubelet.service
|
||||||
|
RequiredBy=etcd-member.service
|
||||||
|
- name: kubelet.service
|
||||||
|
enable: true
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Kubelet via Hyperkube
|
||||||
|
Wants=rpc-statd.service
|
||||||
|
[Service]
|
||||||
|
EnvironmentFile=/etc/kubernetes/kubelet.env
|
||||||
|
Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \
|
||||||
|
--volume=resolv,kind=host,source=/etc/resolv.conf \
|
||||||
|
--mount volume=resolv,target=/etc/resolv.conf \
|
||||||
|
--volume var-lib-cni,kind=host,source=/var/lib/cni \
|
||||||
|
--mount volume=var-lib-cni,target=/var/lib/cni \
|
||||||
|
--volume var-lib-calico,kind=host,source=/var/lib/calico \
|
||||||
|
--mount volume=var-lib-calico,target=/var/lib/calico \
|
||||||
|
--volume opt-cni-bin,kind=host,source=/opt/cni/bin \
|
||||||
|
--mount volume=opt-cni-bin,target=/opt/cni/bin \
|
||||||
|
--volume var-log,kind=host,source=/var/log \
|
||||||
|
--mount volume=var-log,target=/var/log \
|
||||||
|
--insecure-options=image"
|
||||||
|
ExecStartPre=/bin/mkdir -p /opt/cni/bin
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/cni
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/calico
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins
|
||||||
|
ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
|
||||||
|
ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
|
||||||
|
ExecStart=/usr/lib/coreos/kubelet-wrapper \
|
||||||
|
--anonymous-auth=false \
|
||||||
|
--authentication-token-webhook \
|
||||||
|
--authorization-mode=Webhook \
|
||||||
|
--client-ca-file=/etc/kubernetes/ca.crt \
|
||||||
|
--cluster_dns=${cluster_dns_service_ip} \
|
||||||
|
--cluster_domain=${cluster_domain_suffix} \
|
||||||
|
--cni-conf-dir=/etc/kubernetes/cni/net.d \
|
||||||
|
--exit-on-lock-contention \
|
||||||
|
--kubeconfig=/etc/kubernetes/kubeconfig \
|
||||||
|
--lock-file=/var/run/lock/kubelet.lock \
|
||||||
|
--network-plugin=cni \
|
||||||
|
--node-labels=node-role.kubernetes.io/master \
|
||||||
|
--node-labels=node-role.kubernetes.io/controller="true" \
|
||||||
|
--pod-manifest-path=/etc/kubernetes/manifests \
|
||||||
|
--read-only-port=0 \
|
||||||
|
--register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
|
||||||
|
--volume-plugin-dir=/var/lib/kubelet/volumeplugins
|
||||||
|
ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
|
||||||
|
Restart=always
|
||||||
|
RestartSec=10
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
- name: bootkube.service
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Bootstrap a Kubernetes cluster
|
||||||
|
ConditionPathExists=!/opt/bootkube/init_bootkube.done
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=true
|
||||||
|
WorkingDirectory=/opt/bootkube
|
||||||
|
ExecStart=/opt/bootkube/bootkube-start
|
||||||
|
ExecStartPost=/bin/touch /opt/bootkube/init_bootkube.done
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
storage:
|
||||||
|
files:
|
||||||
|
- path: /etc/kubernetes/kubeconfig
|
||||||
|
filesystem: root
|
||||||
|
mode: 0644
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
${kubeconfig}
|
||||||
|
- path: /etc/kubernetes/kubelet.env
|
||||||
|
filesystem: root
|
||||||
|
mode: 0644
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
|
||||||
|
KUBELET_IMAGE_TAG=v1.13.4
|
||||||
|
- path: /etc/sysctl.d/max-user-watches.conf
|
||||||
|
filesystem: root
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
fs.inotify.max_user_watches=16184
|
||||||
|
- path: /opt/bootkube/bootkube-start
|
||||||
|
filesystem: root
|
||||||
|
mode: 0544
|
||||||
|
user:
|
||||||
|
id: 500
|
||||||
|
group:
|
||||||
|
id: 500
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
#!/bin/bash
|
||||||
|
# Wrapper for bootkube start
|
||||||
|
set -e
|
||||||
|
# Move experimental manifests
|
||||||
|
[ -n "$(ls /opt/bootkube/assets/manifests-*/* 2>/dev/null)" ] && mv /opt/bootkube/assets/manifests-*/* /opt/bootkube/assets/manifests && rm -rf /opt/bootkube/assets/manifests-*
|
||||||
|
exec /usr/bin/rkt run \
|
||||||
|
--trust-keys-from-https \
|
||||||
|
--volume assets,kind=host,source=/opt/bootkube/assets \
|
||||||
|
--mount volume=assets,target=/assets \
|
||||||
|
--volume bootstrap,kind=host,source=/etc/kubernetes \
|
||||||
|
--mount volume=bootstrap,target=/etc/kubernetes \
|
||||||
|
$${RKT_OPTS} \
|
||||||
|
quay.io/coreos/bootkube:v0.14.0 \
|
||||||
|
--net=host \
|
||||||
|
--dns=host \
|
||||||
|
--exec=/bootkube -- start --asset-dir=/assets "$@"
|
||||||
|
passwd:
|
||||||
|
users:
|
||||||
|
- name: core
|
||||||
|
ssh_authorized_keys:
|
||||||
|
- "${ssh_authorized_key}"
|
azure/container-linux/kubernetes/controllers.tf (new file, 168 lines)
@@ -0,0 +1,168 @@
# Discrete DNS records for each controller's private IPv4 for etcd usage
|
||||||
|
resource "azurerm_dns_a_record" "etcds" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
resource_group_name = "${var.dns_zone_group}"
|
||||||
|
|
||||||
|
# DNS Zone name where record should be created
|
||||||
|
zone_name = "${var.dns_zone}"
|
||||||
|
|
||||||
|
# DNS record
|
||||||
|
name = "${format("%s-etcd%d", var.cluster_name, count.index)}"
|
||||||
|
ttl = 300
|
||||||
|
|
||||||
|
# private IPv4 address for etcd
|
||||||
|
records = ["${element(azurerm_network_interface.controllers.*.private_ip_address, count.index)}"]
|
||||||
|
}
|
||||||
|
|
||||||
|
locals {
|
||||||
|
# Channel for a Container Linux derivative
|
||||||
|
# coreos-stable -> Container Linux Stable
|
||||||
|
channel = "${element(split("-", var.os_image), 1)}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller availability set to spread controllers
|
||||||
|
resource "azurerm_availability_set" "controllers" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-controllers"
|
||||||
|
location = "${var.region}"
|
||||||
|
platform_fault_domain_count = 2
|
||||||
|
platform_update_domain_count = 4
|
||||||
|
managed = true
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller instances
|
||||||
|
resource "azurerm_virtual_machine" "controllers" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-controller-${count.index}"
|
||||||
|
location = "${var.region}"
|
||||||
|
availability_set_id = "${azurerm_availability_set.controllers.id}"
|
||||||
|
vm_size = "${var.controller_type}"
|
||||||
|
|
||||||
|
# boot
|
||||||
|
storage_image_reference {
|
||||||
|
publisher = "CoreOS"
|
||||||
|
offer = "CoreOS"
|
||||||
|
sku = "${local.channel}"
|
||||||
|
version = "latest"
|
||||||
|
}
|
||||||
|
|
||||||
|
# storage
|
||||||
|
storage_os_disk {
|
||||||
|
name = "${var.cluster_name}-controller-${count.index}"
|
||||||
|
create_option = "FromImage"
|
||||||
|
caching = "ReadWrite"
|
||||||
|
disk_size_gb = "${var.disk_size}"
|
||||||
|
os_type = "Linux"
|
||||||
|
managed_disk_type = "Premium_LRS"
|
||||||
|
}
|
||||||
|
|
||||||
|
# network
|
||||||
|
network_interface_ids = ["${element(azurerm_network_interface.controllers.*.id, count.index)}"]
|
||||||
|
|
||||||
|
os_profile {
|
||||||
|
computer_name = "${var.cluster_name}-controller-${count.index}"
|
||||||
|
admin_username = "core"
|
||||||
|
custom_data = "${element(data.ct_config.controller-ignitions.*.rendered, count.index)}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Azure mandates setting an ssh_key, even though Ignition custom_data handles it too
|
||||||
|
os_profile_linux_config {
|
||||||
|
disable_password_authentication = true
|
||||||
|
|
||||||
|
ssh_keys {
|
||||||
|
path = "/home/core/.ssh/authorized_keys"
|
||||||
|
key_data = "${var.ssh_authorized_key}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# lifecycle
|
||||||
|
delete_os_disk_on_termination = true
|
||||||
|
delete_data_disks_on_termination = true
|
||||||
|
|
||||||
|
lifecycle {
|
||||||
|
ignore_changes = [
|
||||||
|
"storage_os_disk",
|
||||||
|
"os_profile",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller NICs with public and private IPv4
|
||||||
|
resource "azurerm_network_interface" "controllers" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-controller-${count.index}"
|
||||||
|
location = "${azurerm_resource_group.cluster.location}"
|
||||||
|
network_security_group_id = "${azurerm_network_security_group.controller.id}"
|
||||||
|
|
||||||
|
ip_configuration {
|
||||||
|
name = "ip0"
|
||||||
|
subnet_id = "${azurerm_subnet.controller.id}"
|
||||||
|
private_ip_address_allocation = "dynamic"
|
||||||
|
|
||||||
|
# public IPv4
|
||||||
|
public_ip_address_id = "${element(azurerm_public_ip.controllers.*.id, count.index)}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add controller NICs to the controller backend address pool
|
||||||
|
resource "azurerm_network_interface_backend_address_pool_association" "controllers" {
|
||||||
|
network_interface_id = "${azurerm_network_interface.controllers.id}"
|
||||||
|
ip_configuration_name = "ip0"
|
||||||
|
backend_address_pool_id = "${azurerm_lb_backend_address_pool.controller.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller public IPv4 addresses
|
||||||
|
resource "azurerm_public_ip" "controllers" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-controller-${count.index}"
|
||||||
|
location = "${azurerm_resource_group.cluster.location}"
|
||||||
|
sku = "Standard"
|
||||||
|
allocation_method = "Static"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller Ignition configs
|
||||||
|
data "ct_config" "controller-ignitions" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
content = "${element(data.template_file.controller-configs.*.rendered, count.index)}"
|
||||||
|
pretty_print = false
|
||||||
|
snippets = ["${var.controller_clc_snippets}"]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Controller Container Linux configs
|
||||||
|
data "template_file" "controller-configs" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
|
||||||
|
template = "${file("${path.module}/cl/controller.yaml.tmpl")}"
|
||||||
|
|
||||||
|
vars = {
|
||||||
|
# Cannot use cyclic dependencies on controllers or their DNS records
|
||||||
|
etcd_name = "etcd${count.index}"
|
||||||
|
etcd_domain = "${var.cluster_name}-etcd${count.index}.${var.dns_zone}"
|
||||||
|
|
||||||
|
# etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,...
|
||||||
|
etcd_initial_cluster = "${join(",", data.template_file.etcds.*.rendered)}"
|
||||||
|
|
||||||
|
kubeconfig = "${indent(10, module.bootkube.kubeconfig-kubelet)}"
|
||||||
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
|
cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
|
||||||
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
data "template_file" "etcds" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
template = "etcd$${index}=https://$${cluster_name}-etcd$${index}.$${dns_zone}:2380"
|
||||||
|
|
||||||
|
vars {
|
||||||
|
index = "${count.index}"
|
||||||
|
cluster_name = "${var.cluster_name}"
|
||||||
|
dns_zone = "${var.dns_zone}"
|
||||||
|
}
|
||||||
|
}
|
azure/container-linux/kubernetes/lb.tf (new file, 144 lines)
@@ -0,0 +1,144 @@
# DNS record for the apiserver load balancer
|
||||||
|
resource "azurerm_dns_a_record" "apiserver" {
|
||||||
|
resource_group_name = "${var.dns_zone_group}"
|
||||||
|
|
||||||
|
# DNS Zone name where record should be created
|
||||||
|
zone_name = "${var.dns_zone}"
|
||||||
|
|
||||||
|
# DNS record
|
||||||
|
name = "${var.cluster_name}"
|
||||||
|
ttl = 300
|
||||||
|
|
||||||
|
# IPv4 address of apiserver load balancer
|
||||||
|
records = ["${azurerm_public_ip.apiserver-ipv4.ip_address}"]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Static IPv4 address for the apiserver frontend
|
||||||
|
resource "azurerm_public_ip" "apiserver-ipv4" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-apiserver-ipv4"
|
||||||
|
location = "${var.region}"
|
||||||
|
sku = "Standard"
|
||||||
|
allocation_method = "Static"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Static IPv4 address for the ingress frontend
|
||||||
|
resource "azurerm_public_ip" "ingress-ipv4" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-ingress-ipv4"
|
||||||
|
location = "${var.region}"
|
||||||
|
sku = "Standard"
|
||||||
|
allocation_method = "Static"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Network Load Balancer for apiservers and ingress
|
||||||
|
resource "azurerm_lb" "cluster" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}"
|
||||||
|
location = "${var.region}"
|
||||||
|
sku = "Standard"
|
||||||
|
|
||||||
|
frontend_ip_configuration {
|
||||||
|
name = "apiserver"
|
||||||
|
public_ip_address_id = "${azurerm_public_ip.apiserver-ipv4.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
frontend_ip_configuration {
|
||||||
|
name = "ingress"
|
||||||
|
public_ip_address_id = "${azurerm_public_ip.ingress-ipv4.id}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_lb_rule" "apiserver" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "apiserver"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
frontend_ip_configuration_name = "apiserver"
|
||||||
|
|
||||||
|
protocol = "Tcp"
|
||||||
|
frontend_port = 6443
|
||||||
|
backend_port = 6443
|
||||||
|
backend_address_pool_id = "${azurerm_lb_backend_address_pool.controller.id}"
|
||||||
|
probe_id = "${azurerm_lb_probe.apiserver.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_lb_rule" "ingress-http" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "ingress-http"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
frontend_ip_configuration_name = "ingress"
|
||||||
|
|
||||||
|
protocol = "Tcp"
|
||||||
|
frontend_port = 80
|
||||||
|
backend_port = 80
|
||||||
|
backend_address_pool_id = "${azurerm_lb_backend_address_pool.worker.id}"
|
||||||
|
probe_id = "${azurerm_lb_probe.ingress.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_lb_rule" "ingress-https" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "ingress-https"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
frontend_ip_configuration_name = "ingress"
|
||||||
|
|
||||||
|
protocol = "Tcp"
|
||||||
|
frontend_port = 443
|
||||||
|
backend_port = 443
|
||||||
|
backend_address_pool_id = "${azurerm_lb_backend_address_pool.worker.id}"
|
||||||
|
probe_id = "${azurerm_lb_probe.ingress.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Address pool of controllers
|
||||||
|
resource "azurerm_lb_backend_address_pool" "controller" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "controller"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Address pool of workers
|
||||||
|
resource "azurerm_lb_backend_address_pool" "worker" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "worker"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Health checks / probes
|
||||||
|
|
||||||
|
# TCP health check for apiserver
|
||||||
|
resource "azurerm_lb_probe" "apiserver" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "apiserver"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
protocol = "Tcp"
|
||||||
|
port = 6443
|
||||||
|
|
||||||
|
# unhealthy threshold
|
||||||
|
number_of_probes = 3
|
||||||
|
|
||||||
|
interval_in_seconds = 5
|
||||||
|
}
|
||||||
|
|
||||||
|
# HTTP health check for ingress
|
||||||
|
resource "azurerm_lb_probe" "ingress" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "ingress"
|
||||||
|
loadbalancer_id = "${azurerm_lb.cluster.id}"
|
||||||
|
protocol = "Http"
|
||||||
|
port = 10254
|
||||||
|
request_path = "/healthz"
|
||||||
|
|
||||||
|
# unhealthy threshold
|
||||||
|
number_of_probes = 3
|
||||||
|
|
||||||
|
interval_in_seconds = 5
|
||||||
|
}
|
azure/container-linux/kubernetes/network.tf (new file, 33 lines)
@@ -0,0 +1,33 @@
# Organize cluster into a resource group
resource "azurerm_resource_group" "cluster" {
  name     = "${var.cluster_name}"
  location = "${var.region}"
}

resource "azurerm_virtual_network" "network" {
  resource_group_name = "${azurerm_resource_group.cluster.name}"

  name          = "${var.cluster_name}"
  location      = "${azurerm_resource_group.cluster.location}"
  address_space = ["${var.host_cidr}"]
}

# Subnets - separate subnets for controller and workers because Azure
# network security groups are based on IPv4 CIDR rather than instance
# tags like GCP or security group membership like AWS

resource "azurerm_subnet" "controller" {
  resource_group_name = "${azurerm_resource_group.cluster.name}"

  name                 = "controller"
  virtual_network_name = "${azurerm_virtual_network.network.name}"
  address_prefix       = "${cidrsubnet(var.host_cidr, 1, 0)}"
}

resource "azurerm_subnet" "worker" {
  resource_group_name = "${azurerm_resource_group.cluster.name}"

  name                 = "worker"
  virtual_network_name = "${azurerm_virtual_network.network.name}"
  address_prefix       = "${cidrsubnet(var.host_cidr, 1, 1)}"
}
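With the default host_cidr of "10.0.0.0/16" (see variables.tf below), the one extra prefix bit splits the VNet evenly: index 0 becomes the controller subnet and index 1 the worker subnet. Checking the expressions in terraform console:

> cidrsubnet("10.0.0.0/16", 1, 0)
10.0.0.0/17
> cidrsubnet("10.0.0.0/16", 1, 1)
10.0.128.0/17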
azure/container-linux/kubernetes/outputs.tf (new file, 36 lines)
@@ -0,0 +1,36 @@
output "kubeconfig-admin" {
|
||||||
|
value = "${module.bootkube.kubeconfig-admin}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Outputs for Kubernetes Ingress
|
||||||
|
|
||||||
|
output "ingress_static_ipv4" {
|
||||||
|
value = "${azurerm_public_ip.ingress-ipv4.ip_address}"
|
||||||
|
description = "IPv4 address of the load balancer for distributing traffic to Ingress controllers"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Outputs for worker pools
|
||||||
|
|
||||||
|
output "region" {
|
||||||
|
value = "${azurerm_resource_group.cluster.location}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "resource_group_name" {
|
||||||
|
value = "${azurerm_resource_group.cluster.name}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "subnet_id" {
|
||||||
|
value = "${azurerm_subnet.worker.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "security_group_id" {
|
||||||
|
value = "${azurerm_network_security_group.worker.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "backend_address_pool_id" {
|
||||||
|
value = "${azurerm_lb_backend_address_pool.worker.id}"
|
||||||
|
}
|
||||||
|
|
||||||
|
output "kubeconfig" {
|
||||||
|
value = "${module.bootkube.kubeconfig-kubelet}"
|
||||||
|
}
|
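A small sketch of consuming the admin output: the local provider pinned in require.tf can persist the generated kubeconfig for kubectl (the module name and filesystem path are placeholders):

resource "local_file" "kubeconfig-ramius" {
  content  = "${module.azure-ramius.kubeconfig-admin}"
  filename = "/home/user/.secrets/clusters/ramius/auth/kubeconfig"
}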
azure/container-linux/kubernetes/require.tf (new file, 25 lines)
@@ -0,0 +1,25 @@
# Terraform version and plugin versions

terraform {
  required_version = ">= 0.11.0"
}

provider "azurerm" {
  version = "~> 1.21"
}

provider "local" {
  version = "~> 1.0"
}

provider "null" {
  version = "~> 1.0"
}

provider "template" {
  version = "~> 1.0"
}

provider "tls" {
  version = "~> 1.0"
}
azure/container-linux/kubernetes/security.tf (new file, 287 lines)
@@ -0,0 +1,287 @@
# Controller security group
|
||||||
|
|
||||||
|
resource "azurerm_network_security_group" "controller" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-controller"
|
||||||
|
location = "${azurerm_resource_group.cluster.location}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-ssh" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-ssh"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2000"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "22"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-etcd" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-etcd"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2005"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "2379-2380"
|
||||||
|
source_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape etcd metrics
|
||||||
|
resource "azurerm_network_security_rule" "controller-etcd-metrics" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-etcd-metrics"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2010"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "2381"
|
||||||
|
source_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-apiserver" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-apiserver"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2015"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "6443"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-flannel" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-flannel"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2020"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Udp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "8472"
|
||||||
|
source_address_prefixes = ["${azurerm_subnet.controller.address_prefix}", "${azurerm_subnet.worker.address_prefix}"]
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
|
resource "azurerm_network_security_rule" "controller-node-exporter" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-node-exporter"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2025"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "9100"
|
||||||
|
source_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelet's for exec, log, port-forward
|
||||||
|
resource "azurerm_network_security_rule" "controller-kubelet" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-kubelet"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "2030"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "10250"
|
||||||
|
|
||||||
|
# allow Prometheus to scrape kubelet metrics too
|
||||||
|
source_address_prefixes = ["${azurerm_subnet.controller.address_prefix}", "${azurerm_subnet.worker.address_prefix}"]
|
||||||
|
destination_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Override Azure AllowVNetInBound and AllowAzureLoadBalancerInBound
|
||||||
|
# https://docs.microsoft.com/en-us/azure/virtual-network/security-overview#default-security-rules
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-allow-loadblancer" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-loadbalancer"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "3000"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "*"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "*"
|
||||||
|
source_address_prefix = "AzureLoadBalancer"
|
||||||
|
destination_address_prefix = "*"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "controller-deny-all" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "deny-all"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.controller.name}"
|
||||||
|
priority = "3005"
|
||||||
|
access = "Deny"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "*"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "*"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "*"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Worker security group
|
||||||
|
|
||||||
|
resource "azurerm_network_security_group" "worker" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "${var.cluster_name}-worker"
|
||||||
|
location = "${azurerm_resource_group.cluster.location}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-ssh" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-ssh"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2000"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "22"
|
||||||
|
source_address_prefix = "${azurerm_subnet.controller.address_prefix}"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-http" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-http"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2005"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "80"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-https" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-https"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2010"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "443"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-flannel" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-flannel"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2015"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Udp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "8472"
|
||||||
|
source_address_prefixes = ["${azurerm_subnet.controller.address_prefix}", "${azurerm_subnet.worker.address_prefix}"]
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Allow Prometheus to scrape node-exporter daemonset
|
||||||
|
resource "azurerm_network_security_rule" "worker-node-exporter" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-node-exporter"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2020"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "9100"
|
||||||
|
source_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Allow apiserver to access kubelet's for exec, log, port-forward
|
||||||
|
resource "azurerm_network_security_rule" "worker-kubelet" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-kubelet"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "2025"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "Tcp"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "10250"
|
||||||
|
|
||||||
|
# allow Prometheus to scrape kubelet metrics too
|
||||||
|
source_address_prefixes = ["${azurerm_subnet.controller.address_prefix}", "${azurerm_subnet.worker.address_prefix}"]
|
||||||
|
destination_address_prefix = "${azurerm_subnet.worker.address_prefix}"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Override Azure AllowVNetInBound and AllowAzureLoadBalancerInBound
|
||||||
|
# https://docs.microsoft.com/en-us/azure/virtual-network/security-overview#default-security-rules
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-allow-loadblancer" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "allow-loadbalancer"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "3000"
|
||||||
|
access = "Allow"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "*"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "*"
|
||||||
|
source_address_prefix = "AzureLoadBalancer"
|
||||||
|
destination_address_prefix = "*"
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "azurerm_network_security_rule" "worker-deny-all" {
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
|
||||||
|
name = "deny-all"
|
||||||
|
network_security_group_name = "${azurerm_network_security_group.worker.name}"
|
||||||
|
priority = "3005"
|
||||||
|
access = "Deny"
|
||||||
|
direction = "Inbound"
|
||||||
|
protocol = "*"
|
||||||
|
source_port_range = "*"
|
||||||
|
destination_port_range = "*"
|
||||||
|
source_address_prefix = "*"
|
||||||
|
destination_address_prefix = "*"
|
||||||
|
}
|
azure/container-linux/kubernetes/ssh.tf (new file, 95 lines)
@@ -0,0 +1,95 @@
# Secure copy etcd TLS assets to controllers.
|
||||||
|
resource "null_resource" "copy-controller-secrets" {
|
||||||
|
count = "${var.controller_count}"
|
||||||
|
|
||||||
|
depends_on = [
|
||||||
|
"azurerm_virtual_machine.controllers",
|
||||||
|
]
|
||||||
|
|
||||||
|
connection {
|
||||||
|
type = "ssh"
|
||||||
|
host = "${element(azurerm_public_ip.controllers.*.ip_address, count.index)}"
|
||||||
|
user = "core"
|
||||||
|
timeout = "15m"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_ca_cert}"
|
||||||
|
destination = "$HOME/etcd-client-ca.crt"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_client_cert}"
|
||||||
|
destination = "$HOME/etcd-client.crt"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_client_key}"
|
||||||
|
destination = "$HOME/etcd-client.key"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_server_cert}"
|
||||||
|
destination = "$HOME/etcd-server.crt"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_server_key}"
|
||||||
|
destination = "$HOME/etcd-server.key"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_peer_cert}"
|
||||||
|
destination = "$HOME/etcd-peer.crt"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
content = "${module.bootkube.etcd_peer_key}"
|
||||||
|
destination = "$HOME/etcd-peer.key"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "remote-exec" {
|
||||||
|
inline = [
|
||||||
|
"sudo mkdir -p /etc/ssl/etcd/etcd",
|
||||||
|
"sudo mv etcd-client* /etc/ssl/etcd/",
|
||||||
|
"sudo cp /etc/ssl/etcd/etcd-client-ca.crt /etc/ssl/etcd/etcd/server-ca.crt",
|
||||||
|
"sudo mv etcd-server.crt /etc/ssl/etcd/etcd/server.crt",
|
||||||
|
"sudo mv etcd-server.key /etc/ssl/etcd/etcd/server.key",
|
||||||
|
"sudo cp /etc/ssl/etcd/etcd-client-ca.crt /etc/ssl/etcd/etcd/peer-ca.crt",
|
||||||
|
"sudo mv etcd-peer.crt /etc/ssl/etcd/etcd/peer.crt",
|
||||||
|
"sudo mv etcd-peer.key /etc/ssl/etcd/etcd/peer.key",
|
||||||
|
"sudo chown -R etcd:etcd /etc/ssl/etcd",
|
||||||
|
"sudo chmod -R 500 /etc/ssl/etcd",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Secure copy bootkube assets to ONE controller and start bootkube to perform
|
||||||
|
# one-time self-hosted cluster bootstrapping.
|
||||||
|
resource "null_resource" "bootkube-start" {
|
||||||
|
depends_on = [
|
||||||
|
"module.bootkube",
|
||||||
|
"module.workers",
|
||||||
|
"azurerm_dns_a_record.apiserver",
|
||||||
|
"null_resource.copy-controller-secrets",
|
||||||
|
]
|
||||||
|
|
||||||
|
connection {
|
||||||
|
type = "ssh"
|
||||||
|
host = "${element(azurerm_public_ip.controllers.*.ip_address, 0)}"
|
||||||
|
user = "core"
|
||||||
|
timeout = "15m"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "file" {
|
||||||
|
source = "${var.asset_dir}"
|
||||||
|
destination = "$HOME/assets"
|
||||||
|
}
|
||||||
|
|
||||||
|
provisioner "remote-exec" {
|
||||||
|
inline = [
|
||||||
|
"sudo mv $HOME/assets /opt/bootkube",
|
||||||
|
"sudo systemctl start bootkube",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
azure/container-linux/kubernetes/variables.tf (new file, 123 lines)
@@ -0,0 +1,123 @@
variable "cluster_name" {
|
||||||
|
type = "string"
|
||||||
|
description = "Unique cluster name (prepended to dns_zone)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Azure
|
||||||
|
|
||||||
|
variable "region" {
|
||||||
|
type = "string"
|
||||||
|
description = "Azure Region (e.g. centralus , see `az account list-locations --output table`)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "dns_zone" {
|
||||||
|
type = "string"
|
||||||
|
description = "Azure DNS Zone (e.g. azure.example.com)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "dns_zone_group" {
|
||||||
|
type = "string"
|
||||||
|
description = "Resource group where the Azure DNS Zone resides (e.g. global)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# instances
|
||||||
|
|
||||||
|
variable "controller_count" {
|
||||||
|
type = "string"
|
||||||
|
default = "1"
|
||||||
|
description = "Number of controllers (i.e. masters)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_count" {
|
||||||
|
type = "string"
|
||||||
|
default = "1"
|
||||||
|
description = "Number of workers"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "controller_type" {
|
||||||
|
type = "string"
|
||||||
|
default = "Standard_DS1_v2"
|
||||||
|
description = "Machine type for controllers (see `az vm list-skus --location centralus`)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_type" {
|
||||||
|
type = "string"
|
||||||
|
default = "Standard_F1"
|
||||||
|
description = "Machine type for workers (see `az vm list-skus --location centralus`)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "os_image" {
|
||||||
|
type = "string"
|
||||||
|
default = "coreos-stable"
|
||||||
|
description = "Channel for a Container Linux derivative (coreos-stable, coreos-beta, coreos-alpha)"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "disk_size" {
|
||||||
|
type = "string"
|
||||||
|
default = "40"
|
||||||
|
description = "Size of the disk in GB"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_priority" {
|
||||||
|
type = "string"
|
||||||
|
default = "Regular"
|
||||||
|
description = "Set worker priority to Low to use reduced cost surplus capacity, with the tradeoff that instances can be deallocated at any time."
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "controller_clc_snippets" {
|
||||||
|
type = "list"
|
||||||
|
description = "Controller Container Linux Config snippets"
|
||||||
|
default = []
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_clc_snippets" {
|
||||||
|
type = "list"
|
||||||
|
description = "Worker Container Linux Config snippets"
|
||||||
|
default = []
|
||||||
|
}
|
||||||
|
|
||||||
|
# configuration
|
||||||
|
|
||||||
|
variable "ssh_authorized_key" {
|
||||||
|
type = "string"
|
||||||
|
description = "SSH public key for user 'core'"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "asset_dir" {
|
||||||
|
description = "Path to a directory where generated assets should be placed (contains secrets)"
|
||||||
|
type = "string"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "host_cidr" {
|
||||||
|
description = "CIDR IPv4 range to assign to instances"
|
||||||
|
type = "string"
|
||||||
|
default = "10.0.0.0/16"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "pod_cidr" {
|
||||||
|
description = "CIDR IPv4 range to assign Kubernetes pods"
|
||||||
|
type = "string"
|
||||||
|
default = "10.2.0.0/16"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "service_cidr" {
|
||||||
|
description = <<EOD
|
||||||
|
CIDR IPv4 range to assign Kubernetes services.
|
||||||
|
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
|
||||||
|
EOD
|
||||||
|
|
||||||
|
type = "string"
|
||||||
|
default = "10.3.0.0/16"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "cluster_domain_suffix" {
|
||||||
|
description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
|
||||||
|
type = "string"
|
||||||
|
default = "cluster.local"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "enable_reporting" {
|
||||||
|
type = "string"
|
||||||
|
description = "Enable usage or analytics reporting to upstreams (Calico)"
|
||||||
|
default = "false"
|
||||||
|
}
|
azure/container-linux/kubernetes/workers.tf (new file, 23 lines)
@@ -0,0 +1,23 @@
module "workers" {
|
||||||
|
source = "workers"
|
||||||
|
name = "${var.cluster_name}"
|
||||||
|
|
||||||
|
# Azure
|
||||||
|
resource_group_name = "${azurerm_resource_group.cluster.name}"
|
||||||
|
region = "${azurerm_resource_group.cluster.location}"
|
||||||
|
subnet_id = "${azurerm_subnet.worker.id}"
|
||||||
|
security_group_id = "${azurerm_network_security_group.worker.id}"
|
||||||
|
backend_address_pool_id = "${azurerm_lb_backend_address_pool.worker.id}"
|
||||||
|
|
||||||
|
count = "${var.worker_count}"
|
||||||
|
vm_type = "${var.worker_type}"
|
||||||
|
os_image = "${var.os_image}"
|
||||||
|
priority = "${var.worker_priority}"
|
||||||
|
|
||||||
|
# configuration
|
||||||
|
kubeconfig = "${module.bootkube.kubeconfig-kubelet}"
|
||||||
|
ssh_authorized_key = "${var.ssh_authorized_key}"
|
||||||
|
service_cidr = "${var.service_cidr}"
|
||||||
|
cluster_domain_suffix = "${var.cluster_domain_suffix}"
|
||||||
|
clc_snippets = "${var.worker_clc_snippets}"
|
||||||
|
}
|
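The same internal workers module underpins the worker-pools feature referenced in the README. A hedged sketch of an extra low-priority pool wired to the cluster's worker-pool outputs (pool name, sizes, and the source ref are assumptions; service_cidr and other settings would mirror the cluster):

module "ramius-low-priority-pool" {
  source = "git::https://github.com/poseidon/typhoon//azure/container-linux/kubernetes/workers?ref=<release-tag>"

  # Azure, wired from the cluster module's outputs
  region                  = "${module.azure-ramius.region}"
  resource_group_name     = "${module.azure-ramius.resource_group_name}"
  subnet_id               = "${module.azure-ramius.subnet_id}"
  security_group_id       = "${module.azure-ramius.security_group_id}"
  backend_address_pool_id = "${module.azure-ramius.backend_address_pool_id}"

  # configuration
  name               = "ramius-low"
  count              = 2
  vm_type            = "Standard_F4"
  priority           = "Low"
  kubeconfig         = "${module.azure-ramius.kubeconfig}"
  ssh_authorized_key = "ssh-rsa AAAAB3Nz..."
}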
azure/container-linux/kubernetes/workers/cl/worker.yaml.tmpl (new file, 122 lines)
@@ -0,0 +1,122 @@
---
|
||||||
|
systemd:
|
||||||
|
units:
|
||||||
|
- name: docker.service
|
||||||
|
enable: true
|
||||||
|
- name: locksmithd.service
|
||||||
|
mask: true
|
||||||
|
- name: wait-for-dns.service
|
||||||
|
enable: true
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Wait for DNS entries
|
||||||
|
Wants=systemd-resolved.service
|
||||||
|
Before=kubelet.service
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=true
|
||||||
|
ExecStart=/bin/sh -c 'while ! /usr/bin/grep '^[^#[:space:]]' /etc/resolv.conf > /dev/null; do sleep 1; done'
|
||||||
|
[Install]
|
||||||
|
RequiredBy=kubelet.service
|
||||||
|
- name: kubelet.service
|
||||||
|
enable: true
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Kubelet via Hyperkube
|
||||||
|
Wants=rpc-statd.service
|
||||||
|
[Service]
|
||||||
|
EnvironmentFile=/etc/kubernetes/kubelet.env
|
||||||
|
Environment="RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \
|
||||||
|
--volume=resolv,kind=host,source=/etc/resolv.conf \
|
||||||
|
--mount volume=resolv,target=/etc/resolv.conf \
|
||||||
|
--volume var-lib-cni,kind=host,source=/var/lib/cni \
|
||||||
|
--mount volume=var-lib-cni,target=/var/lib/cni \
|
||||||
|
--volume var-lib-calico,kind=host,source=/var/lib/calico \
|
||||||
|
--mount volume=var-lib-calico,target=/var/lib/calico \
|
||||||
|
--volume opt-cni-bin,kind=host,source=/opt/cni/bin \
|
||||||
|
--mount volume=opt-cni-bin,target=/opt/cni/bin \
|
||||||
|
--volume var-log,kind=host,source=/var/log \
|
||||||
|
--mount volume=var-log,target=/var/log \
|
||||||
|
--insecure-options=image"
|
||||||
|
ExecStartPre=/bin/mkdir -p /opt/cni/bin
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
|
||||||
|
ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/cni
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/calico
|
||||||
|
ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins
|
||||||
|
ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
|
||||||
|
ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
|
||||||
|
ExecStart=/usr/lib/coreos/kubelet-wrapper \
|
||||||
|
--anonymous-auth=false \
|
||||||
|
--authentication-token-webhook \
|
||||||
|
--authorization-mode=Webhook \
|
||||||
|
--client-ca-file=/etc/kubernetes/ca.crt \
|
||||||
|
--cluster_dns=${cluster_dns_service_ip} \
|
||||||
|
--cluster_domain=${cluster_domain_suffix} \
|
||||||
|
--cni-conf-dir=/etc/kubernetes/cni/net.d \
|
||||||
|
--exit-on-lock-contention \
|
||||||
|
--kubeconfig=/etc/kubernetes/kubeconfig \
|
||||||
|
--lock-file=/var/run/lock/kubelet.lock \
|
||||||
|
--network-plugin=cni \
|
||||||
|
--node-labels=node-role.kubernetes.io/node \
|
||||||
|
--pod-manifest-path=/etc/kubernetes/manifests \
|
||||||
|
--read-only-port=0 \
|
||||||
|
--volume-plugin-dir=/var/lib/kubelet/volumeplugins
|
||||||
|
ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
|
||||||
|
Restart=always
|
||||||
|
RestartSec=5
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
- name: delete-node.service
|
||||||
|
enable: true
|
||||||
|
contents: |
|
||||||
|
[Unit]
|
||||||
|
Description=Waiting to delete Kubernetes node on shutdown
|
||||||
|
[Service]
|
||||||
|
Type=oneshot
|
||||||
|
RemainAfterExit=true
|
||||||
|
ExecStart=/bin/true
|
||||||
|
ExecStop=/etc/kubernetes/delete-node
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
storage:
|
||||||
|
files:
|
||||||
|
- path: /etc/kubernetes/kubeconfig
|
||||||
|
filesystem: root
|
||||||
|
mode: 0644
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
${kubeconfig}
|
||||||
|
- path: /etc/kubernetes/kubelet.env
|
||||||
|
filesystem: root
|
||||||
|
mode: 0644
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
|
||||||
|
KUBELET_IMAGE_TAG=v1.13.4
|
||||||
|
- path: /etc/sysctl.d/max-user-watches.conf
|
||||||
|
filesystem: root
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
fs.inotify.max_user_watches=16184
|
||||||
|
- path: /etc/kubernetes/delete-node
|
||||||
|
filesystem: root
|
||||||
|
mode: 0744
|
||||||
|
contents:
|
||||||
|
inline: |
|
||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
exec /usr/bin/rkt run \
|
||||||
|
--trust-keys-from-https \
|
||||||
|
--volume config,kind=host,source=/etc/kubernetes \
|
||||||
|
--mount volume=config,target=/etc/kubernetes \
|
||||||
|
--insecure-options=image \
|
||||||
|
docker://k8s.gcr.io/hyperkube:v1.13.4 \
|
||||||
|
--net=host \
|
||||||
|
--dns=host \
|
||||||
|
--exec=/kubectl -- --kubeconfig=/etc/kubernetes/kubeconfig delete node $(hostname | tr '[:upper:]' '[:lower:]')
|
||||||
|
passwd:
|
||||||
|
users:
|
||||||
|
- name: core
|
||||||
|
ssh_authorized_keys:
|
||||||
|
- "${ssh_authorized_key}"
|
azure/container-linux/kubernetes/workers/variables.tf (new file, 91 lines)
@@ -0,0 +1,91 @@
variable "name" {
  type        = "string"
  description = "Unique name for the worker pool"
}

# Azure

variable "region" {
  type        = "string"
  description = "Must be set to the Azure Region of cluster"
}

variable "resource_group_name" {
  type        = "string"
  description = "Must be set to the resource group name of cluster"
}

variable "subnet_id" {
  type        = "string"
  description = "Must be set to the `worker_subnet_id` output by cluster"
}

variable "security_group_id" {
  type        = "string"
  description = "Must be set to the `worker_security_group_id` output by cluster"
}

variable "backend_address_pool_id" {
  type        = "string"
  description = "Must be set to the `worker_backend_address_pool_id` output by cluster"
}

# instances

variable "count" {
  type        = "string"
  default     = "1"
  description = "Number of instances"
}

variable "vm_type" {
  type        = "string"
  default     = "Standard_F1"
  description = "Machine type for instances (see `az vm list-skus --location centralus`)"
}

variable "os_image" {
  type        = "string"
  default     = "coreos-stable"
  description = "Channel for a Container Linux derivative (coreos-stable, coreos-beta, coreos-alpha)"
}

variable "priority" {
  type        = "string"
  default     = "Regular"
  description = "Set priority to Low to use reduced cost surplus capacity, with the tradeoff that instances can be evicted at any time."
}

variable "clc_snippets" {
  type        = "list"
  description = "Container Linux Config snippets"
  default     = []
}

# configuration

variable "kubeconfig" {
  type        = "string"
  description = "Must be set to `kubeconfig` output by cluster"
}

variable "ssh_authorized_key" {
  type        = "string"
  description = "SSH public key for user 'core'"
}

variable "service_cidr" {
  description = <<EOD
CIDR IPv4 range to assign Kubernetes services.
The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
EOD

  type    = "string"
  default = "10.3.0.0/16"
}

variable "cluster_domain_suffix" {
  description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local)"
  type        = "string"
  default     = "cluster.local"
}
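Note (not part of the diff): the variables above are the input surface of the new Azure worker pool module. A minimal sketch of attaching an extra pool to an existing cluster, assuming a cluster module named `azure-ramius` that exposes the outputs the descriptions reference (`worker_subnet_id`, `worker_security_group_id`, `worker_backend_address_pool_id`, `kubeconfig`); the pool name, resource group, source path, and counts are illustrative only:

module "ramius-worker-pool" {
  # Path of the module directory added in this diff (illustrative; a real config
  # would typically reference the module by its repository subpath and a release ref).
  source = "./azure/container-linux/kubernetes/workers"

  # Azure (illustrative values)
  region                  = "centralus"
  resource_group_name     = "ramius"
  subnet_id               = "${module.azure-ramius.worker_subnet_id}"
  security_group_id       = "${module.azure-ramius.worker_security_group_id}"
  backend_address_pool_id = "${module.azure-ramius.worker_backend_address_pool_id}"

  # configuration (illustrative values)
  name               = "ramius-pool"
  kubeconfig         = "${module.azure-ramius.kubeconfig}"
  ssh_authorized_key = "${var.ssh_authorized_key}"
  count              = "2"
  priority           = "Low"   # surplus capacity; instances can be evicted (see variable description)
}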
azure/container-linux/kubernetes/workers/workers.tf (new file, 114 lines)
@@ -0,0 +1,114 @@
locals {
  # Channel for a Container Linux derivative
  # coreos-stable -> Container Linux Stable
  channel = "${element(split("-", var.os_image), 1)}"
}

# Workers scale set
resource "azurerm_virtual_machine_scale_set" "workers" {
  resource_group_name = "${var.resource_group_name}"

  name                   = "${var.name}-workers"
  location               = "${var.region}"
  single_placement_group = false

  sku {
    name     = "${var.vm_type}"
    tier     = "standard"
    capacity = "${var.count}"
  }

  # boot
  storage_profile_image_reference {
    publisher = "CoreOS"
    offer     = "CoreOS"
    sku       = "${local.channel}"
    version   = "latest"
  }

  # storage
  storage_profile_os_disk {
    create_option     = "FromImage"
    caching           = "ReadWrite"
    os_type           = "linux"
    managed_disk_type = "Standard_LRS"
  }

  os_profile {
    computer_name_prefix = "${var.name}-worker-"
    admin_username       = "core"
    custom_data          = "${data.ct_config.worker-ignition.rendered}"
  }

  # Azure mandates setting an ssh_key, even though Ignition custom_data handles it too
  os_profile_linux_config {
    disable_password_authentication = true

    ssh_keys {
      path     = "/home/core/.ssh/authorized_keys"
      key_data = "${var.ssh_authorized_key}"
    }
  }

  # network
  network_profile {
    name                      = "nic0"
    primary                   = true
    network_security_group_id = "${var.security_group_id}"

    ip_configuration {
      name      = "ip0"
      primary   = true
      subnet_id = "${var.subnet_id}"

      # backend address pool to which the NIC should be added
      load_balancer_backend_address_pool_ids = ["${var.backend_address_pool_id}"]
    }
  }

  # lifecycle
  upgrade_policy_mode = "Manual"
  priority            = "${var.priority}"
  eviction_policy     = "Delete"
}

# Scale up or down to maintain desired number, tolerating deallocations.
resource "azurerm_autoscale_setting" "workers" {
  resource_group_name = "${var.resource_group_name}"

  name     = "${var.name}-maintain-desired"
  location = "${var.region}"

  # autoscale
  enabled            = true
  target_resource_id = "${azurerm_virtual_machine_scale_set.workers.id}"

  profile {
    name = "default"

    capacity {
      minimum = "${var.count}"
      default = "${var.count}"
      maximum = "${var.count}"
    }
  }
}

# Worker Ignition configs
data "ct_config" "worker-ignition" {
  content      = "${data.template_file.worker-config.rendered}"
  pretty_print = false
  snippets     = ["${var.clc_snippets}"]
}

# Worker Container Linux configs
data "template_file" "worker-config" {
  template = "${file("${path.module}/cl/worker.yaml.tmpl")}"

  vars = {
    kubeconfig             = "${indent(10, var.kubeconfig)}"
    ssh_authorized_key     = "${var.ssh_authorized_key}"
    cluster_dns_service_ip = "${cidrhost(var.service_cidr, 10)}"
    cluster_domain_suffix  = "${var.cluster_domain_suffix}"
  }
}
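Note (not part of the diff): in the worker-config vars above, cidrhost(var.service_cidr, 10) resolves to the 10th host address of the service range, the address the service_cidr description reserves for coredns (the 1st is reserved for kube-apiserver), and it is what the template passes to the kubelet's --cluster_dns flag. A small sketch with the default range; the output block is illustrative only and not part of the module:

# With the default service_cidr = "10.3.0.0/16":
#   cidrhost("10.3.0.0/16", 1)  -> 10.3.0.1   (kube-apiserver service IP)
#   cidrhost("10.3.0.0/16", 10) -> 10.3.0.10  (cluster DNS service IP)
output "example_cluster_dns_service_ip" {
  value = "${cidrhost("10.3.0.0/16", 10)}"
}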
azure/ignore/.gitkeep (new, empty file)

@@ -11,9 +11,10 @@ Typhoon distributes upstream Kubernetes, architectural conventions, and cluster
 ## Features <a href="https://www.cncf.io/certification/software-conformance/"><img align="right" src="https://storage.googleapis.com/poseidon/certified-kubernetes.png"></a>

-* Kubernetes v1.10.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
+* Kubernetes v1.13.4 (upstream, via [kubernetes-incubator/bootkube](https://github.com/kubernetes-incubator/bootkube))
-* Single or multi-master, workloads isolated on workers, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
+* Single or multi-master, [Calico](https://www.projectcalico.org/) or [flannel](https://github.com/coreos/flannel) networking
 * On-cluster etcd with TLS, [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/)-enabled, [network policy](https://kubernetes.io/docs/concepts/services-networking/network-policies/)
+* Advanced features like [snippets](https://typhoon.psdn.io/advanced/customization/#container-linux) customization
 * Ready for Ingress, Prometheus, Grafana, and other optional [addons](https://typhoon.psdn.io/addons/overview/)

 ## Docs
@@ -1,6 +1,6 @@
 # Self-hosted Kubernetes assets (kubeconfig, manifests)
 module "bootkube" {
-  source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=0e98e89e14a074768db13c4e050ed0c13319a0c1"
+  source = "git::https://github.com/poseidon/terraform-render-bootkube.git?ref=953521dbba49eb6a39204f30a3978730eac01e11"

   cluster_name = "${var.cluster_name}"
   api_servers  = ["${var.k8s_domain_name}"]
@@ -12,4 +12,5 @@ module "bootkube" {
   pod_cidr              = "${var.pod_cidr}"
   service_cidr          = "${var.service_cidr}"
   cluster_domain_suffix = "${var.cluster_domain_suffix}"
+  enable_reporting      = "${var.enable_reporting}"
 }
@@ -7,7 +7,7 @@ systemd:
 - name: 40-etcd-cluster.conf
 contents: |
 [Service]
-Environment="ETCD_IMAGE_TAG=v3.3.6"
+Environment="ETCD_IMAGE_TAG=v3.3.12"
 Environment="ETCD_NAME=${etcd_name}"
 Environment="ETCD_ADVERTISE_CLIENT_URLS=https://${domain_name}:2379"
 Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380"
@@ -70,6 +70,10 @@ systemd:
 --mount volume=opt-cni-bin,target=/opt/cni/bin \
 --volume var-log,kind=host,source=/var/log \
 --mount volume=var-log,target=/var/log \
+--volume iscsiconf,kind=host,source=/etc/iscsi/ \
+--mount volume=iscsiconf,target=/etc/iscsi/ \
+--volume iscsiadm,kind=host,source=/usr/sbin/iscsiadm \
+--mount volume=iscsiadm,target=/sbin/iscsiadm \
 --insecure-options=image"
 ExecStartPre=/bin/mkdir -p /opt/cni/bin
 ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
@@ -82,12 +86,11 @@ systemd:
 ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
 ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
 ExecStart=/usr/lib/coreos/kubelet-wrapper \
---allow-privileged \
 --anonymous-auth=false \
 --authentication-token-webhook \
 --authorization-mode=Webhook \
 --client-ca-file=/etc/kubernetes/ca.crt \
---cluster_dns=${k8s_dns_service_ip} \
+--cluster_dns=${cluster_dns_service_ip} \
 --cluster_domain=${cluster_domain_suffix} \
 --cni-conf-dir=/etc/kubernetes/cni/net.d \
 --exit-on-lock-contention \
@@ -98,6 +101,7 @@ systemd:
 --node-labels=node-role.kubernetes.io/master \
 --node-labels=node-role.kubernetes.io/controller="true" \
 --pod-manifest-path=/etc/kubernetes/manifests \
+--read-only-port=0 \
 --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
 --volume-plugin-dir=/var/lib/kubelet/volumeplugins
 ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
@@ -124,7 +128,7 @@ storage:
 contents:
 inline: |
 KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
-KUBELET_IMAGE_TAG=v1.10.4
+KUBELET_IMAGE_TAG=v1.13.4
 - path: /etc/hostname
 filesystem: root
 mode: 0644
@@ -150,22 +154,17 @@ storage:
 set -e
 # Move experimental manifests
 [ -n "$(ls /opt/bootkube/assets/manifests-*/* 2>/dev/null)" ] && mv /opt/bootkube/assets/manifests-*/* /opt/bootkube/assets/manifests && rm -rf /opt/bootkube/assets/manifests-*
-BOOTKUBE_ACI="$${BOOTKUBE_ACI:-quay.io/coreos/bootkube}"
-BOOTKUBE_VERSION="$${BOOTKUBE_VERSION:-v0.12.0}"
-BOOTKUBE_ASSETS="$${BOOTKUBE_ASSETS:-/opt/bootkube/assets}"
 exec /usr/bin/rkt run \
 --trust-keys-from-https \
---volume assets,kind=host,source=$BOOTKUBE_ASSETS \
+--volume assets,kind=host,source=/opt/bootkube/assets \
 --mount volume=assets,target=/assets \
 --volume bootstrap,kind=host,source=/etc/kubernetes \
 --mount volume=bootstrap,target=/etc/kubernetes \
 $$RKT_OPTS \
-$${BOOTKUBE_ACI}:$${BOOTKUBE_VERSION} \
+quay.io/coreos/bootkube:v0.14.0 \
 --net=host \
 --dns=host \
 --exec=/bootkube -- start --asset-dir=/assets "$@"
-networkd:
-${networkd_content}
 passwd:
 users:
 - name: core
@@ -45,6 +45,10 @@ systemd:
 --mount volume=opt-cni-bin,target=/opt/cni/bin \
 --volume var-log,kind=host,source=/var/log \
 --mount volume=var-log,target=/var/log \
+--volume iscsiconf,kind=host,source=/etc/iscsi/ \
+--mount volume=iscsiconf,target=/etc/iscsi/ \
+--volume iscsiadm,kind=host,source=/usr/sbin/iscsiadm \
+--mount volume=iscsiadm,target=/sbin/iscsiadm \
 --insecure-options=image"
 ExecStartPre=/bin/mkdir -p /opt/cni/bin
 ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
@@ -55,12 +59,11 @@ systemd:
 ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
 ExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid
 ExecStart=/usr/lib/coreos/kubelet-wrapper \
---allow-privileged \
 --anonymous-auth=false \
 --authentication-token-webhook \
 --authorization-mode=Webhook \
 --client-ca-file=/etc/kubernetes/ca.crt \
---cluster_dns=${k8s_dns_service_ip} \
+--cluster_dns=${cluster_dns_service_ip} \
 --cluster_domain=${cluster_domain_suffix} \
 --cni-conf-dir=/etc/kubernetes/cni/net.d \
 --exit-on-lock-contention \
@@ -70,6 +73,7 @@ systemd:
 --network-plugin=cni \
 --node-labels=node-role.kubernetes.io/node \
 --pod-manifest-path=/etc/kubernetes/manifests \
+--read-only-port=0 \
 --volume-plugin-dir=/var/lib/kubelet/volumeplugins
 ExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid
 Restart=always
@@ -85,7 +89,7 @@ storage:
 contents:
 inline: |
 KUBELET_IMAGE_URL=docker://k8s.gcr.io/hyperkube
-KUBELET_IMAGE_TAG=v1.10.4
+KUBELET_IMAGE_TAG=v1.13.4
 - path: /etc/hostname
 filesystem: root
 mode: 0644
@@ -97,8 +101,6 @@ storage:
 contents:
 inline: |
 fs.inotify.max_user_watches=16184
-networkd:
-${networkd_content}
 passwd:
 users:
 - name: core
@@ -1,9 +1,9 @@
 resource "matchbox_group" "install" {
   count = "${length(var.controller_names) + length(var.worker_names)}"

   name = "${format("install-%s", element(concat(var.controller_names, var.worker_names), count.index))}"

-  profile = "${local.flavor == "flatcar" ? element(matchbox_profile.flatcar-install.*.name, count.index) : var.cached_install == "true" ? element(matchbox_profile.cached-container-linux-install.*.name, count.index) : element(matchbox_profile.container-linux-install.*.name, count.index)}"
+  profile = "${local.flavor == "flatcar" ? var.cached_install == "true" ? element(matchbox_profile.cached-flatcar-linux-install.*.name, count.index) : element(matchbox_profile.flatcar-install.*.name, count.index) : var.cached_install == "true" ? element(matchbox_profile.cached-container-linux-install.*.name, count.index) : element(matchbox_profile.container-linux-install.*.name, count.index)}"

   selector {
     mac = "${element(concat(var.controller_macs, var.worker_macs), count.index)}"
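Note (not part of the diff): the new profile expression nests two conditionals. An illustrative, comment-only decision table of how it resolves; the profile names are the matchbox_profile resources defined in the install profiles further down:

# local.flavor == "flatcar" and var.cached_install == "true"  -> matchbox_profile.cached-flatcar-linux-install
# local.flavor == "flatcar" and var.cached_install != "true"  -> matchbox_profile.flatcar-install
# otherwise ("coreos")      and var.cached_install == "true"  -> matchbox_profile.cached-container-linux-install
# otherwise ("coreos")      and var.cached_install != "true"  -> matchbox_profile.container-linux-install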
@@ -1,3 +1,3 @@
-output "kubeconfig" {
-  value = "${module.bootkube.kubeconfig}"
+output "kubeconfig-admin" {
+  value = "${module.bootkube.kubeconfig-admin}"
 }
@@ -1,8 +1,9 @@
 locals {
   # coreos-stable -> coreos flavor, stable channel
   # flatcar-stable -> flatcar flavor, stable channel
   flavor = "${element(split("-", var.os_channel), 0)}"
-  channel = "${element(split("-", var.os_channel), 1)}"
+
+  channel = "${element(split("-", var.os_channel), 1)}"
 }

 // Container Linux Install profile (from release.core-os.net)
@@ -48,7 +49,7 @@ data "template_file" "container-linux-install-configs" {
 }

 // Container Linux Install profile (from matchbox /assets cache)
-// Note: Admin must have downloaded os_version into matchbox assets.
+// Note: Admin must have downloaded os_version into matchbox assets/coreos.
 resource "matchbox_profile" "cached-container-linux-install" {
   count = "${length(var.controller_names) + length(var.worker_names)}"
   name = "${format("%s-cached-container-linux-install-%s", var.cluster_name, element(concat(var.controller_names, var.worker_names), count.index))}"
@@ -86,7 +87,7 @@ data "template_file" "cached-container-linux-install-configs" {
   ssh_authorized_key = "${var.ssh_authorized_key}"

   # profile uses -b baseurl to install from matchbox cache
-  baseurl_flag = "-b ${var.matchbox_http_endpoint}/assets/coreos"
+  baseurl_flag = "-b ${var.matchbox_http_endpoint}/assets/${local.flavor}"
 }
 }
@@ -113,11 +114,44 @@ resource "matchbox_profile" "flatcar-install" {
   container_linux_config = "${element(data.template_file.container-linux-install-configs.*.rendered, count.index)}"
 }

+// Flatcar Linux Install profile (from matchbox /assets cache)
+// Note: Admin must have downloaded os_version into matchbox assets/flatcar.
+resource "matchbox_profile" "cached-flatcar-linux-install" {
+  count = "${length(var.controller_names) + length(var.worker_names)}"
+  name = "${format("%s-cached-flatcar-linux-install-%s", var.cluster_name, element(concat(var.controller_names, var.worker_names), count.index))}"
+
+  kernel = "/assets/flatcar/${var.os_version}/flatcar_production_pxe.vmlinuz"
+
+  initrd = [
+    "/assets/flatcar/${var.os_version}/flatcar_production_pxe_image.cpio.gz",
+  ]
+
+  args = [
+    "initrd=flatcar_production_pxe_image.cpio.gz",
+    "flatcar.config.url=${var.matchbox_http_endpoint}/ignition?uuid=$${uuid}&mac=$${mac:hexhyp}",
+    "flatcar.first_boot=yes",
+    "console=tty0",
+    "console=ttyS0",
+    "${var.kernel_args}",
+  ]
+
+  container_linux_config = "${element(data.template_file.cached-container-linux-install-configs.*.rendered, count.index)}"
+}
+
 // Kubernetes Controller profiles
 resource "matchbox_profile" "controllers" {
   count = "${length(var.controller_names)}"
   name = "${format("%s-controller-%s", var.cluster_name, element(var.controller_names, count.index))}"
-  container_linux_config = "${element(data.template_file.controller-configs.*.rendered, count.index)}"
+  raw_ignition = "${element(data.ct_config.controller-ignitions.*.rendered, count.index)}"
+}
+
+data "ct_config" "controller-ignitions" {
+  count = "${length(var.controller_names)}"
+  content = "${element(data.template_file.controller-configs.*.rendered, count.index)}"
+  pretty_print = false
+
+  # Must use direct lookup. Cannot use lookup(map, key) since it only works for flat maps
+  snippets = ["${local.clc_map[element(var.controller_names, count.index)]}"]
 }

 data "template_file" "controller-configs" {
@@ -126,23 +160,29 @@ data "template_file" "controller-configs" {
   template = "${file("${path.module}/cl/controller.yaml.tmpl")}"

   vars {
     domain_name = "${element(var.controller_domains, count.index)}"
     etcd_name = "${element(var.controller_names, count.index)}"
     etcd_initial_cluster = "${join(",", formatlist("%s=https://%s:2380", var.controller_names, var.controller_domains))}"
-    k8s_dns_service_ip = "${module.bootkube.kube_dns_service_ip}"
+    cluster_dns_service_ip = "${module.bootkube.cluster_dns_service_ip}"
     cluster_domain_suffix = "${var.cluster_domain_suffix}"
     ssh_authorized_key = "${var.ssh_authorized_key}"
-
-    # Terraform evaluates both sides regardless and element cannot be used on 0 length lists
-    networkd_content = "${length(var.controller_networkds) == 0 ? "" : element(concat(var.controller_networkds, list("")), count.index)}"
   }
 }

 // Kubernetes Worker profiles
 resource "matchbox_profile" "workers" {
   count = "${length(var.worker_names)}"
   name = "${format("%s-worker-%s", var.cluster_name, element(var.worker_names, count.index))}"
-  container_linux_config = "${element(data.template_file.worker-configs.*.rendered, count.index)}"
+  raw_ignition = "${element(data.ct_config.worker-ignitions.*.rendered, count.index)}"
+}
+
+data "ct_config" "worker-ignitions" {
+  count = "${length(var.worker_names)}"
+  content = "${element(data.template_file.worker-configs.*.rendered, count.index)}"
+  pretty_print = false
+
+  # Must use direct lookup. Cannot use lookup(map, key) since it only works for flat maps
+  snippets = ["${local.clc_map[element(var.worker_names, count.index)]}"]
 }

 data "template_file" "worker-configs" {
@@ -151,12 +191,25 @@ data "template_file" "worker-configs" {
   template = "${file("${path.module}/cl/worker.yaml.tmpl")}"

   vars {
     domain_name = "${element(var.worker_domains, count.index)}"
-    k8s_dns_service_ip = "${module.bootkube.kube_dns_service_ip}"
+    cluster_dns_service_ip = "${module.bootkube.cluster_dns_service_ip}"
     cluster_domain_suffix = "${var.cluster_domain_suffix}"
     ssh_authorized_key = "${var.ssh_authorized_key}"
-
-    # Terraform evaluates both sides regardless and element cannot be used on 0 length lists
-    networkd_content = "${length(var.worker_networkds) == 0 ? "" : element(concat(var.worker_networkds, list("")), count.index)}"
   }
 }
+
+locals {
+  # Hack to workaround https://github.com/hashicorp/terraform/issues/17251
+  # Default Container Linux config snippets map every node names to list("\n") so
+  # all lookups succeed
+  clc_defaults = "${zipmap(concat(var.controller_names, var.worker_names), chunklist(data.template_file.clc-default-snippets.*.rendered, 1))}"
+
+  # Union of the default and user specific snippets, later overrides prior.
+  clc_map = "${merge(local.clc_defaults, var.clc_snippets)}"
+}
+
+// Horrible hack to generate a Terraform list of node count length
+data "template_file" "clc-default-snippets" {
+  count = "${length(var.controller_names) + length(var.worker_names)}"
+  template = "\n"
+}
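Note (not part of the diff): local.clc_map above merges the generated per-node defaults with the user-supplied clc_snippets map, keyed by machine name as described in variables.tf. An illustrative value for that variable (the snippet file path is hypothetical; "node2" matches the example worker name used in the variable descriptions):

clc_snippets = {
  "node2" = ["${file("./snippets/worker-raid.yaml")}"]
}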
@@ -2,6 +2,14 @@
 resource "null_resource" "copy-controller-secrets" {
   count = "${length(var.controller_names)}"

+  # Without depends_on, remote-exec could start and wait for machines before
+  # matchbox groups are written, causing a deadlock.
+  depends_on = [
+    "matchbox_group.install",
+    "matchbox_group.controller",
+    "matchbox_group.worker",
+  ]
+
   connection {
     type = "ssh"
     host = "${element(var.controller_domains, count.index)}"
@@ -10,7 +18,7 @@ resource "null_resource" "copy-controller-secrets" {
 }

 provisioner "file" {
-  content = "${module.bootkube.kubeconfig}"
+  content = "${module.bootkube.kubeconfig-kubelet}"
   destination = "$HOME/kubeconfig"
 }

@@ -70,6 +78,14 @@ resource "null_resource" "copy-controller-secrets" {
 resource "null_resource" "copy-worker-secrets" {
   count = "${length(var.worker_names)}"

+  # Without depends_on, remote-exec could start and wait for machines before
+  # matchbox groups are written, causing a deadlock.
+  depends_on = [
+    "matchbox_group.install",
+    "matchbox_group.controller",
+    "matchbox_group.worker",
+  ]
+
   connection {
     type = "ssh"
     host = "${element(var.worker_domains, count.index)}"
@@ -78,7 +94,7 @@ resource "null_resource" "copy-worker-secrets" {
 }

 provisioner "file" {
-  content = "${module.bootkube.kubeconfig}"
+  content = "${module.bootkube.kubeconfig-kubelet}"
   destination = "$HOME/kubeconfig"
 }
@@ -24,27 +24,39 @@ variable "os_version" {
 # Terraform's crude "type system" does not properly support lists of maps so we do this.

 variable "controller_names" {
   type = "list"
+  description = "Ordered list of controller names (e.g. [node1])"
 }

 variable "controller_macs" {
   type = "list"
+  description = "Ordered list of controller identifying MAC addresses (e.g. [52:54:00:a1:9c:ae])"
 }

 variable "controller_domains" {
   type = "list"
+  description = "Ordered list of controller FQDNs (e.g. [node1.example.com])"
 }

 variable "worker_names" {
   type = "list"
+  description = "Ordered list of worker names (e.g. [node2, node3])"
 }

 variable "worker_macs" {
   type = "list"
+  description = "Ordered list of worker identifying MAC addresses (e.g. [52:54:00:b2:2f:86, 52:54:00:c3:61:77])"
 }

 variable "worker_domains" {
   type = "list"
+  description = "Ordered list of worker FQDNs (e.g. [node2.example.com, node3.example.com])"
 }

+variable "clc_snippets" {
+  type = "map"
+  description = "Map from machine names to lists of Container Linux Config snippets"
+  default = {}
+}
+
 # configuration
@@ -91,7 +103,7 @@ variable "pod_cidr" {
 variable "service_cidr" {
   description = <<EOD
 CIDR IPv4 range to assign Kubernetes services.
-The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for kube-dns.
+The 1st IP will be reserved for kube_apiserver, the 10th IP will be reserved for coredns.
 EOD

   type = "string"
@@ -101,7 +113,7 @@ EOD
 # optional

 variable "cluster_domain_suffix" {
-  description = "Queries for domains with the suffix will be answered by kube-dns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
+  description = "Queries for domains with the suffix will be answered by coredns. Default is cluster.local (e.g. foo.default.svc.cluster.local) "
   type = "string"
   default = "cluster.local"
 }
@@ -130,16 +142,8 @@ variable "kernel_args" {
   default = []
 }

-# unofficial, undocumented, unsupported, temporary
-variable "controller_networkds" {
-  type = "list"
-  description = "Controller Container Linux config networkd section"
-  default = []
-}
-
-variable "worker_networkds" {
-  type = "list"
-  description = "Worker Container Linux config networkd section"
-  default = []
-}
+variable "enable_reporting" {
+  type = "string"
+  description = "Enable usage or analytics reporting to upstreams (Calico)"
+  default = "false"
+}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user