From d276fffcda9fe9c1c4216a0b63aca2855f71d640 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Wed, 4 Apr 2018 21:38:03 -0700 Subject: [PATCH] Fix bare-metal multiple apply/ssh on Terraform v0.11.4+ * Terraform v0.11.4 introduced changes to remote-exec that mean Typhoon bare-metal clusters require multiple runs of terraform apply to ssh and bootstrap. * Bare-metal installs PXE boot a live instance to install to disk and then reboot from disk as controllers/workers. Terraform remote-exec has no way to "know" to wait until the reboot has occurred to kick off Kubernetes bootstrap. Previously Typhoon created a "debug" user during this install phase to allow an admin to SSH, but remote-exec would hang, trying to connect as user "core". Terraform v0.11.4 changes this behavior so remote-exec fails and a user must re-run terraform apply until succeeding. * A new way to "trick" remote-exec into waiting for the reboot into the disk install is to run SSH on a non-standard port during the disk install. This retains the ability for an admin to SSH during install (most distros don't have this) and fixes the issue so only a single run of terraform apply is needed. * https://github.com/hashicorp/terraform/pull/17359#issuecomment-376415464 --- CHANGES.md | 8 +++++++- .../cl/container-linux-install.yaml.tmpl | 19 ++++++++++++------- .../container-linux/kubernetes/groups.tf | 4 ---- .../container-linux/kubernetes/profiles.tf | 2 ++ 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index b65f8b22..8446396c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,11 +5,17 @@ Notable changes between versions. 
## Latest * Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175)) -* Use `k8s.gcr.io` instead of `gcr.io/google_containers` +* Use `k8s.gcr.io` instead of `gcr.io/google_containers` ([#180](https://github.com/poseidon/typhoon/pull/180)) * Kubernetes [recommends](https://groups.google.com/forum/#!msg/kubernetes-dev/ytjk_rNrTa0/3EFUHvovCAAJ) using the alias to pull from the nearest regional mirror and to abstract the backing container registry * Update kube-dns from v1.14.8 to v1.14.9 * Update etcd from v3.3.2 to v3.3.3 +#### Bare-Metal + +* Fix need for multiple `terraform apply` runs to create a cluster with Terraform v0.11.4 ([#181](https://github.com/poseidon/typhoon/pull/181)) + * To SSH during a disk install for debugging, SSH as user "core" with port 2222 + * Remove the old trick of using a user "debug" during disk install + #### Addons * Add Prometheus discovery for etcd peers on controller nodes ([#175](https://github.com/poseidon/typhoon/pull/175)) diff --git a/bare-metal/container-linux/kubernetes/cl/container-linux-install.yaml.tmpl b/bare-metal/container-linux/kubernetes/cl/container-linux-install.yaml.tmpl index 1371ec76..fb6e687c 100644 --- a/bare-metal/container-linux/kubernetes/cl/container-linux-install.yaml.tmpl +++ b/bare-metal/container-linux/kubernetes/cl/container-linux-install.yaml.tmpl @@ -12,6 +12,16 @@ systemd: ExecStart=/opt/installer [Install] WantedBy=multi-user.target + # Avoid using the standard SSH port so terraform apply cannot SSH until + # post-install. But admins may SSH to debug disk install problems. + # After install, sshd will use port 22 and users/terraform can connect. + - name: sshd.socket + dropins: + - name: 10-sshd-port.conf + contents: | + [Socket] + ListenStream= + ListenStream=2222 storage: files: - path: /opt/installer @@ -32,11 +42,6 @@ storage: systemctl reboot passwd: users: - # Avoid using standard name "core" so terraform apply cannot SSH until post-install. 
- - name: debug - create: - groups: - - sudo - - docker + - name: core ssh_authorized_keys: - - {{.ssh_authorized_key}} + - "${ssh_authorized_key}" diff --git a/bare-metal/container-linux/kubernetes/groups.tf b/bare-metal/container-linux/kubernetes/groups.tf index cdc66eda..3566c9a9 100644 --- a/bare-metal/container-linux/kubernetes/groups.tf +++ b/bare-metal/container-linux/kubernetes/groups.tf @@ -8,10 +8,6 @@ resource "matchbox_group" "container-linux-install" { selector { mac = "${element(concat(var.controller_macs, var.worker_macs), count.index)}" } - - metadata { - ssh_authorized_key = "${var.ssh_authorized_key}" - } } resource "matchbox_group" "controller" { diff --git a/bare-metal/container-linux/kubernetes/profiles.tf b/bare-metal/container-linux/kubernetes/profiles.tf index cdcadf2e..3d50b677 100644 --- a/bare-metal/container-linux/kubernetes/profiles.tf +++ b/bare-metal/container-linux/kubernetes/profiles.tf @@ -32,6 +32,7 @@ data "template_file" "container-linux-install-configs" { ignition_endpoint = "${format("%s/ignition", var.matchbox_http_endpoint)}" install_disk = "${var.install_disk}" container_linux_oem = "${var.container_linux_oem}" + ssh_authorized_key = "${var.ssh_authorized_key}" # only cached-container-linux profile adds -b baseurl baseurl_flag = "" @@ -73,6 +74,7 @@ data "template_file" "cached-container-linux-install-configs" { ignition_endpoint = "${format("%s/ignition", var.matchbox_http_endpoint)}" install_disk = "${var.install_disk}" container_linux_oem = "${var.container_linux_oem}" + ssh_authorized_key = "${var.ssh_authorized_key}" # profile uses -b baseurl to install from matchbox cache baseurl_flag = "-b ${var.matchbox_http_endpoint}/assets/coreos"