Dalton Hubble 20b76d6e00 Roll instance template changes to worker managed instance groups
* When a worker managed instance group's (MIG) instance template
changes (including machine type, disk size, or Butane snippets,
but excluding new OS images), use Google Cloud's rolling update
features to ensure instances match the declared state
* Ignore new OS images since Fedora CoreOS and Flatcar Linux nodes
already auto-update and reboot themselves
* Rolling updates will create surge instances, wait for health
checks, then delete old instances (0 unavailable instances)
* Instances are replaced to ensure new Ignition/Butane snippets
are respected
* Add managed instance group autohealing (i.e. health checks) to
ensure new instances' Kubelet is running


* Name apiserver and kubelet health checks consistently
* Rename MIG from `${var.name}-worker-group` to `${var.name}-worker`

Rel: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups
2022-08-14 13:06:53 -07:00

120 lines
3.1 KiB

# Managed instance group of workers
#
# Regional MIG that keeps `var.worker_count` worker instances running from the
# worker instance template, rolls template changes out by replacing instances,
# and autoheals instances that fail the Kubelet health check.
resource "google_compute_region_instance_group_manager" "workers" {
  name        = "${var.name}-worker"
  description = "Compute instance group of ${var.name} workers"

  # instance name prefix for instances in the group
  base_instance_name = "${var.name}-worker"
  region             = var.region

  version {
    name              = "default"
    instance_template = google_compute_instance_template.worker.self_link
  }

  # Roll out MIG instance template changes by replacing instances.
  # - Surge to create new instances, then delete old instances.
  # - Replace ensures new Ignition is picked up
  update_policy {
    type            = "PROACTIVE"
    max_surge_fixed = 3
    # Zero unavailable: old instances are only removed once surge
    # replacements exist.
    max_unavailable_fixed = 0
    minimal_action        = "REPLACE"
  }

  target_size  = var.worker_count
  target_pools = [google_compute_target_pool.workers.self_link]

  named_port {
    name = "http"
    port = "80"
  }

  named_port {
    name = "https"
    port = "443"
  }

  # Autohealing: recreate instances that fail the worker health check.
  auto_healing_policies {
    health_check = google_compute_health_check.worker.id
    # Seconds to wait after instance creation before autohealing applies.
    initial_delay_sec = 120
  }
}
# Health check for worker Kubelet
#
# SSL health check against port 10250 on each worker. An instance is marked
# healthy after 1 successful probe and unhealthy after 6 consecutive failures,
# with probes every 30 seconds and a 20 second timeout per probe.
resource "google_compute_health_check" "worker" {
  name        = "${var.name}-kubelet-health"
  description = "Health check for worker Kubelet"

  timeout_sec         = 20
  check_interval_sec  = 30
  healthy_threshold   = 1
  unhealthy_threshold = 6

  ssl_health_check {
    port = "10250"
  }
}
# Worker instance template
#
# Template from which the worker managed instance group creates instances.
# Uses `name_prefix` together with `create_before_destroy` so a changed
# template can be created under a new name before the old one is removed.
resource "google_compute_instance_template" "worker" {
  name_prefix  = "${var.name}-worker-"
  description  = "Worker Instance template"
  machine_type = var.machine_type

  metadata = {
    # Rendered worker config (Butane -> Ignition) passed as instance user-data
    user-data = data.ct_config.worker.rendered
  }

  scheduling {
    provisioning_model = var.preemptible ? "SPOT" : "STANDARD"
    preemptible        = var.preemptible
    automatic_restart  = var.preemptible ? false : true
    # Spot instances with termination action DELETE cannot be used with MIGs
    instance_termination_action = var.preemptible ? "STOP" : null
  }

  disk {
    auto_delete  = true
    boot         = true
    source_image = data.google_compute_image.flatcar-linux.self_link
    disk_size_gb = var.disk_size
  }

  network_interface {
    network = var.network
    # Ephemeral external IP
    access_config {}
  }

  can_ip_forward = true
  tags           = ["worker", "${var.cluster_name}-worker", "${var.name}-worker"]

  guest_accelerator {
    count = var.accelerator_count
    type  = var.accelerator_type
  }

  lifecycle {
    # NOTE(review): the list element was missing from the extracted source;
    # `disk[0].source_image` is reconstructed from the stated intent to ignore
    # new OS images (nodes auto-update themselves) - confirm against upstream.
    ignore_changes = [
      disk[0].source_image
    ]
    # To update an Instance Template, Terraform should replace the existing resource
    create_before_destroy = true
  }
}
# Flatcar Linux worker
#
# Renders the worker Butane template (butane/worker.yaml) with cluster
# settings, merges any user-provided snippets, and exposes the result as
# `data.ct_config.worker.rendered`.
data "ct_config" "worker" {
  content = templatefile("${path.module}/butane/worker.yaml", {
    # Indented by 10 spaces so the kubeconfig can be embedded in the YAML template
    kubeconfig         = indent(10, var.kubeconfig)
    ssh_authorized_key = var.ssh_authorized_key
    # Host number 10 within the service CIDR
    cluster_dns_service_ip = cidrhost(var.service_cidr, 10)
    cluster_domain_suffix  = var.cluster_domain_suffix
    # Lists are flattened to comma-separated strings for the template
    node_labels = join(",", var.node_labels)
    node_taints = join(",", var.node_taints)
  })
  strict   = true
  snippets = var.snippets
}