typhoon/google-cloud/flatcar-linux/kubernetes/workers/workers.tf
Dalton Hubble e87d5aabc3 Adjust Google Cloud worker health checks to use kube-proxy healthz
* Change the workers managed instance group to health check nodes
via HTTP probe of the kube-proxy port 10256 /healthz endpoints
* Advantages: kube-proxy is a lower value target (in case there
were bugs in firewalls) than Kubelet, it's more representative than
health checking Kubelet (Kubelet must run AND kube-proxy Daemonset
must be healthy), and it's already used by kube-proxy liveness probes
(better discoverability via kubectl or alerts on pods crashlooping)
* Another motivator is that GKE clusters also use kube-proxy port
10256 checks to assess node health
2022-08-17 20:50:52 -07:00

121 lines
3.2 KiB
HCL

# Regional managed instance group (MIG) that runs the worker nodes
resource "google_compute_region_instance_group_manager" "workers" {
  name        = "${var.name}-worker"
  description = "Compute instance group of ${var.name} workers"
  region      = var.region

  # Prefix used to name the instances created by the group
  base_instance_name = "${var.name}-worker"

  # Desired number of workers and the target pool the instances are added to
  target_size  = var.worker_count
  target_pools = [google_compute_target_pool.workers.self_link]

  # Single version: every instance is created from the worker instance template
  version {
    name              = "default"
    instance_template = google_compute_instance_template.worker.self_link
  }

  # Instance template changes are rolled out proactively by replacing instances.
  # - Surge: up to 3 new instances are created first, with 0 allowed unavailable,
  #   and only then are old instances deleted.
  # - REPLACE ensures instances pick up the new Ignition config.
  update_policy {
    type                  = "PROACTIVE"
    minimal_action        = "REPLACE"
    max_surge_fixed       = 3
    max_unavailable_fixed = 0
  }

  # Named ports exposed by the group
  named_port {
    name = "http"
    port = "80"
  }

  named_port {
    name = "https"
    port = "443"
  }

  # Auto-healing driven by the worker health check; new instances get
  # 300 seconds after creation before the policy is applied to them.
  auto_healing_policies {
    health_check      = google_compute_health_check.worker.id
    initial_delay_sec = 300
  }
}
# Health check for worker nodes, used by the workers instance group for
# auto-healing. Probes the kube-proxy healthz endpoint (HTTP port 10256,
# path /healthz) rather than the Kubelet.
resource "google_compute_health_check" "worker" {
  name        = "${var.name}-worker-health"
  description = "Health check for worker node"

  # HTTP probe of kube-proxy healthz
  http_health_check {
    port         = "10256"
    request_path = "/healthz"
  }

  # Probe every 30s, allowing up to 20s for each response
  check_interval_sec = 30
  timeout_sec        = 20

  # One success marks a node healthy; six failures mark it unhealthy
  healthy_threshold   = 1
  unhealthy_threshold = 6
}
# Instance template from which the workers instance group creates its nodes
resource "google_compute_instance_template" "worker" {
  name_prefix  = "${var.name}-worker-"
  description  = "Worker Instance template"
  machine_type = var.machine_type

  can_ip_forward = true
  tags           = ["worker", "${var.cluster_name}-worker", "${var.name}-worker"]

  # Rendered worker config is passed to the instance as user-data
  metadata = {
    user-data = data.ct_config.worker.rendered
  }

  # Preemptible workers run as SPOT instances without automatic restart;
  # otherwise instances are STANDARD and restart automatically.
  scheduling {
    preemptible        = var.preemptible
    provisioning_model = var.preemptible ? "SPOT" : "STANDARD"
    automatic_restart  = !var.preemptible
    # Spot instances with termination action DELETE cannot be used with MIGs
    instance_termination_action = var.preemptible ? "STOP" : null
  }

  # Boot disk, deleted together with the instance
  disk {
    boot         = true
    auto_delete  = true
    source_image = data.google_compute_image.flatcar-linux.self_link
    disk_size_gb = var.disk_size
  }

  network_interface {
    network = var.network
    # Ephemeral external IP
    access_config {}
  }

  # Accelerator count and type are supplied by the module variables
  guest_accelerator {
    count = var.accelerator_count
    type  = var.accelerator_type
  }

  lifecycle {
    # To update an Instance Template, Terraform should replace the existing
    # resource, creating the replacement before destroying the old one
    create_before_destroy = true

    # Changes to the boot disk source image alone do not trigger an update
    ignore_changes = [
      disk[0].source_image
    ]
  }
}
# Flatcar Linux worker config, rendered from the Butane template
# butane/worker.yaml; its rendered output is the worker instances' user-data.
data "ct_config" "worker" {
  strict   = true
  snippets = var.snippets

  content = templatefile("${path.module}/butane/worker.yaml", {
    # Multi-line kubeconfig, with every line after the first indented by 10
    # spaces (presumably to nest under a YAML key in the template)
    kubeconfig         = indent(10, var.kubeconfig)
    ssh_authorized_key = var.ssh_authorized_key

    # Cluster DNS service IP is host number 10 within the service CIDR
    cluster_dns_service_ip = cidrhost(var.service_cidr, 10)
    cluster_domain_suffix  = var.cluster_domain_suffix

    # Labels and taints are passed to the template as comma-separated strings
    node_labels = join(",", var.node_labels)
    node_taints = join(",", var.node_taints)
  })
}