Change worker node pools from uniform to flexible orchestration mode

* Use flexible orchestration mode. Azure has started to recommend this
mode because it allows interacting with VMSS instances like regular VMs
via the CLI or via the Azure Portal
* Add options to allow workers nodes to use ephemeral local disks
  * Add `controller_disk_type` and `controller_disk_size` variables
  * Add `worker_disk_type`, `worker_disk_size`, and `worker_ephemeral_disk` variables
This commit is contained in:
Dalton Hubble 2024-07-09 07:53:41 -07:00
parent a4fab61066
commit 0d10d180f8
No known key found for this signature in database
GPG Key ID: BD34C2E3EF32B7A0
12 changed files with 206 additions and 121 deletions

View File

@ -18,6 +18,10 @@ Notable changes between versions.
* Configure worker nodes to use outbound rules and the load balancer for SNAT
* Extend network security rules to allow IPv6 traffic, analogous to IPv4
* Rename `region` variable to `location` to align with Azure platform conventions ([#1469](https://github.com/poseidon/typhoon/pull/1469))
* Change worker pools from uniform to flexible orchestration mode ([#1473](https://github.com/poseidon/typhoon/pull/1473))
* Add options to allow workers nodes to use ephemeral local disks ([#1473](https://github.com/poseidon/typhoon/pull/1473))
* Add `controller_disk_type` and `controller_disk_size` variables
* Add `worker_disk_type`, `worker_disk_size`, and `worker_ephemeral_disk` variables
* Reduce the number of public IPv4 addresses needed for the Azure load balancer ([#1470](https://github.com/poseidon/typhoon/pull/1470))
```diff
@ -30,6 +34,10 @@ module "cluster" {
+ network_cidr = {
+ ipv4 = ["10.0.0.0/16"]
+ }
# optional
+ controller_disk_type = "StandardSSD_LRS"
+ worker_ephemeral_disk = true
}
```

View File

@ -44,9 +44,9 @@ resource "azurerm_linux_virtual_machine" "controllers" {
source_image_id = var.os_image
os_disk {
name = "${var.cluster_name}-controller-${count.index}"
storage_account_type = var.controller_disk_type
disk_size_gb = var.controller_disk_size
caching = "None"
disk_size_gb = var.disk_size
storage_account_type = "Premium_LRS"
}
# network

View File

@ -22,41 +22,66 @@ variable "dns_zone_group" {
# instances
variable "os_image" {
type = string
description = "Fedora CoreOS image for instances"
}
variable "controller_count" {
type = number
description = "Number of controllers (i.e. masters)"
default = 1
}
variable "worker_count" {
type = number
description = "Number of workers"
default = 1
}
variable "controller_type" {
type = string
description = "Machine type for controllers (see `az vm list-skus --location centralus`)"
default = "Standard_B2s"
}
variable "controller_disk_type" {
type = string
description = "Type of managed disk for controller node(s)"
default = "Premium_LRS"
}
variable "controller_disk_size" {
type = number
description = "Size of the managed disk in GB for controller node(s)"
default = 30
}
variable "worker_count" {
type = number
description = "Number of workers"
default = 1
}
variable "worker_type" {
type = string
description = "Machine type for workers (see `az vm list-skus --location centralus`)"
default = "Standard_D2as_v5"
}
variable "os_image" {
variable "worker_disk_type" {
type = string
description = "Fedora CoreOS image for instances"
description = "Type of managed disk for worker nodes"
default = "Standard_LRS"
}
variable "disk_size" {
variable "worker_disk_size" {
type = number
description = "Size of the disk in GB"
description = "Size of the managed disk in GB for worker nodes"
default = 30
}
variable "worker_ephemeral_disk" {
type = bool
description = "Use ephemeral local disk instead of managed disk (requires vm_type with local storage)"
default = false
}
variable "worker_priority" {
type = string
description = "Set worker priority to Spot to use reduced cost surplus capacity, with the tradeoff that instances can be deallocated at any time."

View File

@ -9,10 +9,13 @@ module "workers" {
security_group_id = azurerm_network_security_group.worker.id
backend_address_pool_ids = local.backend_address_pool_ids
worker_count = var.worker_count
vm_type = var.worker_type
os_image = var.os_image
priority = var.worker_priority
worker_count = var.worker_count
vm_type = var.worker_type
os_image = var.os_image
disk_type = var.worker_disk_type
disk_size = var.worker_disk_size
ephemeral_disk = var.worker_ephemeral_disk
priority = var.worker_priority
# configuration
kubeconfig = module.bootstrap.kubeconfig-kubelet

View File

@ -52,6 +52,24 @@ variable "os_image" {
description = "Fedora CoreOS image for instances"
}
variable "disk_type" {
type = string
description = "Type of managed disk"
default = "Standard_LRS"
}
variable "disk_size" {
type = number
description = "Size of the managed disk in GB"
default = 30
}
variable "ephemeral_disk" {
type = bool
description = "Use ephemeral local disk instead of managed disk (requires vm_type with local storage)"
default = false
}
variable "priority" {
type = string
description = "Set priority to Spot to use reduced cost surplus capacity, with the tradeoff that instances can be evicted at any time."

View File

@ -3,21 +3,29 @@ locals {
}
# Workers scale set
resource "azurerm_linux_virtual_machine_scale_set" "workers" {
name = "${var.name}-worker"
resource_group_name = var.resource_group_name
location = var.location
sku = var.vm_type
instances = var.worker_count
# instance name prefix for instances in the set
computer_name_prefix = "${var.name}-worker"
single_placement_group = false
resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
name = "${var.name}-worker"
resource_group_name = var.resource_group_name
location = var.location
platform_fault_domain_count = 1
sku_name = var.vm_type
instances = var.worker_count
# storage
source_image_id = var.os_image
encryption_at_host_enabled = true
source_image_id = var.os_image
os_disk {
storage_account_type = "Standard_LRS"
caching = "ReadWrite"
storage_account_type = var.disk_type
disk_size_gb = var.disk_size
caching = "ReadOnly"
# Optionally, use the ephemeral disk of the instance type (support varies)
dynamic "diff_disk_settings" {
for_each = var.ephemeral_disk ? [1] : []
content {
option = "Local"
placement = "ResourceDisk"
}
}
}
# network
@ -44,20 +52,24 @@ resource "azurerm_linux_virtual_machine_scale_set" "workers" {
}
# boot
custom_data = base64encode(data.ct_config.worker.rendered)
user_data_base64 = base64encode(data.ct_config.worker.rendered)
boot_diagnostics {
# defaults to a managed storage account
}
# Azure requires an RSA admin_ssh_key
admin_username = "core"
admin_ssh_key {
username = "core"
public_key = local.azure_authorized_key
os_profile {
linux_configuration {
admin_username = "core"
admin_ssh_key {
username = "core"
public_key = local.azure_authorized_key
}
computer_name_prefix = "${var.name}-worker"
}
}
# lifecycle
upgrade_mode = "Manual"
# eviction policy may only be set when priority is Spot
priority = var.priority
eviction_policy = var.priority == "Spot" ? "Delete" : null
@ -66,25 +78,6 @@ resource "azurerm_linux_virtual_machine_scale_set" "workers" {
}
}
# Scale up or down to maintain desired number, tolerating deallocations.
resource "azurerm_monitor_autoscale_setting" "workers" {
name = "${var.name}-maintain-desired"
resource_group_name = var.resource_group_name
location = var.location
# autoscale
enabled = true
target_resource_id = azurerm_linux_virtual_machine_scale_set.workers.id
profile {
name = "default"
capacity {
minimum = var.worker_count
default = var.worker_count
maximum = var.worker_count
}
}
}
# Fedora CoreOS worker
data "ct_config" "worker" {
content = templatefile("${path.module}/butane/worker.yaml", {

View File

@ -49,9 +49,9 @@ resource "azurerm_linux_virtual_machine" "controllers" {
# storage
os_disk {
name = "${var.cluster_name}-controller-${count.index}"
storage_account_type = var.controller_disk_type
disk_size_gb = var.controller_disk_size
caching = "None"
disk_size_gb = var.disk_size
storage_account_type = "Premium_LRS"
}
# Flatcar Container Linux

View File

@ -22,30 +22,6 @@ variable "dns_zone_group" {
# instances
variable "controller_count" {
type = number
description = "Number of controllers (i.e. masters)"
default = 1
}
variable "worker_count" {
type = number
description = "Number of workers"
default = 1
}
variable "controller_type" {
type = string
description = "Machine type for controllers (see `az vm list-skus --location centralus`)"
default = "Standard_B2s"
}
variable "worker_type" {
type = string
description = "Machine type for workers (see `az vm list-skus --location centralus`)"
default = "Standard_D2as_v5"
}
variable "os_image" {
type = string
description = "Channel for a Container Linux derivative (flatcar-stable, flatcar-beta, flatcar-alpha)"
@ -57,12 +33,60 @@ variable "os_image" {
}
}
variable "disk_size" {
variable "controller_count" {
type = number
description = "Size of the disk in GB"
description = "Number of controllers (i.e. masters)"
default = 1
}
variable "controller_type" {
type = string
description = "Machine type for controllers (see `az vm list-skus --location centralus`)"
default = "Standard_B2s"
}
variable "controller_disk_type" {
type = string
description = "Type of managed disk for controller node(s)"
default = "Premium_LRS"
}
variable "controller_disk_size" {
type = number
description = "Size of the managed disk in GB for controller node(s)"
default = 30
}
variable "worker_count" {
type = number
description = "Number of workers"
default = 1
}
variable "worker_type" {
type = string
description = "Machine type for workers (see `az vm list-skus --location centralus`)"
default = "Standard_D2as_v5"
}
variable "worker_disk_type" {
type = string
description = "Type of managed disk for worker nodes"
default = "Standard_LRS"
}
variable "worker_disk_size" {
type = number
description = "Size of the managed disk in GB for worker nodes"
default = 30
}
variable "worker_ephemeral_disk" {
type = bool
description = "Use ephemeral local disk instead of managed disk (requires vm_type with local storage)"
default = false
}
variable "worker_priority" {
type = string
description = "Set worker priority to Spot to use reduced cost surplus capacity, with the tradeoff that instances can be deallocated at any time."

View File

@ -9,10 +9,13 @@ module "workers" {
security_group_id = azurerm_network_security_group.worker.id
backend_address_pool_ids = local.backend_address_pool_ids
worker_count = var.worker_count
vm_type = var.worker_type
os_image = var.os_image
priority = var.worker_priority
worker_count = var.worker_count
vm_type = var.worker_type
os_image = var.os_image
disk_type = var.worker_disk_type
disk_size = var.worker_disk_size
ephemeral_disk = var.worker_ephemeral_disk
priority = var.worker_priority
# configuration
kubeconfig = module.bootstrap.kubeconfig-kubelet

View File

@ -58,6 +58,24 @@ variable "os_image" {
}
}
variable "disk_type" {
type = string
description = "Type of managed disk"
default = "Standard_LRS"
}
variable "disk_size" {
type = number
description = "Size of the managed disk in GB"
default = 30
}
variable "ephemeral_disk" {
type = bool
description = "Use ephemeral local disk instead of managed disk (requires vm_type with local storage)"
default = false
}
variable "priority" {
type = string
description = "Set priority to Spot to use reduced cost surplus capacity, with the tradeoff that instances can be evicted at any time."

View File

@ -8,20 +8,28 @@ locals {
}
# Workers scale set
resource "azurerm_linux_virtual_machine_scale_set" "workers" {
name = "${var.name}-worker"
resource_group_name = var.resource_group_name
location = var.location
sku = var.vm_type
instances = var.worker_count
# instance name prefix for instances in the set
computer_name_prefix = "${var.name}-worker"
single_placement_group = false
resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
name = "${var.name}-worker"
resource_group_name = var.resource_group_name
location = var.location
platform_fault_domain_count = 1
sku_name = var.vm_type
instances = var.worker_count
# storage
encryption_at_host_enabled = true
os_disk {
storage_account_type = "Standard_LRS"
caching = "ReadWrite"
storage_account_type = var.disk_type
disk_size_gb = var.disk_size
caching = "ReadOnly"
# Optionally, use the ephemeral disk of the instance type (support varies)
dynamic "diff_disk_settings" {
for_each = var.ephemeral_disk ? [1] : []
content {
option = "Local"
placement = "ResourceDisk"
}
}
}
# Flatcar Container Linux
@ -65,20 +73,24 @@ resource "azurerm_linux_virtual_machine_scale_set" "workers" {
}
# boot
custom_data = base64encode(data.ct_config.worker.rendered)
user_data_base64 = base64encode(data.ct_config.worker.rendered)
boot_diagnostics {
# defaults to a managed storage account
}
# Azure requires an RSA admin_ssh_key
admin_username = "core"
admin_ssh_key {
username = "core"
public_key = local.azure_authorized_key
os_profile {
linux_configuration {
admin_username = "core"
admin_ssh_key {
username = "core"
public_key = local.azure_authorized_key
}
computer_name_prefix = "${var.name}-worker"
}
}
# lifecycle
upgrade_mode = "Manual"
# eviction policy may only be set when priority is Spot
priority = var.priority
eviction_policy = var.priority == "Spot" ? "Delete" : null
@ -87,25 +99,6 @@ resource "azurerm_linux_virtual_machine_scale_set" "workers" {
}
}
# Scale up or down to maintain desired number, tolerating deallocations.
resource "azurerm_monitor_autoscale_setting" "workers" {
name = "${var.name}-maintain-desired"
resource_group_name = var.resource_group_name
location = var.location
# autoscale
enabled = true
target_resource_id = azurerm_linux_virtual_machine_scale_set.workers.id
profile {
name = "default"
capacity {
minimum = var.worker_count
default = var.worker_count
maximum = var.worker_count
}
}
}
# Flatcar Linux worker
data "ct_config" "worker" {
content = templatefile("${path.module}/butane/worker.yaml", {

Binary file not shown.

Before

Width:  |  Height:  |  Size: 39 KiB

After

Width:  |  Height:  |  Size: 82 KiB