diff --git a/CHANGES.md b/CHANGES.md index 2e2940c8..afe2a20e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,6 +15,15 @@ Notable changes between versions. * Allow a snippet with a systemd dropin to set an alternate image (e.g. mirror) * Fix local node delete oneshot on node shutdown ([#856](https://github.com/poseidon/typhoon/pull/855)) +#### AWS + +* Add experimental Fedora CoreOS arm64 support ([docs](https://typhoon.psdn.io/advanced/arm64/), [#875](https://github.com/poseidon/typhoon/pull/875)) + * Allow arm64 full-cluster or mixed/hybrid cluster with worker pools + * Add `arch` variable to cluster module + * Add `daemonset_tolerations` variable to cluster module + * Add `node_taints` variable to workers module + * Requires flannel CNI provider and use of experimental AMI (see docs) + ### Flatcar Linux * Rename `container-linux` modules to `flatcar-linux` ([#858](https://github.com/poseidon/typhoon/issues/858)) (**action required**) diff --git a/aws/fedora-coreos/kubernetes/ami.tf b/aws/fedora-coreos/kubernetes/ami.tf index a7ab184b..2ac01d44 100644 --- a/aws/fedora-coreos/kubernetes/ami.tf +++ b/aws/fedora-coreos/kubernetes/ami.tf @@ -18,3 +18,27 @@ data "aws_ami" "fedora-coreos" { values = ["Fedora CoreOS ${var.os_stream} *"] } } + +# Experimental Fedora CoreOS arm64 / aarch64 AMIs from Poseidon +# WARNING: These AMIs will be removed when Fedora CoreOS publishes arm64 AMIs +# and may be removed for any reason before then as well. Do not use. 
+data "aws_ami" "fedora-coreos-arm" { + most_recent = true + owners = ["099663496933"] + + filter { + name = "architecture" + values = ["arm64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "name" + values = ["fedora-coreos-*"] + } +} + diff --git a/aws/fedora-coreos/kubernetes/bootstrap.tf b/aws/fedora-coreos/kubernetes/bootstrap.tf index 88515684..d357034f 100644 --- a/aws/fedora-coreos/kubernetes/bootstrap.tf +++ b/aws/fedora-coreos/kubernetes/bootstrap.tf @@ -12,6 +12,7 @@ module "bootstrap" { cluster_domain_suffix = var.cluster_domain_suffix enable_reporting = var.enable_reporting enable_aggregation = var.enable_aggregation + daemonset_tolerations = var.daemonset_tolerations trusted_certs_dir = "/etc/pki/tls/certs" } diff --git a/aws/fedora-coreos/kubernetes/controllers.tf b/aws/fedora-coreos/kubernetes/controllers.tf index 2fd253db..1ab26aaa 100644 --- a/aws/fedora-coreos/kubernetes/controllers.tf +++ b/aws/fedora-coreos/kubernetes/controllers.tf @@ -22,9 +22,8 @@ resource "aws_instance" "controllers" { } instance_type = var.controller_type - - ami = data.aws_ami.fedora-coreos.image_id - user_data = data.ct_config.controller-ignitions.*.rendered[count.index] + ami = var.arch == "arm64" ? data.aws_ami.fedora-coreos-arm.image_id : data.aws_ami.fedora-coreos.image_id + user_data = data.ct_config.controller-ignitions.*.rendered[count.index] # storage root_block_device { @@ -63,6 +62,7 @@ data "template_file" "controller-configs" { vars = { # Cannot use cyclic dependencies on controllers or their DNS records + etcd_arch = var.arch == "arm64" ? "-arm64" : "" etcd_name = "etcd${count.index}" etcd_domain = "${var.cluster_name}-etcd${count.index}.${var.dns_zone}" # etcd0=https://cluster-etcd0.example.com,etcd1=https://cluster-etcd1.example.com,... 
diff --git a/aws/fedora-coreos/kubernetes/fcc/controller.yaml b/aws/fedora-coreos/kubernetes/fcc/controller.yaml index eaa912de..4147b14d 100644 --- a/aws/fedora-coreos/kubernetes/fcc/controller.yaml +++ b/aws/fedora-coreos/kubernetes/fcc/controller.yaml @@ -12,7 +12,7 @@ systemd: Wants=network-online.target network.target After=network-online.target [Service] - Environment=ETCD_IMAGE=quay.io/coreos/etcd:v3.4.12 + Environment=ETCD_IMAGE=quay.io/coreos/etcd:v3.4.12${etcd_arch} Type=exec ExecStartPre=/bin/mkdir -p /var/lib/etcd ExecStartPre=-/usr/bin/podman rm etcd @@ -214,6 +214,7 @@ storage: ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key ETCD_PEER_CLIENT_CERT_AUTH=true + ETCD_UNSUPPORTED_ARCH=arm64 passwd: users: - name: core diff --git a/aws/fedora-coreos/kubernetes/variables.tf b/aws/fedora-coreos/kubernetes/variables.tf index fa47cbe5..7557e919 100644 --- a/aws/fedora-coreos/kubernetes/variables.tf +++ b/aws/fedora-coreos/kubernetes/variables.tf @@ -155,3 +155,15 @@ variable "cluster_domain_suffix" { default = "cluster.local" } +variable "arch" { + type = string + description = "Container architecture (amd64 or arm64)" + default = "amd64" +} + +variable "daemonset_tolerations" { + type = list(string) + description = "List of additional taint keys kube-system DaemonSets should tolerate (e.g. 
['custom-role', 'gpu-role'])" + default = [] +} + diff --git a/aws/fedora-coreos/kubernetes/workers.tf b/aws/fedora-coreos/kubernetes/workers.tf index dcfc05d9..0ec9cdb6 100644 --- a/aws/fedora-coreos/kubernetes/workers.tf +++ b/aws/fedora-coreos/kubernetes/workers.tf @@ -9,6 +9,7 @@ module "workers" { worker_count = var.worker_count instance_type = var.worker_type os_stream = var.os_stream + arch = var.arch disk_size = var.disk_size spot_price = var.worker_price target_groups = var.worker_target_groups diff --git a/aws/fedora-coreos/kubernetes/workers/ami.tf b/aws/fedora-coreos/kubernetes/workers/ami.tf index a7ab184b..2ac01d44 100644 --- a/aws/fedora-coreos/kubernetes/workers/ami.tf +++ b/aws/fedora-coreos/kubernetes/workers/ami.tf @@ -18,3 +18,27 @@ data "aws_ami" "fedora-coreos" { values = ["Fedora CoreOS ${var.os_stream} *"] } } + +# Experimental Fedora CoreOS arm64 / aarch64 AMIs from Poseidon +# WARNING: These AMIs will be removed when Fedora CoreOS publishes arm64 AMIs +# and may be removed for any reason before then as well. Do not use. 
+data "aws_ami" "fedora-coreos-arm" { + most_recent = true + owners = ["099663496933"] + + filter { + name = "architecture" + values = ["arm64"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } + + filter { + name = "name" + values = ["fedora-coreos-*"] + } +} + diff --git a/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml b/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml index 50f274d5..389da5a5 100644 --- a/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml +++ b/aws/fedora-coreos/kubernetes/workers/fcc/worker.yaml @@ -68,6 +68,9 @@ systemd: %{~ for label in split(",", node_labels) ~} --node-labels=${label} \ %{~ endfor ~} + %{~ for taint in split(",", node_taints) ~} + --register-with-taints=${taint} \ + %{~ endfor ~} --pod-manifest-path=/etc/kubernetes/manifests \ --read-only-port=0 \ --rotate-certificates \ diff --git a/aws/fedora-coreos/kubernetes/workers/variables.tf b/aws/fedora-coreos/kubernetes/workers/variables.tf index 76b33cbf..645bae38 100644 --- a/aws/fedora-coreos/kubernetes/workers/variables.tf +++ b/aws/fedora-coreos/kubernetes/workers/variables.tf @@ -108,3 +108,17 @@ variable "node_labels" { description = "List of initial node labels" default = [] } + +variable "node_taints" { + type = list(string) + description = "List of initial node taints" + default = [] +} + +# unofficial, undocumented, unsupported + +variable "arch" { + type = string + description = "Container architecture (amd64 or arm64)" + default = "amd64" +} diff --git a/aws/fedora-coreos/kubernetes/workers/workers.tf b/aws/fedora-coreos/kubernetes/workers/workers.tf index 39f9a4a4..4144f664 100644 --- a/aws/fedora-coreos/kubernetes/workers/workers.tf +++ b/aws/fedora-coreos/kubernetes/workers/workers.tf @@ -44,7 +44,7 @@ resource "aws_autoscaling_group" "workers" { # Worker template resource "aws_launch_configuration" "worker" { - image_id = data.aws_ami.fedora-coreos.image_id + image_id = var.arch == "arm64" ? 
data.aws_ami.fedora-coreos-arm.image_id : data.aws_ami.fedora-coreos.image_id instance_type = var.instance_type spot_price = var.spot_price > 0 ? var.spot_price : null enable_monitoring = false @@ -86,6 +86,7 @@ data "template_file" "worker-config" { cluster_dns_service_ip = cidrhost(var.service_cidr, 10) cluster_domain_suffix = var.cluster_domain_suffix node_labels = join(",", var.node_labels) + node_taints = join(",", var.node_taints) } } diff --git a/docs/advanced/arm64.md b/docs/advanced/arm64.md new file mode 100644 index 00000000..49962071 --- /dev/null +++ b/docs/advanced/arm64.md @@ -0,0 +1,116 @@ +# ARM64 + +!!! warning + ARM64 support is experimental + +Typhoon has experimental support for ARM64 with Fedora CoreOS on AWS. Full clusters can be created with ARM64 controller and worker nodes. Alternatively, worker pools of ARM64 nodes can be attached to an AMD64 cluster to create a hybrid/mixed architecture cluster. + +!!! note + Currently, CNI networking must be set to flannel. + +## AMIs + +In lieu of official Fedora CoreOS ARM64 AMIs, Poseidon publishes experimental ARM64 AMIs to a few regions (us-east-1, us-east-2, us-west-1). These AMIs may be **removed** at any time and will be replaced when Fedora CoreOS publishes equivalents. + +!!! note + AMIs are only published to a few regions, and AWS availability of ARM instance types varies. + +## Cluster + +Create a cluster with ARM64 controller and worker nodes. Container workloads must be `arm64` compatible and use `arm64` container images. + +```tf +module "gravitas" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + + # AWS + cluster_name = "gravitas" + dns_zone = "aws.example.com" + dns_zone_id = "Z3PAABBCFAKEC0" + + # configuration + ssh_authorized_key = "ssh-rsa AAAAB3Nz..."
+ + # optional + arch = "arm64" + networking = "flannel" + worker_count = 2 + worker_price = "0.0168" + + controller_type = "t4g.small" + worker_type = "t4g.small" +} +``` + +Verify the cluster has only arm64 (`aarch64`) nodes. + +``` +$ kubectl get nodes -o wide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +ip-10-0-12-178 Ready <none> 101s v1.19.4 10.0.12.178 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-18-93 Ready <none> 102s v1.19.4 10.0.18.93 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-90-10 Ready <none> 104s v1.19.4 10.0.90.10 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +``` + +## Hybrid + +Create a hybrid/mixed arch cluster by defining an AWS cluster. Then define a [worker pool](worker-pools.md#aws) with ARM64 workers. Optional taints are added to aid in scheduling. + +=== "Cluster (amd64)" + + ```tf + module "gravitas" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes?ref=v1.19.4" + + # AWS + cluster_name = "gravitas" + dns_zone = "aws.example.com" + dns_zone_id = "Z3PAABBCFAKEC0" + + # configuration + ssh_authorized_key = "ssh-rsa AAAAB3Nz..."
+ + # optional + networking = "flannel" + worker_count = 2 + worker_price = "0.021" + + daemonset_tolerations = ["arch"] # important: lets kube-system DaemonSets tolerate the arch taint set on the arm64 worker pool + } + ``` + +=== "Worker Pool (arm64)" + + ```tf + module "gravitas-arm64" { + source = "git::https://github.com/poseidon/typhoon//aws/fedora-coreos/kubernetes/workers?ref=v1.19.4" + + # AWS + vpc_id = module.gravitas.vpc_id + subnet_ids = module.gravitas.subnet_ids + security_groups = module.gravitas.worker_security_groups + + # configuration + name = "gravitas-arm64" + kubeconfig = module.gravitas.kubeconfig + ssh_authorized_key = var.ssh_authorized_key + + # optional + arch = "arm64" + instance_type = "t4g.small" + spot_price = "0.0168" + node_taints = ["arch=arm64:NoSchedule"] + } + ``` + +Verify amd64 (x86_64) and arm64 (aarch64) nodes are present. + +``` +$ kubectl get nodes -o wide +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +ip-10-0-14-73 Ready <none> 116s v1.19.4 10.0.14.73 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-17-167 Ready <none> 104s v1.19.4 10.0.17.167 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +ip-10-0-47-166 Ready <none> 110s v1.19.4 10.0.47.166 <none> Fedora CoreOS 32.20201104.dev.0 5.8.17-200.fc32.aarch64 docker://19.3.11 +ip-10-0-7-237 Ready <none> 111s v1.19.4 10.0.7.237 <none> Fedora CoreOS 32.20201018.3.0 5.8.15-201.fc32.x86_64 docker://19.3.11 +``` + diff --git a/docs/advanced/overview.md b/docs/advanced/overview.md index 79ad7bde..63cff796 100644 --- a/docs/advanced/overview.md +++ b/docs/advanced/overview.md @@ -2,5 +2,6 @@ Typhoon clusters offer several advanced features for skilled users.
+* [ARM64](arm64.md) * [Customization](customization.md) * [Worker Pools](worker-pools.md) diff --git a/mkdocs.yml b/mkdocs.yml index 1cca3d1f..5d599f54 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -78,6 +78,7 @@ nav: - 'FAQ': 'topics/faq.md' - 'Advanced': - 'Overview': 'advanced/overview.md' + - 'ARM64': 'advanced/arm64.md' - 'Customization': 'advanced/customization.md' - 'Worker Pools': 'advanced/worker-pools.md' - 'Addons': diff --git a/requirements.txt b/requirements.txt index c52163d8..5bbcd3be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ mkdocs==1.1.2 -mkdocs-material==6.1.0 +mkdocs-material==6.1.4 pygments==2.6.1 pymdown-extensions==7.1.0