From bbbaf949f98a33fdd72f72ee924ab4ae5a6b12d3 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Tue, 31 Mar 2020 20:28:27 -0700 Subject: [PATCH] Fix UDP outbound and clock sync timeouts on Azure workers * Add "lb" outbound rule for worker TCP _and_ UDP traffic * Fix Azure worker nodes clock synchronization being inactive due to timeouts reaching the CoreOS / Flatcar NTP pool * Fix Azure worker nodes not providing outbound UDP connectivity Background: Azure provides VMs outbound connectivity either by having a public IP or via an SNAT masquerade feature bundled with their virtual load balancing abstraction (in contrast with, say, a NAT gateway). Azure worker nodes have only a private IP, but are associated with the cluster load balancer's backend pool and ingress frontend IP. Outbound traffic uses SNAT with this frontend IP. A subtle detail with Azure SNAT seems to be that since both inbound lb_rule's are TCP only, outbound UDP traffic isn't SNAT'd (highlights the reasons Azure shouldn't have conflated inbound load balancing with outbound SNAT concepts). However, by adding a separate outbound rule and disabling outbound SNAT on our ingress lb_rule's, we can tell Azure to continue load balancing as before, and support outbound SNAT for worker traffic over both the TCP and UDP protocols. Fixes clock synchronization timeouts: ``` systemd-timesyncd[786]: Timed out waiting for reply from 45.79.36.123:123 (3.flatcar.pool.ntp.org) ``` Azure controller nodes have their own public IP, so controller (and etcd) nodes have not had clock synchronization or outbound UDP issues --- CHANGES.md | 2 ++ azure/container-linux/kubernetes/lb.tf | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 0658ba99..255f91b9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,6 +12,8 @@ Notable changes between versions. 
* Rename Container Linux `clc_snippets` (bare-metal) to `snippets` for consistency * Fix bootstrap when `networking` mode `flannel` (non-default) is chosen ([#689](https://github.com/poseidon/typhoon/pull/689)) * Regressed in v1.18.0 changes for Calico ([#675](https://github.com/poseidon/typhoon/pull/675)) +* Fix Azure worker UDP outbound connections ([#691](https://github.com/poseidon/typhoon/pull/691)) + * Fix Azure worker clock sync timeouts ## v1.18.0 diff --git a/azure/container-linux/kubernetes/lb.tf b/azure/container-linux/kubernetes/lb.tf index 6de81b22..ef924759 100644 --- a/azure/container-linux/kubernetes/lb.tf +++ b/azure/container-linux/kubernetes/lb.tf @@ -72,6 +72,7 @@ resource "azurerm_lb_rule" "ingress-http" { name = "ingress-http" loadbalancer_id = azurerm_lb.cluster.id frontend_ip_configuration_name = "ingress" + disable_outbound_snat = true protocol = "Tcp" frontend_port = 80 @@ -86,6 +87,7 @@ resource "azurerm_lb_rule" "ingress-https" { name = "ingress-https" loadbalancer_id = azurerm_lb.cluster.id frontend_ip_configuration_name = "ingress" + disable_outbound_snat = true protocol = "Tcp" frontend_port = 443 @@ -94,6 +96,20 @@ resource "azurerm_lb_rule" "ingress-https" { probe_id = azurerm_lb_probe.ingress.id } +# Worker outbound TCP/UDP SNAT +resource "azurerm_lb_outbound_rule" "worker-outbound" { + resource_group_name = azurerm_resource_group.cluster.name + + name = "worker" + loadbalancer_id = azurerm_lb.cluster.id + frontend_ip_configuration { + name = "ingress" + } + + protocol = "All" + backend_address_pool_id = azurerm_lb_backend_address_pool.worker.id +} + # Address pool of controllers resource "azurerm_lb_backend_address_pool" "controller" { resource_group_name = azurerm_resource_group.cluster.name