From 1786e34f33779d93f96b0a4345a7b460e023c892 Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Sat, 10 Sep 2022 14:34:12 -0700 Subject: [PATCH] Revert Graceful Node Shutdown feature * Disable Kubelet Graceful Node Shutdown on worker nodes (enabled in Kubernetes v1.25.0 https://github.com/poseidon/typhoon/pull/1222) * Graceful node shutdown allows 30s for critical pods to shut down and 15s for regular pods to shut down before releasing the inhibitor lock to allow the host to shut down. * Unfortunately, both pods and the node are shut down at the same time at the end of the 45s period without further configuration options. As a result, regular pods and the node are shut down at the same time. In practice, enabling this feature leaves Error or Completed pods in kube-apiserver state until manually cleaned up. This feature is not ready for general use. * Fix issue where Error/Completed pods are accumulating whenever any node restarts (or auto-updates), visible in kubectl get pods * This issue wasn't apparent in initial testing and seems to only affect non-critical pods (due to critical pods being killed earlier), but it's very apparent on our real clusters. Rel: https://github.com/kubernetes/kubernetes/issues/110755 --- CHANGES.md | 3 +++ aws/fedora-coreos/kubernetes/workers/butane/worker.yaml | 2 -- aws/flatcar-linux/kubernetes/workers/butane/worker.yaml | 2 -- azure/fedora-coreos/kubernetes/workers/butane/worker.yaml | 2 -- azure/flatcar-linux/kubernetes/workers/butane/worker.yaml | 2 -- bare-metal/fedora-coreos/kubernetes/butane/worker.yaml | 2 -- bare-metal/flatcar-linux/kubernetes/butane/worker.yaml | 2 -- digital-ocean/fedora-coreos/kubernetes/butane/worker.yaml | 2 -- digital-ocean/flatcar-linux/kubernetes/butane/worker.yaml | 2 -- .../fedora-coreos/kubernetes/workers/butane/worker.yaml | 2 -- .../flatcar-linux/kubernetes/workers/butane/worker.yaml | 2 -- 11 files changed, 3 insertions(+), 20 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 
08916c6f..40e26973 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,9 @@ Notable changes between versions. ## Latest +* Revert Kubelet Graceful Node Shutdown on worker nodes ([#1227](https://github.com/poseidon/typhoon/pull/1227)) + * Fix issue where non-critical pods are left in Error/Completed state on node shutdown + ### Addons * Update kube-state-metrics from v2.5.0 to [v2.6.0](https://github.com/kubernetes/kube-state-metrics/releases/tag/v2.6.0) diff --git a/aws/fedora-coreos/kubernetes/workers/butane/worker.yaml b/aws/fedora-coreos/kubernetes/workers/butane/worker.yaml index 72503200..eaed26e0 100644 --- a/aws/fedora-coreos/kubernetes/workers/butane/worker.yaml +++ b/aws/fedora-coreos/kubernetes/workers/butane/worker.yaml @@ -122,8 +122,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/aws/flatcar-linux/kubernetes/workers/butane/worker.yaml b/aws/flatcar-linux/kubernetes/workers/butane/worker.yaml index be3141c1..b02418e2 100644 --- a/aws/flatcar-linux/kubernetes/workers/butane/worker.yaml +++ b/aws/flatcar-linux/kubernetes/workers/butane/worker.yaml @@ -121,8 +121,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/azure/fedora-coreos/kubernetes/workers/butane/worker.yaml b/azure/fedora-coreos/kubernetes/workers/butane/worker.yaml index 0f0bdc4a..5c4f3de0 100644 --- a/azure/fedora-coreos/kubernetes/workers/butane/worker.yaml +++ b/azure/fedora-coreos/kubernetes/workers/butane/worker.yaml @@ -117,8 +117,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false 
rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/azure/flatcar-linux/kubernetes/workers/butane/worker.yaml b/azure/flatcar-linux/kubernetes/workers/butane/worker.yaml index d2f77d73..08486321 100644 --- a/azure/flatcar-linux/kubernetes/workers/butane/worker.yaml +++ b/azure/flatcar-linux/kubernetes/workers/butane/worker.yaml @@ -117,8 +117,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/bare-metal/fedora-coreos/kubernetes/butane/worker.yaml b/bare-metal/fedora-coreos/kubernetes/butane/worker.yaml index 095f5730..952248a8 100644 --- a/bare-metal/fedora-coreos/kubernetes/butane/worker.yaml +++ b/bare-metal/fedora-coreos/kubernetes/butane/worker.yaml @@ -128,8 +128,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/bare-metal/flatcar-linux/kubernetes/butane/worker.yaml b/bare-metal/flatcar-linux/kubernetes/butane/worker.yaml index de2ef194..4ee1aa91 100644 --- a/bare-metal/flatcar-linux/kubernetes/butane/worker.yaml +++ b/bare-metal/flatcar-linux/kubernetes/butane/worker.yaml @@ -118,8 +118,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/digital-ocean/fedora-coreos/kubernetes/butane/worker.yaml 
b/digital-ocean/fedora-coreos/kubernetes/butane/worker.yaml index 4b822b85..4ea5b525 100644 --- a/digital-ocean/fedora-coreos/kubernetes/butane/worker.yaml +++ b/digital-ocean/fedora-coreos/kubernetes/butane/worker.yaml @@ -122,8 +122,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/digital-ocean/flatcar-linux/kubernetes/butane/worker.yaml b/digital-ocean/flatcar-linux/kubernetes/butane/worker.yaml index e86daadb..aaf55634 100644 --- a/digital-ocean/flatcar-linux/kubernetes/butane/worker.yaml +++ b/digital-ocean/flatcar-linux/kubernetes/butane/worker.yaml @@ -121,8 +121,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/google-cloud/fedora-coreos/kubernetes/workers/butane/worker.yaml b/google-cloud/fedora-coreos/kubernetes/workers/butane/worker.yaml index 19b29470..d6707696 100644 --- a/google-cloud/fedora-coreos/kubernetes/workers/butane/worker.yaml +++ b/google-cloud/fedora-coreos/kubernetes/workers/butane/worker.yaml @@ -116,8 +116,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf diff --git a/google-cloud/flatcar-linux/kubernetes/workers/butane/worker.yaml b/google-cloud/flatcar-linux/kubernetes/workers/butane/worker.yaml index ac46de25..6685f3d7 100644 --- a/google-cloud/flatcar-linux/kubernetes/workers/butane/worker.yaml +++ 
b/google-cloud/flatcar-linux/kubernetes/workers/butane/worker.yaml @@ -116,8 +116,6 @@ storage: featureGates: LocalStorageCapacityIsolationFSQuotaMonitoring: false rotateCertificates: true - shutdownGracePeriod: 45s - shutdownGracePeriodCriticalPods: 30s staticPodPath: /etc/kubernetes/manifests readOnlyPort: 0 resolvConf: /run/systemd/resolve/resolv.conf