From 567e18f015bf7d8e1f111084a04e2d531a2d4a7d Mon Sep 17 00:00:00 2001 From: Dalton Hubble Date: Wed, 25 Apr 2018 21:33:26 -0700 Subject: [PATCH] Fix conflict between Calico and NetworkManager * Observed frequent kube-scheduler and controller-manager restarts with Calico as the CNI provider. Root cause was unclear since control plane was functional and tests of pod to pod network connectivity passed. * Root cause: Calico sets up cali* and tunl* network interfaces for containers on hosts. NetworkManager tries to manage these interfaces. It periodically disconnected veth pairs. Logs did not surface this issue since it's not an error per se, just Calico and NetworkManager dueling for control. Kubernetes correctly restarted pods failing health checks and ensured 2 replicas were running so the control plane functioned mostly normally. Pod to pod connectivity was only affected occasionally. Pain to debug. * Solution: Configure NetworkManager to ignore the Calico ifaces per Calico's recommendation. Cloud-init writes files after NetworkManager starts, so a restart is required on first boot. 
On subsequent boots, the file is present so no restart is needed --- .../kubernetes/cloudinit/controller.yaml.tmpl | 7 +++++++ .../kubernetes/workers/cloudinit/worker.yaml.tmpl | 7 +++++++ .../kubernetes/cloudinit/controller.yaml.tmpl | 7 +++++++ .../fedora-atomic/kubernetes/cloudinit/worker.yaml.tmpl | 7 +++++++ .../kubernetes/cloudinit/controller.yaml.tmpl | 7 +++++++ .../kubernetes/workers/cloudinit/worker.yaml.tmpl | 7 +++++++ 6 files changed, 42 insertions(+) diff --git a/aws/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl b/aws/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl index adf95c34..9179b807 100644 --- a/aws/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl +++ b/aws/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl @@ -71,6 +71,12 @@ write_files: content: | ${kubeconfig} - path: /var/lib/bootkube/.keep + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: /etc/selinux/config owner: root:root permissions: '0644' @@ -84,6 +90,7 @@ bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, daemon-reload] + - [systemctl, restart, NetworkManager] - "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.3" - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" - "atomic install --system --name=bootkube quay.io/poseidon/bootkube:v0.12.0" diff --git a/aws/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl b/aws/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl index a104b755..733c71bd 100644 --- a/aws/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl +++ b/aws/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl @@ -47,6 +47,12 @@ write_files: permissions: '0644' content: | ${kubeconfig} + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: 
/etc/selinux/config owner: root:root permissions: '0644' @@ -60,6 +66,7 @@ bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, daemon-reload] + - [systemctl, restart, NetworkManager] - [systemctl, enable, cloud-metadata.service] - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" - [systemctl, start, --no-block, kubelet.service] diff --git a/bare-metal/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl b/bare-metal/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl index 54fd196b..99845c9d 100644 --- a/bare-metal/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl +++ b/bare-metal/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl @@ -61,6 +61,12 @@ write_files: [Install] WantedBy=multi-user.target - path: /var/lib/bootkube/.keep + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: /etc/selinux/config owner: root:root permissions: '0644' @@ -74,6 +80,7 @@ bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, daemon-reload] + - [systemctl, restart, NetworkManager] - [hostnamectl, set-hostname, ${domain_name}] - "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.3" - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" diff --git a/bare-metal/fedora-atomic/kubernetes/cloudinit/worker.yaml.tmpl b/bare-metal/fedora-atomic/kubernetes/cloudinit/worker.yaml.tmpl index 62669bca..e9b0ffa8 100644 --- a/bare-metal/fedora-atomic/kubernetes/cloudinit/worker.yaml.tmpl +++ b/bare-metal/fedora-atomic/kubernetes/cloudinit/worker.yaml.tmpl @@ -37,6 +37,12 @@ write_files: PathExists=/etc/kubernetes/kubeconfig [Install] WantedBy=multi-user.target + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: /etc/selinux/config owner: root:root permissions: '0644' @@ -50,6 +56,7 @@ 
bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, daemon-reload] + - [systemctl, restart, NetworkManager] - [hostnamectl, set-hostname, ${domain_name}] - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" - [systemctl, enable, kubelet.path] diff --git a/google-cloud/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl b/google-cloud/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl index 21135269..87f5a59d 100644 --- a/google-cloud/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl +++ b/google-cloud/fedora-atomic/kubernetes/cloudinit/controller.yaml.tmpl @@ -72,6 +72,12 @@ write_files: content: | ${kubeconfig} - path: /var/lib/bootkube/.keep + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: /etc/selinux/config owner: root:root permissions: '0644' @@ -85,6 +91,7 @@ bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, daemon-reload] + - [systemctl, restart, NetworkManager] - "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.3" - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" - "atomic install --system --name=bootkube quay.io/poseidon/bootkube:v0.12.0" diff --git a/google-cloud/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl b/google-cloud/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl index 9ff26f49..fbc7bb89 100644 --- a/google-cloud/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl +++ b/google-cloud/fedora-atomic/kubernetes/workers/cloudinit/worker.yaml.tmpl @@ -48,6 +48,12 @@ write_files: permissions: '0644' content: | ${kubeconfig} + - path: /etc/NetworkManager/conf.d/typhoon.conf + content: | + [main] + plugins=keyfile + [keyfile] + unmanaged-devices=interface-name:cali*;interface-name:tunl* - path: /etc/selinux/config owner: root:root permissions: '0644' @@ -61,6 +67,7 @@ bootcmd: - [modprobe, ip_vs] runcmd: - [systemctl, 
daemon-reload] + - [systemctl, restart, NetworkManager] - [systemctl, enable, cloud-metadata.service] - "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1" - [systemctl, start, --no-block, kubelet.service]