mirror of
https://github.com/puppetmaster/typhoon.git
synced 2025-01-25 01:48:30 +01:00
567e18f015
* Observed frequent kube-scheduler and controller-manager restarts with Calico as the CNI provider. The root cause was unclear since the control plane was functional and tests of pod-to-pod network connectivity passed. * Root cause: Calico sets up cali* and tunl* network interfaces for containers on hosts. NetworkManager tries to manage these interfaces and periodically disconnected veth pairs. Logs did not surface this issue since it's not an error per se, just Calico and NetworkManager dueling for control. Kubernetes correctly restarted pods failing health checks and ensured 2 replicas were running, so the control plane functioned mostly normally. Pod-to-pod connectivity was only affected occasionally. Pain to debug. * Solution: Configure NetworkManager to ignore the Calico interfaces per Calico's recommendation. Cloud-init writes files after NetworkManager starts, so a restart is required on first boot. On subsequent boots, the file is already present, so no restart is needed.
108 lines
4.2 KiB
Cheetah
108 lines
4.2 KiB
Cheetah
#cloud-config
# Files that cloud-init writes to the host. Values written as dollar-brace
# placeholders are template variables (presumably substituted by the renderer
# before cloud-init reads this document; confirm against the caller).
write_files:
# etcd environment configuration.
# Client (2379) and peer (2380) listeners use HTTPS with certificate-based
# client authentication enabled; the metrics listener (2381) is plain HTTP.
# The member name, advertised domain and initial cluster are template inputs.
- path: /etc/etcd/etcd.conf
  content: |
    ETCD_NAME=${etcd_name}
    ETCD_DATA_DIR=/var/lib/etcd
    ETCD_ADVERTISE_CLIENT_URLS=https://${etcd_domain}:2379
    ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380
    ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379
    ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380
    ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381
    ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}
    ETCD_STRICT_RECONFIG_CHECK=true
    ETCD_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/server-ca.crt
    ETCD_CERT_FILE=/etc/ssl/certs/etcd/server.crt
    ETCD_KEY_FILE=/etc/ssl/certs/etcd/server.key
    ETCD_CLIENT_CERT_AUTH=true
    ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/certs/etcd/peer-ca.crt
    ETCD_PEER_CERT_FILE=/etc/ssl/certs/etcd/peer.crt
    ETCD_PEER_KEY_FILE=/etc/ssl/certs/etcd/peer.key
    ETCD_PEER_CLIENT_CERT_AUTH=true
# Oneshot systemd unit that creates /run/metadata, queries the instance
# metadata endpoint (169.254.169.254) for the local IPv4 address with up to
# 10 curl retries, and writes it as HOSTNAME_OVERRIDE=<address> to the path
# held in the OUTPUT environment variable (/run/metadata/cloud).
# The trailing backslashes continue the second ExecStart line.
# NOTE(review): the doubled dollar sign before the OUTPUT braces looks like a
# template escape that renders to a single dollar sign - confirm against the
# renderer before changing it.
- path: /etc/systemd/system/cloud-metadata.service
  content: |
    [Unit]
    Description=Cloud metadata agent
    [Service]
    Type=oneshot
    Environment=OUTPUT=/run/metadata/cloud
    ExecStart=/usr/bin/mkdir -p /run/metadata
    ExecStart=/usr/bin/bash -c 'echo "HOSTNAME_OVERRIDE=$(curl\
      --url http://169.254.169.254/latest/meta-data/local-ipv4\
      --retry 10)" > $${OUTPUT}'
    [Install]
    WantedBy=multi-user.target
# Drop-in for kubelet.service.
# [Unit]: kubelet requires and is ordered after cloud-metadata.service, and
# wants rpc-statd.service.
# [Service]: before each start, create the directories the kubelet expects and
# extract the base64-encoded certificate-authority-data field from the
# kubeconfig into /etc/kubernetes/ca.crt; restart the kubelet 10 seconds after
# any exit.
- path: /etc/systemd/system/kubelet.service.d/10-typhoon.conf
  content: |
    [Unit]
    Requires=cloud-metadata.service
    After=cloud-metadata.service
    Wants=rpc-statd.service
    [Service]
    ExecStartPre=/bin/mkdir -p /opt/cni/bin
    ExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests
    ExecStartPre=/bin/mkdir -p /etc/kubernetes/cni/net.d
    ExecStartPre=/bin/mkdir -p /etc/kubernetes/checkpoint-secrets
    ExecStartPre=/bin/mkdir -p /etc/kubernetes/inactive-manifests
    ExecStartPre=/bin/mkdir -p /var/lib/cni
    ExecStartPre=/bin/mkdir -p /var/lib/kubelet/volumeplugins
    ExecStartPre=/usr/bin/bash -c "grep 'certificate-authority-data' /etc/kubernetes/kubeconfig | awk '{print $2}' | base64 -d > /etc/kubernetes/ca.crt"
    Restart=always
    RestartSec=10
# Kubelet command-line flags collected into a single ARGS variable
# (presumably consumed as an environment file by the kubelet unit - confirm).
# The flags disable anonymous auth, point at the CA extracted to
# /etc/kubernetes/ca.crt, enable the CNI network plugin, label the node as
# master/controller and register it with a master NoSchedule taint.
# The cluster DNS service IP and cluster domain suffix are template inputs.
- path: /etc/kubernetes/kubelet.conf
  content: |
    ARGS="--allow-privileged \
      --anonymous-auth=false \
      --client-ca-file=/etc/kubernetes/ca.crt \
      --cluster_dns=${k8s_dns_service_ip} \
      --cluster_domain=${cluster_domain_suffix} \
      --cni-conf-dir=/etc/kubernetes/cni/net.d \
      --exit-on-lock-contention \
      --kubeconfig=/etc/kubernetes/kubeconfig \
      --lock-file=/var/run/lock/kubelet.lock \
      --network-plugin=cni \
      --node-labels=node-role.kubernetes.io/master \
      --node-labels=node-role.kubernetes.io/controller="true" \
      --pod-manifest-path=/etc/kubernetes/manifests \
      --register-with-taints=node-role.kubernetes.io/master=:NoSchedule \
      --volume-plugin-dir=/var/lib/kubelet/volumeplugins"
# Kubeconfig used by the kubelet; its body is supplied by a template input.
# NOTE(review): the placeholder sits inside a literal block scalar, so a
# multi-line value must arrive already indented to this column or the rendered
# document will not parse - verify where the value is produced.
- path: /etc/kubernetes/kubeconfig
  permissions: '0644'
  content: |
    ${kubeconfig}
# Entry with no content key: only the path is given, so this yields an empty
# placeholder file under /var/lib/bootkube.
- path: /var/lib/bootkube/.keep
# Tell NetworkManager to leave Calico's cali* and tunl* interfaces unmanaged.
# Without this, NetworkManager and Calico compete for those interfaces and
# veth pairs are periodically disconnected, causing intermittent control-plane
# pod restarts.
- path: /etc/NetworkManager/conf.d/typhoon.conf
  content: |
    [main]
    plugins=keyfile
    [keyfile]
    unmanaged-devices=interface-name:cali*;interface-name:tunl*
# Persistent SELinux configuration: permissive mode with the targeted policy.
# Owned by root:root; the mode is quoted so it stays the string '0644'.
- path: /etc/selinux/config
  owner: root:root
  permissions: '0644'
  content: |
    SELINUX=permissive
    SELINUXTYPE=targeted
# Commands listed under cloud-init's bootcmd key: switch SELinux to permissive
# for the running system, stop and disable firewalld, and load the ip_vs
# kernel module.
bootcmd:
- [setenforce, Permissive]
- [systemctl, disable, firewalld, --now]
# https://github.com/kubernetes/kubernetes/issues/60869
- [modprobe, ip_vs]
# Commands listed under cloud-init's runcmd key, in order:
#   1. reload systemd so the unit files written above are picked up;
#   2. restart NetworkManager - cloud-init writes its conf.d file after
#      NetworkManager has already started, so the unmanaged-devices setting
#      only takes effect after a restart on first boot;
#   3. install the etcd, kubelet and bootkube system containers with atomic;
#   4. start etcd and the kubelet without blocking, and enable
#      cloud-metadata.service.
runcmd:
- [systemctl, daemon-reload]
- [systemctl, restart, NetworkManager]
- "atomic install --system --name=etcd quay.io/poseidon/etcd:v3.3.3"
- "atomic install --system --name=kubelet quay.io/poseidon/kubelet:v1.10.1"
- "atomic install --system --name=bootkube quay.io/poseidon/bootkube:v0.12.0"
- [systemctl, start, --no-block, etcd.service]
- [systemctl, enable, cloud-metadata.service]
- [systemctl, start, --no-block, kubelet.service]
# User accounts: keep the image's default user and add a "fedora" account with
# passwordless sudo, membership in the wheel, adm, systemd-journal and docker
# groups, and a single SSH public key supplied as a template input.
users:
- default
- name: fedora
  gecos: Fedora Admin
  sudo: ALL=(ALL) NOPASSWD:ALL
  groups: wheel,adm,systemd-journal,docker
  ssh-authorized-keys:
  - "${ssh_authorized_key}"