From 38370fd9652d55f1ce07ccef3d84abaa7ffcd6b2 Mon Sep 17 00:00:00 2001 From: Rafael Franzke Date: Tue, 31 Jul 2018 15:52:30 +0200 Subject: [PATCH] Restart kubelet if it does not report an internal/external IP address On kubelet start-up, some functions to set the node status are generated. One of those functions propagates the node addresses into the `Node` object the kubelet is responsible for (`.status.addresses`). The kube-apiserver uses these addresses to talk to the actual node. To identify the IP address of the node, the kubelet communicates with the cloud provider. kubernetes/kubernetes#62543 introduced a timeout of 10s when trying to connect to the cloud. In case the IP cannot be determined within 10s, the `Node` object does not report an `InternalIP` address. Consequently, the kube-apiserver will never be able to talk to that node; in particular, the VPN won't work if the vpn-shoot pod is scheduled on it. Once the connection has failed, it is never retried, and only a kubelet process restart can trigger it again. Hence, our kubelet monitoring script will now restart the kubelet when it cannot find an `InternalIP` or an `ExternalIP` address on the `Node` object. 
closes #283 --- .../original/templates/scripts/_health-monitor.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/charts/shoot-cloud-config/charts/original/templates/scripts/_health-monitor.sh b/charts/shoot-cloud-config/charts/original/templates/scripts/_health-monitor.sh index 688a89d770e..3bd6aed41e2 100644 --- a/charts/shoot-cloud-config/charts/original/templates/scripts/_health-monitor.sh +++ b/charts/shoot-cloud-config/charts/original/templates/scripts/_health-monitor.sh @@ -48,8 +48,20 @@ continue fi + node_status="$(kubectl get nodes -l kubernetes.io/hostname=$(hostname) -o json | jq -r '.items[0].status')" + + # Check whether the kubelet does report an InternalIP node address + if node_ip_addresses="$(echo $node_status | jq -r '.addresses[] | select(.type=="InternalIP" or .type=="ExternalIP") | .address')"; then + if [[ -z "$node_ip_addresses" ]]; then + echo "Kubelet has not reported an InternalIP nor an ExternalIP node address yet. Restarting kubelet!"; + restart_kubelet + sleep 20 + continue + fi + fi + # Check whether kubelet ready status toggles between true and false and reboot VM if happened too often. - if status="$(kubectl get nodes -l kubernetes.io/hostname=$(hostname) -o json | jq -r '.items[0].status.conditions[] | select(.type=="Ready") | .status')"; then + if status="$(echo $node_status | jq -r '.conditions[] | select(.type=="Ready") | .status')"; then if [[ "$status" != "True" ]]; then if [[ $time_kubelet_not_ready_first_occurrence == 0 ]]; then time_kubelet_not_ready_first_occurrence=$(date +%s)