Add retries to validate-cluster
E2e shows occasional kubectl failures here, so add some retries.  We may want
to make this more general, but I think we should try it out in a small scope
first.

Also clean up the retry loop so it doesn't process errors as successful runs
(discovered in testing).
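
To make the error-handling bug concrete (a minimal sketch, not code from the
commit): with "|| true", a failed kubectl call is swallowed, ${node} ends up
empty, and the arithmetic below counts the failure as a successful run that
found zero nodes, because echo "" still prints one line.

    # Before: the failure is masked and the counts are computed anyway.
    node=$(kubectl get nodes) || true
    found=$(($(echo "${node}" | wc -l) - 1))

    # After: retry the call, and skip this loop iteration if it still fails.
    node=$(kubectl_retry get nodes) || continue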

Also simplify a bit of go template syntax.
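
For illustration (sample output is made up): the old go template joined the
entries with commas on a single line, which then had to be split with
tr "," "\n" before counting; the new template prints one type:status pair per
line, so grep -c can count entries directly.

    # Old template output, before the tr "," "\n" step:
    Healthy:True,Healthy:True,Healthy:True,

    # New template output:
    Healthy:True
    Healthy:True
    Healthy:True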

Testing: I made kubectl randomly fail 50% of the time ($RANDOM % 2 == 0) and
iterated until this gave me more helpful results.  Still not perfect, but
better.
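
A sketch of that fault-injection shim (hypothetical; the commit does not
include it). Put it on PATH ahead of the real binary so roughly half of all
kubectl invocations fail:

    #!/usr/bin/env bash
    # Fake kubectl: fail ~50% of invocations to exercise the retry paths.
    if (( RANDOM % 2 == 0 )); then
      echo "kubectl: injected failure for testing" >&2
      exit 1
    fi
    exec kubectl.real "$@"   # kubectl.real: hypothetical name for the real binary
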
thockin committed Apr 22, 2016
1 parent 88a68e9 commit 7e0f66b
Showing 1 changed file with 38 additions and 17 deletions.
cluster/validate-cluster.sh (55 changes: 38 additions & 17 deletions)
@@ -27,12 +27,26 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
 
 if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
   source "${KUBE_ROOT}/cluster/env.sh"
 fi
 
 source "${KUBE_ROOT}/cluster/lib/util.sh"
 source "${KUBE_ROOT}/cluster/kube-util.sh"
 
+# Run kubectl and retry upon failure.
+function kubectl_retry() {
+  tries=3
+  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
+    tries=$((tries-1))
+    if [[ ${tries} -le 0 ]]; then
+      echo "('kubectl $@' failed, giving up)" >&2
+      return 1
+    fi
+    echo "(kubectl failed, will retry ${tries} times)" >&2
+    sleep 1
+  done
+}
+
 ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
 
 EXPECTED_NUM_NODES="${NUM_NODES}"
@@ -43,6 +57,12 @@ fi
 return_value=0
 attempt=0
 while true; do
+  # Pause between iterations of this large outer loop.
+  if [[ ${attempt} -gt 0 ]]; then
+    sleep 15
+  fi
+  attempt=$((attempt+1))
+
   # The "kubectl get nodes -o template" exports node information.
   #
   # Echo the output and gather 2 counts:
@@ -52,35 +72,36 @@ while true; do
   # Suppress errors from kubectl output because during cluster bootstrapping
   # for clusters where the master node is registered, the apiserver will become
   # available and then get restarted as the kubelet configures the docker bridge.
-  node=$("${KUBE_ROOT}/cluster/kubectl.sh" get nodes) || true
-  found=$(($(echo "${node}" | wc -l) - 1)) || true
-  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1)) || true
+  node=$(kubectl_retry get nodes) || continue
+  found=$(($(echo "${node}" | wc -l) - 1))
+  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1))
 
   if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
     break
-  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )) && (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
-    echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Found more nodes than expected, your cluster may not behave correctly.${color_norm}"
+  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
+    echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
+    break
+  elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
+    echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
+    break
   else
     # Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
    if (( attempt > 100 )); then
       echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
-      "${KUBE_ROOT}/cluster/kubectl.sh" get nodes
+      kubectl_retry get nodes
       if [ "$((${EXPECTED_NUM_NODES} - ${ready}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
         exit 1
       else
         return_value=2
         break
       fi
     else
       echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
     fi
-    attempt=$((attempt+1))
-    sleep 15
   fi
 done
 echo "Found ${found} node(s)."
-"${KUBE_ROOT}/cluster/kubectl.sh" get nodes
+kubectl_retry get nodes
 
 attempt=0
 while true; do
@@ -89,9 +110,9 @@ while true; do
   # Echo the output and gather 2 counts:
   #  - Total number of componentstatuses.
   #  - Number of "healthy" components.
-  cs_status=$("${KUBE_ROOT}/cluster/kubectl.sh" get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}},{{end}}{{end}}') || true
-  componentstatuses=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:') || true
-  healthy=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:True') || true
+  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
+  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
+  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true
 
   if ((componentstatuses > healthy)); then
     if ((attempt < 5)); then
@@ -100,7 +121,7 @@ while true; do
       sleep 30
     else
       echo -e " ${color_yellow}Validate output:${color_norm}"
-      "${KUBE_ROOT}/cluster/kubectl.sh" get cs
+      kubectl_retry get cs
       echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
       exit 1
     fi
@@ -110,8 +131,8 @@ while true; do
 done
 
 echo "Validate output:"
-"${KUBE_ROOT}/cluster/kubectl.sh" get cs
+kubectl_retry get cs
 if [ "${return_value}" == "0" ]; then
   echo -e "${color_green}Cluster validation succeeded${color_norm}"
 else
   echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
