Add retries to validate-cluster
E2e shows occasional kubectl failures here, so add some retries.  We may want
to make this more general, but I think we should try it out in a small scope
first.

Also clean up the retry loop so it doesn't process errors as successful runs
(discovered in testing).
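
To make the error-handling bug concrete (a minimal sketch, not code from the
commit): with "|| true", a failed kubectl call is swallowed, ${node} ends up
empty, and the arithmetic below counts the failure as a successful run that
found zero nodes, because echo "" still prints one line.

    # Before: the failure is masked and the counts are computed anyway.
    node=$(kubectl get nodes) || true
    found=$(($(echo "${node}" | wc -l) - 1))

    # After: retry the call, and skip this loop iteration if it still fails.
    node=$(kubectl_retry get nodes) || continue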

Also simplify a bit of go template syntax.
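
For illustration (sample output is made up): the old go template joined the
entries with commas on a single line, which then had to be split with
tr "," "\n" before counting; the new template prints one type:status pair per
line, so grep -c can count entries directly.

    # Old template output, before the tr "," "\n" step:
    Healthy:True,Healthy:True,Healthy:True,

    # New template output:
    Healthy:True
    Healthy:True
    Healthy:True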

Testing: I made kubectl randomly fail 50% of the time ($RANDOM % 2 == 0) and
iterated until this gave me more helpful results.  Still not perfect, but
better.
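
A sketch of that fault-injection shim (hypothetical; the commit does not
include it). Put it on PATH ahead of the real binary so roughly half of all
kubectl invocations fail:

    #!/usr/bin/env bash
    # Fake kubectl: fail ~50% of invocations to exercise the retry paths.
    if (( RANDOM % 2 == 0 )); then
      echo "kubectl: injected failure for testing" >&2
      exit 1
    fi
    exec kubectl.real "$@"   # kubectl.real: hypothetical name for the real binary
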
thockin committed Apr 22, 2016
1 parent 88a68e9 commit 7e0f66b
Showing 1 changed file with 38 additions and 17 deletions.
cluster/validate-cluster.sh (55 changes: 38 additions & 17 deletions)
@@ -27,12 +27,26 @@ set -o pipefail
 KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
 
 if [ -f "${KUBE_ROOT}/cluster/env.sh" ]; then
   source "${KUBE_ROOT}/cluster/env.sh"
 fi
 
 source "${KUBE_ROOT}/cluster/lib/util.sh"
 source "${KUBE_ROOT}/cluster/kube-util.sh"
 
+# Run kubectl and retry upon failure.
+function kubectl_retry() {
+  tries=3
+  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
+    tries=$((tries-1))
+    if [[ ${tries} -le 0 ]]; then
+      echo "('kubectl $@' failed, giving up)" >&2
+      return 1
+    fi
+    echo "(kubectl failed, will retry ${tries} times)" >&2
+    sleep 1
+  done
+}
+
 ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
 
 EXPECTED_NUM_NODES="${NUM_NODES}"
@@ -43,6 +57,12 @@ fi
 return_value=0
 attempt=0
 while true; do
+  # Pause between iterations of this large outer loop.
+  if [[ ${attempt} -gt 0 ]]; then
+    sleep 15
+  fi
+  attempt=$((attempt+1))
+
   # The "kubectl get nodes -o template" exports node information.
   #
   # Echo the output and gather 2 counts:
@@ -52,35 +72,36 @@ while true; do
   # Suppress errors from kubectl output because during cluster bootstrapping
   # for clusters where the master node is registered, the apiserver will become
   # available and then get restarted as the kubelet configures the docker bridge.
-  node=$("${KUBE_ROOT}/cluster/kubectl.sh" get nodes) || true
-  found=$(($(echo "${node}" | wc -l) - 1)) || true
-  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1)) || true
+  node=$(kubectl_retry get nodes) || continue
+  found=$(($(echo "${node}" | wc -l) - 1))
+  ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1))
 
   if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
     break
-  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )) && (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
-    echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Found more nodes than expected, your cluster may not behave correctly.${color_norm}"
+  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
+    echo -e "${color_red}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
+    break
+  elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
+    echo -e "${color_red}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm}"
+    break
   else
     # Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
    if (( attempt > 100 )); then
       echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
-      "${KUBE_ROOT}/cluster/kubectl.sh" get nodes
+      kubectl_retry get nodes
       if [ "$((${EXPECTED_NUM_NODES} - ${ready}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
         exit 1
       else
         return_value=2
         break
       fi
     else
       echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
     fi
-    attempt=$((attempt+1))
-    sleep 15
   fi
 done
 echo "Found ${found} node(s)."
-"${KUBE_ROOT}/cluster/kubectl.sh" get nodes
+kubectl_retry get nodes
 
 attempt=0
 while true; do
@@ -89,9 +110,9 @@ while true; do
   # Echo the output and gather 2 counts:
   #  - Total number of componentstatuses.
   #  - Number of "healthy" components.
-  cs_status=$("${KUBE_ROOT}/cluster/kubectl.sh" get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}},{{end}}{{end}}') || true
-  componentstatuses=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:') || true
-  healthy=$(echo "${cs_status}" | tr "," "\n" | grep -c 'Healthy:True') || true
+  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
+  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
+  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true
 
   if ((componentstatuses > healthy)); then
     if ((attempt < 5)); then
@@ -100,7 +121,7 @@ while true; do
       sleep 30
     else
       echo -e " ${color_yellow}Validate output:${color_norm}"
-      "${KUBE_ROOT}/cluster/kubectl.sh" get cs
+      kubectl_retry get cs
       echo -e "${color_red}Validation returned one or more failed components. Cluster is probably broken.${color_norm}"
       exit 1
     fi
@@ -110,8 +131,8 @@ while true; do
 done
 
 echo "Validate output:"
-"${KUBE_ROOT}/cluster/kubectl.sh" get cs
+kubectl_retry get cs
 if [ "${return_value}" == "0" ]; then
   echo -e "${color_green}Cluster validation succeeded${color_norm}"
 else
   echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
