Skip to content

Commit 437f7e3

Browse files
committed
workbench: allow some network/other errors while waiting for the cluster to stop
1 parent 5928e46 commit 437f7e3

File tree

1 file changed

+95
-9
lines changed

1 file changed

+95
-9
lines changed

nix/workbench/backend/nomad.sh

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1907,22 +1907,22 @@ backend_nomad() {
19071907
while \
19081908
! test -f "${dir}"/flag/cluster-stopping \
19091909
&& \
1910-
backend_nomad is-task-program-running "${dir}" "node-${pool_ix}" "node-${pool_ix}" > /dev/null
1910+
backend_nomad is-task-program-running "${dir}" "node-${pool_ix}" "node-${pool_ix}" 5 > /dev/null
19111911
do
19121912
# Always check that a started generator has not FAILED!
19131913
if \
19141914
test -f "${dir}"/generator/started \
19151915
&& \
19161916
! test -f "${dir}"/generator/quit \
19171917
&& \
1918-
! backend_nomad is-task-program-running "${dir}" "${generator_task}" generator
1918+
! backend_nomad is-task-program-running "${dir}" "${generator_task}" generator 5
19191919
then
1920-
if backend_nomad is-task-program-failed "${dir}" "${generator_task}" generator
1920+
if backend_nomad is-task-program-failed "${dir}" "${generator_task}" generator 5
19211921
then
19221922
# If the node in "${generator_task}" quits, the generator fails with:
19231923
# tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
19241924
# Service binary 'tx-generator' returned status: 1
1925-
if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}"
1925+
if backend_nomad is-task-program-running "${dir}" "${generator_task}" "${generator_task}" 5
19261926
then
19271927
# This was not expected!
19281928
# But check it wasn't a race condition of a stopping cluster!
@@ -2457,11 +2457,13 @@ backend_nomad() {
24572457
backend_nomad task-supervisorctl "${dir}" "${task}" stop "${program}" > /dev/null
24582458
;;
24592459

2460+
# Don't use fatal with no strikes, the exit trap uses it to stop everything!
24602461
is-task-program-running )
24612462
local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
24622463
local dir=${1:?$usage}; shift
24632464
local task=${1:?$usage}; shift
24642465
local program=${1:?$usage}; shift
2466+
local strikes=${1:-""}
24652467
# NOTICE: Only returns zero when RUNNING!
24662468
#> supervisorctl status
24672469
# generator RUNNING pid 83, uptime 0:00:23
@@ -2483,24 +2485,108 @@ backend_nomad() {
24832485
# 3
24842486
#> supervisorctl status node-0 >/dev/null; echo $?
24852487
# 3
2486-
backend_nomad task-supervisorctl "$dir" "$task" status "$program" > /dev/null
2488+
local stderr_file="${dir}"/flag/is-task-program-running-"${task}"-"${program}"
2489+
:> "${stderr_file}"
2490+
if ! backend_nomad task-supervisorctl "${dir}" "${task}" status "${program}" > /dev/null 2> "${stderr_file}"
2491+
then
2492+
# Command returned "false"
2493+
if test -s "${stderr_file}"
2494+
then
2495+
# Command returned "false" with a non-empty stderr output
2496+
if test -n "${strikes}"
2497+
then
2498+
# A strike parameter was given
2499+
msg "$(yellow "Function \"is-task-program-running\" failed: $(cat ${stderr_file})")"
2500+
strikes=$(( strikes - 1 ))
2501+
msg "$(yellow "Strikes for \"is-task-program-running\" left: ${strikes}")"
2502+
if test "${strikes}" -gt 0
2503+
then
2504+
# Strikes still available, sleep/retry!
2505+
if test "${strikes}" = 1
2506+
then
2507+
# Before the last retry, wait five minutes!
2508+
sleep 300 # 5 minutes!
2509+
else
2510+
sleep 60 # 1 minute!
2511+
fi
2512+
# Retry with one less strike available
2513+
backend_nomad is-task-program-running "${dir}" "${task}" "${program}" "${strikes}"
2514+
else
2515+
# Fails everything only if using strikes!
2516+
fatal "Function \"is-task-program-running\" failed: $(cat ${stderr_file})"
2517+
fi
2518+
else
2519+
# No strike parameter was given, don't use "fatal"!
2520+
msg "$(red "Function \"is-task-program-running\" failed: $(cat ${stderr_file})")"
2521+
false
2522+
fi
2523+
else
2524+
# Command returned "false" with an empty stderr output
2525+
false # Program is not running!
2526+
fi
2527+
else
2528+
# Command returned "true"
2529+
if test -s "${stderr_file}"
2530+
then
2531+
# Don't suppress possible error messages!
2532+
msg "$(yellow "WARNING: \"is-task-program-running\" is returning a non-empty stderr: $(cat "${stderr_file}")")"
2533+
fi
2534+
true # Program is running!
2535+
fi
24872536
;;
24882537

2538+
# Don't use fatal with no strikes, the exit trap uses it to stop everything!
24892539
is-task-program-failed )
24902540
local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
24912541
local dir=${1:?$usage}; shift
24922542
local task=${1:?$usage}; shift
24932543
local program=${1:?$usage}; shift
2544+
local strikes=${1:-""}
24942545
# As we are not using any "autorestart" supervisord programs are run as:
24952546
# command=sh -c "./start.sh; echo "$?" > ./exit_code"
24962547
# because we can't obtain the exit codes using `supervisorctl`
2548+
local stderr_file="${dir}"/flag/is-task-program-failed-"${task}"-"${program}"
2549+
:> "${stderr_file}"
24972550
local exit_code
2498-
if exit_code=$(backend_nomad task-file-contents "${dir}" "${task}" \
2499-
/local/run/current/"${program}"/exit_code 2>/dev/null)
2551+
if ! exit_code=$(backend_nomad task-file-contents "${dir}" "${task}" \
2552+
/local/run/current/"${program}"/exit_code 2> "${stderr_file}")
25002553
then
2501-
test "${exit_code}" != "0"
2554+
# Command returned "false"
2555+
if test -n "${strikes}"
2556+
then
2557+
# A strike parameter was given
2558+
msg "$(yellow "Function \"is-task-program-failed\" failed: $(cat ${stderr_file})")"
2559+
strikes=$(( strikes - 1 ))
2560+
msg "$(yellow "Strikes for \"is-task-program-failed\" left: ${strikes}")"
2561+
if test "${strikes}" -gt 0
2562+
then
2563+
# Strikes still available, sleep/retry!
2564+
if test "${strikes}" = 1
2565+
then
2566+
# Before the last retry, wait five minutes!
2567+
sleep 300 # 5 minutes!
2568+
else
2569+
sleep 60 # 1 minute!
2570+
fi
2571+
# Retry with one less strike available
2572+
# "${strikes}" was already decremented above; pass it through unchanged so each
# failed attempt costs exactly one strike (same as "is-task-program-running").
backend_nomad is-task-program-failed "${dir}" "${task}" "${program}" "${strikes}"
2573+
else
2574+
# Fails everything only if using strikes!
2575+
fatal "Function \"is-task-program-failed\" failed"
2576+
fi
2577+
else
2578+
# No strike parameter was given, don't use "fatal"!
2579+
msg "$(red "Function \"is-task-program-failed\" failed: $(cat ${stderr_file})")"
2580+
true # Assuming program failed due to Nomad command error!
2581+
fi
25022582
else
2503-
return 0
2583+
# Command returned "true"
2584+
if test -s "${stderr_file}"
2585+
then
2586+
# Don't suppress possible error messages!
2587+
msg "$(yellow "WARNING: \"is-task-program-failed\" is returning a non-empty stderr: $(cat ${stderr_file})")"
2588+
fi
2589+
test "${exit_code}" != "0"
25042590
fi
25052591
;;
25062592

0 commit comments

Comments
 (0)