@@ -1907,22 +1907,22 @@ backend_nomad() {
1907
1907
while \
1908
1908
! test -f " ${dir} " /flag/cluster-stopping \
1909
1909
&& \
1910
- backend_nomad is-task-program-running " ${dir} " " node-${pool_ix} " " node-${pool_ix} " > /dev/null
1910
+ backend_nomad is-task-program-running " ${dir} " " node-${pool_ix} " " node-${pool_ix} " 5 > /dev/null
1911
1911
do
1912
1912
# Always check that a started generator has not FAILED!
1913
1913
if \
1914
1914
test -f " ${dir} " /generator/started \
1915
1915
&& \
1916
1916
! test -f " ${dir} " /generator/quit \
1917
1917
&& \
1918
- ! backend_nomad is-task-program-running " ${dir} " " ${generator_task} " generator
1918
+ ! backend_nomad is-task-program-running " ${dir} " " ${generator_task} " generator 5
1919
1919
then
1920
- if backend_nomad is-task-program-failed " ${dir} " " ${generator_task} " generator
1920
+ if backend_nomad is-task-program-failed " ${dir} " " ${generator_task} " generator 5
1921
1921
then
1922
1922
# If the node in "${generator_task}" quits, the generator fails with:
1923
1923
# tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
1924
1924
# Service binary 'tx-generator' returned status: 1
1925
- if backend_nomad is-task-program-running " ${dir} " " ${generator_task} " " ${generator_task} "
1925
+ if backend_nomad is-task-program-running " ${dir} " " ${generator_task} " " ${generator_task} " 5
1926
1926
then
1927
1927
# This was not expected!
1928
1928
# But check it wasn't a race condition of a stopping cluster!
@@ -2457,11 +2457,13 @@ backend_nomad() {
2457
2457
backend_nomad task-supervisorctl " ${dir} " " ${task} " stop " ${program} " > /dev/null
2458
2458
;;
2459
2459
2460
+ # Don't use fatal with no strikes, the exit trap uses it to stop everything!
2460
2461
is-task-program-running )
2461
2462
local usage=" USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
2462
2463
local dir=${1:? $usage } ; shift
2463
2464
local task=${1:? $usage } ; shift
2464
2465
local program=${1:? $usage } ; shift
2466
+ local strikes=${1:- " " }
2465
2467
# NOTICE: Only returns zero when RUNNING!
2466
2468
# > supervisorctl status
2467
2469
# generator RUNNING pid 83, uptime 0:00:23
@@ -2483,24 +2485,108 @@ backend_nomad() {
2483
2485
# 3
2484
2486
# > supervisorctl status node-0 >/dev/null; echo $?
2485
2487
# 3
2486
- backend_nomad task-supervisorctl " $dir " " $task " status " $program " > /dev/null
2488
+ local stderr_file=" ${dir} " /flag/is-task-program-running-" ${task} " -" ${program} "
2489
+ :> " ${stderr_file} "
2490
+ if ! backend_nomad task-supervisorctl " ${dir} " " ${task} " status " ${program} " > /dev/null 2> " ${stderr_file} "
2491
+ then
2492
+ # Command returned "false"
2493
+ if test -s " ${stderr_file} "
2494
+ then
2495
+ # Command returned "false" with a non-empty stderr output
2496
+ if test -n " ${strikes} "
2497
+ then
2498
+ # A strike parameter was given
2499
+ msg " $( yellow " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) " ) "
2500
+ strikes=$(( strikes - 1 ))
2501
+ msg " $( yellow " Strikes for \" is-task-program-running\" left: ${strikes} " ) "
2502
+ if test " ${strikes} " -gt 0
2503
+ then
2504
+ # Strikes still available, sleep/retry!
2505
+ if test " ${strikes} " = 1
2506
+ then
2507
+ # Before the last retry, wait five minutes!
2508
+ sleep 300 # 5 minutes!
2509
+ else
2510
+ sleep 60 # 1 minute!
2511
+ fi
2512
+ # Retry with one less strike available
2513
+ backend_nomad is-task-program-running " ${dir} " " ${task} " " ${program} " " ${strikes} "
2514
+ else
2515
+ # Fails everything only if using strikes!
2516
+ fatal " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) "
2517
+ fi
2518
+ else
2519
+ # No strike parameter was given, don't use "fatal"!
2520
+ msg " $( red " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) " ) "
2521
+ false
2522
+ fi
2523
+ else
2524
+ # Command returned "false" with an empty stderr output
2525
+ false # Program is not running!
2526
+ fi
2527
+ else
2528
+ # Command returned "true"
2529
+ if test -s " ${stderr_file} "
2530
+ then
2531
+ # Don't supress possible error messages!
2532
+ msg " $( yellow " WARNING: \" is-task-program-running\" is returning a non-empty stderr: $( cat " ${stderr_file} " ) " ) "
2533
+ fi
2534
+ true # Program is running!
2535
+ fi
2487
2536
;;
2488
2537
2538
# is-task-program-failed RUN-DIR TASK-NAME SUPERVISOR-PROGRAM [STRIKES]
#
# Exit status is zero ("true") when the program is considered FAILED, that
# is, when the contents of its "exit_code" file are anything but "0".
#
# STRIKES (optional) is the number of attempts allowed for fetching the
# "exit_code" file before giving up:
#   - With STRIKES: every failed fetch consumes exactly one strike, then
#     sleeps (1 minute, or 5 minutes before the last attempt) and retries.
#     When no strikes are left, "fatal" is called.
#   - Without STRIKES: a failed fetch is only reported and the program is
#     assumed to have failed (exit status zero). "fatal" is never called.
#
# Don't use fatal with no strikes, the exit trap uses it to stop everything!
is-task-program-failed )
  local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
  local dir=${1:?$usage}; shift
  local task=${1:?$usage}; shift
  local program=${1:?$usage}; shift
  # Empty means "no strikes requested" (single attempt, never fatal).
  local strikes=${1:-}
  # As we are not using any "autorestart" supervisord programs are run as:
  # command=sh -c "./start.sh; echo "$?" > ./exit_code"
  # because we can't obtain the exit codes using `supervisorctl`
  #
  # The stderr of the fetch is kept in a per task/program flag file so it
  # can be reported instead of being silently discarded.
  local stderr_file="${dir}"/flag/is-task-program-failed-"${task}"-"${program}"
  :> "${stderr_file}"
  local exit_code
  if ! exit_code=$(backend_nomad task-file-contents "${dir}" "${task}" \
         /local/run/current/"${program}"/exit_code 2> "${stderr_file}")
  then
    # Command returned "false"
    if test -n "${strikes}"
    then
      # A strike parameter was given
      msg "$(yellow "Function \"is-task-program-failed\" failed: $(cat "${stderr_file}")")"
      strikes=$(( strikes - 1 ))
      msg "$(yellow "Strikes for \"is-task-program-failed\" left: ${strikes}")"
      if test "${strikes}" -gt 0
      then
        # Strikes still available, sleep/retry!
        if test "${strikes}" = 1
        then
          # Before the last retry, wait five minutes!
          sleep 300 # 5 minutes!
        else
          sleep 60  # 1 minute!
        fi
        # Retry with the remaining strikes. The counter was already
        # decremented above, so it must be passed as is: decrementing it
        # again here would consume two strikes per failed attempt.
        backend_nomad is-task-program-failed "${dir}" "${task}" "${program}" "${strikes}"
      else
        # Fails everything only if using strikes!
        fatal "Function \"is-task-program-failed\" failed"
      fi
    else
      # No strike parameter was given, don't use "fatal"!
      msg "$(red "Function \"is-task-program-failed\" failed: $(cat "${stderr_file}")")"
      true # Assuming program failed due to Nomad command error!
    fi
  else
    # Command returned "true"
    if test -s "${stderr_file}"
    then
      # Don't suppress possible error messages!
      msg "$(yellow "WARNING: \"is-task-program-failed\" is returning a non-empty stderr: $(cat "${stderr_file}")")"
    fi
    # Failed if and only if the recorded exit code is not "0".
    test "${exit_code}" != "0"
  fi
;;
2506
2592
0 commit comments