Skip to content

Commit c45f04d

Browse files
scalibySikaGrr
authored and committed
Add backoff_retry script and use it in create-pathways job (#739)
* feat: add backoff_retry script and use it in create-pathways job
* feat: move env to args in backoff_retry
1 parent 7e1bee4 commit c45f04d

File tree

2 files changed

+178
-1
lines changed

2 files changed

+178
-1
lines changed

.github/workflows/reusable_cluster_private.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
- name: Check xpk installation
7777
run: xpk --help
7878
- name: Create a Pathways-enabled private XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing.
79-
run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
79+
run: ./backoff_retry.sh -c 'python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation="${{ secrets.GCP_TPU_V4_RESERVATION }}" --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"'
8080
- name: Verify the created cluster is private
8181
run: gcloud container clusters describe ${{inputs.cluster-name}}-private --location=${{inputs.location}} --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
8282
- name: Delete the cluster created

backoff_retry.sh

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/bin/bash
2+
3+
# Copyright 2025 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# https://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
# ==============================================================================
18+
#
19+
# == Description ==
20+
#
21+
# This script acts as a wrapper to run a specified command. If the command
22+
# exits with a non-zero status (indicating failure), the script will wait
23+
# for a calculated period and then retry the command.
24+
#
25+
# The delay between retries increases exponentially to avoid overwhelming
26+
# a failing service (e.g., a database, web API) that might be
27+
# temporarily unavailable.
28+
#
29+
# == Backoff Logic ==
30+
#
31+
# The delay is calculated using the formula:
32+
# DELAY = BASE_DELAY * (2 ^ (current_attempt_number - 1))
33+
#
34+
# With default settings (BASE_DELAY=60s, ATTEMPTS=3):
35+
# - Attempt 1 fails: Waits 60 * (2^0) = 60 seconds
36+
# - Attempt 2 fails: Waits 60 * (2^1) = 120 seconds
37+
# - Attempt 3 fails: Script gives up and exits with the command's final error code.
38+
#
39+
# == Usage ==
40+
#
41+
# ./backoff_retry.sh -c "<command_to_run>" [-a <num_attempts>] [-d <base_delay_sec>]
42+
#
43+
# == Options ==
44+
#
45+
# -c, --command The command string to execute (required).
46+
# **Must be quoted** if it contains spaces, pipes, or
47+
# other special characters.
48+
#
49+
# -a, --attempts The total number of times to try the command.
50+
# Must be a positive integer. (Default: 3)
51+
#
52+
# -d, --base-delay The initial delay in seconds before the first retry.
53+
# Must be a non-negative integer. (Default: 60)
54+
#
55+
# -h, --help Show this help message and exit.
56+
#
57+
# == Examples ==
58+
#
59+
# 1. Run a simple 'false' command (will fail 3 times by default):
60+
# ./backoff_retry.sh -c "false"
61+
#
62+
# 2. Attempt to curl a potentially flaky API 5 times, starting with a 10s delay:
63+
# ./backoff_retry.sh -c "curl -f http://localhost/status" -a 5 -d 10
64+
#
65+
# 3. Run a complex command with pipes and ensure it's properly quoted:
66+
# ./backoff_retry.sh -c "ps aux | grep 'xpk' | grep -v 'grep'"
67+
#
68+
# ==============================================================================
69+
70+
# ANSI colour codes for status messages. They are kept as literal backslash
# text (single-quoted) and only turned into escape bytes by `echo -e` when a
# message is printed.
NC='\033[0m'          # reset to the terminal default
RED='\033[0;31m'      # errors
GREEN='\033[0;32m'    # success
YELLOW='\033[0;33m'   # attempt banners and retry notices
74+
75+
#######################################
# Print the synopsis and option summary for this script.
# Arguments: none ($0 is used for the program name in the synopsis)
# Outputs:   writes the help text to stderr; nothing goes to stdout
#######################################
usage() {
  # One printf for the whole text: the '%b\n' format is reused for every
  # argument, so each string becomes one output line with its backslash
  # escapes (the leading "\n" before "Options:") expanded.
  printf '%b\n' \
    "Usage: $0 -c \"<command>\" [-a <attempts>] [-d <base_delay>]" \
    "\nOptions:" \
    " -c, --command The command string to execute (required)." \
    " -a, --attempts Number of attempts (default: 3)." \
    " -d, --base-delay Base delay in seconds for backoff (default: 60)." \
    " -h, --help Show this help message." \
    >&2
}
83+
84+
# Defaults; each can be overridden by the matching command-line option.
COMMAND_STRING=""
ATTEMPTS=3
BASE_DELAY=60

# Running the script with no arguments at all is treated as a usage error.
if (( $# == 0 )); then
  echo -e "${RED}Error: No arguments provided.${NC}" >&2
  usage
  exit 1
fi

# Consume the arguments left to right. Every value-taking option is mapped to
# the variable it fills and to the long name used in its error message, so the
# "is there a usable value?" check is written only once.
while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    -c|--command)
      opt_label="--command"
      opt_target="COMMAND_STRING"
      ;;
    -a|--attempts)
      opt_label="--attempts"
      opt_target="ATTEMPTS"
      ;;
    -d|--base-delay)
      opt_label="--base-delay"
      opt_target="BASE_DELAY"
      ;;
    *)
      echo -e "${RED}Error: Unknown option: $1${NC}" >&2
      usage
      exit 1
      ;;
  esac

  # A value is rejected when it is missing/empty or begins with '-', i.e.
  # when it looks like the next option rather than an argument.
  if [[ -z "$2" || "$2" == -* ]]; then
    echo -e "${RED}Error: ${opt_label} requires an argument.${NC}" >&2
    usage
    exit 1
  fi

  # Store the value verbatim in the selected variable, then drop the
  # option/value pair from the argument list.
  printf -v "$opt_target" '%s' "$2"
  shift 2
done
137+
138+
# Validate the parsed settings before any attempt is made.

# A command is mandatory; without one, show the help text and stop.
if [[ "$COMMAND_STRING" == "" ]]; then
  echo -e "${RED}Error: You must provide a command string with -c or --command.${NC}" >&2
  usage
  exit 1
fi

# ATTEMPTS: decimal digits only, first digit non-zero (so 0 is rejected).
[[ "$ATTEMPTS" =~ ^[1-9][0-9]*$ ]] || {
  echo -e "${RED}Error: ATTEMPTS (-a) must be a positive integer.${NC}" >&2
  exit 1
}

# BASE_DELAY: decimal digits only; 0 is allowed.
[[ "$BASE_DELAY" =~ ^[0-9]+$ ]] || {
  echo -e "${RED}Error: BASE_DELAY (-d) must be a non-negative integer.${NC}" >&2
  exit 1
}
153+
154+
#######################################
# Compute the wait before the next attempt: base * 2^(attempt - 1).
# Arguments: $1 - base delay in seconds (decimal digits, leading zeros allowed)
#            $2 - 1-based number of the attempt that just failed
# Outputs:   the delay in seconds on stdout
#######################################
backoff_delay() {
  # Force base 10. Bash arithmetic reads a leading 0 as octal, so a base
  # delay of "010" would silently become 8 and "08" would abort the
  # expression with "value too great for base".
  printf '%s\n' "$(( 10#$1 * (2 ** ($2 - 1)) ))"
}

# Run COMMAND_STRING up to ATTEMPTS times. Exit 0 as soon as it succeeds; after
# the final failure exit with the command's own status. Between attempts,
# sleep for an exponentially growing delay derived from BASE_DELAY.
for (( i = 1; i <= ATTEMPTS; i++ )); do
  echo -e "${YELLOW}--- Attempt $i of $ATTEMPTS ---${NC}"

  # The command string is handed to a separate bash process, which parses
  # any pipes, quotes or redirections it contains.
  /bin/bash -c "$COMMAND_STRING"
  EXIT_CODE=$?

  if (( EXIT_CODE == 0 )); then
    echo -e "${GREEN}Command succeeded on attempt $i. Exiting.${NC}"
    exit 0
  fi

  # No sleep after the last attempt: report and propagate the failure status.
  if (( i == ATTEMPTS )); then
    echo -e "${RED}Command failed after $ATTEMPTS attempts. Exiting with status $EXIT_CODE.${NC}"
    exit "$EXIT_CODE"
  fi

  DELAY=$(backoff_delay "$BASE_DELAY" "$i")

  echo -e "${YELLOW}Command failed with status $EXIT_CODE. Waiting for $DELAY seconds before next attempt...${NC}"

  sleep "$DELAY"
done

0 commit comments

Comments
 (0)