Skip to content

Commit 427e023

Browse files
committed
feat: add backoff_retry script and use it in create-pathways job
1 parent 11728f8 commit 427e023

File tree

2 files changed

+114
-1
lines changed

2 files changed

+114
-1
lines changed

.github/workflows/reusable_cluster_private.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
- name: Check xpk installation
7777
run: xpk --help
7878
- name: Create a Pathways-enabled private XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing.
79-
run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
79+
run: ./backoff_retry.sh 'python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}}-private --private --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=4 --reservation="${{ secrets.GCP_TPU_V4_RESERVATION }}" --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"'
8080
- name: Verify the created cluster is private
8181
run: gcloud container clusters describe ${{inputs.cluster-name}}-private --location=${{inputs.location}} --format="value(privateClusterConfig.enablePrivateNodes)" | grep 'True' || (echo 'The created cluster is not private.' && exit 1)
8282
- name: Delete the cluster created

backoff_retry.sh

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
#!/bin/bash
2+
3+
# Copyright 2025 Google LLC
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# https://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
#==============================================================================
18+
#
19+
# SCRIPT: backoff_retry.sh
20+
#
21+
# DESCRIPTION: Executes a given command string and retries it with an
22+
# exponential backoff strategy if it fails.
23+
#
24+
# The script will attempt to run the command up to a specified
25+
# number of times. If the command fails, it will wait for a
26+
# calculated delay before retrying. The delay doubles after
27+
# each subsequent failure.
28+
#
29+
# USAGE:
30+
# ./backoff_retry.sh "<command>"
31+
#
32+
# Example (default 3 attempts, 60s base delay):
33+
# ./backoff_retry.sh "curl -f http://my-service.local/health"
34+
#
35+
# Example (custom settings via environment variables):
36+
# ATTEMPTS=5 BASE_DELAY=10 ./backoff_retry.sh "./my_flaky_script.py --init"
37+
#
38+
# ARGUMENTS:
39+
# $1 (Required): The command string to execute. This string *must* be
40+
# quoted if it contains spaces, pipes, or other special
41+
# shell characters.
42+
#
43+
# ENVIRONMENT VARIABLES:
44+
# ATTEMPTS: The total number of times to try the command.
45+
# (Default: 3)
46+
#
47+
# BASE_DELAY: The initial wait time (in seconds) used before the *first*
48+
# retry (i.e., after the first failure). The delay is
49+
# calculated as: (BASE_DELAY * 2^(attempt_number - 1))
50+
# (Default: 60)
51+
#
52+
# Delay progression (for default BASE_DELAY=60):
53+
# - After fail 1: 60s (60 * 2^0)
54+
# - After fail 2: 120s (60 * 2^1)
55+
# - After fail 3: 240s (60 * 2^2)
56+
# - ...etc.
57+
#
58+
# EXIT CODES:
59+
# 0: The command succeeded (exited with 0) on one of the attempts.
60+
# 1: Script usage error (e.g., no command provided, invalid ATTEMPTS).
61+
# <other>: If the command fails on all attempts, this script will exit
62+
# with the *last exit code* provided by the failed command.
63+
#
64+
#==============================================================================
65+
66+
# ANSI colour escape sequences used to highlight status messages. They are
# stored with literal backslashes and interpreted by `echo -e` at print time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m'

#######################################
# Runs a command string repeatedly until it succeeds or the attempt budget is
# exhausted, sleeping with an exponentially growing delay between attempts.
# Globals:
#   RED, GREEN, YELLOW, NC (read)
# Arguments:
#   $1 - command string, executed with `/bin/bash -c`
#   $2 - total number of attempts; decimal integer >= 1 (validated by main)
#   $3 - base delay in seconds; non-negative decimal integer
# Outputs:
#   Progress messages on stdout, plus whatever the command itself prints.
# Returns:
#   0 as soon as one attempt succeeds, otherwise the exit status of the
#   final failed attempt.
#######################################
retry_with_backoff() {
  local -r command_string=$1
  local -r attempts=$2
  local -r base_delay=$3
  local attempt exit_code delay

  for (( attempt = 1; attempt <= attempts; attempt++ )); do
    echo -e "${YELLOW}--- Attempt $attempt of $attempts ---${NC}"

    # Capture the status without letting a failure abort the loop.
    exit_code=0
    /bin/bash -c "$command_string" || exit_code=$?

    if (( exit_code == 0 )); then
      echo -e "${GREEN}Command succeeded on attempt $attempt. Exiting.${NC}"
      return 0
    fi

    # No point in sleeping after the last attempt: report and propagate.
    if (( attempt == attempts )); then
      echo -e "${RED}Command failed after $attempts attempts. Exiting with status $exit_code.${NC}"
      return "$exit_code"
    fi

    # Delay doubles after every failure: base, 2*base, 4*base, ...
    delay=$(( base_delay * (2 ** (attempt - 1)) ))

    echo -e "${YELLOW}Command failed with status $exit_code. Waiting for $delay seconds before next attempt...${NC}"

    sleep "$delay"
  done
}

#######################################
# Validates the command-line argument and the ATTEMPTS / BASE_DELAY
# environment variables, then hands over to retry_with_backoff.
# Globals:
#   ATTEMPTS   (read) - total attempts, default 3, must be an integer >= 1
#   BASE_DELAY (read) - base delay in seconds, default 60, integer >= 0
#   RED, NC    (read)
# Arguments:
#   $1 - the command string to execute (exactly one argument is required)
# Outputs:
#   Validation errors on stderr.
# Returns:
#   1 on a usage/validation error, otherwise the status of
#   retry_with_backoff.
#######################################
main() {
  if (( $# != 1 )); then
    echo -e "${RED}Correct usage: backoff_retry.sh \"<command>\"${NC}" >&2
    return 1
  fi

  local -r command_string=$1
  local attempts=${ATTEMPTS:-3}
  local base_delay=${BASE_DELAY:-60}

  if [[ -z "$command_string" ]]; then
    echo -e "${RED}Error: You must provide a command string as the first argument.${NC}" >&2
    return 1
  fi

  # The digit-only check runs first, so the arithmetic comparison is only
  # evaluated on a pure digit string. Zero is rejected: with no attempts the
  # loop would never run and the script would report success without ever
  # executing the command.
  if ! [[ "$attempts" =~ ^[0-9]+$ ]] || (( 10#$attempts < 1 )); then
    echo -e "${RED}Error: ATTEMPTS environment variable must be a positive integer.${NC}" >&2
    return 1
  fi

  # BASE_DELAY feeds an arithmetic expansion and `sleep`; anything other than
  # plain digits would either break the computation or be evaluated as an
  # expression.
  if ! [[ "$base_delay" =~ ^[0-9]+$ ]]; then
    echo -e "${RED}Error: BASE_DELAY environment variable must be a non-negative integer.${NC}" >&2
    return 1
  fi

  # Force base 10 so values with leading zeros (e.g. "08") are not parsed as
  # invalid octal constants by later arithmetic.
  attempts=$(( 10#$attempts ))
  base_delay=$(( 10#$base_delay ))

  retry_with_backoff "$command_string" "$attempts" "$base_delay"
}

# Keep this as the final statement: the script's exit status is the return
# status of main.
main "$@"

0 commit comments

Comments
 (0)