Skip to content

OCPBUGS-57386: Add retries for external dependencies #1785

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion 01_install_requirements.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,30 @@ sudo dnf -y clean all
old_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)

# Update to latest packages first
sudo dnf -y upgrade --nobest
# Upgrade all packages, retrying on failure to tolerate transient mirror or
# repository errors. Between attempts the dnf cache is wiped and the script
# waits with a linearly growing delay (_YUM_RETRY_BACKOFF * attempt seconds).
# The script exits with status 1 if every attempt fails.

# Number of attempts
MAX_RETRIES=5
# Base delay between attempts (in seconds); multiplied by the attempt number
_YUM_RETRY_BACKOFF=15

attempt=1
while (( attempt <= MAX_RETRIES )); do
  if sudo dnf -y upgrade --nobest; then
    echo "System upgraded successfully."
    break
  fi

  if (( attempt < MAX_RETRIES )); then
    echo "Upgrade failed (attempt $attempt). Cleaning cache and retrying..."
    sudo dnf clean all
    sudo rm -rf /var/cache/dnf/*
    sleep $(( _YUM_RETRY_BACKOFF * attempt ))
  else
    # Last attempt: no retry follows, so skip the cache wipe and the
    # (longest) backoff sleep and fall through to the failure report.
    echo "Upgrade failed (attempt $attempt)."
  fi

  # Assignment form rather than (( attempt++ )) so the statement can never
  # yield a non-zero status under 'set -e'.
  attempt=$(( attempt + 1 ))
done

# The loop only leaves 'attempt' above MAX_RETRIES when no attempt succeeded
# (a successful attempt breaks out before the increment).
if (( attempt > MAX_RETRIES )); then
  echo "ERROR: Failed to upgrade system after $MAX_RETRIES attempts." >&2
  exit 1
fi

new_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
# If NetworkManager was upgraded it needs to be restarted
Expand Down
24 changes: 21 additions & 3 deletions ocp_install_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,33 @@ function extract_command() {
local cmd
local outdir
local extract_dir
local MAX_RETRIES=5
local SLEEP_BETWEEN=10

cmd="$1"
release_image="$2"
outdir="$3"

extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
_tmpfiles="$_tmpfiles $extract_dir"
# Retry loop for oc adm release extract to handle quay.io blips
for attempt in $(seq 1 $MAX_RETRIES); do
extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")

oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command=$cmd --to "${extract_dir}" ${release_image}
if oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command="$cmd" --to "${extract_dir}" "${release_image}"; then
echo "Successfully extracted $cmd"
break
fi

if [[ $attempt -lt $MAX_RETRIES ]]; then
echo "Extraction failed, retrying in ${SLEEP_BETWEEN}s..."
rm -rf "${extract_dir}"
sleep "${SLEEP_BETWEEN}"
else
echo "Failed to extract $cmd from ${release_image} after $MAX_RETRIES attempts"
return 1
fi
done

_tmpfiles="$_tmpfiles $extract_dir"

if [[ $cmd == "oc.rhel8" ]]; then
cmd="oc"
Expand Down
19 changes: 19 additions & 0 deletions utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,25 @@ EOF
if [[ "$reg_state" != "running" || $restart_registry -eq 1 ]]; then
sudo podman rm registry -f || true

MAX_RETRIES=5
_PULL_RETRY_DELAY=10

# Try pulling the image first to tolerate quay.io errors like 504s.
for attempt in $(seq 1 $MAX_RETRIES); do
if sudo podman pull "${DOCKER_REGISTRY_IMAGE}"; then
echo "Successfully pulled ${DOCKER_REGISTRY_IMAGE}"
break
fi

if [[ $attempt -lt $MAX_RETRIES ]]; then
echo "Pull failed, retrying in ${_PULL_RETRY_DELAY}s..."
sleep "${_PULL_RETRY_DELAY}"
else
echo "Failed to pull ${DOCKER_REGISTRY_IMAGE} after $MAX_RETRIES attempts"
exit 1
fi
done

sudo podman run -d --name registry --net=host --privileged \
-v ${REGISTRY_DIR}/data:/var/lib/registry:z \
-v ${REGISTRY_DIR}/auth:/auth:z \
Expand Down