Skip to content

Commit 4f89f59

Browse files
Merge pull request #846 from mlcommons/switch_project
Updates in GCP set up
2 parents c1a1ef0 + b93a21c commit 4f89f59

File tree

7 files changed

+41
-113
lines changed

7 files changed

+41
-113
lines changed

.github/workflows/regression_tests.yml

Lines changed: 16 additions & 16 deletions
Large diffs are not rendered by default.

.github/workflows/regression_tests_variants.yml

Lines changed: 0 additions & 85 deletions
This file was deleted.

docker/build_docker_images.sh

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ do
1313
esac
1414
done
1515

16+
# Artifact repository
17+
ARTIFACT_REPO="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
18+
1619
if [[ -z ${GIT_BRANCH+x} ]]
1720
then
1821
GIT_BRANCH='main' # Set default argument
@@ -22,9 +25,9 @@ for FRAMEWORK in "jax" "pytorch" "both"
2225
do
2326
IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}"
2427
DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH"
25-
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
26-
DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
27-
DOCKER_PULL_COMMAND="docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
28+
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME $ARTIFACT_REPO/$IMAGE_NAME"
29+
DOCKER_PUSH_COMMAND="docker push $ARTIFACT_REPO/$IMAGE_NAME"
30+
DOCKER_PULL_COMMAND="docker pull $ARTIFACT_REPO/$IMAGE_NAME"
2831

2932
echo "On branch: ${GIT_BRANCH}"
3033
echo $DOCKER_BUILD_COMMAND

docker/scripts/cloud-init.cfg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ write_files:
4040
ExecStartPre=mount --bind /var/lib/nvidia /var/lib/nvidia
4141
ExecStartPre=mount -o remountexec /var/lib/nvidia
4242
ExecStartPre=/usr/bin/docker-credential-gcr configure-docker --registries us-central1-docker.pkg.dev
43-
ExecStartPre=/usr/bin/docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest
44-
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest -b true
43+
ExecStartPre=/usr/bin/docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest
44+
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest -b true
4545
StandardOutput=journal+console
4646
StandardError=journal+console
4747

docker/scripts/startup.sh

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ RSYNC_DATA="true"
5151
OVERWRITE="false"
5252
SAVE_CHECKPOINTS="true"
5353
TUNING_RULESET="external"
54+
ROOT_DATA_BUCKET="algoperf-data"
55+
LOGS_BUCKET="algoperf-runs"
5456

5557
# Pass flag
5658
while [ "$1" != "" ]; do
@@ -136,6 +138,14 @@ while [ "$1" != "" ]; do
136138
shift
137139
ADDITIONAL_REQUIREMENTS_PATH=$1
138140
;;
141+
--data_bucket)
142+
shift
143+
ROOT_DATA_BUCKET=$1
144+
;;
145+
--logs_bucket)
146+
shift
147+
LOGS_BUCKET=$1
148+
;;
139149
*)
140150
usage
141151
exit 1
@@ -179,11 +189,11 @@ VALID_WORKLOADS=("criteo1tb" "imagenet_resnet" "imagenet_resnet_silu" "imagenet_
179189
VALID_RULESETS=("self" "external")
180190

181191
# Set data and experiment paths
182-
ROOT_DATA_BUCKET="gs://mlcommons-data"
183192
ROOT_DATA_DIR="${HOME_DIR}/data"
193+
ROOT_DATA_BUCKET="gs://${ROOT_DATA_BUCKET}"
184194

185-
EXPERIMENT_BUCKET="gs://mlcommons-runs"
186195
EXPERIMENT_DIR="${HOME_DIR}/experiment_runs"
196+
EXPERIMENT_LOGS_BUCKET="gs://${LOGS_BUCKET}"
187197

188198
if [[ -n ${DATASET+x} ]]; then
189199
if [[ ! " ${VALID_DATASETS[@]} " =~ " $DATASET " ]]; then
@@ -313,8 +323,8 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then
313323
RETURN_CODE=$?
314324

315325
if [[ $INTERNAL_CONTRIBUTOR_MODE == "true" ]]; then
316-
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/
317-
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
326+
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/
327+
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
318328
fi
319329

320330
fi

docs/CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ gcloud auth configure-docker $ARTIFACT_REGISTRY_URL
8888
To pull the latest prebuilt image:
8989

9090
```bash
91-
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/<image_name>
91+
docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/<image_name>
9292
```
9393

9494
The naming convention for `image_name` is `algoperf_<framework>_<branch>`.
@@ -102,7 +102,7 @@ Currently maintained images on the repository are:
102102
- `algoperf_both_dev`
103103

104104
To reference the pulled image you will have to use the full `image_path`, e.g.
105-
`us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`.
105+
`europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_main`.
106106

107107
### Trigger Rebuild and Push of Maintained Images
108108

scoring/run_workloads.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
flags.DEFINE_string(
2828
'docker_image_url',
29-
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
29+
'europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_dev',
3030
'URL to docker image')
3131
flags.DEFINE_integer(
3232
'run_percentage',

0 commit comments

Comments
 (0)