Skip to content

Commit 1d81455

Browse files
Merge pull request #847 from mlcommons/dev
Dev -> main
2 parents 9653f18 + 4f89f59 commit 1d81455

File tree

9 files changed

+45
-114
lines changed

9 files changed

+45
-114
lines changed

.github/workflows/regression_tests.yml

+16-16
Large diffs are not rendered by default.

.github/workflows/regression_tests_variants.yml

-85
This file was deleted.

docker/build_docker_images.sh

+6-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ do
1313
esac
1414
done
1515

16+
# Artifact repository
17+
ARTIFACT_REPO="europe-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo"
18+
1619
if [[ -z ${GIT_BRANCH+x} ]]
1720
then
1821
GIT_BRANCH='main' # Set default argument
@@ -22,9 +25,9 @@ for FRAMEWORK in "jax" "pytorch" "both"
2225
do
2326
IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}"
2427
DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH"
25-
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
26-
DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
27-
DOCKER_PULL_COMMAND="docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME"
28+
DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME $ARTIFACT_REPO/$IMAGE_NAME"
29+
DOCKER_PUSH_COMMAND="docker push $ARTIFACT_REPO/$IMAGE_NAME"
30+
DOCKER_PULL_COMMAND="docker pull $ARTIFACT_REPO/$IMAGE_NAME"
2831

2932
echo "On branch: ${GIT_BRANCH}"
3033
echo $DOCKER_BUILD_COMMAND

docker/scripts/cloud-init.cfg

+2-2
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ write_files:
4040
ExecStartPre=mount --bind /var/lib/nvidia /var/lib/nvidia
4141
ExecStartPre=mount -o remountexec /var/lib/nvidia
4242
ExecStartPre=/usr/bin/docker-credential-gcr configure-docker --registries us-central1-docker.pkg.dev
43-
ExecStartPre=/usr/bin/docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest
44-
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest -b true
43+
ExecStartPre=/usr/bin/docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest
44+
ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest -b true
4545
StandardOutput=journal+console
4646
StandardError=journal+console
4747

docker/scripts/startup.sh

+14-4
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ RSYNC_DATA="true"
5151
OVERWRITE="false"
5252
SAVE_CHECKPOINTS="true"
5353
TUNING_RULESET="external"
54+
ROOT_DATA_BUCKET="algoperf-data"
55+
LOGS_BUCKET="algoperf-runs"
5456

5557
# Pass flag
5658
while [ "$1" != "" ]; do
@@ -136,6 +138,14 @@ while [ "$1" != "" ]; do
136138
shift
137139
ADDITIONAL_REQUIREMENTS_PATH=$1
138140
;;
141+
--data_bucket)
142+
shift
143+
ROOT_DATA_BUCKET=$1
144+
;;
145+
--logs_bucket)
146+
shift
147+
LOGS_BUCKET=$1
148+
;;
139149
*)
140150
usage
141151
exit 1
@@ -179,11 +189,11 @@ VALID_WORKLOADS=("criteo1tb" "imagenet_resnet" "imagenet_resnet_silu" "imagenet_
179189
VALID_RULESETS=("self" "external")
180190

181191
# Set data and experiment paths
182-
ROOT_DATA_BUCKET="gs://mlcommons-data"
183192
ROOT_DATA_DIR="${HOME_DIR}/data"
193+
ROOT_DATA_BUCKET="gs://${ROOT_DATA_BUCKET}"
184194

185-
EXPERIMENT_BUCKET="gs://mlcommons-runs"
186195
EXPERIMENT_DIR="${HOME_DIR}/experiment_runs"
196+
EXPERIMENT_LOGS_BUCKET="gs://${LOGS_BUCKET}"
187197

188198
if [[ -n ${DATASET+x} ]]; then
189199
if [[ ! " ${VALID_DATASETS[@]} " =~ " $DATASET " ]]; then
@@ -313,8 +323,8 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then
313323
RETURN_CODE=$?
314324

315325
if [[ $INTERNAL_CONTRIBUTOR_MODE == "true" ]]; then
316-
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/
317-
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
326+
/google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/
327+
/google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/
318328
fi
319329

320330
fi

docs/CONTRIBUTING.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ gcloud auth configure-docker $ARTIFACT_REGISTRY_URL
8888
To pull the latest prebuilt image:
8989

9090
```bash
91-
docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/<image_name>
91+
docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/<image_name>
9292
```
9393

9494
The naming convention for `image_name` is `algoperf_<framework>_<branch>`.
@@ -102,7 +102,7 @@ Currently maintained images on the repository are:
102102
- `algoperf_both_dev`
103103

104104
To reference the pulled image you will have to use the full `image_path`, e.g.
105-
`us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`.
105+
`europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_main`.
106106

107107
### Trigger Rebuild and Push of Maintained Images
108108

scoring/run_workloads.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626

2727
flags.DEFINE_string(
2828
'docker_image_url',
29-
'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev',
29+
'europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_dev',
3030
'URL to docker image')
3131
flags.DEFINE_integer(
3232
'run_percentage',

tests/modeldiffs/diff.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from flax import jax_utils
2+
from flax.core import FrozenDict
23
import jax
34
import numpy as np
45
import torch
@@ -16,6 +17,8 @@ def torch2jax(jax_workload,
1617
jax_params, model_state = jax_workload.init_model_fn(jax.random.PRNGKey(0),
1718
**init_kwargs)
1819
pytorch_model, _ = pytorch_workload.init_model_fn([0], **init_kwargs)
20+
if isinstance(jax_params, dict):
21+
jax_params = FrozenDict(jax_params)
1922
jax_params = jax_utils.unreplicate(jax_params).unfreeze()
2023
if model_state is not None:
2124
model_state = jax_utils.unreplicate(model_state)

tests/test_traindiffs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def test_workload(self, workload):
5050
pyt_logs = '/tmp/pyt_log.pkl'
5151
try:
5252
run(
53-
f'XLA_PYTHON_CLIENT_ALLOCATOR=platform python3 -m tests.reference_algorithm_tests --workload={workload} --framework=jax --global_batch_size={GLOBAL_BATCH_SIZE} --log_file={jax_logs}'
53+
f'XLA_PYTHON_CLIENT_ALLOCATOR=platform python -m tests.reference_algorithm_tests --workload={workload} --framework=jax --global_batch_size={GLOBAL_BATCH_SIZE} --log_file={jax_logs}'
5454
f' --submission_path=tests/modeldiffs/vanilla_sgd_jax.py --identical=True --tuning_search_space=None --num_train_steps={NUM_TRAIN_STEPS}',
5555
shell=True,
5656
stdout=DEVNULL,

0 commit comments

Comments
 (0)