Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Testing MWMS in TF 2.9.1 with TF Model Garden #420

Draft
wants to merge 9 commits into
base: tf-2
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion buildspec-dlc-cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.7.1'
FRAMEWORK_VERSION: '2.9.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
ECR_REPO: 'sagemaker-test'

Expand Down
2 changes: 1 addition & 1 deletion buildspec-dlc-gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.7.1'
FRAMEWORK_VERSION: '2.9.1'
GPU_INSTANCE_TYPE: 'ml.p3.2xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
Expand Down
2 changes: 1 addition & 1 deletion buildspec-gen-cpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.7.1'
FRAMEWORK_VERSION: '2.9.1'
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
ECR_REPO: 'sagemaker-test'

Expand Down
2 changes: 1 addition & 1 deletion buildspec-gen-gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: 0.2

env:
variables:
FRAMEWORK_VERSION: '2.7.1'
FRAMEWORK_VERSION: '2.9.1'
GPU_INSTANCE_TYPE: 'ml.p3.16xlarge'
ECR_REPO: 'sagemaker-test'
GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
Expand Down
8 changes: 4 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ def read_version():
"pytest-rerunfailures",
"mock",
"sagemaker[local]>=2",
"tensorflow<2.4",
"tensorflow>=2.9",
"docker-compose",
"boto3==1.16.34",
"boto3",
"python-dateutil>=2.1,<2.8.1",
"botocore==1.19.34",
"botocore",
"requests-mock",
"awscli==1.18.194",
"awscli",
"protobuf>=3.20,<3.21"
]

Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker_tensorflow_container/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def _build_tf_config_for_mwms(hosts, current_host):
"""
workers = hosts

def host_addresses(hosts, port=8890):
def host_addresses(hosts, port=2222):
return ["{}:{}".format(host, port) for host in hosts]

tf_config = {"cluster": {}, "environment": "cloud"}
Expand Down
6 changes: 6 additions & 0 deletions test/container/2.9.1/Dockerfile.dlc.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test image: AWS Deep Learning Container for TensorFlow 2.9.1 (CPU) with the
# locally built sagemaker-tensorflow-training toolkit installed on top.
# `region` selects the regional ECR registry hosting the base DLC image.
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-cpu-py39-ubuntu20.04-sagemaker

# Install the toolkit sdist (expected under dist/ in the build context),
# then remove the tarball to keep the layer small.
COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
6 changes: 6 additions & 0 deletions test/container/2.9.1/Dockerfile.dlc.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test image: AWS Deep Learning Container for TensorFlow 2.9.1 (GPU, CUDA 11.2)
# with the locally built sagemaker-tensorflow-training toolkit installed on top.
# `region` selects the regional ECR registry hosting the base DLC image.
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-gpu-py39-cu112-ubuntu20.04-sagemaker

# Install the toolkit sdist (expected under dist/ in the build context),
# then remove the tarball to keep the layer small.
COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
9 changes: 9 additions & 0 deletions test/container/2.9.1/Dockerfile.tf.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Test image built from the upstream TensorFlow 2.9.1 CPU image (not a DLC),
# with the locally built sagemaker-tensorflow-training toolkit installed.
FROM tensorflow/tensorflow:2.9.1

# Entry point module the SageMaker training runtime invokes.
ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

# Install the toolkit sdist (expected under dist/ in the build context),
# then remove the tarball to keep the layer small.
COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
# tensorflow-io is needed by the test workloads (e.g. S3 filesystem plugin) — confirm against tests.
RUN pip install --no-cache-dir tensorflow-io
# sshd is required for SageMaker multi-node training coordination.
RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
13 changes: 13 additions & 0 deletions test/container/2.9.1/Dockerfile.tf.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Test image built from the upstream TensorFlow 2.9.1 GPU image (not a DLC),
# with the locally built sagemaker-tensorflow-training toolkit installed.
FROM tensorflow/tensorflow:2.9.1-gpu

# Entry point module the SageMaker training runtime invokes.
ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

# Install the toolkit sdist (expected under dist/ in the build context),
# then remove the tarball to keep the layer small.
COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
rm /sagemaker_tensorflow_training.tar.gz
# tensorflow-io is needed by the test workloads (e.g. S3 filesystem plugin) — confirm against tests.
RUN pip install --no-cache-dir tensorflow-io
# Replace the stale NVIDIA apt signing key (7fa2af80) with the current one
# (3bf863cc.pub) and drop the retired nvidia-ml repo list so apt-get update
# succeeds; then install sshd, required for SageMaker multi-node training.
RUN apt-key del 7fa2af80 \
&& rm /etc/apt/sources.list.d/nvidia-ml.list \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
&& apt-get update \
&& apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd
64 changes: 63 additions & 1 deletion test/integration/sagemaker/test_multi_worker_mirrored.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@

import os

import pytest
from sagemaker.tensorflow import TensorFlow
from sagemaker.utils import unique_name_from_base


RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")


def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys):
def test_keras_example(
sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
):
estimator = TensorFlow(
entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", "train_dummy.py"),
role="SageMakerRole",
Expand All @@ -40,3 +43,62 @@ def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framewo
logs = captured.out + captured.err
assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs
assert "TF_CONFIG=" in logs


@pytest.mark.skip_cpu
def test_tf_model_garden(
    sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys
):
    """Run a 2-node MultiWorkerMirroredStrategy ResNet-50 job from the TF Model Garden.

    Launches ``official/vision/train.py`` (tensorflow/models, tag v2.9.2) as a
    two-instance SageMaker training job over ImageNet TFRecords, then asserts
    that the toolkit's MWMS setup banner and a TF_CONFIG dump appear in the
    captured job logs. GPU-only (``skip_cpu``).
    """
    epochs = 1
    global_batch_size = 64
    # assumes ~10**5 training examples per epoch — TODO confirm against the dataset
    train_steps = int(10**5 * epochs / global_batch_size)
    # Report/summarize roughly 100 times over the run; checkpoint once at the end.
    steps_per_loop = train_steps // 100
    # Dotted-path overrides applied on top of the experiment YAML config.
    # (f-prefixes removed from fragments with no interpolation.)
    overrides = (
        "runtime.enable_xla=False,"
        "runtime.num_gpus=1,"
        "runtime.distribution_strategy=multi_worker_mirrored,"
        "runtime.mixed_precision_dtype=float16,"
        f"task.train_data.global_batch_size={global_batch_size},"
        "task.train_data.input_path=/opt/ml/input/data/training/train-000*,"
        "task.train_data.cache=True,"
        f"trainer.train_steps={train_steps},"
        f"trainer.steps_per_loop={steps_per_loop},"
        f"trainer.summary_interval={steps_per_loop},"
        f"trainer.checkpoint_interval={train_steps},"
        "task.model.backbone.type=resnet,"
        "task.model.backbone.resnet.model_id=50"
    )
    estimator = TensorFlow(
        git_config={
            "repo": "https://github.com/tensorflow/models.git",
            "branch": "v2.9.2",
        },
        source_dir=".",
        entry_point="official/vision/train.py",
        model_dir=False,  # disable the SDK-managed S3 model dir; the script writes to /opt/ml/model
        instance_type=instance_type,
        instance_count=2,
        image_uri=image_uri,
        hyperparameters={
            # Makes the toolkit build TF_CONFIG for MultiWorkerMirroredStrategy.
            "sagemaker_multi_worker_mirrored_strategy_enabled": True,
            "experiment": "resnet_imagenet",
            "config_file": "official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml",
            "mode": "train",
            "model_dir": "/opt/ml/model",
            "params_override": overrides,
        },
        environment={
            "NCCL_DEBUG": "INFO",
        },
        max_run=60 * 60 * 12,  # 12 hours (comment previously said "1 hour")
        role="SageMakerRole",
        volume_size=400,
    )
    estimator.fit(
        inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/train",
        job_name=unique_name_from_base("test-tf-mwms"),
    )
    captured = capsys.readouterr()
    logs = captured.out + captured.err
    assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs
    assert "TF_CONFIG=" in logs
2 changes: 1 addition & 1 deletion test/resources/multi_worker_mirrored/train_dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ def build_and_compile_cnn_model():
# Model building/compiling need to be within `strategy.scope()`.
multi_worker_model = build_and_compile_cnn_model()

multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)
multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70, verbose=2)
2 changes: 1 addition & 1 deletion test/unit/test_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"worker": ["{}:2222".format(HOST2)],
"ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)],
}
CLUSTER_WITH_MWMS = {"worker": ["{}:8890".format(HOST) for HOST in HOST_LIST]}
CLUSTER_WITH_MWMS = {"worker": ["{}:2222".format(HOST) for HOST in HOST_LIST]}

MASTER_TASK = {"index": 0, "type": "master"}
WORKER_TASK = {"index": 0, "type": "worker"}
Expand Down