Skip to content

Commit

Permalink
Change clip-roberta/bridgetower not to use fast_ddp (#1749)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jiminha authored Feb 7, 2025
1 parent f75b6bd commit 6e97ad7
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 17 deletions.
1 change: 0 additions & 1 deletion examples/contrastive-image-text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,6 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
--logging_steps 10 \
--dataloader_num_workers 1 \
--mediapipe_dataloader \
--distribution_strategy fast_ddp \
--trust_remote_code \
--sdp_on_bf16
```
Expand Down
5 changes: 2 additions & 3 deletions tests/baselines/bridgetower_large_itm_mlm_itc.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
"multi_card": {
"learning_rate": 1e-5,
"train_batch_size": 48,
"train_runtime": 314.5877,
"train_samples_per_second": 918.387,
"train_runtime": 224.42,
"train_samples_per_second": 904.93,
"extra_arguments": [
"--dataset_config_name matching",
"--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6",
Expand All @@ -19,7 +19,6 @@
"--dataloader_num_workers 2",
"--logging_steps 10",
"--use_hpu_graphs_for_inference",
"--distribution_strategy fast_ddp",
"--trust_remote_code True"
]
}
Expand Down
20 changes: 10 additions & 10 deletions tests/baselines/clip_roberta.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@
},
"gaudi2": {
"ydshieh/coco_dataset_script": {
"num_train_epochs": 1,
"eval_batch_size": 64,
"num_train_epochs": 1,
"distribution": {
"multi_card": {
"learning_rate": 5e-5,
"learning_rate": 5e-05,
"train_batch_size": 512,
"train_runtime": 62.3694,
"train_samples_per_second": 16572.31,
"train_runtime": 59.50,
"train_samples_per_second": 14124,
"extra_arguments": [
"--data_dir $PWD/",
"--dataset_config_name 2017",
Expand All @@ -45,13 +45,13 @@
"--remove_unused_columns False",
"--warmup_steps 0",
"--weight_decay 0.1",
"--save_strategy epoch",
"--use_hpu_graphs_for_training",
"--use_hpu_graphs_for_inference",
"--dataloader_num_workers 16",
"--distribution_strategy fast_ddp",
"--save_strategy no",
"--use_hpu_graphs",
"--dataloader_num_workers 2",
"--mediapipe_dataloader",
"--trust_remote_code True"
"--logging_nan_inf_filter",
"--trust_remote_code True",
"--max_steps 100"
]
}
}
Expand Down
8 changes: 5 additions & 3 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,10 +514,12 @@ def test(self):
extra_command_line_arguments.remove("--use_hpu_graphs_for_inference")
if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip":
extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"])
elif torch_compile and (

if torch_compile and (
model_name == "bert-large-uncased-whole-word-masking"
or model_name == "roberta-large"
or model_name == "albert-xxlarge-v1"
or model_name == "./clip-roberta"
):
extra_command_line_arguments.append("--torch_compile_backend hpu_backend")
extra_command_line_arguments.append("--torch_compile")
Expand Down Expand Up @@ -687,7 +689,7 @@ def _create_command_line(
"--save_strategy no",
]

if "compile" in task:
if "compile" in task or "--torch_compile" in extra_command_line_arguments:
cmd_line += ["--use_lazy_mode False"]
elif self.EXAMPLE_NAME not in ["dpo", "ppo", "reward_modeling"]:
cmd_line += ["--use_lazy_mode"]
Expand Down Expand Up @@ -877,7 +879,7 @@ class MultiCardSeq2SeqQuestionAnsweringExampleTester(


class MultiCardVisionLanguageExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clip", multi_card=True
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clip", multi_card=True, torch_compile=True
):
TASK_NAME = "ydshieh/coco_dataset_script"

Expand Down

0 comments on commit 6e97ad7

Please sign in to comment.