Change clip-roberta/bridgetower not to use fast_ddp (#1749)

huggingface · Feb 7, 2025
commit 6e97ad7 (1 parent: f75b6bd)

Showing 4 changed files with 17 additions and 17 deletions.
diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md
@@ -191,7 +191,6 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
   --logging_steps 10 \
   --dataloader_num_workers 1 \
   --mediapipe_dataloader \
-  --distribution_strategy fast_ddp \
   --trust_remote_code \
   --sdp_on_bf16
 ```

diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/baselines/bridgetower_large_itm_mlm_itc.json
@@ -7,8 +7,8 @@
                 "multi_card": {
                     "learning_rate": 1e-5,
                     "train_batch_size": 48,
-                    "train_runtime": 314.5877,
-                    "train_samples_per_second": 918.387,
+                    "train_runtime": 224.42,
+                    "train_samples_per_second": 904.93,
                     "extra_arguments": [
                         "--dataset_config_name matching",
                         "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6",
@@ -19,7 +19,6 @@
                         "--dataloader_num_workers 2",
                         "--logging_steps 10",
                         "--use_hpu_graphs_for_inference",
-                        "--distribution_strategy fast_ddp",
                         "--trust_remote_code True"
                     ]
                 }

diff --git a/tests/baselines/clip_roberta.json b/tests/baselines/clip_roberta.json
@@ -29,14 +29,14 @@
     },
     "gaudi2": {
         "ydshieh/coco_dataset_script": {
-            "num_train_epochs": 1,
             "eval_batch_size": 64,
+            "num_train_epochs": 1,
             "distribution": {
                 "multi_card": {
-                    "learning_rate": 5e-5,
+                    "learning_rate": 5e-05,
                     "train_batch_size": 512,
-                    "train_runtime": 62.3694,
-                    "train_samples_per_second": 16572.31,
+                    "train_runtime": 59.50,
+                    "train_samples_per_second": 14124,
                     "extra_arguments": [
                         "--data_dir $PWD/",
                         "--dataset_config_name 2017",
@@ -45,13 +45,13 @@
                         "--remove_unused_columns False",
                         "--warmup_steps 0",
                         "--weight_decay 0.1",
-                        "--save_strategy epoch",
-                        "--use_hpu_graphs_for_training",
-                        "--use_hpu_graphs_for_inference",
-                        "--dataloader_num_workers 16",
-                        "--distribution_strategy fast_ddp",
+                        "--save_strategy no",
+                        "--use_hpu_graphs",
+                        "--dataloader_num_workers 2",
                         "--mediapipe_dataloader",
-                        "--trust_remote_code True"
+                        "--logging_nan_inf_filter",
+                        "--trust_remote_code True",
+                        "--max_steps 100"
                     ]
                 }
             }

diff --git a/tests/test_examples.py b/tests/test_examples.py
@@ -514,10 +514,12 @@ def test(self):
                     extra_command_line_arguments.remove("--use_hpu_graphs_for_inference")
             if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip":
                 extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"])
-            elif torch_compile and (
+
+            if torch_compile and (
                 model_name == "bert-large-uncased-whole-word-masking"
                 or model_name == "roberta-large"
                 or model_name == "albert-xxlarge-v1"
+                or model_name == "./clip-roberta"
             ):
                 extra_command_line_arguments.append("--torch_compile_backend hpu_backend")
                 extra_command_line_arguments.append("--torch_compile")
@@ -687,7 +689,7 @@ def _create_command_line(
                 "--save_strategy no",
             ]
 
-        if "compile" in task:
+        if "compile" in task or "--torch_compile" in extra_command_line_arguments:
             cmd_line += ["--use_lazy_mode False"]
         elif self.EXAMPLE_NAME not in ["dpo", "ppo", "reward_modeling"]:
             cmd_line += ["--use_lazy_mode"]
@@ -877,7 +879,7 @@ class MultiCardSeq2SeqQuestionAnsweringExampleTester(
 
 
 class MultiCardVisionLanguageExampleTester(
-    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clip", multi_card=True
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_clip", multi_card=True, torch_compile=True
 ):
     TASK_NAME = "ydshieh/coco_dataset_script"