From ca49360dfeadd9b25587bae879f2f34e0d94aa96 Mon Sep 17 00:00:00 2001
From: Puneesh Khanna <pkhanna@habana.ai>
Date: Wed, 22 Nov 2023 15:20:39 +0530
Subject: [PATCH 1/8] Fix setting of reuse cache (#553)

Reuse cache is independent of HPU graphs enablement.
---
 examples/text-generation/run_generation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 8c54cf6e9d..bd875ffd4f 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -209,8 +209,7 @@ def setup_parser(parser):
     args = parser.parse_args()
 
     if not args.use_hpu_graphs:
-        args.limit_hpu_graphs = False
-        args.reuse_cache = False
+        args.limit_hpu_graphs = False        
 
     return args
 

From 86d37aeaa83b86e927e9d7cd414e18e33f490a90 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Wed, 22 Nov 2023 11:21:09 +0100
Subject: [PATCH 2/8] Add Gaudi2 regression test workflow (#554)

---
 .github/workflows/slow_tests.yml        |   4 +-
 .github/workflows/slow_tests_gaudi2.yml | 137 ++++++++++++++++++++++++
 2 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/slow_tests_gaudi2.yml

diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml
index 045af4bbed..d37fc21305 100644
--- a/.github/workflows/slow_tests.yml
+++ b/.github/workflows/slow_tests.yml
@@ -3,8 +3,8 @@ name: Non-regression tests
 on:
   workflow_dispatch:
   schedule:
-    - cron: '0 21 * * 0-5'  # every Sunday to Friday at 11pm CET
-    - cron: '0 21 * * 6'  # every Saturday at 1am CET
+    - cron: '0 21 * * 0-5'  # every Sunday to Friday at 21:00 UTC (11pm CEST, 10pm CET in winter)
+    - cron: '0 21 * * 6'  # every Saturday at 21:00 UTC (11pm CEST, 10pm CET in winter); NOTE(review): previously documented as 1am, confirm intended hour
 
 concurrency:
   group: ${{ github.workflow }}
diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
new file mode 100644
index 0000000000..8a538ec1cd
--- /dev/null
+++ b/.github/workflows/slow_tests_gaudi2.yml
@@ -0,0 +1,137 @@
+name: (Gaudi2) Non-regression tests
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 23 * * 3,6'  # every Wednesday and Saturday at 23:00 UTC (1am CEST / midnight CET on the following day)
+
+concurrency:
+  group: ${{ github.workflow }}
+
+jobs:
+  stable-diffusion:
+    name: Test Stable Diffusion
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+            docker pull vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
+      - name: Run tests
+        run: |
+            docker run \
+            -v $PWD:/root/workspace \
+            --workdir=/root/workspace \
+            --runtime=habana \
+            -e HABANA_VISIBLE_DEVICES=all \
+            -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            --cap-add=sys_nice \
+            --net=host \
+            --ipc=host \
+            vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
+            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_diffusers.sh
+  deepspeed:
+    name: Test DeepSpeed models
+    if: ${{ !cancelled() && (success() || failure()) }}
+    needs:
+      - stable-diffusion  # run the job when the previous test job is done
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+            docker pull vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
+      - name: Run tests
+        run: |
+            docker run \
+            -v $PWD:/root/workspace \
+            --workdir=/root/workspace \
+            --runtime=habana \
+            -e HABANA_VISIBLE_DEVICES=all \
+            -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            --cap-add=sys_nice \
+            --net=host \
+            --ipc=host \
+            vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
+            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_deepspeed.sh
+  multi-card:
+    name: Test multi-card models
+    if: ${{ !cancelled() && (success() || failure()) }}
+    needs:
+      - deepspeed  # run the job when the previous test job is done
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+            docker pull vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
+      - name: Run tests
+        run: |
+            docker run \
+            -v $PWD:/root/workspace \
+            --workdir=/root/workspace \
+            --runtime=habana \
+            -e HABANA_VISIBLE_DEVICES=all \
+            -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            --cap-add=sys_nice \
+            --net=host \
+            --ipc=host \
+            vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
+            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_8x.sh
+  single-card:
+    name: Test single-card models
+    if: ${{ !cancelled() && (success() || failure()) }}
+    needs:
+      - deepspeed
+      - multi-card  # run the job when the previous test jobs are done
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+            docker pull vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
+      - name: Run tests
+        run: |
+            docker run \
+            -v $PWD:/root/workspace \
+            --workdir=/root/workspace \
+            --runtime=habana \
+            -e HABANA_VISIBLE_DEVICES=all \
+            -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            --cap-add=sys_nice \
+            --net=host \
+            --ipc=host \
+            vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
+            GAUDI2_CI=1 RUN_ALBERT_XXL_1X=1 /bin/bash tests/ci/slow_tests_1x.sh
+  text-generation:
+    name: Test text-generation example
+    if: ${{ !cancelled() && (success() || failure()) }}
+    needs:
+      - deepspeed
+      - multi-card
+      - single-card
+      - albert-xxl-single-card  # run the job when the previous test jobs are done
+    runs-on: [self-hosted, linux, x64, gaudi2]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Pull image
+        run: |
+            docker pull vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest
+      - name: Run tests
+        run: |
+            docker run \
+            -v $PWD:/root/workspace \
+            --workdir=/root/workspace \
+            --runtime=habana \
+            -e HABANA_VISIBLE_DEVICES=all \
+            -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            --cap-add=sys_nice \
+            --net=host \
+            --ipc=host \
+            vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
+            GAUDI2_CI=1 make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}

From b2efcd399f0481b4d1eb2d6886d904868f8e2f7d Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Wed, 22 Nov 2023 11:46:13 +0100
Subject: [PATCH 3/8] Remove black and use ruff for code formatting (#555)

---
 .github/workflows/build_pr_documentation.yml  |  1 -
 .github/workflows/check_code_quality.yml      |  9 +++------
 Makefile                                      |  8 ++++----
 examples/text-generation/run_generation.py    |  2 +-
 pyproject.toml                                | 19 ++++++++++++++-----
 setup.py                                      |  1 -
 tests/test_trainer.py                         |  4 +---
 .../tests/models/gpt2/test_modeling_gpt2.py   |  6 +-----
 .../tests/models/gptj/test_modeling_gptj.py   |  4 +---
 .../tests/models/llama/test_modeling_llama.py | 16 ++++------------
 10 files changed, 29 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index 926c97b62f..9d6b2d4005 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -34,7 +34,6 @@ jobs:
           cd doc-builder
           git pull origin main
           pip install .
-          pip install black
           cd ..
 
       - name: Make documentation
diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml
index cb7a266a04..b734c564d3 100644
--- a/.github/workflows/check_code_quality.yml
+++ b/.github/workflows/check_code_quality.yml
@@ -29,12 +29,9 @@ jobs:
       run: |
         source venv/bin/activate
         pip install --upgrade pip
-        pip install black ruff
-    - name: Check style with black
-      run: |
-        source venv/bin/activate
-        black --check .
+        pip install ruff
     - name: Check style with ruff
       run: |
         source venv/bin/activate
-        ruff .
+        ruff check . setup.py
+        ruff format --check . setup.py
diff --git a/Makefile b/Makefile
index 2b0b535405..f5c974136f 100644
--- a/Makefile
+++ b/Makefile
@@ -22,12 +22,12 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 
 # Run code quality checks
 style_check: clean
-	black --check . setup.py
-	ruff . setup.py
+	ruff check . setup.py
+	ruff format --check . setup.py
 
 style: clean
-	black . setup.py
-	ruff . setup.py --fix
+	ruff check . setup.py --fix
+	ruff format . setup.py
 
 # Run unit and integration tests
 fast_tests:
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index bd875ffd4f..3351c72ec1 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -209,7 +209,7 @@ def setup_parser(parser):
     args = parser.parse_args()
 
     if not args.use_hpu_graphs:
-        args.limit_hpu_graphs = False        
+        args.limit_hpu_graphs = False
 
     return args
 
diff --git a/pyproject.toml b/pyproject.toml
index 7323ffa36c..87941f7e5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,13 +12,9 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-[tool.black]
-line-length = 119
-target-version = ['py37']
-
 [tool.ruff]
 # Never enforce `E501` (line length violations).
-ignore = ["C901", "E501", "E741"]
+ignore = ["C901", "E501", "E741", "F402", "F823"]
 select = ["C", "E", "F", "I", "W"]
 line-length = 119
 exclude = ["text-generation-inference"]
@@ -30,3 +26,16 @@ exclude = ["text-generation-inference"]
 [tool.ruff.isort]
 lines-after-imports = 2
 known-first-party = ["optimum.habana"]
+
+[tool.ruff.format]
+# Like Black, use double quotes for strings.
+quote-style = "double"
+
+# Like Black, indent with spaces, rather than tabs.
+indent-style = "space"
+
+# Like Black, respect magic trailing commas.
+skip-magic-trailing-comma = false
+
+# Like Black, automatically detect the appropriate line ending.
+line-ending = "auto"
diff --git a/setup.py b/setup.py
index c081a55db4..ae4ee1a2c4 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,6 @@
 ]
 
 QUALITY_REQUIRES = [
-    "black",
     "ruff",
     "hf_doc_builder @ git+https://github.com/huggingface/doc-builder.git",
 ]
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index dda4e55e20..1d82a5913a 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -1683,9 +1683,7 @@ def test_no_wd_param_group(self):
         args = GaudiTrainingArguments(output_dir="./test", use_habana=True, use_lazy_mode=True)
         trainer = GaudiTrainer(model=model, gaudi_config=gaudi_config, args=args)
         trainer.create_optimizer_and_scheduler(10)
-        # fmt: off
-        wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight']
-        # fmt: on
+        wd_names = ['0.linear1.weight', '0.linear2.weight', '1.0.linear1.weight', '1.0.linear2.weight', '1.1.linear1.weight', '1.1.linear2.weight']  # fmt: skip
         wd_params = [p for n, p in model.named_parameters() if n in wd_names]
         no_wd_params = [p for n, p in model.named_parameters() if n not in wd_names]
         self.assertListEqual(trainer.optimizer.param_groups[0]["params"], wd_params)
diff --git a/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py b/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py
index 016313927f..d507b34b4d 100644
--- a/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/transformers/tests/models/gpt2/test_modeling_gpt2.py
@@ -727,11 +727,7 @@ def _test_lm_generate_gpt2_helper(
         input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device)
 
         # The dog was found in a field near the intersection of West and West Streets.\n\nThe dog
-        # fmt: off
-        expected_output_ids = [
-            464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,
-        ]
-        # fmt: on
+        expected_output_ids = [464, 3290, 373, 1043, 287, 257, 2214, 1474, 262, 16246, 286, 2688, 290, 2688, 27262, 13, 198, 198, 464, 3290,]  # fmt: skip
         output_ids = model.generate(input_ids, do_sample=False)
         if verify_outputs:
             self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
diff --git a/tests/transformers/tests/models/gptj/test_modeling_gptj.py b/tests/transformers/tests/models/gptj/test_modeling_gptj.py
index 9d222350e3..4271079915 100644
--- a/tests/transformers/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/transformers/tests/models/gptj/test_modeling_gptj.py
@@ -543,10 +543,8 @@ def test_lm_generate_gptj(self):
                 model.gradient_checkpointing_disable()
             model.to(torch_device)
             input_ids = torch.tensor([[464, 3290]], dtype=torch.long, device=torch_device)  # The dog
-            # fmt: off
             # The dog is a man's best friend. It is a loyal companion, and it is a friend
-            expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545]
-            # fmt: on
+            expected_output_ids = [464, 3290, 318, 257, 582, 338, 1266, 1545, 13, 632, 318, 257, 9112, 15185, 11, 290, 340, 318, 257, 1545]  # fmt: skip
             output_ids = model.generate(input_ids, do_sample=False)
             self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
 
diff --git a/tests/transformers/tests/models/llama/test_modeling_llama.py b/tests/transformers/tests/models/llama/test_modeling_llama.py
index 49e78fa854..cadac239c5 100644
--- a/tests/transformers/tests/models/llama/test_modeling_llama.py
+++ b/tests/transformers/tests/models/llama/test_modeling_llama.py
@@ -384,9 +384,7 @@ def test_model_7b_logits(self):
         EXPECTED_MEAN = torch.tensor([[-6.6550, -4.1227, -4.9859, -3.2406, 0.8262, -3.0033, 1.2964, -3.3699]])
         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
         # slicing logits[0, 0, 0:30]
-        # fmt: off
-        EXPECTED_SLICE = torch.tensor([-12.8281, -7.4453, -0.4639, -8.0625, -7.2500, -8.0000, -6.4883, -7.7695, -7.8438, -7.0312, -6.2188, -7.1328, -1.8496, 1.9961, -8.6250, -6.7227, -12.8281, -6.9492, -7.0742, -7.7852, -7.5820, -7.9062, -6.9375, -7.9805, -8.3438, -8.1562, -8.0469, -7.6250, -7.7422, -7.3398,])
-        # fmt: on
+        EXPECTED_SLICE = torch.tensor([-12.8281, -7.4453, -0.4639, -8.0625, -7.2500, -8.0000, -6.4883, -7.7695, -7.8438, -7.0312, -6.2188, -7.1328, -1.8496, 1.9961, -8.6250, -6.7227, -12.8281, -6.9492, -7.0742, -7.7852, -7.5820, -7.9062, -6.9375, -7.9805, -8.3438, -8.1562, -8.0469, -7.6250, -7.7422, -7.3398,])  # fmt: skip
         torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5)
 
     @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!")
@@ -399,9 +397,7 @@ def test_model_13b_logits(self):
         EXPECTED_MEAN = torch.tensor([[-2.0622, -1.2794, -1.1638, -0.9788, -1.4603, -1.0238, -1.7893, -1.4411]])
         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
         # slicing logits[0, 0, 0:30]
-        # fmt: off
-        EXPECTED_SLICE = torch.tensor([-8.1406, -8.0547, 2.7461, -1.2344, -0.1448, -1.8262, -1.0020, -1.8154, -1.6895, -1.8516, -2.3574, -0.9277, 3.7598, 6.5742, -1.2998, -0.1177, -8.1406, -2.9688, -2.9199, -3.1699, -3.5254, -2.3555, -2.7988, -3.4141, -2.8262, -4.5195, -3.3379, -3.3164, -2.7832, -3.0273])
-        # fmt: on
+        EXPECTED_SLICE = torch.tensor([-8.1406, -8.0547, 2.7461, -1.2344, -0.1448, -1.8262, -1.0020, -1.8154, -1.6895, -1.8516, -2.3574, -0.9277, 3.7598, 6.5742, -1.2998, -0.1177, -8.1406, -2.9688, -2.9199, -3.1699, -3.5254, -2.3555, -2.7988, -3.4141, -2.8262, -4.5195, -3.3379, -3.3164, -2.7832, -3.0273])  # fmt: skip
         torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5)
 
     @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!")
@@ -414,9 +410,7 @@ def test_model_13bf_logits(self):
         EXPECTED_MEAN = torch.tensor([[-0.8562, -1.8520, -0.7551, -0.4162, -1.5161, -1.2038, -2.4823, -2.3254]])
         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
         # slicing logits[0, 0, 0:30]
-        # fmt: off
-        EXPECTED_SLICE = torch.tensor([-2.2227, 4.8828, 0.9023, -0.4578, -0.7871, -0.1033, -0.6221, -0.5786, -0.7803, -1.0674, -1.2920, -0.1570, 0.8008, 2.0723, -0.9497, 0.2771, -2.2227, -0.7612, -1.4346, -1.2061, -1.6426, -0.3000, -0.7139, -1.1934, -1.8691, -1.6973, -1.5947, -1.2705, -0.3523, -0.5513])
-        # fmt: on
+        EXPECTED_SLICE = torch.tensor([-2.2227, 4.8828, 0.9023, -0.4578, -0.7871, -0.1033, -0.6221, -0.5786, -0.7803, -1.0674, -1.2920, -0.1570, 0.8008, 2.0723, -0.9497, 0.2771, -2.2227, -0.7612, -1.4346, -1.2061, -1.6426, -0.3000, -0.7139, -1.1934, -1.8691, -1.6973, -1.5947, -1.2705, -0.3523, -0.5513])  # fmt: skip
         torch.testing.assert_close(out.mean(-1), EXPECTED_SLICE, atol=1e-2, rtol=1e-2)
 
     @unittest.skip(
@@ -432,9 +426,7 @@ def test_model_70b_logits(self):
             [[-4.2327, -3.3360, -4.6665, -4.7631, -1.8180, -3.4170, -1.4211, -3.1810]], dtype=torch.float32
         )
         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2)
-        # fmt: off
-        EXPECTED_SLICE = torch.tensor([-9.4922, -3.9551, 1.7998, -5.6758, -5.1055, -5.8984, -4.8320, -6.8086, -6.5391, -5.6172, -5.5820, -5.5352, 1.7881, 3.6289, -6.5117, -3.4785, -9.5000, -6.0352, -6.8125, -6.0195, -6.6836, -5.4727, -6.2812, -6.0391, -7.3398, -7.4297, -7.4844, -6.5820, -5.8789, -5.5312])
-        # fmt: on
+        EXPECTED_SLICE = torch.tensor([-9.4922, -3.9551, 1.7998, -5.6758, -5.1055, -5.8984, -4.8320, -6.8086, -6.5391, -5.6172, -5.5820, -5.5352, 1.7881, 3.6289, -6.5117, -3.4785, -9.5000, -6.0352, -6.8125, -6.0195, -6.6836, -5.4727, -6.2812, -6.0391, -7.3398, -7.4297, -7.4844, -6.5820, -5.8789, -5.5312])  # fmt: skip
         torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-5, rtol=1e-5)
 
     @unittest.skip("Model is curently gated")

From bd4d43c455216126bd5fe234605de68ffa4a0ee8 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Wed, 22 Nov 2023 19:11:02 +0100
Subject: [PATCH 4/8] Fix Gaudi2 CI workflow (#556)

---
 .github/workflows/slow_tests_gaudi2.yml | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
index 8a538ec1cd..acca3abc9e 100644
--- a/.github/workflows/slow_tests_gaudi2.yml
+++ b/.github/workflows/slow_tests_gaudi2.yml
@@ -26,11 +26,12 @@ jobs:
             --runtime=habana \
             -e HABANA_VISIBLE_DEVICES=all \
             -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            -e GAUDI2_CI=1 \
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
-            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_diffusers.sh
+            /bin/bash tests/ci/slow_tests_diffusers.sh
   deepspeed:
     name: Test DeepSpeed models
     if: ${{ !cancelled() && (success() || failure()) }}
@@ -51,11 +52,12 @@ jobs:
             --runtime=habana \
             -e HABANA_VISIBLE_DEVICES=all \
             -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            -e GAUDI2_CI=1 \
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
-            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_deepspeed.sh
+            /bin/bash tests/ci/slow_tests_deepspeed.sh
   multi-card:
     name: Test multi-card models
     if: ${{ !cancelled() && (success() || failure()) }}
@@ -76,11 +78,12 @@ jobs:
             --runtime=habana \
             -e HABANA_VISIBLE_DEVICES=all \
             -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            -e GAUDI2_CI=1 \
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
-            GAUDI2_CI=1 /bin/bash tests/ci/slow_tests_8x.sh
+            /bin/bash tests/ci/slow_tests_8x.sh
   single-card:
     name: Test single-card models
     if: ${{ !cancelled() && (success() || failure()) }}
@@ -102,19 +105,20 @@ jobs:
             --runtime=habana \
             -e HABANA_VISIBLE_DEVICES=all \
             -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            -e GAUDI2_CI=1 \
+            -e RUN_ALBERT_XXL_1X=1 \
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
-            GAUDI2_CI=1 RUN_ALBERT_XXL_1X=1 /bin/bash tests/ci/slow_tests_1x.sh
+            /bin/bash tests/ci/slow_tests_1x.sh
   text-generation:
     name: Test text-generation example
     if: ${{ !cancelled() && (success() || failure()) }}
     needs:
       - deepspeed
       - multi-card
-      - single-card
-      - albert-xxl-single-card  # run the job when the previous test jobs are done
+      - single-card  # run the job when the previous test jobs are done
     runs-on: [self-hosted, linux, x64, gaudi2]
     steps:
       - name: Checkout
@@ -130,8 +134,9 @@ jobs:
             --runtime=habana \
             -e HABANA_VISIBLE_DEVICES=all \
             -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+            -e GAUDI2_CI=1 \
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
             vault.habana.ai/gaudi-docker/1.12.0/ubuntu20.04/habanalabs/pytorch-installer-2.0.1:latest \
-            GAUDI2_CI=1 make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
+            make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}

From 110e992115879b498004e6373590928f74d6736e Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Thu, 23 Nov 2023 10:10:26 +0100
Subject: [PATCH 5/8] Fix example diff CI (#560)

---
 tests/example_diff/run_generation.txt | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/tests/example_diff/run_generation.txt b/tests/example_diff/run_generation.txt
index 3fc3304ebd..563d95485b 100644
--- a/tests/example_diff/run_generation.txt
+++ b/tests/example_diff/run_generation.txt
@@ -481,18 +481,17 @@
 <     distributed_state = PartialState(cpu=args.use_cpu)
 ---
 >     args = parser.parse_args()
-341c211,213
+341c211,212
 <     logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}")
 ---
 >     if not args.use_hpu_graphs:
 >         args.limit_hpu_graphs = False
->         args.reuse_cache = False
-343,344c215
+343,344c214
 <     if args.seed is not None:
 <         set_seed(args.seed)
 ---
 >     return args
-346,373d216
+346,373d215
 <     # Initialize the model and tokenizer
 <     try:
 <         args.model_type = args.model_type.lower()
@@ -521,7 +520,7 @@
 <     if requires_preprocessing:
 <         prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
 <         preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
-375,378c218,221
+375,378c217,220
 <         if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
 <             tokenizer_kwargs = {"add_space_before_punct_symbol": True}
 <         else:
@@ -531,7 +530,7 @@
 >     parser = argparse.ArgumentParser()
 >     args = setup_parser(parser)
 >     model, tokenizer, generation_config = initialize_model(args, logger)
-380,386c223
+380,386c222
 <         encoded_prompt = tokenizer.encode(
 <             preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
 <         )
@@ -541,7 +540,7 @@
 <     encoded_prompt = encoded_prompt.to(distributed_state.device)
 ---
 >     import habana_frameworks.torch.hpu as torch_hpu
-388,389c225,336
+388,389c224,335
 <     if encoded_prompt.size()[-1] == 0:
 <         input_ids = None
 ---
@@ -657,7 +656,7 @@
 >         print(f"Graph compilation duration          = {compilation_duration} seconds")
 >         print(separator)
 >         print()
-391c338,353
+391c337,352
 <         input_ids = encoded_prompt
 ---
 >         # Downloading and loading a dataset from the hub.
@@ -676,7 +675,7 @@
 >             .shuffle()
 >             .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows))
 >         )
-393,399c355,362
+393,399c354,361
 <     if args.jit:
 <         jit_input_texts = ["enable jit"]
 <         jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
@@ -693,7 +692,7 @@
 >             logger.info(
 >                 f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`."
 >             )
-401,439c364,384
+401,439c363,383
 <             sig = inspect.signature(model.__call__)
 <         jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
 <         traced_model = torch.jit.trace(model, jit_inputs, strict=False)
@@ -755,7 +754,7 @@
 >             preprocess_function,
 >             batched=True,
 >             desc="Running tokenizer on dataset",
-440a386,463
+440a385,462
 >         # After tokenization, we can remove the column of interest
 >         raw_dataset = raw_dataset.remove_columns([column_name])
 >         raw_dataset.set_format(type="torch")
@@ -834,13 +833,13 @@
 >             )
 >             print(separator)
 >         t_end = time.time()
-442,443c465,466
+442,443c464,465
 <         generated_sequences.append(total_sequence)
 <         print(total_sequence)
 ---
 >         throughput = total_new_tokens_generated / duration
 >         # Print Stats
-445c468,480
+445c467,479
 <     return generated_sequences
 ---
 >         stats = f"Throughput (including tokenization) = {throughput} tokens/second"

From 542c588b8ad25214df5041c9b6008048f2e9c32a Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Thu, 23 Nov 2023 11:21:09 +0100
Subject: [PATCH 6/8] Update BridgeTower example (#561)

---
 examples/contrastive-image-text/README.md          | 1 +
 examples/contrastive-image-text/run_bridgetower.py | 5 +++++
 tests/baselines/bridgetower_large_itm_mlm_itc.json | 1 +
 3 files changed, 7 insertions(+)

diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md
index 058f126d96..3ee9c81b7b 100644
--- a/examples/contrastive-image-text/README.md
+++ b/examples/contrastive-image-text/README.md
@@ -207,6 +207,7 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
 --output_dir /tmp/bridgetower-test \
 --model_name_or_path BridgeTower/bridgetower-large-itm-mlm-itc \
 --dataset_name jmhessel/newyorker_caption_contest --dataset_config_name matching \
+--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6 \
 --image_column image --caption_column image_description \
 --remove_unused_columns=False \
 --do_train --do_eval --do_predict \
diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py
index 3f592fdf16..a59205b954 100644
--- a/examples/contrastive-image-text/run_bridgetower.py
+++ b/examples/contrastive-image-text/run_bridgetower.py
@@ -138,6 +138,10 @@ class DataTrainingArguments:
     dataset_config_name: Optional[str] = field(
         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
     )
+    dataset_revision: str = field(
+        default="main",
+        metadata={"help": "The specific dataset version to use (can be a branch name, tag name or commit id)."},
+    )
     data_dir: Optional[str] = field(default=None, metadata={"help": "The data directory containing input files."})
     image_column: Optional[str] = field(
         default="image_path",
@@ -339,6 +343,7 @@ def main():
             keep_in_memory=False,
             data_dir=data_args.data_dir,
             token=model_args.token,
+            revision=data_args.dataset_revision,
         )
     else:
         data_files = {}
diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/baselines/bridgetower_large_itm_mlm_itc.json
index 095ddd490e..0c571fe5be 100644
--- a/tests/baselines/bridgetower_large_itm_mlm_itc.json
+++ b/tests/baselines/bridgetower_large_itm_mlm_itc.json
@@ -11,6 +11,7 @@
                     "train_samples_per_second": 921.069,
                     "extra_arguments": [
                         "--dataset_config_name matching",
+                        "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6",
                         "--image_column image",
                         "--caption_column image_description",
                         "--remove_unused_columns False",

From b5e7d131864c31098c3ca57484c835c3441a7153 Mon Sep 17 00:00:00 2001
From: "Wang, Yi" <yi.a.wang@intel.com>
Date: Thu, 23 Nov 2023 23:40:00 +0800
Subject: [PATCH 7/8] Don't unsqueeze input_id in prepare_inputs_for_generation
 for Starcoder/Codegen (#559)

---
 .../habana/transformers/models/codegen/modeling_codegen.py    | 4 ++--
 .../transformers/models/gpt_bigcode/modeling_gpt_bigcode.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/optimum/habana/transformers/models/codegen/modeling_codegen.py b/optimum/habana/transformers/models/codegen/modeling_codegen.py
index 0387bc6a0a..329dec59b2 100644
--- a/optimum/habana/transformers/models/codegen/modeling_codegen.py
+++ b/optimum/habana/transformers/models/codegen/modeling_codegen.py
@@ -417,9 +417,9 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, token_i
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
             if token_idx is not None:
-                input_ids = torch.index_select(input_ids, 1, token_idx - 1).unsqueeze(-1)
+                input_ids = torch.index_select(input_ids, 1, token_idx - 1)
                 if token_type_ids is not None:
-                    token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1).unsqueeze(-1)
+                    token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1)
             else:
                 input_ids = input_ids[:, -1].unsqueeze(-1)
                 if token_type_ids is not None:
diff --git a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
index e17aa5a0d0..a70826b62b 100644
--- a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -352,9 +352,9 @@ def prepare_inputs_for_generation(
         # only last token for inputs_ids if past is defined in kwargs
         if past_key_values:
             if token_idx is not None:
-                input_ids = torch.index_select(input_ids, 1, token_idx - 1).unsqueeze(-1)
+                input_ids = torch.index_select(input_ids, 1, token_idx - 1)
                 if token_type_ids is not None:
-                    token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1).unsqueeze(-1)
+                    token_type_ids = torch.index_select(token_type_ids, 1, token_idx - 1)
             else:
                 input_ids = input_ids[:, -1].unsqueeze(-1)
                 if token_type_ids is not None:

From 13fa10d58247b1c0bfe15005669cb24561398769 Mon Sep 17 00:00:00 2001
From: Mandy Li <mjli@habana.ai>
Date: Thu, 23 Nov 2023 07:43:44 -0800
Subject: [PATCH 8/8] Enable llama2-70b LoRA finetuning (#527)

---
 examples/language-modeling/README.md          | 40 ++++++++++++++++++-
 .../llama2_ds_zero3_config.json               | 15 +++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100755 examples/language-modeling/llama2_ds_zero3_config.json

diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index dad97020c6..16c8e3c441 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -321,7 +321,8 @@ python run_clm.py \
 
 ## PEFT
 
-To run LoRA finetuning and inference. you could use `run_lora_clm.py` as an example. Multi-card examples can be simply adapted to run LoRA finetuning. Here is the CLM example with Llama1-7B and Falcon-40B:
+To run LoRA finetuning, you can use `run_lora_clm.py`.
+Here are single-/multi-device command examples for Llama1-7B, Falcon-40B and Llama2-70B:
 
 - Single-card finetuning of Llama1-7B:
 ```bash
@@ -455,6 +456,43 @@ LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
     --low_cpu_mem_usage True
 ```
 
+- Multi-card finetuning of Llama2-70B with DeepSpeed ZeRO-3 optimization and LoRA:
+
+  > The following command requires Habana DeepSpeed 1.13.0 or later.
+
+```bash
+PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \
+python3 ../gaudi_spawn.py --use_deepspeed  --world_size 8  run_lora_clm.py \
+  --model_name_or_path meta-llama/Llama-2-70b-hf \
+  --deepspeed llama2_ds_zero3_config.json \
+  --dataset_name tatsu-lab/alpaca \
+  --bf16 True \
+  --output_dir ./lora_out \
+  --num_train_epochs 2 \
+  --max_seq_len 2048 \
+  --per_device_train_batch_size 10 \
+  --per_device_eval_batch_size 10 \
+  --gradient_checkpointing \
+  --evaluation_strategy epoch \
+  --eval_delay 2 \
+  --save_strategy no \
+  --learning_rate 0.0018 \
+  --warmup_ratio 0.03 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --dataset_concatenation \
+  --attn_softmax_bf16 True \
+  --do_train \
+  --do_eval \
+  --use_habana \
+  --use_lazy_mode \
+  --pipelining_fwd_bwd \
+  --throughput_warmup_steps 3 \
+  --lora_rank 4 \
+  --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \
+  --validation_split_percentage 4
+```
+
 ## Streaming
 
 To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` with `--max_steps` specified in the command line. This is currently supported by `run_mlm.py` and `run_clm.py`.
diff --git a/examples/language-modeling/llama2_ds_zero3_config.json b/examples/language-modeling/llama2_ds_zero3_config.json
new file mode 100755
index 0000000000..69845e1899
--- /dev/null
+++ b/examples/language-modeling/llama2_ds_zero3_config.json
@@ -0,0 +1,15 @@
+{
+    "steps_per_print": 64,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "bf16": {
+        "enabled": true
+    },
+    "gradient_clipping": 1.0,
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": false,
+        "contiguous_gradients": false
+    }
+}