diff --git a/conftest.py b/conftest.py
index 71cb6bb7ca..2a3c7625a3 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,3 +1,76 @@
+import json
+import logging
+from pathlib import Path
+
+import pytest
+
+
+BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("tests") / Path("baselines") / Path("fixture")
+
+
+def walk_path(path: Path):
+    """
+    Taken from https://stackoverflow.com/a/76236680
+
+    Path.walk() is not available until python 3.12
+    """
+    subdirs = [d for d in path.iterdir() if d.is_dir()]
+    files = [f for f in path.iterdir() if f.is_file()]
+    yield path, subdirs, files
+    for s in subdirs:
+        yield from walk_path(s)
+
+
+class Baseline:
+    def __init__(self, session):
+        self.rebase = session.config.option.rebase
+        self.references = {}
+
+        if BASELINE_DIRECTORY.exists():
+            for root, dirs, files in walk_path(BASELINE_DIRECTORY):
+                for name in files:
+                    with (root / name).open() as f:
+                        self.references.update(json.load(f))
+
+    def get_reference(self, addr, context=[]):
+        reference = self.references.setdefault(addr, {})
+        for c in context:
+            reference = reference.setdefault(c, {})
+        return reference
+
+    def finalize(self):
+        if self.rebase:
+            # aggregate refs by test file
+            refsbyfile = {}
+            for case, ref in self.references.items():
+                key = case.split("::")[0]
+                reffile = BASELINE_DIRECTORY / Path(key).with_suffix(".json")
+                refsbyfile.setdefault(reffile, {})[case] = ref
+
+            # dump aggregated refs into their own files
+            for reffile, refs in refsbyfile.items():
+                reffile.parent.mkdir(parents=True, exist_ok=True)
+                with reffile.open("w+") as f:
+                    json.dump(refs, f, indent=2, sort_keys=True)
+
+
+class BaselineRequest:
+    def __init__(self, request):
+        self.baseline = request.session.stash["baseline"]
+        self.addr = request.node.nodeid
+
+    def assertRef(self, compare, context=[], **kwargs):
+        reference = self.baseline.get_reference(self.addr, context)
+        if self.baseline.rebase:
+            reference.update(**kwargs)
+
+        for key, actual in kwargs.items():
+            ref = reference.get(key, None)
+            logging.getLogger().info(f"{'.'.join(context + [key])}:actual = {actual}")
+            logging.getLogger().info(f"{'.'.join(context + [key])}:ref = {ref}")
+            assert compare(actual, ref)
+
+
 class Secret:
     """
     Taken from: https://stackoverflow.com/a/67393351
@@ -15,11 +88,22 @@ def __str___(self):
 
 def pytest_addoption(parser):
     parser.addoption("--token", action="store", default=None)
+    parser.addoption("--rebase", action="store_true", help="rebase baseline references from current run")
+
+
+@pytest.fixture
+def token(request):
+    return Secret(request.config.option.token)
+
+
+def pytest_sessionstart(session):
+    session.stash["baseline"] = Baseline(session)
+
+
+def pytest_sessionfinish(session):
+    session.stash["baseline"].finalize()
 
 
-def pytest_generate_tests(metafunc):
-    # This is called for every test. Only get/set command line arguments
-    # if the argument is specified in the list of test "fixturenames".
-    option_value = Secret(metafunc.config.option.token)
-    if "token" in metafunc.fixturenames:
-        metafunc.parametrize("token", [option_value])
+@pytest.fixture
+def baseline(request):
+    return BaselineRequest(request)
diff --git a/tests/baselines/fixture/tests/test_encoder_decoder.json b/tests/baselines/fixture/tests/test_encoder_decoder.json
new file mode 100644
index 0000000000..25c780e5dd
--- /dev/null
+++ b/tests/baselines/fixture/tests/test_encoder_decoder.json
@@ -0,0 +1,32 @@
+{
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[facebook/bart-large-cnn-Habana/bart-2-2]": {
+    "gaudi1": {
+      "predict_rougeLsum": 29.174,
+      "predict_samples_per_second": 2.304
+    },
+    "gaudi2": {
+      "predict_rougeLsum": 28.9801,
+      "predict_samples_per_second": 4.339
+    }
+  },
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[t5-3b-Habana/t5-2-1]": {
+    "gaudi1": {
+      "predict_rougeLsum": 21.7286,
+      "predict_samples_per_second": 1.005
+    },
+    "gaudi2": {
+      "predict_rougeLsum": 21.8877,
+      "predict_samples_per_second": 3.848
+    }
+  },
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_translation_bf16[t5-small-Habana/t5-2-1]": {
+    "gaudi1": {
+      "predict_bleu": 11.6126,
+      "predict_samples_per_second": 9.188
+    },
+    "gaudi2": {
+      "predict_bleu": 11.7277,
+      "predict_samples_per_second": 11.648
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/baselines/fixture/tests/test_fp8_examples.json b/tests/baselines/fixture/tests/test_fp8_examples.json
new file mode 100644
index 0000000000..43aa371fa1
--- /dev/null
+++ b/tests/baselines/fixture/tests/test_fp8_examples.json
@@ -0,0 +1,8 @@
+{
+  "tests/test_fp8_examples.py::test_fp8_train[mistralai/Mistral-7B-Instruct-v0.2-tatsu-lab/alpaca--language-modeling-8-8-run_lora_clm.py]": {
+    "gaudi2": {
+      "eval_accuracy": 0.7538,
+      "train_samples_per_second": 12.373
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/baselines/fixture/tests/test_fsdp_examples.json b/tests/baselines/fixture/tests/test_fsdp_examples.json
new file mode 100644
index 0000000000..834ecba8a6
--- /dev/null
+++ b/tests/baselines/fixture/tests/test_fsdp_examples.json
@@ -0,0 +1,14 @@
+{
+  "tests/test_fsdp_examples.py::test_fsdp_bf16[bert-base-uncased-Habana/bert-base-uncased-question-answering-24-8-run_qa.py-full_shard]": {
+    "gaudi2": {
+      "eval_f1": 85.7077,
+      "train_samples_per_second": 2983.533
+    }
+  },
+  "tests/test_fsdp_examples.py::test_fsdp_bf16[meta-llama/Llama-2-7b-hf--language-modeling-8-8-run_lora_clm.py-auto_wrap]": {
+    "gaudi2": {
+      "train_loss": 0.9093,
+      "train_samples_per_second": 85.016
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/baselines/fixture/tests/test_image_to_text_example.json b/tests/baselines/fixture/tests/test_image_to_text_example.json
new file mode 100644
index 0000000000..d9bab43d39
--- /dev/null
+++ b/tests/baselines/fixture/tests/test_image_to_text_example.json
@@ -0,0 +1,94 @@
+{
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[HuggingFaceM4/idefics2-8b-1]": {
+    "gaudi2": {
+      "throughput": 21.89944593215077
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-2B-Instruct-1]": {
+    "gaudi2": {
+      "throughput": 28.755882208438422
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-7B-Instruct-1]": {
+    "gaudi2": {
+      "throughput": 19.32562189532818
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[google/paligemma-3b-mix-224-1]": {
+    "gaudi2": {
+      "throughput":
132.8949150246155 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-13b-hf-1]": { + "gaudi1": { + "throughput": 16.704731010481538 + }, + "gaudi2": { + "throughput": 48.54364937033955 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-7b-hf-1]": { + "gaudi1": { + "throughput": 28.04096918512148 + }, + "gaudi2": { + "throughput": 77.98733740859008 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-mistral-7b-hf-1]": { + "gaudi1": { + "throughput": 10.759228696741 + }, + "gaudi2": { + "throughput": 33.17984878151546 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-13b-hf-1]": { + "gaudi1": { + "throughput": 6.96732060769783 + }, + "gaudi2": { + "throughput": 23.527610042925 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-7b-hf-1]": { + "gaudi2": { + "throughput": 35.00608681379742 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[meta-llama/Llama-3.2-11B-Vision-Instruct-1]": { + "gaudi2": { + "throughput": 18.974541922240313 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_bf16[tiiuae/falcon-11B-vlm-1]": { + "gaudi2": { + "throughput": 23.69260849957278 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-13b-hf-1]": { + "gaudi2": { + "throughput": 67.20488222876344 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-7b-hf-1]": { + "gaudi2": { + "throughput": 98.72578382705062 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-mistral-7b-hf-1]": { + "gaudi2": { + "throughput": 45.011551008367086 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-13b-hf-1]": { + "gaudi2": { + "throughput": 30.9535718774675 + } + }, + "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-7b-hf-1]": { + "gaudi2": { + "throughput": 45.18544502949674 + } + } +} \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_openclip_vqa.json b/tests/baselines/fixture/tests/test_openclip_vqa.json new file mode 100644 index 0000000000..91f9d7d601 --- /dev/null +++ b/tests/baselines/fixture/tests/test_openclip_vqa.json @@ -0,0 +1,18 @@ +{ + "tests/test_openclip_vqa.py::test_openclip_vqa_bf16[laion/CLIP-ViT-g-14-laion2B-s12B-b42K]": { + "gaudi1": { + "throughput": 550 + }, + "gaudi2": { + "throughput": 1472 + } + }, + "tests/test_openclip_vqa.py::test_openclip_vqa_bf16[microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224]": { + "gaudi1": { + "throughput": 1200 + }, + "gaudi2": { + "throughput": 1816 + } + } +} \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_pipeline.json b/tests/baselines/fixture/tests/test_pipeline.json new file mode 100644 index 0000000000..78bbf4c51f --- /dev/null +++ b/tests/baselines/fixture/tests/test_pipeline.json @@ -0,0 +1,17 @@ +{ + "tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[Salesforce/blip-image-captioning-base-44]": { + "generated_text": "a soccer player is playing a game on the app" + }, + "tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[nlpconnect/vit-gpt2-image-captioning-44]": { + "generated_text": "a soccer game with a player jumping to catch" + }, + 
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/hf-seamless-m4t-medium]": { + "sampling_rate": 16000 + }, + "tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/mms-tts-eng]": { + "sampling_rate": 16000 + }, + "tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[microsoft/speecht5_tts]": { + "sampling_rate": 16000 + } +} \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_sentence_transformers.json b/tests/baselines/fixture/tests/test_sentence_transformers.json new file mode 100644 index 0000000000..23f4f6af97 --- /dev/null +++ b/tests/baselines/fixture/tests/test_sentence_transformers.json @@ -0,0 +1,106 @@ +{ + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-MiniLM-L12-v2]": { + "gaudi1": { + "measured_throughput": 1252.6261862281467 + }, + "gaudi2": { + "measured_throughput": 3614.2610109716247 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-MiniLM-L6-v2]": { + "gaudi1": { + "measured_throughput": 1109.160132821451 + }, + "gaudi2": { + "measured_throughput": 2615.6975354038477 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-distilroberta-v1]": { + "gaudi1": { + "measured_throughput": 226.90237421623164 + }, + "gaudi2": { + "measured_throughput": 958.5097903298335 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/all-mpnet-base-v2]": { + "gaudi1": { + "measured_throughput": 164.36556936723508 + }, + "gaudi2": { + "measured_throughput": 762.5595168883357 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/distiluse-base-multilingual-cased-v1]": { + "gaudi1": { + "measured_throughput": 947.844857744754 + }, + "gaudi2": { + "measured_throughput": 3487.3319366004903 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/distiluse-base-multilingual-cased-v2]": { + "gaudi1": { + "measured_throughput": 947.7317550605878 + }, + "gaudi2": { + "measured_throughput": 3807.2486282025716 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-MiniLM-L6-cos-v1]": { + "gaudi1": { + "measured_throughput": 471.14320842607674 + }, + "gaudi2": { + "measured_throughput": 1208.3672807492396 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-distilbert-cos-v1]": { + "gaudi1": { + "measured_throughput": 216.47035182888888 + }, + "gaudi2": { + "measured_throughput": 944.6166139694299 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/multi-qa-mpnet-base-dot-v1]": { + "gaudi1": { + "measured_throughput": 116.82789535569364 + }, + "gaudi2": { + "measured_throughput": 545.3360251829846 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-MiniLM-L3-v2]": { + "gaudi1": { + "measured_throughput": 3029.398417051629 + }, + "gaudi2": { + "measured_throughput": 5734.318427972881 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-albert-small-v2]": { + "gaudi1": { + "measured_throughput": 1139.806075824319 + }, + "gaudi2": { + "measured_throughput": 3896.1911011860166 + } + }, + 
"tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2]": { + "gaudi1": { + "measured_throughput": 1253.06776127632 + }, + "gaudi2": { + "measured_throughput": 3558.0778715789693 + } + }, + "tests/test_sentence_transformers.py::test_compute_embeddings_throughput[sentence-transformers/paraphrase-multilingual-mpnet-base-v2]": { + "gaudi1": { + "measured_throughput": 518.4762252952173 + }, + "gaudi2": { + "measured_throughput": 2392.1654748794062 + } + } +} \ No newline at end of file diff --git a/tests/baselines/fixture/tests/test_text_generation_example.json b/tests/baselines/fixture/tests/test_text_generation_example.json new file mode 100644 index 0000000000..de9b3f1014 --- /dev/null +++ b/tests/baselines/fixture/tests/test_text_generation_example.json @@ -0,0 +1,444 @@ +{ + "tests/test_text_generation_example.py::test_text_generation_awq[TheBloke/Llama-2-7b-Chat-AWQ-1-10-False-128-2048]": { + "gaudi2": { + "throughput": 456.7 + } + }, + "tests/test_text_generation_example.py::test_text_generation_beam_search[Qwen/Qwen2-7b-Instruct-1-True]": { + "gaudi2": { + "throughput": 91.24938949709826 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[CohereForAI/c4ai-command-r-v01-1-False-False]": { + "gaudi2": { + "throughput": 29.50315234651154 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Deci/DeciLM-7B-1-False-False]": { + "gaudi2": { + "throughput": 115 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-j-6b-1-False-False]": { + "gaudi2": { + "throughput": 160.5823842101192 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-j-6b-1-True-False]": { + "gaudi1": { + "throughput": 156.2893125740893 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-neo-2.7B-1-False-False]": { + "gaudi2": { + "throughput": 257.2476416844122 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[EleutherAI/gpt-neox-20b-1-False-False]": { + "gaudi2": { + "throughput": 50.67672679310354 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-7B-1-False-False]": { + "gaudi1": { + "throughput": 39.29068423087616 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-7B-4-False-False]": { + "gaudi2": { + "throughput": 490.8621617893209 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen1.5-MoE-A2.7B-1-True-False]": { + "gaudi2": { + "throughput": 44.25834541569395 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen2-7B-256-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models, including transformers, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. 
DeepSpeed is built on top of Py", + "throughput": 8870.945160540245 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Qwen/Qwen2.5-7B-4-False-False]": { + "gaudi2": { + "throughput": 490 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[Salesforce/codegen2-1B-1-False-False]": { + "gaudi1": { + "throughput": 155.32071248826423 + }, + "gaudi2": { + "throughput": 446.4029486883532 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[THUDM/chatglm2-6b-1-True-False]": { + "gaudi2": { + "throughput": 150 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[THUDM/chatglm3-6b-1-True-False]": { + "gaudi2": { + "throughput": 150 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[adept/persimmon-8b-base-1-False-False]": { + "gaudi1": { + "throughput": 34.53559807384106 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[adept/persimmon-8b-base-4-False-False]": { + "gaudi2": { + "throughput": 366.73968820698406 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[baichuan-inc/Baichuan2-13B-Chat-1-False-False]": { + "gaudi2": { + "throughput": 66 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[baichuan-inc/Baichuan2-7B-Chat-1-True-False]": { + "gaudi2": { + "throughput": 108 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder-1-False-False]": { + "gaudi1": { + "throughput": 15.945023767901013 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder-256-True-True]": { + "gaudi2": { + "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ", + "throughput": 6846.575763562658 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder2-3b-1-False-False]": { + "gaudi1": { + "throughput": 82.09655684566117 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigcode/starcoder2-3b-1-False-True]": { + "gaudi2": { + "output": "def print_hello_world():\n print(\"Hello World\")\n\ndef print_hello_world_with_name(name):\n print(\"Hello World, \" + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print(\"Hello World, \" + name + \", \" + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print(\"Hello", + "throughput": 261.07213776344133 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[bigscience/bloomz-7b1-1-False-False]": { + "gaudi1": { + "throughput": 41.7555095197846 + }, + "gaudi2": { + "throughput": 130.0472971205316 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[codellama/CodeLlama-34b-hf-1-True-False]": { + "gaudi2": { + "throughput": 32.644 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[deepseek-ai/DeepSeek-V2-Lite-1-False-False]": { + "gaudi2": { + "throughput": 35 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[facebook/xglm-1.7B-1-False-False]": { + "gaudi2": { + "throughput": 357.46365062825083 + } + }, + 
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-27b-1-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and beyond, using model parallelism to partition large models over multiple GPUs.\n\nThe following is a brief introduction to the DeepSpeed model parallel training.\n\n

1. Introduction

\n\nThe DeepSpeed model parallel training is a simple and effective way to train large models. It is a framework that enables you to train models with trillions of parameters and beyond.\n\nDeepSpeed is a distributed deep learning optimization toolkit that makes it easy and efficient", + "throughput": 36.578709544111 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-9b-1-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a powerful tool for researchers and practitioners working with large-scale deep learning models.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. It includes features such as zero-shot inference, which allows models to be", + "throughput": 92.302359446567 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-7b-1-False-False]": { + "gaudi1": { + "throughput": 28.84284625836978 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-7b-1-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,", + "throughput": 109.70751574382221 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[gpt2-xl-1-False-False]": { + "gaudi1": { + "throughput": 142.11481820425706 + }, + "gaudi2": { + "throughput": 281.8734689674413 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-1-True-False]": { + "gaudi1": { + "throughput": 44.39616259946937 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-1-True-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. 
DeepSpeed is designed to be easy to use and to provide a high level of flex", + "throughput": 141.25776956002076 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-512-False-False]": { + "gaudi2": { + "throughput": 8711 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Llama-2-7b-hf-512-True-False]": { + "gaudi2": { + "throughput": 12808 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[meta-llama/Meta-Llama-3-8B-1-True-False]": { + "gaudi2": { + "throughput": 129 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[microsoft/phi-2-1-False-False]": { + "gaudi1": { + "throughput": 92.53083167241344 + }, + "gaudi2": { + "throughput": 224.72307766211117 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mistralai/Mistral-7B-v0.1-1-True-False]": { + "gaudi1": { + "throughput": 41.21906841459711 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mistralai/Mistral-7B-v0.1-1-True-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system", + "throughput": 130.2172236767782 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mistralai/Mixtral-8x7B-v0.1-1-False-True]": { + "gaudi2": { + "output": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. 
It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed", + "throughput": 23.7931001677926 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mosaicml/mpt-30b-1-False-False]": { + "gaudi2": { + "throughput": 36.06464336116623 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[mosaicml/mpt-7b-1-False-False]": { + "gaudi1": { + "throughput": 45.45168927038262 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[openbmb/MiniCPM3-4B-1-False-False]": { + "gaudi2": { + "throughput": 65.116 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[stabilityai/stablelm-2-12b-1-False-False]": { + "gaudi1": { + "throughput": 26.80858949645992 + }, + "gaudi2": { + "throughput": 74.8904496532218 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[state-spaces/mamba-130m-hf-1536-False-False]": { + "gaudi2": { + "throughput": 5385.511100161605 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[state-spaces/mamba-130m-hf-224-False-False]": { + "gaudi1": { + "throughput": 794.542 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-40b-1-True-False]": { + "gaudi2": { + "throughput": 25.202450111088346 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-7b-1-True-False]": { + "gaudi1": { + "throughput": 44.82870145718665 + } + }, + "tests/test_text_generation_example.py::test_text_generation_bf16_1x[tiiuae/falcon-mamba-7b-1-False-False]": { + "gaudi2": { + "throughput": 47.1464839567739 + } + }, + "tests/test_text_generation_example.py::test_text_generation_contrastive_search[gpt2-xl-1-False]": { + "gaudi1": { + "throughput": 34.48141280163397 + }, + "gaudi2": { + "throughput": 51.61471298016438 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[Qwen/Qwen2.5-72B-2-1]": { + "gaudi2": { + "throughput": 26 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[bigscience/bloomz-7b1-8-1]": { + "gaudi1": { + "throughput": 31.994268212011505 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[bigscience/bloomz-8-1]": { + "gaudi2": { + "throughput": 36.77314954096159 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[facebook/opt-66b-2-1]": { + "gaudi2": { + "throughput": 28.48069266504111 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[google/gemma-2-27b-8-1]": { + "gaudi2": { + "throughput": 87.578709544111 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[google/gemma-2-9b-8-1]": { + "gaudi2": { + "throughput": 110.12610917383735 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[meta-llama/Llama-2-70b-hf-8-1]": { + "gaudi2": { + "throughput": 64.10514998902435 + } + }, + "tests/test_text_generation_example.py::test_text_generation_deepspeed[meta-llama/Meta-Llama-3-70B-Instruct-8-1]": { + "gaudi2": { + "throughput": 64 + } + }, + "tests/test_text_generation_example.py::test_text_generation_distributed_tp[meta-llama/Llama-2-7b-hf]": { + "gaudi2": { + "throughput": 1345.2369318328463 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-207-False-2048-128]": { + "gaudi2": { + "throughput": 568.5 + } + }, + 
"tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-3042-False-128-128]": { + "gaudi2": { + "throughput": 5374.6 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-4-750-False-128-2048]": { + "gaudi2": { + "throughput": 7422.4 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-70b-hf-8-172-False-2048-2048]": { + "gaudi2": { + "throughput": 4656.2 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-1230-False-128-128]": { + "gaudi2": { + "throughput": 13152.7 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-163-False-128-2048]": { + "gaudi2": { + "throughput": 4774.7 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-81-False-2048-2048]": { + "gaudi2": { + "throughput": 1942.9 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[meta-llama/Llama-2-7b-hf-1-94-False-2048-128]": { + "gaudi2": { + "throughput": 1293.3 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[microsoft/phi-2-1-1-True-128-128]": { + "gaudi2": { + "throughput": 254.08932787178165 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-120-True-128-2048]": { + "gaudi2": { + "throughput": 6979.225194247115 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-120-True-2048-128]": { + "gaudi2": { + "throughput": 1681.4401450088983 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-44-True-2048-2048]": { + "gaudi2": { + "throughput": 3393.149396451692 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mistral-7B-Instruct-v0.2-1-896-True-128-128]": { + "gaudi2": { + "throughput": 17068.965283763682 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-1-1-True-128-128]": { + "gaudi2": { + "throughput": 40.94 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-48-True-2048-2048]": { + "gaudi2": { + "throughput": 1147.5 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-768-True-128-128]": { + "gaudi2": { + "throughput": 3428.65 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-96-True-128-2048]": { + "gaudi2": { + "throughput": 2570.34 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[mistralai/Mixtral-8x7B-v0.1-2-96-True-2048-128]": { + "gaudi2": { + "throughput": 379.03 + } + }, + "tests/test_text_generation_example.py::test_text_generation_fp8[tiiuae/falcon-180B-4-950-True-128-128]": { + "gaudi2": { + "throughput": 2506.68 + } + }, + "tests/test_text_generation_example.py::test_text_generation_gptq[TheBloke/Llama-2-7b-Chat-GPTQ-1-10-False-128-2048]": { + "gaudi2": { + "throughput": 456.7 + } + }, + "tests/test_text_generation_example.py::test_text_generation_torch_compile[meta-llama/Llama-2-7b-hf]": { + "gaudi2": { + "throughput": 102.27823420713148 + } + }, + "tests/test_text_generation_example.py::test_text_generation_torch_compile_distributed[meta-llama/Llama-2-7b-hf]": { + "gaudi2": { + "throughput": 39.72973199515235 + } + } +} \ No 
newline at end of file diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py index 308a3fe242..b9be7b77f6 100644 --- a/tests/test_encoder_decoder.py +++ b/tests/test_encoder_decoder.py @@ -11,41 +11,33 @@ from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "summarization": { - "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 4.339, 28.9801, 2, 2), - ("t5-3b", "Habana/t5", 3.848, 21.8877, 2, 1), - ], - }, - "translation": { - "bf16": [ - ("t5-small", "Habana/t5", 11.648, 11.7277, 2, 1), - ], - }, - } -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = { - "summarization": { - "bf16": [ - ("facebook/bart-large-cnn", "Habana/bart", 2.304, 29.174, 2, 2), - ("t5-3b", "Habana/t5", 1.005, 21.7286, 2, 1), - ], - }, - "translation": { - "bf16": [ - ("t5-small", "Habana/t5", 9.188, 11.6126, 2, 1), - ], - }, - } +MODELS_TO_TEST = { + "summarization": { + "bf16": [ + ("facebook/bart-large-cnn", "Habana/bart", 2, 2), + ("t5-3b", "Habana/t5", 2, 1), + ], + }, + "translation": { + "bf16": [ + ("t5-small", "Habana/t5", 2, 1), + ], + }, +} class TestEncoderDecoderModels: PATH_TO_EXAMPLE_DIR = Path(__file__).resolve().parent.parent / "examples" + @pytest.fixture(autouse=True) + def _pretest(self, baseline): + """ + This is automatically called before each test function is executed. + + Collect custom fixtures (from conftest.py). + """ + self.baseline = baseline + def _install_requirements(self, task: str): cmd_line = f"pip install -r {self.PATH_TO_EXAMPLE_DIR / task / 'requirements.txt'}".split() p = subprocess.Popen(cmd_line) @@ -80,8 +72,6 @@ def _run_test( self, command: List[str], task: str, - baseline: float, - baseline_acc: float, ): with TemporaryDirectory() as tmp_dir: command.append(f"--output_dir {tmp_dir}") @@ -93,35 +83,36 @@ def _run_test( proc = subprocess.run(command) # Ensure the run finished without any issue - # Use try-except to avoid logging the token if used - try: - assert proc.returncode == 0 - except AssertionError as e: - if "'--token', 'hf_" in e.args[0]: - e.args = (f"The following command failed:\n{' '.join(command[:-2])}",) - raise + assert proc.returncode == 0 with open(Path(tmp_dir) / "predict_results.json") as fp: results = json.load(fp) + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Ensure performance requirements (throughput) are met - assert results["predict_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline + self.baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + predict_samples_per_second=results["predict_samples_per_second"], + ) if task == "summarization": accuracy_metric = "predict_rougeLsum" elif task == "translation": accuracy_metric = "predict_bleu" - assert results[accuracy_metric] >= ACCURACY_PERF_FACTOR * baseline_acc + self.baseline.assertRef( + compare=lambda actual, ref: actual >= ACCURACY_PERF_FACTOR * ref, + context=[device], + **{accuracy_metric: results[accuracy_metric]}, + ) def _test_text_summarization( self, model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, batch_size: int, num_beams: int, - token: str, deepspeed: bool = False, world_size: int = 8, ): @@ -159,17 +150,14 @@ def _test_text_summarization( if not deepspeed and model_name == "t5-3b": command.append("--bf16_full_eval") - self._run_test(command, task, baseline, baseline_acc) + self._run_test(command, task) def _test_text_translation( 
self, model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, batch_size: int, num_beams: int, - token: str, deepspeed: bool = False, world_size: int = 8, ): @@ -213,36 +201,30 @@ def _test_text_translation( command_args=command_args, ) - self._run_test(command, task, baseline, baseline_acc) + self._run_test(command, task) @pytest.mark.parametrize( - "model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams", + "model_name, gaudi_config, batch_size, num_beams", MODELS_TO_TEST["summarization"]["bf16"], ) def test_text_summarization_bf16( self, model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, batch_size: int, num_beams: int, - token: str, ): - self._test_text_summarization(model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams, token) + self._test_text_summarization(model_name, gaudi_config, batch_size, num_beams) @pytest.mark.parametrize( - "model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams", + "model_name, gaudi_config, batch_size, num_beams", MODELS_TO_TEST["translation"]["bf16"], ) def test_text_translation_bf16( self, model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, batch_size: int, num_beams: int, - token: str, ): - self._test_text_translation(model_name, gaudi_config, baseline, baseline_acc, batch_size, num_beams, token) + self._test_text_translation(model_name, gaudi_config, batch_size, num_beams) diff --git a/tests/test_fp8_examples.py b/tests/test_fp8_examples.py index 27020a2b8f..4e2382b8b7 100644 --- a/tests/test_fp8_examples.py +++ b/tests/test_fp8_examples.py @@ -18,8 +18,6 @@ "mistralai/Mistral-7B-Instruct-v0.2", "tatsu-lab/alpaca", "", - 12.373, - 0.7538, "language-modeling", 8, 8, @@ -36,8 +34,7 @@ def _test_fp8_train( model_name: str, dataset_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, + baseline, task: str, batch_size_train: int, batch_size_eval: int, @@ -112,27 +109,34 @@ def _test_fp8_train( with open(Path(tmp_dir) / "all_results.json") as fp: results = json.load(fp) + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Ensure performance requirements (throughput) are met - assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline - assert results["eval_accuracy"] >= ACCURACY_PERF_FACTOR * baseline_acc + baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + train_samples_per_second=results["train_samples_per_second"], + ) + baseline.assertRef( + compare=lambda actual, ref: actual >= ACCURACY_PERF_FACTOR * ref, + context=[device], + eval_accuracy=results["eval_accuracy"], + ) @pytest.mark.parametrize( - "model_name, dataset_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script", + "model_name, dataset_name, gaudi_config, task, bs_train, bs_eval, script", MODELS_TO_TEST["fp8"], ) def test_fp8_train( model_name: str, dataset_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, task: str, bs_train: int, bs_eval: int, script: str, - token: str, + baseline, + token, ): - _test_fp8_train( - model_name, dataset_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, token - ) + _test_fp8_train(model_name, dataset_name, gaudi_config, baseline, task, bs_train, bs_eval, script, token) diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py index 180a2bb3f9..90931e1e25 100644 --- a/tests/test_fsdp_examples.py +++ b/tests/test_fsdp_examples.py 
@@ -17,8 +17,6 @@ ( "bert-base-uncased", "Habana/bert-base-uncased", - 2983.533, - 85.7077, "question-answering", 24, 8, @@ -28,8 +26,6 @@ ( "meta-llama/Llama-2-7b-hf", "", - 85.016, - 0.9093, "language-modeling", 8, 8, @@ -46,8 +42,7 @@ def _test_fsdp( model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, + baseline, task: str, batch_size_train: int, batch_size_eval: int, @@ -150,27 +145,38 @@ def _test_fsdp( with open(Path(tmp_dir) / "all_results.json") as fp: results = json.load(fp) + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Ensure performance requirements (throughput) are met - assert results["train_samples_per_second"] >= (2 - TIME_PERF_FACTOR) * baseline + baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + train_samples_per_second=results["train_samples_per_second"], + ) if model_name == "bert-base-uncased": - assert results["eval_f1"] >= ACCURACY_PERF_FACTOR * baseline_acc + baseline.assertRef( + compare=lambda actual, ref: actual >= ACCURACY_PERF_FACTOR * ref, + context=[device], + eval_f1=results["eval_f1"], + ) else: - assert results["train_loss"] <= (2 - ACCURACY_PERF_FACTOR) * baseline_acc + baseline.assertRef( + compare=lambda actual, ref: actual <= (2 - ACCURACY_PERF_FACTOR) * ref, + context=[device], + train_loss=results["train_loss"], + ) -@pytest.mark.parametrize( - "model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy", MODELS_TO_TEST["bf16"] -) +@pytest.mark.parametrize("model_name, gaudi_config, task, bs_train, bs_eval, script, policy", MODELS_TO_TEST["bf16"]) def test_fsdp_bf16( model_name: str, gaudi_config: str, - baseline: float, - baseline_acc: float, task: str, bs_train: int, bs_eval: int, script: str, policy: str, - token: str, + baseline, + token, ): - _test_fsdp(model_name, gaudi_config, baseline, baseline_acc, task, bs_train, bs_eval, script, policy, token) + _test_fsdp(model_name, gaudi_config, baseline, task, bs_train, bs_eval, script, policy, token) diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py index 538ca8c182..80983330e5 100644 --- a/tests/test_image_to_text_example.py +++ b/tests/test_image_to_text_example.py @@ -14,34 +14,34 @@ # Gaudi2 CI baselines MODELS_TO_TEST = { "bf16": [ - # ("llava-hf/llava-1.5-7b-hf", 1, 77.98733740859008), - # ("llava-hf/llava-1.5-13b-hf", 1, 48.54364937033955), - ("llava-hf/llava-v1.6-mistral-7b-hf", 1, 33.17984878151546), - ("llava-hf/llava-v1.6-vicuna-7b-hf", 1, 35.00608681379742), - ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 23.527610042925), - ("google/paligemma-3b-mix-224", 1, 132.8949150246155), - ("HuggingFaceM4/idefics2-8b", 1, 21.89944593215077), - ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 18.974541922240313), - ("tiiuae/falcon-11B-vlm", 1, 23.69260849957278), - ("Qwen/Qwen2-VL-2B-Instruct", 1, 28.755882208438422), - ("Qwen/Qwen2-VL-7B-Instruct", 1, 19.32562189532818), + # ("llava-hf/llava-1.5-7b-hf", 1), + # ("llava-hf/llava-1.5-13b-hf", 1), + ("llava-hf/llava-v1.6-mistral-7b-hf", 1), + ("llava-hf/llava-v1.6-vicuna-7b-hf", 1), + ("llava-hf/llava-v1.6-vicuna-13b-hf", 1), + ("google/paligemma-3b-mix-224", 1), + ("HuggingFaceM4/idefics2-8b", 1), + ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1), + ("tiiuae/falcon-11B-vlm", 1), + ("Qwen/Qwen2-VL-2B-Instruct", 1), + ("Qwen/Qwen2-VL-7B-Instruct", 1), ], "fp8": [ - # ("llava-hf/llava-1.5-7b-hf", 1, 98.72578382705062), - # ("llava-hf/llava-1.5-13b-hf", 1, 
67.20488222876344), - ("llava-hf/llava-v1.6-mistral-7b-hf", 1, 45.011551008367084), - ("llava-hf/llava-v1.6-vicuna-7b-hf", 1, 45.18544502949674), - ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 30.9535718774675), + # ("llava-hf/llava-1.5-7b-hf", 1), + # ("llava-hf/llava-1.5-13b-hf", 1), + ("llava-hf/llava-v1.6-mistral-7b-hf", 1), + ("llava-hf/llava-v1.6-vicuna-7b-hf", 1), + ("llava-hf/llava-v1.6-vicuna-13b-hf", 1), ], } else: # Gaudi1 CI baselines MODELS_TO_TEST = { "bf16": [ - ("llava-hf/llava-1.5-7b-hf", 1, 28.04096918512148), - ("llava-hf/llava-1.5-13b-hf", 1, 16.704731010481538), - ("llava-hf/llava-v1.6-mistral-7b-hf", 1, 10.759228696741), - ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 6.96732060769783), + ("llava-hf/llava-1.5-7b-hf", 1), + ("llava-hf/llava-1.5-13b-hf", 1), + ("llava-hf/llava-v1.6-mistral-7b-hf", 1), + ("llava-hf/llava-v1.6-vicuna-13b-hf", 1), ], "fp8": [], } @@ -49,7 +49,7 @@ def _test_image_to_text( model_name: str, - baseline: float, + baseline, token: str, batch_size: int = 1, fp8: bool = False, @@ -118,15 +118,21 @@ def _test_image_to_text( with open(Path(tmp_dir) / "results.json") as fp: results = json.load(fp) + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Ensure performance requirements (throughput) are met - assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline + baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + throughput=results["throughput"], + ) -@pytest.mark.parametrize("model_name, batch_size, baseline", MODELS_TO_TEST["bf16"]) -def test_image_to_text_bf16(model_name: str, baseline: float, batch_size: int, token: str): +@pytest.mark.parametrize("model_name, batch_size", MODELS_TO_TEST["bf16"]) +def test_image_to_text_bf16(model_name: str, batch_size: int, baseline, token): _test_image_to_text(model_name, baseline, token, batch_size) -@pytest.mark.parametrize("model_name, batch_size, baseline", MODELS_TO_TEST["fp8"]) -def test_image_to_text_fp8(model_name: str, baseline: float, batch_size: int, token: str): +@pytest.mark.parametrize("model_name, batch_size", MODELS_TO_TEST["fp8"]) +def test_image_to_text_fp8(model_name: str, batch_size: int, baseline, token): _test_image_to_text(model_name, baseline, token, batch_size, fp8=True) diff --git a/tests/test_openclip_vqa.py b/tests/test_openclip_vqa.py index c0c3d38521..812db05645 100644 --- a/tests/test_openclip_vqa.py +++ b/tests/test_openclip_vqa.py @@ -10,22 +10,12 @@ from .test_examples import TIME_PERF_FACTOR -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ("laion/CLIP-ViT-g-14-laion2B-s12B-b42K", 1472), - ("microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", 1816), - ], - } -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = { - "bf16": [ - ("laion/CLIP-ViT-g-14-laion2B-s12B-b42K", 550), - ("microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", 1200), - ], - } +MODELS_TO_TEST = { + "bf16": [ + "laion/CLIP-ViT-g-14-laion2B-s12B-b42K", + "microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", + ], +} def _install_requirements(): @@ -38,7 +28,7 @@ def _install_requirements(): assert return_code == 0 -def _test_openclip_vqa(model_name: str, baseline: float): +def _test_openclip_vqa(model_name: str, baseline): _install_requirements() command = ["python3"] path_to_example_dir = Path(__file__).resolve().parent.parent / "examples" @@ -72,10 +62,16 @@ def _test_openclip_vqa(model_name: str, baseline: float): with open(Path(tmp_dir) / 
"results.json") as fp: results = json.load(fp) + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Ensure performance requirements (throughput) are met - assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline + baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + throughput=results["throughput"], + ) -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["bf16"]) -def test_openclip_vqa_bf16(model_name: str, baseline: float): +@pytest.mark.parametrize("model_name", MODELS_TO_TEST["bf16"]) +def test_openclip_vqa_bf16(model_name: str, baseline): _test_openclip_vqa(model_name, baseline) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 200f2a78a2..ca53718f54 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import operator import os import numpy as np @@ -27,20 +28,20 @@ MODELS_TO_TEST = { "text-to-speech": [ - ("microsoft/speecht5_tts", 16000), - ("facebook/hf-seamless-m4t-medium", 16000), - ("facebook/mms-tts-eng", 16000), + "microsoft/speecht5_tts", + "facebook/hf-seamless-m4t-medium", + "facebook/mms-tts-eng", ], "image-to-text": [ - ("Salesforce/blip-image-captioning-base", "a soccer player is playing a game on the app"), - ("nlpconnect/vit-gpt2-image-captioning", "a soccer game with a player jumping to catch"), + ("Salesforce/blip-image-captioning-base", 44), + ("nlpconnect/vit-gpt2-image-captioning", 44), ], } class TestGaudiPipeline: - @pytest.mark.parametrize("model, expected_result", MODELS_TO_TEST["image-to-text"]) - def test_image_to_text(self, model, expected_result): + @pytest.mark.parametrize("model, validate_length", MODELS_TO_TEST["image-to-text"]) + def test_image_to_text(self, model, validate_length, baseline): adapt_transformers_to_gaudi() MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32] generate_kwargs = { @@ -60,10 +61,12 @@ def test_image_to_text(self, model, expected_result): generator.model = wrap_in_hpu_graph(generator.model) for i in range(3): output = generator(image, generate_kwargs=generate_kwargs) - assert output[0]["generated_text"].startswith(expected_result) - @pytest.mark.parametrize("model, expected_sample_rate", MODELS_TO_TEST["text-to-speech"]) - def test_text_to_speech(self, model, expected_sample_rate): + result = output[0]["generated_text"][:validate_length] + baseline.assertRef(compare=operator.eq, generated_text=result) + + @pytest.mark.parametrize("model", MODELS_TO_TEST["text-to-speech"]) + def test_text_to_speech(self, model, baseline): adapt_transformers_to_gaudi() MODEL_DTYPE_LIST = [torch.bfloat16, torch.float32] text = "hello, the dog is cooler" @@ -95,4 +98,5 @@ def test_text_to_speech(self, model, expected_sample_rate): for i in range(3): output = generator(text, forward_params=forward_params, generate_kwargs=generate_kwargs) assert isinstance(output["audio"], np.ndarray) - assert output["sampling_rate"] == expected_sample_rate + + baseline.assertRef(compare=operator.eq, sampling_rate=output["sampling_rate"]) diff --git a/tests/test_sentence_transformers.py b/tests/test_sentence_transformers.py index 90d97f3005..f9b3033a7f 100644 --- a/tests/test_sentence_transformers.py +++ b/tests/test_sentence_transformers.py @@ -9,45 +9,26 @@ from .test_examples import TIME_PERF_FACTOR -if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines - MODELS_TO_TEST = [ - 
("sentence-transformers/all-mpnet-base-v2", 762.5595168883357), - ("sentence-transformers/multi-qa-mpnet-base-dot-v1", 545.3360251829846), - ("sentence-transformers/all-distilroberta-v1", 958.5097903298335), - ("sentence-transformers/all-MiniLM-L12-v2", 3614.2610109716247), - ("sentence-transformers/multi-qa-distilbert-cos-v1", 944.6166139694299), - ("sentence-transformers/all-MiniLM-L6-v2", 2615.6975354038477), - ("sentence-transformers/multi-qa-MiniLM-L6-cos-v1", 1208.3672807492396), - ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 2392.1654748794062), - ("sentence-transformers/paraphrase-albert-small-v2", 3896.1911011860166), - ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 3558.0778715789693), - ("sentence-transformers/paraphrase-MiniLM-L3-v2", 5734.318427972881), - ("sentence-transformers/distiluse-base-multilingual-cased-v1", 3487.3319366004903), - ("sentence-transformers/distiluse-base-multilingual-cased-v2", 3807.2486282025716), - ] -else: - # Gaudi1 CI baselines - MODELS_TO_TEST = [ - ("sentence-transformers/all-mpnet-base-v2", 164.36556936723508), - ("sentence-transformers/multi-qa-mpnet-base-dot-v1", 116.82789535569364), - ("sentence-transformers/all-distilroberta-v1", 226.90237421623164), - ("sentence-transformers/all-MiniLM-L12-v2", 1252.6261862281467), - ("sentence-transformers/multi-qa-distilbert-cos-v1", 216.47035182888888), - ("sentence-transformers/all-MiniLM-L6-v2", 1109.160132821451), - ("sentence-transformers/multi-qa-MiniLM-L6-cos-v1", 471.14320842607674), - ("sentence-transformers/paraphrase-multilingual-mpnet-base-v2", 518.4762252952173), - ("sentence-transformers/paraphrase-albert-small-v2", 1139.806075824319), - ("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 1253.06776127632), - ("sentence-transformers/paraphrase-MiniLM-L3-v2", 3029.398417051629), - ("sentence-transformers/distiluse-base-multilingual-cased-v1", 947.844857744754), - ("sentence-transformers/distiluse-base-multilingual-cased-v2", 947.7317550605878), - ] +MODELS_TO_TEST = [ + "sentence-transformers/all-mpnet-base-v2", + "sentence-transformers/multi-qa-mpnet-base-dot-v1", + "sentence-transformers/all-distilroberta-v1", + "sentence-transformers/all-MiniLM-L12-v2", + "sentence-transformers/multi-qa-distilbert-cos-v1", + "sentence-transformers/all-MiniLM-L6-v2", + "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", + "sentence-transformers/paraphrase-albert-small-v2", + "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", + "sentence-transformers/paraphrase-MiniLM-L3-v2", + "sentence-transformers/distiluse-base-multilingual-cased-v1", + "sentence-transformers/distiluse-base-multilingual-cased-v2", +] def _test_sentence_transformers( model_name: str, - baseline: float, + baseline, ): model = SentenceTransformer(model_name) @@ -74,10 +55,17 @@ def _test_sentence_transformers( end_time = time.perf_counter() diff_time = end_time - start_time measured_throughput = len(sentences) / diff_time + + device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1" + # Only assert the last measured throughtput as the first iteration is used as a warmup - assert measured_throughput >= (2 - TIME_PERF_FACTOR) * baseline + baseline.assertRef( + compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref, + context=[device], + measured_throughput=measured_throughput, + ) -@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST) -def 
test_compute_embeddings_throughput(model_name: str, baseline: float): +@pytest.mark.parametrize("model_name", MODELS_TO_TEST) +def test_compute_embeddings_throughput(model_name: str, baseline): _test_sentence_transformers(model_name, baseline) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index c217ee9a79..0c513d8bb1 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -1,4 +1,5 @@ import json +import operator import os import re import subprocess @@ -18,154 +19,142 @@ prev_quant_rank = 0 if os.environ.get("GAUDI2_CI", "0") == "1": - # Gaudi2 CI baselines + # Gaudi2 CI MODELS_TO_TEST = { "bf16_1x": [ - ("bigscience/bloomz-7b1", 1, False, 130.0472971205316, False), - ("gpt2-xl", 1, False, 281.8734689674413, False), - ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192, False), - ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354, False), - ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076, True), - ("tiiuae/falcon-40b", 1, True, 25.202450111088346, False), - ("bigcode/starcoder", 256, True, 6846.575763562658, True), - ("Salesforce/codegen2-1B", 1, False, 446.4029486883532, False), - ("mosaicml/mpt-30b", 1, False, 36.06464336116623, False), - ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782, True), - ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926, True), - ("microsoft/phi-2", 1, False, 224.72307766211117, False), - ("meta-llama/Meta-Llama-3-8B", 1, True, 129, False), - ("meta-llama/Llama-2-7b-hf", 512, True, 12808, False), - ("meta-llama/Llama-2-7b-hf", 512, False, 8711, False), # in some cases like TGI, reuse_cache isn't used - ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218, False), - ("codellama/CodeLlama-34b-hf", 1, True, 32.644, False), - ("bigcode/starcoder2-3b", 1, False, 261.07213776344133, True), - ("adept/persimmon-8b-base", 4, False, 366.73968820698406, False), - # ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209, False), - ("google/gemma-7b", 1, False, 109.70751574382221, True), - ("google/gemma-2-9b", 1, False, 92.302359446567, True), - ("google/gemma-2-27b", 1, False, 36.578709544111, True), - ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605, False), - # ("Deci/DeciLM-7B", 1, False, 115, False), - ("Qwen/Qwen2-7B", 256, False, 8870.945160540245, True), - ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395, False), - # ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122, False), - # ("facebook/xglm-1.7B", 1, False, 357.46365062825083, False), - # ("CohereForAI/c4ai-command-r-v01", 1, False, 29.50315234651154, False), - ("tiiuae/falcon-mamba-7b", 1, False, 47.1464839567739, False), - ("openbmb/MiniCPM3-4B", 1, False, 65.116, False), - ("baichuan-inc/Baichuan2-7B-Chat", 1, True, 108, False), - ("baichuan-inc/Baichuan2-13B-Chat", 1, False, 66, False), - ("deepseek-ai/DeepSeek-V2-Lite", 1, False, 35, False), - ("THUDM/chatglm2-6b", 1, True, 150, False), - ("THUDM/chatglm3-6b", 1, True, 150, False), - ("Qwen/Qwen2.5-7B", 4, False, 490, False), + ("bigscience/bloomz-7b1", 1, False, False), + ("gpt2-xl", 1, False, False), + ("EleutherAI/gpt-j-6b", 1, False, False), + ("EleutherAI/gpt-neox-20b", 1, False, False), + ("meta-llama/Llama-2-7b-hf", 1, True, True), + ("tiiuae/falcon-40b", 1, True, False), + ("bigcode/starcoder", 256, True, True), + ("Salesforce/codegen2-1B", 1, False, False), + ("mosaicml/mpt-30b", 1, False, False), + ("mistralai/Mistral-7B-v0.1", 1, True, True), + ("mistralai/Mixtral-8x7B-v0.1", 1, False, True), + 
("microsoft/phi-2", 1, False, False), + ("meta-llama/Meta-Llama-3-8B", 1, True, False), + ("meta-llama/Llama-2-7b-hf", 512, True, False), + ("meta-llama/Llama-2-7b-hf", 512, False, False), # in some cases like TGI, reuse_cache isn't used + ("stabilityai/stablelm-2-12b", 1, False, False), + ("codellama/CodeLlama-34b-hf", 1, True, False), + ("bigcode/starcoder2-3b", 1, False, True), + ("adept/persimmon-8b-base", 4, False, False), + # ("Qwen/Qwen1.5-7B", 4, False, False), + ("google/gemma-7b", 1, False, True), + ("google/gemma-2-9b", 1, False, True), + ("google/gemma-2-27b", 1, False, True), + ("state-spaces/mamba-130m-hf", 1536, False, False), + # ("Deci/DeciLM-7B", 1, False, False), + ("Qwen/Qwen2-7B", 256, False, True), + ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, False), + # ("EleutherAI/gpt-neo-2.7B", 1, False, False), + # ("facebook/xglm-1.7B", 1, False, False), + # ("CohereForAI/c4ai-command-r-v01", 1, False, False), + ("tiiuae/falcon-mamba-7b", 1, False, False), + ("openbmb/MiniCPM3-4B", 1, False, False), + ("baichuan-inc/Baichuan2-7B-Chat", 1, True, False), + ("baichuan-inc/Baichuan2-13B-Chat", 1, False, False), + ("deepseek-ai/DeepSeek-V2-Lite", 1, False, False), + ("THUDM/chatglm2-6b", 1, True, False), + ("THUDM/chatglm3-6b", 1, True, False), + ("Qwen/Qwen2.5-7B", 4, False, False), ], "fp8": [ - ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68), - ("meta-llama/Llama-2-7b-hf", 1, 1230, False, 128, 128, 13152.7), - ("meta-llama/Llama-2-7b-hf", 1, 163, False, 128, 2048, 4774.7), - ("meta-llama/Llama-2-7b-hf", 1, 94, False, 2048, 128, 1293.3), - ("meta-llama/Llama-2-7b-hf", 1, 81, False, 2048, 2048, 1942.9), - ("meta-llama/Llama-2-70b-hf", 4, 3042, False, 128, 128, 5374.6), - ("meta-llama/Llama-2-70b-hf", 4, 750, False, 128, 2048, 7422.4), - ("meta-llama/Llama-2-70b-hf", 4, 207, False, 2048, 128, 568.5), - ("meta-llama/Llama-2-70b-hf", 8, 172, False, 2048, 2048, 4656.2), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 896, True, 128, 128, 17068.965283763682), - # ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 128, 2048, 6979.225194247115), - # ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 2048, 128, 1681.4401450088983), - ("mistralai/Mistral-7B-Instruct-v0.2", 1, 44, True, 2048, 2048, 3393.149396451692), - ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 40.94), - ("mistralai/Mixtral-8x7B-v0.1", 2, 768, True, 128, 128, 3428.65), - # ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 128, 2048, 2570.34), - # ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 2048, 128, 379.03), - ("mistralai/Mixtral-8x7B-v0.1", 2, 48, True, 2048, 2048, 1147.50), - ("microsoft/phi-2", 1, 1, True, 128, 128, 254.08932787178165), + ("tiiuae/falcon-180B", 4, 950, True, 128, 128), + ("meta-llama/Llama-2-7b-hf", 1, 1230, False, 128, 128), + ("meta-llama/Llama-2-7b-hf", 1, 163, False, 128, 2048), + ("meta-llama/Llama-2-7b-hf", 1, 94, False, 2048, 128), + ("meta-llama/Llama-2-7b-hf", 1, 81, False, 2048, 2048), + ("meta-llama/Llama-2-70b-hf", 4, 3042, False, 128, 128), + ("meta-llama/Llama-2-70b-hf", 4, 750, False, 128, 2048), + ("meta-llama/Llama-2-70b-hf", 4, 207, False, 2048, 128), + ("meta-llama/Llama-2-70b-hf", 8, 172, False, 2048, 2048), + ("mistralai/Mistral-7B-Instruct-v0.2", 1, 896, True, 128, 128), + # ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 128, 2048), + # ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 2048, 128), + ("mistralai/Mistral-7B-Instruct-v0.2", 1, 44, True, 2048, 2048), + ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128), + ("mistralai/Mixtral-8x7B-v0.1", 2, 768, True, 
128, 128), + # ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 128, 2048), + # ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 2048, 128), + ("mistralai/Mixtral-8x7B-v0.1", 2, 48, True, 2048, 2048), + ("microsoft/phi-2", 1, 1, True, 128, 128), ], "load_quantized_model_with_autogptq": [ - ("TheBloke/Llama-2-7b-Chat-GPTQ", 1, 10, False, 128, 2048, 456.7), + ("TheBloke/Llama-2-7b-Chat-GPTQ", 1, 10, False, 128, 2048), ], "load_quantized_model_with_autoawq": [ - ("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048, 456.7), + ("TheBloke/Llama-2-7b-Chat-AWQ", 1, 10, False, 128, 2048), ], "deepspeed": [ - ("bigscience/bloomz", 8, 1, 36.77314954096159), - # ("meta-llama/Llama-2-70b-hf", 8, 1, 64.10514998902435), - ("meta-llama/Meta-Llama-3-70B-Instruct", 8, 1, 64), - ("facebook/opt-66b", 2, 1, 28.48069266504111), - ("google/gemma-2-9b", 8, 1, 110.12610917383735), - ("Qwen/Qwen2.5-72B", 2, 1, 26), - ("google/gemma-2-27b", 8, 1, 87.578709544111), + ("bigscience/bloomz", 8, 1), + # ("meta-llama/Llama-2-70b-hf", 8, 1), + ("meta-llama/Meta-Llama-3-70B-Instruct", 8, 1), + ("facebook/opt-66b", 2, 1), + ("google/gemma-2-9b", 8, 1), + ("Qwen/Qwen2.5-72B", 2, 1), + ("google/gemma-2-27b", 8, 1), ], "torch_compile": [ - ("meta-llama/Llama-2-7b-hf", 102.27823420713148), + "meta-llama/Llama-2-7b-hf", ], "torch_compile_distributed": [ - ("meta-llama/Llama-2-7b-hf", 39.72973199515235), + "meta-llama/Llama-2-7b-hf", ], "distributed_tp": [ - ("meta-llama/Llama-2-7b-hf", 1345.2369318328463), + "meta-llama/Llama-2-7b-hf", ], "contrastive_search": [ - ("gpt2-xl", 1, False, 51.61471298016438), + ("gpt2-xl", 1, False), ], "beam_search": [ - ("Qwen/Qwen2-7b-Instruct", 1, True, 91.24938949709826), + ("Qwen/Qwen2-7b-Instruct", 1, True), ], } - MODEL_OUTPUTS = { - "bigcode/starcoder": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ', - "bigcode/starcoder2-3b": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_with_name(name):\n print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print("Hello', - "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,", - "google/gemma-2-9b": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a powerful tool for researchers and practitioners working with large-scale deep learning models.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. 
-    MODEL_OUTPUTS = {
-        "bigcode/starcoder": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_twice():\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_thrice():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n\ndef print_hello_world_four_times():\n print_hello_world()\n print_hello_world()\n print_hello_world()\n ',
-        "bigcode/starcoder2-3b": 'def print_hello_world():\n print("Hello World")\n\ndef print_hello_world_with_name(name):\n print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n print("Hello',
-        "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models. DeepSpeed is designed to be scalable, and it can be used to train models on a single machine or on a cluster of machines. DeepSpeed is designed to be efficient,",
-        "google/gemma-2-9b": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a powerful tool for researchers and practitioners working with large-scale deep learning models.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. It includes features such as zero-shot inference, which allows models to be",
-        "google/gemma-2-27b": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and beyond, using model parallelism to partition large models over multiple GPUs.\n\nThe following is a brief introduction to the DeepSpeed model parallel training.\n\n1. Introduction\n\nThe DeepSpeed model parallel training is a simple and effective way to train large models. It is a framework that enables you to train models with trillions of parameters and beyond.\n\nDeepSpeed is a distributed deep learning optimization toolkit that makes it easy and efficient",
-        "meta-llama/Llama-2-7b-hf": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex",
-        "mistralai/Mistral-7B-v0.1": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system",
-        "mistralai/Mixtral-8x7B-v0.1": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed",
-        "Qwen/Qwen2-7B": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models, including transformers, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance. DeepSpeed is built on top of Py",
-    }
 else:
-    # Gaudi1 CI baselines
+    # Gaudi1 CI
     MODELS_TO_TEST = {
         "bf16_1x": [
-            ("bigscience/bloomz-7b1", 1, False, 41.7555095197846, False),
-            ("gpt2-xl", 1, False, 142.11481820425706, False),
+            ("bigscience/bloomz-7b1", 1, False, False),
+            ("gpt2-xl", 1, False, False),
             # TODO: fix OPT 6.7B
             # ("facebook/opt-6.7b", 0.0),
-            ("EleutherAI/gpt-j-6b", 1, True, 156.2893125740893, False),
-            ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937, False),
-            ("tiiuae/falcon-7b", 1, True, 44.82870145718665, False),
-            ("bigcode/starcoder", 1, False, 15.945023767901013, False),
-            ("Salesforce/codegen2-1B", 1, False, 155.32071248826423, False),
-            ("mosaicml/mpt-7b", 1, False, 45.45168927038262, False),
-            ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711, False),
-            ("microsoft/phi-2", 1, False, 92.53083167241344, False),
-            ("google/gemma-7b", 1, False, 28.84284625836978, False),
-            ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992, False),
-            ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616, False),
-            ("adept/persimmon-8b-base", 1, False, 34.53559807384106, False),
-            ("bigcode/starcoder2-3b", 1, False, 82.09655684566117, False),
-            ("state-spaces/mamba-130m-hf", 224, False, 794.542, False),
+            ("EleutherAI/gpt-j-6b", 1, True, False),
+            ("meta-llama/Llama-2-7b-hf", 1, True, False),
+            ("tiiuae/falcon-7b", 1, True, False),
+            ("bigcode/starcoder", 1, False, False),
+            ("Salesforce/codegen2-1B", 1, False, False),
+            ("mosaicml/mpt-7b", 1, False, False),
+            ("mistralai/Mistral-7B-v0.1", 1, True, False),
+            ("microsoft/phi-2", 1, False, False),
+            ("google/gemma-7b", 1, False, False),
+            ("stabilityai/stablelm-2-12b", 1, False, False),
+            ("Qwen/Qwen1.5-7B", 1, False, False),
+            ("adept/persimmon-8b-base", 1, False, False),
+            ("bigcode/starcoder2-3b", 1, False, False),
+            ("state-spaces/mamba-130m-hf", 224, False, False),
         ],
         "fp8": [],
         "load_quantized_model_with_autogptq": [],
         "load_quantized_model_with_autoawq": [],
         "deepspeed": [
-            ("bigscience/bloomz-7b1", 8, 1, 31.994268212011505),
+            ("bigscience/bloomz-7b1", 8, 1),
         ],
         "torch_compile": [],
         "torch_compile_distributed": [],
         "distributed_tp": [],
         "contrastive_search": [
-            ("gpt2-xl", 1, False, 34.48141280163397),
+            ("gpt2-xl", 1, False),
         ],
         "beam_search": [],
     }
-    MODEL_OUTPUTS = {}


 def _test_text_generation(
     model_name: str,
-    baseline: float,
+    baseline,
     token: str,
     batch_size: int = 1,
     reuse_cache: bool = False,
@@ -377,21 +366,23 @@ def _test_text_generation(
         with open(Path(tmp_dir) / "results.json") as fp:
             results = json.load(fp)

+        device = "gaudi2" if os.environ.get("GAUDI2_CI", "0") == "1" else "gaudi1"
+
         # Ensure performance requirements (throughput) are met
-        assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline
+        baseline.assertRef(
+            compare=lambda actual, ref: actual >= (2 - TIME_PERF_FACTOR) * ref,
+            context=[device],
+            throughput=results["throughput"],
+        )

         # Verify output for 1 HPU, BF16
         if check_output:
-            assert model_name in MODEL_OUTPUTS, (
-                f"Failed functional testing, missing expected output in MODEL_OUTPUTS for model {model_name}"
-            )
-            expected_output = MODEL_OUTPUTS[model_name]
-            assert results["output"][0][0] == expected_output
+            baseline.assertRef(compare=operator.eq, context=[device], output=results["output"][0][0])
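For text generation, a single baseline entry can hold both the throughput reference and, when check_output is enabled, the expected generation text, keyed by device. An illustrative sketch only (the parametrize id is elided, the output string is truncated here, and the numbers are the old meta-llama/Llama-2-7b-hf values from the removed Gaudi1/Gaudi2 lists and MODEL_OUTPUTS):

    {
      "tests/test_text_generation_example.py::test_text_generation_bf16_1x[...]": {
        "gaudi1": {
          "throughput": 44.39616259946937
        },
        "gaudi2": {
          "throughput": 141.25776956002076,
          "output": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, [...]"
        }
      }
    }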

-@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline, check_output", MODELS_TO_TEST["bf16_1x"])
+@pytest.mark.parametrize("model_name, batch_size, reuse_cache, check_output", MODELS_TO_TEST["bf16_1x"])
 def test_text_generation_bf16_1x(
-    model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str, check_output: bool
+    model_name: str, batch_size: int, reuse_cache: bool, check_output: bool, baseline, token
 ):
     _test_text_generation(
         model_name=model_name,
@@ -405,17 +396,17 @@ def test_text_generation_bf16_1x(

 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
 @pytest.mark.parametrize(
-    "model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline", MODELS_TO_TEST["fp8"]
+    "model_name, world_size, batch_size, reuse_cache, input_len, output_len", MODELS_TO_TEST["fp8"]
 )
 def test_text_generation_fp8(
     model_name: str,
-    baseline: float,
     world_size: int,
     batch_size: int,
     reuse_cache: bool,
     input_len: int,
     output_len: int,
-    token: str,
+    baseline,
+    token,
 ):
     deepspeed = True if world_size > 1 else False
     _test_text_generation(
@@ -434,18 +425,18 @@ def test_text_generation_fp8(

 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
 @pytest.mark.parametrize(
-    "model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline",
+    "model_name, world_size, batch_size, reuse_cache, input_len, output_len",
     MODELS_TO_TEST["load_quantized_model_with_autogptq"],
 )
 def test_text_generation_gptq(
     model_name: str,
-    baseline: float,
     world_size: int,
     batch_size: int,
     reuse_cache: bool,
     input_len: int,
     output_len: int,
-    token: str,
+    baseline,
+    token,
 ):
     deepspeed = True if world_size > 1 else False
     _test_text_generation(
@@ -465,18 +456,18 @@ def test_text_generation_gptq(

 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
 @pytest.mark.parametrize(
-    "model_name, world_size, batch_size, reuse_cache, input_len, output_len, baseline",
+    "model_name, world_size, batch_size, reuse_cache, input_len, output_len",
     MODELS_TO_TEST["load_quantized_model_with_autoawq"],
 )
 def test_text_generation_awq(
     model_name: str,
-    baseline: float,
     world_size: int,
     batch_size: int,
     reuse_cache: bool,
     input_len: int,
     output_len: int,
-    token: str,
+    baseline,
+    token,
 ):
     deepspeed = True if world_size > 1 else False
     _test_text_generation(
@@ -494,27 +485,27 @@ def test_text_generation_awq(
     )


-@pytest.mark.parametrize("model_name, world_size, batch_size, baseline", MODELS_TO_TEST["deepspeed"])
-def test_text_generation_deepspeed(model_name: str, baseline: float, world_size: int, batch_size: int, token: str):
+@pytest.mark.parametrize("model_name, world_size, batch_size", MODELS_TO_TEST["deepspeed"])
+def test_text_generation_deepspeed(model_name: str, world_size: int, batch_size: int, baseline, token):
     _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, batch_size=batch_size)


 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
-@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["torch_compile"])
-def test_text_generation_torch_compile(model_name: str, baseline: float, token: str):
+@pytest.mark.parametrize("model_name", MODELS_TO_TEST["torch_compile"])
+def test_text_generation_torch_compile(model_name: str, baseline, token):
     _test_text_generation(model_name, baseline, token, torch_compile=True)


 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
-@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["torch_compile_distributed"])
-def test_text_generation_torch_compile_distributed(model_name: str, baseline: float, token: str):
+@pytest.mark.parametrize("model_name", MODELS_TO_TEST["torch_compile_distributed"])
+def test_text_generation_torch_compile_distributed(model_name: str, baseline, token):
     world_size = 8
     _test_text_generation(model_name, baseline, token, deepspeed=True, world_size=world_size, torch_compile=True)


 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
-@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["distributed_tp"])
-def test_text_generation_distributed_tp(model_name: str, baseline: float, token: str):
+@pytest.mark.parametrize("model_name", MODELS_TO_TEST["distributed_tp"])
+def test_text_generation_distributed_tp(model_name: str, baseline, token):
     world_size = 8
     _test_text_generation(
         model_name,
@@ -528,16 +519,14 @@ def test_text_generation_distributed_tp(model_name: str, baseline: float, token:
     )


-@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline", MODELS_TO_TEST["contrastive_search"])
-def test_text_generation_contrastive_search(
-    model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str
-):
+@pytest.mark.parametrize("model_name, batch_size, reuse_cache", MODELS_TO_TEST["contrastive_search"])
+def test_text_generation_contrastive_search(model_name: str, batch_size: int, reuse_cache: bool, baseline, token):
     _test_text_generation(model_name, baseline, token, batch_size, reuse_cache, contrastive_search=True)


 @pytest.mark.skipif(condition=not bool(int(os.environ.get("GAUDI2_CI", "0"))), reason="Skipping test for G1")
-@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline", MODELS_TO_TEST["beam_search"])
-def test_text_generation_beam_search(model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str):
+@pytest.mark.parametrize("model_name, batch_size, reuse_cache", MODELS_TO_TEST["beam_search"])
+def test_text_generation_beam_search(model_name: str, batch_size: int, reuse_cache: bool, baseline, token):
     _test_text_generation(model_name, baseline, token, batch_size, reuse_cache, num_beams=3)
     _test_text_generation(model_name, baseline, token, batch_size, reuse_cache, num_beams=3, num_return_sequences=2)
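With this layout, the throughput and output references for all of the tests above are tracked in the JSON files under tests/baselines/fixture/ rather than in the test modules themselves. When a reference needs to change, the affected test file is re-run with the --rebase option defined in conftest.py, for instance GAUDI2_CI=1 python -m pytest tests/test_text_generation_example.py --token <token> --rebase (an illustrative invocation; the token value and device environment depend on the CI setup), which rewrites the corresponding baseline JSON from the values measured during that run.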