Implement baselines as a fixture and with simple rebase support (#1732)

Signed-off-by: U. Artie Eoff <[email protected]>
huggingface · Feb 7, 2025 · 9e882f2 · 9e882f2
1 parent 18449ba
commit 9e882f2
Show file tree

Hide file tree

Showing 17 changed files with 1,116 additions and 324 deletions.
diff --git a/conftest.py b/conftest.py
@@ -1,3 +1,76 @@
+import json
+import logging
+from pathlib import Path
+
+import pytest
+
+
+# Root directory of the stored baseline reference files, located relative to this file's directory.
+BASELINE_DIRECTORY = Path(__file__).parent.resolve().joinpath("tests", "baselines", "fixture")
+
+
+def walk_path(path: Path):
+    """
+    Recursively walk a directory tree, top-down.
+
+    Taken from https://stackoverflow.com/a/76236680
+
+    Path.walk() is not available until python 3.12
+
+    Yields a ``(directory, subdirs, files)`` tuple for ``path`` itself and then
+    for every directory below it. ``subdirs`` and ``files`` are lists of the
+    ``Path`` entries found directly inside ``directory``.
+    """
+    subdirs = []
+    files = []
+    # Scan the directory once and classify each entry instead of listing it twice.
+    # Sorting makes the traversal order reproducible across filesystems.
+    # Entries that are neither a directory nor a regular file are skipped.
+    for entry in sorted(path.iterdir()):
+        if entry.is_dir():
+            subdirs.append(entry)
+        elif entry.is_file():
+            files.append(entry)
+    yield path, subdirs, files
+    for subdir in subdirs:
+        yield from walk_path(subdir)
+
+
+class Baseline:
+    """
+    Session-wide store of baseline reference values.
+
+    References are loaded from the JSON files found under BASELINE_DIRECTORY
+    into a single dict keyed by the top-level keys of those files. When the
+    session runs with the rebase option set, finalize() writes the references
+    back to disk, one JSON file per test file.
+    """
+
+    def __init__(self, session):
+        """Read the rebase option from the session config and load all stored references."""
+        self.rebase = session.config.option.rebase
+        self.references = {}
+
+        if BASELINE_DIRECTORY.exists():
+            for root, _dirs, files in walk_path(BASELINE_DIRECTORY):
+                for name in files:
+                    reffile = root / name
+                    # finalize() only ever writes *.json files; skip anything else
+                    # found in the tree instead of failing to parse it.
+                    if reffile.suffix != ".json":
+                        continue
+                    with reffile.open(encoding="utf-8") as f:
+                        self.references.update(json.load(f))
+
+    def get_reference(self, addr, context=()):
+        """
+        Return the mutable reference dict stored for ``addr``.
+
+        Each item of ``context`` selects one further nested level below
+        ``addr``. Missing levels are created as empty dicts, so the returned
+        dict is always part of ``self.references`` and can be updated in place.
+        """
+        reference = self.references.setdefault(addr, {})
+        for c in context:
+            reference = reference.setdefault(c, {})
+        return reference
+
+    def finalize(self):
+        """Write the references back to disk; does nothing unless rebasing."""
+        if not self.rebase:
+            return
+
+        # aggregate refs by test file (the part of the key before the first "::")
+        refsbyfile = {}
+        for case, ref in self.references.items():
+            key = case.split("::")[0]
+            reffile = BASELINE_DIRECTORY / Path(key).with_suffix(".json")
+            refsbyfile.setdefault(reffile, {})[case] = ref
+
+        # dump aggregated refs into their own files
+        for reffile, refs in refsbyfile.items():
+            reffile.parent.mkdir(parents=True, exist_ok=True)
+            with reffile.open("w", encoding="utf-8") as f:
+                json.dump(refs, f, indent=2, sort_keys=True)
+
+
+class BaselineRequest:
+    """
+    Per-test handle on the session baseline.
+
+    Binds the object stored under ``"baseline"`` in the session stash to the
+    node id of the requesting test.
+    """
+
+    def __init__(self, request):
+        self.baseline = request.session.stash["baseline"]
+        self.addr = request.node.nodeid
+
+    def assertRef(self, compare, context=(), **kwargs):
+        """
+        Assert each keyword value against its stored reference.
+
+        ``compare`` is called as ``compare(actual, ref)`` for every keyword
+        argument and must return a truthy value for the check to pass; ``ref``
+        is None when nothing is stored under that key. ``context`` is a
+        sequence of keys selecting the nested reference dict for this test
+        (a lone string is treated as a one-element sequence). When rebasing,
+        the stored references are first overwritten with the given values.
+
+        Raises AssertionError on the first value whose comparison fails.
+        """
+        # Accept any sequence (list, tuple, ...) and never touch the caller's object.
+        context = [context] if isinstance(context, str) else list(context)
+        reference = self.baseline.get_reference(self.addr, context)
+        if self.baseline.rebase:
+            reference.update(**kwargs)
+
+        logger = logging.getLogger()
+        for key, actual in kwargs.items():
+            ref = reference.get(key, None)
+            label = ".".join(context + [key])
+            logger.info("%s:actual = %s", label, actual)
+            logger.info("%s:ref    = %s", label, ref)
+            assert compare(actual, ref), f"{label}: actual {actual!r} does not match reference {ref!r}"
+
+
 class Secret:
     """
     Taken from: https://stackoverflow.com/a/67393351
@@ -15,11 +88,22 @@ def __str___(self):
 
 def pytest_addoption(parser):
+    # Register the command-line options defined by this conftest.
     parser.addoption("--token", action="store", default=None)
+    # Boolean flag (store_true); Baseline reads it as session.config.option.rebase.
+    parser.addoption("--rebase", action="store_true", help="rebase baseline references from current run")
+
+
+@pytest.fixture
+def token(request):
+    """Provide the value of the ``--token`` command-line option wrapped in a Secret."""
+    raw_token = request.config.option.token
+    return Secret(raw_token)
+
+
+def pytest_sessionstart(session):
+    """Create the session-wide Baseline and keep it in the session stash under "baseline"."""
+    baseline_store = Baseline(session)
+    session.stash["baseline"] = baseline_store
+
+
+def pytest_sessionfinish(session):
+    """Finalize the Baseline stored in the session stash under "baseline"."""
+    baseline_store = session.stash["baseline"]
+    baseline_store.finalize()
 
 
-def pytest_generate_tests(metafunc):
-    # This is called for every test. Only get/set command line arguments
-    # if the argument is specified in the list of test "fixturenames".
-    option_value = Secret(metafunc.config.option.token)
-    if "token" in metafunc.fixturenames:
-        metafunc.parametrize("token", [option_value])
+@pytest.fixture
+def baseline(request):
+    """Provide a BaselineRequest built from the requesting test's fixture request."""
+    baseline_request = BaselineRequest(request)
+    return baseline_request
diff --git a/tests/baselines/fixture/tests/test_encoder_decoder.json b/tests/baselines/fixture/tests/test_encoder_decoder.json
@@ -0,0 +1,32 @@
+{
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[facebook/bart-large-cnn-Habana/bart-2-2]": {
+    "gaudi1": {
+      "predict_rougeLsum": 29.174,
+      "predict_samples_per_second": 2.304
+    },
+    "gaudi2": {
+      "predict_rougeLsum": 28.9801,
+      "predict_samples_per_second": 4.339
+    }
+  },
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[t5-3b-Habana/t5-2-1]": {
+    "gaudi1": {
+      "predict_rougeLsum": 21.7286,
+      "predict_samples_per_second": 1.005
+    },
+    "gaudi2": {
+      "predict_rougeLsum": 21.8877,
+      "predict_samples_per_second": 3.848
+    }
+  },
+  "tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_translation_bf16[t5-small-Habana/t5-2-1]": {
+    "gaudi1": {
+      "predict_bleu": 11.6126,
+      "predict_samples_per_second": 9.188
+    },
+    "gaudi2": {
+      "predict_bleu": 11.7277,
+      "predict_samples_per_second": 11.648
+    }
+  }
+}
diff --git a/tests/baselines/fixture/tests/test_fp8_examples.json b/tests/baselines/fixture/tests/test_fp8_examples.json
@@ -0,0 +1,8 @@
+{
+  "tests/test_fp8_examples.py::test_fp8_train[mistralai/Mistral-7B-Instruct-v0.2-tatsu-lab/alpaca--language-modeling-8-8-run_lora_clm.py]": {
+    "gaudi2": {
+      "eval_accuracy": 0.7538,
+      "train_samples_per_second": 12.373
+    }
+  }
+}
diff --git a/tests/baselines/fixture/tests/test_fsdp_examples.json b/tests/baselines/fixture/tests/test_fsdp_examples.json
@@ -0,0 +1,14 @@
+{
+  "tests/test_fsdp_examples.py::test_fsdp_bf16[bert-base-uncased-Habana/bert-base-uncased-question-answering-24-8-run_qa.py-full_shard]": {
+    "gaudi2": {
+      "eval_f1": 85.7077,
+      "train_samples_per_second": 2983.533
+    }
+  },
+  "tests/test_fsdp_examples.py::test_fsdp_bf16[meta-llama/Llama-2-7b-hf--language-modeling-8-8-run_lora_clm.py-auto_wrap]": {
+    "gaudi2": {
+      "train_loss": 0.9093,
+      "train_samples_per_second": 85.016
+    }
+  }
+}
diff --git a/tests/baselines/fixture/tests/test_image_to_text_example.json b/tests/baselines/fixture/tests/test_image_to_text_example.json
@@ -0,0 +1,94 @@
+{
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[HuggingFaceM4/idefics2-8b-1]": {
+    "gaudi2": {
+      "throughput": 21.89944593215077
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-2B-Instruct-1]": {
+    "gaudi2": {
+      "throughput": 28.755882208438422
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-7B-Instruct-1]": {
+    "gaudi2": {
+      "throughput": 19.32562189532818
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[google/paligemma-3b-mix-224-1]": {
+    "gaudi2": {
+      "throughput": 132.8949150246155
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-13b-hf-1]": {
+    "gaudi1": {
+      "throughput": 16.704731010481538
+    },
+    "gaudi2": {
+      "throughput": 48.54364937033955
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-7b-hf-1]": {
+    "gaudi1": {
+      "throughput": 28.04096918512148
+    },
+    "gaudi2": {
+      "throughput": 77.98733740859008
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
+    "gaudi1": {
+      "throughput": 10.759228696741
+    },
+    "gaudi2": {
+      "throughput": 33.17984878151546
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
+    "gaudi1": {
+      "throughput": 6.96732060769783
+    },
+    "gaudi2": {
+      "throughput": 23.527610042925
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
+    "gaudi2": {
+      "throughput": 35.00608681379742
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[meta-llama/Llama-3.2-11B-Vision-Instruct-1]": {
+    "gaudi2": {
+      "throughput": 18.974541922240313
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_bf16[tiiuae/falcon-11B-vlm-1]": {
+    "gaudi2": {
+      "throughput": 23.69260849957278
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-13b-hf-1]": {
+    "gaudi2": {
+      "throughput": 67.20488222876344
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-7b-hf-1]": {
+    "gaudi2": {
+      "throughput": 98.72578382705062
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
+    "gaudi2": {
+      "throughput": 45.011551008367086
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
+    "gaudi2": {
+      "throughput": 30.9535718774675
+    }
+  },
+  "tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
+    "gaudi2": {
+      "throughput": 45.18544502949674
+    }
+  }
+}
diff --git a/tests/baselines/fixture/tests/test_openclip_vqa.json b/tests/baselines/fixture/tests/test_openclip_vqa.json
@@ -0,0 +1,18 @@
+{
+  "tests/test_openclip_vqa.py::test_openclip_vqa_bf16[laion/CLIP-ViT-g-14-laion2B-s12B-b42K]": {
+    "gaudi1": {
+      "throughput": 550
+    },
+    "gaudi2": {
+      "throughput": 1472
+    }
+  },
+  "tests/test_openclip_vqa.py::test_openclip_vqa_bf16[microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224]": {
+    "gaudi1": {
+      "throughput": 1200
+    },
+    "gaudi2": {
+      "throughput": 1816
+    }
+  }
+}
diff --git a/tests/baselines/fixture/tests/test_pipeline.json b/tests/baselines/fixture/tests/test_pipeline.json
@@ -0,0 +1,17 @@
+{
+  "tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[Salesforce/blip-image-captioning-base-44]": {
+    "generated_text": "a soccer player is playing a game on the app"
+  },
+  "tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[nlpconnect/vit-gpt2-image-captioning-44]": {
+    "generated_text": "a soccer game with a player jumping to catch"
+  },
+  "tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/hf-seamless-m4t-medium]": {
+    "sampling_rate": 16000
+  },
+  "tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/mms-tts-eng]": {
+    "sampling_rate": 16000
+  },
+  "tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[microsoft/speecht5_tts]": {
+    "sampling_rate": 16000
+  }
+}