Skip to content

Commit

Permalink
Implement baselines as a fixture and with simple rebase support (#1732)
Browse files Browse the repository at this point in the history
Signed-off-by: U. Artie Eoff <[email protected]>
  • Loading branch information
uartie authored Feb 7, 2025
1 parent 18449ba commit 9e882f2
Show file tree
Hide file tree
Showing 17 changed files with 1,116 additions and 324 deletions.
96 changes: 90 additions & 6 deletions conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,76 @@
import json
import logging
from pathlib import Path

import pytest


BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("tests") / Path("baselines") / Path("fixture")


def walk_path(path: Path):
    """
    Recursively walk a directory tree, top-down.

    Taken from https://stackoverflow.com/a/76236680
    Path.walk() is not available until python 3.12

    Yields a ``(directory, subdirs, files)`` tuple for ``path`` itself and
    then for every directory below it. ``subdirs`` and ``files`` are lists of
    ``Path`` objects as produced by ``path.iterdir()``; entries that are
    neither a directory nor a regular file are left out of both lists.
    """
    subdirs = []
    files = []
    # List the directory only once so both lists come from the same snapshot.
    for entry in path.iterdir():
        if entry.is_dir():
            subdirs.append(entry)
        elif entry.is_file():
            files.append(entry)
    yield path, subdirs, files
    for subdir in subdirs:
        yield from walk_path(subdir)


class Baseline:
    """
    Session-wide store of baseline reference values.

    References are kept in a nested dict keyed first by test address and then
    by each element of an optional context path. They are loaded from the
    JSON files found under ``BASELINE_DIRECTORY`` and, when rebasing is
    enabled, written back there by ``finalize()``.
    """

    def __init__(self, session):
        """
        Load every ``*.json`` reference file below ``BASELINE_DIRECTORY``.

        ``session`` must expose ``session.config.option.rebase``; its value is
        stored as ``self.rebase``.
        """
        self.rebase = session.config.option.rebase
        self.references = {}

        if BASELINE_DIRECTORY.exists():
            for _root, _dirs, files in walk_path(BASELINE_DIRECTORY):
                for reffile in files:
                    # finalize() only ever writes ".json" files, so anything
                    # else in the tree is not a reference file; skip it
                    # instead of failing to parse it.
                    if reffile.suffix != ".json":
                        continue
                    # walk_path() already yields paths that include the
                    # directory, so the entry is opened as-is.
                    with reffile.open(encoding="utf-8") as f:
                        self.references.update(json.load(f))

    def get_reference(self, addr, context=None):
        """
        Return the reference dict for ``addr``, descending through ``context``.

        ``context`` is an optional iterable of keys; ``None`` means no
        context. Missing levels are created as empty dicts, and the returned
        dict is the stored one, so changes made to it are kept in
        ``self.references``.
        """
        reference = self.references.setdefault(addr, {})
        for c in () if context is None else context:
            reference = reference.setdefault(c, {})
        return reference

    def finalize(self):
        """
        Write all references back to disk when rebasing is enabled.

        References are grouped by the part of the test address before the
        first ``"::"``; each group is dumped to
        ``BASELINE_DIRECTORY / <that part with a ".json" suffix>``. Does
        nothing when ``self.rebase`` is false.
        """
        if not self.rebase:
            return

        # aggregate refs by test file
        refsbyfile = {}
        for case, ref in self.references.items():
            key = case.split("::")[0]
            reffile = BASELINE_DIRECTORY / Path(key).with_suffix(".json")
            refsbyfile.setdefault(reffile, {})[case] = ref

        # dump aggregated refs into their own files
        for reffile, refs in refsbyfile.items():
            reffile.parent.mkdir(parents=True, exist_ok=True)
            # The file is only written, never read back, so plain "w" is enough.
            with reffile.open("w", encoding="utf-8") as f:
                json.dump(refs, f, indent=2, sort_keys=True)


class BaselineRequest:
    """
    Per-test handle on the session baseline.

    Binds the baseline object stored in the session stash under
    ``"baseline"`` to the node id of the requesting test.
    """

    def __init__(self, request):
        """Read the baseline from ``request.session.stash`` and the test's node id."""
        self.baseline = request.session.stash["baseline"]
        self.addr = request.node.nodeid

    def assertRef(self, compare, context=None, **kwargs):
        """
        Check each keyword value against its stored reference.

        ``compare`` is called as ``compare(actual, ref)`` and must return a
        truthy value for the check to pass; ``ref`` is ``None`` when no
        reference is stored for that key. ``context`` is an optional sequence
        of string keys selecting a nested reference dict (``None`` means no
        context). When the baseline is rebasing, the stored references are
        first overwritten with the given values.

        Raises ``AssertionError`` on the first value that fails ``compare``.
        """
        # Copy into a list so that any sequence (list, tuple, ...) is
        # accepted and can be joined with the key below.
        context = [] if context is None else list(context)
        reference = self.baseline.get_reference(self.addr, context)
        if self.baseline.rebase:
            reference.update(**kwargs)

        logger = logging.getLogger()
        for key, actual in kwargs.items():
            ref = reference.get(key, None)
            label = ".".join(context + [key])
            logger.info("%s:actual = %s", label, actual)
            logger.info("%s:ref = %s", label, ref)
            assert compare(actual, ref), f"{label}: actual = {actual!r}, ref = {ref!r}"


class Secret:
"""
Taken from: https://stackoverflow.com/a/67393351
Expand All @@ -15,11 +88,22 @@ def __str___(self):

def pytest_addoption(parser):
    """Register the ``--token`` and ``--rebase`` command line options on ``parser``."""
    option_specs = (
        ("--token", {"action": "store", "default": None}),
        ("--rebase", {"action": "store_true", "help": "rebase baseline references from current run"}),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)


@pytest.fixture
def token(request):
    """Provide the value of the ``--token`` option wrapped in a ``Secret``."""
    raw_token = request.config.option.token
    return Secret(raw_token)


def pytest_sessionstart(session):
    """Build the session's ``Baseline`` and keep it in the stash under ``"baseline"``."""
    store = Baseline(session)
    session.stash["baseline"] = store


def pytest_sessionfinish(session):
    """Call ``finalize()`` on the object stored in the stash under ``"baseline"``."""
    store = session.stash["baseline"]
    store.finalize()


def pytest_generate_tests(metafunc):
    """Parametrize ``token`` with the ``--token`` option for tests that request it."""
    # This hook is called for every test; the option value is only handed to
    # tests that list "token" among their fixture names.
    wrapped = Secret(metafunc.config.option.token)
    if "token" not in metafunc.fixturenames:
        return
    metafunc.parametrize("token", [wrapped])
@pytest.fixture
def baseline(request):
    """Provide a ``BaselineRequest`` built from the requesting test's ``request``."""
    handle = BaselineRequest(request)
    return handle
32 changes: 32 additions & 0 deletions tests/baselines/fixture/tests/test_encoder_decoder.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[facebook/bart-large-cnn-Habana/bart-2-2]": {
"gaudi1": {
"predict_rougeLsum": 29.174,
"predict_samples_per_second": 2.304
},
"gaudi2": {
"predict_rougeLsum": 28.9801,
"predict_samples_per_second": 4.339
}
},
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_summarization_bf16[t5-3b-Habana/t5-2-1]": {
"gaudi1": {
"predict_rougeLsum": 21.7286,
"predict_samples_per_second": 1.005
},
"gaudi2": {
"predict_rougeLsum": 21.8877,
"predict_samples_per_second": 3.848
}
},
"tests/test_encoder_decoder.py::TestEncoderDecoderModels::test_text_translation_bf16[t5-small-Habana/t5-2-1]": {
"gaudi1": {
"predict_bleu": 11.6126,
"predict_samples_per_second": 9.188
},
"gaudi2": {
"predict_bleu": 11.7277,
"predict_samples_per_second": 11.648
}
}
}
8 changes: 8 additions & 0 deletions tests/baselines/fixture/tests/test_fp8_examples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"tests/test_fp8_examples.py::test_fp8_train[mistralai/Mistral-7B-Instruct-v0.2-tatsu-lab/alpaca--language-modeling-8-8-run_lora_clm.py]": {
"gaudi2": {
"eval_accuracy": 0.7538,
"train_samples_per_second": 12.373
}
}
}
14 changes: 14 additions & 0 deletions tests/baselines/fixture/tests/test_fsdp_examples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"tests/test_fsdp_examples.py::test_fsdp_bf16[bert-base-uncased-Habana/bert-base-uncased-question-answering-24-8-run_qa.py-full_shard]": {
"gaudi2": {
"eval_f1": 85.7077,
"train_samples_per_second": 2983.533
}
},
"tests/test_fsdp_examples.py::test_fsdp_bf16[meta-llama/Llama-2-7b-hf--language-modeling-8-8-run_lora_clm.py-auto_wrap]": {
"gaudi2": {
"train_loss": 0.9093,
"train_samples_per_second": 85.016
}
}
}
94 changes: 94 additions & 0 deletions tests/baselines/fixture/tests/test_image_to_text_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"tests/test_image_to_text_example.py::test_image_to_text_bf16[HuggingFaceM4/idefics2-8b-1]": {
"gaudi2": {
"throughput": 21.89944593215077
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-2B-Instruct-1]": {
"gaudi2": {
"throughput": 28.755882208438422
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[Qwen/Qwen2-VL-7B-Instruct-1]": {
"gaudi2": {
"throughput": 19.32562189532818
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[google/paligemma-3b-mix-224-1]": {
"gaudi2": {
"throughput": 132.8949150246155
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-13b-hf-1]": {
"gaudi1": {
"throughput": 16.704731010481538
},
"gaudi2": {
"throughput": 48.54364937033955
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-1.5-7b-hf-1]": {
"gaudi1": {
"throughput": 28.04096918512148
},
"gaudi2": {
"throughput": 77.98733740859008
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
"gaudi1": {
"throughput": 10.759228696741
},
"gaudi2": {
"throughput": 33.17984878151546
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
"gaudi1": {
"throughput": 6.96732060769783
},
"gaudi2": {
"throughput": 23.527610042925
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
"gaudi2": {
"throughput": 35.00608681379742
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[meta-llama/Llama-3.2-11B-Vision-Instruct-1]": {
"gaudi2": {
"throughput": 18.974541922240313
}
},
"tests/test_image_to_text_example.py::test_image_to_text_bf16[tiiuae/falcon-11B-vlm-1]": {
"gaudi2": {
"throughput": 23.69260849957278
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-13b-hf-1]": {
"gaudi2": {
"throughput": 67.20488222876344
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-1.5-7b-hf-1]": {
"gaudi2": {
"throughput": 98.72578382705062
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-mistral-7b-hf-1]": {
"gaudi2": {
"throughput": 45.011551008367086
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-13b-hf-1]": {
"gaudi2": {
"throughput": 30.9535718774675
}
},
"tests/test_image_to_text_example.py::test_image_to_text_fp8[llava-hf/llava-v1.6-vicuna-7b-hf-1]": {
"gaudi2": {
"throughput": 45.18544502949674
}
}
}
18 changes: 18 additions & 0 deletions tests/baselines/fixture/tests/test_openclip_vqa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"tests/test_openclip_vqa.py::test_openclip_vqa_bf16[laion/CLIP-ViT-g-14-laion2B-s12B-b42K]": {
"gaudi1": {
"throughput": 550
},
"gaudi2": {
"throughput": 1472
}
},
"tests/test_openclip_vqa.py::test_openclip_vqa_bf16[microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224]": {
"gaudi1": {
"throughput": 1200
},
"gaudi2": {
"throughput": 1816
}
}
}
17 changes: 17 additions & 0 deletions tests/baselines/fixture/tests/test_pipeline.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[Salesforce/blip-image-captioning-base-44]": {
"generated_text": "a soccer player is playing a game on the app"
},
"tests/test_pipeline.py::TestGaudiPipeline::test_image_to_text[nlpconnect/vit-gpt2-image-captioning-44]": {
"generated_text": "a soccer game with a player jumping to catch"
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/hf-seamless-m4t-medium]": {
"sampling_rate": 16000
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[facebook/mms-tts-eng]": {
"sampling_rate": 16000
},
"tests/test_pipeline.py::TestGaudiPipeline::test_text_to_speech[microsoft/speecht5_tts]": {
"sampling_rate": 16000
}
}
Loading

0 comments on commit 9e882f2

Please sign in to comment.