Commit 990503c

Commit message: offline WIP

Parent: a240d7c

File tree: 7 files changed, +931 / -14 lines

multimodal/vl2l/README.md

Lines changed: 26 additions & 2 deletions
````diff
@@ -19,7 +19,7 @@ on how to install Miniconda on your host machine. Then, you can create a new con
 environment via:
 
 ```bash
-conda create -n mlperf-inf-mm-vl2l python=3.14
+conda create -n mlperf-inf-mm-vl2l python=3.13
 ```
 
 ### Install LoadGen
@@ -47,5 +47,29 @@ cd ../
 Run a quick test to validate that LoadGen was installed correctly:
 
 ```bash
-python loadgen/
+python loadgen/demos/token_metrics/py_demo_server.py
+```
+
+### Install VL2L Benchmark CLI
+
+For users, install `mlperf-inf-mm-vl2l` with:
+
+```bash
+pip install multimodal/vl2l/
+```
+
+For developers, install `mlperf-inf-mm-vl2l` and the development tools with:
+
+```bash
+pip install multimodal/vl2l/[dev]
+```
+
+## Developer Guide
+
+### Linting
+
+You can lint the VL2L benchmark source code by running the following script:
+
+```bash
+bash multimodal/vl2l/scripts/linters.sh
 ```
````
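As a companion to the quick LoadGen test added above, the bindings can also be exercised directly from Python. This is a minimal sketch, not part of the commit; it only touches the `mlperf_loadgen` attributes that the new `cli.py` below sets in `TestSettings.to_lgtype()`:

```python
# Minimal sketch (not part of this commit): confirm the LoadGen bindings import
# and expose the settings fields that TestSettings.to_lgtype() relies on.
import mlperf_loadgen as lg

settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly
settings.offline_expected_qps = 10
settings.min_duration_ms = 5000
settings.use_token_latencies = True
print("LoadGen test settings constructed successfully")
```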

multimodal/vl2l/notebooks/shopify-global-catalogue-isl-osl.ipynb

Lines changed: 348 additions & 0 deletions
Large diffs are not rendered by default.

multimodal/vl2l/pyproject.toml

Lines changed: 20 additions & 3 deletions
```diff
@@ -8,12 +8,21 @@ classifiers = [
     "Intended Audience :: Developers",
     "Operating System :: POSIX :: Linux",
 ]
-requires-python = ">=3.14"
-dependencies = ["typer", "loguru", "pydantic", "mlcommons_loadgen", "pydantic-typer"]
+requires-python = ">=3.13"
+dependencies = [
+    "datasets",
+    "loguru",
+    "matplotlib",
+    "mlcommons_loadgen",
+    "openai[aiohttp]",
+    "pydantic",
+    "pydantic-typer",
+    "typer",
+]
 dynamic = ["version"]
 
 [project.optional-dependencies]
-dev = ["mypy", "pytest"]
+dev = ["black", "ruff", "mypy", "shellcheck-py", "pytest"]
 
 [project.scripts]
 mlperf-inf-mm-vl2l = "mlperf_inference_multimodal_vl2l.cli:app"
@@ -47,3 +56,11 @@ convention = "google"
 [tool.mypy]
 check_untyped_defs = true
 plugins = ['pydantic.mypy']
+
+[[tool.mypy.overrides]]
+module = "pydantic_typer"
+ignore_missing_imports = true
+
+[[tool.mypy.overrides]]
+module = "mlperf_loadgen"
+ignore_missing_imports = true
```

multimodal/vl2l/scripts/linters.sh

Lines changed: 40 additions & 0 deletions
```diff
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -eux
+set -o pipefail
+
+PROJECT_ROOT=$(dirname "${BASH_SOURCE[0]}")/../
+PROJECT_ROOT=$(realpath "${PROJECT_ROOT}")
+
+function _exit_with_help_msg() {
+  cat <<EOF
+Run linters for "${PROJECT_ROOT}".
+
+Usage: ${BASH_SOURCE[0]}
+  [-h | --help] Print this help message.
+EOF
+  if [ -n "$1" ]; then
+    echo "$(tput bold setab 1)$1$(tput sgr0)"
+  fi
+  exit "$2"
+}
+
+while [[ $# -gt 0 ]]; do
+  case $1 in
+  -h | --help)
+    _exit_with_help_msg "" 0
+    ;;
+  *)
+    _exit_with_help_msg "[ERROR] Unknown option: $1" 1
+    ;;
+  esac
+done
+
+echo "Running ruff..."
+ruff check --fix "${PROJECT_ROOT}"/src/
+
+echo "Running mypy..."
+mypy --config-file="${PROJECT_ROOT}"/pyproject.toml --install-types "${PROJECT_ROOT}"/src/
+
+echo "Running shellcheck..."
+find "${PROJECT_ROOT}" -type f -name "*.sh" -exec shellcheck -ax {} +
```
multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/__init__.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -1,9 +1,9 @@
-"""Reference Implementation for the Vision-language-to-language (VL2L) Benchmark"""
+"""Reference Implementation for the Vision-language-to-language (VL2L) Benchmark."""
 
 from __future__ import annotations
+
+import contextlib
 from importlib.metadata import PackageNotFoundError, version
 
-try:
+with contextlib.suppress(PackageNotFoundError):
     __version__ = version("mlperf-inference-multimodal-vl2l")
-except PackageNotFoundError:
-    pass
```
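A small usage note, not part of the commit: because the `try`/`except` was replaced with `contextlib.suppress`, `__version__` is still simply left undefined when the distribution metadata is absent (for example in an uninstalled source checkout), so callers should treat the attribute as optional. The import name below is inferred from the `project.scripts` entry in `pyproject.toml`:

```python
# Hedged example (not part of this commit): __version__ may be missing when the
# distribution metadata is not installed, so read it defensively.
import mlperf_inference_multimodal_vl2l as vl2l

print(getattr(vl2l, "__version__", "unknown (package metadata not installed)"))
```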

multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py

Lines changed: 209 additions & 5 deletions
```diff
@@ -2,12 +2,216 @@
 
 from __future__ import annotations
 
-import typer
+from datetime import timedelta
+from enum import StrEnum, auto
+from typing import Annotated
 
-app = typer.Typer()
+import mlperf_loadgen as lg
+from loguru import logger
+from openai import AsyncOpenAI, DefaultAioHttpClient
+from pydantic import BaseModel, Field
+from pydantic_typer import Typer
+
+from .task import MMMU, ShopifyGlobalCatalogue, Task
+
+app = Typer()
+
+
+class TestScenario(StrEnum):
+    """The test scenario for the MLPerf inference LoadGen."""
+
+    SERVER = auto()
+    """Run the benchmark in server/interactive scenario."""
+
+    OFFLINE = auto()
+    """Run the benchmark in offline/batch scenario."""
+
+    class UnknownValueError(ValueError):
+        """The exception raised when an unknown test scenario is encountered."""
+
+        def __init__(self, test_scenario: TestScenario) -> None:
+            """Initialize the exception."""
+            super().__init__(f"Unknown test scenario: {test_scenario}")
+
+    def to_lgtype(self) -> lg.TestScenario:
+        """Convert the test scenario to its corresponding LoadGen type."""
+        match self:
+            case TestScenario.SERVER:
+                return lg.TestScenario.Server
+            case TestScenario.OFFLINE:
+                return lg.TestScenario.Offline
+            case _:
+                raise TestScenario.UnknownValueError(self)
+
+
+class TestMode(StrEnum):
+    """The test mode for the MLPerf inference LoadGen."""
+
+    PERFORMANCE_ONLY = auto()
+    """Run the benchmark to evaluate performance."""
+
+    ACCURACY_ONLY = auto()
+    """Run the benchmark to evaluate model quality."""
+
+    class UnknownValueError(ValueError):
+        """The exception raised when an unknown test mode is encountered."""
+
+        def __init__(self, test_mode: TestMode) -> None:
+            """Initialize the exception."""
+            super().__init__(f"Unknown test mode: {test_mode}")
+
+    def to_lgtype(self) -> lg.TestMode:
+        """Convert the test mode to its corresponding LoadGen type."""
+        match self:
+            case TestMode.PERFORMANCE_ONLY:
+                return lg.TestMode.PerformanceOnly
+            case TestMode.ACCURACY_ONLY:
+                return lg.TestMode.AccuracyOnly
+            case _:
+                raise TestMode.UnknownValueError(self)
+
+
+class TestSettings(BaseModel):
+    """The test settings for the MLPerf inference LoadGen."""
+
+    scenario: Annotated[
+        TestScenario,
+        Field(
+            description=(
+                "The MLPerf inference benchmarking scenario to run the benchmark in."
+            ),
+        ),
+    ] = TestScenario.OFFLINE
+
+    mode: Annotated[
+        TestMode,
+        Field(
+            description=(
+                "Whether you want to run the benchmark for performance or accuracy."
+            ),
+        ),
+    ] = TestMode.PERFORMANCE_ONLY
+
+    offline_expected_qps: Annotated[
+        float,
+        Field(
+            description="The expected QPS for the offline scenario.",
+        ),
+    ] = 10
+
+    min_duration: Annotated[
+        timedelta,
+        Field(
+            description="The minimum testing duration.",
+        ),
+    ] = timedelta(seconds=5)
+
+    def to_lgtype(self) -> lg.TestSettings:
+        """Convert the test settings to its corresponding LoadGen type."""
+        settings = lg.TestSettings()
+        settings.scenario = self.scenario.to_lgtype()
+        settings.mode = self.mode.to_lgtype()
+        settings.offline_expected_qps = self.offline_expected_qps
+        settings.min_duration_ms = round(self.min_duration.total_seconds() * 1000)
+        settings.use_token_latencies = True
+        return settings
+
+
+class Model(BaseModel):
+    """Specifies the model to use for the VL2L benchmark."""
+
+    repo_id: Annotated[
+        str,
+        Field(description="The HuggingFace repository ID of the model."),
+    ] = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+
+
+class Dataset(BaseModel):
+    """Specifies a dataset on HuggingFace."""
+
+    class Task(StrEnum):
+        """The task for the VL2L benchmark."""
+
+        SHOPIFY_GLOBAL_CATALOG = auto()
+        MMMU = auto()
+
+    class UnknownTaskError(ValueError):
+        """The exception raised when an unknown task is encountered."""
+
+        def __init__(self, task: Dataset.Task) -> None:
+            """Initialize the exception."""
+            super().__init__(f"Unknown task: {task}")
+
+    task: Annotated[
+        Dataset.Task | None,
+        Field(
+            description=(
+                "The vision-language-to-language task to run the benchmark for. If not "
+                "specified, the task will be derived from the HuggingFace repository ID"
+                " of the dataset."
+            ),
+        ),
+    ] = None
+
+    repo_id: Annotated[
+        str,
+        Field(description="The HuggingFace repository ID of the dataset."),
+    ] = "Shopify/the-catalogue-public-beta"
+
+    token: Annotated[
+        str | None,
+        Field(
+            description=(
+                "The token to access the HuggingFace repository of the dataset."
+            ),
+        ),
+    ] = None
+
+
+def create_task(dataset: Dataset, model: Model, openai_api_client: AsyncOpenAI) -> Task:
+    """Convert the dataset configuration to its corresponding task."""
+    match dataset.task:
+        case Dataset.Task.MMMU:
+            return MMMU(dataset, model, openai_api_client)
+        case Dataset.Task.SHOPIFY_GLOBAL_CATALOG:
+            return ShopifyGlobalCatalogue(dataset, model, openai_api_client)
+        case None:
+            match dataset.repo_id:
+                case "MMMU/MMMU":
+                    return MMMU(dataset, model, openai_api_client)
+                case "Shopify/the-catalogue-public-beta":
+                    return ShopifyGlobalCatalogue(dataset, model, openai_api_client)
+                case _:
+                    raise Dataset.UnknownTaskError(dataset.task)
+        case _:
+            raise Dataset.UnknownTaskError(dataset.task)
 
 
 @app.command()
-def main():
-    """VL2L benchmark CLI"""
-    typer.echo("Hello, World!")
+def main(
+    *,
+    settings: TestSettings,
+    model: Model,
+    dataset: Dataset,
+    endpoint: str = "http://localhost:8000/v1",
+    openai_api_key: str = "",
+) -> None:
+    """Main CLI for running the VL2L benchmark."""
+    logger.info("Running VL2L benchmark with settings: {}", settings)
+    logger.info("Running VL2L benchmark with dataset: {}", dataset)
+    logger.info("Running VL2L benchmark with endpoint: {}", endpoint)
+    lg_settings = settings.to_lgtype()
+    task = create_task(
+        dataset,
+        model,
+        AsyncOpenAI(
+            base_url=endpoint,
+            http_client=DefaultAioHttpClient(),
+            api_key=openai_api_key,
+        ),
+    )
+    sut = task.construct_sut()
+    qsl = task.construct_qsl()
+    lg.StartTest(sut, qsl, lg_settings)
+    lg.DestroyQSL(qsl)
+    lg.DestroySUT(sut)
```
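The `.task` module that provides `MMMU`, `ShopifyGlobalCatalogue`, and the `Task` type is part of this commit but its diff is not rendered in this view, so the interface that `main()` depends on has to be inferred. The sketch below is an assumption about what `construct_sut()`/`construct_qsl()` plausibly wrap; only those two method names come from the diff above, everything else (class name, callbacks, sample counts) is hypothetical, and the `mlperf_loadgen` constructors shown are the library's standard Python bindings:

```python
# Hypothetical sketch only: the real .task module is not shown in this view.
# It illustrates how construct_sut()/construct_qsl() could wrap the standard
# mlperf_loadgen constructors that cli.main() passes to lg.StartTest().
from __future__ import annotations

import mlperf_loadgen as lg


class SketchTask:
    """Assumed shape of the Task objects returned by create_task()."""

    def __init__(self, total_samples: int = 16) -> None:
        self.total_samples = total_samples

    # QSL callbacks: LoadGen tells the task which samples to (un)load.
    def load_samples(self, sample_indices: list[int]) -> None:
        """Decode the requested dataset rows into memory (no-op in this sketch)."""

    def unload_samples(self, sample_indices: list[int]) -> None:
        """Release the decoded samples (no-op in this sketch)."""

    # SUT callbacks: LoadGen issues queries, the task must complete them.
    def issue_queries(self, query_samples: list[lg.QuerySample]) -> None:
        """Complete every query immediately with an empty response.

        A real task would instead send each sample to the OpenAI-compatible
        endpoint and call QuerySamplesComplete once the tokens arrive.
        """
        lg.QuerySamplesComplete(
            [lg.QuerySampleResponse(qs.id, 0, 0) for qs in query_samples]
        )

    def flush_queries(self) -> None:
        """Drain any in-flight work (nothing to do in this sketch)."""

    def construct_sut(self) -> object:
        return lg.ConstructSUT(self.issue_queries, self.flush_queries)

    def construct_qsl(self) -> object:
        return lg.ConstructQSL(
            self.total_samples,  # total sample count
            self.total_samples,  # performance sample count
            self.load_samples,
            self.unload_samples,
        )
```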
