For developers, install `mlperf-inf-mm-vl2l` and the development tools with:

```bash
pip install multimodal/vl2l/[dev]
```

After installation, you can check the CLI flags that `mlperf-inf-mm-vl2l` accepts with:

```bash
mlperf-inf-mm-vl2l --help
```

You can enable shell autocompletion for `mlperf-inf-mm-vl2l` with:

```bash
mlperf-inf-mm-vl2l --install-completion
```

> NOTE: Shell autocompletion will take effect once you restart the terminal.

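If you prefer not to open a new terminal, replacing the current shell also works in most setups (a small aside, assuming an interactive bash or zsh session whose startup files load the completion script):

```bash
# Replace the current shell with a fresh instance so the newly
# installed completion script is sourced from your shell startup files.
exec "$SHELL"
```
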
### Start an inference endpoint on your local host machine with vLLM

Please refer to [this guide on how to launch vLLM for various Qwen3 VL MoE models](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3-VL.html). For example, to serve `Qwen/Qwen3-VL-235B-A22B-Instruct` across 8 GPUs:

```bash
# Flag notes:
#   --gpus all                   : use all the GPUs on this host machine.
#   -v ~/.cache/huggingface:...  : reuse the HuggingFace cache from your host machine.
#   -p 8000:8000                 : this assumes the endpoint will use port 8000.
#   --ipc=host                   : let the container use the host's IPC mechanisms (e.g., shared memory).
#   vllm/vllm-openai:nightly     : you can also use the `:latest` container or a specific release.
#   --model                      : specifies the model for vLLM to deploy.
#   --tensor-parallel-size 8     : 8-way tensor-parallel inference across 8 GPUs.
#   --limit-mm-per-prompt.video 0: the input requests will contain images only (i.e., no videos).
docker run --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:nightly \
    --model Qwen/Qwen3-VL-235B-A22B-Instruct \
    --tensor-parallel-size 8 \
    --limit-mm-per-prompt.video 0
```
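
Once the server finishes loading, you can sanity-check the endpoint through vLLM's OpenAI-compatible HTTP API. A minimal sketch, assuming the default port 8000 from the command above; the image URL in the second request is a hypothetical placeholder:

```bash
# List the models served by the endpoint; the response should include
# Qwen/Qwen3-VL-235B-A22B-Instruct once the weights have loaded.
curl http://localhost:8000/v1/models

# Send one image-plus-text request through the OpenAI-compatible chat API.
# The image URL below is a placeholder; substitute any reachable image.
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen3-VL-235B-A22B-Instruct",
        "messages": [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
                {"type": "text", "text": "Describe this image."}
            ]
        }]
    }'
```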