 import time
 import numpy as np
 import array
-# import torch
-# from torch.nn.functional import pad
-# from vllm import LLM, AsyncLLMEngine, AsyncEngineArgs, SamplingParams
-# from vllm.inputs import TokensPrompt
-# from transformers import AutoProcessor
-# import pickle
 import time
 import threading
-# import tqdm
 import queue

 import logging
-# from typing import TYPE_CHECKING, Optional, List
-# from pathlib import Path

 import mlperf_loadgen as lg
 from dataset import Dataset
-import sys
-import subprocess
-import requests
-from contextlib import suppress
-import signal
-from openai import OpenAI, AsyncOpenAI
+from openai import AsyncOpenAI
 from typing import Any, Dict


 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger("Qwen2.5-VL-7B")

 # ---------- Config ----------
-MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
 HOST = os.environ.get("VLLM_HOST", "vllm")
 PORT = int(os.environ.get("VLLM_PORT", "8000"))
-
-# Extra vLLM server args if you need them (GPU/CPU flags, trust-remote-code, tensor-parallel-size, etc.)
-EXTRA_ARGS = os.environ.get("VLLM_EXTRA_ARGS", "--trust-remote-code").split()
-
 BASE_URL = f"http://{HOST}:{PORT}/v1"
-HEALTH_URLS = [
-    f"http://{HOST}:{PORT}/health",  # preferred if available
-    f"http://{HOST}:{PORT}/v1/models",  # fallback readiness check
-]
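+# The harness now assumes an already-running vLLM OpenAI-compatible server at
+# BASE_URL; the default host "vllm" is presumably a compose/cluster service name.
+# Example (assumed invocation): start it separately with
+#   vllm serve Qwen/Qwen2.5-VL-7B-Instruct --port 8000
+# and point VLLM_HOST/VLLM_PORT here accordingly.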

 class SUT:
     def __init__(
@@ -59,9 +36,8 @@ def __init__(
         # session was killed partway through
         workers=1,
         tensor_parallel_size=8,
-        _load_model=False
+        scenario="offline"
     ):
-        self.proc = None
         self.model_path = model_path or f"Qwen/Qwen2.5-VL-7B-Instruct"

         if not batch_size:
@@ -83,27 +59,27 @@ def __init__(
             self.data_object.UnloadSamplesFromRam,
         )

-        if _load_model: self.load_model()
-        gen_kwargs = {
+        self.num_workers = workers
+        self.params = {
             "temperature": 0.0,
-            "top_p": 1,
-            "top_k": 1,
-            "seed": 42,
             "max_tokens": 1024,
         }
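+        # Greedy decoding capped at 1024 output tokens; these settings feed both the
+        # offline SamplingParams below and the server-scenario chat.completions call.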
-        self.max_tokens = 1024
-        self.temperature = 0.0
-        # self.sampling_params = SamplingParams(**gen_kwargs)
-        self.sampling_params = gen_kwargs
-        # self.sampling_params.all_stop_token_ids.add(self.model.get_tokenizer().eos_token_id)
+
+        if scenario == "offline":
+            from vllm import SamplingParams
+            from transformers import AutoProcessor

-        self.num_workers = workers
-        self.worker_threads = [None] * self.num_workers
-        self.query_queue = queue.Queue()
+            self.load_model()
+            self.sampling_params = SamplingParams(**self.params)
+            self.processor = AutoProcessor.from_pretrained(self.model_path)
+            self.request_id_counter = 0
+
+            self.worker_threads = [None] * self.num_workers
+            self.query_queue = queue.Queue()

-        self.use_cached_outputs = use_cached_outputs
-        self.sample_counter = 0
-        self.sample_counter_lock = threading.Lock()
+            self.use_cached_outputs = use_cached_outputs
+            self.sample_counter = 0
+            self.sample_counter_lock = threading.Lock()
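+            # Offline scenario keeps everything in-process: worker threads drain
+            # query_queue and call the local vLLM engine. The server scenario skips
+            # this block and lets SUTServer stream requests over HTTP instead.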

     def start(self):
         # Create worker threads
@@ -122,24 +98,37 @@ def stop(self):
     def process_queries(self):
         """Processor of the queued queries. User may choose to add batching logic"""
         while True:
-            qitem = self.query_queue.get()
-            if qitem is None:
+            qitems = self.query_queue.get()
+            if qitems is None:
                 break

-            query_ids = [q.index for q in qitem]
+            query_ids = [q.index for q in qitems]

             tik1 = time.time()

-            input_ids_tensor = [
-                self.data_object.input_ids[q.index] for q in qitem]
-            # input_text_tensor = [
-            #     self.data_object.input[q.index] for q in qitem]
-            # for in_text in input_text_tensor:
-            #     log.info(f"Input: {in_text}")
-
+            prompts = []
+            for item in qitems:
+                question = self.data_object.prompts[item.index]
+
+                placeholders = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64img}"}} for b64img in self.data_object.images[item.index]]
+                messages = [
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": [*placeholders, {"type": "text", "text": question}]},
+                ]
+
+                prompt = self.processor.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts.append({
+                    "prompt": prompt,
+                    "multi_modal_data": {"image": self.data_object.images[item.index]}
+                })
+
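+            # Each entry pairs the rendered chat-template prompt with the images under
+            # "multi_modal_data", the prompt-dict form vLLM's generate() accepts for
+            # multimodal inputs.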
             tik2 = time.time()
             outputs = self.model.generate(
-                prompt_token_ids=input_ids_tensor, sampling_params=self.sampling_params
+                prompts=prompts, sampling_params=self.sampling_params
             )
             pred_output_tokens = []
             for output in outputs:
@@ -151,14 +140,14 @@ def process_queries(self):
                 pred_output_tokens,
                 query_id_list=query_ids,
             )
-            for i in range(len(qitem)):
+            for i in range(len(qitems)):
                 n_tokens = processed_output[i].shape[0]
                 response_array = array.array(
                     "B", processed_output[i].tobytes())
                 bi = response_array.buffer_info()
                 response = [
                     lg.QuerySampleResponse(
-                        qitem[i].id,
+                        qitems[i].id,
                         bi[0],
                         bi[1],
                         n_tokens)]
@@ -167,7 +156,7 @@ def process_queries(self):
             tok = time.time()

             with self.sample_counter_lock:
-                self.sample_counter += len(qitem)
+                self.sample_counter += len(qitems)
                 log.info(f"Samples run: {self.sample_counter}")
                 if tik1:
                     log.info(f"\tBatchMaker time: {tik2 - tik1}")
@@ -176,12 +165,13 @@ def process_queries(self):
                     log.info(f"\t==== Total time: {tok - tik1}")

     def load_model(self):
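+        # Deferred import: vllm is only needed when the offline scenario runs the
+        # engine in-process.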
+        from vllm import LLM
         log.info("Loading model...")
-        # self.model = LLM(
-        #     self.model_path,
-        #     dtype=self.dtype,
-        #     tensor_parallel_size=self.tensor_parallel_size,
-        # )
+        self.model = LLM(
+            self.model_path,
+            dtype=self.dtype,
+            tensor_parallel_size=self.tensor_parallel_size,
+        )
         log.info("Loaded model")

     def get_sut(self):
@@ -209,74 +199,6 @@ def issue_queries(self, query_samples):
     def flush_queries(self):
         pass

-    def start_vllm_server(self, model: str, host: str, port: int, extra_args: list[str]) -> subprocess.Popen:
-        """
-        Launch vLLM's OpenAI-compatible server as a subprocess.
-        Returns a Popen handle.
-        """
-        cmd = [
-            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
-            "--model", model,
-            "--host", host,
-            "--port", str(port),
-        ] + extra_args
-
-        # Inherit stdout/stderr so you can see logs in your terminal.
-        # (If you prefer, redirect to a file or PIPE.)
-        print("Launching vLLM server:\n", " ".join(cmd))
-        proc = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stderr)
-        return proc
-
-    def wait_until_ready(self, timeout_s: int = 300, poll_interval_s: float = 0.5) -> None:
-        """
-        Poll one or more health endpoints until HTTP 200, or raise on timeout.
-        """
-        start = time.time()
-        last_err = None
-        while time.time() - start < timeout_s:
-            for url in HEALTH_URLS:
-                try:
-                    r = requests.get(url, timeout=2)
-                    if r.status_code == 200:
-                        # For /v1/models, ensure JSON is present (indicates API is fully up)
-                        if url.endswith("/v1/models"):
-                            with suppress(Exception):
-                                _ = r.json()
-                        print(f"Server ready at {url}")
-                        return
-                except Exception as e:
-                    last_err = e
-            time.sleep(poll_interval_s)
-
-        raise TimeoutError(f"vLLM server didn't become ready within {timeout_s}s. Last error: {last_err}")
-
-
-    def terminate_process(self, proc: subprocess.Popen, grace_s: int = 15) -> None:
-        """
-        Try to stop the server gracefully, then force-kill if needed.
-        Cross-platform friendly.
-        """
-        if proc.poll() is not None:
-            return  # already exited
-
-        try:
-            # POSIX: try SIGINT first (clean shutdown), then SIGTERM, then SIGKILL.
-            proc.send_signal(signal.SIGINT)
-            try:
-                proc.wait(timeout=grace_s)
-                return
-            except subprocess.TimeoutExpired:
-                pass
-
-            proc.terminate()
-            try:
-                proc.wait(timeout=5)
-            except subprocess.TimeoutExpired:
-                proc.kill()
-        finally:
-            with suppress(Exception):
-                proc.wait(timeout=2)
-

     def __del__(self):
         pass
@@ -285,51 +207,32 @@ def __del__(self):
 class SUTServer(SUT):
     def __init__(
         self,
-        # ... same arguments as before
         model_path=None,
         dtype="bfloat16",
         total_sample_count=13368,
         dataset_path=None,
         batch_size=None,
         workers=1,
-        tensor_parallel_size=8
+        tensor_parallel_size=8,
+        scenario="offline"
     ):
-        # We call a modified super().__init__ that doesn't load the model yet
-        # because model loading needs to be async.
-        # This is a bit of a simplification; you might need to adjust the base SUT init.
-        # For this example, let's assume the base init can be called without loading the model.
         super().__init__(
             model_path=model_path,
+            batch_size=batch_size,
             dtype=dtype,
             total_sample_count=total_sample_count,
             dataset_path=dataset_path,
             workers=workers,
             tensor_parallel_size=tensor_parallel_size,
-            # Add a flag to skip model loading in the base class constructor
-            _load_model=False
+            scenario=scenario
         )
-        self.request_id_counter = 0
-        client = AsyncOpenAI(
+        self._client = AsyncOpenAI(
             base_url=BASE_URL,
             api_key="EMPTY"
         )
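+        # api_key is a placeholder; a vLLM OpenAI-compatible endpoint typically
+        # accepts any key unless the server was started with --api-key.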
-        self._client = client
-        # This will be the single, long-running asyncio event loop
-        self.event_loop = None
-        self.event_loop_thread = None
-
-        # We'll use an asyncio.Queue to communicate between the issue_queries thread
-        # and our main async event loop.
-        self.async_query_queue = None


     def start(self):
-        # self.proc = self.start_vllm_server(MODEL, HOST, PORT, EXTRA_ARGS)
-        # self.wait_until_ready()
-
-        # # Optional: print the models list to confirm we're talking to the right thing
-        # r = requests.get(f"{BASE_URL}/models", timeout=3)
-        # print("\n[Models]", r.json())
         pass


@@ -350,20 +253,15 @@ async def _issue_one(

         messages = [{"role": "user", "content": contents}]

-        params = dict(
-            model=self.model_path,
-            max_tokens=self.max_tokens,
-            temperature=self.temperature
-        )
-
         async with semaphore:
             ttft_set = False

             # await the async creation; ask for a streaming iterator
             stream = await self._client.chat.completions.create(
                 stream=True,
                 messages=messages,
-                **params
+                model=self.model_path,
+                **self.params
             )
             out = []
             # iterate asynchronously
@@ -416,8 +314,4 @@ def issue_queries(self, query_samples):
         asyncio.run(self._issue_queries_async(query_samples))

     def stop(self):
-        # if self.proc is not None:
-        #     print("\nShutting down vLLM server…")
-        #     self.terminate_process(self.proc)
-        #     print("Done.")
         pass