
Commit cf27543

enhance multi-turn chat feature, update loadgen dispatcher
- The request dispatcher now supports assigning a request to a specific worker. - Multi-turn chat is enhanced with load balancing at both the worker and the user-session level. - LazyLoadInferenceAPIData is introduced to standardize the lazy loading of inference data. This replaces the previous implementation and provides a cleaner, extensible design for data handling between the data generator, load generator, and API data layers.
1 parent e74e507 commit cf27543

File tree

14 files changed, +217 −63 lines changed


README.md

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ Inference Perf is a GenAI inference performance benchmarking tool that allows yo
 * Supports benchmarking large deployments with frameworks like [llm-d](https://llm-d.ai/), [Dynamo](https://docs.nvidia.com/dynamo/latest/) and [Inference Gateway](https://gateway-api-inference-extension.sigs.k8s.io/).
 * Supports specifying an exact input and output distribution to simulate different scenarios - Gaussian distribution, fixed length, min-max cases are all supported.
 * Generates different load patterns and can benchmark specific cases like burst traffic, scaling to saturation and other autoscaling / routing scenarios.
+* Supports multi-turn chat conversations: it keeps the context of a series of messages to simulate a conversation, and the request in each chat round carries the previous messages as a prefix. See the example [config-multi-turn](examples/vllm/config-shared-prefix-multi-turn.yml).

 ## Roadmap

examples/vllm/config-shared-prefix-multi-turn.yml

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+load:
+  type: constant
+  num_workers: 2
+  worker_max_concurrency: 10
+  stages:
+  - rate: 5
+    duration: 10
+api:
+  type: completion
+server:
+  type: vllm
+  model_name: HuggingFaceTB/SmolLM2-135M-Instruct
+  base_url: http://0.0.0.0:8000
+  ignore_eos: true
+tokenizer:
+  pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
+data:
+  type: shared_prefix
+  shared_prefix:
+    num_groups: 2  # Number of distinct users
+    num_prompts_per_group: 25  # Number of unique questions per user
+    system_prompt_len: 100  # Length of the first prefix (in tokens); simulates initialization of a system prompt
+    question_len: 50  # Length of the unique question part (in tokens)
+    output_len: 50  # Target length for the model's generated output (in tokens)
+    enable_multi_turn_chat: true  # Enable multi-turn chat and create a user session for each group. The chat context is appended to each request in the group.
+metrics:
+  type: prometheus
+  prometheus:
+    url: http://localhost:9090
+    scrape_interval: 15
+report:
+  request_lifecycle:
+    summary: true
+    per_stage: true
+    per_request: true
inference_perf/apis/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .base import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo
+from .base import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo, LazyLoadInferenceAPIData
 from .chat import ChatCompletionAPIData, ChatMessage
 from .completion import CompletionAPIData

 __all__ = [
     "InferenceAPIData",
+    "LazyLoadInferenceAPIData",
     "InferenceInfo",
     "RequestLifecycleMetric",
     "ErrorResponseInfo",
inference_perf/apis/base.py

Lines changed: 31 additions & 0 deletions
@@ -44,6 +44,9 @@ class RequestLifecycleMetric(BaseModel):


 class InferenceAPIData(BaseModel):
+    # loadgen should assign this request to its preferred worker if possible
+    prefered_worker_id: int = -1  # no preferred worker by default
+
     @abstractmethod
     def get_api_type(self) -> APIType:
         raise NotImplementedError
@@ -64,3 +67,31 @@ async def process_failure(
         self, response: Optional[ClientResponse], config: APIConfig, tokenizer: CustomTokenizer, exception: Exception
     ) -> Optional[InferenceInfo]:
         pass  # no-op by default
+
+
+class LazyLoadInferenceAPIData(InferenceAPIData):
+    """
+    InferenceAPIData that loads its data lazily.
+    This is useful for multiprocessing, where the data cannot be pickled or needs to be initialized in the worker process.
+    This class should not carry the data itself, only the payload the data generator needs to return the real API data later.
+    In most cases, the generator should rely on data_index as the reference; if more payload is needed, extend this class.
+    """
+
+    data_index: int
+
+    def get_api_type(self) -> APIType:
+        raise NotImplementedError("LazyLoadInferenceAPIData doesn't support this operation")
+
+    def get_route(self) -> str:
+        raise NotImplementedError("LazyLoadInferenceAPIData doesn't support this operation")
+
+    async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, streaming: bool) -> dict[str, Any]:
+        raise NotImplementedError("LazyLoadInferenceAPIData doesn't support this operation")
+
+    async def process_response(self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer) -> InferenceInfo:
+        raise NotImplementedError("LazyLoadInferenceAPIData doesn't support this operation")
+
+    async def process_failure(
+        self, response: Optional[ClientResponse], config: APIConfig, tokenizer: CustomTokenizer, exception: Exception
+    ) -> Optional[InferenceInfo]:
+        raise NotImplementedError("LazyLoadInferenceAPIData doesn't support this operation")
inference_perf/apis/completion.py

Lines changed: 3 additions & 3 deletions
@@ -26,7 +26,7 @@
 class CompletionAPIData(InferenceAPIData):
     prompt: str
     max_tokens: int = 0
-    output_token: str = ""
+    model_response: str = ""

     def get_api_type(self) -> APIType:
         return APIType.Completion
@@ -63,7 +63,7 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to
             output_text += text
         prompt_len = tokenizer.count_tokens(self.prompt)
         output_len = tokenizer.count_tokens(output_text)
-        self.output_token = output_text
+        self.model_response = output_text
         return InferenceInfo(
             input_tokens=prompt_len,
             output_tokens=output_len,
@@ -77,5 +77,5 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to
             return InferenceInfo(input_tokens=prompt_len)
         output_text = choices[0].get("text", "")
         output_len = tokenizer.count_tokens(output_text)
-        self.output_token = output_text
+        self.model_response = output_text
         return InferenceInfo(input_tokens=prompt_len, output_tokens=output_len)
inference_perf/apis/user_session.py

Lines changed: 6 additions & 5 deletions
@@ -1,6 +1,6 @@
 import logging
 import asyncio
-from typing import Any, Optional, Tuple
+from typing import Any, Optional
 from pydantic import ConfigDict, Field

 from aiohttp import ClientResponse
@@ -20,13 +20,13 @@ def __init__(self, user_session_id: str, context: str = ""):
         self.contexts = context if context else ""
         self._current_round = 0
         self._in_flight: asyncio.Lock = asyncio.Lock()
-        self._waiting_rounds: asyncio.PriorityQueue[Tuple[int, asyncio.Future[bool]]] = asyncio.PriorityQueue()
+        self._waiting_rounds: asyncio.Queue[asyncio.Future[bool]] = asyncio.Queue()

     async def get_context(self, round: int) -> str:
         if not self._waiting_rounds.empty() or self._in_flight.locked():
             # entering waiting queue
             future: asyncio.Future[bool] = asyncio.Future()
-            self._waiting_rounds.put_nowait((round, future))
+            self._waiting_rounds.put_nowait(future)
             await future
         await self._in_flight.acquire()
         self._current_round += 1
@@ -36,7 +36,7 @@ def update_context(self, response: str) -> None:
         self.contexts = response

         if not self._waiting_rounds.empty():
-            _, future = self._waiting_rounds.get_nowait()
+            future = self._waiting_rounds.get_nowait()
             future.set_result(True)

         self._in_flight.release()
@@ -49,6 +49,7 @@ class UserSessionCompletionAPIData(CompletionAPIData):

     async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, streaming: bool) -> dict[str, Any]:
         self._session_context = await self.user_session.get_context(self.target_round)
+        # TODO: Currently only the prompt style (concatenated messages) is supported. Add support for messages-style payloads.
         self.prompt = self._session_context + " " + self.prompt
         # TODO: The combined prompt (session context + current prompt) might exceed the model's
         # maximum sequence length. Implement truncation logic/strategy to prevent
@@ -62,7 +63,7 @@ def update_inference_info(self, inference_info: InferenceInfo) -> None:
     async def process_response(self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer) -> InferenceInfo:
         inference_info = await super().process_response(response, config, tokenizer)
         self.update_inference_info(inference_info)
-        self.user_session.update_context(self.prompt + " " + self.output_token)
+        self.user_session.update_context(self.prompt + " " + self.model_response)
         return inference_info

     async def process_failure(
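The session keeps the rounds of one conversation in order: a request either takes the in-flight lock or parks a Future in the waiting queue, and update_context wakes exactly one waiter after the previous response has been folded into the context. Below is a minimal standalone asyncio sketch of that waiting pattern; it is a simplified stand-in, not the project's UserSession class, and drops the round bookkeeping.

import asyncio

# Simplified sketch of the UserSession waiting pattern: one request per session
# is in flight at a time; later rounds wait on a Future until the earlier
# round's response has been appended to the shared context.
class TinySession:
    def __init__(self) -> None:
        self.context = ""
        self._in_flight = asyncio.Lock()
        self._waiting: asyncio.Queue[asyncio.Future[bool]] = asyncio.Queue()

    async def get_context(self) -> str:
        if not self._waiting.empty() or self._in_flight.locked():
            fut: asyncio.Future[bool] = asyncio.Future()
            self._waiting.put_nowait(fut)
            await fut                       # park until the previous round finishes
        await self._in_flight.acquire()
        return self.context

    def update_context(self, new_context: str) -> None:
        self.context = new_context
        if not self._waiting.empty():
            self._waiting.get_nowait().set_result(True)  # wake the next round
        self._in_flight.release()

async def round_trip(session: TinySession, question: str) -> None:
    prefix = await session.get_context()
    await asyncio.sleep(0.01)               # simulate the in-flight request
    prompt = (prefix + " " + question).strip()
    answer = f"answer-to[{question}]"       # stand-in for the model response
    session.update_context(prompt + " " + answer)

async def main() -> None:
    s = TinySession()
    await asyncio.gather(*(round_trip(s, f"q{i}") for i in range(3)))
    print(s.context)                        # rounds q0, q1, q2 appended in order

asyncio.run(main())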

inference_perf/config.py

Lines changed: 1 addition & 2 deletions
@@ -59,8 +59,7 @@ class SharedPrefix(BaseModel):
     system_prompt_len: int = 100
     question_len: int = 50
     output_len: int = 50
-    # create user session for each group. The chat context will be appended for the each request in the group.
-    group_as_user_session: bool = False
+    enable_multi_turn_chat: bool = False


 class DataConfig(BaseModel):
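The old group_as_user_session flag is replaced by enable_multi_turn_chat, so existing configs need the new key under shared_prefix. A tiny sketch of the renamed field in use, assuming the other SharedPrefix fields keep their defaults as shown in the diff:

from inference_perf.config import SharedPrefix

# The renamed flag defaults to False, so single-turn behavior is unchanged
# unless a config opts in explicitly.
cfg = SharedPrefix(enable_multi_turn_chat=True)
print(cfg.enable_multi_turn_chat)  # True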

inference_perf/datagen/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .base import DataGenerator
+from .base import DataGenerator, LazyLoadDataMixin
 from .mock_datagen import MockDataGenerator
 from .hf_sharegpt_datagen import HFShareGPTDataGenerator
 from .synthetic_datagen import SyntheticDataGenerator
@@ -23,6 +23,7 @@

 __all__ = [
     "DataGenerator",
+    "LazyLoadDataMixin",
     "MockDataGenerator",
     "HFShareGPTDataGenerator",
     "SyntheticDataGenerator",

inference_perf/datagen/base.py

Lines changed: 31 additions & 1 deletion
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from inference_perf.apis import InferenceAPIData
+from inference_perf.apis import InferenceAPIData, LazyLoadInferenceAPIData
 from inference_perf.utils.custom_tokenizer import CustomTokenizer
 from inference_perf.config import APIConfig, APIType, DataConfig, Distribution, SharedPrefix
 from abc import ABC, abstractmethod
@@ -64,3 +64,33 @@ def is_io_distribution_supported(self) -> bool:
     @abstractmethod
     def is_shared_prefix_supported(self) -> bool:
         raise NotImplementedError
+
+    # notify the loadgen whether this request has a preferred worker
+    def is_prefered_worker_requested(self) -> bool:
+        return False
+
+
+class LazyLoadDataMixin(ABC):
+    """
+    Mixin for data generators that support lazy loading of InferenceAPIData.
+    This is useful for multiprocessing where the actual InferenceAPIData objects
+    might be large or unpickleable, or need to be initialized in the worker process.
+    """
+
+    @abstractmethod
+    def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
+        """
+        Returns the real InferenceAPIData object for the given lazy-load placeholder.
+        This method is usually called by worker processes to lazily load data, unless MP mode is disabled.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def get_request(data_generator: DataGenerator, data: InferenceAPIData) -> InferenceAPIData:
+        if isinstance(data, LazyLoadInferenceAPIData):
+            if isinstance(data_generator, LazyLoadDataMixin):
+                return data_generator.load_lazy_data(data)
+            else:
+                raise NotImplementedError("Data Generator doesn't support lazy loading of the requested InferenceAPIData")
+        else:
+            return data
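Putting the pieces together, the intended flow is: the generator yields lightweight LazyLoadInferenceAPIData placeholders, the placeholders cross the process boundary, and the worker resolves them against its own generator instance via LazyLoadDataMixin.get_request. A minimal sketch of that call pattern follows; handle_in_worker is an illustrative name, not the loadgen's actual code.

from inference_perf.apis import InferenceAPIData
from inference_perf.datagen import DataGenerator, LazyLoadDataMixin

# Illustrative worker-side resolution of a lazy placeholder. The worker owns its
# own data_generator instance, so only the small placeholder has to be pickled
# and sent across the process boundary.
def handle_in_worker(data_generator: DataGenerator, item: InferenceAPIData) -> InferenceAPIData:
    # Works for both lazy placeholders and fully materialized requests:
    # get_request() returns `item` unchanged unless it is a LazyLoadInferenceAPIData.
    request = LazyLoadDataMixin.get_request(data_generator, item)
    return request  # ready for to_payload() / process_response()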

inference_perf/datagen/random_datagen.py

Lines changed: 5 additions & 14 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-from inference_perf.apis import InferenceAPIData, CompletionAPIData
+from inference_perf.apis import InferenceAPIData, CompletionAPIData, LazyLoadInferenceAPIData
 from inference_perf.utils.custom_tokenizer import CustomTokenizer
 from inference_perf.utils.distribution import generate_distribution
 from .base import DataGenerator
@@ -80,7 +80,9 @@ def is_io_distribution_supported(self) -> bool:
     def is_shared_prefix_supported(self) -> bool:
         return False

-    def get_request(self, n: int) -> InferenceAPIData:
+    def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
+        n = data.data_index
+
         if self.tokenizer is None:
             raise ValueError("Tokenizer is required for RandomDataGenerator")

@@ -99,16 +101,5 @@ def get_data(self) -> Generator[InferenceAPIData, None, None]:

         i = 0
         while True:
-            prompt_text: str
-            if self.input_lengths[i] <= 0:
-                random_token_ids_list = []
-            else:
-                random_token_ids = np.random.randint(0, self.vocab_size, size=self.input_lengths[i], dtype=np.int64)
-                random_token_ids_list = random_token_ids.tolist()
-            prompt_text = self.tokenizer.get_tokenizer().decode(random_token_ids_list)
-
-            yield CompletionAPIData(
-                prompt=prompt_text,
-                max_tokens=self.output_lengths[i],
-            )
+            yield LazyLoadInferenceAPIData(data_index=i)
             i += 1
