
Commit f2a8b1a

adjust number of users for multi-turn chat
* Number of users = Number of groups * Number of prompts per group
1 parent cf27543 commit f2a8b1a
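For illustration, a minimal sketch of that formula using the values from the example config changed in this commit (num_groups: 2, num_prompts_per_group: 10):

```python
# Minimal sketch of the relationship stated above, using the values from the
# example config touched by this commit.
num_groups = 2               # number of distinct shared prefixes
num_prompts_per_group = 10   # unique questions per group
num_users = num_groups * num_prompts_per_group
assert num_users == 20       # matches the new `rate: 20` stage in the example config
```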

File tree

4 files changed: +31 -28 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ Inference Perf is a GenAI inference performance benchmarking tool that allows yo
 * Supports benchmarking large deployments with frameworks like [llm-d](https://llm-d.ai/), [Dynamo](https://docs.nvidia.com/dynamo/latest/) and [Inference Gateway](https://gateway-api-inference-extension.sigs.k8s.io/).
 * Supports specifying an exact input and output distribution to simulate different scenarios - Gaussian distribution, fixed length, min-max cases are all supported.
 * Generates different load patterns and can benchmark specific cases like burst traffic, scaling to saturation and other autoscaling / routing scenarios.
-* Supprots Multi-turn chat conversations, it can keep context of a series of messages to simulate a conversation. A request in each chat round will keep previouse messages as prefix. see example [config-multi-turn](examples/vllm/config-shared-prefix-multi-turn.yml)
+* Supports Multi-turn chat conversations, it can keep context of a series of messages to simulate a conversation. A request in each chat round will keep previouse messages as prefix. see example [config-multi-turn](examples/vllm/config-shared-prefix-multi-turn.yml)

 ## Roadmap


examples/vllm/config-shared-prefix-multi-turn.yml

Lines changed: 5 additions & 5 deletions
@@ -3,8 +3,8 @@ load:
   num_workers: 2
   worker_max_concurrency: 10
   stages:
-  - rate: 5
-    duration: 10
+  - rate: 20 # Send all 20 users' requests per second
+    duration: 5
 api:
   type: completion
 server:
@@ -17,12 +17,12 @@ tokenizer:
 data:
   type: shared_prefix
   shared_prefix:
-    num_groups: 2 # Number of distinct users
-    num_prompts_per_group: 25 # Number of unique questions per user
+    num_groups: 2 # Number of distinct prefix, Note: the number of users is num_groups * num_prompts_per_group
+    num_prompts_per_group: 10 # Number of unique questions per group (prefix)
     system_prompt_len: 100 # Length of the first prefix (in tokens), simulate initialization of a system prompt
     question_len: 50 # Length of the unique question part (in tokens)
     output_len: 50 # Target length for the model's generated output (in tokens)
-    enable_multi_turn_chat: true # enable multi-turn chat, create user session for each group. The chat context will be appended for the each request in the group.
+    enable_multi_turn_chat: true # enable multi-turn chat, it will create user session to keep the conversation. The chat context will be appended for the each request.
 metrics:
   type: prometheus
   prometheus:
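Assuming the stage sends `rate` requests per second for `duration` seconds and the data generator cycles requests across all user sessions (as in the datagen change below), a rough sketch of what the updated stage implies:

```python
# Rough sketch under the assumptions stated above; not taken from the tool itself.
rate, duration = 20, 5
num_user_sessions = 2 * 10               # num_groups * num_prompts_per_group
total_requests = rate * duration         # 100 requests in this stage
rounds_per_session = total_requests // num_user_sessions
print(rounds_per_session)                # -> 5 chat rounds per user session
```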

inference_perf/apis/user_session.py

Lines changed: 5 additions & 0 deletions
@@ -74,3 +74,8 @@ async def process_failure(
         self.update_inference_info(inference_info)
         self.user_session.update_context(self._session_context)
         return inference_info
+
+
+# TODO: UserSessionChatAPIData need to be implemented
+# class UserSessionChatAPIData(ChatCompletionAPIData):
+#     ...
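The TODO above only reserves a name; nothing chat-specific lands in this commit. Purely as an illustration of what a chat-style session would have to do (the class and fields below are hypothetical, not part of inference_perf), a multi-turn request could accumulate prior messages like this:

```python
from dataclasses import dataclass, field

# Hypothetical illustration only; the eventual UserSessionChatAPIData in
# inference_perf may look nothing like this.
@dataclass
class HypotheticalChatSession:
    system_prompt: str
    messages: list[dict[str, str]] = field(default_factory=list)

    def next_request(self, question: str) -> list[dict[str, str]]:
        # Each round sends the full prior conversation plus the new user question,
        # mirroring how the completion path keeps previous messages as a prefix.
        self.messages.append({"role": "user", "content": question})
        return [{"role": "system", "content": self.system_prompt}, *self.messages]

    def record_response(self, answer: str) -> None:
        # Persist the assistant reply so the next round includes it as context.
        self.messages.append({"role": "assistant", "content": answer})
```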

inference_perf/datagen/shared_prefix_datagen.py

Lines changed: 20 additions & 22 deletions
@@ -65,8 +65,8 @@ def is_prefered_worker_requested(self) -> bool:
     def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
         i = data.data_index % len(self.prompts)
         if self.enable_multi_turn_chat:
-            user_id = data.data_index % self.num_groups
-            round = data.data_index // self.num_groups
+            user_id = data.data_index % len(self.user_sessions)
+            round = data.data_index // len(self.user_sessions)
             return UserSessionCompletionAPIData(
                 prompt=self.prompts[i],
                 max_tokens=self.output_len,
@@ -106,26 +106,24 @@ def _generate_prompts(self) -> None:
             shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len)
             shared_prefix_text = hf_tokenizer.decode(shared_prefix_token_ids, skip_special_tokens=True)

-            if self.enable_multi_turn_chat:
-                # Create user session and store prefix as context (system prompt)
-                self.user_sessions.append(
-                    LocalUserSession(user_session_id=f"user_session_{group_id}", context=shared_prefix_text)
-                )
-                for _ in range(self.num_prompts_per_group):
-                    question_token_ids = self._generate_random_token_ids(self.question_len)
-                    question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
-                    # store question only as each round's prompt
-                    self.prompts.append(question_text)
-            else:
-                for _ in range(self.num_prompts_per_group):
-                    # Generate a unique question
-                    question_token_ids = self._generate_random_token_ids(self.question_len)
-                    question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
-
-                    # Combine shared prefix and question
-                    full_prompt_text = shared_prefix_text + " " + question_text
-
-                    self.prompts.append(full_prompt_text)
+            for prompt_id in range(self.num_prompts_per_group):
+                # Generate a unique question
+                question_token_ids = self._generate_random_token_ids(self.question_len)
+                question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
+
+                if self.enable_multi_turn_chat:
+                    # multi turn chat, create user to keep conversation
+                    self.user_sessions.append(
+                        LocalUserSession(
+                            user_session_id=f"user_session_{self.num_prompts_per_group * group_id + prompt_id}",
+                            context=shared_prefix_text,
+                        )
+                    )
+                else:
+                    # Single turn chat, Combine shared prefix and question
+                    question_text = shared_prefix_text + " " + question_text
+
+                self.prompts.append(question_text)

         # Shuffle the generated prompts to ensure randomness if served sequentially by different workers
         random.shuffle(self.prompts)
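To make the new mapping concrete, here is a small self-contained sketch of how `data_index` is split into a user session and a chat round under the updated logic (20 sessions when num_groups=2 and num_prompts_per_group=10):

```python
# Minimal sketch mirroring the updated load_lazy_data indexing: consecutive
# data_index values cycle through every user session before starting the next
# chat round for each of them.
num_user_sessions = 2 * 10  # num_groups * num_prompts_per_group = 20 sessions

def map_index(data_index: int) -> tuple[int, int]:
    user_id = data_index % num_user_sessions      # which session receives the request
    chat_round = data_index // num_user_sessions  # which round of that session's chat
    return user_id, chat_round

assert map_index(0) == (0, 0)    # first request: session 0, round 0
assert map_index(19) == (19, 0)  # every session gets a round-0 request first
assert map_index(20) == (0, 1)   # then session 0 begins its second round
```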
