[
  {
    "test_name": "llama8B_tp1_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {
      "dtype": "bfloat16"
    },
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {
      "endpoint": "/generate_stream"
    },
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {
      "endpoint": "/v2/models/ensemble/generate_stream"
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "disable_log_requests": "",
      "gpu_memory_utilization": 0.9,
      "num_scheduler_steps": 10,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  }
]