@@ -91,7 +91,7 @@ class InputRequest:
91
91
@dataclass
92
92
class RequestFuncOutput :
93
93
input_request : InputRequest = None
94
- generated_text : str = ""
94
+ generated_token_list : list [ str ] = None
95
95
success : bool = False
96
96
latency : float = 0
97
97
ttft : float = 0
@@ -102,7 +102,7 @@ def to_dict(self):
102
102
return {
103
103
"prompt" : self .input_request .prompt ,
104
104
"original_output" : self .input_request .output ,
105
- "generated_text " : self .generated_text ,
105
+ "generated_token_list " : self .generated_token_list ,
106
106
"success" : self .success ,
107
107
"latency" : self .latency ,
108
108
"prompt_len" : self .prompt_len
@@ -207,9 +207,9 @@ def calculate_metrics(
207
207
for i in range (len (outputs )):
208
208
if outputs [i ].success :
209
209
output_len = len (
210
- tokenizer . tokenize ( outputs [i ].generated_text )
210
+ outputs [i ].generated_token_list
211
211
if tokenizer != "test"
212
- else "ĊŌƟ"
212
+ else [ "Ċ" , "Ō" , "Ɵ" ]
213
213
)
214
214
total_output += output_len
215
215
total_input += input_requests [i ].prompt_len
@@ -235,7 +235,7 @@ def calculate_metrics(
235
235
return metrics
236
236
237
237
238
- def grpc_sync_request (api_url : str , request : Any ) -> tuple [str , float , float ]:
238
+ def grpc_sync_request (api_url : str , request : Any ) -> tuple [list [ str ] , float , float ]:
239
239
"""Send grpc synchronous request since the current grpc server is sync."""
240
240
with grpc .insecure_channel (api_url ) as channel :
241
241
grpc .channel_ready_future (channel ).result ()
@@ -250,8 +250,7 @@ def grpc_sync_request(api_url: str, request: Any) -> tuple[str, float, float]:
250
250
ttft = time .perf_counter () - request_start_time
251
251
token_list .append (token .response [0 ])
252
252
latency = time .perf_counter () - request_start_time
253
- generated_text = "" .join (token_list )
254
- return generated_text , ttft , latency
253
+ return token_list , ttft , latency
255
254
256
255
257
256
async def send_request (
@@ -274,12 +273,12 @@ async def send_request(
274
273
output = RequestFuncOutput ()
275
274
output .input_request = input_request
276
275
output .prompt_len = input_request .prompt_len
277
- generated_text , ttft , latency = await loop .run_in_executor (
276
+ generated_token_list , ttft , latency = await loop .run_in_executor (
278
277
None , grpc_sync_request , api_url , request
279
278
)
280
279
output .ttft = ttft
281
280
output .latency = latency
282
- output .generated_text = generated_text
281
+ output .generated_token_list = generated_token_list
283
282
output .success = True
284
283
if pbar :
285
284
pbar .update (1 )
0 commit comments