66| ERNIE-4.5-300B-A47B| 32K| WINT8| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
77| ERNIE-4.5-300B-A47B| 32K| WINT4| 4 (Recommended)| export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 4 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
88| ERNIE-4.5-300B-A47B| 32K| WINT4| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.95 \ <br > --load-choices "default"| 2.3.0|
9- | ERNIE-4.5-300B-A47B| 128K| WINT4| 8 (Recommended) | export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
9+ | ERNIE-4.5-300B-A47B| 128K| WINT4| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 64 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1010| ERNIE-4.5-21B-A3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1111| ERNIE-4.5-21B-A3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
12- | ERNIE-4.5-21B-A3B| 32K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
12+ | ERNIE-4.5-21B-A3B| 32K| WINT4| 1 (Recommended) | export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1313| ERNIE-4.5-21B-A3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1414| ERNIE-4.5-21B-A3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
15- | ERNIE-4.5-21B-A3B| 128K| WINT4| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
15+ | ERNIE-4.5-21B-A3B| 128K| WINT4| 1 (Recommended) | export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint4" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1616| ERNIE-4.5-0.3B| 32K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
17- | ERNIE-4.5-0.3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
17+ | ERNIE-4.5-0.3B| 32K| WINT8| 1 (Recommended) | export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
1818| ERNIE-4.5-0.3B| 128K| BF16| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
19- | ERNIE-4.5-0.3B| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
19+ | ERNIE-4.5-0.3B| 128K| WINT8| 1 (Recommended) | export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --max-model-len 131072 \ <br > --max-num-seqs 128 \ <br > --quantization "wint8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
2020| ERNIE-4.5-300B-A47B-W4A8C8-TP4| 32K| W4A8| 4| export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-300B-A47B-W4A8C8-TP4-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 4 \ <br > --max-model-len 32768 \ <br > --max-num-seqs 64 \ <br > --quantization "W4A8" \ <br > --gpu-memory-utilization 0.9 \ <br > --load-choices "default"| 2.3.0|
2121| ERNIE-4.5-VL-28B-A3B| 32K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --quantization "wint8" \ <br > --max-model-len 32768 \ <br > --max-num-seqs 10 \ <br > --enable-mm \ <br > --mm-processor-kwargs '{"video_max_frames": 30}' \ <br > --limit-mm-per-prompt '{"image": 10, "video": 3}' \ <br > --reasoning-parser ernie-45-vl \ <br > --load-choices "default"| 2.3.0|
2222| ERNIE-4.5-VL-424B-A47B| 32K| WINT8| 8| export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" <br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-VL-424B-A47B-Paddle \ <br > --port 8188 \ <br > --tensor-parallel-size 8 \ <br > --quantization "wint8" \ <br > --max-model-len 32768 \ <br > --max-num-seqs 8 \ <br > --enable-mm \ <br > --mm-processor-kwargs '{"video_max_frames": 30}' \ <br > --limit-mm-per-prompt '{"image": 10, "video": 3}' \ <br > --reasoning-parser ernie-45-vl \ <br > --load-choices "default"| 2.3.0|
2323| PaddleOCR-VL-0.9B| 32K| BF16| 1| export FD_ENABLE_MAX_PREFILL=1 <br >export XPU_VISIBLE_DEVICES="0" # Specify any card <br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/PaddleOCR-VL \ <br > --port 8188 \ <br > --metrics-port 8181 \ <br > --engine-worker-queue-port 8182 \ <br > --max-model-len 16384 \ <br > --max-num-batched-tokens 16384 \ <br > --gpu-memory-utilization 0.8 \ <br > --max-num-seqs 256| 2.3.0|
24+ | ERNIE-4.5-VL-28B-A3B-Thinking| 128K| WINT8| 1| export XPU_VISIBLE_DEVICES="0" # Specify any card<br >python -m fastdeploy.entrypoints.openai.api_server \ <br > --model PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking \ <br > --port 8188 \ <br > --tensor-parallel-size 1 \ <br > --quantization "wint8" \ <br > --max-model-len 131072 \ <br > --max-num-seqs 32 \ <br > --engine-worker-queue-port 8189 \ <br > --metrics-port 8190 \ <br > --cache-queue-port 8191 \ <br > --reasoning-parser ernie-45-vl-thinking \ <br > --tool-call-parser ernie-45-vl-thinking \ <br > --mm-processor-kwargs '{"image_max_pixels": 12845056}' \ <br > --load-choices "default_v1"| 2.3.0|
2425
2526## Quick start
2627
@@ -125,7 +126,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
125126 "messages": [
126127 {"role": "user", "content": [
127128 {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", "detail": "high"}},
128- {"type": "text", "text": "请描述图片内容 "}
129+ {"type": "text", "text": "Please describe the content of the image "}
129130 ]}
130131 ],
131132 "metadata": {"enable_thinking": false}
@@ -144,7 +145,7 @@ response = client.chat.completions.create(
144145 messages = [
145146 {" role" : " user" , " content" : [
146147 {" type" : " image_url" , " image_url" : {" url" : " https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg" , " detail" : " high" }},
147- {" type" : " text" , " text" : " 请描述图片内容 " }
148+ {" type" : " text" , " text" : " Please describe the content of the image " }
148149 ]
149150 },
150151 ],
@@ -237,3 +238,77 @@ for chunk in response:
237238 print (reasoning_content + content, end = ' ' , flush = True )
238239print (' \n ' )
239240```
241+
242+ ### Deploy online serving based on ERNIE-4.5-VL-28B-A3B-Thinking
243+
244+ #### Start service
245+ Deploy the ERNIE-4.5-VL-28B-A3B-Thinking model with WINT8 precision and 128K context length on 1 XPU
246+
247+ ``` bash
# Deploy ERNIE-4.5-VL-28B-A3B-Thinking (WINT8, 128K context) on a single XPU.
export XPU_VISIBLE_DEVICES="0"  # Specify any card
python -m fastdeploy.entrypoints.openai.api_server \
  --model PaddlePaddle/ERNIE-4.5-VL-28B-A3B-Thinking \
  --port 8188 \
  --tensor-parallel-size 1 \
  --quantization "wint8" \
  --max-model-len 131072 \
  --max-num-seqs 32 \
  --engine-worker-queue-port 8189 \
  --metrics-port 8190 \
  --cache-queue-port 8191 \
  --reasoning-parser ernie-45-vl-thinking \
  --tool-call-parser ernie-45-vl-thinking \
  --mm-processor-kwargs '{"image_max_pixels": 12845056}' \
  --load-choices "default_v1"
263+ ```
264+
265+ #### Send requests
266+
267+ ``` bash
268+ curl -X POST " http://0.0.0.0:8188/v1/chat/completions" \
269+ -H " Content-Type: application/json" \
270+ -d ' {
271+ "messages": [
272+ {"role": "user", "content": [
273+ {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", "detail": "high"}},
274+ {"type": "text", "text": "Please describe the content of the image"}
275+ ]}
276+ ],
277+ "metadata": {"enable_thinking": true}
278+ }'
279+ ```
280+
281+ ``` python
import openai

# Endpoint of the FastDeploy OpenAI-compatible API server started above.
ip = "0.0.0.0"
service_http_port = "8188"
client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY")

# Streamed multimodal chat request with thinking enabled.
response = client.chat.completions.create(
    model="default",
    messages=[
        {"role": "user", "content": [
            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg", "detail": "high"}},
            {"type": "text", "text": "Please describe the content of the image"},
        ]},
    ],
    temperature=0.0001,
    max_tokens=10000,
    stream=True,
    top_p=0,
    metadata={"enable_thinking": True},
)

def get_str(content_raw):
    # Normalize a possibly-None streamed delta field to a printable string.
    return str(content_raw) if content_raw is not None else ''

for chunk in response:
    # Skip the initial role-announcement delta; print reasoning and answer
    # text incrementally as chunks arrive.
    if chunk.choices[0].delta is not None and chunk.choices[0].delta.role != 'assistant':
        reasoning_content = get_str(chunk.choices[0].delta.reasoning_content)
        content = get_str(chunk.choices[0].delta.content)
        print(reasoning_content + content, end='', flush=True)
print('\n')
314+ ```
0 commit comments