Skip to content

Commit 6223df4

Browse files
committed
支持 lmcache;升级 sglang 为最新;vllm 后端默认开启 lmcache
1 parent 8e05cd5 commit 6223df4

File tree

5 files changed

+428
-125
lines changed

5 files changed

+428
-125
lines changed

gpt_server/model_backend/sglang_backend.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import asyncio
22
import base64
33
from io import BytesIO
4-
import os
54
from typing import Any, Dict, AsyncGenerator, List, Optional
6-
from fastchat.utils import is_partial_stop
75
from gpt_server.model_backend.base import ModelBackend
86
from loguru import logger
97
from PIL import Image

gpt_server/model_backend/vllm_backend.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
from vllm.lora.request import LoRARequest
99
from transformers import PreTrainedTokenizer
1010
from vllm.entrypoints.chat_utils import (
11-
ConversationMessage,
1211
apply_hf_chat_template,
13-
load_chat_template,
1412
parse_chat_messages_futures,
1513
)
1614
from gpt_server.settings import get_model_config
@@ -40,6 +38,7 @@ def __init__(self, model_path, tokenizer: PreTrainedTokenizer) -> None:
4038
lora_local_path=lora_path,
4139
)
4240
)
41+
from vllm.config.kv_transfer import KVTransferConfig
4342

4443
self.engine_args = AsyncEngineArgs(
4544
model_path,
@@ -53,6 +52,10 @@ def __init__(self, model_path, tokenizer: PreTrainedTokenizer) -> None:
5352
dtype=model_config.dtype,
5453
max_model_len=model_config.max_model_len,
5554
guided_decoding_backend="xgrammar",
55+
# 支持LMCache的KV传输
56+
kv_transfer_config=KVTransferConfig(
57+
kv_connector="LMCacheConnectorV1", kv_role="kv_both"
58+
),
5659
)
5760
self.engine = AsyncLLMEngine.from_engine_args(self.engine_args)
5861
self.tokenizer = tokenizer

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "gpt_server"
3-
version = "0.6.6"
3+
version = "0.6.7"
44
description = "gpt_server是一个用于生产级部署LLMs、Embedding、Reranker、ASR和TTS的开源框架。"
55
readme = "README.md"
66
license = { text = "Apache 2.0" }
@@ -14,7 +14,7 @@ dependencies = [
1414
"infinity-emb[all]==0.0.77",
1515
"lmdeploy==0.10.2",
1616
"loguru>=0.7.2",
17-
"openai==1.99.1",
17+
"openai==2.6.1",
1818
"setuptools==75.2.0",
1919
"streamlit>=1.50.0",
2020
"torch==2.8.0",
@@ -25,12 +25,13 @@ dependencies = [
2525
"modelscope>=1.31.0",
2626
"edge-tts>=7.0.0",
2727
"funasr>=1.2.6",
28-
"sglang[all]>=0.5.4",
28+
"sglang[all]>=0.5.5",
2929
"flashinfer-python",
3030
"flashtts>=0.1.7",
3131
"diffusers>=0.35.2",
3232
"sqlmodel>=0.0.27",
3333
"autoawq>=0.2.9",
34+
"lmcache>=0.3.9.post1",
3435
]
3536

3637
[tool.uv]

0 commit comments

Comments (0)