 import importlib
 import inspect
 import re
-from argparse import Namespace
+import signal
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import Any, Optional, Set
 
-from fastapi import APIRouter, FastAPI, Request
+from typing import Optional, Set
+
+import fastapi
+import uvicorn
+from fastapi import APIRouter, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -36,7 +38,6 @@
 from vllm.entrypoints.openai.serving_tokenization import (
     OpenAIServingTokenization)
 from vllm.logger import init_logger
-from vllm.server import serve_http
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
@@ -56,7 +57,7 @@
 
 
 @asynccontextmanager
-async def lifespan(app: FastAPI):
+async def lifespan(app: fastapi.FastAPI):
 
     async def _force_log():
         while True:
@@ -74,7 +75,7 @@ async def _force_log():
 router = APIRouter()
 
 
-def mount_metrics(app: FastAPI):
+def mount_metrics(app: fastapi.FastAPI):
     # Add prometheus asgi middleware to route /metrics requests
     metrics_route = Mount("/metrics", make_asgi_app())
     # Workaround for 307 Redirect for /metrics
@@ -164,8 +165,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
     return JSONResponse(content=generator.model_dump())
 
 
-def build_app(args: Namespace) -> FastAPI:
-    app = FastAPI(lifespan=lifespan)
+def build_app(args):
+    app = fastapi.FastAPI(lifespan=lifespan)
     app.include_router(router)
     app.root_path = args.root_path
 
@@ -213,8 +214,11 @@ async def authentication(request: Request, call_next):
     return app
 
 
-async def init_app(args: Namespace,
-                   llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI:
+async def build_server(
+    args,
+    llm_engine: Optional[AsyncLLMEngine] = None,
+    **uvicorn_kwargs,
+) -> uvicorn.Server:
     app = build_app(args)
 
     if args.served_model_name is not None:
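
As a rough usage sketch, not part of the diff: where callers previously awaited init_app(args) and got back a FastAPI app, they would now await build_server(args) and get a configured uvicorn.Server. Only build_server's signature comes from the diff above; the module paths (vllm.entrypoints.openai.api_server, vllm.entrypoints.openai.cli_args) are assumptions based on the imports visible in this file.

    # Hypothetical caller; import paths are assumed, not stated in the diff.
    import asyncio

    from vllm.entrypoints.openai.api_server import build_server
    from vllm.entrypoints.openai.cli_args import make_arg_parser
    from vllm.utils import FlexibleArgumentParser


    async def main() -> None:
        parser = make_arg_parser(FlexibleArgumentParser())
        args = parser.parse_args()
        server = await build_server(args)  # a uvicorn.Server, not a FastAPI app
        await server.serve()


    if __name__ == "__main__":
        asyncio.run(main())
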
@@ -277,17 +281,14 @@ async def init_app(args: Namespace,
     )
     app.root_path = args.root_path
 
-    return app
-
-
-async def run_server(args: Namespace,
-                     llm_engine: Optional[AsyncLLMEngine] = None,
-                     **uvicorn_kwargs: Any) -> None:
-    logger.info("vLLM API server version %s", VLLM_VERSION)
-    logger.info("args: %s", args)
+    logger.info("Available routes are:")
+    for route in app.routes:
+        if not hasattr(route, 'methods'):
+            continue
+        methods = ', '.join(route.methods)
+        logger.info("Route: %s, Methods: %s", route.path, methods)
 
-    app = await init_app(args, llm_engine)
-    await serve_http(
+    config = uvicorn.Config(
         app,
         host=args.host,
         port=args.port,
@@ -300,6 +301,36 @@ async def run_server(args: Namespace,
         **uvicorn_kwargs,
     )
 
+    return uvicorn.Server(config)
+
+
+async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None:
+    logger.info("vLLM API server version %s", VLLM_VERSION)
+    logger.info("args: %s", args)
+
+    server = await build_server(
+        args,
+        llm_engine,
+        **uvicorn_kwargs,
+    )
+
+    loop = asyncio.get_running_loop()
+
+    server_task = loop.create_task(server.serve())
+
+    def signal_handler() -> None:
+        # prevents the uvicorn signal handler from exiting early
+        server_task.cancel()
+
+    loop.add_signal_handler(signal.SIGINT, signal_handler)
+    loop.add_signal_handler(signal.SIGTERM, signal_handler)
+
+    try:
+        await server_task
+    except asyncio.CancelledError:
+        print("Gracefully stopping http server")
+        await server.shutdown()
+
 
 if __name__ == "__main__":
     # NOTE(simon):
@@ -308,5 +339,4 @@ async def run_server(args: Namespace,
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
     args = parser.parse_args()
-
     asyncio.run(run_server(args))
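
For readers who want the shutdown behaviour in isolation: the sketch below reproduces the pattern the new run_server adds — run uvicorn.Server.serve() as an asyncio task, install SIGINT/SIGTERM handlers that cancel that task, and call server.shutdown() on cancellation — against a toy FastAPI app. The app, host, and port are placeholders, not vLLM's; only the signal/cancellation structure is taken from the diff.

    import asyncio
    import signal

    import uvicorn
    from fastapi import FastAPI

    # Placeholder app standing in for the vLLM API server's FastAPI app.
    app = FastAPI()


    @app.get("/health")
    async def health():
        return {"status": "ok"}


    async def main() -> None:
        config = uvicorn.Config(app, host="127.0.0.1", port=8000)
        server = uvicorn.Server(config)

        loop = asyncio.get_running_loop()
        server_task = loop.create_task(server.serve())

        def signal_handler() -> None:
            # Cancel our own task so uvicorn's default signal handling
            # does not exit before our cleanup runs.
            server_task.cancel()

        loop.add_signal_handler(signal.SIGINT, signal_handler)
        loop.add_signal_handler(signal.SIGTERM, signal_handler)

        try:
            await server_task
        except asyncio.CancelledError:
            await server.shutdown()


    if __name__ == "__main__":
        asyncio.run(main())

Taking over the event loop's signal handlers, instead of relying on the handlers uvicorn sets up inside serve(), keeps shutdown under the caller's control, which appears to be the point of the change. Note that loop.add_signal_handler is only available on Unix event loops.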