Commit d5d214a

khluu and EC2 Default User authored

[1/n][CI] Load models in CI from S3 instead of HF (vllm-project#13205)

Signed-off-by: <>
Co-authored-by: EC2 Default User <[email protected]>

1 parent fd84857 commit d5d214a

43 files changed: +225 -76 lines (large commit; only some of the changed files are shown below)
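The change follows one pattern throughout: tests that previously downloaded weights from Hugging Face now point at mirrored copies in the s3://vllm-ci-model-weights bucket and stream them with vLLM's RunAI Model Streamer (the runai-model-streamer packages added to requirements-test). A minimal sketch of the resulting call pattern — the model name and prompt are illustrative, and it assumes the CI runner has read access to that bucket:

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

# Stream the mirrored checkpoint directly from S3 instead of pulling it
# from the Hugging Face Hub (assumes s3://vllm-ci-model-weights is readable).
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
          load_format=LoadFormat.RUNAI_STREAMER)
outputs = llm.generate("How are you?",
                       SamplingParams(temperature=0, max_tokens=10))
print(outputs[0].outputs[0].text)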

requirements-test.in (+2)

@@ -37,3 +37,5 @@ genai_perf==0.0.8
 tritonclient==2.51.0

 numpy < 2.0.0
+runai-model-streamer==0.11.0
+runai-model-streamer-s3==0.11.0

requirements-test.txt (+8)

@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
     #   tokenizers
     #   transformers
     #   vocos
+humanize==4.11.0
+    # via runai-model-streamer
 idna==3.10
     # via
     #   anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
     #   patsy
     #   peft
     #   rouge-score
+    #   runai-model-streamer
     #   sacrebleu
     #   scikit-learn
     #   scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
     #   referencing
 rsa==4.7.2
     # via awscli
+runai-model-streamer==0.11.0
+    # via -r requirements-test.in
+runai-model-streamer-s3==0.11.0
+    # via -r requirements-test.in
 s3transfer==0.10.3
     # via
     #   awscli
@@ -594,6 +601,7 @@ torch==2.5.1
     #   encodec
     #   lm-eval
     #   peft
+    #   runai-model-streamer
     #   sentence-transformers
     #   tensorizer
     #   timm

tests/basic_correctness/test_basic_correctness.py (+10 -9)

@@ -9,6 +9,7 @@
 import pytest

 from vllm import LLM
+from vllm.config import LoadFormat
 from vllm.platforms import current_platform

 from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):

 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("facebook/opt-125m")
+    llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
 @pytest.mark.parametrize(
     "model, distributed_executor_backend, attention_backend, "
     "test_suite", [
-        ("facebook/opt-125m", "ray", "", "L4"),
-        ("facebook/opt-125m", "mp", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
-        ("facebook/opt-125m", "ray", "", "A100"),
-        ("facebook/opt-125m", "mp", "", "A100"),
-        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
+        ("distilbert/distilgpt2", "ray", "", "L4"),
+        ("distilbert/distilgpt2", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("distilbert/distilgpt2", "ray", "", "A100"),
+        ("distilbert/distilgpt2", "mp", "", "A100"),
+        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
     ])
 def test_models_distributed(
     hf_runner,

tests/basic_correctness/test_cumem.py (+10 -3)

@@ -4,9 +4,11 @@
 import torch

 from vllm import LLM, SamplingParams
+from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test


@@ -118,13 +120,18 @@ def model(x):
 @pytest.mark.parametrize(
     "model",
     [
-        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
-        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
+        # sleep mode with safetensors
+        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
+        # sleep mode with pytorch checkpoint
+        "facebook/opt-125m"
     ])
 def test_end_to_end(model):
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    llm = LLM(model, enable_sleep_mode=True)
+    load_format = LoadFormat.AUTO
+    if "Llama" in model:
+        load_format = LoadFormat.RUNAI_STREAMER
+    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)

tests/basic_correctness/test_preemption.py (+1 -1)

@@ -17,7 +17,7 @@
 from ..models.utils import check_outputs_equal

 MODELS = [
-    "facebook/opt-125m",
+    "distilbert/distilgpt2",
 ]


tests/conftest.py (+24 -1)

@@ -24,7 +24,7 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, TokenizerPoolConfig
+from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -46,6 +46,21 @@
 _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

 _M = TypeVar("_M")
+
+MODELS_ON_S3 = [
+    "distilbert/distilgpt2",
+    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Meta-Llama-3-8B",
+    "meta-llama/Llama-3.2-1B",
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "openai-community/gpt2",
+    "ArthurZ/Ilama-3.2-1B",
+    "llava-hf/llava-1.5-7b-hf",
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
+
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]

 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -677,8 +692,15 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
+        load_format: Optional[LoadFormat] = None,
         **kwargs,
     ) -> None:
+        if model_name in MODELS_ON_S3 and not load_format:
+            model_name = (f"s3://vllm-ci-model-weights/"
+                          f"{model_name.split('/')[-1]}")
+            load_format = LoadFormat.RUNAI_STREAMER
+        if not load_format:
+            load_format = LoadFormat.AUTO
         self.model = LLM(
             model=model_name,
             task=task,
@@ -693,6 +715,7 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
+            load_format=load_format,
             **kwargs,
         )
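
The conftest.py change above is what lets most tests stay untouched: when VllmRunner is handed a Hugging Face model name that appears in MODELS_ON_S3 and no explicit load_format, it rewrites the name to the bucket copy and switches to the streamer. A standalone sketch of that rewrite under the same assumptions (the helper name resolve_model is only for illustration):

from typing import Optional, Tuple

from vllm.config import LoadFormat

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = {"distilbert/distilgpt2", "meta-llama/Llama-3.2-1B-Instruct"}


def resolve_model(model_name: str,
                  load_format: Optional[LoadFormat] = None
                  ) -> Tuple[str, LoadFormat]:
    # Mirrors the branch added to VllmRunner.__init__: known models are
    # redirected to their S3 copy and loaded with the RunAI streamer;
    # everything else keeps the default AUTO load format.
    if model_name in MODELS_ON_S3 and not load_format:
        model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name.split('/')[-1]}"
        load_format = LoadFormat.RUNAI_STREAMER
    return model_name, load_format or LoadFormat.AUTO


# "meta-llama/Llama-3.2-1B-Instruct" ->
#   ("s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct", RUNAI_STREAMER)
print(resolve_model("meta-llama/Llama-3.2-1B-Instruct"))

Models that are not in MODELS_ON_S3 (for example the PyTorch-checkpoint facebook/opt-125m kept in test_cumem.py) fall through unchanged and keep LoadFormat.AUTO, and passing an explicit load_format bypasses the rewrite entirely.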

tests/engine/test_computed_prefix_blocks.py (+5 -1)

@@ -2,12 +2,15 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
     # This test checks if we are able to run the engine to completion
@@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
               "decoration.")

     engine_args = EngineArgs(model=model,
+                             load_format=LoadFormat.RUNAI_STREAMER,
                              block_size=block_size,
                              enable_prefix_caching=True)


tests/engine/test_detokenization.py (+5 -2)

@@ -2,11 +2,14 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
     # This test checks if the engine generates completions both with and
     # without optional detokenization, that detokenization includes text
@@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str):
         "paper clips? Is there an easy to follow video tutorial available "
         "online for free?")

-    llm = LLM(model=model)
+    llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
     sampling_params = SamplingParams(max_tokens=10,
                                      temperature=0.0,
                                      detokenize=False)

tests/engine/test_executor.py (+13 -4)

@@ -6,12 +6,17 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET
+
+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
+

 class Mock:
     ...
@@ -33,10 +38,11 @@ def collective_rpc(self,
 CustomUniExecutorAsync = CustomUniExecutor


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor_type_checking(model):
     with pytest.raises(ValueError):
         engine_args = EngineArgs(model=model,
+                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                  distributed_executor_backend=Mock)
         LLMEngine.from_engine_args(engine_args)
     with pytest.raises(ValueError):
@@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model):
         AsyncLLMEngine.from_engine_args(engine_args)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path):

     engine_args = EngineArgs(
         model=model,
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutor,
         enforce_eager=True,  # reduce test time
     )
@@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path):
     os.chdir(cwd)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path):

     engine_args = AsyncEngineArgs(
         model=model,
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutorAsync,
         enforce_eager=True,  # reduce test time
     )
@@ -95,7 +103,7 @@ async def t():
     os.chdir(cwd)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_respect_ray(model):
     # even for TP=1 and PP=1,
     # if users specify ray, we should use ray.
@@ -104,6 +112,7 @@ def test_respect_ray(model):
     engine_args = EngineArgs(
         model=model,
         distributed_executor_backend="ray",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         enforce_eager=True,  # reduce test time
     )
     engine = LLMEngine.from_engine_args(engine_args)

tests/engine/test_skip_tokenizer_init.py (+7 -2)

@@ -2,16 +2,21 @@

 import pytest

+from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams

+from ..conftest import MODEL_WEIGHTS_S3_BUCKET

-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+
+@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
     # This test checks if the flag skip_tokenizer_init skips the initialization
     # of tokenizer and detokenizer. The generated output is expected to contain
     # token ids.
-    llm = LLM(model=model, skip_tokenizer_init=True)
+    llm = LLM(model=model,
+              skip_tokenizer_init=True,
+              load_format=LoadFormat.RUNAI_STREAMER)
     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

     with pytest.raises(ValueError, match="cannot pass text prompts when"):

tests/engine/test_stop_reason.py (+1 -1)

@@ -12,7 +12,7 @@

 from vllm import SamplingParams

-MODEL = "facebook/opt-350m"
+MODEL = "distilbert/distilgpt2"
 STOP_STR = "."
 SEED = 42
 MAX_TOKENS = 1024

tests/entrypoints/llm/test_chat.py (+10 -3)

@@ -5,12 +5,17 @@
 import pytest

 from vllm import LLM
+from vllm.config import LoadFormat

+from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS

+RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
+

 def test_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)

     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -28,7 +33,8 @@ def test_chat():


 def test_multi_chat():
-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
+              load_format=RUNAI_STREAMER_LOAD_FORMAT)

     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
@@ -65,7 +71,8 @@ def test_multi_chat():
                          [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
     llm = LLM(
-        model="microsoft/Phi-3.5-vision-instruct",
+        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
+        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         dtype="bfloat16",
         max_model_len=4096,
         max_num_seqs=5,

tests/entrypoints/llm/test_collective_rpc.py (+1 -1)

@@ -28,7 +28,7 @@ class MyWorker(Worker):
         def echo_rank(self):
             return self.rank

-    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+    llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              load_format="dummy",
              tensor_parallel_size=tp_size,
