Skip to content

Commit 1b35dc2

Browse files
committed
Fix docker cmdlines for v0.10.2_next workarounds
Signed-off-by: Neelesh Gokhale <[email protected]>
1 parent a2eff87 commit 1b35dc2

File tree

8 files changed

+49
-30
lines changed

8 files changed

+49
-30
lines changed

.cd/benchmark/benchmark_scenarios_text.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ qwen25_14b_instruct:
3333

3434
qwen25_32b_instruct:
3535
MODEL: Qwen/Qwen2.5-32B-Instruct
36+
CONCURRENT_REQ: 8
3637

3738
qwen25_72b_instruct:
3839
MODEL: Qwen/Qwen2.5-72B-Instruct

.cd/benchmark/benchmark_user.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
MODEL
22
INPUT_TOK
33
OUTPUT_TOK
4-
CON_REQ
4+
CONCURRENT_REQ
55
NUM_PROMPTS

.cd/server/server_output.env

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,21 @@ QUANT_DTYPE
99
BLOCK_SIZE
1010
VLLM_PROMPT_BS_BUCKET_MIN
1111
VLLM_PROMPT_BS_BUCKET_STEP
12+
VLLM_PROMPT_BS_BUCKET_MAX
1213
VLLM_DECODE_BS_BUCKET_MIN
1314
VLLM_DECODE_BS_BUCKET_STEP
1415
VLLM_PROMPT_SEQ_BUCKET_MIN
1516
VLLM_PROMPT_SEQ_BUCKET_STEP
1617
VLLM_DECODE_BLOCK_BUCKET_MIN
1718
VLLM_DECODE_BLOCK_BUCKET_STEP
18-
MAX_NUM_PREFILL_SEQS
1919
NUM_HIDDEN_LAYERS
2020
HIDDEN_SIZE
2121
NUM_KEY_VALUE_HEADS
2222
NUM_ATTENTION_HEADS
2323
CACHE_DTYPE_BYTES
2424
LIMIT_MODEL_LEN
2525
PT_HPU_LAZY_MODE
26-
VLLM_DELAYED_SAMPLING
2726
VLLM_SKIP_WARMUP
28-
EXPERIMENTAL_WEIGHT_SHARING
2927
VLLM_EXPONENTIAL_BUCKETING
3028
MAX_NUM_BATCHED_TOKENS
3129
PT_HPU_ENABLE_LAZY_COLLECTIVES

.cd/server/server_user.env

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ PT_HPU_LAZY_MODE
33
VLLM_DECODE_BLOCK_BUCKET_STEP
44
VLLM_DECODE_BS_BUCKET_STEP
55
VLLM_PROMPT_BS_BUCKET_STEP
6+
VLLM_PROMPT_BS_BUCKET_MAX
67
VLLM_PROMPT_SEQ_BUCKET_STEP
78
VLLM_SKIP_WARMUP
89
MAX_MODEL_LEN
@@ -11,3 +12,5 @@ TENSOR_PARALLEL_SIZE
1112
VLLM_EXPONENTIAL_BUCKETING
1213
GPU_MEM_UTILIZATION
1314
ASYNC_SCHEDULING
15+
ENABLE_PREFIX_CACHING
16+
EXTRA_ARGS

.cd/server/settings_vllm.csv

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
2-
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
3-
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
4-
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
5-
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
6-
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
7-
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
8-
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
9-
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
10-
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1
11-
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
12-
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
13-
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
14-
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
15-
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
16-
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
17-
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
18-
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
19-
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
1+
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,VLLM_PROMPT_BS_BUCKET_MAX,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_SKIP_WARMUP,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,ENABLE_PREFIX_CACHING,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
2+
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
3+
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
4+
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
5+
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,5,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
6+
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
7+
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
8+
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,FALSE,FALSE,2048,false,true,true,1,1
9+
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,9,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
10+
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,FALSE,FALSE,2048,false,true,true,1,1
11+
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,12,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
12+
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
13+
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,16,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,1
14+
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,FALSE,FALSE,2048,false,true,true,1,1
15+
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
16+
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
17+
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
18+
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,40133986304,2,2,37.37,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,FALSE,FALSE,2048,false,true,true,1,0
19+
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,false,false,false,1,0

.cd/server/vllm_autocalc_rules.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ def calc_PT_HPU_ENABLE_LAZY_COLLECTIVES(ctx):
1616
return ctx['TENSOR_PARALLEL_SIZE'] > 1
1717

1818

19+
def calc_VLLM_CONTIGUOUS_PA(ctx):
20+
return not ctx['ENABLE_PREFIX_CACHING']
21+
22+
23+
def calc_VLLM_DEFRAG(ctx):
24+
return bool(ctx['VLLM_CONTIGUOUS_PA'])
25+
26+
1927
def calc_MODEL_MEM_FROM_CONFIG(ctx):
2028
return float(ctx.get('MODEL_MEM_FROM_CONFIG'))
2129

@@ -93,13 +101,15 @@ def calc_NUM_DECODE_GRAPHS(ctx):
93101
def calc_PROMPT_BS_RAMP_GRAPHS(ctx):
94102
return 1 + int(
95103
math.log(
96-
min(ctx['MAX_NUM_PREFILL_SEQS'], ctx['VLLM_PROMPT_BS_BUCKET_STEP']) / ctx['VLLM_PROMPT_BS_BUCKET_MIN'], 2))
104+
min(ctx['VLLM_PROMPT_BS_BUCKET_MAX'], ctx['VLLM_PROMPT_BS_BUCKET_STEP']) / ctx['VLLM_PROMPT_BS_BUCKET_MIN'],
105+
2))
97106

98107

99108
def calc_PROMPT_BS_STEP_GRAPHS(ctx):
100109
return max(
101110
0,
102-
int(1 + (ctx['MAX_NUM_PREFILL_SEQS'] - ctx['VLLM_PROMPT_BS_BUCKET_STEP']) / ctx['VLLM_PROMPT_BS_BUCKET_STEP']))
111+
int(1 +
112+
(ctx['VLLM_PROMPT_BS_BUCKET_MAX'] - ctx['VLLM_PROMPT_BS_BUCKET_STEP']) / ctx['VLLM_PROMPT_BS_BUCKET_STEP']))
103113

104114

105115
def calc_PROMPT_SEQ_RAMP_GRAPHS(ctx):
@@ -155,10 +165,11 @@ def calc_MAX_NUM_SEQS(ctx):
155165
return max(1, ctx['MAX_NUM_SEQS'])
156166
# Otherwise, calculate
157167
val = (ctx['TENSOR_PARALLEL_SIZE'] * ctx['KV_CACHE_MEM'] / ctx['KV_CACHE_PER_SEQ'])
158-
if ctx['DTYPE'] == 'fp8':
159-
val = (max(1, math.floor(val / ctx['VLLM_DECODE_BS_BUCKET_STEP'])) * ctx['VLLM_DECODE_BS_BUCKET_STEP'])
168+
# always round down for plugin as WA
169+
if val < ctx['VLLM_DECODE_BS_BUCKET_STEP']:
170+
val = pow(2, math.floor(math.log(val, 2)))
160171
else:
161-
val = (math.ceil(val / ctx['VLLM_DECODE_BS_BUCKET_STEP']) * ctx['VLLM_DECODE_BS_BUCKET_STEP'])
172+
val = max(1, math.floor(val / ctx['VLLM_DECODE_BS_BUCKET_STEP'])) * ctx['VLLM_DECODE_BS_BUCKET_STEP']
162173
# Special limit for Vision-Instruct models
163174
if ctx['MODEL'] in ['meta-llama/Llama-3.2-11B-Vision-Instruct', 'meta-llama/Llama-3.2-90B-Vision-Instruct'
164175
] and val > 128:
@@ -184,6 +195,8 @@ def calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx):
184195
"TENSOR_PARALLEL_SIZE": calc_TENSOR_PARALLEL_SIZE,
185196
"MAX_MODEL_LEN": calc_MAX_MODEL_LEN,
186197
"PT_HPU_ENABLE_LAZY_COLLECTIVES": calc_PT_HPU_ENABLE_LAZY_COLLECTIVES,
198+
"VLLM_CONTIGUOUS_PA": calc_VLLM_CONTIGUOUS_PA,
199+
"VLLM_DEFRAG": calc_VLLM_DEFRAG,
187200
"MODEL_MEM_FROM_CONFIG": calc_MODEL_MEM_FROM_CONFIG,
188201
"DEVICE_HPU_MEM": calc_DEVICE_HPU_MEM,
189202
"TOTAL_GPU_MEM": calc_TOTAL_GPU_MEM,

.cd/templates/template_vllm_server.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
#@VARS
44

5+
if [ "$VLLM_CONTIGUOUS_PA" == "True" ]; then # Checks if using contiguous pa
6+
EXTRA_ARGS+=" --no-enable-prefix-caching"
7+
fi
8+
59
if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling
610
EXTRA_ARGS+=" --async_scheduling"
711
fi

.cd/tests/test_vllm_autocalc_rules.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,13 @@ def test_calc_NUM_DECODE_GRAPHS(cpa):
125125

126126

127127
def test_calc_PROMPT_BS_RAMP_GRAPHS():
128-
ctx = {'MAX_NUM_PREFILL_SEQS': 16, 'VLLM_PROMPT_BS_BUCKET_STEP': 8, 'VLLM_PROMPT_BS_BUCKET_MIN': 2}
128+
ctx = {'VLLM_PROMPT_BS_BUCKET_MAX': 16, 'VLLM_PROMPT_BS_BUCKET_STEP': 8, 'VLLM_PROMPT_BS_BUCKET_MIN': 2}
129129
expected = 1 + int(math.log(min(16, 8) / 2, 2))
130130
assert rules.calc_PROMPT_BS_RAMP_GRAPHS(ctx) == expected
131131

132132

133133
def test_calc_PROMPT_BS_STEP_GRAPHS():
134-
ctx = {'MAX_NUM_PREFILL_SEQS': 32, 'VLLM_PROMPT_BS_BUCKET_STEP': 8}
134+
ctx = {'VLLM_PROMPT_BS_BUCKET_MAX': 32, 'VLLM_PROMPT_BS_BUCKET_STEP': 8}
135135
expected = max(0, int(1 + (32 - 8) / 8))
136136
assert rules.calc_PROMPT_BS_STEP_GRAPHS(ctx) == expected
137137

0 commit comments

Comments
 (0)