[V1][Core] Autotune encoder cache budget #11895
@@ -1,8 +1,7 @@
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple
 
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.utils import cdiv
 from vllm.v1.request import Request
 
 if TYPE_CHECKING:
@@ -56,10 +55,10 @@ def get_freed_ids(self) -> List[Tuple[str, int]]:
         return freed
 
 
-def compute_encoder_cache_budget(
+def compute_encoder_budget(
     model_config: "ModelConfig",
     scheduler_config: "SchedulerConfig",
-) -> int:
+) -> tuple[int, int]:
     """Compute the encoder cache budget based on the model and scheduler
     configurations.
 
@@ -68,26 +67,28 @@ def compute_encoder_cache_budget(
         scheduler_config: Scheduler configuration.
 
     Returns:
-        The encoder cache budget, in unit of number of tokens
-        in the input sequence.
+        - Compute budget for encoder execution, in unit of number of tokens
+          in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens
+          in the input sequence.
     """
 
-    encoder_cache_budget = 0
-
     if not model_config.is_multimodal_model:
-        return encoder_cache_budget
+        return 0, 0
 
     # TODO: handle encoder-decoder models once we support them.
-    encoder_cache_budget, _, _ = compute_encoder_cache_budget_multimodal(
-        model_config, scheduler_config)
+    (
+        encoder_compute_budget,
+        encoder_cache_size,
+    ) = _compute_encoder_budget_multimodal(model_config, scheduler_config)
 
-    return encoder_cache_budget
+    return encoder_compute_budget, encoder_cache_size
 
 
-def compute_encoder_cache_budget_multimodal(
+def _compute_encoder_budget_multimodal(
     model_config: "ModelConfig",
     scheduler_config: "SchedulerConfig",
-) -> tuple[int, Optional[str], int]:
+) -> tuple[int, int]:
     """Compute the encoder cache budget based on the model and scheduler
     configurations for a multimodal model.
 

Review discussion on `_compute_encoder_budget_multimodal`:
- QQ: Why do we need this separate function?
- Eventually we might need to use …
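Since `compute_encoder_budget` now returns a pair instead of a single integer, call sites have to unpack two values. Below is a minimal sketch of the new contract, assuming the helper lives in vLLM's v1 core `encoder_cache_manager` module (the file path is not visible in this excerpt) and that the caller already holds the usual `ModelConfig`/`SchedulerConfig` objects; everything except `compute_encoder_budget()` itself is illustrative.

```python
# Hypothetical call site; the import path is assumed, and the wrapper
# function name is made up for illustration.
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget


def init_encoder_budgets(model_config, scheduler_config) -> tuple[int, int]:
    # The first value bounds how many encoder input tokens may be executed
    # per scheduling step; the second bounds how many encoder output tokens
    # may be kept resident in the encoder cache.
    encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
        model_config, scheduler_config)

    # Text-only models short-circuit to (0, 0), so the encoder cache does
    # not need to be set up at all.
    return encoder_compute_budget, encoder_cache_size
```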
@@ -96,14 +97,12 @@ def compute_encoder_cache_budget_multimodal(
         scheduler_config: Scheduler configuration.
 
     Returns:
-        - The encoder cache budget, in unit of number of tokens in the
-          input sequence.
-        - The modality of the multimodal item that requires the most tokens.
-        - The number of multimodal items used to compute the encoder cache
-          budget.
+        - Compute budget for encoder execution, in unit of number of tokens
+          in the input sequence.
+        - Space budget for encoder cache size, in unit of number of tokens
+          in the input sequence.
     """
 
-    encoder_cache_budget = 0
     max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_nonzero_modality(  # noqa: E501
         model_config)
 
@@ -112,47 +111,14 @@ def compute_encoder_cache_budget_multimodal(
             "All non-text modalities supported by the model have been "
             "explicitly disabled via limit_mm_per_prompt. Encoder cache will "
             "not be initialized.")
-        return encoder_cache_budget, None, 0
-
-    modality, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
-                                           key=lambda item: item[1])
-
-    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
-    max_num_reqs = scheduler_config.max_num_seqs
-
-    # The biggest possible multimodal item cannot be fully prefilled in a
-    # batch, so every batch can partially prefill at most one of such item.
-    if max_tokens_per_mm_item > max_num_batched_tokens:
-        num_items = 1
-
-    # A batch can fully cover multiple biggest possible multimodal items, and
-    # one that will be partially prefilled.
-    else:
-        num_items = cdiv(max_num_batched_tokens, max_tokens_per_mm_item)
-
-    # NOTE: We need the encoder cache to be able to compute & hold ONE
-    # ADDITIONAL multimodal item, and is required only when:
-    # - Two requests in the current batch share the same prefix with such item
-    #   as part of the prefix.
-    # - AND the prefix length is divisible by the block size, triggering the
-    #   recomputation of the last block.
-    # - AND the part of the embeddings of the item is in this last block.
-
-    # This issue can be fundamentally resolved by supporting num_new_tokens=0
-    # on the model runner.
-    num_items += 1
-
-    # Number of items needed cannot be bigger than max number of running
-    # requests * max number of multimodal items per request.
-    max_mm_items_per_req = max(
-        MULTIMODAL_REGISTRY.get_mm_limits_per_prompt(model_config).values())
-
-    num_items = min(num_items, max_num_reqs * max_mm_items_per_req)
-    encoder_cache_budget = num_items * max_tokens_per_mm_item
-
-    logger.info(
-        "Encoder cache will be initialized with a budget of %s tokens,"
-        " and profiled with %s %s items of the maximum feature size.",
-        encoder_cache_budget, num_items, modality)
-
-    return encoder_cache_budget, modality, num_items
+        return 0, 0
+
+    _, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
+                                    key=lambda item: item[1])
+
+    encoder_compute_budget = max(scheduler_config.max_num_encoder_input_tokens,
+                                 max_tokens_per_mm_item)
+    encoder_cache_size = max(scheduler_config.encoder_cache_size,
+                             max_tokens_per_mm_item)
+
+    return encoder_compute_budget, encoder_cache_size
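To make the new autotuning rule concrete, here is a small self-contained sketch that mirrors the replacement logic above with made-up numbers. The helper name and the literal values are illustrative only; the two `max()` expressions are taken directly from the diff.

```python
def autotune_encoder_budgets(max_num_encoder_input_tokens: int,
                             encoder_cache_size: int,
                             max_tokens_per_mm_item: int) -> tuple[int, int]:
    # Both budgets are raised to at least the size of the largest single
    # multimodal item, so one such item can always be encoded and cached
    # in full.
    compute_budget = max(max_num_encoder_input_tokens, max_tokens_per_mm_item)
    cache_size = max(encoder_cache_size, max_tokens_per_mm_item)
    return compute_budget, cache_size


# Illustrative values only (not vLLM defaults): the largest image item needs
# 9000 encoder tokens, which exceeds both configured budgets.
print(autotune_encoder_budgets(8192, 4096, 9000))  # -> (9000, 9000)

# When the configured budgets already cover the largest item, they are kept.
print(autotune_encoder_budgets(8192, 4096, 2048))  # -> (8192, 4096)
```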