36 commits
96dc2f0
feat: limit thinking tokens
llsj14 Jul 12, 2025
5273b03
remove comment
llsj14 Jul 12, 2025
64c5848
update states only in update_state method
llsj14 Jul 14, 2025
43b494f
make precommit and lint
llsj14 Jul 14, 2025
8849eb5
revert change of deepseek reasoning parser
llsj14 Jul 15, 2025
13d75c9
support think start/end as token sequences
llsj14 Jul 16, 2025
9440408
refactor and change logic faster
llsj14 Jul 17, 2025
e9a8198
rename parameter and logit processor
llsj14 Jul 18, 2025
ceb241f
add reasoning effort param
llsj14 Jul 18, 2025
cf2f127
remove constraint of the reasoning model
llsj14 Jul 18, 2025
e21d935
update logit processor
llsj14 Jul 19, 2025
f4470c0
pass ruff
llsj14 Jul 19, 2025
83510b4
pass precommit
llsj14 Jul 19, 2025
e552fb6
fix format
llsj14 Jul 19, 2025
373e10f
fix: loads none error
llsj14 Jul 21, 2025
a832beb
fix return type
llsj14 Jul 21, 2025
0f58220
fix error
llsj14 Jul 21, 2025
d09361f
update ReasoningConfig handling
llsj14 Jul 21, 2025
f30e218
fix config and EngineArgs
llsj14 Jul 21, 2025
ee3a650
simplify reasoning config checks and fix errors
llsj14 Jul 22, 2025
459bb17
fix type of think token ids
llsj14 Jul 22, 2025
49a9959
refactor ThinkingTokenBudgetLogitsProcessor
llsj14 Jul 27, 2025
a098139
fix import error from rebase
llsj14 Jul 27, 2025
63f9667
fix: remove duplicate reasoning_effort field in ChatCompletionRequest
llsj14 Aug 16, 2025
0fd30b9
fix runtime error after rebase
llsj14 Aug 17, 2025
fff1161
check reasoning is enabled
llsj14 Aug 18, 2025
428622b
add test and implement processor with incremental token processing op…
llsj14 Aug 19, 2025
857540b
remove connection between reasoning_effort and thinking_token_budget
llsj14 Aug 20, 2025
dbd0a79
fix: support corner cases
llsj14 Aug 23, 2025
c1b91b0
cleanup unused parameters
llsj14 Aug 23, 2025
1769198
optimize speed up performance while apply logit processor
llsj14 Aug 23, 2025
737ac04
utilize logits processor when it is needed, not every step for speed up
llsj14 Sep 4, 2025
e069345
refactor processor
llsj14 Sep 5, 2025
b596d23
add comment on state
llsj14 Sep 17, 2025
0e222cb
fix tokenizer init bug
llsj14 Sep 17, 2025
90975b8
make precommit
llsj14 Sep 17, 2025
181 changes: 165 additions & 16 deletions tests/v1/logits_processors/test_correctness.py
@@ -20,13 +20,10 @@
from vllm.sampling_params import SamplingParams
from vllm.utils import is_pin_memory_available
# yapf: disable
from vllm.v1.sample.logits_processor import (BatchUpdate, BatchUpdateBuilder,
LogitBiasLogitsProcessor,
LogitsProcessor,
MinPLogitsProcessor,
MinTokensLogitsProcessor,
MoveDirectionality,
build_logitsprocs)
from vllm.v1.sample.logits_processor import (
BatchUpdate, BatchUpdateBuilder, LogitBiasLogitsProcessor, LogitsProcessor,
MinPLogitsProcessor, MinTokensLogitsProcessor, MoveDirectionality,
ThinkingTokenBudgetLogitsProcessor, build_logitsprocs)
# yapf: enable
from vllm.v1.sample.metadata import SamplingMetadata

@@ -43,6 +40,11 @@
REQS_PER_LOGITPROC = 50
STR_NO_LOGITPROC = "none"

# ThinkingTokenBudgetLogitsProcessor testing constants
THINKING_TOKEN_BUDGET = 5
THINK_START_TOKEN_ID = 999
THINK_END_TOKEN_ID = 998

# LogitsProcessor subclass or "none"
LogitprocType = Union[type[LogitsProcessor], str]

@@ -62,10 +64,24 @@ def __init__(self, workload_index: int, logitproc_type: LogitprocType):
self.workload_index = workload_index
self.logitproc_type = logitproc_type
# Number of output tokens is randomly 0 or twice the min-tokens
# threshold which will be used in testing. Output token values
# don't matter *for these tests* so use 0 as a dummy value
self.out_tokens = ([0] *
(MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2)))
# threshold which will be used in testing.
# Generate diverse random tokens for all processors (more realistic)
num_tokens = MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2)
if num_tokens > 0:
# Use diverse random tokens
self.out_tokens = [
random.randint(1, 950) for _ in range(num_tokens)
]
# Set first token for ThinkingTokenBudget testing
is_thinking_processor = (logitproc_type
is ThinkingTokenBudgetLogitsProcessor or
(hasattr(logitproc_type, '__name__')
and logitproc_type.__name__
== 'ThinkingTokenBudgetLogitsProcessor'))
if is_thinking_processor:
self.out_tokens[0] = THINK_START_TOKEN_ID
else:
self.out_tokens = []
self.prompt_tokens = []
self.params = _sampling_params_from_logitproc(logitproc_type)

@@ -75,6 +91,15 @@ def __str__(self):
return f"MyClass({summ})"


class MockReasoningConfig:
"""Mock reasoning config for testing ThinkingTokenBudgetLogitsProcessor."""
think_start_token_ids = [THINK_START_TOKEN_ID]
think_end_token_ids = [THINK_END_TOKEN_ID]

def is_thinking_enabled(self) -> bool:
return True


def _generate_fake_sampling_metadata(
num_output_tokens: int,
batch_size: int,
@@ -92,8 +117,12 @@ def _generate_fake_sampling_metadata(
vocab_size,
size=np.random.randint(
1, MAX_NUM_PROMPT_TOKENS)).tolist())

vllm_config = VllmConfig()
vllm_config.reasoning_config = MockReasoningConfig()

logitsprocs = build_logitsprocs(
vllm_config=VllmConfig(),
vllm_config=vllm_config,
device=device,
is_pin_memory=PIN_MEMORY_AVAILABLE,
is_pooling_model=False,
@@ -368,6 +397,115 @@ def _min_tokens_validate(
step_idx=step_idx)


def _thinking_budget_params(kwargs: dict) -> None:
"""Set SamplingParams kwargs for thinking token budget tests"""
kwargs["thinking_token_budget"] = THINKING_TOKEN_BUDGET


def _thinking_budget_validate(
test_fakes: LogitsprocsTestFakes,
persistent_batch: list[LogitsProcsRequestParams],
logits_new: torch.Tensor,
batch_index: int,
request_params: LogitsProcsRequestParams,
step_idx: int,
) -> None:
"""Validate thinking token budget processor behavior"""
# Get the ThinkingTokenBudgetLogitsProcessor instance
tb_processor: ThinkingTokenBudgetLogitsProcessor = next(
test_fakes.get_logitsprocs_by_cls(ThinkingTokenBudgetLogitsProcessor))

# Get current request state
state = tb_processor._state.get(batch_index)
params = request_params.params

# Validate thinking token budget configuration
if hasattr(params,
'thinking_token_budget') and params.thinking_token_budget:
# State should exist for requests with thinking_token_budget
if state is None:
_raise_error_invalid(msg_suffix=(
f"Expected state for batch {batch_index} "
f"with thinking_token_budget={params.thinking_token_budget}"),
batch_index=batch_index,
request_params=request_params,
step_idx=step_idx)

# Validate budget matches what was set
expected_budget = params.thinking_token_budget
actual_budget = state["thinking_token_budget"]

if actual_budget != expected_budget:
_raise_error_invalid(
msg_suffix=(f"Budget mismatch: expected {expected_budget}, "
f"got {actual_budget}"),
batch_index=batch_index,
request_params=request_params,
step_idx=step_idx)

# Check if we're in thinking mode and validate token counting
output_tokens = request_params.out_tokens

# Find if thinking has started in output tokens
thinking_started = False
start_tokens = tb_processor.think_start_token_ids

if len(start_tokens) > 0:
for i in range(len(output_tokens) - len(start_tokens) + 1):
if output_tokens[i:i + len(start_tokens)] == start_tokens:
thinking_started = True
break

if thinking_started:
# If budget is exceeded, validate end token forcing
think_count = state["think_count"]
budget = state["thinking_token_budget"]

if think_count >= budget:
if not state["in_end"]:
_raise_error_invalid(
msg_suffix=(f"Budget exceeded ({think_count} >= "
f"{budget}) but not "
"forcing end tokens"),
batch_index=batch_index,
request_params=request_params,
step_idx=step_idx)

# Validate that only end tokens are allowed
end_tokens = tb_processor.think_end_token_ids
if len(end_tokens) > 0:
expected_end_token_id = end_tokens[min(
state["end_count"],
len(end_tokens) - 1)]

# Check logits masking
batch_logits = logits_new[batch_index]
for token_id in range(len(batch_logits)):
logit_value = batch_logits[token_id]

if token_id == expected_end_token_id:
# End token should not be masked
if logit_value == -float("inf"):
_raise_error_invalid(
msg_suffix=(
f"End token {token_id} should not be "
"masked but is"),
batch_index=batch_index,
request_params=request_params,
step_idx=step_idx)
else:
# All other tokens should be masked when forcing end
if logit_value != -float("inf"):
_raise_error_invalid(
msg_suffix=(
f"Token {token_id} should be masked "
f"when forcing end tokens, but "
f"logit={logit_value}"),
batch_index=batch_index,
request_params=request_params,
step_idx=step_idx)


def _none_validate(
test_fakes: LogitsprocsTestFakes,
persistent_batch: list[LogitsProcsRequestParams],
@@ -413,16 +551,27 @@ class LogitsprocTestHelpers(NamedTuple):
MinTokensLogitsProcessor:
LogitsprocTestHelpers(gen_request_fxn=_min_tokens_params,
eval_fxn=_min_tokens_validate),
ThinkingTokenBudgetLogitsProcessor:
LogitsprocTestHelpers(gen_request_fxn=_thinking_budget_params,
eval_fxn=_thinking_budget_validate),
}


def _get_test_cases() -> list[list[str]]:
"""Each test case is a set of logitsprocs"""
logitsprocs_types = list(logitsprocs_test_mapping.keys())
return [[STR_NO_LOGITPROC]] + [[logitproc_type, STR_NO_LOGITPROC]
for logitproc_type in logitsprocs_types
if logitproc_type != STR_NO_LOGITPROC
] + [logitsprocs_types]

# Isolate ThinkingTokenBudgetLogitsProcessor from all other processors
# to avoid unexpected modification of logits interference
thinking_processor = ThinkingTokenBudgetLogitsProcessor
other_processors = [
p for p in logitsprocs_types
if p != STR_NO_LOGITPROC and p != thinking_processor
]

return ([[STR_NO_LOGITPROC]] + [[logitproc_type, STR_NO_LOGITPROC]
for logitproc_type in other_processors] +
[other_processors] + [[thinking_processor]])


def _generate_fake_step_update(
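The validator above decides whether a request has entered thinking mode by scanning the generated tokens for the configured start sequence. A minimal standalone sketch of that scan, reusing this test file's dummy THINK_START_TOKEN_ID (999); real deployments would use the model tokenizer's actual think-start token IDs:

def thinking_started(output_tokens: list[int], start_tokens: list[int]) -> bool:
    """Return True once the full think-start sequence appears in the output."""
    if not start_tokens:
        return False
    for i in range(len(output_tokens) - len(start_tokens) + 1):
        if output_tokens[i:i + len(start_tokens)] == start_tokens:
            return True
    return False

assert thinking_started([999, 12, 34], [999])       # dummy test token IDs
assert not thinking_started([12, 34], [999])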
25 changes: 25 additions & 0 deletions vllm/config/__init__.py
@@ -2422,6 +2422,29 @@ def _parse_collect_detailed_traces(self):
self.collect_detailed_traces[0].split(","))


@config
@dataclass
class ReasoningConfig:
"""Configuration for reasoning models."""

think_start_str: Optional[str] = None
"""String that indicates the start of reasoning."""
think_end_str: Optional[str] = None
"""String that indicates the end of reasoning."""
think_start_token_ids: Optional[list[int]] = None
"""Token ID that indicates the start of reasoning."""
think_end_token_ids: Optional[list[int]] = None
"""Token ID that indicates the end of reasoning."""

def is_thinking_enabled(self) -> bool:
"""Check if both start and end thinking token IDs
are set to enable thinking token budget logic."""
return (self.think_start_token_ids is not None
and self.think_end_token_ids is not None
and len(self.think_start_token_ids) > 0
and len(self.think_end_token_ids) > 0)


@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
@@ -2473,6 +2496,8 @@ class VllmConfig:
"""The configurations for distributed KV cache transfer."""
kv_events_config: Optional[KVEventsConfig] = None
"""The configurations for event publishing."""
reasoning_config: ReasoningConfig = field(default_factory=ReasoningConfig)
"""The configurations for reasoning model."""
# some opaque config, only used to provide additional information
# for the hash computation, mainly used for testing, debugging or out of
# tree config registration.
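ReasoningConfig only enables the budget logic when both token-ID lists are present and non-empty, as is_thinking_enabled() shows. A minimal usage sketch; the token IDs below are illustrative placeholders, not values from any particular model:

from vllm.config import ReasoningConfig

cfg = ReasoningConfig(think_start_token_ids=[151667],   # placeholder IDs
                      think_end_token_ids=[151668])
assert cfg.is_thinking_enabled()

# A partially specified config leaves thinking-budget handling disabled.
assert not ReasoningConfig(think_start_token_ids=[151667]).is_thinking_enabled()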
12 changes: 9 additions & 3 deletions vllm/engine/arg_utils.py
@@ -30,9 +30,9 @@
LoRAConfig, MambaDType, MMEncoderTPMode, ModelConfig,
ModelDType, ModelImpl, ObservabilityConfig,
ParallelConfig, PoolerConfig, PrefixCachingHashAlgo,
RunnerOption, SchedulerConfig, SchedulerPolicy,
SpeculativeConfig, TaskOption, TokenizerMode,
VllmConfig, get_attr_docs)
ReasoningConfig, RunnerOption, SchedulerConfig,
SchedulerPolicy, SpeculativeConfig, TaskOption,
TokenizerMode, VllmConfig, get_attr_docs)
from vllm.config.multimodal import MMCacheType, MultiModalConfig
from vllm.config.parallel import ExpertPlacementStrategy
from vllm.config.utils import get_field
@@ -449,6 +449,9 @@ class EngineArgs:
kv_transfer_config: Optional[KVTransferConfig] = None
kv_events_config: Optional[KVEventsConfig] = None

reasoning_config: ReasoningConfig = get_field(VllmConfig,
"reasoning_config")

generation_config: str = ModelConfig.generation_config
enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
override_generation_config: dict[str, Any] = \
@@ -932,6 +935,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
**vllm_kwargs["kv_events_config"])
vllm_group.add_argument("--compilation-config", "-O",
**vllm_kwargs["compilation_config"])
vllm_group.add_argument("--reasoning-config",
**vllm_kwargs["reasoning_config"])
vllm_group.add_argument("--additional-config",
**vllm_kwargs["additional_config"])

@@ -1452,6 +1457,7 @@ def create_engine_config(
compilation_config=self.compilation_config,
kv_transfer_config=self.kv_transfer_config,
kv_events_config=self.kv_events_config,
reasoning_config=self.reasoning_config,
additional_config=self.additional_config,
)

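With the new reasoning_config field on EngineArgs, the think-token IDs can be supplied at engine construction and are forwarded to VllmConfig by create_engine_config(). A rough sketch; the model name and token IDs are placeholders:

from vllm.config import ReasoningConfig
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="org/reasoning-model",                      # placeholder model name
    reasoning_config=ReasoningConfig(
        think_start_token_ids=[151667],               # placeholder token IDs
        think_end_token_ids=[151668],
    ),
)
# create_engine_config() passes this through as vllm_config.reasoning_config.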
2 changes: 2 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -434,6 +434,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
ChatCompletionNamedToolChoiceParam,
]] = "none"
reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
thinking_token_budget: Optional[int] = None
include_reasoning: bool = True

# NOTE this will be ignored by vLLM -- the model determines the behavior
Expand Down Expand Up @@ -731,6 +732,7 @@ def to_sampling_params(
guided_decoding=guided_decoding,
logit_bias=self.logit_bias,
bad_words= self.bad_words,
thinking_token_budget=self.thinking_token_budget,
allowed_token_ids=self.allowed_token_ids,
extra_args=extra_args or None,
)
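Because thinking_token_budget is a top-level ChatCompletionRequest field, an OpenAI-compatible client can set it per request via the extra_body pass-through. A hedged example against a locally running vLLM server; the URL, model name, and 256-token cap are illustrative:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="org/reasoning-model",                      # placeholder
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    extra_body={"thinking_token_budget": 256},        # cap reasoning tokens
)
print(resp.choices[0].message.content)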
6 changes: 6 additions & 0 deletions vllm/sampling_params.py
@@ -217,6 +217,9 @@ class SamplingParams(
generated token can complete the sequence."""
_bad_words_token_ids: Optional[list[list[int]]] = None

thinking_token_budget: Optional[int] = None
"""Maximum number of tokens allowed for thinking operations."""

@staticmethod
def from_optional(
n: Optional[int] = 1,
@@ -232,6 +235,7 @@ def from_optional(
stop: Optional[Union[str, list[str]]] = None,
stop_token_ids: Optional[list[int]] = None,
bad_words: Optional[list[str]] = None,
thinking_token_budget: Optional[int] = None,
include_stop_str_in_output: bool = False,
ignore_eos: bool = False,
max_tokens: Optional[int] = 16,
Expand Down Expand Up @@ -276,6 +280,7 @@ def from_optional(
stop=stop,
stop_token_ids=stop_token_ids,
bad_words=bad_words,
thinking_token_budget=thinking_token_budget,
include_stop_str_in_output=include_stop_str_in_output,
ignore_eos=ignore_eos,
max_tokens=max_tokens,
@@ -549,6 +554,7 @@ def __repr__(self) -> str:
f"stop={self.stop}, "
f"stop_token_ids={self.stop_token_ids}, "
f"bad_words={self.bad_words}, "
f"thinking_token_budget={self.thinking_token_budget}, "
f"include_stop_str_in_output={self.include_stop_str_in_output}, "
f"ignore_eos={self.ignore_eos}, "
f"max_tokens={self.max_tokens}, "
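For offline inference the budget rides on SamplingParams directly; once the budget is exhausted, the processor forces the think-end sequence so normal generation resumes (see the test validation above). A minimal sketch; the 128-token cap is arbitrary:

from vllm import SamplingParams

params = SamplingParams(max_tokens=512, thinking_token_budget=128)
print(params)  # repr now includes thinking_token_budget=128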
10 changes: 5 additions & 5 deletions vllm/v1/sample/logits_processor/__init__.py
@@ -13,10 +13,9 @@
from vllm.logger import init_logger
from vllm.logits_process import LogitsProcessor as RequestLogitsProcessor
from vllm.sampling_params import SamplingParams
from vllm.v1.sample.logits_processor.builtin import (LogitBiasLogitsProcessor,
MinPLogitsProcessor,
MinTokensLogitsProcessor,
process_dict_updates)
from vllm.v1.sample.logits_processor.builtin import (
LogitBiasLogitsProcessor, MinPLogitsProcessor, MinTokensLogitsProcessor,
ThinkingTokenBudgetLogitsProcessor, process_dict_updates)
from vllm.v1.sample.logits_processor.interface import (BatchUpdate,
LogitsProcessor,
MoveDirectionality)
@@ -39,6 +38,7 @@
MinTokensLogitsProcessor,
LogitBiasLogitsProcessor,
MinPLogitsProcessor,
ThinkingTokenBudgetLogitsProcessor,
]


@@ -290,5 +290,5 @@ def apply(self, logits: torch.Tensor) -> torch.Tensor:
"MinTokensLogitsProcessor", "BatchUpdate", "BatchUpdateBuilder",
"MoveDirectionality", "LogitsProcessors", "build_logitsprocs",
"STR_POOLING_REJECTS_LOGITSPROCS", "LOGITSPROCS_GROUP",
"AdapterLogitsProcessor"
"AdapterLogitsProcessor", "ThinkingTokenBudgetLogitsProcessor"
]