Commit bf7f470

afeldman-nm, njhill, abf149, and hmellor authored
[V1] Logits processors extensibility (#19912)
Signed-off-by: Andrew Feldman <[email protected]>
Signed-off-by: Andrew Feldman <[email protected]>
Signed-off-by: Harry Mellor <[email protected]>
Signed-off-by: Nick Hill <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
Co-authored-by: Andrew Feldman <[email protected]>
Co-authored-by: Harry Mellor <[email protected]>
1 parent 4fc722e commit bf7f470

File tree

22 files changed, +1313 −335 lines changed


.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 0 deletions
@@ -253,6 +253,7 @@ steps:
   - pytest -v -s v1/engine
   - pytest -v -s v1/entrypoints
   - pytest -v -s v1/sample
+  - pytest -v -s v1/logits_processors
   - pytest -v -s v1/worker
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
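
For local runs, the new suite can be invoked the same way the pipeline does (path assumed to be relative to the tests/ directory, matching the neighbouring v1 entries):

pytest -v -s v1/logits_processors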
New example file: offline inference with a custom logits processor

Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""This example demonstrates instantiating vLLM with a custom logits processor
class object.

For a basic example of implementing a custom logits processor, see
the `DummyLogitsProcessor` implementation in `vllm/test_utils.py`.

For testing purposes, a dummy logits processor is employed which, if
`target_token` is passed as a keyword argument to `SamplingParams.extra_args`,
will mask out all tokens except `target_token`.

A batch is constructed with `temperature=0.0` and 50% of requests specifying
`target_token`, and for these requests - and *only* these requests - we
expect the `target_token` to be decoded in each step, yielding an output
similar to that shown below:

Generated Outputs:
------------------------------------------------------------
Prompt: 'Hello, my name is'
Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
------------------------------------------------------------
Prompt: 'The president of the United States is'
Output: " not a racist. He is a racist.\nHe's a racist because he"
------------------------------------------------------------
Prompt: 'The capital of France is'
Output: ' also also also also also also also also also also also also also
also also also'
------------------------------------------------------------
Prompt: 'The future of AI is'
Output: ' in the hands of the people.\n\nThe future of AI is in the'
------------------------------------------------------------
"""

from typing import Optional

import torch

from vllm import LLM, SamplingParams
from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import (
    BatchUpdate,
    LogitsProcessor,
    MoveDirectionality,
)


# Hypothetical custom logits processor
class DummyLogitsProcessor(LogitsProcessor):
    """Fake logit processor to support unit testing and examples"""

    def __init__(
        self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
    ):
        # Maps persistent-batch row index -> target token id
        self.req_info: dict[int, int] = {}

    def is_argmax_invariant(self) -> bool:
        """Masking out tokens can change the greedy-sampling result,
        so this processor is not argmax invariant."""
        return False

    def update_state(self, batch_update: Optional[BatchUpdate]):
        if not batch_update:
            return

        # Process added requests.
        for index, params, _, _ in batch_update.added:
            assert params is not None
            if params.extra_args and (
                target_token := params.extra_args.get("target_token")
            ):
                self.req_info[index] = target_token

        if self.req_info:
            # Process removed requests.
            for index in batch_update.removed:
                self.req_info.pop(index, None)

            # Process moved requests, unidirectional move (a->b) and swap
            # (a<->b)
            for adx, bdx, direct in batch_update.moved:
                a_val = self.req_info.pop(adx, None)
                b_val = self.req_info.pop(bdx, None)
                if a_val is not None:
                    self.req_info[bdx] = a_val
                if direct == MoveDirectionality.SWAP and b_val is not None:
                    self.req_info[adx] = b_val

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        if not self.req_info:
            return logits

        # Save target values before modification
        rows_list = list(self.req_info.keys())
        cols = torch.tensor(
            [self.req_info[i] for i in rows_list],
            dtype=torch.long,
            device=logits.device,
        )
        rows = torch.tensor(rows_list, dtype=torch.long, device=logits.device)
        values_to_keep = logits[rows, cols].clone()

        # Mask all but target tokens
        logits[rows] = float("-inf")
        logits[rows, cols] = values_to_keep

        return logits


# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list = [
    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
    SamplingParams(temperature=0.0),
    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
    SamplingParams(temperature=0.0),
]


def main():
    # Create an LLM.
    llm = LLM(
        model="facebook/opt-125m",
        logits_processors=[DummyLogitsProcessor],
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params_list)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)


if __name__ == "__main__":
    main()
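
The example above masks logits, but the same interface (a constructor taking vllm_config, device, and is_pin_memory, plus is_argmax_invariant, update_state, and apply) supports other per-request behaviours. Below is an illustrative sketch only, not part of this commit: a processor that adds a bias to one token id per request. The extra_args keys `boost_token` and `boost_value`, the default bias of 5.0, and the class name are invented for this sketch.

from typing import Optional

import torch

from vllm.config import VllmConfig
from vllm.v1.sample.logits_processor import (BatchUpdate, LogitsProcessor,
                                             MoveDirectionality)


class TokenBoostLogitsProcessor(LogitsProcessor):
    """Adds a fixed bias to one token id per request (illustrative only)."""

    def __init__(self, vllm_config: VllmConfig, device: torch.device,
                 is_pin_memory: bool):
        # persistent-batch row index -> (token id, bias)
        self.req_info: dict[int, tuple[int, float]] = {}

    def is_argmax_invariant(self) -> bool:
        # Biasing logits can change the greedy-sampling result.
        return False

    def update_state(self, batch_update: Optional[BatchUpdate]):
        if not batch_update:
            return
        for index, params, _, _ in batch_update.added:
            args = (params.extra_args or {}) if params else {}
            if "boost_token" in args:
                self.req_info[index] = (args["boost_token"],
                                        args.get("boost_value", 5.0))
        for index in batch_update.removed:
            self.req_info.pop(index, None)
        for adx, bdx, direct in batch_update.moved:
            a_val = self.req_info.pop(adx, None)
            b_val = self.req_info.pop(bdx, None)
            if a_val is not None:
                self.req_info[bdx] = a_val
            if direct == MoveDirectionality.SWAP and b_val is not None:
                self.req_info[adx] = b_val

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        # Add the per-request bias in place; rows without an entry are untouched.
        for row, (token_id, bias) in self.req_info.items():
            logits[row, token_id] += bias
        return logits

Such a class would be registered the same way as the dummy processor above, e.g. LLM(model=..., logits_processors=[TokenBoostLogitsProcessor]).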

tests/utils.py

Lines changed: 66 additions & 13 deletions
@@ -13,6 +13,7 @@
 import time
 import warnings
 from contextlib import contextmanager, suppress
+from multiprocessing import Process
 from pathlib import Path
 from typing import Any, Callable, Literal, Optional, Union

@@ -76,6 +77,23 @@ def _nvml():
 class RemoteOpenAIServer:
     DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key

+    def _start_server(self, model: str, vllm_serve_args: list[str],
+                      env_dict: Optional[dict[str, str]]) -> None:
+        """Subclasses override this method to customize server process launch
+        """
+        env = os.environ.copy()
+        # the current process might initialize cuda,
+        # to be safe, we should use spawn method
+        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        if env_dict is not None:
+            env.update(env_dict)
+        self.proc: subprocess.Popen = subprocess.Popen(
+            ["vllm", "serve", model, *vllm_serve_args],
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+
     def __init__(self,
                  model: str,
                  vllm_serve_args: list[str],

@@ -128,18 +146,7 @@ def __init__(self,
         model_loader = get_model_loader(load_config)
         model_loader.download_model(model_config)

-        env = os.environ.copy()
-        # the current process might initialize cuda,
-        # to be safe, we should use spawn method
-        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
-        if env_dict is not None:
-            env.update(env_dict)
-        self.proc = subprocess.Popen(
-            ["vllm", "serve", model, *vllm_serve_args],
-            env=env,
-            stdout=sys.stdout,
-            stderr=sys.stderr,
-        )
+        self._start_server(model, vllm_serve_args, env_dict)
         max_wait_seconds = max_wait_seconds or 240
         self._wait_for_server(url=self.url_for("health"),
                               timeout=max_wait_seconds)

@@ -155,6 +162,10 @@ def __exit__(self, exc_type, exc_value, traceback):
             # force kill if needed
             self.proc.kill()

+    def _poll(self) -> Optional[int]:
+        """Subclasses override this method to customize process polling"""
+        return self.proc.poll()
+
     def _wait_for_server(self, *, url: str, timeout: float):
         # run health check
         start = time.time()

@@ -169,7 +180,7 @@ def _wait_for_server(self, *, url: str, timeout: float):
                 # which means the server is not ready yet.
                 # the stack trace is not useful, so we suppress it
                 # by using `raise from None`.
-                result = self.proc.poll()
+                result = self._poll()
                 if result is not None and result != 0:
                     raise RuntimeError("Server exited unexpectedly.") from None

@@ -205,6 +216,48 @@ def get_async_client(self, **kwargs):
                                  **kwargs)


+class RemoteOpenAIServerCustom(RemoteOpenAIServer):
+    """Launch test server with custom child process"""
+
+    def _start_server(self, model: str, vllm_serve_args: list[str],
+                      env_dict: Optional[dict[str, str]]) -> None:
+        self.proc: Process = Process(
+            target=self.child_process_fxn,
+            args=(env_dict, model,
+                  vllm_serve_args))  # type: ignore[assignment]
+        self.proc.start()
+
+    def __init__(self,
+                 model: str,
+                 vllm_serve_args: list[str],
+                 child_process_fxn: Callable[
+                     [Optional[dict[str, str]], str, list[str]], None],
+                 *,
+                 env_dict: Optional[dict[str, str]] = None,
+                 seed: Optional[int] = 0,
+                 auto_port: bool = True,
+                 max_wait_seconds: Optional[float] = None) -> None:
+        """Store custom child process function then invoke superclass
+        constructor which will indirectly launch it."""
+        self.child_process_fxn = child_process_fxn
+        super().__init__(model=model,
+                         vllm_serve_args=vllm_serve_args,
+                         env_dict=env_dict,
+                         seed=seed,
+                         auto_port=auto_port,
+                         max_wait_seconds=max_wait_seconds)
+
+    def _poll(self) -> Optional[int]:
+        return self.proc.exitcode
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.proc.terminate()
+        self.proc.join(8)
+        if self.proc.is_alive():
+            # force kill if needed
+            self.proc.kill()
+
+
 def _test_completion(
     client: openai.OpenAI,
     model: str,
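
For orientation, here is a hedged sketch of how a test might use the new RemoteOpenAIServerCustom hook. Only the class and its constructor signature come from the diff above; the child-process function `run_server_in_child`, the model choice, and the `--max-model-len` argument are illustrative, and exec-ing `vllm serve` in the child is just one way to satisfy the callback contract.

import os
from typing import Optional

import requests

from tests.utils import RemoteOpenAIServerCustom


def run_server_in_child(env_dict: Optional[dict[str, str]], model: str,
                        vllm_serve_args: list[str]) -> None:
    """Child-process entry point (hypothetical).

    Applies the extra environment and then replaces the child process with
    `vllm serve`, so that terminate() and exitcode act on the real server.
    """
    if env_dict:
        os.environ.update(env_dict)
    os.execvp("vllm", ["vllm", "serve", model, *vllm_serve_args])


def test_server_with_custom_child_process():
    with RemoteOpenAIServerCustom("facebook/opt-125m",
                                  ["--max-model-len", "1024"],
                                  run_server_in_child) as server:
        # Once the context manager returns, the base-class helpers
        # (e.g. url_for) work as usual.
        assert requests.get(server.url_for("health")).status_code == 200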

tests/v1/logits_processors/__init__.py

Whitespace-only changes.

tests/v1/sample/test_logits_processors.py renamed to tests/v1/logits_processors/test_correctness.py

Lines changed: 16 additions & 8 deletions
@@ -9,11 +9,13 @@
 import pytest
 import torch

+from tests.utils import create_new_process_for_each_test
 from tests.v1.sample.utils import (LogitsprocsTestFakes, create_fake_logits,
                                    create_penalty_tensor,
                                    create_prompt_tokens_tensor,
                                    fake_apply_logitsprocs,
                                    fake_update_logitsprocs_state)
+from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available

@@ -24,7 +26,7 @@
                                              MinPLogitsProcessor,
                                              MinTokensLogitsProcessor,
                                              MoveDirectionality,
-                                             init_builtin_logitsprocs)
+                                             build_logitsprocs)
 # yapf: enable
 from vllm.v1.sample.metadata import SamplingMetadata

@@ -53,6 +55,7 @@ class LogitsProcsRequestParams:
     workload_index: int
     logitproc_type: LogitprocType  # Logitproc enabled, specified by str id
     out_tokens: list[int]  # Output tokens required for min tokens test
+    prompt_tokens: list[int]  # Dummy prompt tokens placeholder
     params: SamplingParams  # Settings customized for logitproc

     def __init__(self, workload_index: int, logitproc_type: LogitprocType):

@@ -63,6 +66,7 @@ def __init__(self, workload_index: int, logitproc_type: LogitprocType):
         # don't matter *for these tests* so use 0 as a dummy value
         self.out_tokens = ([0] *
                            (MIN_TOKENS_LEN_THRESHOLD * random.randint(0, 2)))
+        self.prompt_tokens = []
         self.params = _sampling_params_from_logitproc(logitproc_type)

     def __str__(self):

@@ -88,11 +92,12 @@ def _generate_fake_sampling_metadata(
             vocab_size,
             size=np.random.randint(
                 1, MAX_NUM_PROMPT_TOKENS)).tolist())
-    logitsprocs = init_builtin_logitsprocs(
-        pin_memory_available=PIN_MEMORY_AVAILABLE,
-        max_num_reqs=MAX_NUM_REQS + 1,
-        device=device)
-
+    logitsprocs = build_logitsprocs(
+        vllm_config=VllmConfig(),
+        device=device,
+        is_pin_memory=PIN_MEMORY_AVAILABLE,
+        is_pooling_model=False,
+    )
     fake_sampling_metadata = SamplingMetadata(
         temperature=torch.full((batch_size, ), 0.0),
         all_greedy=True,

@@ -462,15 +467,17 @@ def _generate_fake_step_update(
         # Replace as many removed requests as possible with added requests
         add_remove_idx = batch_update_builder.pop_removed()
         batch_update_builder.added.append(
-            (add_remove_idx, add_req_params.params, add_req_params.out_tokens))
+            (add_remove_idx, add_req_params.params,
+             add_req_params.prompt_tokens, add_req_params.out_tokens))
         persistent_batch[add_remove_idx] = add_req_params

     # Append remaining added requests to end of batch
     add_reqs_append = workload_params[(wdx +
                                        num_step_add_replace):(wdx +
                                                               num_step_add)]
     batch_update_builder.added.extend([
-        (adx + batch_size, add_req_params.params, add_req_params.out_tokens)
+        (adx + batch_size, add_req_params.params, add_req_params.prompt_tokens,
+         add_req_params.out_tokens)
        for adx, add_req_params in enumerate(add_reqs_append)
     ])
     persistent_batch.extend(add_reqs_append)

@@ -561,6 +568,7 @@ def _assert_valid(
                          step_idx=step_idx)


+@create_new_process_for_each_test()
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC])
 @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases())
