Commit 85c7aaf

Remove unused swap_space parameter
References: #27984
1 parent: f7d2946
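
For downstream code, the migration is mechanical: stop passing swap_space to LLM(...) (and to vllm_kwargs in integrations). A minimal before/after sketch, assuming the public vllm.LLM API; the model name and values are illustrative, and whether a leftover swap_space argument now raises a TypeError or is rejected some other way depends on the engine's argument validation:

from vllm import LLM

# Before this commit, swap_space was accepted but had no effect:
#   llm = LLM(model="facebook/opt-125m", swap_space=4)
# After it, pass only arguments that still take effect:
llm = LLM(
    model="facebook/opt-125m",
    gpu_memory_utilization=0.9,
)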

File tree

16 files changed: +5 -52 lines

docs/serving/integrations/llamaindex.md

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ llm = Vllm(
     model="microsoft/Orca-2-7b",
     tensor_parallel_size=4,
     max_new_tokens=100,
-    vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
+    vllm_kwargs={"gpu_memory_utilization": 0.5},
 )
 ```


tests/conftest.py

Lines changed: 0 additions & 2 deletions
@@ -749,7 +749,6 @@ def __init__(
         tensor_parallel_size: int = 1,
         block_size: int = 16 if not torch.xpu.is_available() else 64,
         enable_chunked_prefill: bool | None = False,
-        swap_space: int = 4,
         enforce_eager: bool | None = False,
         # Set this to avoid hanging issue
         default_torch_num_threads: int | None = None,
@@ -778,7 +777,6 @@ def __init__(
             trust_remote_code=trust_remote_code,
             dtype=dtype,
             seed=seed,
-            swap_space=swap_space,
             enforce_eager=enforce_eager,
             disable_log_stats=disable_log_stats,
             tensor_parallel_size=tensor_parallel_size,

tests/distributed/test_torchrun_example.py

Lines changed: 1 addition & 2 deletions
@@ -22,15 +22,14 @@

 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
     model="facebook/opt-125m",
     tensor_parallel_size=2,
     pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
     distributed_executor_backend="external_launcher",
     gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
     seed=0,
 )
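
The randomized gpu_memory_utilization is the point of this test: under the external_launcher backend each rank builds its own engine arguments, so the engine has to reconcile per-rank KV-cache capacities into one shared configuration. A conceptual sketch of that agreement step (illustrative only, not vLLM's actual reduction code):

def agree_on_kv_blocks(per_rank_blocks: list[int]) -> int:
    # Each rank proposes how many KV-cache blocks fit in its own budget;
    # adopting the group minimum guarantees the shared config fits everywhere.
    return min(per_rank_blocks)

# Ranks drawing utilization from uniform(0.7, 0.9) propose different counts:
assert agree_on_kv_blocks([812, 905, 770, 841]) == 770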

tests/distributed/test_torchrun_example_moe.py

Lines changed: 1 addition & 2 deletions
@@ -28,7 +28,7 @@

 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

-# set different `gpu_memory_utilization` and `swap_space` for different ranks,
+# set different `gpu_memory_utilization` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
 llm = LLM(
     model="microsoft/Phi-mini-MoE-instruct",
@@ -37,7 +37,6 @@
     enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
     distributed_executor_backend="external_launcher",
     gpu_memory_utilization=random.uniform(0.7, 0.9),
-    swap_space=random.randint(1, 4),
     seed=0,
 )

tests/lora/test_worker.py

Lines changed: 0 additions & 1 deletion
@@ -52,7 +52,6 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
         device_config=DeviceConfig("cuda"),
         cache_config=CacheConfig(
             block_size=16,
-            swap_space=0,
             cache_dtype="auto",
         ),
         lora_config=LoRAConfig(

tests/v1/attention/utils.py

Lines changed: 0 additions & 1 deletion
@@ -172,7 +172,6 @@ def create_vllm_config(
     cache_config = CacheConfig(
         block_size=block_size,
         cache_dtype="auto",
-        swap_space=0,
     )
     # Set cache blocks for testing
     # (these may be set during initialization normally)

tests/v1/core/test_scheduler.py

Lines changed: 0 additions & 1 deletion
@@ -1425,7 +1425,6 @@ def create_scheduler_with_priority(
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
-        swap_space=0,
         cache_dtype="auto",
         **kwargs_cache,
     )

tests/v1/core/utils.py

Lines changed: 0 additions & 1 deletion
@@ -89,7 +89,6 @@ def create_scheduler(
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
-        swap_space=0,
         cache_dtype="auto",
         **kwargs_cache,
     )

tests/v1/kv_connector/unit/utils.py

Lines changed: 0 additions & 1 deletion
@@ -105,7 +105,6 @@ def create_vllm_config(
     cache_config = CacheConfig(
         block_size=block_size,
         gpu_memory_utilization=0.9,
-        swap_space=0,
         cache_dtype="auto",
         enable_prefix_caching=True,
     )

tests/v1/tpu/worker/test_tpu_model_runner.py

Lines changed: 0 additions & 1 deletion
@@ -39,7 +39,6 @@ def get_vllm_config():
     cache_config = CacheConfig(
         block_size=16,
         gpu_memory_utilization=0.9,
-        swap_space=0,
         cache_dtype="auto",
     )
     vllm_config = VllmConfig(
