Commit 660dea1

[cuda][misc] remove error_on_invalid_device_count_status (vllm-project#7069)
1 parent: cf2a1a4

File tree

3 files changed: +3 -32 lines


vllm/executor/multiproc_gpu_executor.py

Lines changed: 0 additions & 3 deletions
@@ -17,7 +17,6 @@
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async,
                         update_environment_variables)
@@ -79,8 +78,6 @@ def _init_executor(self) -> None:
             f"please ensure that world_size ({world_size}) "
             f"is less than than max local gpu count ({cuda_device_count})")
 
-        error_on_invalid_device_count_status()
-
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.

vllm/executor/ray_gpu_executor.py

Lines changed: 3 additions & 6 deletions
@@ -10,10 +10,9 @@
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (_run_task_with_lock,
-                        error_on_invalid_device_count_status,
-                        get_distributed_init_method, get_ip, get_open_port,
-                        get_vllm_instance_id, make_async)
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, get_vllm_instance_id,
+                        make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -216,8 +215,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-        error_on_invalid_device_count_status()
-
         # Initialize the actual workers inside worker wrapper.
         init_worker_all_kwargs = [
             self._get_worker_kwargs(

vllm/utils.py

Lines changed: 0 additions & 23 deletions
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import contextlib
 import datetime
 import enum
 import gc
@@ -923,28 +922,6 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
-def error_on_invalid_device_count_status():
-    cache_entries = 0
-    with contextlib.suppress(Exception):
-        # future pytorch will fix the issue, device_count will not be cached
-        # at that time, `.cache_info().currsize` will error out
-        cache_entries = torch.cuda.device_count.cache_info(  # type: ignore
-        ).currsize
-    if cache_entries != 0:
-        # the function is already called, and the result is cached
-        remembered = torch.cuda.device_count()
-        current = cuda_device_count_stateless()
-        if remembered > current:
-            raise RuntimeError(
-                "The number of CUDA devices has changed since the first "
-                "call to torch.cuda.device_count(). This is not allowed "
-                "and may result in undefined behavior. Please check out "
-                "https://github.com/vllm-project/vllm/issues/6056 to "
-                "find the first call to torch.cuda.device_count() "
-                "and defer it until the engine is up. Or you can set "
-                "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
-
-
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.
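For context on what the deleted helper guarded against: on the PyTorch versions the code targets, torch.cuda.device_count() caches its first result (the .cache_info() probe above only exists on an lru_cache-wrapped function), while cuda_device_count_stateless() re-reads CUDA_VISIBLE_DEVICES on every call. The torch-free sketch below reproduces that mismatch under those assumptions; cached_device_count, stateless_device_count, and the pretend 8-GPU host are illustrative stand-ins, not vLLM or PyTorch APIs.

# Minimal sketch of the stale-count condition the removed helper detected.
import functools
import os


def _visible_devices() -> int:
    # Count devices allowed by CUDA_VISIBLE_DEVICES (pretend 8 physical GPUs).
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    return 8 if visible is None else len([d for d in visible.split(",") if d])


@functools.lru_cache(maxsize=None)
def cached_device_count() -> int:
    # The first call fixes the answer for the life of the process, like
    # torch.cuda.device_count on the affected PyTorch versions.
    return _visible_devices()


def stateless_device_count() -> int:
    # Re-reads the environment on every call, like
    # cuda_device_count_stateless in vllm/utils.py.
    return _visible_devices()


if __name__ == "__main__":
    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
    print(cached_device_count())     # 8 -- result is now cached
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    print(cached_device_count())     # still 8: stale, ignores the env change
    print(stateless_device_count())  # 1: reflects the new environment
    # remembered (8) > current (1) is exactly the mismatch that made the
    # removed error_on_invalid_device_count_status() raise RuntimeError.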
