Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,23 @@ def nmigs(handles):

@pytest.fixture
def mig_handles(nmigs):
    """Collect the MIG device handles that can be obtained on this system.

    For every GPU enumerated through NVML, MIG indices ``0`` to ``nmigs - 1``
    are queried. GPUs whose handle lookup raises ``nvml.NoPermissionError``
    and MIG indices whose lookup raises ``nvml.NotFoundError`` are skipped,
    so the returned list may be empty.

    Returns
    -------
    list
        The MIG device handles that were successfully retrieved.
    """
    handles = []
    # Keep every NVML query inside the initializer context.
    with NVMLInitializer():
        dev_count = nvml.device_get_count_v2()

        for dev_idx in range(dev_count):
            try:
                dev = nvml.device_get_handle_by_index_v2(dev_idx)
            except nvml.NoPermissionError:
                # This GPU cannot be opened by the current user; skip it.
                continue
            for mig_idx in range(nmigs):
                try:
                    mig = nvml.device_get_mig_device_handle_by_index(dev, mig_idx)
                except nvml.NotFoundError:
                    # Not all MIG devices may be available
                    continue
                else:
                    handles.append(mig)
    return handles


Expand Down
6 changes: 6 additions & 0 deletions cuda_bindings/tests/nvml/test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import os

import pytest

import cuda.bindings.driver as cuda
from cuda.bindings import nvml

Expand Down Expand Up @@ -56,6 +58,10 @@ def test_cuda_device_order():
cuda_devices = get_cuda_device_names()
nvml_devices = get_nvml_device_names()

if any("Thor" in device["name"] for device in nvml_devices):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you know if this could also affect "Orin"?

pytest.skip("Skipping test on Thor, which has non-standard device naming")
return

if "CUDA_VISIBLE_DEVICES" not in os.environ:
# If that environment variable isn't set, the device lists should match exactly
assert cuda_devices == nvml_devices, "CUDA and NVML device lists do not match"
Expand Down
2 changes: 1 addition & 1 deletion cuda_bindings/tests/nvml/test_pynvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def test_device_get_memory_info(ngpus, handles):

def test_device_get_utilization_rates(ngpus, handles):
for i in range(ngpus):
with unsupported_before(handles[i], "FERMI"):
with unsupported_before(handles[i], None):
urate = nvml.device_get_utilization_rates(handles[i])
assert urate.gpu >= 0
assert urate.memory >= 0
Expand Down
19 changes: 18 additions & 1 deletion cuda_core/cuda/core/system/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,15 @@ cdef class Device:
-------
cuda.core.Device
The corresponding CUDA device.

Raises
------
RuntimeError
No corresponding CUDA device is found for this NVML device.

For example, on a MIG system, the physical GPU will not have an
available CUDA device, since it can not be used directly, even
though it can be enumerated from NVML.
"""
from cuda.core import Device as CudaDevice

Expand Down Expand Up @@ -890,8 +899,16 @@ cdef class Device:
def pci_info(self) -> PciInfo:
    """
    :obj:`~_device.PciInfo` object with the PCI attributes of this device.

    Non-physical devices, such as MIG devices, may not have PCI attributes.
    In that case, this property will raise a `RuntimeError`.

    Raises
    ------
    RuntimeError
        This device does not have PCI attributes.
    """
    try:
        pci_info = nvml.device_get_pci_info_ext(self._handle)
    except nvml.InvalidArgumentError:
        # Translate NVML's invalid-argument error into the documented
        # RuntimeError; ``from None`` keeps the NVML exception out of the
        # traceback.
        raise RuntimeError("This device does not have PCI attributes") from None
    else:
        return PciInfo(pci_info, self._handle)

##########################################################################
# PERFORMANCE
Expand Down
6 changes: 5 additions & 1 deletion cuda_core/cuda/core/system/_mig.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,8 @@ cdef class MigInfo:
A list of all MIG devices corresponding to this GPU.
"""
for i in range(self.device_count):
yield self.get_device_by_index(i)
try:
yield self.get_device_by_index(i)
except nvml.NotFoundError:
# Not all MIG devices may be available
continue
21 changes: 21 additions & 0 deletions cuda_core/docs/source/release/1.0.1-notes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: Apache-2.0
.. currentmodule:: cuda.core

``cuda.core`` 1.0.1 Release Notes
=================================


Fixes and enhancements
----------------------

- When iterating over MIG devices with
  ``cuda.core.system.Device.mig.get_all_devices``, only available MIG devices will
  be returned. Previously, if any MIG device was unavailable, an exception would
  be raised. (`#2065 <https://github.com/NVIDIA/cuda-python/issues/2065>`__)
- When converting an NVML device (``cuda.core.system.Device``) to a CUDA device
  (``cuda.core.Device``) using ``cuda.core.system.Device.to_cuda_device``, a
  ``RuntimeError`` is now raised if the device does not have a matching CUDA
  device. This may happen, for example, for physical devices on a MIG system.
  Previously, a ``cuda.core.system.InvalidArgumentError`` would be raised.
7 changes: 6 additions & 1 deletion cuda_core/tests/system/test_system_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@ def test_to_cuda_device():
from cuda.core import Device as CudaDevice

for device in system.Device.get_all_devices():
cuda_device = device.to_cuda_device()
try:
cuda_device = device.to_cuda_device()
except RuntimeError:
# Not all physical NVML devices may have a matching CUDA device
# when MIG is involved.
continue

assert isinstance(cuda_device, CudaDevice)
assert cuda_device.uuid == device.uuid_without_prefix
Expand Down
6 changes: 0 additions & 6 deletions cuda_core/tests/system/test_system_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,6 @@ def test_kernel_mode_driver_version():
assert 0 <= ver_patch[0] <= 99


def test_num_devices():
    """Compare ``system.get_num_devices()`` with the CUDA runtime device count."""
    reported = system.get_num_devices()
    runtime_count = handle_return(runtime.cudaGetDeviceCount())
    assert reported == runtime_count, "Number of devices does not match expected value"
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just removed this test altogether since it isn't universally true and finding a way to write all of the exceptions seemed a little pointless.



def test_devices():
devices = Device.get_all_devices()
expected_num_devices = handle_return(runtime.cudaGetDeviceCount())
Expand Down
6 changes: 5 additions & 1 deletion cuda_core/tests/test_device.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import contextlib

try:
from cuda.bindings import driver, runtime
except ImportError:
Expand Down Expand Up @@ -45,7 +47,9 @@ def test_to_system_device(deinit_cuda):

# CUDA only returns a 2-byte PCI bus ID domain, whereas NVML returns a
# 4-byte domain
assert device.pci_bus_id == system_device.pci_info.bus_id[4:]
# MIG devices don't have pci_info, so skip the bus ID check if it's missing
with contextlib.suppress(RuntimeError):
assert device.pci_bus_id == system_device.pci_info.bus_id[4:]


def test_device_set_current(deinit_cuda):
Expand Down
Loading