Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ private void generateMdevXml(StringBuilder gpuBuilder) {
String mdevUuid = vgpuType.getBusAddress(); // For MDEV devices, busAddress contains the UUID
String displayAttribute = vgpuType.isDisplay() ? "on" : "off";

gpuBuilder.append("<hostdev mode='subsystem' type='mdev' managed='no' display='").append(displayAttribute).append("'>\n");
gpuBuilder.append("<hostdev mode='subsystem' type='mdev' model='vfio-pci' display='").append(displayAttribute).append("'>\n");
gpuBuilder.append(" <source>\n");
gpuBuilder.append(" <address uuid='").append(mdevUuid).append("'/>\n");
gpuBuilder.append(" </source>\n");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,9 @@ public void testGpuDef_withMdevDevice() {

String gpuXml = gpuDef.toString();

assertTrue(gpuXml.contains("<hostdev mode='subsystem' type='mdev' managed='no' display='off'>"));
assertTrue(gpuXml.contains("<hostdev mode='subsystem' type='mdev' model='vfio-pci' display='off'>"));
assertTrue(gpuXml.contains("<address uuid='4b20d080-1b54-4048-85b3-a6a62d165c01'/>"));
assertTrue(gpuXml.contains("</hostdev>"));
assertFalse(gpuXml.contains("vfio")); // MDEV should not contain vfio driver element
}

@Test
Expand Down
111 changes: 66 additions & 45 deletions scripts/vm/hypervisor/kvm/gpudiscovery.sh
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ for VM in "${VMS[@]}"; do
# -- MDEV hostdevs: use xmlstarlet to extract UUIDs --
while IFS= read -r UUID; do
[[ -n "$UUID" ]] && mdev_to_vm["$UUID"]="$VM"
done < <(echo "$xml" | xmlstarlet sel -T -t -m "//hostdev[@type='mdev']" -v "@uuid" -n 2>/dev/null || true)
done < <(echo "$xml" | xmlstarlet sel -T -t -m "//hostdev[@type='mdev']/source/address" -v "@uuid" -n 2>/dev/null || true)
done

# Helper: convert a VM name to JSON value (quoted string or null)
Expand Down Expand Up @@ -516,6 +516,55 @@ parse_and_add_gpu_properties() {
fi
}

#######################################
# Finds and formats mdev (vGPU) instances for a given PCI device (PF or VF).
# Appends one JSON object string per existing mdev instance to the global
# 'vlist' array.
# Globals:
#   vlist       (appended to)
#   mdev_to_vm  (read; looked up by mdev UUID, missing keys yield "")
#   MAX_INSTANCES, VIDEO_RAM, MAX_HEADS, MAX_RESOLUTION_X, MAX_RESOLUTION_Y
#               (read; presumably refreshed per profile by
#                parse_and_add_gpu_properties - confirm against that helper)
# Arguments:
#   $1: mdev_base_path (e.g., /sys/bus/pci/devices/.../mdev_supported_types)
#   $2: bdf (e.g., 01:00.0)
# Notes:
#   Does nothing when $1 is not a directory.
#######################################
process_mdev_instances() {
  local mdev_base_path="$1"
  local bdf="$2"

  [[ -d "$mdev_base_path" ]] || return 0

  # Loop variables are declared local so that calling this function never
  # overwrites same-named variables in the caller's (global) scope.
  local PROF_DIR UDIR
  local PROFILE_NAME DEVICE_DIR MDEV_UUID raw USED_JSON

  # The libvirt address depends only on the BDF, so compute it once.
  # The PCI domain is fixed to 0x0000; bdf is expected as "BB:SS.F".
  local DOMAIN="0x0000"
  local BUS="0x${bdf:0:2}"
  local SLOT="0x${bdf:3:2}"
  local FUNC="0x${bdf:6:1}"

  for PROF_DIR in "$mdev_base_path"/*; do
    # Also skips the literal pattern left behind by a non-matching glob.
    [[ -d "$PROF_DIR" ]] || continue

    # Prefer the human-readable name; fall back to the profile directory name.
    if [[ -f "$PROF_DIR/name" ]]; then
      PROFILE_NAME=$(<"$PROF_DIR/name")
    else
      PROFILE_NAME="${PROF_DIR##*/}"
    fi

    parse_and_add_gpu_properties "$PROF_DIR/description"

    # Existing instances show up as <profile>/devices/<uuid> directories.
    DEVICE_DIR="$PROF_DIR/devices"
    [[ -d "$DEVICE_DIR" ]] || continue

    for UDIR in "$DEVICE_DIR"/*; do
      [[ -d "$UDIR" ]] || continue
      MDEV_UUID="${UDIR##*/}"

      # Empty when no VM references this UUID; to_json_vm formats the value.
      raw="${mdev_to_vm[$MDEV_UUID]:-}"
      USED_JSON=$(to_json_vm "$raw")

      vlist+=(
        "{\"mdev_uuid\":\"$MDEV_UUID\",\"profile_name\":$(json_escape "$PROFILE_NAME"),\"max_instances\":$MAX_INSTANCES,\"video_ram\":$VIDEO_RAM,\"max_heads\":$MAX_HEADS,\"max_resolution_x\":$MAX_RESOLUTION_X,\"max_resolution_y\":$MAX_RESOLUTION_Y,\"libvirt_address\":{\"domain\":\"$DOMAIN\",\"bus\":\"$BUS\",\"slot\":\"$SLOT\",\"function\":\"$FUNC\"},\"used_by_vm\":$USED_JSON}")
    done
  done
}

# === GPU Discovery ===

mapfile -t LINES < <(lspci -nnm)
Expand Down Expand Up @@ -588,51 +637,9 @@ for LINE in "${LINES[@]}"; do
# === vGPU (MDEV) instances ===
VGPU_ARRAY="[]"
declare -a vlist=()
# Process mdev on the Physical Function
MDEV_BASE="/sys/bus/pci/devices/0000:$PCI_ADDR/mdev_supported_types"
if [[ -d "$MDEV_BASE" ]]; then
for PROF_DIR in "$MDEV_BASE"/*; do
[[ -d "$PROF_DIR" ]] || continue

# Read the human-readable profile name from the 'name' file
if [[ -f "$PROF_DIR/name" ]]; then
PROFILE_NAME=$(<"$PROF_DIR/name")
else
PROFILE_NAME=$(basename "$PROF_DIR")
fi

# Fetch max_instance from the description file, if present
parse_and_add_gpu_properties "$PROF_DIR/description"

# Under each profile, existing UUIDs appear in:
# /sys/bus/pci/devices/0000:$PCI_ADDR/mdev_supported_types/<PROFILE>/devices/*
DEVICE_DIR="$PROF_DIR/devices"
if [[ -d "$DEVICE_DIR" ]]; then
for UDIR in "$DEVICE_DIR"/*; do
[[ -d $UDIR ]] || continue
MDEV_UUID=$(basename "$UDIR")

# libvirt_address uses PF BDF
DOMAIN="0x0000"
BUS="0x${PCI_ADDR:0:2}"
SLOT="0x${PCI_ADDR:3:2}"
FUNC="0x${PCI_ADDR:6:1}"

# Determine which VM uses this UUID
raw="${mdev_to_vm[$MDEV_UUID]:-}"
USED_JSON=$(to_json_vm "$raw")

vlist+=(
"{\"mdev_uuid\":\"$MDEV_UUID\",\"profile_name\":$(json_escape "$PROFILE_NAME"),\"max_instances\":$MAX_INSTANCES,\"video_ram\":$VIDEO_RAM,\"max_heads\":$MAX_HEADS,\"max_resolution_x\":$MAX_RESOLUTION_X,\"max_resolution_y\":$MAX_RESOLUTION_Y,\"libvirt_address\":{\"domain\":\"$DOMAIN\",\"bus\":\"$BUS\",\"slot\":\"$SLOT\",\"function\":\"$FUNC\"},\"used_by_vm\":$USED_JSON}")
done
fi
done
if [ ${#vlist[@]} -gt 0 ]; then
VGPU_ARRAY="[$(
IFS=,
echo "${vlist[*]}"
)]"
fi
fi
process_mdev_instances "$MDEV_BASE" "$PCI_ADDR"

# === VF instances (SR-IOV / MIG) ===
VF_ARRAY="[]"
Expand All @@ -644,6 +651,12 @@ for LINE in "${LINES[@]}"; do
VF_ADDR=${VF_PATH##*/} # e.g. "0000:65:00.2"
VF_BDF="${VF_ADDR:5}" # "65:00.2"

# For NVIDIA SR-IOV, check for vGPU (mdev) on the VF itself
if [[ "$VENDOR_ID" == "10de" ]]; then
VF_MDEV_BASE="$VF_PATH/mdev_supported_types"
process_mdev_instances "$VF_MDEV_BASE" "$VF_BDF"
fi

DOMAIN="0x0000"
BUS="0x${VF_BDF:0:2}"
SLOT="0x${VF_BDF:3:2}"
Expand Down Expand Up @@ -674,6 +687,14 @@ for LINE in "${LINES[@]}"; do
fi
fi

# Consolidate all vGPU instances (from PF and VFs)
if [ ${#vlist[@]} -gt 0 ]; then
VGPU_ARRAY="[$(
IFS=,
echo "${vlist[*]}"
)]"
fi

# === full_passthrough block ===
# If vgpu_instances and vf_instances are empty, we can assume full passthrough
FP_ENABLED=0
Expand Down
15 changes: 11 additions & 4 deletions server/src/main/java/org/apache/cloudstack/gpu/GpuServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,6 @@ public GPUDeviceTO doInTransaction(TransactionStatus status) {
vgpuProfile.getName(), gpuDevice.getBusAddress(), gpuCard.getVendorId(),
gpuCard.getVendorName(), gpuCard.getDeviceId(), gpuCard.getDeviceName());
vgpuInfo.setDisplay(serviceOffering.getGpuDisplay());

if (gpuDevice.getParentGpuDeviceId() != null) {
GpuDeviceVO parentGpuDevice = gpuDeviceDao.findById(gpuDevice.getParentGpuDeviceId());
if (parentGpuDevice != null) {
Expand Down Expand Up @@ -891,14 +890,20 @@ public void addGpuDevicesToHost(final Host host, final List<VgpuTypesInfo> newGp
} else {
// Update the device's info
GpuDeviceVO parentGpuDevice = null;
if (existingDevice.getParentGpuDeviceId() == null
&& deviceInfo.getParentBusAddress() != null) {
if (deviceInfo.getParentBusAddress() != null) {
parentGpuDevice = gpuDeviceDao.findByHostIdAndBusAddress(host.getId(),
deviceInfo.getParentBusAddress());
if (parentGpuDevice != null) {
existingDevice.setParentGpuDeviceId(parentGpuDevice.getId());
parentGpuDevice.setType(GpuDevice.DeviceType.VGPUOnly);
gpuDeviceDao.persist(parentGpuDevice);
}
}
if (deviceInfo.isPassthroughEnabled()) {
existingDevice.setType(deviceInfo.getDeviceType());
} else {
existingDevice.setType(GpuDevice.DeviceType.VGPUOnly);
}
if (existingDevice.getPciRoot() == null) {
existingDevice.setPciRoot(deviceInfo.getPciRoot());
}
Expand All @@ -913,7 +918,6 @@ public void addGpuDevicesToHost(final Host host, final List<VgpuTypesInfo> newGp
for (final GpuDeviceVO device : gpuDevicesToDisableMap.values()) {
logger.info("Disabling GPU device {} on host {} due to missing address in the new devices on the host.", device, host);
device.setState(GpuDevice.State.Error);
device.setManagedState(GpuDevice.ManagedState.Unmanaged);
gpuDeviceDao.update(device.getId(), device);
checkAndUpdateParentGpuDeviceState(device.getParentGpuDeviceId());
}
Expand Down Expand Up @@ -1024,11 +1028,14 @@ private void createAndAddGpuDeviceToHost(VgpuTypesInfo deviceInfo, Host host, Gp
deviceInfo.getParentBusAddress());
if (parentGpuDevice != null) {
parentGpuDeviceId = parentGpuDevice.getId();
parentGpuDevice.setType(GpuDevice.DeviceType.VGPUOnly);
gpuDeviceDao.persist(parentGpuDevice);
}
}
GpuDeviceVO gpuDevice = new GpuDeviceVO(card.getId(), vgpuProfile.getId(), deviceInfo.getBusAddress(),
host.getId(), parentGpuDeviceId, deviceInfo.getNumaNode(), deviceInfo.getPciRoot());
gpuDevice.setHostId(host.getId());
gpuDevice.setType(deviceInfo.getDeviceType());
gpuDevice.setBusAddress(deviceInfo.getBusAddress());
gpuDevice.setCardId(card.getId());
setStateAndVmName(deviceInfo, gpuDevice, parentGpuDevice);
Expand Down
15 changes: 4 additions & 11 deletions ui/src/components/view/GPUSummaryTab.vue
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,7 @@ export default {
Object.values(cardGroups).forEach(cardGroup => {
const profileCount = Object.keys(cardGroup.profiles).length

// Filter devices for card summary calculation
// Exclude passthrough profile devices from aggregates if there are multiple profiles
let cardDevicesForSummary = cardGroup.devices
if (profileCount > 1) {
cardDevicesForSummary = cardGroup.devices.filter(device => !device.vgpuprofilename || device.vgpuprofilename.toLowerCase() !== 'passthrough'
)
}

const cardSummary = this.calculateSummary(cardDevicesForSummary)
const cardSummary = this.calculateSummary(cardGroup.devices)
const cardKey = `card-${cardGroup.gpucardname}`

const cardNode = {
Expand All @@ -192,7 +184,6 @@ export default {
expandedKeys.push(cardKey)

cardNode.children = Object.values(cardGroup.profiles)
.filter(profile => profile.vgpuprofilename.toLowerCase() !== 'passthrough')
.map(profile => {
const profileSummary = this.calculateSummary(profile.devices)
return {
Expand All @@ -204,7 +195,6 @@ export default {
}
})
}

summaryTree.push(cardNode)
})

Expand All @@ -222,6 +212,9 @@ export default {
}

devices.forEach(device => {
if (device.gpudevicetype === 'VGPUOnly') {
return
}
summary.total++

if (device.virtualmachineid) {
Expand Down
Loading