Skip to content

Commit d3fdaaa

Browse files
committed
[rhel9] add support for nvlink5+ systems
Signed-off-by: Tariq Ibrahim <[email protected]>
1 parent 8578a1b commit d3fdaaa

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

rhel9/Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
7373
dnf install -y nvidia-fabric-manager-${DRIVER_VERSION}-1 libnvidia-nscq-${DRIVER_BRANCH}-${DRIVER_VERSION}-1; fi
7474

7575
RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$DRIVER_BRANCH" -ge "550" ]; then \
76-
dnf install -y nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION}-1; fi
76+
dnf install -y nvlsm infiniband-diags \
77+
nvidia-imex-${DRIVER_BRANCH}-${DRIVER_VERSION}-1; fi
7778

7879
COPY nvidia-driver /usr/local/bin
7980
COPY ocp_dtk_entrypoint /usr/local/bin

rhel9/nvidia-driver

+52-1
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,31 @@ _assert_nvswitch_system() {
268268
return 0
269269
}
270270

271+
# Detect an NVLink5+ system by scanning the VPD (Vital Product Data) file of
# every InfiniBand class device for the 'SW_MNG' marker.
#
# Arguments:
#   $1 - (optional) directory holding the InfiniBand class devices; defaults
#        to /sys/class/infiniband. Mainly useful for testing.
# Outputs:
#   Prints "Detected NVLink5+ system" to stdout on the first match.
# Returns:
#   0 if any <dir>/*/device/vpd file contains 'SW_MNG', 1 otherwise.
#
# The body runs in a subshell, so the variables below never leak to callers.
_assert_nvlink5_system() (
    ib_class_dir=${1:-/sys/class/infiniband}

    for dev_dir in "$ib_class_dir"/*/device; do
        vpd_file="$dev_dir/vpd"

        # The -f test also covers the "glob matched nothing" case, where
        # dev_dir holds the literal, non-existent pattern.
        # grep: -F matches the marker literally, -s silences read errors
        # (an unreadable VPD file simply counts as "no match").
        if [ -f "$vpd_file" ] && grep -qsF -- "SW_MNG" "$vpd_file"; then
            echo "Detected NVLink5+ system"
            return 0
        fi
    done

    return 1
)
287+
288+
# Wait until both the mlx5_core and ib_umad kernel modules are loaded.
#
# Arguments:
#   $1 - (optional) maximum time to wait, in whole seconds. 0 (the default)
#        waits indefinitely.
#   $2 - (optional) polling interval, in whole seconds (default: 10). Values
#        below 1 or non-numeric values fall back to 1.
# Outputs:
#   Prints a progress message to stdout before every sleep and an error to
#   stderr when the timeout expires.
# Returns:
#   0 once both modules are loaded, 1 if the timeout expired first.
#
# The body runs in a subshell, so the variables below never leak to callers.
_ensure_nvlink5_prerequisites() (
    max_wait=${1:-0}
    poll_interval=${2:-10}
    waited=0

    # A zero interval would never advance the elapsed-time counter.
    [ "$poll_interval" -ge 1 ] 2> /dev/null || poll_interval=1

    # Take a single lsmod snapshot per iteration and compare the module-name
    # column exactly, so that names that merely contain 'mlx5_core' or
    # 'ib_umad' are not mistaken for the required modules. awk consumes the
    # whole input, so lsmod is never killed by SIGPIPE.
    until lsmod | awk '
        $1 == "mlx5_core" { have_mlx5_core = 1 }
        $1 == "ib_umad"   { have_ib_umad = 1 }
        END { exit !(have_mlx5_core && have_ib_umad) }'
    do
        if [ "$max_wait" -gt 0 ] && [ "$waited" -ge "$max_wait" ]; then
            echo "timed out after ${waited}s waiting for the mlx5_core and ib_umad kernel modules" >&2
            return 1
        fi
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep "$poll_interval"
        waited=$((waited + poll_interval))
    done
)
295+
271296
# For each kernel module configuration file mounted into the container,
272297
# parse the file contents and extract the custom module parameters that
273298
# are to be passed as input to 'modprobe'.
@@ -370,7 +395,18 @@ _load_driver() {
370395
_start_vgpu_topology_daemon
371396
fi
372397

373-
if _assert_nvswitch_system; then
398+
if _assert_nvlink5_system; then
399+
_ensure_nvlink5_prerequisites || return 1
400+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
401+
402+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
403+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
404+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
405+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
406+
/usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
407+
408+
# If this is not an NVLink5+ system, check for the presence of NVLink4 (or below) switches
409+
elif _assert_nvswitch_system; then
374410
echo "Starting NVIDIA fabric manager daemon..."
375411
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
376412
fi
@@ -430,6 +466,21 @@ _unload_driver() {
430466
fi
431467
fi
432468

469+
if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
470+
echo "Stopping NVLink Subnet Manager daemon..."
471+
local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
472+
473+
kill -SIGTERM "${pid}"
474+
for i in $(seq 1 50); do
475+
kill -0 "${pid}" 2> /dev/null || break
476+
sleep 0.1
477+
done
478+
if [ $i -eq 50 ]; then
479+
echo "Could not stop NVLink Subnet Manager daemon" >&2
480+
return 1
481+
fi
482+
fi
483+
433484
echo "Unloading NVIDIA driver kernel modules..."
434485
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
435486
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)

0 commit comments

Comments
 (0)