@@ -268,6 +268,31 @@ _assert_nvswitch_system() {
268
268
return 0
269
269
}
270
270
271
# Detect an NVLink5+ system by searching the VPD file of every device under
# the InfiniBand sysfs class directory for the string 'SW_MNG'.
#
# Arguments:
#   $1 - optional class directory to scan (default: /sys/class/infiniband);
#        files are looked up as <class dir>/*/device/vpd
# Outputs:
#   "Detected NVLink5+ system" on stdout when a match is found
# Returns:
#   0 if at least one VPD file contains 'SW_MNG', 1 otherwise
#
# The function body is a subshell, so the variables below never leak into
# the caller's environment.
_assert_nvlink5_system () (
    ib_class_dir=${1:-/sys/class/infiniband}

    for dir in "$ib_class_dir"/*/device; do
        # Define the path to the VPD file
        vpd_file="$dir/vpd"

        # Skip devices without a VPD file. This also covers the case where
        # the glob matched nothing and was left as a literal pattern.
        [ -f "$vpd_file" ] || continue

        # Search for 'SW_MNG' in the VPD file
        if grep -q "SW_MNG" "$vpd_file"; then
            echo "Detected NVLink5+ system"
            return 0
        fi
    done

    return 1
)
287
+
288
# Wait until both the mlx5_core and ib_umad kernel modules are listed by
# 'lsmod'.
#
# Arguments:
#   $1 - optional timeout in whole seconds; 0 (the default) waits forever
#   $2 - optional polling interval in seconds (default: 10); must be a whole
#        number when a timeout is given
# Outputs:
#   a progress message on stdout before every sleep, and an error message
#   on stderr if the timeout expires
# Returns:
#   0 once both modules are loaded, 1 if the timeout expires first
#
# The function body is a subshell, so the variables below never leak into
# the caller's environment.
_ensure_nvlink5_prerequisites () (
    timeout=${1:-0}
    interval=${2:-10}
    waited=0

    # Compare against the module-name column only, so that a name which
    # merely contains 'mlx5_core' or 'ib_umad' does not count as a match.
    # awk consumes all of lsmod's output before exiting.
    until lsmod | awk '
            $1 == "mlx5_core" { core = 1 }
            $1 == "ib_umad"   { umad = 1 }
            END { exit !(core && umad) }'
    do
        if [ "$timeout" -gt 0 ]; then
            if [ "$waited" -ge "$timeout" ]; then
                echo "Timed out waiting for the mlx5_core and ib_umad kernel modules" >&2
                return 1
            fi
            # Elapsed time is approximated by the accumulated sleep time.
            waited=$((waited + interval))
        fi

        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep "$interval"
    done
)
295
+
271
296
# For each kernel module configuration file mounted into the container,
272
297
# parse the file contents and extract the custom module parameters that
273
298
# are to be passed as input to 'modprobe'.
@@ -370,7 +395,18 @@ _load_driver() {
370
395
_start_vgpu_topology_daemon
371
396
fi
372
397
373
- if _assert_nvswitch_system; then
398
+ if _assert_nvlink5_system; then
399
+ _ensure_nvlink5_prerequisites || return 1
400
+ echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
401
+
402
+ fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
403
+ fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
404
+ nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
405
+ nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
406
+ /usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
407
+
408
+ # If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
409
+ elif _assert_nvswitch_system; then
374
410
echo " Starting NVIDIA fabric manager daemon..."
375
411
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
376
412
fi
@@ -430,6 +466,21 @@ _unload_driver() {
430
466
fi
431
467
fi
432
468
469
+ if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
470
+ echo " Stopping NVLink Subnet Manager daemon..."
471
+ local pid=$( < /var/run/nvidia-fabricmanager/nvlsm.pid)
472
+
473
+ kill -SIGTERM " ${pid} "
474
+ for i in $( seq 1 50) ; do
475
+ kill -0 " ${pid} " 2> /dev/null || break
476
+ sleep 0.1
477
+ done
478
+ if [ $i -eq 50 ]; then
479
+ echo " Could not stop NVLink Subnet Manager daemon" >&2
480
+ return 1
481
+ fi
482
+ fi
483
+
433
484
echo " Unloading NVIDIA driver kernel modules..."
434
485
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
435
486
nvidia_modeset_refs=$( < /sys/module/nvidia_modeset/refcnt)
0 commit comments