|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package e2e |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "strings" |
| 23 | + |
| 24 | + . "github.com/onsi/ginkgo/v2" |
| 25 | + . "github.com/onsi/gomega" |
| 26 | +) |
| 27 | + |
const (
	// libnvidiaContainerCliTestTemplate is an on-the-fly script that prepares a
	// minimal rootfs with GPU support and finally prints the list of visible GPUs
	// using `nvidia-smi -L`. The exit code as well as the standard output of this
	// script are therefore a good indicator of whether the nvidia-container-cli
	// is functioning correctly inside the container.
	//
	// All generic shell utilities are invoked through the `busybox` multi-call
	// binary (see NOTE inside the script); only the NVIDIA binaries
	// (nvidia-container-cli, nvidia-smi) are expected to come from the host
	// driver/toolkit installation.
	libnvidiaContainerCliTestTemplate = `#!/busybox/sh
set -euo pipefail

# Create a temporary directory and rootfs path
TMPDIR="$(busybox mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
busybox mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS

# ---------------------------------------------------------------------------
# NOTE: The container image used in this test is a distroless image that only
#       ships BusyBox. Therefore we rely exclusively on BusyBox applets
#       (wget, tar, mount, unshare, etc.) that are expected to be present at
#       /busybox. Keeping all commands prefixed with "busybox" avoids PATH
#       issues and makes it explicit which implementation is used.
# ---------------------------------------------------------------------------

busybox wget -qO- http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
  | busybox tar -C "$ROOTFS" -xz

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container's original filesystem.
busybox unshare --mount --pid --fork --propagation private -- busybox sh -eux <<'IN_NS'
  : "${ROOTFS:?}"

  # 1 Bind-mount the new root and make the mount private
  busybox mount --bind "$ROOTFS" "$ROOTFS"
  busybox mount --make-private "$ROOTFS"
  cd "$ROOTFS"

  # 2 Minimal virtual filesystems
  busybox mount -t proc proc proc
  busybox mount -t sysfs sys sys
  busybox mount -t tmpfs tmp tmp
  busybox mount -t tmpfs run run

  # 3 GPU setup via nvidia-container-cli
  nvidia-container-cli --load-kmods \
    configure \
    --no-cgroups --utility --device=0 "$(pwd)"

  # 4 Switch root into the prepared filesystem
  busybox mkdir -p mnt
  busybox pivot_root . mnt
  busybox umount -l /mnt

  exec nvidia-smi -L
IN_NS
`

	// dockerRunCmdTemplate launches the test container in detached mode.
	// Format verbs (in order): container name, host path of the test script,
	// image reference.
	dockerRunCmdTemplate = `docker run --name %s -d --privileged --runtime=nvidia \
  -e NVIDIA_VISIBLE_DEVICES=all \
  -e NVIDIA_DRIVER_CAPABILITIES=all \
  -v %s:/libnvidia-container-cli.sh \
  --entrypoint /libnvidia-container-cli.sh %s`
)
| 92 | + |
| 93 | +var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() { |
| 94 | + var ( |
| 95 | + runner Runner |
| 96 | + dockerImage = fmt.Sprintf("%s:%s", imageName, imageTag) |
| 97 | + containerName = "nvidia-cli-e2e" |
| 98 | + hostOutput string |
| 99 | + ) |
| 100 | + |
| 101 | + BeforeAll(func(ctx context.Context) { |
| 102 | + runner = NewRunner( |
| 103 | + WithHost(sshHost), |
| 104 | + WithPort(sshPort), |
| 105 | + WithSshKey(sshKey), |
| 106 | + WithSshUser(sshUser), |
| 107 | + ) |
| 108 | + |
| 109 | + if installCTK { |
| 110 | + installer, err := NewToolkitInstaller( |
| 111 | + WithRunner(runner), |
| 112 | + WithImage(imageName+":"+imageTag), |
| 113 | + WithTemplate(dockerInstallTemplate), |
| 114 | + ) |
| 115 | + Expect(err).ToNot(HaveOccurred()) |
| 116 | + |
| 117 | + err = installer.Install() |
| 118 | + Expect(err).ToNot(HaveOccurred()) |
| 119 | + } |
| 120 | + |
| 121 | + // Capture the host GPU list. |
| 122 | + var err error |
| 123 | + hostOutput, _, err = runner.Run("nvidia-smi -L") |
| 124 | + Expect(err).ToNot(HaveOccurred()) |
| 125 | + }) |
| 126 | + |
| 127 | + AfterAll(func(ctx context.Context) { |
| 128 | + // Cleanup: remove the container and the temporary script on the host. |
| 129 | + runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 130 | + }) |
| 131 | + |
| 132 | + It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { |
| 133 | + testScriptPath := "/tmp/libnvidia-container-cli.sh" |
| 134 | + |
| 135 | + // Write the script to the remote host and make it executable. |
| 136 | + createScriptCmd := fmt.Sprintf( |
| 137 | + "cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s", |
| 138 | + testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath, |
| 139 | + ) |
| 140 | + |
| 141 | + _, _, err := runner.Run(createScriptCmd) |
| 142 | + Expect(err).ToNot(HaveOccurred()) |
| 143 | + |
| 144 | + // If a container with the same name exists from a previous test run, remove it first. |
| 145 | + _, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 146 | + Expect(err).ToNot(HaveOccurred()) |
| 147 | + |
| 148 | + // Build the docker run command (detached mode) from the template so it |
| 149 | + // stays readable while still resulting in a single-line invocation. |
| 150 | + dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScriptPath, dockerImage) |
| 151 | + |
| 152 | + // Launch the container in detached mode. |
| 153 | + _, _, err = runner.Run(dockerRunCmd) |
| 154 | + Expect(err).ToNot(HaveOccurred()) |
| 155 | + |
| 156 | + //check if the container is running |
| 157 | + containerOutput, _, err := runner.Run(fmt.Sprintf("docker ps -a | grep %s", containerName)) |
| 158 | + Expect(err).ToNot(HaveOccurred()) |
| 159 | + fmt.Println("containerOutput: ", containerOutput) |
| 160 | + |
| 161 | + hostOutputNormalized := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) |
| 162 | + |
| 163 | + // Poll the logs of the already running container until we observe |
| 164 | + // the GPU list matching the host or until a 5-minute timeout elapses. |
| 165 | + Eventually(func() string { |
| 166 | + logs, _, err := runner.Run(fmt.Sprintf("docker logs --tail 10 %s", containerName)) |
| 167 | + if err != nil { |
| 168 | + return "" |
| 169 | + } |
| 170 | + |
| 171 | + fmt.Println("logs: ", logs) |
| 172 | + |
| 173 | + lines := strings.Split(strings.TrimSpace(logs), "\n") |
| 174 | + if len(lines) == 0 { |
| 175 | + return "" |
| 176 | + } |
| 177 | + |
| 178 | + last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", "")) |
| 179 | + return last |
| 180 | + }, "5m", "5s").Should(Equal(hostOutputNormalized)) |
| 181 | + |
| 182 | + // Remove the script from the remote host. |
| 183 | + runner.Run(fmt.Sprintf("rm -f %s", testScriptPath)) |
| 184 | + }) |
| 185 | +}) |
0 commit comments