
Commit ace18c6

[no-relnote] Add E2E for libnvidia-container
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent f55ef6a commit ace18c6

4 files changed: 207 additions and 13 deletions

tests/e2e/Makefile

Lines changed: 7 additions & 1 deletion
@@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
 
 GINKGO_BIN := $(CURDIR)/bin/ginkgo
 
+# GINKGO_FOCUS selects which test suites to run (an empty value runs all tests).
+# Currently available suites:
+# - nvidia-container-cli
+# - docker
+GINKGO_FOCUS ?= nvidia-container-cli
+
 test: $(GINKGO_BIN)
-	$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
+	$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...
 
 $(GINKGO_BIN):
 	mkdir -p $(CURDIR)/bin
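
For example, the focused suite can be overridden on the make command line (a minimal sketch; the working directory from which this Makefile is invoked is an assumption):

# Run only the docker suite instead of the default nvidia-container-cli suite.
make test GINKGO_FOCUS=docker

# Pass an empty focus expression to run all suites.
make test GINKGO_FOCUS=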

tests/e2e/e2e_test.go

Lines changed: 2 additions & 6 deletions
@@ -62,12 +62,8 @@ func getTestEnv() {
 
 	installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false)
 
-	if installCTK {
-		imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
-
-		imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
-
-	}
+	imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
+	imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
 
 	sshKey = getRequiredEnvvar[string]("E2E_SSH_KEY")
 	sshUser = getRequiredEnvvar[string]("E2E_SSH_USER")
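
With this change, E2E_IMAGE_NAME and E2E_IMAGE_TAG are read unconditionally rather than only when E2E_INSTALL_CTK is set. A minimal sketch of the environment the suite expects (all values below are placeholders; the SSH host and port are configured through analogous variables not shown in this hunk):

export E2E_INSTALL_CTK=true                        # set to false to test a pre-installed toolkit
export E2E_IMAGE_NAME="<container-toolkit-image>"  # placeholder
export E2E_IMAGE_TAG="<tag>"                       # placeholder
export E2E_SSH_KEY="$HOME/.ssh/id_ed25519"         # placeholder path
export E2E_SSH_USER="<remote-user>"                # placeholder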

tests/e2e/installer.go

Lines changed: 13 additions & 6 deletions
@@ -28,11 +28,18 @@ var dockerInstallTemplate = `
 #! /usr/bin/env bash
 set -xe
 
-: ${IMAGE:={{.Image}}}
-
-# Create a temporary directory
-TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
-mkdir -p "$TEMP_DIR"
+# if the TEMP_DIR is already set, use it
+if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
+	TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
+else
+	TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
+	echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
+fi
+
+# if TEMP_DIR does not exist, create it
+if [ ! -d "$TEMP_DIR" ]; then
+	mkdir -p "$TEMP_DIR"
+fi
 
 # Given that docker has an init function that checks for the existence of the
 # nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
@@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \
 	-v /var/run/docker.sock:/var/run/docker.sock \
 	-v "$TEMP_DIR:$TEMP_DIR" \
 	-v /etc/docker:/config-root \
-	${IMAGE} \
+	{{.Image}} \
 	--root "$TEMP_DIR" \
 	--runtime=docker \
 	--config=/config-root/daemon.json \

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package e2e

import (
	"context"
	"fmt"
	"strings"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

const (
	// libnvidiaContainerCliTestTemplate is an on-the-fly script that prepares a
	// minimal rootfs with GPU support and finally prints the list of visible GPUs
	// using `nvidia-smi -L`. The exit code as well as the standard output of this
	// script are therefore a good indicator of whether the nvidia-container-cli
	// is functioning correctly inside the container.
	libnvidiaContainerCliTestTemplate = `#!/busybox/sh
set -euo pipefail

# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS

# ---------------------------------------------------------------------------
# NOTE: The container image used in this test is a distroless image that only
#       ships BusyBox. Therefore we rely exclusively on BusyBox applets
#       (wget, tar, mount, unshare, etc.) that are expected to be present at
#       /busybox. Keeping all commands prefixed with "busybox" avoids PATH
#       issues and makes it explicit which implementation is used.
# ---------------------------------------------------------------------------

busybox wget -qO- http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
	| tar -C "$ROOTFS" -xz

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container's original filesystem.
busybox unshare --mount --pid --fork --propagation private -- busybox sh -eux <<'IN_NS'
: "${ROOTFS:?}"

# 1 Bind-mount the new root and make the mount private
busybox mount --bind "$ROOTFS" "$ROOTFS"
busybox mount --make-private "$ROOTFS"
cd "$ROOTFS"

# 2 Minimal virtual filesystems
busybox mount -t proc proc proc
busybox mount -t sysfs sys sys
busybox mount -t tmpfs tmp tmp
busybox mount -t tmpfs run run

# 3 GPU setup via nvidia-container-cli
nvidia-container-cli --load-kmods \
	configure \
	--no-cgroups --utility --device=0 "$(pwd)"

# 4 Switch root into the prepared filesystem
mkdir -p mnt
busybox pivot_root . mnt
busybox umount -l /mnt

exec nvidia-smi -L
IN_NS
`

	dockerRunCmdTemplate = `docker run --name %s -d --privileged --runtime=nvidia \
	-e NVIDIA_VISIBLE_DEVICES=all \
	-e NVIDIA_DRIVER_CAPABILITIES=all \
	-v %s:/libnvidia-container-cli.sh \
	--entrypoint /libnvidia-container-cli.sh %s`
)

var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() {
	var (
		runner        Runner
		dockerImage   = fmt.Sprintf("%s:%s", imageName, imageTag)
		containerName = "nvidia-cli-e2e"
		hostOutput    string
	)

	BeforeAll(func(ctx context.Context) {
		runner = NewRunner(
			WithHost(sshHost),
			WithPort(sshPort),
			WithSshKey(sshKey),
			WithSshUser(sshUser),
		)

		if installCTK {
			installer, err := NewToolkitInstaller(
				WithRunner(runner),
				WithImage(imageName+":"+imageTag),
				WithTemplate(dockerInstallTemplate),
			)
			Expect(err).ToNot(HaveOccurred())

			err = installer.Install()
			Expect(err).ToNot(HaveOccurred())
		}

		// Capture the host GPU list.
		var err error
		hostOutput, _, err = runner.Run("nvidia-smi -L")
		Expect(err).ToNot(HaveOccurred())
	})

	AfterAll(func(ctx context.Context) {
		// Cleanup: remove the container and the temporary script on the host.
		runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
	})

	It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
		testScriptPath := "/tmp/libnvidia-container-cli.sh"

		// Write the script to the remote host and make it executable.
		createScriptCmd := fmt.Sprintf(
			"cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s",
			testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath,
		)

		_, _, err := runner.Run(createScriptCmd)
		Expect(err).ToNot(HaveOccurred())

		// If a container with the same name exists from a previous test run, remove it first.
		_, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
		Expect(err).ToNot(HaveOccurred())

		// Build the docker run command (detached mode) from the template so it
		// stays readable while still resulting in a single-line invocation.
		dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScriptPath, dockerImage)

		// Launch the container in detached mode.
		_, _, err = runner.Run(dockerRunCmd)
		Expect(err).ToNot(HaveOccurred())

		// Check that the container is running.
		containerOutput, _, err := runner.Run(fmt.Sprintf("docker ps -a | grep %s", containerName))
		Expect(err).ToNot(HaveOccurred())
		fmt.Println("containerOutput: ", containerOutput)

		hostOutputNormalized := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))

		// Poll the logs of the already running container until we observe
		// the GPU list matching the host or until a 5-minute timeout elapses.
		Eventually(func() string {
			logs, _, err := runner.Run(fmt.Sprintf("docker logs --tail 10 %s", containerName))
			if err != nil {
				return ""
			}

			fmt.Println("logs: ", logs)

			lines := strings.Split(strings.TrimSpace(logs), "\n")
			if len(lines) == 0 {
				return ""
			}

			last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", ""))
			return last
		}, "5m", "5s").Should(Equal(hostOutputNormalized))

		// Remove the script from the remote host.
		runner.Run(fmt.Sprintf("rm -f %s", testScriptPath))
	})
})
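
For illustration, with the container name and script path used above and a placeholder toolkit image reference, the command built from dockerRunCmdTemplate resolves to roughly the following:

docker run --name nvidia-cli-e2e -d --privileged --runtime=nvidia \
	-e NVIDIA_VISIBLE_DEVICES=all \
	-e NVIDIA_DRIVER_CAPABILITIES=all \
	-v /tmp/libnvidia-container-cli.sh:/libnvidia-container-cli.sh \
	--entrypoint /libnvidia-container-cli.sh <toolkit-image>:<tag>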
