|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package e2e |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "strings" |
| 23 | + |
| 24 | + . "github.com/onsi/ginkgo/v2" |
| 25 | + . "github.com/onsi/gomega" |
| 26 | +) |
| 27 | + |
const (
	// libnvidiaContainerCliTestTemplate is an on-the-fly script that prepares a
	// minimal rootfs with GPU support and finally prints the list of visible GPUs
	// using `nvidia-smi -L`. The exit code as well as the standard output of this
	// script are therefore a good indicator of whether the nvidia-container-cli
	// is functioning correctly inside the container.
	//
	// All generic shell utilities are invoked through the `busybox` multi-call
	// binary (see NOTE inside the script); only the NVIDIA binaries
	// (nvidia-container-cli, nvidia-smi) are expected to come from the host
	// driver/toolkit installation.
	libnvidiaContainerCliTestTemplate = `#!/busybox/sh
set -euo pipefail

# Create a temporary directory and rootfs path
TMPDIR="$(busybox mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
busybox mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS

# ---------------------------------------------------------------------------
# NOTE: The container image used in this test is a distroless image that only
#       ships BusyBox. Therefore we rely exclusively on BusyBox applets
#       (wget, tar, mount, unshare, etc.) that are expected to be present at
#       /busybox. Keeping all commands prefixed with "busybox" avoids PATH
#       issues and makes it explicit which implementation is used.
# ---------------------------------------------------------------------------

busybox wget -qO- http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
  | busybox tar -C "$ROOTFS" -xz

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container's original filesystem.
busybox unshare --mount --pid --fork --propagation private -- busybox sh -eux <<'IN_NS'
  : "${ROOTFS:?}"

  # 1 Bind-mount the new root and make the mount private
  busybox mount --bind "$ROOTFS" "$ROOTFS"
  busybox mount --make-private "$ROOTFS"
  cd "$ROOTFS"

  # 2 Minimal virtual filesystems
  busybox mount -t proc proc proc
  busybox mount -t sysfs sys sys
  busybox mount -t tmpfs tmp tmp
  busybox mount -t tmpfs run run

  # 3 GPU setup via nvidia-container-cli
  nvidia-container-cli --load-kmods \
    configure \
    --no-cgroups --utility --device=0 "$(pwd)"

  # 4 Switch root into the prepared filesystem
  busybox mkdir -p mnt
  busybox pivot_root . mnt
  busybox umount -l /mnt

  exec nvidia-smi -L
IN_NS
`

	// dockerRunCmdTemplate launches the test container in detached mode.
	// Format verbs (in order): container name, host path of the test script,
	// image reference.
	dockerRunCmdTemplate = `docker run --name %s -d --privileged --runtime=nvidia \
  -e NVIDIA_VISIBLE_DEVICES=all \
  -e NVIDIA_DRIVER_CAPABILITIES=all \
  -v %s:/libnvidia-container-cli.sh \
  --entrypoint /libnvidia-container-cli.sh %s`
)
| 92 | + |
| 93 | +var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() { |
| 94 | + var ( |
| 95 | + runner Runner |
| 96 | + dockerImage = fmt.Sprintf("%s:%s", imageName, imageTag) |
| 97 | + containerName = "nvidia-cli-e2e" |
| 98 | + hostOutput string |
| 99 | + ) |
| 100 | + |
| 101 | + BeforeAll(func(ctx context.Context) { |
| 102 | + runner = NewRunner( |
| 103 | + WithHost(sshHost), |
| 104 | + WithPort(sshPort), |
| 105 | + WithSshKey(sshKey), |
| 106 | + WithSshUser(sshUser), |
| 107 | + ) |
| 108 | + |
| 109 | + if installCTK { |
| 110 | + installer, err := NewToolkitInstaller( |
| 111 | + WithRunner(runner), |
| 112 | + WithImage(imageName+":"+imageTag), |
| 113 | + WithTemplate(dockerInstallTemplate), |
| 114 | + ) |
| 115 | + Expect(err).ToNot(HaveOccurred()) |
| 116 | + |
| 117 | + err = installer.Install() |
| 118 | + Expect(err).ToNot(HaveOccurred()) |
| 119 | + } |
| 120 | + |
| 121 | + // Capture the host GPU list. |
| 122 | + var err error |
| 123 | + hostOutput, _, err = runner.Run("nvidia-smi -L") |
| 124 | + Expect(err).ToNot(HaveOccurred()) |
| 125 | + }) |
| 126 | + |
| 127 | + AfterAll(func(ctx context.Context) { |
| 128 | + // Cleanup: remove the container and the temporary script on the host. |
| 129 | + runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 130 | + }) |
| 131 | + |
| 132 | + It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { |
| 133 | + testScriptPath := "/tmp/libnvidia-container-cli.sh" |
| 134 | + |
| 135 | + // Write the script to the remote host and make it executable. |
| 136 | + createScriptCmd := fmt.Sprintf( |
| 137 | + "cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s", |
| 138 | + testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath, |
| 139 | + ) |
| 140 | + |
| 141 | + _, _, err := runner.Run(createScriptCmd) |
| 142 | + Expect(err).ToNot(HaveOccurred()) |
| 143 | + |
| 144 | + // If a container with the same name exists from a previous test run, remove it first. |
| 145 | + _, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 146 | + Expect(err).ToNot(HaveOccurred()) |
| 147 | + |
| 148 | + // Build the docker run command (detached mode) from the template so it |
| 149 | + // stays readable while still resulting in a single-line invocation. |
| 150 | + dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScriptPath, dockerImage) |
| 151 | + |
| 152 | + // Launch the container in detached mode. |
| 153 | + _, _, err = runner.Run(dockerRunCmd) |
| 154 | + Expect(err).ToNot(HaveOccurred()) |
| 155 | + |
| 156 | + //check if the container is running |
| 157 | + containerOutput, _, err := runner.Run(fmt.Sprintf("docker ps -a | grep %s", containerName)) |
| 158 | + Expect(err).ToNot(HaveOccurred()) |
| 159 | + fmt.Println("containerOutput: ", containerOutput) |
| 160 | + |
| 161 | + hostOutputNormalized := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) |
| 162 | + |
| 163 | + // Poll the logs of the already running container until we observe |
| 164 | + // the GPU list matching the host or until a 5-minute timeout elapses. |
| 165 | + Eventually(func() string { |
| 166 | + logs, _, err := runner.Run(fmt.Sprintf("docker logs --tail 10 %s", containerName)) |
| 167 | + if err != nil { |
| 168 | + return "" |
| 169 | + } |
| 170 | + |
| 171 | + fmt.Println("logs: ", logs) |
| 172 | + |
| 173 | + lines := strings.Split(strings.TrimSpace(logs), "\n") |
| 174 | + if len(lines) == 0 { |
| 175 | + return "" |
| 176 | + } |
| 177 | + |
| 178 | + last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", "")) |
| 179 | + return last |
| 180 | + }, "5m", "5s").Should(Equal(hostOutputNormalized)) |
| 181 | + |
| 182 | + // Remove the script from the remote host. |
| 183 | + runner.Run(fmt.Sprintf("rm -f %s", testScriptPath)) |
| 184 | + }) |
| 185 | +}) |
0 commit comments