|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package e2e |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "strings" |
| 23 | + "text/template" |
| 24 | + |
| 25 | + . "github.com/onsi/ginkgo/v2" |
| 26 | + . "github.com/onsi/gomega" |
| 27 | +) |
| 28 | + |
const (
	// libnvidiaContainerCliDockerRunTemplate is a text/template for the
	// `docker run` command that starts the detached, privileged test container
	// named test-nvidia-container-cli with the nvidia runtime. The template
	// fields supply the host-side source (and, for the *.real binaries and the
	// resolved libraries, the in-container target) paths of the toolkit
	// binaries and libnvidia-container libraries that are bind-mounted into the
	// container. The test script mounted from $HOME/libnvidia-container-cli.sh
	// is used as the container entrypoint on the ubuntu image.
	libnvidiaContainerCliDockerRunTemplate = `
docker run -d --name test-nvidia-container-cli \
 --privileged \
 --runtime=nvidia \
 -e NVIDIA_VISIBLE_DEVICES=all \
 -e NVIDIA_DRIVER_CAPABILITIES=all \
 -v $HOME/libnvidia-container-cli.sh:/usr/local/bin/libnvidia-container-cli.sh \
 -v {{.NvidiaContainerCliSrc}}:/usr/bin/nvidia-container-cli \
 -v {{.NvidiaContainerCliRealSrc}}:{{.NvidiaContainerCliRealTarget}} \
 -v {{.NvidiaCtkSrc}}:/usr/bin/nvidia-ctk \
 -v {{.NvidiaCtkRealSrc}}:{{.NvidiaCtkRealTarget}} \
 -v {{.NvidiaContainerRuntimeSrc}}:/usr/bin/nvidia-container-runtime \
 -v {{.NvidiaContainerRuntimeRealSrc}}:{{.NvidiaContainerRuntimeRealTarget}} \
 -v {{.NvidiaContainerRuntimeHookSrc}}:/usr/bin/nvidia-container-runtime-hook \
 -v {{.NvidiaContainerRuntimeHookRealSrc}}:{{.NvidiaContainerRuntimeHookRealTarget}} \
 -v {{.NvidiaContainerToolkitSrc}}:/usr/bin/nvidia-container-toolkit \
 -v {{.NvidiaCdiHookSrc}}:/usr/bin/nvidia-cdi-hook \
 -v {{.NvidiaCdiHookRealSrc}}:{{.NvidiaCdiHookRealTarget}} \
 -v {{.NvidiaContainerRuntimeCdiSrc}}:/usr/bin/nvidia-container-runtime.cdi \
 -v {{.NvidiaContainerRuntimeCdiRealSrc}}:{{.NvidiaContainerRuntimeCdiRealTarget}} \
 -v {{.NvidiaContainerRuntimeLegacySrc}}:/usr/bin/nvidia-container-runtime.legacy \
 -v {{.NvidiaContainerRuntimeLegacyRealSrc}}:{{.NvidiaContainerRuntimeLegacyRealTarget}} \
 -v {{.ToolkitDir}}/toolkit:/usr/local/nvidia/toolkit \
 -v /etc/nvidia-container-runtime:/etc/nvidia-container-runtime \
 -v {{.LibNvidiaContainerSo1Src}}:/usr/lib/x86_64-linux-gnu/libnvidia-container.so.1 \
 -v {{.LibNvidiaContainerTargetSrc}}:/usr/lib/x86_64-linux-gnu/{{.LibNvidiaContainerTarget}} \
 -v {{.LibNvidiaContainerGoSo1Src}}:/usr/lib/x86_64-linux-gnu/libnvidia-container-go.so.1 \
 -v {{.LibNvidiaContainerGoTargetSrc}}:/usr/lib/x86_64-linux-gnu/{{.LibNvidiaContainerGoTarget}} \
 -e LD_LIBRARY_PATH=/usr/lib64:/usr/lib/x86_64-linux-gnu:/usr/lib/aarch64-linux-gnu:/lib64:/lib/x86_64-linux-gnu:/lib/aarch64-linux-gnu \
 --entrypoint /usr/local/bin/libnvidia-container-cli.sh \
 ubuntu
`

	// libnvidiaContainerCliTestTemplate is the bash script used as the
	// container entrypoint. It installs curl and gnupg2, creates a temporary
	// root filesystem directory, and then, inside new mount and PID namespaces
	// created with unshare, unpacks an Ubuntu 22.04 base tarball into it, adds
	// an "nvidia" user, mounts proc/sysfs/tmpfs, runs
	// `nvidia-container-cli configure` against that rootfs, pivots into it and
	// finally execs `nvidia-smi -L`. The script contains a nested here-document
	// terminated by IN_NS, so that terminator must stay at column 0.
	libnvidiaContainerCliTestTemplate = `#!/usr/bin/env bash
set -euo pipefail

apt-get update -y && apt-get install -y curl gnupg2

WORKDIR="$(mktemp -d)"
ROOTFS="${WORKDIR}/rootfs"
mkdir -p "${ROOTFS}"

export WORKDIR ROOTFS # make them visible in the child shell

unshare --mount --pid --fork --propagation private -- bash -eux <<'IN_NS'
 : "${ROOTFS:?}" "${WORKDIR:?}" # abort if either is empty

 # 1 Populate minimal Ubuntu base
 curl -L http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
 | tar -C "$ROOTFS" -xz

 # 2 Add non-root user
 useradd -R "$ROOTFS" -U -u 1000 -s /bin/bash nvidia

 # 3 Bind-mount new root and unshare mounts
 mount --bind "$ROOTFS" "$ROOTFS"
 mount --make-private "$ROOTFS"
 cd "$ROOTFS"

 # 4 Minimal virtual filesystems
 mount -t proc proc proc
 mount -t sysfs sys sys
 mount -t tmpfs tmp tmp
 mount -t tmpfs run run

 # 5 GPU setup
 nvidia-container-cli --load-kmods --debug=container-cli.log \
 configure --ldconfig=@/sbin/ldconfig.real \
 --no-cgroups --utility --device=0 "$(pwd)"

 # 6 Switch root
 mkdir -p mnt
 pivot_root . mnt
 umount -l /mnt

 exec nvidia-smi -L
IN_NS
`
)
| 109 | + |
| 110 | +// getToolkitDir tries to read the toolkit dir from /tmp/ctk_e2e_temp_dir.txt using the runner. Returns empty string if not found. |
| 111 | +func getToolkitDir(runner Runner) string { |
| 112 | + out, _, err := runner.Run("cat /tmp/ctk_e2e_temp_dir.txt") |
| 113 | + if err == nil { |
| 114 | + dir := strings.TrimSpace(out) |
| 115 | + if dir != "" { |
| 116 | + return dir |
| 117 | + } |
| 118 | + } |
| 119 | + return "" |
| 120 | +} |
| 121 | + |
| 122 | +// getToolkitLayout returns the toolkit dir, a flag for flat layout, and a function to get the source path for a given filename. |
| 123 | +func getToolkitLayout(runner Runner) (toolkitDir string, useFlatLayout bool, srcPath func(string) string) { |
| 124 | + tempDir := getToolkitDir(runner) |
| 125 | + if tempDir == "" { |
| 126 | + return "", false, func(filename string) string { |
| 127 | + if strings.HasPrefix(filename, "lib") { |
| 128 | + return "/usr/lib/x86_64-linux-gnu/" + filename |
| 129 | + } |
| 130 | + return "/usr/bin/" + filename |
| 131 | + } |
| 132 | + } |
| 133 | + return tempDir, true, func(filename string) string { |
| 134 | + return tempDir + "/toolkit/" + filename |
| 135 | + } |
| 136 | +} |
| 137 | + |
// getRealTargetPath computes the path at which a ".real" binary is mounted:
// under /usr/bin/ when toolkitDir is empty, and under "<toolkitDir>/toolkit/"
// otherwise.
func getRealTargetPath(filename, toolkitDir string) string {
	prefix := "/usr/bin/"
	if toolkitDir != "" {
		prefix = toolkitDir + "/toolkit/"
	}
	return prefix + filename
}
| 145 | + |
| 146 | +// Integration tests for Docker runtime |
| 147 | +var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() { |
| 148 | + var runner Runner |
| 149 | + |
| 150 | + // Install the NVIDIA Container Toolkit |
| 151 | + BeforeAll(func(ctx context.Context) { |
| 152 | + runner = NewRunner( |
| 153 | + WithHost(sshHost), |
| 154 | + WithPort(sshPort), |
| 155 | + WithSshKey(sshKey), |
| 156 | + WithSshUser(sshUser), |
| 157 | + ) |
| 158 | + |
| 159 | + if installCTK { |
| 160 | + installer, err := NewToolkitInstaller( |
| 161 | + WithRunner(runner), |
| 162 | + WithImage(imageName+":"+imageTag), |
| 163 | + WithTemplate(dockerInstallTemplate), |
| 164 | + ) |
| 165 | + Expect(err).ToNot(HaveOccurred()) |
| 166 | + |
| 167 | + err = installer.Install() |
| 168 | + Expect(err).ToNot(HaveOccurred()) |
| 169 | + } |
| 170 | + }) |
| 171 | + |
| 172 | + When("running nvidia-smi -L", Ordered, func() { |
| 173 | + var hostOutput string |
| 174 | + var err error |
| 175 | + |
| 176 | + BeforeAll(func(ctx context.Context) { |
| 177 | + hostOutput, _, err = runner.Run("nvidia-smi -L") |
| 178 | + Expect(err).ToNot(HaveOccurred()) |
| 179 | + |
| 180 | + _, _, err := runner.Run("docker pull ubuntu") |
| 181 | + Expect(err).ToNot(HaveOccurred()) |
| 182 | + }) |
| 183 | + |
| 184 | + AfterAll(func(ctx context.Context) { |
| 185 | + _, _, err := runner.Run("docker rm -f test-nvidia-container-cli") |
| 186 | + Expect(err).ToNot(HaveOccurred()) |
| 187 | + }) |
| 188 | + |
| 189 | + It("should support NVIDIA_VISIBLE_DEVICES and NVIDIA_DRIVER_CAPABILITIES", func(ctx context.Context) { |
| 190 | + // 1. Create the test script on the remote host at $HOME/test.sh using a here-document |
| 191 | + testScriptPath := "$HOME/libnvidia-container-cli.sh" |
| 192 | + testScript := libnvidiaContainerCliTestTemplate |
| 193 | + createScriptCmd := fmt.Sprintf("cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s", testScriptPath, testScript, testScriptPath) |
| 194 | + _, _, err := runner.Run(createScriptCmd) |
| 195 | + Expect(err).ToNot(HaveOccurred()) |
| 196 | + |
| 197 | + // 2. Discover the symlink targets for the libraries on the remote host |
| 198 | + getTargetCmd := func(lib string) string { |
| 199 | + return fmt.Sprintf("readlink -f /usr/lib/x86_64-linux-gnu/%s.1", lib) |
| 200 | + } |
| 201 | + libNvidiaContainerTarget, _, err := runner.Run(getTargetCmd("libnvidia-container.so")) |
| 202 | + Expect(err).ToNot(HaveOccurred()) |
| 203 | + |
| 204 | + libNvidiaContainerTarget = strings.TrimSpace(libNvidiaContainerTarget) |
| 205 | + libNvidiaContainerTarget = strings.TrimPrefix(libNvidiaContainerTarget, "/usr/lib/x86_64-linux-gnu/") |
| 206 | + |
| 207 | + libNvidiaContainerGoTarget, _, err := runner.Run(getTargetCmd("libnvidia-container-go.so")) |
| 208 | + Expect(err).ToNot(HaveOccurred()) |
| 209 | + |
| 210 | + libNvidiaContainerGoTarget = strings.TrimSpace(libNvidiaContainerGoTarget) |
| 211 | + libNvidiaContainerGoTarget = strings.TrimPrefix(libNvidiaContainerGoTarget, "/usr/lib/x86_64-linux-gnu/") |
| 212 | + |
| 213 | + // 3. Get toolkit layout info and source path helper |
| 214 | + toolkitDir, _, srcPath := getToolkitLayout(runner) |
| 215 | + |
| 216 | + // 4. Render the docker run template with the discovered targets and computed source paths |
| 217 | + tmpl, err := template.New("dockerRun").Parse(libnvidiaContainerCliDockerRunTemplate) |
| 218 | + Expect(err).ToNot(HaveOccurred()) |
| 219 | + var dockerRunCmdBuilder strings.Builder |
| 220 | + err = tmpl.Execute(&dockerRunCmdBuilder, map[string]string{ |
| 221 | + "ToolkitDir": toolkitDir, |
| 222 | + "NvidiaContainerCliSrc": srcPath("nvidia-container-cli"), |
| 223 | + "NvidiaContainerCliRealSrc": srcPath("nvidia-container-cli.real"), |
| 224 | + "NvidiaContainerCliRealTarget": getRealTargetPath("nvidia-container-cli.real", toolkitDir), |
| 225 | + "NvidiaCtkSrc": srcPath("nvidia-ctk"), |
| 226 | + "NvidiaCtkRealSrc": srcPath("nvidia-ctk.real"), |
| 227 | + "NvidiaCtkRealTarget": getRealTargetPath("nvidia-ctk.real", toolkitDir), |
| 228 | + "NvidiaContainerRuntimeSrc": srcPath("nvidia-container-runtime"), |
| 229 | + "NvidiaContainerRuntimeRealSrc": srcPath("nvidia-container-runtime.real"), |
| 230 | + "NvidiaContainerRuntimeRealTarget": getRealTargetPath("nvidia-container-runtime.real", toolkitDir), |
| 231 | + "NvidiaContainerRuntimeHookSrc": srcPath("nvidia-container-runtime-hook"), |
| 232 | + "NvidiaContainerRuntimeHookRealSrc": srcPath("nvidia-container-runtime-hook.real"), |
| 233 | + "NvidiaContainerRuntimeHookRealTarget": getRealTargetPath("nvidia-container-runtime-hook.real", toolkitDir), |
| 234 | + "NvidiaContainerToolkitSrc": srcPath("nvidia-container-toolkit"), |
| 235 | + "NvidiaCdiHookSrc": srcPath("nvidia-cdi-hook"), |
| 236 | + "NvidiaCdiHookRealSrc": srcPath("nvidia-cdi-hook.real"), |
| 237 | + "NvidiaCdiHookRealTarget": getRealTargetPath("nvidia-cdi-hook.real", toolkitDir), |
| 238 | + "NvidiaContainerRuntimeCdiSrc": srcPath("nvidia-container-runtime.cdi"), |
| 239 | + "NvidiaContainerRuntimeCdiRealSrc": srcPath("nvidia-container-runtime.cdi.real"), |
| 240 | + "NvidiaContainerRuntimeCdiRealTarget": getRealTargetPath("nvidia-container-runtime.cdi.real", toolkitDir), |
| 241 | + "NvidiaContainerRuntimeLegacySrc": srcPath("nvidia-container-runtime.legacy"), |
| 242 | + "NvidiaContainerRuntimeLegacyRealSrc": srcPath("nvidia-container-runtime.legacy.real"), |
| 243 | + "NvidiaContainerRuntimeLegacyRealTarget": getRealTargetPath("nvidia-container-runtime.legacy.real", toolkitDir), |
| 244 | + "LibNvidiaContainerSo1Src": srcPath("libnvidia-container.so.1"), |
| 245 | + "LibNvidiaContainerTargetSrc": srcPath(libNvidiaContainerTarget), |
| 246 | + "LibNvidiaContainerGoSo1Src": srcPath("libnvidia-container-go.so.1"), |
| 247 | + "LibNvidiaContainerGoTargetSrc": srcPath(libNvidiaContainerGoTarget), |
| 248 | + "LibNvidiaContainerTarget": libNvidiaContainerTarget, |
| 249 | + "LibNvidiaContainerGoTarget": libNvidiaContainerGoTarget, |
| 250 | + }) |
| 251 | + Expect(err).ToNot(HaveOccurred()) |
| 252 | + dockerRunCmd := dockerRunCmdBuilder.String() |
| 253 | + |
| 254 | + // 5. Start the container using the rendered docker run command |
| 255 | + _, _, err = runner.Run(dockerRunCmd) |
| 256 | + Expect(err).ToNot(HaveOccurred()) |
| 257 | + |
| 258 | + // 6. Use Eventually to check the container logs contain hostOutput |
| 259 | + expected := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) |
| 260 | + Eventually(func() string { |
| 261 | + logs, _, err := runner.Run("docker logs test-nvidia-container-cli | tail -n 20") |
| 262 | + if err != nil { |
| 263 | + return "" |
| 264 | + } |
| 265 | + |
| 266 | + logLines := strings.Split(strings.TrimSpace(logs), "\n") |
| 267 | + if len(logLines) == 0 { |
| 268 | + return "" |
| 269 | + } |
| 270 | + lastLine := strings.TrimSpace(strings.ReplaceAll(logLines[len(logLines)-1], "\r", "")) |
| 271 | + return lastLine |
| 272 | + }, "5m", "5s").Should(Equal(expected)) |
| 273 | + }) |
| 274 | + }) |
| 275 | +}) |
0 commit comments