|
| 1 | +/* |
| 2 | + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package e2e |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "strings" |
| 23 | + |
| 24 | + . "github.com/onsi/ginkgo/v2" |
| 25 | + . "github.com/onsi/gomega" |
| 26 | +) |
| 27 | + |
const (
	// libnvidiaContainerCliTestTemplate is an on-the-fly bash script that
	// prepares a minimal rootfs with GPU support and finally prints the list of
	// visible GPUs using `nvidia-smi -L`. The exit code as well as the standard
	// output of this script are therefore a good indicator of whether the
	// nvidia-container-cli is functioning correctly inside the container.
	//
	// The script runs in two stages:
	//   - the outer shell installs curl/gnupg2, creates a temporary rootfs
	//     directory and exports its path as ROOTFS;
	//   - the inner shell (fed through the quoted 'IN_NS' heredoc, so nothing
	//     is expanded by the outer shell) runs under `unshare` in new mount and
	//     PID namespaces, populates the rootfs, lets nvidia-container-cli
	//     configure it, pivots into it and execs `nvidia-smi -L`.
	//
	// The rootfs path must be produced with command substitution
	// `$(mktemp -d)`; `${mktemp -d}` is parameter-expansion syntax and makes
	// bash abort with "bad substitution". The heredoc terminator IN_NS must
	// stay at column 0 because the heredoc is opened with `<<` and not `<<-`.
	libnvidiaContainerCliTestTemplate = `#!/usr/bin/env bash
set -euo pipefail

apt-get update -y && apt-get install -y curl gnupg2

ROOTFS="$(mktemp -d)/rootfs"
mkdir -p "${ROOTFS}"

export ROOTFS # make them visible in the child shell

unshare --mount --pid --fork --propagation private -- bash -eux <<'IN_NS'
  : "${ROOTFS:?}" # abort if empty

  # 1 Populate minimal Ubuntu base
  curl -L http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
    | tar -C "$ROOTFS" -xz

  # 2 Add non-root user
  useradd -R "$ROOTFS" -U -u 1000 -s /bin/bash nvidia

  # 3 Bind-mount new root and unshare mounts
  mount --bind "$ROOTFS" "$ROOTFS"
  mount --make-private "$ROOTFS"
  cd "$ROOTFS"

  # 4 Minimal virtual filesystems
  mount -t proc proc proc
  mount -t sysfs sys sys
  mount -t tmpfs tmp tmp
  mount -t tmpfs run run

  # 5 GPU setup
  nvidia-container-cli --load-kmods --debug=container-cli.log \
    configure --ldconfig=@/sbin/ldconfig.real \
    --no-cgroups --utility --device=0 "$(pwd)"

  # 6 Switch root
  mkdir -p mnt
  pivot_root . mnt
  umount -l /mnt

  exec nvidia-smi -L
IN_NS
`

	// dockerRunCmdTemplate is the fmt template for the detached, privileged
	// `docker run` invocation that executes the test script as the container
	// entrypoint. It takes three %s arguments, in order: the container name,
	// the host path of the script (bind-mounted to
	// /libnvidia-container-cli.sh) and the image reference. The trailing
	// backslashes are shell line continuations, so the shell sees one command.
	dockerRunCmdTemplate = `docker run --name %s -d --privileged --runtime=nvidia \
    -e NVIDIA_VISIBLE_DEVICES=all \
    -e NVIDIA_DRIVER_CAPABILITIES=all \
    -v %s:/libnvidia-container-cli.sh \
    --entrypoint /libnvidia-container-cli.sh %s`
)
| 85 | + |
| 86 | +var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() { |
| 87 | + var ( |
| 88 | + runner Runner |
| 89 | + dockerImage = fmt.Sprintf("%s:%s", imageName, imageTag) |
| 90 | + containerName = "nvidia-cli-e2e" |
| 91 | + ) |
| 92 | + |
| 93 | + BeforeAll(func(ctx context.Context) { |
| 94 | + runner = NewRunner( |
| 95 | + WithHost(sshHost), |
| 96 | + WithPort(sshPort), |
| 97 | + WithSshKey(sshKey), |
| 98 | + WithSshUser(sshUser), |
| 99 | + ) |
| 100 | + |
| 101 | + if installCTK { |
| 102 | + installer, err := NewToolkitInstaller( |
| 103 | + WithRunner(runner), |
| 104 | + WithImage(imageName+":"+imageTag), |
| 105 | + WithTemplate(dockerInstallTemplate), |
| 106 | + ) |
| 107 | + Expect(err).ToNot(HaveOccurred()) |
| 108 | + |
| 109 | + err = installer.Install() |
| 110 | + Expect(err).ToNot(HaveOccurred()) |
| 111 | + } |
| 112 | + }) |
| 113 | + |
| 114 | + AfterAll(func(ctx context.Context) { |
| 115 | + // Cleanup: remove the container and the temporary script on the host. |
| 116 | + runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 117 | + }) |
| 118 | + |
| 119 | + It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { |
| 120 | + testScript := "/tmp/libnvidia-container-cli.sh" |
| 121 | + |
| 122 | + // Write the script to the remote host and make it executable. |
| 123 | + createScriptCmd := fmt.Sprintf( |
| 124 | + "cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s", |
| 125 | + testScript, libnvidiaContainerCliTestTemplate, testScript, |
| 126 | + ) |
| 127 | + |
| 128 | + _, _, err := runner.Run(createScriptCmd) |
| 129 | + Expect(err).ToNot(HaveOccurred()) |
| 130 | + |
| 131 | + // If a container with the same name exists from a previous test run, remove it first. |
| 132 | + runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) |
| 133 | + |
| 134 | + // Build the docker run command (detached mode) from the template so it |
| 135 | + // stays readable while still resulting in a single-line invocation. |
| 136 | + dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScript, dockerImage) |
| 137 | + |
| 138 | + // Launch the container in detached mode. |
| 139 | + _, _, err = runner.Run(dockerRunCmd) |
| 140 | + Expect(err).ToNot(HaveOccurred()) |
| 141 | + |
| 142 | + // Capture the host GPU list. |
| 143 | + hostOutput, _, err := runner.Run("nvidia-smi -L") |
| 144 | + Expect(err).ToNot(HaveOccurred()) |
| 145 | + |
| 146 | + hostOutput = strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) |
| 147 | + |
| 148 | + // Poll the logs of the already running container until we observe |
| 149 | + // the GPU list matching the host or until a 5-minute timeout elapses. |
| 150 | + Eventually(func() string { |
| 151 | + logs, _, err := runner.Run(fmt.Sprintf("docker logs --tail 1 %s", containerName)) |
| 152 | + if err != nil { |
| 153 | + return "" |
| 154 | + } |
| 155 | + |
| 156 | + lines := strings.Split(strings.TrimSpace(logs), "\n") |
| 157 | + if len(lines) == 0 { |
| 158 | + return "" |
| 159 | + } |
| 160 | + |
| 161 | + last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", "")) |
| 162 | + return last |
| 163 | + }, "5m", "5s").Should(Equal(hostOutput)) |
| 164 | + |
| 165 | + // Remove the script from the remote host. |
| 166 | + runner.Run(fmt.Sprintf("rm -f %s", testScript)) |
| 167 | + }) |
| 168 | +}) |
0 commit comments