-
Notifications
You must be signed in to change notification settings - Fork 377
[no-relnote] Add E2E for libnvidia-container #1118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,12 +62,8 @@ func getTestEnv() { | |
|
||
installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false) | ||
|
||
if installCTK { | ||
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME") | ||
|
||
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG") | ||
|
||
} | ||
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME") | ||
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG") | ||
Comment on lines +65 to +66
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a reason we removed the conditional? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes: regardless of whether or not we install the toolkit on the host, I want to be able to get these two variables. |
||
|
||
sshKey = getRequiredEnvvar[string]("E2E_SSH_KEY") | ||
sshUser = getRequiredEnvvar[string]("E2E_SSH_USER") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,11 +28,18 @@ var dockerInstallTemplate = ` | |
#! /usr/bin/env bash | ||
set -xe | ||
|
||
ArangoGutierrez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
: ${IMAGE:={{.Image}}} | ||
|
||
# Create a temporary directory | ||
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM" | ||
mkdir -p "$TEMP_DIR" | ||
# if the TEMP_DIR is already set, use it | ||
if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we managing this state at the shell-script level? Should we not create the temp dir once in the Go code and then use it here in our template? |
||
TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt) | ||
else | ||
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM" | ||
echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt | ||
ArangoGutierrez marked this conversation as resolved.
Show resolved
Hide resolved
|
||
fi | ||
|
||
# if TEMP_DIR does not exist, create it | ||
if [ ! -d "$TEMP_DIR" ]; then | ||
mkdir -p "$TEMP_DIR" | ||
fi | ||
|
||
# Given that docker has an init function that checks for the existence of the | ||
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook | ||
|
@@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \ | |
-v /var/run/docker.sock:/var/run/docker.sock \ | ||
-v "$TEMP_DIR:$TEMP_DIR" \ | ||
-v /etc/docker:/config-root \ | ||
${IMAGE} \ | ||
{{.Image}} \ | ||
--root "$TEMP_DIR" \ | ||
--runtime=docker \ | ||
--config=/config-root/daemon.json \ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
/* | ||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package e2e | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"strings" | ||
|
||
. "github.com/onsi/ginkgo/v2" | ||
. "github.com/onsi/gomega" | ||
) | ||
|
||
const ( | ||
// libnvidiaContainerCliTestTemplate is an on-the-fly script that prepares a | ||
// minimal rootfs with GPU support and finally prints the list of visible GPUs | ||
// using `nvidia-smi -L`. The exit code as well as the standard output of this | ||
// script are therefore a good indicator of whether the nvidia-container-cli | ||
// is functioning correctly inside the container. | ||
libnvidiaContainerCliTestTemplate = `#!/busybox/sh | ||
set -euo pipefail | ||
|
||
# Create a temporary directory and rootfs path | ||
TMPDIR="$(mktemp -d)" | ||
ROOTFS="${TMPDIR}/rootfs" | ||
mkdir -p "${ROOTFS}" | ||
|
||
# Expose ROOTFS for the child namespace | ||
export ROOTFS | ||
|
||
busybox wget -qO- http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \ | ||
| tar -C "$ROOTFS" -xz | ||
|
||
# Enter a new mount + PID namespace so we can pivot_root without touching the | ||
# container's original filesystem. | ||
busybox unshare --mount --pid --fork --propagation private -- busybox sh -eux <<'IN_NS' | ||
: "${ROOTFS:?}" | ||
|
||
# 1 Bind-mount the new root and make the mount private | ||
busybox mount --bind "$ROOTFS" "$ROOTFS" | ||
busybox mount --make-private "$ROOTFS" | ||
cd "$ROOTFS" | ||
|
||
# 2 Minimal virtual filesystems | ||
busybox mount -t proc proc proc | ||
busybox mount -t sysfs sys sys | ||
busybox mount -t tmpfs tmp tmp | ||
busybox mount -t tmpfs run run | ||
|
||
# 3 GPU setup via nvidia-container-cli | ||
# Add potential install locations of nvidia-container-cli to PATH. | ||
# /artifacts/{rpm,deb}/usr/bin are where the binary ends up after the | ||
# packages are extracted in the application image. /work is included for | ||
# completeness since some images may copy the binary there. | ||
export PATH="${PATH}:/artifacts/rpm/usr/bin:/artifacts/deb/usr/bin:/work" | ||
|
||
nvidia-container-cli --load-kmods \ | ||
configure \ | ||
--no-cgroups --utility --device=0 "$(pwd)" | ||
|
||
# 4 Switch root into the prepared filesystem | ||
mkdir -p mnt | ||
busybox pivot_root . mnt | ||
busybox umount -l /mnt | ||
|
||
exec nvidia-smi -L | ||
IN_NS | ||
` | ||
|
||
// dockerRunCmdTemplate is a fmt format string for the `docker run` invocation
// that executes the test script. It takes three %s arguments, in order: the
// container name, the host path of the script (bind-mounted to
// /libnvidia-container-cli.sh and used as the entrypoint), and the image
// reference. The container is started privileged with the nvidia runtime.
// NOTE(review): there is no `-d` flag, so the command runs in the foreground
// even though the spec's comments describe a detached launch - confirm intent.
dockerRunCmdTemplate = `docker run --name %s --privileged --runtime=nvidia \ | ||
-e NVIDIA_VISIBLE_DEVICES=all \ | ||
-e NVIDIA_DRIVER_CAPABILITIES=all \ | ||
-v %s:/libnvidia-container-cli.sh \ | ||
--entrypoint /libnvidia-container-cli.sh %s` | ||
) | ||
|
||
var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() { | ||
var ( | ||
runner Runner | ||
dockerImage = fmt.Sprintf("%s:%s", imageName, imageTag) | ||
containerName = "nvidia-cli-e2e" | ||
hostOutput string | ||
) | ||
|
||
BeforeAll(func(ctx context.Context) { | ||
runner = NewRunner( | ||
WithHost(sshHost), | ||
WithPort(sshPort), | ||
WithSshKey(sshKey), | ||
WithSshUser(sshUser), | ||
) | ||
|
||
if installCTK { | ||
installer, err := NewToolkitInstaller( | ||
WithRunner(runner), | ||
WithImage(imageName+":"+imageTag), | ||
WithTemplate(dockerInstallTemplate), | ||
) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
err = installer.Install() | ||
Expect(err).ToNot(HaveOccurred()) | ||
} | ||
|
||
// Capture the host GPU list. | ||
var err error | ||
hostOutput, _, err = runner.Run("nvidia-smi -L") | ||
Expect(err).ToNot(HaveOccurred()) | ||
}) | ||
|
||
AfterAll(func(ctx context.Context) { | ||
// Cleanup: remove the container and the temporary script on the host. | ||
runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) | ||
}) | ||
|
||
It("should report the same GPUs inside the container as on the host", func(ctx context.Context) { | ||
testScriptPath := "/tmp/libnvidia-container-cli.sh" | ||
|
||
// Write the script to the remote host and make it executable. | ||
createScriptCmd := fmt.Sprintf( | ||
"cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s", | ||
testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath, | ||
) | ||
|
||
_, _, err := runner.Run(createScriptCmd) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// If a container with the same name exists from a previous test run, remove it first. | ||
_, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName)) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// Build the docker run command (detached mode) from the template so it | ||
// stays readable while still resulting in a single-line invocation. | ||
dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScriptPath, dockerImage) | ||
|
||
// Launch the container in detached mode. | ||
output, errOutput, err := runner.Run(dockerRunCmd) | ||
Expect(err).ToNot(HaveOccurred()) | ||
fmt.Println("output: ", output) | ||
fmt.Println("errOutput: ", errOutput) | ||
|
||
hostOutputNormalized := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")) | ||
|
||
// Poll the logs of the already running container until we observe | ||
// the GPU list matching the host or until a 5-minute timeout elapses. | ||
Eventually(func() string { | ||
//check if the container is running | ||
containerOutput, _, err := runner.Run(fmt.Sprintf("docker ps -a | grep %s", containerName)) | ||
if err != nil { | ||
return fmt.Sprintf("error: %v", err) | ||
} | ||
fmt.Println("containerOutput: ", containerOutput) | ||
|
||
logs, logsErr, err := runner.Run(fmt.Sprintf("docker logs %s", containerName)) | ||
if err != nil { | ||
return "" | ||
} | ||
|
||
fmt.Println("logs: ", logs) | ||
fmt.Println("logsErr: ", logsErr) | ||
|
||
lines := strings.Split(strings.TrimSpace(logs), "\n") | ||
if len(lines) == 0 { | ||
return "" | ||
} | ||
|
||
last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", "")) | ||
return last | ||
}, "5m", "10s").Should(Equal(hostOutputNormalized)) | ||
|
||
// Remove the script from the remote host. | ||
runner.Run(fmt.Sprintf("rm -f %s", testScriptPath)) | ||
}) | ||
}) |
Uh oh!
There was an error while loading. Please reload this page.