[no-relnote] Add E2E for libnvidia-container #1118

Open · wants to merge 1 commit into main
8 changes: 7 additions & 1 deletion tests/e2e/Makefile
@@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs

GINKGO_BIN := $(CURDIR)/bin/ginkgo

# GINKGO_FOCUS is passed to ginkgo's --focus flag to select which suite to run.
# Currently available suites:
# - nvidia-container-cli
# - docker
# By default only the nvidia-container-cli suite runs; override GINKGO_FOCUS to change this.
GINKGO_FOCUS ?= nvidia-container-cli

test: $(GINKGO_BIN)
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...

$(GINKGO_BIN):
mkdir -p $(CURDIR)/bin
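Because GINKGO_FOCUS is forwarded to ginkgo's --focus flag, which accepts a regular expression, a different suite (or several) can be selected when invoking make. The lines below are an illustrative sketch; the exact make entry point may differ depending on how the e2e Makefile is driven.

# Run the docker suite instead of the default nvidia-container-cli focus.
make test GINKGO_FOCUS=docker

# Run both currently available suites by passing a regular expression.
make test GINKGO_FOCUS="nvidia-container-cli|docker"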
8 changes: 2 additions & 6 deletions tests/e2e/e2e_test.go
@@ -62,12 +62,8 @@ func getTestEnv() {

installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false)

if installCTK {
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")

imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")

}
imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME")
imageTag = getRequiredEnvvar[string]("E2E_IMAGE_TAG")
Comment on lines +65 to +66

Member: Is there a reason we removed the conditional?

Collaborator (Author): Yes, regardless of whether or not we install the toolkit on the host, I want to be able to get these two variables.

(An example invocation with both variables set follows this file's diff.)

sshKey = getRequiredEnvvar[string]("E2E_SSH_KEY")
sshUser = getRequiredEnvvar[string]("E2E_SSH_USER")
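With the conditional removed, E2E_IMAGE_NAME and E2E_IMAGE_TAG must be set for every run, not only when the toolkit is installed on the host. The invocation below is illustrative only: the image reference, key path, and user are placeholders, and additional variables (for example the SSH host) may be required by the runner but are not shown in this diff.

E2E_INSTALL_CTK=false \
E2E_IMAGE_NAME=<toolkit-image> \
E2E_IMAGE_TAG=<tag> \
E2E_SSH_KEY=$HOME/.ssh/<key> \
E2E_SSH_USER=<user> \
  make test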
19 changes: 13 additions & 6 deletions tests/e2e/installer.go
@@ -28,11 +28,18 @@ var dockerInstallTemplate = `
#! /usr/bin/env bash
set -xe

: ${IMAGE:={{.Image}}}

# Create a temporary directory
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
mkdir -p "$TEMP_DIR"
# if the TEMP_DIR is already set, use it
if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
Member: Why are we managing this state at a shell-script level? Should we not create the temp dir once in the go code and then use it here in our template?

(A sketch of that alternative follows this file's diff.)

TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
else
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
fi

# if TEMP_DIR does not exist, create it
if [ ! -d "$TEMP_DIR" ]; then
mkdir -p "$TEMP_DIR"
fi

# Given that docker has an init function that checks for the existence of the
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
@@ -46,7 +53,7 @@ docker run --pid=host --rm -i --privileged \
-v /var/run/docker.sock:/var/run/docker.sock \
-v "$TEMP_DIR:$TEMP_DIR" \
-v /etc/docker:/config-root \
${IMAGE} \
{{.Image}} \
--root "$TEMP_DIR" \
--runtime=docker \
--config=/config-root/daemon.json \
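The reviewer's question above suggests deciding the temporary directory once on the Go side instead of persisting it from the shell script in /tmp/ctk_e2e_temp_dir.txt. The sketch below shows one way that could look; it is not part of this PR, and it assumes the template is changed to consume a {{.TempDir}} field (the installScriptData type and renderInstallScript helper are likewise hypothetical).

package e2e

import (
	"bytes"
	"fmt"
	"text/template"
	"time"
)

// installScriptData is the data the install script template would be rendered
// with; TempDir is a new field this sketch assumes the template references as
// {{.TempDir}} instead of computing TEMP_DIR itself.
type installScriptData struct {
	Image   string
	TempDir string
}

// renderInstallScript renders dockerInstallTemplate with a directory name that
// is generated exactly once per installer instance, so the script no longer
// needs to write any state to disk.
func renderInstallScript(image string) (string, error) {
	data := installScriptData{
		Image:   image,
		TempDir: fmt.Sprintf("/tmp/ctk_e2e.%d", time.Now().UnixNano()),
	}

	tmpl, err := template.New("install").Parse(dockerInstallTemplate)
	if err != nil {
		return "", err
	}

	var rendered bytes.Buffer
	if err := tmpl.Execute(&rendered, data); err != nil {
		return "", err
	}
	return rendered.String(), nil
}

The rendered script would then be copied to the remote host and executed exactly as today, with the Go side owning creation and cleanup of the directory.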
188 changes: 188 additions & 0 deletions tests/e2e/nvidia-container-cli_test.go
@@ -0,0 +1,188 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package e2e

import (
"context"
"fmt"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

const (
// libnvidiaContainerCliTestTemplate is an on-the-fly script that prepares a
// minimal rootfs with GPU support and finally prints the list of visible GPUs
// using `nvidia-smi -L`. The exit code and the standard output of this script
// are therefore good indicators of whether nvidia-container-cli is functioning
// correctly inside the container.
libnvidiaContainerCliTestTemplate = `#!/busybox/sh
set -euo pipefail

# Create a temporary directory and rootfs path
TMPDIR="$(mktemp -d)"
ROOTFS="${TMPDIR}/rootfs"
mkdir -p "${ROOTFS}"

# Expose ROOTFS for the child namespace
export ROOTFS

busybox wget -qO- http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
| tar -C "$ROOTFS" -xz

# Enter a new mount + PID namespace so we can pivot_root without touching the
# container's original filesystem.
busybox unshare --mount --pid --fork --propagation private -- busybox sh -eux <<'IN_NS'
: "${ROOTFS:?}"

# 1. Bind-mount the new root and make the mount private
busybox mount --bind "$ROOTFS" "$ROOTFS"
busybox mount --make-private "$ROOTFS"
cd "$ROOTFS"

# 2. Minimal virtual filesystems
busybox mount -t proc proc proc
busybox mount -t sysfs sys sys
busybox mount -t tmpfs tmp tmp
busybox mount -t tmpfs run run

# 3. GPU setup via nvidia-container-cli
# Add potential install locations of nvidia-container-cli to PATH.
# /artifacts/{rpm,deb}/usr/bin are where the binary ends up after the
# packages are extracted in the application image. /work is included for
# completeness since some images may copy the binary there.
export PATH="${PATH}:/artifacts/rpm/usr/bin:/artifacts/deb/usr/bin:/work"

nvidia-container-cli --load-kmods \
configure \
--no-cgroups --utility --device=0 "$(pwd)"

# 4. Switch root into the prepared filesystem
mkdir -p mnt
busybox pivot_root . mnt
busybox umount -l /mnt

exec nvidia-smi -L
IN_NS
`

dockerRunCmdTemplate = `docker run --name %s --privileged --runtime=nvidia \
-e NVIDIA_VISIBLE_DEVICES=all \
-e NVIDIA_DRIVER_CAPABILITIES=all \
-v %s:/libnvidia-container-cli.sh \
--entrypoint /libnvidia-container-cli.sh %s`
)

var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() {
var (
runner Runner
dockerImage = fmt.Sprintf("%s:%s", imageName, imageTag)
containerName = "nvidia-cli-e2e"
hostOutput string
)

BeforeAll(func(ctx context.Context) {
runner = NewRunner(
WithHost(sshHost),
WithPort(sshPort),
WithSshKey(sshKey),
WithSshUser(sshUser),
)

if installCTK {
installer, err := NewToolkitInstaller(
WithRunner(runner),
WithImage(imageName+":"+imageTag),
WithTemplate(dockerInstallTemplate),
)
Expect(err).ToNot(HaveOccurred())

err = installer.Install()
Expect(err).ToNot(HaveOccurred())
}

// Capture the host GPU list.
var err error
hostOutput, _, err = runner.Run("nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
})

AfterAll(func(ctx context.Context) {
// Cleanup: remove the test container from the remote host.
runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
})

It("should report the same GPUs inside the container as on the host", func(ctx context.Context) {
testScriptPath := "/tmp/libnvidia-container-cli.sh"

// Write the script to the remote host and make it executable.
createScriptCmd := fmt.Sprintf(
"cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s",
testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath,
)

_, _, err := runner.Run(createScriptCmd)
Expect(err).ToNot(HaveOccurred())

// If a container with the same name exists from a previous test run, remove it first.
_, _, err = runner.Run(fmt.Sprintf("docker rm -f %s", containerName))
Expect(err).ToNot(HaveOccurred())

// Build the docker run command from the template so it stays readable
// while still resulting in a single-line invocation.
dockerRunCmd := fmt.Sprintf(dockerRunCmdTemplate, containerName, testScriptPath, dockerImage)

// Launch the container; since the template does not pass -d, this call
// blocks until the test script inside the container exits.
output, errOutput, err := runner.Run(dockerRunCmd)
Expect(err).ToNot(HaveOccurred())
fmt.Println("output: ", output)
fmt.Println("errOutput: ", errOutput)

hostOutputNormalized := strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", ""))

// Poll the container logs until the last line matches the host GPU list
// or a 5-minute timeout elapses.
Eventually(func() string {
// Log the container status for debugging; docker ps -a also lists exited containers.
containerOutput, _, err := runner.Run(fmt.Sprintf("docker ps -a | grep %s", containerName))
if err != nil {
return fmt.Sprintf("error: %v", err)
}
fmt.Println("containerOutput: ", containerOutput)

logs, logsErr, err := runner.Run(fmt.Sprintf("docker logs %s", containerName))
if err != nil {
return ""
}

fmt.Println("logs: ", logs)
fmt.Println("logsErr: ", logsErr)

lines := strings.Split(strings.TrimSpace(logs), "\n")
if len(lines) == 0 {
return ""
}

last := strings.TrimSpace(strings.ReplaceAll(lines[len(lines)-1], "\r", ""))
return last
}, "5m", "10s").Should(Equal(hostOutputNormalized))

// Remove the script from the remote host.
runner.Run(fmt.Sprintf("rm -f %s", testScriptPath))
})
})