Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/e2e/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ GINKGO_BIN := $(CURDIR)/bin/ginkgo
# current available tests:
# - nvidia-container-cli
# - docker
# - nvidia-cdi-refresh
GINKGO_FOCUS ?=

test: $(GINKGO_BIN)
Expand Down
29 changes: 27 additions & 2 deletions tests/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ import (

// Test context
var (
runner Runner

ctx context.Context

installCTK bool
Expand All @@ -41,6 +43,8 @@ var (
sshUser string
sshHost string
sshPort string

testContainerName = "ctk-e2e-test-container"
)

func TestMain(t *testing.T) {
Expand All @@ -49,13 +53,35 @@ func TestMain(t *testing.T) {
RegisterFailHandler(Fail)

ctx = context.Background()
getTestEnv()

RunSpecs(t,
suiteName,
)
}

// BeforeSuite prepares the remote test environment: it reads the required
// E2E_* environment variables, constructs the SSH-backed runner, and, when
// requested, installs the NVIDIA Container Toolkit on the target host.
var _ = BeforeSuite(func() {
	getTestEnv()

	runner = NewRunner(
		WithHost(sshHost),
		WithPort(sshPort),
		WithSshKey(sshKey),
		WithSshUser(sshUser),
	)

	// Nothing else to do unless the suite was asked to install the toolkit.
	if !installCTK {
		return
	}

	installer, err := NewToolkitInstaller(
		WithRunner(runner),
		WithImage(imageName+":"+imageTag),
		WithTemplate(dockerInstallTemplate),
	)
	Expect(err).ToNot(HaveOccurred())
	Expect(installer.Install()).To(Succeed())
})

// getTestEnv gets the test environment variables
func getTestEnv() {
defer GinkgoRecover()
Expand All @@ -73,7 +99,6 @@ func getTestEnv() {
sshUser = getRequiredEnvvar[string]("E2E_SSH_USER")
sshPort = getEnvVarOrDefault("E2E_SSH_PORT", "22")
}

}

// getRequiredEnvvar returns the specified envvar if set or raises an error.
Expand Down
204 changes: 204 additions & 0 deletions tests/e2e/nvidia-cdi-refresh_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package e2e

import (
"context"
"fmt"
"html/template"
"strings"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

const (
	// nvidiaCdiRefreshDegradedSystemdTemplate uninstalls the toolkit, removes
	// any generated CDI spec, forces systemd into a degraded state via a
	// deliberately failing dummy oneshot unit, and re-installs the toolkit
	// from the locally built packages. It is used to verify that the
	// nvidia-cdi-refresh units still work when systemd reports "degraded".
	nvidiaCdiRefreshDegradedSystemdTemplate = `
# Read the TMPDIR
TMPDIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
export TMPDIR

# uninstall the nvidia-container-toolkit
apt-get remove -y nvidia-container-toolkit nvidia-container-toolkit-base libnvidia-container-tools libnvidia-container1
apt-get autoremove -y

# Remove the cdi file if it exists
if [ -f /var/run/cdi/nvidia.yaml ]; then
rm -f /var/run/cdi/nvidia.yaml
fi

# Stop the nvidia-cdi-refresh.path and nvidia-cdi-refresh.service units
systemctl stop nvidia-cdi-refresh.path
systemctl stop nvidia-cdi-refresh.service

# Reload the systemd daemon
systemctl daemon-reload

# Start the dummy service to force systemd to enter a degraded state
# NOTE: the unit must exit non-zero; a successful oneshot would leave
# systemd in the "running" state rather than "degraded".
cat <<EOF > /etc/systemd/system/dummy.service
[Unit]
Description=Dummy systemd service

[Service]
Type=oneshot
ExecStart=/usr/bin/sh -c "exit 1"
EOF

# We know the dummy service will fail, so we can ignore the error
systemctl start dummy.service 2>/dev/null || true

# Install the nvidia-container-toolkit
dpkg -i ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container1_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/nvidia-container-toolkit-base_*_amd64.deb ${TMPDIR}/packages/ubuntu18.04/amd64/libnvidia-container-tools_*_amd64.deb
`

	// nvidiaCdiRefreshPathActiveTemplate fails (exit 1) unless the
	// nvidia-cdi-refresh.path unit reports "Active: active".
	nvidiaCdiRefreshPathActiveTemplate = `
if ! systemctl status nvidia-cdi-refresh.path | grep "Active: active"; then
echo "nvidia-cdi-refresh.path is not Active"
exit 1
fi
`

	// nvidiaCdiRefreshServiceLoadedTemplate fails (exit 1) unless the
	// nvidia-cdi-refresh.service unit reports "Loaded: loaded".
	nvidiaCdiRefreshServiceLoadedTemplate = `
if ! systemctl status nvidia-cdi-refresh.service | grep "Loaded: loaded"; then
echo "nvidia-cdi-refresh.service is not loaded"
exit 1
fi
`

	// nvidiaCdiRefreshFileExistsTemplate checks that /var/run/cdi/nvidia.yaml
	// exists and is identical to a freshly generated CDI spec.
	nvidiaCdiRefreshFileExistsTemplate = `
# is /var/run/cdi/nvidia.yaml exists? and exit with 0 if it does not exist
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
echo "nvidia.yaml file does not exist"
exit 1
fi

# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml

# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
echo "nvidia.yaml file is different"
exit 1
fi
`

	// nvidiaCdiRefreshUpgradeTemplate simulates a toolkit upgrade by touching
	// the nvidia-ctk binary (changing its mtime), which should trigger the
	// nvidia-cdi-refresh.path unit to regenerate /var/run/cdi/nvidia.yaml.
	nvidiaCdiRefreshUpgradeTemplate = `
# remove the generated files
rm /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml

# Touch the nvidia-ctk binary to change the mtime
# This will trigger the nvidia-cdi-refresh.path unit to call the
# nvidia-cdi-refresh.service unit, simulating a change(update/downgrade) in the nvidia-ctk binary.
touch $(which nvidia-ctk)

# wait for 3 seconds
sleep 3

# Check if the file /var/run/cdi/nvidia.yaml is created
if [ ! -f /var/run/cdi/nvidia.yaml ]; then
echo "nvidia.yaml file is not created after touching the nvidia-ctk binary"
exit 1
fi

# generate the nvidia.yaml file
nvidia-ctk cdi generate --output=/tmp/nvidia.yaml

# diff the generated file with the one in /var/run/cdi/nvidia.yaml and exit with 0 if they are the same
if ! diff /var/run/cdi/nvidia.yaml /tmp/nvidia.yaml; then
echo "nvidia.yaml file is different"
exit 1
fi
`
)

// nvidia-cdi-refresh e2e suite: runs the toolkit inside a nested systemd
// container and verifies that the nvidia-cdi-refresh.path/.service units keep
// /var/run/cdi/nvidia.yaml up to date — both on a healthy system and on one
// where systemd is in a degraded state.
var _ = Describe("nvidia-cdi-refresh", Ordered, ContinueOnFailure, Label("systemd-unit"), func() {
	var (
		nestedContainerRunner Runner
		// TODO(@ArangoGutierrez): https://github.com/NVIDIA/nvidia-container-toolkit/pull/1235/files#r2302013660
		outerContainerImage = "docker.io/kindest/base:v20250521-31a79fd4"
	)

	BeforeAll(func(ctx context.Context) {
		var err error
		nestedContainerRunner, err = NewNestedContainerRunner(runner, outerContainerImage, installCTK, imageName+":"+imageTag, testContainerName)
		Expect(err).ToNot(HaveOccurred())
	})

	AfterAll(func(ctx context.Context) {
		// Cleanup: remove the outer container on the host.
		// "|| true" ensures cleanup failures never fail the suite.
		runner.Run(fmt.Sprintf("docker rm -f %s 2>/dev/null || true", testContainerName)) //nolint:errcheck
	})

	When("installing nvidia-container-toolkit", Ordered, func() {
		It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should generate the nvidia.yaml file", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
			Expect(err).ToNot(HaveOccurred())
		})
	})

	When("installing nvidia-container-toolkit on a system with a degraded systemd", Ordered, func() {
		BeforeAll(func(ctx context.Context) {
			// Render the degraded-systemd script. The template currently has no
			// actions (ToolkitImage is unused), but executing it surfaces any
			// template errors early.
			// NOTE(review): this renders a shell script, so text/template would
			// be the more appropriate package than html/template — confirm.
			tmpl, err := template.New("nvidiaCdiRefreshDegradedSystemd").Parse(nvidiaCdiRefreshDegradedSystemdTemplate)
			Expect(err).ToNot(HaveOccurred())

			var nvidiaCdiRefreshDegradedSystemd strings.Builder
			err = tmpl.Execute(&nvidiaCdiRefreshDegradedSystemd, struct {
				ToolkitImage string
			}{
				ToolkitImage: imageName + ":" + imageTag,
			})
			Expect(err).ToNot(HaveOccurred())

			_, _, err = nestedContainerRunner.Run(nvidiaCdiRefreshDegradedSystemd.String())
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.path unit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshPathActiveTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should load the nvidia-cdi-refresh.service unit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshServiceLoadedTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		It("should generate the nvidia.yaml file", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshFileExistsTemplate)
			Expect(err).ToNot(HaveOccurred())
		})

		// Was a copy-paste duplicate of the previous It (same description, same
		// template); mirror the healthy-system When block and exercise the
		// upgrade/refresh path here instead.
		It("should refresh the nvidia.yaml file after upgrading the nvidia-container-toolkit", func(ctx context.Context) {
			_, _, err := nestedContainerRunner.Run(nvidiaCdiRefreshUpgradeTemplate)
			Expect(err).ToNot(HaveOccurred())
		})
	})
})
Loading