Skip to content

Commit 3df7c56

Browse files
committed
npu: initial draft for the plugin
Supports Core Ultra 1, 2 and 200V series NPUs. Signed-off-by: Tuomas Katila <[email protected]>
1 parent 905b2ae commit 3df7c56

File tree

11 files changed

+706
-0
lines changed

11 files changed

+706
-0
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
## This is a generated file, do not edit directly. Edit build/docker/templates/intel-npu-plugin.Dockerfile.in instead.
2+
##
3+
## Copyright 2022 Intel Corporation. All Rights Reserved.
4+
##
5+
## Licensed under the Apache License, Version 2.0 (the "License");
6+
## you may not use this file except in compliance with the License.
7+
## You may obtain a copy of the License at
8+
##
9+
## http://www.apache.org/licenses/LICENSE-2.0
10+
##
11+
## Unless required by applicable law or agreed to in writing, software
12+
## distributed under the License is distributed on an "AS IS" BASIS,
13+
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
## See the License for the specific language governing permissions and
15+
## limitations under the License.
16+
###
17+
ARG CMD=npu_plugin
18+
## FINAL_BASE can be used to configure the base image of the final image.
19+
##
20+
## This is used in two ways:
21+
## 1) make <image-name> BUILDER=<docker|buildah>
22+
## 2) docker build ... -f <image-name>.Dockerfile
23+
##
24+
## The project default is 1) which sets FINAL_BASE=gcr.io/distroless/static
25+
## (see build-image.sh).
26+
## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based.
27+
## The RedHat build tool does not allow additional image build parameters.
28+
ARG FINAL_BASE=registry.access.redhat.com/ubi9-micro:latest
29+
###
30+
##
31+
## GOLANG_BASE can be used to make the build reproducible by choosing an
32+
## image by its hash:
33+
## GOLANG_BASE=golang@sha256:9d64369fd3c633df71d7465d67d43f63bb31192193e671742fa1c26ebc3a6210
34+
##
35+
## This is used on release branches before tagging a stable version.
36+
## The main branch defaults to using the latest Golang base image.
37+
ARG GOLANG_BASE=golang:1.24-bookworm
38+
###
39+
FROM ${GOLANG_BASE} AS builder
40+
ARG DIR=/intel-device-plugins-for-kubernetes
41+
ARG GO111MODULE=on
42+
ARG LDFLAGS="all=-w -s"
43+
ARG GOFLAGS="-trimpath"
44+
ARG GCFLAGS="all=-spectre=all -N -l"
45+
ARG ASMFLAGS="all=-spectre=all"
46+
ARG GOLICENSES_VERSION
47+
ARG EP=/usr/local/bin/intel_npu_device_plugin
48+
ARG CMD
49+
WORKDIR ${DIR}
50+
COPY . .
51+
RUN (cd cmd/${CMD}; GO111MODULE=${GO111MODULE} GOFLAGS=${GOFLAGS} CGO_ENABLED=0 go install -gcflags="${GCFLAGS}" -asmflags="${ASMFLAGS}" -ldflags="${LDFLAGS}") && install -D /go/bin/${CMD} /install_root${EP}
52+
RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \
53+
&& if [ ! -d "licenses/$CMD" ] ; then \
54+
GO111MODULE=on GOROOT=$(go env GOROOT) go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \
55+
--save_path /install_root/licenses/$CMD/go-licenses ; \
56+
else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi && \
57+
echo "Verifying installed licenses" && test -e /install_root/licenses/$CMD/go-licenses
58+
###
59+
FROM ${FINAL_BASE}
60+
COPY --from=builder /install_root /
61+
ENTRYPOINT ["/usr/local/bin/intel_npu_device_plugin"]
62+
LABEL vendor='Intel®'
63+
LABEL org.opencontainers.image.source='https://github.com/intel/intel-device-plugins-for-kubernetes'
64+
LABEL maintainer="Intel®"
65+
LABEL version='devel'
66+
LABEL release='1'
67+
LABEL name='intel-npu-plugin'
68+
LABEL summary='Intel® NPU device plugin for Kubernetes'
69+
LABEL description='The NPU device plugin provides access to Intel CPU neural processing unit (NPU) device files'
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#define _ENTRYPOINT_ /usr/local/bin/intel_npu_device_plugin
2+
ARG CMD=npu_plugin
3+
4+
#include "default_plugin.docker"
5+
6+
LABEL name='intel-npu-plugin'
7+
LABEL summary='Intel® NPU device plugin for Kubernetes'
8+
LABEL description='The NPU device plugin provides access to Intel CPU neural processing unit (NPU) device files'

cmd/npu_plugin/README.md

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Intel NPU device plugin for Kubernetes
2+
3+
Table of Contents
4+
5+
* [Introduction](#introduction)
6+
* [Modes and Configuration Options](#modes-and-configuration-options)
7+
* [Pre-built Images](#pre-built-images)
8+
* [Installation](#installation)
9+
* [Install with NFD](#install-with-nfd)
10+
* [Install with Operator](#install-with-operator)
11+
* [Verify Plugin Registration](#verify-plugin-registration)
12+
* [Testing and Demos](#testing-and-demos)
13+
14+
## Introduction
15+
16+
Intel NPU plugin facilitates Kubernetes workload offloading by providing access to Intel CPU neural processing units supported by the host kernel.
17+
18+
The following CPU families are currently detected by the plugin:
19+
* Core Ultra Series 1
20+
* Core Ultra Series 2
21+
* Core Ultra 200V Series
22+
23+
Intel NPU plugin may register one resource to the Kubernetes cluster:
24+
| Resource | Description |
25+
|:---- |:-------- |
26+
| npu.intel.com/npu | NPU |
27+
28+
## Modes and Configuration Options
29+
30+
| Flag | Argument | Default | Meaning |
31+
|:---- |:-------- |:------- |:------- |
32+
| -shared-dev-num | int | 1 | Number of containers that can share the same NPU device |
33+
34+
The plugin also accepts a number of other arguments (common to all plugins) related to logging.
35+
Please use the -h option to see the complete list of logging related options.
36+
37+
## Pre-built Images
38+
39+
[Pre-built images](https://hub.docker.com/r/intel/intel-npu-plugin)
40+
are available on the Docker hub. These images are automatically built and uploaded
41+
to the hub from the latest main branch of this repository.
42+
43+
Release tagged images of the components are also available on the Docker hub, tagged with their
44+
release version numbers in the format `x.y.z`, corresponding to the branches and releases in this
45+
repository.
46+
47+
See [the development guide](../../DEVEL.md) for details if you want to deploy a customized version of the plugin.
48+
49+
## Installation
50+
51+
There are multiple ways to install Intel NPU plugin to a cluster. The most common methods are described below.
52+
53+
> **Note**: Replace `<RELEASE_VERSION>` with the desired [release tag](https://github.com/intel/intel-device-plugins-for-kubernetes/tags) or `main` to get `devel` images.
54+
55+
> **Note**: Add ```--dry-run=client -o yaml``` to the ```kubectl``` commands below to visualize the YAML content being applied.
56+
57+
### Install with NFD
58+
59+
Deploy NPU plugin with the help of NFD ([Node Feature Discovery](https://github.com/kubernetes-sigs/node-feature-discovery)). It detects the presence of Intel NPUs and labels the nodes accordingly. NPU plugin's node selector is used to deploy the plugin to nodes which have such an NPU label.
60+
61+
```bash
62+
# Start NFD - if your cluster doesn't have NFD installed yet
63+
$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=<RELEASE_VERSION>'
64+
65+
# Create NodeFeatureRules for detecting NPUs on nodes
66+
$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=<RELEASE_VERSION>'
67+
68+
# Create NPU plugin daemonset
69+
$ kubectl apply -k 'https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/npu_plugin/overlays/nfd_labeled_nodes?ref=<RELEASE_VERSION>'
70+
```
71+
72+
### Install with Operator
73+
74+
NPU plugin can be installed with the Intel Device Plugin Operator. It allows configuring NPU plugin parameters without kustomizing the deployment files. The general installation is described in the [install documentation](../operator/README.md#installation).
75+
76+
### Verify Plugin Registration
77+
78+
You can verify that the plugin has been installed on the expected nodes by searching for the relevant
79+
resource allocation status on the nodes:
80+
81+
```bash
82+
$ kubectl get nodes -o=jsonpath="{range .items[*]}{.metadata.name}{'\n'}{' npu: '}{.status.allocatable.npu\.intel\.com/npu}{'\n'}"
83+
master
84+
npu: 1
85+
```
86+
87+
## Testing and Demos
88+
89+
TODO
90+

cmd/npu_plugin/npu_plugin.go

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
// Copyright 2025 Intel Corporation. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"flag"
19+
"fmt"
20+
"os"
21+
"path"
22+
"regexp"
23+
"slices"
24+
"strings"
25+
"time"
26+
27+
"github.com/pkg/errors"
28+
29+
"k8s.io/klog/v2"
30+
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
31+
32+
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
33+
)
34+
35+
// Host locations and match rules used to discover NPU devices.
const (
	// sysfAccelDirectory is the sysfs directory whose entries are scanned for accel devices.
	sysfAccelDirectory = "/sys/class/accel"
	// devfsAccelDirectory is the directory that holds the matching device nodes.
	devfsAccelDirectory = "/dev/accel"
	// npuDeviceRE matches device entry names such as "accel0".
	npuDeviceRE = `^accel[0-9]+$`
	// vendorString is the vendor ID a device must report to be accepted.
	vendorString = "0x8086"
)

// Device plugin settings.
const (
	namespace     = "npu.intel.com"
	deviceTypeNpu = "npu"
)

// scanPeriod is the interval between two consecutive device scans.
const scanPeriod = 5 * time.Second
48+
49+
// npuIDs lists the device IDs (as read from a device's sysfs "device/device"
// file) that the plugin accepts as supported NPUs.
var npuIDs = []string{
	"0x7e4c", // Core Ultra Series 1
	"0x643e", // Core Ultra 200V Series
	"0xad1d", // Core Ultra Series 2
	"0x7d1d", // Core Ultra Series 2 (H)
}
55+
56+
// cliOptions carries the settings parsed from the command line.
type cliOptions struct {
	// sharedDevNum is the number of containers that may share one NPU device;
	// it is set from the -shared-dev-num flag.
	sharedDevNum int
}
59+
60+
type devicePlugin struct {
61+
npuDeviceReg *regexp.Regexp
62+
63+
scanTicker *time.Ticker
64+
scanDone chan bool
65+
66+
sysfsDir string
67+
devfsDir string
68+
69+
options cliOptions
70+
}
71+
72+
func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugin {
73+
dp := &devicePlugin{
74+
sysfsDir: sysfsDir,
75+
devfsDir: devfsDir,
76+
options: options,
77+
npuDeviceReg: regexp.MustCompile(npuDeviceRE),
78+
scanTicker: time.NewTicker(scanPeriod),
79+
scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it
80+
}
81+
82+
return dp
83+
}
84+
85+
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
86+
defer dp.scanTicker.Stop()
87+
88+
klog.V(1).Infof("NPU (%s) resource share count = %d", deviceTypeNpu, dp.options.sharedDevNum)
89+
90+
previousCount := 0
91+
devType := fmt.Sprintf("%s/%s", namespace, deviceTypeNpu)
92+
93+
for {
94+
devTree, err := dp.scan()
95+
if err != nil {
96+
klog.Errorf("NPU scan failed: %v", err)
97+
return errors.Wrap(err, "NPU scan failed")
98+
}
99+
100+
count := devTree.DeviceTypeCount(devType)
101+
if count != previousCount {
102+
klog.V(1).Infof("NPU scan update: %d->%d '%s' resources found", previousCount, count, devType)
103+
104+
previousCount = count
105+
}
106+
107+
notifier.Notify(devTree)
108+
109+
select {
110+
case <-dp.scanDone:
111+
return nil
112+
case <-dp.scanTicker.C:
113+
}
114+
}
115+
}
116+
117+
func (dp *devicePlugin) isCompatibleDevice(name string) bool {
118+
if !dp.npuDeviceReg.MatchString(name) {
119+
klog.V(4).Info("Not compatible device: ", name)
120+
return false
121+
}
122+
123+
dat, err := os.ReadFile(path.Join(dp.sysfsDir, name, "device/vendor"))
124+
if err != nil {
125+
klog.Warning("Skipping. Can't read vendor file: ", err)
126+
return false
127+
}
128+
129+
if strings.TrimSpace(string(dat)) != vendorString {
130+
klog.V(4).Info("Non-Intel NPU: ", name)
131+
return false
132+
}
133+
134+
dat, err = os.ReadFile(path.Join(dp.sysfsDir, name, "device/device"))
135+
if err != nil {
136+
klog.Warning("Skipping. Can't read device file: ", err)
137+
return false
138+
}
139+
140+
datStr := strings.Split(string(dat), "\n")[0]
141+
if !slices.Contains(npuIDs, datStr) {
142+
klog.Warning("Unknown device id: ", datStr)
143+
return false
144+
}
145+
146+
return true
147+
}
148+
149+
func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
150+
files, err := os.ReadDir(dp.sysfsDir)
151+
if err != nil {
152+
return nil, errors.Wrap(err, "Can't read sysfs folder")
153+
}
154+
155+
devTree := dpapi.NewDeviceTree()
156+
157+
for _, f := range files {
158+
name := f.Name()
159+
160+
if !dp.isCompatibleDevice(name) {
161+
continue
162+
}
163+
164+
devPath := path.Join(dp.devfsDir, name)
165+
if _, err = os.Stat(devPath); err != nil {
166+
continue
167+
}
168+
169+
// even querying metrics requires device to be writable
170+
devSpec := pluginapi.DeviceSpec{
171+
HostPath: devPath,
172+
ContainerPath: devPath,
173+
Permissions: "rw",
174+
}
175+
176+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, []pluginapi.DeviceSpec{devSpec}, nil, nil, nil, nil)
177+
178+
for i := 0; i < dp.options.sharedDevNum; i++ {
179+
devID := fmt.Sprintf("%s-%d", name, i)
180+
devTree.AddDevice("npu", devID, deviceInfo)
181+
}
182+
}
183+
184+
return devTree, nil
185+
}
186+
187+
func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
188+
return nil, &dpapi.UseDefaultMethodError{}
189+
}
190+
191+
func main() {
192+
var (
193+
prefix string
194+
opts cliOptions
195+
)
196+
197+
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
198+
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same NPU device")
199+
flag.Parse()
200+
201+
if opts.sharedDevNum < 1 {
202+
klog.Error("The number of containers sharing the same NPU must greater than zero")
203+
os.Exit(1)
204+
}
205+
206+
klog.V(1).Infof("NPU device plugin started")
207+
208+
plugin := newDevicePlugin(prefix+sysfAccelDirectory, prefix+devfsAccelDirectory, opts)
209+
210+
manager := dpapi.NewManager(namespace, plugin)
211+
manager.Run()
212+
}

0 commit comments

Comments
 (0)