Skip to content

Commit 90db622

Browse files
Merge pull request #2 from kerthcet/feat/init
Add resourceFungibility plugin
2 parents 7a49353 + e3878e8 commit 90db622

File tree

10 files changed

+1130
-3
lines changed

10 files changed

+1130
-3
lines changed

Dockerfile

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# BASE_IMAGE and BUILDER_IMAGE are declared before the first FROM so that both
# FROM instructions below can reference them; they are supplied via --build-arg.
ARG BASE_IMAGE
ARG BUILDER_IMAGE

# Build the kube-scheduler binary
FROM ${BUILDER_IMAGE} AS builder
ARG TARGETOS
ARG TARGETARCH
# CGO_ENABLED may be supplied via --build-arg; when it is unset or empty the
# build step below falls back to 0.
ARG CGO_ENABLED

WORKDIR /workspace
# Copy the Go Modules manifests
COPY go.mod go.mod
COPY go.sum go.sum
# cache deps before building and copying source so that we don't need to re-download as much
# and so that source changes don't invalidate our downloaded layer
RUN go mod download

# Copy the go source
COPY cmd/main.go cmd/main.go
COPY api/ api/
COPY pkg/ pkg/

# Build
# GOARCH has no default value so that the binary is built for the platform of the host where the command
# was called. For example, if we call make docker-build in a local env running on Apple Silicon (M1),
# the docker BUILDPLATFORM arg will be linux/arm64, while for Apple x86 it will be linux/amd64. Therefore,
# by leaving it empty we can ensure that the container and the binary shipped in it have the same platform.
RUN CGO_ENABLED=${CGO_ENABLED:-0} GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} go build -a -o kube-scheduler cmd/main.go

# Package the kube-scheduler binary into the minimal base image given by BASE_IMAGE
# Refer to https://github.com/GoogleContainerTools/distroless for more details
FROM ${BASE_IMAGE}
WORKDIR /
COPY --from=builder /workspace/kube-scheduler .
USER 65532:65532

ENTRYPOINT ["/kube-scheduler"]

Makefile

+203
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
2+
# Image URL to use all building/pushing image targets
3+
BASE_IMAGE ?= gcr.io/distroless/static:nonroot
4+
DOCKER_BUILDX_CMD ?= docker buildx
5+
IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
6+
IMAGE_BUILD_EXTRA_OPTS ?=
7+
IMAGE_REGISTRY ?= inftyai
8+
IMAGE_NAME ?= vscheduler
9+
IMAGE_REPO := $(IMAGE_REGISTRY)/$(IMAGE_NAME)
10+
GIT_TAG ?= $(shell git describe --tags --dirty --always)
11+
IMG ?= $(IMAGE_REPO):$(GIT_TAG)
12+
GO_VERSION := $(shell awk '/^go /{print $$2}' go.mod|head -n1)
13+
BUILDER_IMAGE ?= golang:$(GO_VERSION)
14+
15+
# ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary.
16+
ENVTEST_K8S_VERSION = 1.28.0
17+
18+
# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
19+
ifeq (,$(shell go env GOBIN))
20+
GOBIN=$(shell go env GOPATH)/bin
21+
else
22+
GOBIN=$(shell go env GOBIN)
23+
endif
24+
25+
# CONTAINER_TOOL defines the container tool to be used for building images.
26+
# Be aware that the target commands are only tested with Docker which is
27+
# scaffolded by default. However, you might want to replace it to use other
28+
# tools. (i.e. podman)
29+
CONTAINER_TOOL ?= docker
30+
31+
# Setting SHELL to bash allows bash commands to be executed by recipes.
32+
# Options are set to exit when a recipe line exits non-zero or a piped command fails.
33+
SHELL = /usr/bin/env bash -o pipefail
34+
.SHELLFLAGS = -ec
35+
36+
.PHONY: all
37+
all: build
38+
39+
##@ General
40+
41+
# The help target prints out all targets with their descriptions organized
42+
# beneath their categories. The categories are represented by '##@' and the
43+
# target descriptions by '##'. The awk command is responsible for reading the
44+
# entire set of makefiles included in this invocation, looking for lines of the
45+
# file as xyz: ## something, and then pretty-format the target and help. Then,
46+
# if there's a line with ##@ something, that gets pretty-printed as a category.
47+
# More info on the usage of ANSI control characters for terminal formatting:
48+
# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters
49+
# More info on the awk command:
50+
# http://linuxcommand.org/lc3_adv_awk.php
51+
52+
.PHONY: help
53+
help: ## Display this help.
54+
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
55+
56+
##@ Development
57+
58+
.PHONY: manifests
59+
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
60+
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases
61+
62+
.PHONY: generate
63+
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
64+
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..."
65+
66+
.PHONY: fmt
67+
fmt: ## Run go fmt against code.
68+
go fmt ./...
69+
70+
.PHONY: vet
71+
vet: ## Run go vet against code.
72+
go vet ./...
73+
74+
.PHONY: test
75+
test: manifests generate fmt vet envtest ## Run tests.
76+
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./... -coverprofile cover.out
77+
78+
# Location and version of the golangci-lint binary used by the lint targets.
GOLANGCI_LINT = $(shell pwd)/bin/golangci-lint
GOLANGCI_LINT_VERSION ?= v1.54.2

# Declared phony so that a file or directory named `golangci-lint` in the
# working tree cannot make this target look up to date. The recipe itself
# skips the download when the binary already exists.
.PHONY: golangci-lint
golangci-lint: ## Download golangci-lint locally if necessary.
	@[ -f $(GOLANGCI_LINT) ] || { \
	set -e ;\
	curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(shell dirname $(GOLANGCI_LINT)) $(GOLANGCI_LINT_VERSION) ;\
	}

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter.
	$(GOLANGCI_LINT) run

.PHONY: lint-fix
lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes.
	$(GOLANGCI_LINT) run --fix
93+
94+
##@ Build
95+
96+
.PHONY: build
97+
build: manifests generate fmt vet ## Build manager binary.
98+
go build -o bin/manager cmd/main.go
99+
100+
.PHONY: run
101+
run: manifests generate fmt vet ## Run a controller from your host.
102+
go run ./cmd/main.go
103+
104+
# If you wish to build the manager image targeting other platforms you can use the --platform flag.
105+
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
106+
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
107+
.PHONY: docker-build
108+
docker-build: ## Build docker image with the manager.
109+
$(CONTAINER_TOOL) build -t ${IMG} .
110+
111+
.PHONY: docker-push
112+
docker-push: ## Push docker image with the manager.
113+
$(CONTAINER_TOOL) push ${IMG}
114+
115+
# PLATFORMS defines the target platforms for the manager image be built to provide support to multiple
116+
# architectures. (i.e. make docker-buildx IMG=myregistry/mypoperator:0.0.1). To use this option you need to:
117+
# - be able to use docker buildx. More info: https://docs.docker.com/build/buildx/
118+
# - have enabled BuildKit. More info: https://docs.docker.com/develop/develop-images/build_enhancements/
119+
# - be able to push the image to your registry (i.e. if you do not set a valid value via IMG=<myregistry/image:<tag>> then the export will fail)
120+
# To adequately provide solutions that are compatible with multiple platforms, you should consider using this option.
121+
PLATFORMS ?= linux/arm64,linux/amd64,linux/s390x,linux/ppc64le
122+
.PHONY: docker-buildx
123+
docker-buildx: ## Build and push docker image for the manager for cross-platform support
124+
# copy existing Dockerfile and insert --platform=${BUILDPLATFORM} into Dockerfile.cross, and preserve the original Dockerfile
125+
sed -e '1 s/\(^FROM\)/FROM --platform=\$$\{BUILDPLATFORM\}/; t' -e ' 1,// s//FROM --platform=\$$\{BUILDPLATFORM\}/' Dockerfile > Dockerfile.cross
126+
- $(CONTAINER_TOOL) buildx create --name project-v3-builder
127+
$(CONTAINER_TOOL) buildx use project-v3-builder
128+
- $(CONTAINER_TOOL) buildx build --push --platform=$(PLATFORMS) --tag ${IMG} -f Dockerfile.cross .
129+
- $(CONTAINER_TOOL) buildx rm project-v3-builder
130+
rm Dockerfile.cross
131+
132+
# CGO_ENABLED is forwarded to the image build as a build argument. Give it a
# default so that the argument is never passed with an empty value; it can
# still be overridden from the environment or the command line.
CGO_ENABLED ?= 0

.PHONY: image-build
image-build: ## Build the scheduler image.
	$(IMAGE_BUILD_CMD) -t $(IMG) \
		-f Dockerfile \
		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
		--build-arg CGO_ENABLED=$(CGO_ENABLED) \
		$(IMAGE_BUILD_EXTRA_OPTS) ./

# image-load and image-push reuse image-build and only differ in the extra
# option handed to the build command through a target-specific variable.
.PHONY: image-load
image-load: IMAGE_BUILD_EXTRA_OPTS=--load
image-load: image-build ## Build the scheduler image with --load.

.PHONY: image-push
image-push: IMAGE_BUILD_EXTRA_OPTS=--push
image-push: image-build ## Build the scheduler image with --push.
144+
145+
##@ Deployment
146+
147+
ifndef ignore-not-found
148+
ignore-not-found = false
149+
endif
150+
151+
.PHONY: install
152+
install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config.
153+
$(KUSTOMIZE) build config/crd | $(KUBECTL) apply -f -
154+
155+
.PHONY: uninstall
156+
uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
157+
$(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
158+
159+
.PHONY: deploy
160+
deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config.
161+
cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG}
162+
$(KUSTOMIZE) build config/default | $(KUBECTL) apply -f -
163+
164+
# The recipe invokes $(KUSTOMIZE), so depend on the kustomize target to make
# sure the binary is installed first (as install, uninstall and deploy do).
.PHONY: undeploy
undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
	$(KUSTOMIZE) build config/default | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f -
167+
168+
##@ Build Dependencies
169+
170+
## Location to install dependencies to
171+
LOCALBIN ?= $(shell pwd)/bin
172+
$(LOCALBIN):
173+
mkdir -p $(LOCALBIN)
174+
175+
## Tool Binaries
176+
KUBECTL ?= kubectl
177+
KUSTOMIZE ?= $(LOCALBIN)/kustomize
178+
CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen
179+
ENVTEST ?= $(LOCALBIN)/setup-envtest
180+
181+
## Tool Versions
182+
KUSTOMIZE_VERSION ?= v5.2.1
183+
CONTROLLER_TOOLS_VERSION ?= v0.13.0
184+
185+
.PHONY: kustomize
186+
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. If wrong version is installed, it will be removed before downloading.
187+
$(KUSTOMIZE): $(LOCALBIN)
188+
@if test -x $(LOCALBIN)/kustomize && ! $(LOCALBIN)/kustomize version | grep -q $(KUSTOMIZE_VERSION); then \
189+
echo "$(LOCALBIN)/kustomize version is not expected $(KUSTOMIZE_VERSION). Removing it before installing."; \
190+
rm -rf $(LOCALBIN)/kustomize; \
191+
fi
192+
test -s $(LOCALBIN)/kustomize || GOBIN=$(LOCALBIN) GO111MODULE=on go install sigs.k8s.io/kustomize/kustomize/v5@$(KUSTOMIZE_VERSION)
193+
194+
.PHONY: controller-gen
195+
controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. If wrong version is installed, it will be overwritten.
196+
$(CONTROLLER_GEN): $(LOCALBIN)
197+
test -s $(LOCALBIN)/controller-gen && $(LOCALBIN)/controller-gen --version | grep -q $(CONTROLLER_TOOLS_VERSION) || \
198+
GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_TOOLS_VERSION)
199+
200+
# ENVTEST_VERSION selects which setup-envtest version `go install` fetches.
# It defaults to latest; override it to pin a specific version.
ENVTEST_VERSION ?= latest

.PHONY: envtest
envtest: $(ENVTEST) ## Download envtest-setup locally if necessary.
$(ENVTEST): $(LOCALBIN)
	test -s $(LOCALBIN)/setup-envtest || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(ENVTEST_VERSION)

README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ A Kubernetes scheduler designed for smart scheduling with llmaz.
66

77
vScheduler maintains multiple plugins for llm workloads scheduling.
88

9-
### ResourceFungibility
9+
### ResourceFungibility Plugin
1010

11-
A llama2-70B model can be run on 2xA100-80GB GPUs, can also be run on 4xA100-40GB GPUs, this is what we called fungibility.
11+
A `llama2-7B` model can run on __1xA100__ GPU and can also run on __1xA10__ GPU; this is what we call fungibility.
1212

13-
With resourceFungibility plugin, we can simply achieve with at most 8 alternatives.
13+
With the [resourceFungibility](./docs/plugins/resource_fungibility.md) plugin, we can achieve this simply, with at most 8 alternative GPU types.

api/config/scheme/scheme.go

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
Copyright 2022 The Koordinator Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package scheme
18+
19+
import (
20+
"k8s.io/apimachinery/pkg/runtime"
21+
// "k8s.io/apimachinery/pkg/runtime/serializer"
22+
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
23+
kubeschedulerscheme "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme"
24+
)
25+
26+
var (
27+
// Re-use the in-tree Scheme.
28+
Scheme = kubeschedulerscheme.Scheme
29+
)
30+
31+
func init() {
32+
utilruntime.Must(AddToScheme(Scheme))
33+
}
34+
35+
// AddToScheme is the hook for registering additional scheduler config API types into scheme; it currently registers nothing and always returns nil.
36+
func AddToScheme(scheme *runtime.Scheme) error {
37+
return nil
38+
}

cmd/main.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"os"
21+
22+
// Ensure scheme package is initialized.
23+
_ "github.com/inftyai/vscheduler/api/config/scheme"
24+
25+
"k8s.io/component-base/cli"
26+
"k8s.io/kubernetes/cmd/kube-scheduler/app"
27+
28+
resourceFungibility "github.com/inftyai/vscheduler/pkg/plugins/resource_fungibility"
29+
//+kubebuilder:scaffold:imports
30+
)
31+
32+
func main() {
33+
command := app.NewSchedulerCommand(
34+
app.WithPlugin(resourceFungibility.Name, resourceFungibility.New),
35+
)
36+
37+
code := cli.Run(command)
38+
os.Exit(code)
39+
}

docs/plugins/resource_fungibility.md

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# ResourceFungibility Plugin
2+
3+
Supports resource fungibility, for example across GPU types, with at most 8 alternative choices.
4+
5+
## How to use
6+
7+
### Set kube-scheduler.yaml
8+
9+
Generally looks like:
10+
11+
```yaml
12+
containers:
13+
- args:
14+
- --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
15+
- --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
16+
- --bind-address=127.0.0.1
17+
- --feature-gates=MaxUnavailableStatefulSet=true # this is required by lws
18+
- --kubeconfig=/etc/kubernetes/scheduler.conf
19+
- --leader-elect=true
20+
- --config=/etc/kubernetes/kube-scheduler-config.yaml # set the kube-scheduler-config.yaml
21+
image: inftyai/vscheduler:<version> # set the right version of vscheduler image
22+
```
23+
24+
### Set KubeSchedulerConfiguration
25+
26+
A minimal `kube-scheduler-config.yaml` looks like:
27+
28+
```yaml
29+
apiVersion: kubescheduler.config.k8s.io/v1
30+
kind: KubeSchedulerConfiguration
31+
leaderElection:
32+
leaderElect: true
33+
clientConnection:
34+
kubeconfig: /etc/kubernetes/scheduler.conf
35+
profiles:
36+
- schedulerName: default-scheduler
37+
plugins:
38+
multiPoint:
39+
enabled:
40+
- name: ResourceFungibility
41+
weight: 10 # make sure this plugin dominates the scheduling since GPU is scarce
42+
```
43+
44+
### Set ClusterRole
45+
46+
Edit the ClusterRole `system:kube-scheduler` to make sure it has permission to get Models, for example:
47+
48+
```yaml
49+
- apiGroups:
50+
- llmaz.io
51+
resources:
52+
- openmodels
53+
verbs:
54+
- get
55+
```
56+
57+
## Build your own scheduler
58+
59+
If you want to import the resourceFungibility plugin to build a customized scheduler, what you need to do is quite similar to [main.go](../../cmd/main.go).
60+
61+
## Limits
62+
63+
Currently, the plugin only supports alternatives that use the same number of GPUs.

0 commit comments

Comments
 (0)