Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: tensor-fusion workload Implementation #59

Merged
merged 15 commits into from
Mar 6, 2025
8 changes: 4 additions & 4 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@
"program": "${workspaceFolder}/cmd/main.go",
},
{
"name": "Debug Test Code",
"name": "Run Unit Tests",
"type": "go",
"request": "launch",
"mode": "auto",
"console": "integratedTerminal",
"program": "${workspaceFolder}/cmd/tmp/main.go",
"mode": "test",
"program": "${workspaceFolder}",
"console": "integratedTerminal"
}
]
}
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,12 @@ resources:
kind: ClientProfile
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
version: v1
- api:
crdVersion: v1
namespaced: true
controller: true
domain: tensor-fusion.ai
kind: TensorFusionWorkload
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
version: v1
version: "3"
23 changes: 4 additions & 19 deletions api/v1/tensorfusionconnection_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,14 @@ type Resources struct {

// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
type TensorFusionConnectionSpec struct {
PoolName string `json:"poolName"`
Resources Resources `json:"resources"`

// +optional
// localGpu mode will schedule the GPU in advance
GPUs []string `json:"gpu"`
WorkloadName string `json:"workloadName"`
}

type TensorFusionConnectionPhase string

// These are the valid phases of a GpuConnection.
const (
TensorFusionConnectionPending TensorFusionConnectionPhase = "Pending"
TensorFusionConnectionStarting TensorFusionConnectionPhase = "Starting"
TensorFusionConnectionRunning TensorFusionConnectionPhase = "Running"
)

// TensorFusionConnectionStatus defines the observed state of TensorFusionConnection.
type TensorFusionConnectionStatus struct {
Phase TensorFusionConnectionPhase `json:"phase"`
ConnectionURL string `json:"connectionURL"`
QoS QoSLevel `json:"qos,omitempty"`
GPU string `json:"gpu,omitempty"`
Phase WorkerPhase `json:"phase"`
ConnectionURL string `json:"connectionURL"`
WorkerName string `json:"workerName"`
}

// +kubebuilder:object:root=true
Expand Down
88 changes: 88 additions & 0 deletions api/v1/tensorfusionworkload_types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
Copyright 2024.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// TensorFusionWorkloadSpec defines the desired state of TensorFusionWorkload.
type TensorFusionWorkloadSpec struct {
// Replicas is a pointer tagged omitempty, so a nil value is left out of
// the serialized JSON and stays distinguishable from an explicit 0.
// NOTE(review): presumably the desired number of workers - confirm against the controller.
Replicas *int32 `json:"replicas,omitempty"`
// PoolName is always serialized (its tag has no omitempty).
// NOTE(review): presumably the name of the pool this workload draws from - confirm.
PoolName string `json:"poolName"`
// Resources is marked optional by the marker below. The Resources type is
// declared outside this file.
// +optional
Resources Resources `json:"resources"`
// Qos is marked optional and tagged omitempty. The QoSLevel type is
// declared outside this file.
// +optional
Qos QoSLevel `json:"qos,omitempty"`
// IsLocalGPU is marked optional and is left out of the JSON when false.
// NOTE(review): presumably switches the workload to a local-GPU mode - confirm.
// +optional
IsLocalGPU bool `json:"isLocalGPU,omitempty"`
}

// WorkerPhase is a string-typed phase value; WorkerStatus.WorkerPhase in this
// file is declared with this type.
type WorkerPhase string

// Phase values declared for WorkerPhase. The string on the right-hand side is
// the value that appears in serialized objects.
const (
// WorkerPending is the "Pending" phase.
WorkerPending WorkerPhase = "Pending"
// WorkerRunning is the "Running" phase.
WorkerRunning WorkerPhase = "Running"
// WorkerFailed is the "Failed" phase.
WorkerFailed WorkerPhase = "Failed"
)

// WorkerStatus is the element type of TensorFusionWorkloadStatus.WorkerStatuses
// and records the phase, name and optional network details of one worker.
type WorkerStatus struct {
// WorkerPhase is always serialized (its tag has no omitempty).
WorkerPhase WorkerPhase `json:"workerPhase"`

// WorkerName is always serialized (its tag has no omitempty).
// NOTE(review): presumably the name of the worker Pod - confirm against the controller.
WorkerName string `json:"workerName"`
// NodeSelector is left out of the JSON when the map is empty.
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
// WorkerIp is marked optional and is left out of the JSON when empty.
// NOTE(review): Go initialism style would spell this WorkerIP; renaming it
// would change the exported API, so it is left as is.
// +optional
WorkerIp string `json:"workerIp,omitempty"`
// WorkerPort is marked optional and is left out of the JSON when 0.
// NOTE(review): declared as platform-sized int; Kubernetes API conventions
// prefer int32/int64 - consider for a future API revision.
// +optional
WorkerPort int `json:"workerPort,omitempty"`
}

// TensorFusionWorkloadStatus defines the observed state of TensorFusionWorkload.
type TensorFusionWorkloadStatus struct {
// replicas is the number of Pods created by the Workload controller.
// It is always serialized, including when it is 0 (no omitempty).
Replicas int32 `json:"replicas"`

// readyReplicas is the number of pods created for this Workload with a Ready Condition.
// It is left out of the JSON when 0.
ReadyReplicas int32 `json:"readyReplicas,omitempty"`

// workerStatuses is a list of WorkerStatus entries; it is left out of the
// JSON when empty.
// NOTE(review): presumably one entry per worker - confirm against the controller.
WorkerStatuses []WorkerStatus `json:"workerStatuses,omitempty"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status

// TensorFusionWorkload is the Schema for the tensorfusionworkloads API.
// It pairs a TensorFusionWorkloadSpec (desired state) with a
// TensorFusionWorkloadStatus (observed state); the marker above declares
// status as a subresource.
type TensorFusionWorkload struct {
// TypeMeta is embedded and inlined into the object's JSON.
metav1.TypeMeta `json:",inline"`
// ObjectMeta is embedded and serialized under "metadata".
metav1.ObjectMeta `json:"metadata,omitempty"`

// Spec is the desired state of the workload.
Spec TensorFusionWorkloadSpec `json:"spec,omitempty"`
// Status is the observed state of the workload.
Status TensorFusionWorkloadStatus `json:"status,omitempty"`
}

// +kubebuilder:object:root=true

// TensorFusionWorkloadList contains a list of TensorFusionWorkload.
type TensorFusionWorkloadList struct {
// TypeMeta is embedded and inlined into the list's JSON.
metav1.TypeMeta `json:",inline"`
// ListMeta is embedded and serialized under "metadata".
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the TensorFusionWorkload objects; it is always serialized
// (its tag has no omitempty).
Items []TensorFusionWorkload `json:"items"`
}

func init() {
SchemeBuilder.Register(&TensorFusionWorkload{}, &TensorFusionWorkloadList{})
}
132 changes: 125 additions & 7 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion charts/tensor-fusion/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.2.5
version: 1.2.6

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
Loading
Loading