Skip to content

Commit 1a23e69

Browse files
authored
fix: add clientToken to avoid ec2 duplicated creation (#69)
1 parent 8899d38 commit 1a23e69

File tree

7 files changed

+55
-12
lines changed

7 files changed

+55
-12
lines changed

.vscode/settings.json

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"gpunodes",
3434
"gpupool",
3535
"gpupools",
36+
"greptime",
3637
"greptimedb",
3738
"healthz",
3839
"karpenter",

charts/tensor-fusion/Chart.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.6
18+
version: 1.2.7
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/vector-config.yaml

+11-2
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,19 @@ data:
5353
inputs:
5454
- log_to_metric
5555
new_naming: false
56-
endpoint: {{ .Values.hypervisor.greptimedbEndpoint }}
56+
endpoint: {{ .Values.greptime.endpoint }}:{{ .Values.greptime.port }}
5757
5858
sink_greptimedb_controller_metrics:
5959
type: prometheus_remote_write
6060
inputs:
6161
- prepare_controller_metrics
62-
endpoint: {{ .Values.controller.greptimedbEndpoint }}
62+
{{- if ne .Values.greptime.isCloud true }}
63+
endpoint: http://{{ .Values.greptime.internalHost }}:4000/v1/prometheus/write?db=public
64+
{{- else }}
65+
endpoint: https://{{ .Values.greptime.host }}/v1/prometheus/write?db={{ .Values.greptime.db }}
66+
{{- end }}
67+
auth:
68+
strategy: basic
69+
user: {{ .Values.greptime.user }}
70+
password: {{ .Values.greptime.password }}
71+

charts/tensor-fusion/values.yaml

+12-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ serviceAccount:
2020
annotations: {}
2121

2222
hypervisor:
23-
greptimedbEndpoint: greptimedb-standalone.tensor-fusion.svc.cluster.local:4001
2423
image:
2524
repository: tensorfusion/tensor-fusion-hypervisor
2625
# Overrides the image tag whose default is the chart appVersion.
@@ -51,13 +50,24 @@ controller:
5150
tolerations: []
5251
affinity: {}
5352

54-
greptimedbEndpoint: http://greptimedb-standalone.tensor-fusion.svc.cluster.local:4000/v1/prometheus/write?db=public
5553
admissionWebhooks:
5654
failurePolicy: Fail
5755
secretName: tensor-fusion-webhook-secret
5856
patch:
5957
image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.0
6058

59+
greptime:
60+
isCloud: false
61+
internalHost: greptimedb-standalone.tensor-fusion.svc.cluster.local
62+
port: 4001
63+
64+
# greptime:
65+
# isCloud: true
66+
# host: y6lmxod4zm69.us-west-2.aws.greptime.cloud
67+
# user: "dummy"
68+
# db: "public"
69+
# password: "dummy"
70+
# port: 5001
6171

6272
agent:
6373
enrollToken: "token-from-cloud"

cmd/main.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,9 @@ func main() {
240240
os.Exit(1)
241241
}
242242
if err = (&controller.NodeReconciler{
243-
Client: mgr.GetClient(),
244-
Scheme: mgr.GetScheme(),
243+
Client: mgr.GetClient(),
244+
Scheme: mgr.GetScheme(),
245+
Recorder: mgr.GetEventRecorderFor("Node"),
245246
}).SetupWithManager(mgr); err != nil {
246247
setupLog.Error(err, "unable to create controller", "controller", "Node")
247248
os.Exit(1)

internal/cloudprovider/alibaba/ecs.go

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func (p AlibabaGPUNodeProvider) CreateNode(ctx context.Context, param *types.Nod
8282
nodeClass := param.NodeClass.Spec
8383
request := ecs.CreateRunInstancesRequest()
8484
request.LaunchTemplateId = nodeClass.LaunchTemplate.ID
85+
request.ClientToken = param.NodeName
8586

8687
if len(nodeClass.OSImageSelectorTerms) > 0 {
8788
// TODO: support query types other than ID

internal/controller/gpunode_controller.go

+26-5
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,19 @@ import (
2323
"strings"
2424
"time"
2525

26+
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
2627
cloudprovider "github.com/NexusGPU/tensor-fusion/internal/cloudprovider"
2728
"github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types"
28-
"github.com/NexusGPU/tensor-fusion/internal/utils"
29-
30-
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
3129
"github.com/NexusGPU/tensor-fusion/internal/constants"
30+
"github.com/NexusGPU/tensor-fusion/internal/utils"
3231
batchv1 "k8s.io/api/batch/v1"
3332
corev1 "k8s.io/api/core/v1"
3433
"k8s.io/apimachinery/pkg/api/errors"
3534
"k8s.io/apimachinery/pkg/api/resource"
3635
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3736
"k8s.io/apimachinery/pkg/runtime"
3837
"k8s.io/client-go/tools/record"
38+
"k8s.io/client-go/util/retry"
3939
"k8s.io/utils/ptr"
4040
ctrl "sigs.k8s.io/controller-runtime"
4141
"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -457,6 +457,7 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node *
457457
return fmt.Errorf("failed to unmarshal cloud vendor param: %w, GPUNode: %s", err, node.Name)
458458
}
459459

460+
// TODO: query cloud vendor by node name
460461
status, err := provider.CreateNode(ctx, &nodeParam)
461462
if err != nil {
462463
return err
@@ -469,11 +470,31 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node *
469470
if err != nil {
470471
return err
471472
}
473+
gpuNode.Status.Phase = tfv1.TensorFusionGPUNodePhasePending
472474
gpuNode.Status.NodeInfo.IP = status.PrivateIP
473475
gpuNode.Status.NodeInfo.InstanceID = status.InstanceID
474476
gpuNode.Status.NodeInfo.Region = nodeParam.Region
475-
if err := r.Client.Status().Update(ctx, gpuNode); err != nil {
476-
log.FromContext(ctx).Info("Failed to update GPUNode status, must terminate node to keep operation atomic", "name", nodeParam.NodeName)
477+
478+
// Retry the status update on version conflicts (attempts are bounded by retry.DefaultBackoff)
479+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
480+
// Get the latest version before attempting an update
481+
latest := &tfv1.GPUNode{}
482+
if err := r.Client.Get(ctx, client.ObjectKey{Name: gpuNode.Name}, latest); err != nil {
483+
return err
484+
}
485+
486+
// Apply our status updates to the latest version
487+
latest.Status.Phase = tfv1.TensorFusionGPUNodePhasePending
488+
latest.Status.NodeInfo.IP = status.PrivateIP
489+
latest.Status.NodeInfo.InstanceID = status.InstanceID
490+
latest.Status.NodeInfo.Region = nodeParam.Region
491+
492+
// Attempt to update with the latest version
493+
return r.Client.Status().Update(ctx, latest)
494+
})
495+
496+
if err != nil {
497+
log.FromContext(ctx).Error(err, "Failed to update GPUNode status after retries, must terminate node to keep operation atomic", "name", nodeParam.NodeName)
477498
errTerminate := provider.TerminateNode(ctx, &types.NodeIdentityParam{
478499
InstanceID: status.InstanceID,
479500
Region: nodeParam.Region,

0 commit comments

Comments
 (0)