fix: add clientToken to avoid duplicate ECS instance creation (#69)

Code2Life · web-flow · commit 1a23e69ddafe · 2025-03-12T20:34:27.000+08:00
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -33,6 +33,7 @@
         "gpunodes",
         "gpupool",
         "gpupools",
+        "greptime",
         "greptimedb",
         "healthz",
         "karpenter",
diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.2.6
+version: 1.2.7
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/tensor-fusion/templates/vector-config.yaml b/charts/tensor-fusion/templates/vector-config.yaml
@@ -53,10 +53,19 @@ data:
         inputs:
           - log_to_metric
         new_naming: false
-        endpoint: {{ .Values.hypervisor.greptimedbEndpoint }}
+        endpoint: {{ if ne .Values.greptime.isCloud true }}{{ .Values.greptime.internalHost }}{{ else }}{{ .Values.greptime.host }}{{ end }}:{{ .Values.greptime.port }}
 
       sink_greptimedb_controller_metrics:
         type: prometheus_remote_write
         inputs:
           - prepare_controller_metrics
-        endpoint: {{ .Values.controller.greptimedbEndpoint }}
+        {{- if ne .Values.greptime.isCloud true }}
+        endpoint: http://{{ .Values.greptime.internalHost }}:4000/v1/prometheus/write?db=public
+        {{- else }}
+        endpoint: https://{{ .Values.greptime.host }}/v1/prometheus/write?db={{ .Values.greptime.db }}
+        auth: # basic-auth credentials are only defined for GreptimeDB Cloud
+          strategy: basic
+          user: {{ .Values.greptime.user | quote }}
+          password: {{ .Values.greptime.password | quote }}
+        {{- end }}
+
diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml
@@ -20,7 +20,6 @@ serviceAccount:
   annotations: {}
 
 hypervisor:
-  greptimedbEndpoint: greptimedb-standalone.tensor-fusion.svc.cluster.local:4001
   image:
     repository: tensorfusion/tensor-fusion-hypervisor
     # Overrides the image tag whose default is the chart appVersion.
@@ -51,13 +50,24 @@ controller:
   tolerations: []
   affinity: {}
   
-  greptimedbEndpoint: http://greptimedb-standalone.tensor-fusion.svc.cluster.local:4000/v1/prometheus/write?db=public
   admissionWebhooks:
     failurePolicy: Fail
     secretName: tensor-fusion-webhook-secret
     patch:
       image: registry.k8s.io/ingress-nginx/kube-webhook-certgen:v1.5.0
 
+greptime:
+  isCloud: false
+  internalHost: greptimedb-standalone.tensor-fusion.svc.cluster.local
+  port: 4001
+
+# greptime:
+#   isCloud: true
+#   host: y6lmxod4zm69.us-west-2.aws.greptime.cloud
+#   user: "dummy"
+#   db: "public"
+#   password: "dummy"
+#   port: 5001
 
 agent:
   enrollToken: "token-from-cloud"
diff --git a/cmd/main.go b/cmd/main.go
@@ -240,8 +240,9 @@ func main() {
 		os.Exit(1)
 	}
 	if err = (&controller.NodeReconciler{
-		Client: mgr.GetClient(),
-		Scheme: mgr.GetScheme(),
+		Client:   mgr.GetClient(),
+		Scheme:   mgr.GetScheme(),
+		Recorder: mgr.GetEventRecorderFor("Node"),
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "Node")
 		os.Exit(1)
diff --git a/internal/cloudprovider/alibaba/ecs.go b/internal/cloudprovider/alibaba/ecs.go
@@ -82,6 +82,7 @@ func (p AlibabaGPUNodeProvider) CreateNode(ctx context.Context, param *types.Nod
 	nodeClass := param.NodeClass.Spec
 	request := ecs.CreateRunInstancesRequest()
 	request.LaunchTemplateId = nodeClass.LaunchTemplate.ID
+	request.ClientToken = param.NodeName
 
 	if len(nodeClass.OSImageSelectorTerms) > 0 {
 		// TODO: should support other query types not only ID
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
@@ -23,19 +23,19 @@ import (
 	"strings"
 	"time"
 
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	cloudprovider "github.com/NexusGPU/tensor-fusion/internal/cloudprovider"
 	"github.com/NexusGPU/tensor-fusion/internal/cloudprovider/types"
-	"github.com/NexusGPU/tensor-fusion/internal/utils"
-
-	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/client-go/util/retry"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -457,6 +457,7 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node *
 		return fmt.Errorf("failed to unmarshal cloud vendor param: %w, GPUNode: %s", err, node.Name)
 	}
 
+	// TODO: query cloud vendor by node name
 	status, err := provider.CreateNode(ctx, &nodeParam)
 	if err != nil {
 		return err
@@ -469,11 +470,31 @@ func (r *GPUNodeReconciler) reconcileCloudVendorNode(ctx context.Context, node *
 	if err != nil {
 		return err
 	}
+	gpuNode.Status.Phase = tfv1.TensorFusionGPUNodePhasePending
 	gpuNode.Status.NodeInfo.IP = status.PrivateIP
 	gpuNode.Status.NodeInfo.InstanceID = status.InstanceID
 	gpuNode.Status.NodeInfo.Region = nodeParam.Region
-	if err := r.Client.Status().Update(ctx, gpuNode); err != nil {
-		log.FromContext(ctx).Info("Failed to update GPUNode status, must terminate node to keep operation atomic", "name", nodeParam.NodeName)
+
+	// Retry the status update on version conflicts; gives up once retry.DefaultBackoff is exhausted
+	err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		// Get the latest version before attempting an update
+		latest := &tfv1.GPUNode{}
+		if err := r.Client.Get(ctx, client.ObjectKey{Name: gpuNode.Name}, latest); err != nil {
+			return err
+		}
+
+		// Apply our status updates to the latest version
+		latest.Status.Phase = tfv1.TensorFusionGPUNodePhasePending
+		latest.Status.NodeInfo.IP = status.PrivateIP
+		latest.Status.NodeInfo.InstanceID = status.InstanceID
+		latest.Status.NodeInfo.Region = nodeParam.Region
+
+		// Attempt to update with the latest version
+		return r.Client.Status().Update(ctx, latest)
+	})
+
+	if err != nil {
+		log.FromContext(ctx).Error(err, "Failed to update GPUNode status after retries, must terminate node to keep operation atomic", "name", nodeParam.NodeName)
 		errTerminate := provider.TerminateNode(ctx, &types.NodeIdentityParam{
 			InstanceID: status.InstanceID,
 			Region:     nodeParam.Region,