@@ -63,6 +63,9 @@ const (
63
63
64
64
// The label depicting the total number of errors for work items that failed after exhausting their retries
65
65
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
66
+
67
+ // The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
68
+ newNodeEventualConsistencyGracePeriod = time .Minute * 5
66
69
)
67
70
68
71
// Controller is the controller implementation for tagging cluster resources.
@@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
292
295
err := tc .cloud .TagResource (string (instanceID ), tc .tags )
293
296
294
297
if err != nil {
298
+ if awsv1 .IsAWSErrorInstanceNotFound (err ) {
299
+ // This can happen for two reasons.
300
+ // 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period.
301
+ // We will re-queue the event and retry.
302
+ if isNodeWithinEventualConsistencyGracePeriod (node ) {
303
+ return fmt .Errorf ("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period" , instanceID , node .GetName ())
304
+ }
305
+ // 2. The event in our workQueue is stale, and the instance no longer exists.
306
+ // Tagging will never succeed, and the event should not be re-queued.
307
+ klog .Infof ("Skip tagging since EC2 instance %s for node %s does not exist" , instanceID , node .GetName ())
308
+ return nil
309
+ }
295
310
klog .Errorf ("Error in tagging EC2 instance %s for node %s, error: %v" , instanceID , node .GetName (), err )
296
311
return err
297
312
}
@@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string {
380
395
sort .Strings (tags )
381
396
return fmt .Sprintf ("%x" , md5 .Sum ([]byte (strings .Join (tags , "," ))))
382
397
}
398
+
399
+ func isNodeWithinEventualConsistencyGracePeriod (node * v1.Node ) bool {
400
+ return time .Since (node .CreationTimestamp .Time ) < newNodeEventualConsistencyGracePeriod
401
+ }
0 commit comments