Skip to content

Commit 1ccb35f

Browse files
Drop Node events when EC2 instance does not exist and node is not new
1 parent 08ac6f0 commit 1ccb35f

File tree

3 files changed

+23
-3
lines changed

3 files changed

+23
-3
lines changed

pkg/controllers/tagging/tagging_controller.go

+19
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ const (
6363

6464
// The label for depicting the total number of errors a work item encounters before failing
6565
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
66+
67+
// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
68+
newNodeEventualConsistencyGracePeriod = time.Minute * 5
6669
)
6770

6871
// Controller is the controller implementation for tagging cluster resources.
@@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
292295
err := tc.cloud.TagResource(string(instanceID), tc.tags)
293296

294297
if err != nil {
298+
if awsv1.IsAWSErrorInstanceNotFound(err) {
299+
// This can happen for two reasons.
300+
// 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period.
301+
// We will re-queue the event and retry.
302+
if isNodeWithinEventualConsistencyGracePeriod(node) {
303+
return fmt.Errorf("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period", instanceID, node.GetName())
304+
}
305+
// 2. The event in our workQueue is stale, and the instance no longer exists.
306+
// Tagging will never succeed, and the event should not be re-queued.
307+
klog.Infof("Skip tagging since EC2 instance %s for node %s does not exist", instanceID, node.GetName())
308+
return nil
309+
}
295310
klog.Errorf("Error in tagging EC2 instance %s for node %s, error: %v", instanceID, node.GetName(), err)
296311
return err
297312
}
@@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string {
380395
sort.Strings(tags)
381396
return fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(tags, ","))))
382397
}
398+
399+
// isNodeWithinEventualConsistencyGracePeriod reports whether the time elapsed
// since the node's creation timestamp is still shorter than
// newNodeEventualConsistencyGracePeriod.
func isNodeWithinEventualConsistencyGracePeriod(node *v1.Node) bool {
400+
return time.Since(node.CreationTimestamp.Time) < newNodeEventualConsistencyGracePeriod
401+
}

pkg/providers/v1/aws.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
17071707
instances, err := c.ec2.DescribeInstances(request)
17081708
if err != nil {
17091709
// if err is InstanceNotFound, return false with no error
1710-
if isAWSErrorInstanceNotFound(err) {
1710+
if IsAWSErrorInstanceNotFound(err) {
17111711
return false, nil
17121712
}
17131713
return false, err
@@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName)
19461946

19471947
}
19481948

1949-
func isAWSErrorInstanceNotFound(err error) bool {
1949+
// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`.
1950+
func IsAWSErrorInstanceNotFound(err error) bool {
19501951
if err == nil {
19511952
return false
19521953
}

pkg/providers/v1/tags.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error {
344344
if err != nil {
345345
// An instance not found should not fail the untagging workflow as it
346346
// would for tagging, since the target state is already reached.
347-
if isAWSErrorInstanceNotFound(err) {
347+
if IsAWSErrorInstanceNotFound(err) {
348348
klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err)
349349
return nil
350350
}

0 commit comments

Comments
 (0)