Drop Node events when EC2 instance does not exist and node is not new

cartermckinnon · cartermckinnon · commit 1ccb35feedf7 · 2023-11-27T21:54:18.000Z
diff --git a/pkg/controllers/tagging/tagging_controller.go b/pkg/controllers/tagging/tagging_controller.go
@@ -63,6 +63,9 @@ const (
 
 	// The label for depicting total number of errors a work item encounter and fail
 	errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"
+
+	// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
+	newNodeEventualConsistencyGracePeriod = time.Minute * 5
 )
 
 // Controller is the controller implementation for tagging cluster resources.
@@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
 	err := tc.cloud.TagResource(string(instanceID), tc.tags)
 
 	if err != nil {
+		if awsv1.IsAWSErrorInstanceNotFound(err) {
+			// This can happen for two reasons.
+			// 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period.
+			//    We will re-queue the event and retry.
+			if isNodeWithinEventualConsistencyGracePeriod(node) {
+				return fmt.Errorf("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period", instanceID, node.GetName())
+			}
+			// 2. The event in our workQueue is stale, and the instance no longer exists.
+			//    Tagging will never succeed, and the event should not be re-queued.
+			klog.Infof("Skip tagging since EC2 instance %s for node %s does not exist", instanceID, node.GetName())
+			return nil
+		}
 		klog.Errorf("Error in tagging EC2 instance %s for node %s, error: %v", instanceID, node.GetName(), err)
 		return err
 	}
@@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string {
 	sort.Strings(tags)
 	return fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(tags, ","))))
 }
+
+func isNodeWithinEventualConsistencyGracePeriod(node *v1.Node) bool {
+	return time.Since(node.CreationTimestamp.Time) < newNodeEventualConsistencyGracePeriod
+}
diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go
@@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
 	instances, err := c.ec2.DescribeInstances(request)
 	if err != nil {
 		// if err is InstanceNotFound, return false with no error
-		if isAWSErrorInstanceNotFound(err) {
+		if IsAWSErrorInstanceNotFound(err) {
 			return false, nil
 		}
 		return false, err
@@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName)
 
 }
 
-func isAWSErrorInstanceNotFound(err error) bool {
+// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`.
+func IsAWSErrorInstanceNotFound(err error) bool {
 	if err == nil {
 		return false
 	}
diff --git a/pkg/providers/v1/tags.go b/pkg/providers/v1/tags.go
@@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error {
 	if err != nil {
 		// An instance not found should not fail the untagging workflow as it
 		// would for tagging, since the target state is already reached.
-		if isAWSErrorInstanceNotFound(err) {
+		if IsAWSErrorInstanceNotFound(err) {
 			klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err)
 			return nil
 		}

Original file line number	Diff line number	Diff line change
`@@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin`
`1707`	`1707`	`instances, err := c.ec2.DescribeInstances(request)`
`1708`	`1708`	`if err != nil {`
`1709`	`1709`	`// if err is InstanceNotFound, return false with no error`
`1710`		`- if isAWSErrorInstanceNotFound(err) {`
	`1710`	`+ if IsAWSErrorInstanceNotFound(err) {`
`1711`	`1711`	`return false, nil`
`1712`	`1712`	`}`
`1713`	`1713`	`return false, err`
`@@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName)`
`1946`	`1946`
`1947`	`1947`	`}`
`1948`	`1948`
`1949`		`-func isAWSErrorInstanceNotFound(err error) bool {`
	`1949`	``+// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`.``
	`1950`	`+func IsAWSErrorInstanceNotFound(err error) bool {`
`1950`	`1951`	`if err == nil {`
`1951`	`1952`	`return false`
`1952`	`1953`	`}`
Original file line number	Diff line number	Diff line change
`@@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error {`
`344`	`344`	`if err != nil {`
`345`	`345`	`// An instance not found should not fail the untagging workflow as it`
`346`	`346`	`// would for tagging, since the target state is already reached.`
`347`		`- if isAWSErrorInstanceNotFound(err) {`
	`347`	`+ if IsAWSErrorInstanceNotFound(err) {`
`348`	`348`	`klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err)`
`349`	`349`	`return nil`
`350`	`350`	`}`