From 5f184ad26bfeb2bfa6ce4884020d6497b1f7e4cf Mon Sep 17 00:00:00 2001
From: Mitchell Alessio <5306896+malessi@users.noreply.github.com>
Date: Thu, 6 Feb 2025 10:20:42 -0500
Subject: [PATCH] BFD-3848: Transient SFTP connection errors are affecting BFD EFT Outbound in test (#2542)

---
Review notes (everything between the "---" marker and the first "diff --git"
line is commentary; it is not part of the commit message or of the change):

* Lambda retries: the comment removed by this patch said retries were disabled
  because the Lambda has side effects. With maximum_retry_attempts = 2, a
  failing event may be attempted up to three times before it lands in the DLQ.
  NOTE(review): confirm that re-running a partially completed transfer is safe.

* Alarm: with period = 60, statistic = Sum, threshold = 1,
  evaluation_periods = 10 and datapoints_to_alarm = 3, the alarm fires when at
  least 3 of the last 10 one-minute periods each contain one or more errors.
  The description text "has failed 3 times in 10 minutes" is an approximation:
  several errors inside the same minute count as a single datapoint.

 ops/terraform/services/eft/main.tf                    | 15 +++++----------
 .../eft/modules/bfd_eft_outbound_o11y/main.tf         |  8 ++++----
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/ops/terraform/services/eft/main.tf b/ops/terraform/services/eft/main.tf
index b511b1cc9a..bc68d04778 100644
--- a/ops/terraform/services/eft/main.tf
+++ b/ops/terraform/services/eft/main.tf
@@ -313,15 +313,11 @@ resource "aws_sqs_queue" "sftp_outbound_transfer_dlq" {
 resource "aws_lambda_function_event_invoke_config" "sftp_outbound_transfer" {
   count = length(local.eft_partners_with_outbound_enabled) > 0 ? 1 : 0
 
-  function_name = one(aws_lambda_function.sftp_outbound_transfer[*].function_name)
-  # This Lambda is invoked by SNS, which invokes the Lambda asynchronously. By default, AWS Lambda
-  # retries failing Functions twice before dropping the event, but because this Lambda has side
-  # effects we don't want to retry if it fails. Instead, we will drop failing events into a DLQ for
-  # the on-call to process it again, if possible
-  maximum_retry_attempts = 0
-
-  # On failure we want failing events to land into a DLQ such that responding engineers can analyze
-  # the event and retry, if necessary
+  function_name          = one(aws_lambda_function.sftp_outbound_transfer[*].function_name)
+  maximum_retry_attempts = 2
+
+  # If the Lambda exhausts all of its retry attempts, we want failing events to land into a DLQ such
+  # that responding engineers can analyze the event and retry, if necessary
   destination_config {
     on_failure {
       destination = one(aws_sqs_queue.sftp_outbound_transfer_dlq[*].arn)
@@ -329,7 +325,6 @@ resource "aws_lambda_function_event_invoke_config" "sftp_outbound_transfer" {
   }
 }
 
-
 resource "aws_security_group" "sftp_outbound_transfer" {
   count = length(local.eft_partners_with_outbound_enabled) > 0 ? 1 : 0
 
diff --git a/ops/terraform/services/eft/modules/bfd_eft_outbound_o11y/main.tf b/ops/terraform/services/eft/modules/bfd_eft_outbound_o11y/main.tf
index 77c3c7ffae..f577a9fb7b 100644
--- a/ops/terraform/services/eft/modules/bfd_eft_outbound_o11y/main.tf
+++ b/ops/terraform/services/eft/modules/bfd_eft_outbound_o11y/main.tf
@@ -57,16 +57,16 @@ locals {
 resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
   alarm_name          = local.alarms_config.lambda_errors.alarm_name
   comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = "1"
+  evaluation_periods  = "10"
   period              = "60"
   statistic           = "Sum"
   threshold           = "1"
-  datapoints_to_alarm = "1"
+  datapoints_to_alarm = "3"
   treat_missing_data  = "notBreaching"
 
   alarm_description = join("", [
-    "The ${var.outbound_lambda_name} has failed to transfer a file in ${local.env}. View the ",
-    "linked CloudWatch Log Group for more details on the failure, and inspect the failing event ",
+    "The ${var.outbound_lambda_name} has failed 3 times in 10 minutes in ${local.env}. View the ",
+    "linked CloudWatch Log Group for more details on the failures, and inspect the failing events ",
     "in the linked DLQ",
     "\n",
     "\n* CloudWatch Log Group: <${local.alarms_config.lambda_errors.log_group_url}|${local.alarms_config.lambda_errors.log_group_name}>",