Skip to content

Commit

Permalink
BFD-3848: Transient SFTP connection errors are affecting BFD EFT Outbound in test (#2542)
Browse files Browse the repository at this point in the history
  • Loading branch information
malessi authored Feb 6, 2025
1 parent 46a565e commit 5f184ad
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 14 deletions.
15 changes: 5 additions & 10 deletions ops/terraform/services/eft/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -313,23 +313,18 @@ resource "aws_sqs_queue" "sftp_outbound_transfer_dlq" {
# Event invoke configuration for the SFTP outbound transfer Lambda: sets the retry count and the
# on-failure destination (an SQS DLQ). Created only when at least one EFT partner has outbound
# enabled; otherwise count is 0 and the resource is omitted.
# NOTE(review): this listing looks like a rendered diff with its +/- markers stripped, so the two
# function_name / maximum_retry_attempts assignments below are presumably the old and new versions
# of the same arguments rather than true duplicates -- confirm against the actual main.tf.
resource "aws_lambda_function_event_invoke_config" "sftp_outbound_transfer" {
count = length(local.eft_partners_with_outbound_enabled) > 0 ? 1 : 0

function_name = one(aws_lambda_function.sftp_outbound_transfer[*].function_name)
# This Lambda is invoked by SNS, which invokes the Lambda asynchronously. By default, AWS Lambda
# retries failing Functions twice before dropping the event, but because this Lambda has side
# effects we don't want to retry if it fails. Instead, we will drop failing events into a DLQ for
# the on-call to process it again, if possible
maximum_retry_attempts = 0

# On failure we want failing events to land into a DLQ such that responding engineers can analyze
# the event and retry, if necessary
function_name = one(aws_lambda_function.sftp_outbound_transfer[*].function_name)
maximum_retry_attempts = 2

# If the Lambda exhausts all of its retry attempts, we want failing events to land into a DLQ such
# that responding engineers can analyze the event and retry, if necessary
destination_config {
on_failure {
destination = one(aws_sqs_queue.sftp_outbound_transfer_dlq[*].arn)
}
}
}


resource "aws_security_group" "sftp_outbound_transfer" {
count = length(local.eft_partners_with_outbound_enabled) > 0 ? 1 : 0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,16 @@ locals {
resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
alarm_name = local.alarms_config.lambda_errors.alarm_name
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "1"
evaluation_periods = "10"
period = "60"
statistic = "Sum"
threshold = "1"
datapoints_to_alarm = "1"
datapoints_to_alarm = "3"
treat_missing_data = "notBreaching"

alarm_description = join("", [
"The ${var.outbound_lambda_name} has failed to transfer a file in ${local.env}. View the ",
"linked CloudWatch Log Group for more details on the failure, and inspect the failing event ",
"The ${var.outbound_lambda_name} has failed 3 times in 10 minutes in ${local.env}. View the ",
"linked CloudWatch Log Group for more details on the failures, and inspect the failing events ",
"in the linked DLQ",
"\n",
"\n* CloudWatch Log Group: <${local.alarms_config.lambda_errors.log_group_url}|${local.alarms_config.lambda_errors.log_group_name}>",
Expand Down

0 comments on commit 5f184ad

Please sign in to comment.