Skip to content

Commit 0dc1070

Browse files
committed
[Test] Quarantine AD stacks on failure for debugging purposes. At most 5 will be quarantined to limit costs.
1 parent a1463e8 commit 0dc1070

File tree

2 files changed

+32
-7
lines changed

2 files changed

+32
-7
lines changed

tests/integration-tests/cfn_stacks_factory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ def create_stack(self, stack, stack_is_under_test=False):
198198
stack.init_stack_data()
199199
except Exception as e:
200200
logging.error("Creation of stack {0} in region {1} failed with exception: {2}".format(name, region, e))
201-
raise
201+
raise e
202202

203203
logging.info("Stack {0} created successfully in region {1}".format(name, region))
204204

tests/integration-tests/tests/ad_integration/test_ad_integration.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
NUM_USERS_TO_CREATE = 5
3636
NUM_USERS_TO_TEST = 3
3737

38+
MAX_QUARANTINED_STACKS = 5
39+
40+
AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'
41+
42+
DO_NOT_DELETE_TAG_KEY = 'DO-NOT-DELETE'
43+
3844

3945
def get_infra_stack_outputs(stack_name):
4046
cfn = boto3.client("cloudformation")
@@ -117,7 +123,7 @@ def add_tag_to_stack(stack_name, key, value):
117123
stack = cfn.Stack(stack_name)
118124
add_tag = True
119125
for tag in stack.tags:
120-
if tag.get("Key") == "DO-NOT-DELETE":
126+
if tag.get("Key") == DO_NOT_DELETE_TAG_KEY:
121127
add_tag = False
122128
break
123129
if add_tag:
@@ -189,7 +195,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair):
189195

190196
def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, vpc_stack: CfnVpcStack):
191197
directory_stack_name = generate_stack_name(
192-
f"integ-tests-MultiUserInfraStack{directory_type}", request.config.getoption("stackname_suffix")
198+
f"{AD_STACK_PREFIX}{directory_type}", request.config.getoption("stackname_suffix")
193199
)
194200

195201
if directory_type not in ("MicrosoftAD", "SimpleAD"):
@@ -203,7 +209,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
203209
stack_parameters = _get_stack_parameters(directory_type, vpc_stack, request.config.getoption("key_name"))
204210
tags = [{"Key": "parallelcluster:integ-tests-ad-stack", "Value": directory_type}]
205211
if request.config.getoption("retain_ad_stack"):
206-
tags.append({"Key": "DO-NOT-DELETE", "Value": "Retained for integration testing"})
212+
tags.append({"Key": DO_NOT_DELETE_TAG_KEY, "Value": "Retained for integration testing"})
207213

208214
directory_stack = CfnStack(
209215
name=directory_stack_name,
@@ -213,11 +219,30 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
213219
capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
214220
tags=tags,
215221
)
216-
cfn_stacks_factory.create_stack(directory_stack)
222+
try:
223+
cfn_stacks_factory.create_stack(directory_stack, stack_is_under_test=True)
224+
except Exception as e:
225+
logging.error("Failed to create stack %s", directory_stack_name)
226+
# We want to retain the stack in case of failure in order to debug it.
227+
# We retain a limited number of stacks to contain the costs.
228+
n_retained_ad_stacks = get_retained_ad_stacks_count()
229+
if n_retained_ad_stacks < MAX_QUARANTINED_STACKS:
230+
logging.warn("Retaining failed stack %s to debug failure", directory_stack_name)
231+
add_tag_to_stack(directory_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained to debug failure")
232+
else:
233+
logging.warn("Cannot retain failed stack %s for debugging because there are already %d retained (max: %d)",
234+
directory_stack_name, n_retained_ad_stacks, MAX_QUARANTINED_STACKS)
235+
raise e
217236
logging.info("Creation of stack %s complete", directory_stack_name)
218237

219238
return directory_stack
220239

240+
def get_retained_ad_stacks_count():
    """Return how many failed AD stacks are currently retained for debugging.

    A stack is counted when all of the following hold:
      * its status is ``CREATE_FAILED``;
      * its name contains ``AD_STACK_PREFIX``;
      * it carries a tag whose key is ``DO_NOT_DELETE_TAG_KEY``.

    Returns:
        int: the number of matching stacks in the region of the default
        boto3 session.
    """
    cfn = boto3.client("cloudformation")
    # The summaries returned by list_stacks do not include tags, so filtering
    # on them would always yield zero. describe_stacks returns the full stack
    # description (status and tags included); paginate so that stacks beyond
    # the first page are not missed.
    paginator = cfn.get_paginator("describe_stacks")
    retained_count = 0
    for page in paginator.paginate():
        for stack in page.get("Stacks", []):
            if stack.get("StackStatus") != "CREATE_FAILED":
                continue
            if AD_STACK_PREFIX not in stack.get("StackName", ""):
                continue
            # "Tags" may be absent or empty on untagged stacks.
            if any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in stack.get("Tags") or []):
                retained_count += 1
    return retained_count
221246

222247
@retry(wait_fixed=seconds(20), stop_max_delay=seconds(700))
223248
def _check_ssm_success(ssm_client, command_id, instance_id):
@@ -243,7 +268,7 @@ def _directory_factory(
243268
directory_stack_name = created_directory_stacks.get(region, {}).get("directory")
244269
logging.info("Using directory stack named %s created by another test", directory_stack_name)
245270
else:
246-
stack_prefix = f"integ-tests-MultiUserInfraStack{directory_type}"
271+
stack_prefix = f"{AD_STACK_PREFIX}{directory_type}"
247272
directory_stack_name = find_stack_by_tag("parallelcluster:integ-tests-ad-stack", region, stack_prefix)
248273

249274
if not directory_stack_name:
@@ -257,7 +282,7 @@ def _directory_factory(
257282
directory_stack_name = directory_stack.name
258283
created_directory_stacks[region]["directory"] = directory_stack_name
259284
if request.config.getoption("retain_ad_stack"):
260-
add_tag_to_stack(vpc_stack.name, "DO-NOT-DELETE", "Retained for integration testing")
285+
add_tag_to_stack(vpc_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained for integration testing")
261286
return directory_stack_name
262287

263288
yield _directory_factory

0 commit comments

Comments
 (0)