Skip to content

[Test] Quarantine AD stacks on failure for debugging purposes. #6742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: release-3.13
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 60 additions & 47 deletions cloudformation/ad/ad-integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ Resources:
Properties:
Count: 1
Handle: !Ref AdDomainAdminNodeWaitConditionHandle
Timeout: 900
Timeout: 600

AdDomainAdminNode:
Type: AWS::EC2::Instance
Expand Down Expand Up @@ -446,55 +446,66 @@ Resources:
#!/bin/bash -e
set -o pipefail
exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
yum update -y aws-cfn-bootstrap
/opt/aws/bin/cfn-init -v --stack "${AWS::StackName}" --resource AdDomainAdminNode --configsets setup --region "${AWS::Region}"
echo "Directory Id: ${DirectoryId}"
echo "Domain Name: ${DirectoryDomain}"
echo "Domain DNS IP 1: ${DnsIp1}"
echo "Domain DNS IP 2: ${DnsIp2}"
echo "Domain Certificate Secret: ${DomainCertificateSecretArn}"
echo "Domain Private Key Secret: ${DomainPrivateKeySecretArn}"
function main() {
yum update -y aws-cfn-bootstrap
/opt/aws/bin/cfn-init -v --stack "${AWS::StackName}" --resource AdDomainAdminNode --configsets setup --region "${AWS::Region}"
echo "Directory Id: ${DirectoryId}"
echo "Domain Name: ${DirectoryDomain}"
echo "Domain DNS IP 1: ${DnsIp1}"
echo "Domain DNS IP 2: ${DnsIp2}"
echo "Domain Certificate Secret: ${DomainCertificateSecretArn}"
echo "Domain Private Key Secret: ${DomainPrivateKeySecretArn}"

echo "Describing directory..."
aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}"
echo "Describing domain controllers..."
aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}"

ADMIN_PW="${AdminPassword}"

USERNAMES="ReadOnlyUser,${UserNames}"
echo "Registering Users: $USERNAMES ..."
for username in $(echo $USERNAMES | sed "s/,/ /g")
do
attempt=0
max_attempts=3
until [ $attempt -ge $max_attempts ]; do
attempt=$((attempt+1))
echo "Registering user $username (attempt $attempt/$max_attempts) ..."
echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp1}" --display-name="$username" "$username" && echo "User registered: $username" && break
echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp2}" --display-name="$username" "$username" && echo "User registered: $username" && break
echo "User creation failed, describing directory and controllers for troubleshooting..."
aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}"
aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}"
sleep 10
done
done

echo "Creating domain certificate..."
PRIVATE_KEY="${DirectoryDomain}.key"
CERTIFICATE="${DirectoryDomain}.crt"
printf '.\n.\n.\n.\n.\n%s\n.\n' "${DirectoryDomain}" | openssl req -x509 -sha256 -nodes -newkey rsa:2048 -keyout "$PRIVATE_KEY" -days 365 -out "$CERTIFICATE"

echo "Storing domain private key to Secrets Manager..."
aws secretsmanager put-secret-value --secret-id "${DomainPrivateKeySecretArn}" --secret-string "file://$PRIVATE_KEY" --region "${AWS::Region}"

echo "Storing domain certificate to Secrets Manager..."
aws secretsmanager put-secret-value --secret-id "${DomainCertificateSecretArn}" --secret-string "file://$CERTIFICATE" --region "${AWS::Region}"

echo "Deleting private key and certificate from local file system..."
rm -rf "$PRIVATE_KEY" "$CERTIFICATE"
}

echo "Describing directory..."
aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}"
echo "Describing domain controllers..."
aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}"
function signal_success() {
/opt/aws/bin/cfn-signal -e 0 --stack "${AWS::StackName}" --resource "${AdDomainAdminNodeWaitConditionHandle}" --region "${AWS::Region}"
}

ADMIN_PW="${AdminPassword}"
function signal_failure() {
/opt/aws/bin/cfn-signal -e 0 --stack "${AWS::StackName}" --resource "${AdDomainAdminNodeWaitConditionHandle}" --region "${AWS::Region}"
exit 1
}

USERNAMES="ReadOnlyUser,${UserNames}"
echo "Registering Users: $USERNAMES ..."
for username in $(echo $USERNAMES | sed "s/,/ /g")
do
attempt=0
max_attempts=5
until [ $attempt -ge $max_attempts ]; do
attempt=$((attempt+1))
echo "Registering user $username (attempt $attempt/$max_attempts) ..."
echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp1}" --display-name="$username" "$username" && echo "User registered: $username" && break
echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp2}" --display-name="$username" "$username" && echo "User registered: $username" && break
echo "User creation failed, describing directory and controllers for troubleshooting..."
aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}"
aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}"
sleep 10
done
done

echo "Creating domain certificate..."
PRIVATE_KEY="${DirectoryDomain}.key"
CERTIFICATE="${DirectoryDomain}.crt"
printf '.\n.\n.\n.\n.\n%s\n.\n' "${DirectoryDomain}" | openssl req -x509 -sha256 -nodes -newkey rsa:2048 -keyout "$PRIVATE_KEY" -days 365 -out "$CERTIFICATE"

echo "Storing domain private key to Secrets Manager..."
aws secretsmanager put-secret-value --secret-id "${DomainPrivateKeySecretArn}" --secret-string "file://$PRIVATE_KEY" --region "${AWS::Region}"

echo "Storing domain certificate to Secrets Manager..."
aws secretsmanager put-secret-value --secret-id "${DomainCertificateSecretArn}" --secret-string "file://$CERTIFICATE" --region "${AWS::Region}"

echo "Deleting private key and certificate from local file system..."
rm -rf "$PRIVATE_KEY" "$CERTIFICATE"

/opt/aws/bin/cfn-signal -e "$?" --stack "${AWS::StackName}" --region "${AWS::Region}" "${AdDomainAdminNodeWaitConditionHandle}"
main && signal_success || signal_failure

- { DirectoryId: !GetAtt Prep.DirectoryId,
DirectoryDomain: !GetAtt Prep.DomainName,
Expand Down Expand Up @@ -622,6 +633,8 @@ Resources:

Post:
Type: Custom::PostLambda
DependsOn:
- AdDomainAdminNodeWaitCondition
Properties:
ServiceToken: !GetAtt PostLambda.Arn
AdminNodeInstanceId: !Ref AdDomainAdminNode
Expand Down
5 changes: 4 additions & 1 deletion tests/integration-tests/cfn_stacks_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,14 @@
class CfnStack:
"""Identify a CloudFormation stack."""

def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None):
def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None, disable_rollback=False):
self.name = name
self.region = region
self.template = template
self.parameters = parameters or []
self.capabilities = capabilities or []
self.tags = tags or []
self.disable_rollback=disable_rollback
self.cfn_stack_id = None
self.__cfn_outputs = None
self.__cfn_resources = None
Expand Down Expand Up @@ -175,6 +176,7 @@ def create_stack(self, stack, stack_is_under_test=False):
Parameters=stack.parameters,
Capabilities=stack.capabilities,
Tags=stack.tags,
DisableRollback=stack.disable_rollback,
)
else:
result = cfn_client.create_stack(
Expand All @@ -183,6 +185,7 @@ def create_stack(self, stack, stack_is_under_test=False):
Parameters=stack.parameters,
Capabilities=stack.capabilities,
Tags=stack.tags,
DisableRollback=stack.disable_rollback,
)
stack.cfn_stack_id = result["StackId"]
self.__created_stacks[id] = stack
Expand Down
9 changes: 9 additions & 0 deletions tests/integration-tests/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@

UNSUPPORTED_OSES_FOR_DCV = ["alinux2023"]

DO_NOT_DELETE_TAG_KEY = 'DO-NOT-DELETE'

QUARANTINE_TAG_KEY = 'QUARANTINE'

MAX_QUARANTINED_STACKS = 5

QUARANTINE_TAGS = [{ "Key": DO_NOT_DELETE_TAG_KEY, "Value": "true" }, { "Key": QUARANTINE_TAG_KEY, "Value": "true" }]
RETENTION_TAGS = [{ "Key": DO_NOT_DELETE_TAG_KEY, "Value": "true" }]


class NodeType(Enum):
"""Categories of nodes."""
Expand Down
2 changes: 1 addition & 1 deletion tests/integration-tests/framework/metrics_publisher.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(self, region=None):
def publish_metrics_to_cloudwatch(self, namespace: str, metrics: List[Metric]):
"""Pushes a list of metrics to cloudwatch using a single namespace."""
try:
logging.info(
logging.debug(
f"publishing metrics to cloudwatch {[metric.generate_metric_data_entry() for metric in metrics]}"
)
self.client.put_metric_data(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,18 @@
from remote_command_executor import RemoteCommandExecutor
from retrying import retry
from time_utils import seconds
from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric

from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric, get_quarantined_stacks, is_quarantined_stack, quarantine_stacks

from tests.ad_integration.cluster_user import ClusterUser
from tests.common.utils import run_system_analyzer
from constants import DO_NOT_DELETE_TAG_KEY, MAX_QUARANTINED_STACKS

NUM_USERS_TO_CREATE = 5
NUM_USERS_TO_TEST = 3

AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'


def get_infra_stack_outputs(stack_name):
cfn = boto3.client("cloudformation")
Expand Down Expand Up @@ -117,7 +121,7 @@ def add_tag_to_stack(stack_name, key, value):
stack = cfn.Stack(stack_name)
add_tag = True
for tag in stack.tags:
if tag.get("Key") == "DO-NOT-DELETE":
if tag.get("Key") == DO_NOT_DELETE_TAG_KEY:
add_tag = False
break
if add_tag:
Expand All @@ -127,6 +131,7 @@ def add_tag_to_stack(stack_name, key, value):
Tags=[
{"Key": key, "Value": value},
],
DisableRollback=True,
)


Expand Down Expand Up @@ -189,7 +194,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair):

def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, vpc_stack: CfnVpcStack):
directory_stack_name = generate_stack_name(
f"integ-tests-MultiUserInfraStack{directory_type}", request.config.getoption("stackname_suffix")
f"{AD_STACK_PREFIX}{directory_type}", request.config.getoption("stackname_suffix")
)

if directory_type not in ("MicrosoftAD", "SimpleAD"):
Expand All @@ -203,7 +208,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
stack_parameters = _get_stack_parameters(directory_type, vpc_stack, request.config.getoption("key_name"))
tags = [{"Key": "parallelcluster:integ-tests-ad-stack", "Value": directory_type}]
if request.config.getoption("retain_ad_stack"):
tags.append({"Key": "DO-NOT-DELETE", "Value": "Retained for integration testing"})
tags.append({"Key": DO_NOT_DELETE_TAG_KEY, "Value": "Retained for integration testing"})

directory_stack = CfnStack(
name=directory_stack_name,
Expand All @@ -212,12 +217,33 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
parameters=stack_parameters,
capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
tags=tags,
disable_rollback=True,
)
cfn_stacks_factory.create_stack(directory_stack)
try:
cfn_stacks_factory.create_stack(directory_stack, stack_is_under_test=True)
except BaseException as e:
logging.error("Failed to create stack %s", directory_stack_name)
# We want to retain the stack in case of failure in order to debug it.
# We retain a limited number of stack to contain the costs.
n_quarantined_ad_stacks = len(get_quarantined_stacks(region, prefix=AD_STACK_PREFIX))
if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS:
logging.warn("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)",
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
quarantine_stacks(region, stack_names=[directory_stack_name])
else:
logging.warn("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)",
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
raise e
logging.info("Creation of stack %s complete", directory_stack_name)

return directory_stack

def get_retained_ad_stacks_count():
cfn = boto3.client("cloudformation")
failed_stacks = cfn.list_stacks(StackStatusFilter=['CREATE_FAILED'])["StackSummaries"]
failed_ad_stacks = [stack for stack in failed_stacks if AD_STACK_PREFIX in stack.get('StackName')]
return len([stack for stack in failed_ad_stacks if stack.get("Tags") and
any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in stack.get("Tags"))])

@retry(wait_fixed=seconds(20), stop_max_delay=seconds(700))
def _check_ssm_success(ssm_client, command_id, instance_id):
Expand All @@ -243,10 +269,10 @@ def _directory_factory(
directory_stack_name = created_directory_stacks.get(region, {}).get("directory")
logging.info("Using directory stack named %s created by another test", directory_stack_name)
else:
stack_prefix = f"integ-tests-MultiUserInfraStack{directory_type}"
stack_prefix = f"{AD_STACK_PREFIX}{directory_type}"
directory_stack_name = find_stack_by_tag("parallelcluster:integ-tests-ad-stack", region, stack_prefix)

if not directory_stack_name:
if not directory_stack_name or is_quarantined_stack(region, directory_stack_name):
directory_stack = _create_directory_stack(
cfn_stacks_factory,
request,
Expand All @@ -257,7 +283,7 @@ def _directory_factory(
directory_stack_name = directory_stack.name
created_directory_stacks[region]["directory"] = directory_stack_name
if request.config.getoption("retain_ad_stack"):
add_tag_to_stack(vpc_stack.name, "DO-NOT-DELETE", "Retained for integration testing")
add_tag_to_stack(vpc_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained for integration testing")
return directory_stack_name

yield _directory_factory
Expand Down
42 changes: 42 additions & 0 deletions tests/integration-tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from jinja2.sandbox import SandboxedEnvironment
from retrying import retry
from time_utils import minutes, seconds
from constants import QUARANTINE_TAG_KEY, DO_NOT_DELETE_TAG_KEY, QUARANTINE_TAGS

Check notice

Code scanning / CodeQL

Unused import Note test

Import of 'DO_NOT_DELETE_TAG_KEY' is not used.

DEFAULT_PARTITION = "aws"
PARTITION_MAP = {
Expand Down Expand Up @@ -908,3 +909,44 @@
logging.info(f"Found stack: {name} (created on {creation_date})")
return name
return None

def get_quarantined_stacks(region, prefix=None):
quarantined_stacks = []
cfn_client = boto3.client("cloudformation", region_name=region)

for stack in cfn_client.describe_stacks().get("Stacks", []):
stack_name = stack.get("StackName")
if not stack_name:
continue
if prefix and not stack_name.startswith(prefix):
continue
if any(tag.get("Key") == QUARANTINE_TAG_KEY for tag in stack.get("Tags", [])):
quarantined_stacks.append(stack_name)
return quarantined_stacks

def is_quarantined_stack(region, stack_name):
return stack_name in get_quarantined_stacks(region)

def quarantine_stacks(region, stack_names):
for stack_name in stack_names:
add_tags_to_stack(region, stack_name, QUARANTINE_TAGS)

def add_tags_to_stack(region, stack_name, tags):
cfn = boto3.resource("cloudformation", region_name=region)
stack = cfn.Stack(stack_name)

stack.update(
UsePreviousTemplate=True,
Parameters=get_unchanged_stack_parameters(stack),
Capabilities=stack.capabilities,
DisableRollback=True,
Tags=tags,
)

def get_unchanged_stack_parameters(stack):
return [
{
'ParameterKey': current_parameter.get('ParameterKey'),
'UsePreviousValue': True,
} for current_parameter in stack.parameters
]
Loading