Skip to content

Commit 12bb452

Browse files
authored
Add Rocky Linux 8 support for ParallelCluster (#169)
Add support for ParallelCluster 3.8.0. Create image builder config files so that a custom Rocky 8 AMI can be built. Add rocky8 as an option for slurm/ParallelClusterConfig/Image/Os. Add config slurm/ParallelClusterConfig/Image/CustomAmi. Rocky 8 requires a custom AMI because ParallelCluster (PC) doesn't supply one. Add support for the ParallelCluster munge key secret. Check that the slurm database is in the same VPC. Fix syntax error in cfn_response for CreateParallelCluster lambda. Resolves #183
1 parent 3717730 commit 12bb452

File tree

4 files changed

+38
-6
lines changed

4 files changed

+38
-6
lines changed

source/cdk/cdk_slurm_stack.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,14 @@ def check_config(self):
309309
self.mount_home_src = mount_dict['src']
310310
logger.info(f"Mounting /home from {self.mount_home_src} on compute nodes")
311311

312+
if self.config['slurm']['ParallelClusterConfig']['Image']['Os'] == 'rocky8':
313+
if not config_schema.PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8(self.PARALLEL_CLUSTER_VERSION):
314+
logger.error(f"rocky8 is not supported in ParallelCluster version {self.PARALLEL_CLUSTER_VERSION}. Support added in {PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8_VERSION}.")
315+
config_errors += 1
316+
if 'CustomAmi' not in self.config['slurm']['ParallelClusterConfig']['Image']:
317+
logger.error(f"Must specify config slurm/ParallelClusterConfig/Image/Os/CustomAmi with rocky8.")
318+
config_errors += 1
319+
312320
if 'Database' in self.config['slurm']['ParallelClusterConfig']:
313321
if 'DatabaseStackName' in self.config['slurm']['ParallelClusterConfig']['Database'] and 'EdaSlurmClusterStackName' in self.config['slurm']['ParallelClusterConfig']['Database']:
314322
logger.error(f"Cannot specify both slurm/ParallelClusterConfig/Database/DatabaseStackName and slurm/ParallelClusterConfig/Database/EdaSlurmClusterStackName")
@@ -646,6 +654,13 @@ def create_parallel_cluster_assets(self):
646654
},
647655
}
648656
}
657+
if config_schema.PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8(self.PARALLEL_CLUSTER_VERSION):
658+
ami_builds['Rocky'] = {
659+
'8': {
660+
'arm64': {},
661+
'x86_64': {}
662+
}
663+
}
649664
template_vars['ComponentS3Url'] = self.custom_action_s3_urls['config/bin/configure-eda.sh']
650665
cfn_client = boto3.client('cloudformation', region_name=self.config['Region'])
651666
cfn_list_resources_paginator = cfn_client.get_paginator('list_stack_resources')
@@ -2164,6 +2179,9 @@ def create_parallel_cluster_config(self):
21642179
if 'AllowedIps' in self.config['slurm']['ParallelClusterConfig']['Dcv']:
21652180
self.parallel_cluster_config['HeadNode']['Dcv']['AllowedIps'] = self.config['slurm']['ParallelClusterConfig']['AllowedIps']
21662181

2182+
if self.munge_key_secret_arn:
2183+
self.parallel_cluster_config['Scheduling']['SlurmSettings']['MungeKeySecretArn'] = self.munge_key_secret_arn
2184+
21672185
if 'CustomAmi' in self.config['slurm']['ParallelClusterConfig']['Image']:
21682186
self.parallel_cluster_config['Image']['CustomAmi'] = self.config['slurm']['ParallelClusterConfig']['Image']['CustomAmi']
21692187

source/cdk/config_schema.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,14 @@
6161
# * Fix pmix CVE
6262
# * Use Slurm 23.02.5
6363
MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
64-
DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.7.2')
64+
DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.8.0')
6565
PARALLEL_CLUSTER_VERSIONS = [
6666
'3.6.0',
6767
'3.6.1',
6868
'3.7.0',
6969
'3.7.1',
7070
'3.7.2',
71+
'3.8.0',
7172
]
7273
PARALLEL_CLUSTER_MUNGE_VERSIONS = {
7374
# This can be found on the head node at /opt/parallelcluster/sources
@@ -77,6 +78,7 @@
7778
'3.7.0': '0.5.15', # confirmed
7879
'3.7.1': '0.5.15', # confirmed
7980
'3.7.2': '0.5.15', # confirmed
81+
'3.8.0': '0.5.15', # confirmed
8082
}
8183
PARALLEL_CLUSTER_PYTHON_VERSIONS = {
8284
# This can be found on the head node at /opt/parallelcluster/pyenv/versions
@@ -85,6 +87,7 @@
8587
'3.7.0': '3.9.16', # confirmed
8688
'3.7.1': '3.9.16', # confirmed
8789
'3.7.2': '3.9.16', # confirmed
90+
'3.8.0': '3.9.17', # confirmed
8891
}
8992
PARALLEL_CLUSTER_SLURM_VERSIONS = {
9093
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
@@ -93,6 +96,7 @@
9396
'3.7.0': '23.02.4', # confirmed
9497
'3.7.1': '23.02.5', # confirmed
9598
'3.7.2': '23.02.6', # confirmed
99+
'3.8.0': '23.02.6', # confirmed
96100
}
97101
PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
98102
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
@@ -101,6 +105,7 @@
101105
'3.7.0': '23-02-4-1', # confirmed
102106
'3.7.1': '23-02-5-1', # confirmed
103107
'3.7.2': '23-02-6-1', # confirmed
108+
'3.8.0': '23-02-6-1', # confirmed
104109
}
105110
SLURM_REST_API_VERSIONS = {
106111
'23-02-2-1': '0.0.39',
@@ -113,6 +118,7 @@
113118
'alinux2',
114119
'centos7',
115120
'rhel8',
121+
'rocky8',
116122
'ubuntu2004',
117123
'ubuntu2204'
118124
]
@@ -155,12 +161,19 @@ def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_COMPUTE_RESOURCES_PER_QUEUE(parallel_clus
155161
def PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE(parallel_cluster_version):
156162
return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_MULTIPLE_INSTANCE_TYPES_PER_COMPUTE_RESOURCE_VERSION
157163

158-
# Unsupported
164+
# Version 3.8.0
165+
166+
PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8_VERSION = parse_version('3.8.0')
167+
def PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8(parallel_cluster_version):
168+
return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_CUSTOM_ROCKY_8_VERSION
169+
170+
PARALLEL_CLUSTER_SUPPORTS_CUSTOM_MUNGE_KEY_VERSION = parse_version('3.8.0')
159171
def PARALLEL_CLUSTER_SUPPORTS_CUSTOM_MUNGE_KEY(parallel_cluster_version):
160-
return False
172+
return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_CUSTOM_MUNGE_KEY_VERSION
161173

174+
PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT_VERSION = parse_version('3.8.0')
162175
def PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT(parallel_cluster_version):
163-
return False
176+
return parallel_cluster_version >= PARALLEL_CLUSTER_SUPPORTS_HOME_MOUNT_VERSION
164177

165178
# Determine all AWS regions available on the account.
166179
default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1")

source/resources/lambdas/CreateParallelCluster/CreateParallelCluster.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def lambda_handler(event, context):
126126
valid_statuses = ['CREATE_COMPLETE', 'UPDATE_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE']
127127
invalid_statuses = ['UPDATE_IN_PROGRESS', 'DELETE_IN_PROGRESS']
128128
if cluster_status in invalid_statuses:
129-
cfnresponse.send(event, context, cfnresponse.FAILED, {f"{cluster_name} in {cluster_status} state."}, physicalResourceId=cluster_name)
129+
cfnresponse.send(event, context, cfnresponse.FAILED, {'error': f"{cluster_name} in {cluster_status} state."}, physicalResourceId=cluster_name)
130130
return
131131
if requestType == 'Create':
132132
logger.info(f"{cluster_name} exists so changing request type from Create to Update.")
@@ -267,7 +267,7 @@ def lambda_handler(event, context):
267267
break
268268
if cluster_status == 'DELETE_FAILED':
269269
logger.info(f"{cluster_name} delete failed")
270-
cfnresponse.send(event, context, cfnresponse.FAILED, {f"{cluster_name} in {cluster_status} state."}, physicalResourceId=cluster_name)
270+
cfnresponse.send(event, context, cfnresponse.FAILED, {'error': f"{cluster_name} in {cluster_status} state."}, physicalResourceId=cluster_name)
271271
return
272272
else:
273273
raise ValueError(f"Unsupported requestType: {requestType}")

source/resources/parallel-cluster/config/bin/on_head_node_configured.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ export PATH=/usr/sbin:$PATH
5454
# Rerun on_head_node_start.sh to download latest versions of all config files and scripts
5555
$config_bin_dir/on_head_node_start.sh
5656

57+
# This is handled directly by ParallelCluster starting in 3.8.0.
5758
if ! [ -z $MungeKeySecretId ]; then
5859
echo "Download munge key from $MungeKeySecretId"
5960
munge_key_b64=$(aws secretsmanager get-secret-value --secret-id $MungeKeySecretId --query 'SecretString' --output text)

0 commit comments

Comments
 (0)