
Commit aa8b9dc

Enable ParallelCluster access from outside the cluster (#137)
Create CfnOutputs and scripts that configure a submission host to access a ParallelCluster cluster without having to ssh into it. Install the same version of munge that ParallelCluster uses and copy ParallelCluster's munge key to the submission host; a future change will test switching ParallelCluster to a key stored in an SSM parameter. Add a command to mount the Slurm shared file system on the submission host, a command that runs a script to configure the submission host, and a modulefile that configures the environment to use the ParallelCluster cluster from the submission host. Resolves #136
1 parent 1514055 commit aa8b9dc
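Based on the commit description and the new CfnOutputs below, the intended workflow on a submission host looks roughly like this; a sketch only, with us-east-1 and mycluster standing in for the actual region and cluster name (the exact commands are emitted as stack outputs):

# 1. Mount the cluster's shared Slurm file system (SubmitterMountHeadNodeCommand output).
head_ip=$(aws ec2 describe-instances --region us-east-1 \
    --filters 'Name=tag:parallelcluster:cluster-name,Values=mycluster' \
              'Name=tag:parallelcluster:node-type,Values=HeadNode' \
    --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text)
sudo mkdir -p /opt/slurm
sudo mount $head_ip:/opt/slurm /opt/slurm

# 2. Configure the submission host (SubmitterConfigureCommand output); this installs
#    ansible and runs the ParallelClusterSubmitter playbook from the shared mount.
sudo /opt/slurm/config/bin/config_submitter.sh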

File tree: 11 files changed, +438 -2 lines

source/cdk/cdk_slurm_stack.py

Lines changed: 28 additions & 1 deletion
@@ -711,6 +711,20 @@ def create_parallel_cluster_assets(self):
         self.assets_bucket = self.playbooks_asset.s3_bucket_name
         self.assets_base_key = self.config['slurm']['ClusterName']
 
+        self.parallel_cluster_munge_key_write_policy = iam.ManagedPolicy(
+            self, "ParallelClusterMungeKeyWritePolicy",
+            managed_policy_name = f"{self.stack_name}-ParallelClusterMungeKeyWritePolicy",
+            statements = [
+                iam.PolicyStatement(
+                    effect=iam.Effect.ALLOW,
+                    actions=[
+                        's3:PutObject',
+                    ],
+                    resources=[f"arn:aws:s3:::{self.assets_bucket}/{self.config['slurm']['ClusterName']}/config/munge.key"]
+                )
+            ]
+        )
+
         s3_client = boto3.client('s3', region_name=self.config['Region'])
 
         template_vars = {
@@ -719,14 +733,15 @@ def create_parallel_cluster_assets(self):
             'playbooks_s3_url': self.playbooks_s3_url,
         }
         files_to_upload = [
+            'config/bin/config_submitter.sh',
             'config/bin/create_users_groups_json.py',
             'config/bin/create_users_groups.py',
-            'config/users_groups.json',
             'config/bin/on_head_node_start.sh',
             'config/bin/on_head_node_configured.sh',
             'config/bin/on_head_node_updated.sh',
             'config/bin/on_compute_node_start.sh',
             'config/bin/on_compute_node_configured.sh',
+            'config/users_groups.json',
         ]
         self.custom_action_s3_urls = {}
         for file_to_upload in files_to_upload:
@@ -2416,7 +2431,10 @@ def get_instance_template_vars(self, instance_role):
             "Region": self.config['Region'],
             "TimeZone": self.config['TimeZone'],
         }
+        instance_template_vars['DefaultPartition'] = 'batch'
         instance_template_vars['FileSystemMountPath'] = '/opt/slurm'
+        instance_template_vars['ModulefilesBaseDir'] = '/opt/slurm/config/modules/modulefiles'
+        instance_template_vars['ParallelClusterVersion'] = self.config['slurm']['ParallelClusterConfig']['Version']
         instance_template_vars['SlurmBaseDir'] = '/opt/slurm'
         instance_template_vars['SlurmOSDir'] = '/opt/slurm'
         instance_template_vars['SlurmVersion'] = self.config['slurm']['SlurmVersion']
@@ -3538,6 +3556,7 @@ def create_parallel_cluster_config(self):
                         {'Policy': 'arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore'},
                         {'Policy': self.parallel_cluster_asset_read_policy.managed_policy_arn},
                         {'Policy': self.parallel_cluster_jwt_write_policy.managed_policy_arn},
+                        {'Policy': self.parallel_cluster_munge_key_write_policy.managed_policy_arn},
                     ],
                 },
                 'Imds': {
@@ -4178,3 +4197,11 @@ def create_parallel_cluster_config(self):
         CfnOutput(self, "PlaybookS3Url",
             value = self.playbooks_asset.s3_object_url
         )
+        region = self.config['Region']
+        cluster_name = self.config['slurm']['ClusterName']
+        CfnOutput(self, "SubmitterMountHeadNodeCommand",
+            value = f"head_ip=$(aws ec2 describe-instances --region {region} --filters 'Name=tag:parallelcluster:cluster-name,Values={cluster_name}' 'Name=tag:parallelcluster:node-type,Values=HeadNode' --query 'Reservations[0].Instances[0].PrivateIpAddress' --output text) && sudo mkdir -p /opt/slurm && sudo mount $head_ip:/opt/slurm /opt/slurm"
+        )
+        CfnOutput(self, "SubmitterConfigureCommand",
+            value = f"sudo /opt/slurm/config/bin/config_submitter.sh"
+        )
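Because both commands are published as CloudFormation outputs, they can be read straight from the deployed stack rather than copied from the console; a small sketch, assuming a stack named my-slurm-stack and a configured AWS CLI:

# Print the submitter-related outputs of the deployed stack (stack name is an assumption).
aws cloudformation describe-stacks --stack-name my-slurm-stack \
    --query "Stacks[0].Outputs[?OutputKey=='SubmitterMountHeadNodeCommand'].OutputValue" \
    --output text
aws cloudformation describe-stacks --stack-name my-slurm-stack \
    --query "Stacks[0].Outputs[?OutputKey=='SubmitterConfigureCommand'].OutputValue" \
    --output text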

source/cdk/config_schema.py

Lines changed: 2 additions & 1 deletion
@@ -57,7 +57,8 @@
 DEFAULT_PARALLEL_CLUSTER_VERSION = parse_version('3.6.1')
 DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION = '3.9.16'
 DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSIONS = {
-    '3.7.0b1': str(DEFAULT_PARALLEL_CLUSTER_PYTHON_VERSION),
+    '3.6.1': '3.9.16',
+    '3.7.0b1': '3.9.16',
 }
 DEFAULT_PARALLEL_CLUSTER_SLURM_VERSION = '23-02-3-1'
 DEFAULT_PARALLEL_CLUSTER_SLURM_VERSIONS = {
source/resources/parallel-cluster/config/bin/config_submitter.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+#!/bin/bash -xe
+
+full_script=$(realpath $0)
+script_dir=$(dirname $full_script)
+base_script=$(basename $full_script)
+
+date
+echo "Started config_submitter.sh: $full_script"
+
+config_dir=/opt/slurm/config
+config_bin_dir=$config_dir/bin
+
+assets_bucket={{assets_bucket}}
+assets_base_key={{assets_base_key}}
+
+# Configure using ansible
+if ! yum list installed ansible &> /dev/null; then
+    yum install -y ansible || amazon-linux-extras install -y ansible2
+fi
+
+ANSIBLE_PATH=$config_dir/ansible
+PLAYBOOKS_PATH=$ANSIBLE_PATH/playbooks
+
+pushd $PLAYBOOKS_PATH
+ansible-playbook $PLAYBOOKS_PATH/ParallelClusterSubmitter.yml \
+    -i inventories/local.yml \
+    -e @$ANSIBLE_PATH/ansible_head_node_vars.yml
+popd
+
+date
+echo "Finished config_submitter.sh: $full_script"
+
+exit 0
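config_submitter.sh only bootstraps ansible and hands off to the ParallelClusterSubmitter playbook, so a quick sanity check after it finishes is to confirm that the shared Slurm tools and the modulefile tree are reachable from the submission host; a minimal check, assuming /opt/slurm is already mounted:

# Slurm client binaries come from the shared mount, not a local install.
/opt/slurm/bin/sinfo     # should list the cluster's partitions without munge errors
/opt/slurm/bin/squeue    # should print the (possibly empty) job queue

# The modulefiles generated on the head node should also be visible through the mount.
ls /opt/slurm/config/modules/modulefiles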

source/resources/parallel-cluster/config/bin/on_head_node_start.sh

Lines changed: 3 additions & 0 deletions
@@ -33,6 +33,7 @@ chmod -R 0700 $config_dir
 
 # Download all of the config scripts
 config_scripts=(\
+    config_submitter.sh \
     create_users_groups_json.py \
     create_users_groups.py \
     on_head_node_start.sh \
@@ -95,6 +96,8 @@ if ! [ -e $jwt_key ]; then
     chmod 0600 $jwt_key
 fi
 
+/usr/bin/cp -f /etc/munge/munge.key $config_dir/munge.key
+
 date
 echo "Finished on_head_node_start.sh: $full_script"
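The copy above publishes the head node's munge key into the shared config directory; for Slurm RPCs from the submission host to authenticate, the same key must be installed under /etc/munge there and munged restarted. The ansible role is expected to handle that, but a manual equivalent would look roughly like this (a sketch, not the role's actual tasks):

# Install the cluster's munge key on the submission host and restart munged.
sudo install -o munge -g munge -m 0400 /opt/slurm/config/munge.key /etc/munge/munge.key
sudo systemctl restart munge

# Round-trip test; "STATUS: Success" means the local munged accepts the key.
munge -n | unmunge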

source/resources/playbooks/ParallelClusterSubmitter.yml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+---
+- name: Configure ParallelCluster Submitter
+  hosts: ParallelClusterHeadNode
+  become_user: root
+  become: yes
+  roles:
+    - ParallelClusterSubmitter
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+---
+
+- name: Show vars used in this playbook
+  debug:
+    msg: |
+      Architecture: {{Architecture}}
+      ClusterName: {{ClusterName}}
+      DefaultPartition: {{DefaultPartition}}
+      distribution: {{distribution}}
+      distribution_major_version: {{distribution_major_version}}
+      ModulefilesBaseDir: {{ModulefilesBaseDir}}
+      ParallelClusterVersion: {{ParallelClusterVersion}}
+      SlurmBaseDir: {{SlurmBaseDir}}
+      SlurmConfigDir: {{SlurmConfigDir}}
+      SlurmVersion: {{SlurmVersion}}
+
+- name: Fix permissions on config dir so users can access it.
+  file:
+    path: "{{SlurmConfigDir}}"
+    state: directory
+    owner: root
+    group: root
+    mode: 0755
+
+- name: Create {{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}
+  file:
+    path: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}"
+    state: directory
+    owner: root
+    group: root
+    mode: 0755
+
+- name: Create slurm modulefile .template
+  template:
+    dest: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.template"
+    src: opt/slurm/modules/modulefiles/slurm/.template
+    owner: root
+    group: root
+    mode: 0664
+    force: yes
+
+- name: Create slurm modulefile
+  file:
+    path: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/{{ParallelClusterVersion}}"
+    src: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.template"
+    state: link
+    owner: root
+    group: root
+    mode: 0664
+
+- name: Create slurm modulefile .version
+  template:
+    dest: "{{ModulefilesBaseDir}}/{{distribution}}/{{distribution_major_version}}/{{Architecture}}/{{ClusterName}}/.version"
+    src: opt/slurm/modules/modulefiles/slurm/.version
+    owner: root
+    group: root
+    mode: 0664
+    force: yes

source/resources/playbooks/roles/ParallelClusterHeadNode/tasks/main.yml

Lines changed: 1 addition & 0 deletions
@@ -6,3 +6,4 @@
 - { include: config-licenses.yml, tags: licenses }
 - { include: config-slurmrestd.yml, tags: slurmrestd }
 - { include: config-sshd.yml, tags: sshd }
+- { include: config-modulefiles.yml, tags: modulefiles }
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
+#%Module1.0
+########################################
+##
+## SLURM modulefile
+##
+## Docs: https://modules.readthedocs.io/en/latest/modulefile.html
+########################################
+
+set nicename "SLURM"
+
+set thisname [ module-info name ]
+set namelen [llength [split $thisname "/"]]
+set toolname [lindex [split $thisname "/" ] $namelen-2 ]
+set version [lindex [split $thisname "/" ] end ]
+
+proc ModulesHelp { } {
+    global thisname toolname nicename
+    puts stderr "\t$thisname - loads the env for $nicename\n"
+}
+
+module-whatis "loads the env for $toolname version $version"
+
+prepend-path PATH {{SlurmBaseDir}}/bin
+
+setenv SLURM_CLUSTER_NAME {{ClusterName}}
+
+# sbatch defaults
+if { [ module-info mode load ] || [ module-info mode display ] } {
+    if { ! [ info exists ::env(SBATCH_MEM_PER_NODE) ] } {
+        setenv SBATCH_MEM_PER_NODE 100M
+        setenv SBATCH_MEM_PER_NODE_SET ""
+    }
+    if { ! ( [ info exists ::env(SBATCH_REQUEUE) ] || [ info exists ::env(SBATCH_NO_REQUEUE) ] ) } {
+        setenv SBATCH_REQUEUE ""
+        setenv SBATCH_REQUEUE_SET ""
+    }
+    if { ! [ info exists ::env(SBATCH_TIMELIMIT) ] } {
+        setenv SBATCH_TIMELIMIT "1:0:0"
+        setenv SBATCH_TIMELIMIT_SET ""
+    }
+    if { ! [ info exists ::env(SBATCH_PARTITION) ] } {
+        setenv SBATCH_PARTITION "{{DefaultPartition}}"
+        setenv SBATCH_PARTITION_SET ""
+    }
+} elseif { [ module-info mode remove ] } {
+    if { [ info exists ::env(SBATCH_MEM_PER_NODE_SET) ] } {
+        unsetenv SBATCH_MEM_PER_NODE
+        unsetenv SBATCH_MEM_PER_NODE_SET
+    }
+    if { [ info exists ::env(SBATCH_REQUEUE_SET) ] } {
+        unsetenv SBATCH_REQUEUE
+        unsetenv SBATCH_REQUEUE_SET
+    }
+    if { [ info exists ::env(SBATCH_TIMELIMIT_SET) ] } {
+        unsetenv SBATCH_TIMELIMIT
+        unsetenv SBATCH_TIMELIMIT_SET
+    }
+    if { [ info exists ::env(SBATCH_PARTITION_SET) ] } {
+        unsetenv SBATCH_PARTITION
+        unsetenv SBATCH_PARTITION_SET
+    }
+}
+
+# srun defaults
+if { [ module-info mode load ] || [ module-info mode display ] } {
+    if { ! [ info exists ::env(SLURM_CPUS_PER_TASK) ] } {
+        setenv SLURM_CPUS_PER_TASK 1
+        setenv SLURM_CPUS_PER_TASK_SET ""
+    }
+    if { ! [ info exists ::env(SLURM_MEM_PER_NODE) ] } {
+        setenv SLURM_MEM_PER_NODE 100M
+        setenv SLURM_MEM_PER_NODE_SET ""
+    }
+    if { ! [ info exists ::env(SLURM_PARTITION) ] } {
+        setenv SLURM_PARTITION "{{DefaultPartition}}"
+        setenv SLURM_PARTITION_SET ""
+    }
+    if { ! [ info exists ::env(SLURM_TIMELIMIT) ] } {
+        setenv SLURM_TIMELIMIT "1:0:0"
+        setenv SLURM_TIMELIMIT_SET ""
+    }
+} elseif { [ module-info mode remove ] } {
+    if { [ info exists ::env(SLURM_CPUS_PER_TASK_SET) ] } {
+        unsetenv SLURM_CPUS_PER_TASK
+        unsetenv SLURM_CPUS_PER_TASK_SET
+    }
+    if { [ info exists ::env(SLURM_MEM_PER_NODE_SET) ] } {
+        unsetenv SLURM_MEM_PER_NODE
+        unsetenv SLURM_MEM_PER_NODE_SET
+    }
+    if { [ info exists ::env(SLURM_PARTITION_SET) ] } {
+        unsetenv SLURM_PARTITION
+        unsetenv SLURM_PARTITION_SET
+    }
+    if { [ info exists ::env(SLURM_TIMELIMIT_SET) ] } {
+        unsetenv SLURM_TIMELIMIT
+        unsetenv SLURM_TIMELIMIT_SET
+    }
+}
+
+# squeue defaults
+#
+# Output format
+# If SQUEUE_FORMAT or SQUEUE_FORMAT2 is already set then don't change it.
+# This is to avoid overwriting a user's settings.
+#
+# SQUEUE_FORMAT doesn't allow all fields so prefer SQUEUE_FORMAT2
+# default: "%.18i %9P %8j %8u %12T %.10M %.6D %R"
+#          JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
+# --long:  "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R"
+#          JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON)
+#setenv SQUEUE_FORMAT '%.9i %.19P %.9j %.8u %.10T %R'
+#
+# SQUEUE_FORMAT2 has all available fields
+# BatchHost: Similar to NodeList
+# ClusterFeature
+#
+# Priority
+# Reason
+#
+# TimeUsed
+# tres-alloc
+# UserId
+if { [ module-info mode load ] || [ module-info mode display ] } {
+    if { ! [ info exists ::env(SQUEUE_SORT) ] } {
+        # Sort by state, priority
+        setenv SQUEUE_SORT "t,-p"
+        setenv SQUEUE_SORT_SET ""
+    }
+    if { ! ( [ info exists ::env(SQUEUE_FORMAT) ] || [ info exists ::env(SQUEUE_FORMAT2) ] ) } {
+        setenv SQUEUE_FORMAT2 "Cluster:16 ,Partition:15 ,JobArrayId:16 ,Priority:12 ,State:11 ,UserName:8 ,Name:16 ,NumNodes:.5 ,NumCPUs:.4 ,MinMemory:.10 ,Feature:15 ,Dependency:10 ,Licenses:8 ,ReasonList:35"
+        #
+        # Time and priority information
+        #setenv SQUEUE_FORMAT2 "JobId:.6 ,Partition:9 ,State:7 ,UserName:8 ,Name:16 ,SubmitTime:16 ,PendingTime:12 ,TimeLimit:18 ,EndTime:18 ,ReasonList"
+        setenv SQUEUE_FORMAT2_SET ""
+    }
+} elseif { [ module-info mode remove ] } {
+    if { [ info exists ::env(SQUEUE_SORT_SET) ] } {
+        unsetenv SQUEUE_SORT
+        unsetenv SQUEUE_SORT_SET
+    }
+    if { [ info exists ::env(SQUEUE_FORMAT2_SET) ] } {
+        unsetenv SQUEUE_FORMAT2
+        unsetenv SQUEUE_FORMAT2_SET
+    }
+}
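The modulefile only sets sbatch/srun/squeue defaults that the user has not already set, and records what it set via the *_SET marker variables so that unloading removes just its own values. A short usage sketch on a submission host, with the distribution/architecture path segments and cluster name as placeholders:

# Make the generated modulefiles visible and load the cluster's Slurm module.
module use /opt/slurm/config/modules/modulefiles/amzn/2/x86_64
module load mycluster                      # resolves to the default version from .version

echo $SBATCH_PARTITION $SBATCH_TIMELIMIT   # defaults injected by the modulefile
sbatch --wrap "hostname"                   # submit without ssh-ing to the head node

module unload mycluster                    # unsets only the defaults the module set itself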
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#%Module1.0
+##
+## Default version of this module
+##
+set ModulesVersion "{{ParallelClusterVersion}}"
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+Role Name
+=========
+
+A brief description of the role goes here.
+
+Requirements
+------------
+
+Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required.
+
+Role Variables
+--------------
+
+A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well.
+
+Dependencies
+------------
+
+A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles.
+
+Example Playbook
+----------------
+
+Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too:
+
+    - hosts: servers
+      roles:
+         - { role: username.rolename, x: 42 }
+
+License
+-------
+
+BSD
+
+Author Information
+------------------
+
+An optional section for the role authors to include contact information, or a website (HTML is not allowed).
