#!/usr/bin/env bash
##**************************************************************
##
## Copyright (C) 1990-2018, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
##
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License. You may
## obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************
# This script runs OpenMPI jobs under the HTCondor parallel universe.
# It assumes that a full OpenMPI install is available on all execute nodes.
## sample submit script
#universe = parallel
#executable = openmpiscript
#arguments = actual_mpi_job arg1 arg2 arg3
#getenv = true
#
#should_transfer_files = yes
#transfer_input_files = actual_mpi_job
#when_to_transfer_output = on_exit_or_evict
#
#output = out.$(NODE)
#error = err.$(NODE)
#log = log
#
#machine_count = 8
#queue
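## For example, if the submit description above were saved in a file named
## mpi.sub (the filename is illustrative), the job would be submitted with:
#   condor_submit mpi.sub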
##
## configuration options
# $USE_OPENMP should be set to true if using OpenMP with your OpenMPI executable (not typical).
USE_OPENMP=false
# Set the paths to the helper scripts
# Get them from the HTCondor libexec directory
ORTED_LAUNCHER=$(condor_config_val libexec)/orted_launcher.sh
GET_ORTED_CMD=$(condor_config_val libexec)/get_orted_cmd.sh
# Or set a custom path (e.g. the local directory if transferring the scripts)
#ORTED_LAUNCHER=./orted_launcher.sh
#GET_ORTED_CMD=./get_orted_cmd.sh
# $MPDIR points to the location of the OpenMPI install
# The pool admin may set it via OPENMPI_INSTALL_PATH in the condor_config (recommended)
MPDIR=/usr
#MPDIR=$(condor_config_val OPENMPI_INSTALL_PATH)
# Or set it manually
#MPDIR=/usr/lib64/openmpi
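# For example, the pool admin could add a line like this to the condor_config
# (the install path shown is illustrative):
#OPENMPI_INSTALL_PATH = /usr/lib64/openmpi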
# $EXINT is a comma-delimited list of excluded network interfaces.
# If your mpi jobs are hanging, OpenMPI may be trying to use too many
# network interfaces to communicate between nodes.
# The pool admin may set it via OPENMPI_EXCLUDE_NETWORK_INTERFACES in the condor_config (recommended)
EXINT=$(condor_config_val OPENMPI_EXCLUDE_NETWORK_INTERFACES)
# Or set it manually
#EXINT="docker0,virbr0"
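# For example, the pool admin could add a line like this to the condor_config
# (the interface names are illustrative):
#OPENMPI_EXCLUDE_NETWORK_INTERFACES = docker0,virbr0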
##
## configuration check
# We recommend that your pool admin use MOUNT_UNDER_SCRATCH = /tmp
# so that OpenMPI caches all data under the user's scratch directory.
# Not having /tmp mounted under scratch may hang mpi jobs.
_USE_SCRATCH=$(condor_config_val MOUNT_UNDER_SCRATCH)
if [ -z "$_USE_SCRATCH" ]; then
    >&2 echo "WARNING: MOUNT_UNDER_SCRATCH not set in condor_config"
elif test "${_USE_SCRATCH#*/tmp}" == "$_USE_SCRATCH"; then
    >&2 echo "WARNING: /tmp not included in MOUNT_UNDER_SCRATCH"
fi
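# For example, the condor_config on the execute nodes might contain
# (a sketch; the exact list of mounts depends on the pool):
#MOUNT_UNDER_SCRATCH = /tmp, /var/tmp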
# If MPDIR is not set, then use a default value
if [ -z "$MPDIR" ]; then
    >&2 echo "WARNING: Using default value for \$MPDIR in openmpiscript"
    MPDIR=/usr/lib64/openmpi
fi
PATH=$MPDIR/bin:.:$PATH
export PATH
# If EXINT is not set, then use some default values
if [ -z "$EXINT" ]; then
    >&2 echo "WARNING: Using default values for \$EXINT in openmpiscript"
    EXINT="docker0,virbr0"
fi
##
## cleanup function
_orted_launcher_pid=0
_mpirun_pid=0
CONDOR_CHIRP=$(condor_config_val libexec)/condor_chirp
force_cleanup() {
    # Forward SIGTERM to the orted launcher
    if [ $_orted_launcher_pid -ne 0 ]; then
        kill -s SIGTERM $_orted_launcher_pid
    fi
    # Clean up mpirun on the head node
    if [ $_CONDOR_PROCNO -eq 0 ] && [ $_mpirun_pid -ne 0 ]; then
        $CONDOR_CHIRP ulog "Node $_CONDOR_PROCNO caught SIGTERM, cleaning up mpirun"
        rm "$HOSTFILE"
        # Send SIGTERM to mpirun (the orted launcher was already signaled above)
        kill -s SIGTERM $_mpirun_pid
        # Give mpirun 30 seconds to terminate nicely
        for i in {1..30}; do
            kill -0 $_mpirun_pid 2> /dev/null # returns 0 if mpirun is still running
            _mpirun_killed=$?
            if [ $_mpirun_killed -ne 0 ]; then
                break
            fi
            sleep 1
        done
        # If mpirun is still running, send SIGKILL
        if [ $_mpirun_killed -eq 0 ]; then
            $CONDOR_CHIRP ulog "mpirun hung on Node ${_CONDOR_PROCNO}, sending SIGKILL!"
            kill -s SIGKILL $_mpirun_pid
        fi
    fi
    exit 1
}
trap force_cleanup SIGTERM
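# (HTCondor delivers SIGTERM to the job when it is removed, held, or evicted,
# so the trap above gives mpirun a chance to shut down cleanly.)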
##
## execute node setup
export PATH=$MPDIR/bin:$PATH
# Run the orted launcher (gets orted command from condor_chirp)
$ORTED_LAUNCHER &
_orted_launcher_pid=$!
if [ $_CONDOR_PROCNO -ne 0 ]; then
    # If not on node 0, wait for the orted launcher and exit with its status
    wait $_orted_launcher_pid
    exit $?
fi
##
## head node (node 0) setup
# Build the hostfile
HOSTFILE=hosts
while [ -f "$_CONDOR_SCRATCH_DIR/$HOSTFILE" ]; do # avoid clobbering an existing file
    HOSTFILE=x$HOSTFILE
done
HOSTFILE=$_CONDOR_SCRATCH_DIR/$HOSTFILE
REQUEST_CPUS=$(condor_q -jobads "$_CONDOR_JOB_AD" -af RequestCpus)
for node in $(seq 0 $(( $_CONDOR_NPROCS - 1 ))); do
    if $USE_OPENMP; then
        # OpenMP will do the threading on the execute node
        echo "$node slots=1" >> "$HOSTFILE"
    else
        # OpenMPI will do the threading on the execute node
        echo "$node slots=$REQUEST_CPUS" >> "$HOSTFILE"
    fi
done
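# For example, with machine_count = 8 in the submit file, request_cpus = 2
# (an illustrative value), and USE_OPENMP=false, the hostfile would contain:
#   0 slots=2
#   1 slots=2
#   ...
#   7 slots=2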
# Make sure the executable is executable
EXECUTABLE=$1
shift
chmod +x "$EXECUTABLE"
##
## run mpirun
# Set MCA values for running on HTCondor
export OMPI_MCA_plm_rsh_agent=$GET_ORTED_CMD # use the helper script instead of ssh
export OMPI_MCA_plm_rsh_no_tree_spawn=1 # disable ssh tree spawn
export OMPI_MCA_orte_hetero_nodes=1 # do not assume same hardware on each node
export OMPI_MCA_orte_startup_timeout=120 # allow two minutes before failing
export OMPI_MCA_hwloc_base_binding_policy="none" # do not bind to cpu cores
export OMPI_MCA_btl_tcp_if_exclude="lo,$EXINT" # exclude unused tcp network interfaces
# Optional MCA values to set for firewalled setups
#export OMPI_MCA_btl_tcp_port_min_v4=1024 # lowest port number that can be used
#export OMPI_MCA_btl_tcp_port_range_v4=64511 # range of ports above lowest that can be used
# Optionally set MCA values for increasing mpirun verbosity per component
# (see ompi_info for more components)
#export OMPI_MCA_plm_base_verbose=30
#export OMPI_MCA_orte_base_verbose=30
#export OMPI_MCA_hwloc_base_verbose=30
#export OMPI_MCA_btl_base_verbose=30
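# For example, to list the tunable parameters of the tcp btl component before
# adjusting the values above, one could run (depending on the OpenMPI version,
# a --level flag may also be needed to show all parameters):
#   ompi_info --param btl tcp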
# Run mpirun in the background and wait for it to exit
mpirun -v --prefix "$MPDIR" -hostfile "$HOSTFILE" "$EXECUTABLE" "$@" &
_mpirun_pid=$!
wait $_mpirun_pid
_mpirun_exit=$?
## clean up
# Wait for orted to finish
wait $_orted_launcher_pid
rm "$HOSTFILE"
exit $_mpirun_exit