-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathentrypoint.sh
executable file
·57 lines (45 loc) · 2.04 KB
/
entrypoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/bin/bash
# Example usage:
# ./entrypoint.sh <cohort_result_f> [<output_dir>] [<number_of_jobs>]
# The <cohort_result_f> is the path to the cohort-participant-machine-results.tsv file downloaded from the neurobagel query tool.
# The <output_dir> is an optional argument that specifies where the datasets should be installed.
# If not provided, the current directory will be used.
# The <number_of_jobs> is an optional argument that specifies the number of parallel jobs to run.
# If not provided, the default value of 6 will be used.
# Positional arguments (see the usage notes at the top of the file).
# Path to the cohort results TSV; required.
cohort_result_f=$1
# Optional arguments use ':-' so that an explicitly empty value (for example
# `./entrypoint.sh results.tsv "" 8`) falls back to the default as well,
# instead of producing an empty output directory or an empty -j value.
output_dir=${2:-.}
jobs=${3:-6}
#######################################
# Run `datalad get` for one path inside one dataset.
# Arguments:
#   $1 - a single string "<dataset_dir> <path>". It is split on the FIRST
#        space only, so <path> may itself contain spaces (<dataset_dir> may
#        not). If $1 contains no space, both parts are the whole string.
# Returns:
#   The exit status of `datalad get`.
#######################################
getdata() {
  # Locals keep these names out of the calling shell's namespace.
  local ds_dir target
  ds_dir=${1%% *}   # everything before the first space
  target=${1#* }    # everything after the first space
  datalad get --dataset "$ds_dir" "$target"
}
# Export getdata so that child shells (such as the ones running the
# "getdata {}" jobs started through parallel) can call it.
export -f getdata
# Create the output directory if needed. The path is quoted so that it may
# contain spaces; `mkdir -p` succeeds when the directory already exists.
# Everything below writes its logs and datasets into this directory, so stop
# here if it cannot be created.
mkdir -p -- "${output_dir}" || {
  echo "Could not create output directory '${output_dir}'." >&2
  exit 1
}
# dataset installations
# Pipeline: drop the header row of the cohort file (tail -n +2), keep its first
# two tab-separated columns (cut -f1,2), de-duplicate the resulting rows
# (sort | uniq) and hand them to parallel, which is invoked with -j"${jobs}".
# Inside the job command, {} is parallel's placeholder for the input row; the
# job splits it again into ds_full_name (column 1) and ds_url (column 2),
# announces the install, and runs `datalad install` on ds_url from a subshell
# that has cd'ed into the output directory.
# Expansions written as \$... are left for the job's shell to evaluate, whereas
# ${output_dir} is not escaped and is therefore substituted by this script
# when the command string is built.
# parallel writes its job log to ${output_dir}/parallel.log; its stdout and
# stderr are merged (2>&1) and appended by tee to ${output_dir}/parallel.outs
# while also being passed through to this script's stdout.
# NOTE(review): nothing follows the trailing `:::`, so the rows are presumably
# meant to arrive through the pipe on stdin - confirm how the installed
# parallel version treats an empty `:::`.
tail -n +2 "$cohort_result_f" | cut -f1,2 | sort | uniq | parallel -j"${jobs}" --joblog "${output_dir}/parallel.log" "
ds_full_name=\$(cut -f1 <<< {})
ds_url=\$(cut -f2 <<< {})
# NOTE: ds_full_name and ds_url references must be unbraced to ensure they aren't expanded in the parent shell
echo \"Will now install '\$ds_full_name' from '\$ds_url'.\"
(
cd \"${output_dir}\" || exit
datalad install \"\$ds_url\"
)
" ::: 2>&1 | tee -a "${output_dir}/parallel.outs"
# session processing

#######################################
# Turn cohort rows into "<dataset_dir> <session_path>" lines, the format that
# getdata takes as its single argument.
# Globals:
#   output_dir (read) - prefix for both emitted paths
# Inputs:
#   stdin - the cohort results TSV, header row included
# Outputs:
#   stdout - one line per row whose 6th tab-separated column is non-empty.
#            That column is split on "/": field 2 is used as the dataset
#            directory name, fields 3 onwards as the path inside the dataset.
#######################################
emit_session_targets() {
  local row ses_path ds_name ses_subpath
  # Skip the header row of the cohort results file
  IFS= read -r _
  # Ensure last line is read even if file does not end with newline
  # See: https://stackoverflow.com/a/12916758
  # IFS= stops read from stripping leading/trailing tabs; without it a row
  # whose first column is empty would have all its columns shifted by one.
  while IFS= read -r row || [ -n "$row" ]; do
    ses_path=$(cut -f6 <<< "$row")
    if [ -n "$ses_path" ]; then
      ds_name=$(cut -d "/" -f2 <<< "$ses_path")
      ses_subpath=$(cut -d "/" -f3- <<< "$ses_path")
      printf '%s\n' "${output_dir}/${ds_name} ${output_dir}/${ds_name}/${ses_subpath}"
    fi
  done
}

# Run getdata once per emitted line, with parallel invoked as -j"${jobs}".
# tee -a appends: the dataset installation phase earlier in this script has
# already written its output to parallel.outs, and a plain tee would truncate it.
# NOTE(review): --joblog points at the same parallel.log as the installation
# phase, so that phase's job log is replaced here - confirm whether the two
# phases should log to separate files.
{
  emit_session_targets \
    | parallel -j"${jobs}" --joblog "${output_dir}/parallel.log" "getdata {}" ::: 2>&1 \
    | tee -a "${output_dir}/parallel.outs"
} < "$cohort_result_f"
echo "Finished getting all files for the matching subject(s)/session(s) from DataLad."