1
1
#!/usr/bin/python3 env
2
-
3
2
#standard libraries
4
3
import os
5
4
import glob
6
5
import yaml
7
6
import argparse
8
7
from argparse import HelpFormatter
9
8
10
-
11
9
class CustomFormat (HelpFormatter ):
12
10
13
11
'''
14
12
custom help format
15
13
'''
16
14
17
15
def _format_action_invocation (self , action ):
18
-
19
16
if not action .option_strings :
20
-
21
17
default = self ._get_default_metavar_for_positional (action )
22
18
metavar , = self ._metavar_formatter (action , default )(1 )
23
-
24
19
return metavar
25
-
26
20
else :
27
-
28
21
parts = []
29
-
30
22
if action .nargs == 0 :
31
-
32
23
parts .extend (action .option_strings )
33
-
34
24
else :
35
-
36
25
default = self ._get_default_metavar_for_optional (action )
37
26
args_string = self ._format_args (action , default )
38
-
39
27
for option_string in action .option_strings :
40
-
41
28
parts .append (option_string )
42
-
43
29
return '%s %s' % (', ' .join (parts ), args_string )
44
-
45
30
return ', ' .join (parts )
46
31
47
32
def _get_default_metavar_for_optional (self , action ):
48
-
49
33
return action .dest .upper ()
50
34
51
35
52
36
def default_parameters (args ):
53
37
54
38
d = dict ()
55
-
56
39
#bwa-mem2
57
40
d ['bwa-mem2' ]= dict ()
58
41
d ['bwa-mem2' ]['threads' ] = args .aln_threads
59
42
d ['bwa-mem2' ]['mem_mb' ] = args .aln_memory
60
43
d ['bwa-mem2' ]['time' ] = args .aln_time
61
-
62
44
#bwa-mem
63
45
d ['bwa' ]= dict ()
64
46
d ['bwa' ]['threads' ] = args .aln_threads
65
47
d ['bwa' ]['mem_mb' ] = args .aln_memory
66
48
d ['bwa' ]['time' ] = args .aln_time
67
-
68
49
#minimap2
69
50
d ['minimap2' ]= dict ()
70
51
d ['minimap2' ]['threads' ] = args .aln_threads
71
52
d ['minimap2' ]['mem_mb' ] = args .aln_memory
72
53
d ['minimap2' ]['time' ] = args .aln_time
73
54
d ['minimap2' ]['preset' ] = args .aln_preset
74
-
75
55
#samtools
76
56
d ['samtools' ]= dict ()
77
57
d ['samtools' ]['threads' ] = args .sam_threads
78
58
d ['samtools' ]['mem_mb' ] = args .sam_memory
79
59
d ['samtools' ]['time' ] = args .sam_time
80
-
81
60
#pggb
82
61
d ['pggb' ]= dict ()
83
62
d ['pggb' ]['threads' ] = args .pggb_threads
84
63
d ['pggb' ]['mem_mb' ] = args .pggb_memory
85
64
d ['pggb' ]['time' ] = args .pggb_time
86
65
d ['pggb' ]['tmpdir' ] = args .pggb_tmpdir
87
66
d ['pggb' ]['params' ] = args .pggb_params
88
-
89
67
#wfmash
90
68
d ['wfmash' ]= dict ()
91
69
d ['wfmash' ]['threads' ] = args .wfmash_threads
92
70
d ['wfmash' ]['mem_mb' ] = args .wfmash_memory
93
71
d ['wfmash' ]['time' ] = args .wfmash_time
94
72
d ['wfmash' ]['tmpdir' ] = args .wfmash_tmpdir
95
73
d ['wfmash' ]['params' ] = args .wfmash_params
96
-
97
-
98
74
#default
99
75
d ['default' ]= dict ()
100
76
d ['default' ]['mem_mb' ] = args .std_memory
101
77
d ['default' ]['time' ] = args .std_time
102
-
103
78
#output
104
79
d ['output' ] = args .output
105
-
106
80
return d
107
81
108
82
@@ -113,71 +87,59 @@ def main():
113
87
'''
114
88
115
89
parser = argparse .ArgumentParser (prog = 'organize.py' , description = '''COsine SImilarity-based GenoTyper''' , epilog = '''Developed by Davide Bolognini @ Human Technopole''' , formatter_class = CustomFormat )
116
-
90
+ #required
117
91
required = parser .add_argument_group ('Required I/O arguments' )
118
-
119
92
required .add_argument ('-a' , '--alignments' , help = 'folder with read-level alignment files (BAM,CRAM) - and their indexes (BAI/CSI,CRAI) - of the individuals to genotype' , metavar = 'FOLDER' , required = True )
120
93
required .add_argument ('-r' ,'--reference' , help = 'reference FASTA file - the same the individuals to genotype are aligned to' , metavar = 'FASTA' , required = True )
121
94
required .add_argument ('--assemblies' , help = 'chromosome-level assemblies in FASTA format' , metavar = 'FASTA' , required = True )
122
95
required .add_argument ('--roi' , help = 'one or more regions of interest in BED format - first column is the assembly to use as reference (PanSN format, # delimiter)' , metavar = 'BED' , required = True )
123
-
96
+ #additional
124
97
additional = parser .add_argument_group ('Additional I/O arguments' )
125
-
126
98
additional .add_argument ('--blacklist' , help = 'assemblies (one per line) that should not be included in the analysis [None]' , metavar = '' , required = False , default = None )
127
99
additional .add_argument ('--binds' , help = 'additional paths to bind for singularity in /path/1,/path/2 format [/localscratch]' , type = str , default = '/localscratch' )
128
100
additional .add_argument ('--tmp' , help = 'SINGULARITY TMPDIR [/tmp]' , type = str , default = '/tmp' )
129
101
additional .add_argument ('--output' , help = 'output folder [results]' , metavar = 'FOLDER' , default = 'results' )
130
- additional .add_argument ('--profile' , help = 'use profile. If "None", do not use profile and run on the local machine [config/slurm]' , metavar = 'FOLDER' , default = 'config/slurm' )
131
-
102
+ additional .add_argument ('--profile' , help = 'use profile. If None, do not use profile and run on the local machine [config/slurm]' , metavar = 'FOLDER' , default = 'config/slurm' )
103
+ additional .add_argument ('--samplemap' , help = 'tsv file mapping each bam/cram basename to a user-defined id. If None, infer from bam/cram basename [None]' , metavar = 'TSV' , type = str , default = None )
104
+ #metrics
132
105
metrics = parser .add_argument_group ('Specify #threads, memory and time requirements' )
133
-
134
- #default
135
106
metrics .add_argument ('--std_time' , help = 'max time (minutes) - default [1]' ,type = int , default = 1 )
136
107
metrics .add_argument ('--std_memory' , help = 'memory (mb) - default [500]' ,type = int , default = 500 )
137
-
138
108
#alignment
139
109
metrics .add_argument ('--aln_threads' , help = '# threads - aligner [5]' ,type = int , default = 5 )
140
110
metrics .add_argument ('--aln_time' , help = 'max time (minutes) - aligner [2]' ,type = int , default = 5 )
141
111
metrics .add_argument ('--aln_memory' , help = 'max memory (mb) - aligner [5000]' ,type = int , default = 5000 )
142
112
metrics .add_argument ('--aln_preset' , help = 'preset for minimap2 [map-ont] - ignore if not using the long branch of cosigt' , type = str , default = 'map-ont' )
143
-
144
113
#samtools
145
114
metrics .add_argument ('--sam_threads' , help = '# threads - samtools (view) [2]' ,type = int , default = 2 )
146
115
metrics .add_argument ('--sam_time' , help = 'max time (minutes) - samtools (view) [5]' ,type = int , default = 5 )
147
116
metrics .add_argument ('--sam_memory' , help = 'max memory (mb) - samtools (view) [5000]' ,type = int , default = 5000 )
148
-
149
117
#pggb
150
118
metrics .add_argument ('--pggb_threads' , help = '# threads - pggb [24]' ,type = int , default = 24 )
151
119
metrics .add_argument ('--pggb_time' , help = 'max time (minutes) - pggb [35]' ,type = int , default = 35 )
152
120
metrics .add_argument ('--pggb_memory' , help = 'max memory (mb) - pggb [30000]' ,type = int , default = 30000 )
153
121
metrics .add_argument ('--pggb_params' , help = 'additional parameters for pggb [-c 2]' ,type = str , default = '-c 2' )
154
122
metrics .add_argument ('--pggb_tmpdir' , help = 'temporary directory - pggb [working directory]' ,type = str , default = os .getcwd ())
155
-
156
123
#wfmash
157
124
metrics .add_argument ('--wfmash_threads' , help = '# threads - wfmash [24]' ,type = int , default = 24 )
158
125
metrics .add_argument ('--wfmash_time' , help = 'max time (minutes) - wfmash [35]' ,type = int , default = 35 )
159
126
metrics .add_argument ('--wfmash_memory' , help = 'max memory (mb) - wfmash [30000]' ,type = int , default = 30000 )
160
127
metrics .add_argument ('--wfmash_params' , help = 'additional parameters for wfmash [-s 10k -p 95]' ,type = str , default = '-s 10k -p 95' )
161
128
metrics .add_argument ('--wfmash_tmpdir' , help = 'temporary directory - wfmash [working directory]' ,type = str , default = os .getcwd ())
162
-
129
+ #parse args
163
130
args = parser .parse_args ()
164
131
args .profile = None if args .profile == 'None' else args .profile
165
-
166
132
#wd
167
133
wd = os .getcwd ()
168
-
169
134
#default parameters
170
135
d = default_parameters (args )
171
-
172
136
#create all the output paths
173
-
174
137
#config
175
138
out_config_path = os .path .join (wd ,'config' )
176
139
os .makedirs (out_config_path ,exist_ok = True )
177
140
out_yaml_tmp = os .path .join (out_config_path , 'config.yaml.tmp' )
178
141
out_yaml = os .path .join (out_config_path , 'config.yaml' )
179
142
out_samples = os .path .join (out_config_path , 'samples.tsv' )
180
-
181
143
#resources
182
144
out_resources = os .path .join (wd ,'resources' )
183
145
out_aln = os .path .join (out_resources , 'alignments' )
@@ -191,155 +153,99 @@ def main():
191
153
blcklst_out = os .path .join (out_extra , 'blacklist.txt' )
192
154
out_regions = os .path .join (out_resources , 'regions' )
193
155
os .makedirs (out_regions ,exist_ok = True )
194
-
195
-
196
156
#blacklist of samples to exclude
197
157
blcklst = []
198
-
199
158
if args .blacklist is not None :
200
-
201
159
with open (args .blacklist , 'r' ) as bad_samples_in , open (blcklst_out , 'w' ) as bad_samples_out :
202
-
203
160
for line in bad_samples_in :
204
-
205
161
blcklst .append (line .rstrip ())
206
162
bad_samples_out .write (line )
207
-
208
163
else :
209
-
210
164
open (blcklst_out , 'w' ).close ()
211
-
212
165
#symlink alignments
213
166
alns = sorted ([x for x in glob .glob (args .alignments + '/**/*am*' , recursive = True ) if os .path .isfile (x )])
214
-
167
+ samplesmap = dict ()
168
+ if args .samplemap is not None :
169
+ with open (args .samplemap , 'r' ) as samples_in :
170
+ for line in samples_in :
171
+ sid ,sname = line .rstrip ().split ('\t ' )
172
+ samplesmap [sid ]= sname
215
173
with open (out_samples , 'w' ) as samples_out :
216
-
217
- samples_out .write ('sample_id\t alignments\n ' )
218
-
174
+ samples_out .write ('sample_id\t alignment\n ' )
219
175
for aln in alns :
220
-
221
176
bnaln = os .path .basename (aln )
222
177
out_aln_file = os .path .join (out_aln , bnaln )
223
-
224
178
try :
225
-
226
179
os .symlink (os .path .abspath (aln ), out_aln_file ) #error out if this exists
227
-
228
180
except :
229
-
230
181
pass #do not symlink again if exists
231
-
232
182
if aln .endswith ('am' ): #this is not an index, rather a true alignment
233
-
234
- sample_name = '.' .join (bnaln .split ('.' )[:- 1 ])
235
- samples_out .write (sample_name + '\t ' + out_aln_file + '\n ' )
236
-
183
+ if args .samplemap is None :
184
+ sample_name = '.' .join (bnaln .split ('.' )[:- 1 ])
185
+ samples_out .write (sample_name + '\t ' + out_aln_file + '\n ' )
186
+ else :
187
+ samples_out .write (samplesmap [bnaln ] + '\t ' + out_aln_file + '\n ' )
237
188
#add to config
238
189
d ['samples' ] = out_samples
239
-
240
-
241
- #symlink assemblies
242
-
190
+ #symlink assemblies
243
191
out_assemblies_file = os .path .join (out_fasta , os .path .basename (args .assemblies ))
244
-
245
192
try :
246
-
247
193
os .symlink (os .path .abspath (args .assemblies ), out_assemblies_file )
248
-
249
194
except :
250
-
251
195
pass
252
-
253
196
#add to config
254
197
d ['assemblies' ] = out_assemblies_file
255
-
256
198
#symlink reference
257
199
out_reference_file = os .path .join (out_ref , os .path .basename (args .reference ))
258
-
259
200
try :
260
-
261
201
os .symlink (os .path .abspath (args .reference ), out_reference_file )
262
-
263
202
except :
264
-
265
203
pass
266
-
267
204
#add to config
268
205
d ['reference' ] = out_reference_file
269
-
270
206
#add to config
271
207
d ['region' ] = list ()
272
208
d ['path' ] = ''
273
-
274
209
with open (args .roi ) as bed_in :
275
-
276
210
for line in bed_in :
277
-
278
211
l = line .rstrip ().split ('\t ' )
279
-
280
212
region = l [0 ].replace ('#' ,'_' ) + '_' + l [1 ] + '_' + l [2 ]
281
213
d ['path' ] = l [0 ] if d ['path' ] == '' else d ['path' ]
282
-
283
214
#put regions in the config fille
284
215
d ['region' ].append (region )
285
-
286
216
#also write in the dedicated space
287
217
region_out = os .path .join (out_regions , region + '.bed' )
288
-
289
218
with open (region_out , 'w' ) as out_region :
290
-
291
219
out_region .write (l [0 ] + '\t ' + l [1 ] + '\t ' + l [2 ]+ '\n ' )
292
-
293
-
294
220
#dump config
295
221
yml_out = open (out_yaml_tmp , 'w' )
296
222
yaml .dump (d ,yml_out )
297
223
yml_out .close ()
298
-
299
224
#remove single quotes
300
225
with open (out_yaml_tmp ) as filein , open (out_yaml , 'w' ) as fileout :
301
-
302
226
for line in filein :
303
-
304
227
line = line .replace ("'" ,"" )
305
228
fileout .write (line )
306
-
307
229
os .remove (out_yaml_tmp )
308
-
309
230
#write command - singularity
310
231
singpath = ',' .join (list (set ([os .path .abspath (args .alignments ),os .path .dirname (os .path .abspath (args .assemblies )), os .path .dirname (os .path .abspath (args .reference )),args .binds , os .path .abspath (args .pggb_tmpdir ), os .path .abspath (args .wfmash_tmpdir )])))
311
-
312
232
if args .profile is not None :
313
-
314
233
command_singularity_out = 'SINGULARITY_TMPDIR=' + os .path .abspath (args .tmp ) + ' snakemake --profile ' + args .profile + ' --singularity-args "-B ' + singpath + ' -e" cosigt'
315
-
316
234
with open ('snakemake.singularity.profile.run.sh' , 'w' ) as out :
317
-
318
235
out .write (command_singularity_out + '\n ' )
319
-
320
236
#write command - conda
321
237
command_conda_out = 'snakemake --profile ' + args .profile + ' --use-conda cosigt'
322
-
323
238
with open ('snakemake.conda.profile.run.sh' , 'w' ) as out :
324
-
325
239
out .write (command_conda_out + '\n ' )
326
-
327
240
else :
328
-
329
241
command_singularity_out = 'SINGULARITY_TMPDIR=' + os .path .abspath (args .tmp ) + ' snakemake --singularity-args "-B ' + singpath + ' -e" cosigt'
330
-
331
242
with open ('snakemake.singularity.run.sh' , 'w' ) as out :
332
-
333
243
out .write (command_singularity_out + '\n ' )
334
-
335
244
#write command - conda
336
245
command_conda_out = 'snakemake --use-conda cosigt'
337
-
338
246
with open ('snakemake.conda.run.sh' , 'w' ) as out :
339
-
340
247
out .write (command_conda_out + '\n ' )
341
248
342
-
343
249
if __name__ == '__main__' :
344
250
345
251
main ()
0 commit comments