Skip to content

Commit 22cff8e

Browse files
modify organize.py, add sample-map table
1 parent 3cd6165 commit 22cff8e

File tree

2 files changed

+20
-114
lines changed

2 files changed

+20
-114
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ cosigt_smk/resources/*
1212
cosigt_smk/logs/*
1313
cosigt_smk/.snakemake/*
1414
cosigt_smk/workflow/scripts/__pycache__
15-
cosigt_smk/snakemake.run.sh
15+
cosigt_smk/snakemake*run.sh
1616
cosigt_smk/config/*
1717
!cosigt_smk/config/.gitignore
1818
cosigt_smk/benchmarks/*
+19-113
Original file line numberDiff line numberDiff line change
@@ -1,108 +1,82 @@
11
#!/usr/bin/env python3
2-
32
#standard libraries
43
import os
54
import glob
65
import yaml
76
import argparse
87
from argparse import HelpFormatter
98

10-
119
class CustomFormat(HelpFormatter):

    '''
    Custom help format.

    Overrides how an argument's invocation is rendered in the help text:
    all option strings are joined with ', ' and the argument placeholder
    is appended once, after the last option string.
    '''

    def _format_action_invocation(self, action):
        '''
        Return the invocation string displayed for ``action``.

        - positional argument: its metavar
        - option taking no value (``nargs == 0``): its option strings
          joined with ', '
        - any other option: its option strings joined with ', ', a space,
          then the formatted argument placeholder
        '''
        if not action.option_strings:
            # positional: resolve the metavar exactly as the base class does
            default = self._get_default_metavar_for_positional(action)
            metavar, = self._metavar_formatter(action, default)(1)
            return metavar

        options = ', '.join(action.option_strings)

        if action.nargs == 0:
            # flag without a value: nothing to append
            return options

        default = self._get_default_metavar_for_optional(action)
        args_string = self._format_args(action, default)
        return '%s %s' % (options, args_string)

    def _get_default_metavar_for_optional(self, action):
        '''
        Return the upper-cased destination name, used as the placeholder
        when an option does not define an explicit metavar.
        '''
        return action.dest.upper()
5034

5135

5236
def default_parameters(args):
    '''
    Build the configuration dictionary from the parsed command-line arguments.

    args: namespace exposing the aln_*, sam_*, pggb_*, wfmash_*, std_*
          and output attributes read below.

    Returns a dict with one sub-dict of resources per tool ('bwa-mem2',
    'bwa', 'minimap2', 'samtools', 'pggb', 'wfmash', 'default') plus the
    top-level 'output' entry.
    '''
    d = dict()

    #bwa-mem2, bwa-mem and minimap2 share the aligner settings;
    #each tool gets its own dict so entries can be changed independently
    for aligner in ('bwa-mem2', 'bwa', 'minimap2'):
        d[aligner] = {
            'threads': args.aln_threads,
            'mem_mb': args.aln_memory,
            'time': args.aln_time,
        }

    #the preset is only relevant to minimap2
    d['minimap2']['preset'] = args.aln_preset

    #samtools
    d['samtools'] = {
        'threads': args.sam_threads,
        'mem_mb': args.sam_memory,
        'time': args.sam_time,
    }

    #pggb
    d['pggb'] = {
        'threads': args.pggb_threads,
        'mem_mb': args.pggb_memory,
        'time': args.pggb_time,
        'tmpdir': args.pggb_tmpdir,
        'params': args.pggb_params,
    }

    #wfmash
    d['wfmash'] = {
        'threads': args.wfmash_threads,
        'mem_mb': args.wfmash_memory,
        'time': args.wfmash_time,
        'tmpdir': args.wfmash_tmpdir,
        'params': args.wfmash_params,
    }

    #default
    d['default'] = {
        'mem_mb': args.std_memory,
        'time': args.std_time,
    }

    #output
    d['output'] = args.output

    return d
10781

10882

@@ -113,71 +87,59 @@ def main():
11387
'''
11488

11589
parser = argparse.ArgumentParser(prog='organize.py', description='''COsine SImilarity-based GenoTyper''', epilog='''Developed by Davide Bolognini @ Human Technopole''', formatter_class=CustomFormat)
116-
90+
#required
11791
required = parser.add_argument_group('Required I/O arguments')
118-
11992
required.add_argument('-a', '--alignments', help='folder with read-level alignment files (BAM,CRAM) - and their indexes (BAI/CSI,CRAI) - of the individuals to genotype', metavar='FOLDER', required=True)
12093
required.add_argument('-r','--reference', help='reference FASTA file - the same the individuals to genotype are aligned to', metavar='FASTA', required=True)
12194
required.add_argument('--assemblies', help='chromosome-level assemblies in FASTA format', metavar='FASTA', required=True)
12295
required.add_argument('--roi', help='one or more regions of interest in BED format - first column is the assembly to use as reference (PanSN format, # delimiter)', metavar='BED', required=True)
123-
96+
#additional
12497
additional = parser.add_argument_group('Additional I/O arguments')
125-
12698
additional.add_argument('--blacklist', help='assemblies (one per line) that should not be included in the analysis [None]', metavar='', required=False, default=None)
12799
additional.add_argument('--binds', help='additional paths to bind for singularity in /path/1,/path/2 format [/localscratch]', type=str, default='/localscratch')
128100
additional.add_argument('--tmp', help='SINGULARITY TMPDIR [/tmp]', type=str, default='/tmp')
129101
additional.add_argument('--output', help='output folder [results]', metavar='FOLDER', default='results')
130-
additional.add_argument('--profile', help='use profile. If "None", do not use profile and run on the local machine [config/slurm]', metavar='FOLDER', default='config/slurm')
131-
102+
additional.add_argument('--profile', help='use profile. If None, do not use profile and run on the local machine [config/slurm]', metavar='FOLDER', default='config/slurm')
103+
additional.add_argument('--samplemap', help='tsv file mapping each bam/cram basename to a user-defined id. If None, infer from bam/cram basename [None]', metavar='TSV', type=str, default=None)
104+
#metrics
132105
metrics = parser.add_argument_group('Specify #threads, memory and time requirements')
133-
134-
#default
135106
metrics.add_argument('--std_time', help='max time (minutes) - default [1]',type=int, default=1)
136107
metrics.add_argument('--std_memory', help='memory (mb) - default [500]',type=int, default=500)
137-
138108
#alignment
139109
metrics.add_argument('--aln_threads', help='# threads - aligner [5]',type=int, default=5)
140110
metrics.add_argument('--aln_time', help='max time (minutes) - aligner [2]',type=int, default=5)
141111
metrics.add_argument('--aln_memory', help='max memory (mb) - aligner [5000]',type=int, default=5000)
142112
metrics.add_argument('--aln_preset', help='preset for minimap2 [map-ont] - ignore if not using the long branch of cosigt', type=str, default='map-ont')
143-
144113
#samtools
145114
metrics.add_argument('--sam_threads', help='# threads - samtools (view) [2]',type=int, default=2)
146115
metrics.add_argument('--sam_time', help='max time (minutes) - samtools (view) [5]',type=int, default=5)
147116
metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view) [5000]',type=int, default=5000)
148-
149117
#pggb
150118
metrics.add_argument('--pggb_threads', help='# threads - pggb [24]',type=int, default=24)
151119
metrics.add_argument('--pggb_time', help='max time (minutes) - pggb [35]',type=int, default=35)
152120
metrics.add_argument('--pggb_memory', help='max memory (mb) - pggb [30000]',type=int, default=30000)
153121
metrics.add_argument('--pggb_params', help='additional parameters for pggb [-c 2]',type=str, default='-c 2')
154122
metrics.add_argument('--pggb_tmpdir', help='temporary directory - pggb [working directory]',type=str, default=os.getcwd())
155-
156123
#wfmash
157124
metrics.add_argument('--wfmash_threads', help='# threads - wfmash [24]',type=int, default=24)
158125
metrics.add_argument('--wfmash_time', help='max time (minutes) - wfmash [35]',type=int, default=35)
159126
metrics.add_argument('--wfmash_memory', help='max memory (mb) - wfmash [30000]',type=int, default=30000)
160127
metrics.add_argument('--wfmash_params', help='additional parameters for wfmash [-s 10k -p 95]',type=str, default='-s 10k -p 95')
161128
metrics.add_argument('--wfmash_tmpdir', help='temporary directory - wfmash [working directory]',type=str, default=os.getcwd())
162-
129+
#parse args
163130
args = parser.parse_args()
164131
args.profile=None if args.profile == 'None' else args.profile
165-
166132
#wd
167133
wd=os.getcwd()
168-
169134
#default parameters
170135
d=default_parameters(args)
171-
172136
#create all the output paths
173-
174137
#config
175138
out_config_path=os.path.join(wd,'config')
176139
os.makedirs(out_config_path,exist_ok=True)
177140
out_yaml_tmp=os.path.join(out_config_path, 'config.yaml.tmp')
178141
out_yaml=os.path.join(out_config_path, 'config.yaml')
179142
out_samples=os.path.join(out_config_path, 'samples.tsv')
180-
181143
#resources
182144
out_resources=os.path.join(wd,'resources')
183145
out_aln=os.path.join(out_resources, 'alignments')
@@ -191,155 +153,99 @@ def main():
191153
blcklst_out=os.path.join(out_extra, 'blacklist.txt')
192154
out_regions=os.path.join(out_resources, 'regions')
193155
os.makedirs(out_regions,exist_ok=True)
194-
195-
196156
#blacklist of samples to exclude
197157
blcklst=[]
198-
199158
if args.blacklist is not None:
200-
201159
with open(args.blacklist, 'r') as bad_samples_in, open(blcklst_out, 'w') as bad_samples_out:
202-
203160
for line in bad_samples_in:
204-
205161
blcklst.append(line.rstrip())
206162
bad_samples_out.write(line)
207-
208163
else:
209-
210164
open(blcklst_out, 'w').close()
211-
212165
#symlink alignments
213166
alns=sorted([x for x in glob.glob(args.alignments + '/**/*am*', recursive=True) if os.path.isfile(x)])
214-
167+
samplesmap=dict()
168+
if args.samplemap is not None:
169+
with open(args.samplemap, 'r') as samples_in:
170+
for line in samples_in:
171+
sid,sname=line.rstrip().split('\t')
172+
samplesmap[sid]=sname
215173
with open(out_samples, 'w') as samples_out:
216-
217-
samples_out.write('sample_id\talignments\n')
218-
174+
samples_out.write('sample_id\talignment\n')
219175
for aln in alns:
220-
221176
bnaln=os.path.basename(aln)
222177
out_aln_file=os.path.join(out_aln, bnaln)
223-
224178
try:
225-
226179
os.symlink(os.path.abspath(aln), out_aln_file) #error out if this exists
227-
228180
except:
229-
230181
pass #do not symlink again if exists
231-
232182
if aln.endswith('am'): #this is not an index, rather a true alignment
233-
234-
sample_name='.'.join(bnaln.split('.')[:-1])
235-
samples_out.write(sample_name + '\t' + out_aln_file + '\n')
236-
183+
if args.samplemap is None:
184+
sample_name='.'.join(bnaln.split('.')[:-1])
185+
samples_out.write(sample_name + '\t' + out_aln_file + '\n')
186+
else:
187+
samples_out.write(samplesmap[bnaln] + '\t' + out_aln_file + '\n')
237188
#add to config
238189
d['samples'] = out_samples
239-
240-
241-
#symlink assemblies
242-
190+
#symlink assemblies
243191
out_assemblies_file=os.path.join(out_fasta, os.path.basename(args.assemblies))
244-
245192
try:
246-
247193
os.symlink(os.path.abspath(args.assemblies), out_assemblies_file)
248-
249194
except:
250-
251195
pass
252-
253196
#add to config
254197
d['assemblies'] = out_assemblies_file
255-
256198
#symlink reference
257199
out_reference_file=os.path.join(out_ref, os.path.basename(args.reference))
258-
259200
try:
260-
261201
os.symlink(os.path.abspath(args.reference), out_reference_file)
262-
263202
except:
264-
265203
pass
266-
267204
#add to config
268205
d['reference'] = out_reference_file
269-
270206
#add to config
271207
d['region'] = list()
272208
d['path'] = ''
273-
274209
with open(args.roi) as bed_in:
275-
276210
for line in bed_in:
277-
278211
l=line.rstrip().split('\t')
279-
280212
region=l[0].replace('#','_') + '_' + l[1] + '_' + l[2]
281213
d['path'] = l[0] if d['path'] == '' else d['path']
282-
283214
#put regions in the config file
284215
d['region'].append(region)
285-
286216
#also write in the dedicated space
287217
region_out=os.path.join(out_regions, region+'.bed')
288-
289218
with open(region_out, 'w') as out_region:
290-
291219
out_region.write(l[0] + '\t' + l[1] + '\t' + l[2]+'\n')
292-
293-
294220
#dump config
295221
yml_out=open(out_yaml_tmp, 'w')
296222
yaml.dump(d,yml_out)
297223
yml_out.close()
298-
299224
#remove single quotes
300225
with open(out_yaml_tmp) as filein, open(out_yaml, 'w') as fileout:
301-
302226
for line in filein:
303-
304227
line=line.replace("'","")
305228
fileout.write(line)
306-
307229
os.remove(out_yaml_tmp)
308-
309230
#write command - singularity
310231
singpath=','.join(list(set([os.path.abspath(args.alignments),os.path.dirname(os.path.abspath(args.assemblies)), os.path.dirname(os.path.abspath(args.reference)),args.binds, os.path.abspath(args.pggb_tmpdir), os.path.abspath(args.wfmash_tmpdir)])))
311-
312232
if args.profile is not None:
313-
314233
command_singularity_out='SINGULARITY_TMPDIR=' + os.path.abspath(args.tmp) + ' snakemake --profile ' + args.profile + ' --singularity-args "-B '+ singpath + ' -e" cosigt'
315-
316234
with open('snakemake.singularity.profile.run.sh', 'w') as out:
317-
318235
out.write(command_singularity_out + '\n')
319-
320236
#write command - conda
321237
command_conda_out='snakemake --profile ' + args.profile + ' --use-conda cosigt'
322-
323238
with open('snakemake.conda.profile.run.sh', 'w') as out:
324-
325239
out.write(command_conda_out + '\n')
326-
327240
else:
328-
329241
command_singularity_out='SINGULARITY_TMPDIR=' + os.path.abspath(args.tmp) + ' snakemake --singularity-args "-B '+ singpath + ' -e" cosigt'
330-
331242
with open('snakemake.singularity.run.sh', 'w') as out:
332-
333243
out.write(command_singularity_out + '\n')
334-
335244
#write command - conda
336245
command_conda_out='snakemake --use-conda cosigt'
337-
338246
with open('snakemake.conda.run.sh', 'w') as out:
339-
340247
out.write(command_conda_out + '\n')
341248

342-
343249
if __name__ == '__main__':
344250

345251
main()

0 commit comments

Comments
 (0)