Skip to content

Commit f5e02fb

Browse files
authored
Merge pull request #14 from Vicky-Hunt-Lab/dev
Pull changes for v1.1 into main
2 parents 73d5430 + 0073a33 commit f5e02fb

10 files changed

+171
-59
lines changed

docs/Documentation.docx

-19.9 KB
Binary file not shown.

docs/Documentation.pdf

28.2 KB
Binary file not shown.

docs/html/Documentation.html

+1-1
Large diffs are not rendered by default.

docs/html/images/image1.png

-19.4 KB
Loading

docs/html/images/image2.png

-7.18 KB
Loading

hlsmallrna/__main__.py

+67-18
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
from ast import Num
1617
import os.path
1718
import glob
1819
import shutil
1920

2021
from math import inf
22+
from multiprocessing import Pool
2123
from argparse import ArgumentParser
2224

2325
from Bio import SeqIO, SeqRecord
@@ -83,21 +85,34 @@ def process_command(small_rna, adapter, front, anywhere, cutoff, quiet):
8385

8486
do_log(quiet, '==> Completed command Process')
8587

86-
def sort_command(genome, small_rna, cds, min_length, max_length, quiet):
88+
def sort_command(genome, small_rna, cds, min_length, max_length, num_mismatches, disable_align, threads, quiet):
8789
'''
8890
Code to run when the user chooses the sort command
8991
'''
9092

91-
if not validate_file(genome, 'fasta'):
93+
if not disable_align and not validate_file(genome, 'fasta'):
9294
print(f'Error: expected a genome in FASTA format, got {genome}')
95+
96+
# Add note if genome positional argument has been missed
97+
if genome is None:
98+
print('It looks like you missed the Genome FASTA argument, try adding one to the end of your command')
99+
93100
return False
94101

95-
if not validate_file(small_rna, 'fastq'):
96-
print(f'Error: expected a small RNA FASTQ with at least one sequence, got {small_rna}')
102+
if validate_file(small_rna, 'fastq'):
103+
small_rna_filetype = 'fastq'
104+
elif validate_file(small_rna, 'fasta'):
105+
small_rna_filetype = 'fasta'
106+
else:
107+
print(f'Error: expected a small RNA FASTQ or FASTA with at least one sequence, got {small_rna}')
97108
return False
98109

99110
do_log(quiet, '==> Starting command Sort')
100-
new_fastq = align_to_genome(genome, small_rna, cds, quiet=quiet)
111+
if not disable_align:
112+
new_fastq = align_to_genome(genome, small_rna, cds, threads=threads, small_rna_filetype=small_rna_filetype, mismatches=num_mismatches, quiet=quiet)
113+
else:
114+
new_fastq = small_rna
115+
101116
table_file = bin_rna_size(new_fastq, min_length, max_length, quiet=quiet)
102117

103118
graph_length(table_file)
@@ -106,7 +121,7 @@ def sort_command(genome, small_rna, cds, min_length, max_length, quiet):
106121

107122
def extractnc_command(genome, gff, quiet):
108123
'''
109-
Code to run when the user chooses to extract the noncoding mRNA reigon
124+
Code to run when the user chooses to extract the noncoding mRNA region
110125
'''
111126

112127
if not validate_file(genome, 'fasta'):
@@ -127,7 +142,23 @@ def extractnc_command(genome, gff, quiet):
127142

128143
do_log(quiet, '==> Completed command extractNC')
129144

130-
def unitas_command(small_rna_path, species_name, ref_seqs, cds, unspliced_transcriptome, quiet):
145+
# Parallelism bits for unitas
146+
# initialize worker processes
147+
def init_worker(a, b, c, d):
    '''
    Pool initializer: store the shared unitas settings in module globals

    unitas_command passes this function as the ``initializer`` of its
    multiprocessing Pool, with the values below as ``initargs``, so that
    unitas_threads only needs to receive the per-file argument.

    a -- stored as the global ``species_name``
    b -- stored as the global ``ref_seqs``
    c -- stored as the global ``quiet``
    d -- stored as the global ``UNITAS_OUTPUT``
    '''
    # declare the names as module-level globals of this process
    global species_name, ref_seqs, quiet, UNITAS_OUTPUT

    # keep the arguments around for every later call made in this process
    species_name = a
    ref_seqs = b
    quiet = c
    UNITAS_OUTPUT = d
155+
156+
157+
# Module-level wrapper so Pool.map can dispatch it; reads the per-worker globals set by init_worker
158+
def unitas_threads(small_rna):
    '''
    Call run_unitas_annotation on one small RNA file from a pool worker

    Only the file path is passed in; species_name, ref_seqs, quiet and
    UNITAS_OUTPUT are read from the module globals that init_worker sets
    in each worker process.
    '''
    run_unitas_annotation(small_rna, species_name, ref_seqs, quiet=quiet, unitas_output=UNITAS_OUTPUT)
160+
161+
def unitas_command(small_rna_path, species_name, ref_seqs, cds, unspliced_transcriptome, threads, quiet):
131162
'''
132163
Code to run when the user chooses the unitas command
133164
'''
@@ -159,14 +190,16 @@ def unitas_command(small_rna_path, species_name, ref_seqs, cds, unspliced_transc
159190

160191
mkdir_if_not_exists(UNITAS_OUTPUT)
161192

162-
for small_rna in glob.glob(os.path.join(small_rna_path, '*.fastq')):
163-
run_unitas_annotation(small_rna, species_name, ref_seqs, quiet=quiet, unitas_output=UNITAS_OUTPUT)
193+
small_rna_list = glob.glob(os.path.join(small_rna_path, '*.fastq'))
194+
195+
with Pool(threads, initializer=init_worker, initargs=(species_name, ref_seqs, quiet, UNITAS_OUTPUT,)) as p:
196+
p.map(unitas_threads, small_rna_list)
164197

165198
table_path = merge_summary()
166199
graph_unitas_classification_type(table_path)
167200
do_log(quiet, '==> Completed command Unitas')
168201

169-
def targetid_command(small_rna, targets, min_seq_length, mismatches_allowed, quiet):
202+
def targetid_command(small_rna, targets, min_seq_length, mismatches_allowed, threads, quiet):
170203
'''
171204
Code to run when the user chooses the targetid command
172205
'''
@@ -185,7 +218,7 @@ def targetid_command(small_rna, targets, min_seq_length, mismatches_allowed, qui
185218
print('Error: You need to supply at least one target file with -t')
186219

187220
revcomp_file = revcomp_input_file(small_rna, quiet=quiet)
188-
sam_files = find_targets(revcomp_file, targets, min_seq_length=min_seq_length, mismatches_allowed=mismatches_allowed, quiet=quiet)
221+
sam_files = find_targets(revcomp_file, targets, threads=threads, min_seq_length=min_seq_length, mismatches_allowed=mismatches_allowed, quiet=quiet)
189222
build_summary_files(sam_files, quiet=quiet)
190223

191224
do_log(quiet, '==> Ending TargetID command')
@@ -205,15 +238,17 @@ def main():
205238
parser_process.add_argument('small_rna', help='Path to FASTQ containing the small RNA')
206239

207240
parser_sort = subparsers.add_parser('sort', help='Find RNAs that align to a genome and sort them by length')
208-
parser_sort.add_argument('-d', '--cds', help='Optional CDS region, also align this to the CDS reigon as well as the genome')
241+
parser_sort.add_argument('-d', '--cds', help='Optional CDS region, also align this to the CDS region as well as the genome')
209242
parser_sort.add_argument('-l', '--min-length', help='Minimum length to bin', type=int, default=-inf)
210243
parser_sort.add_argument('-x', '--max-length', help='Maximum length to bin', type=int, default=inf)
244+
parser_sort.add_argument('-m', '--ref-mismatches', type=int, default=None, help='Number of mismatches to use in bowtie2, None for default behaviour')
245+
parser_sort.add_argument('--disable-alignment', action='store_true', help='Skip the alignment to the reference genome step')
211246
parser_sort.add_argument('small_rna', help='Path to FASTQ containing the small RNA')
212-
parser_sort.add_argument('genome', help='Genome to align against')
247+
parser_sort.add_argument('genome', nargs='?', default=None, help='Genome to align against')
213248

214-
parser_extractnc = subparsers.add_parser('extractnc', help='Extarct the noncoding reigon from a fasta with a GFF file')
249+
parser_extractnc = subparsers.add_parser('extractnc', help='Extarct the noncoding region from a fasta with a GFF file')
215250
parser_extractnc.add_argument('genome', help='FASTA containing the genome to extract from')
216-
parser_extractnc.add_argument('gff_file', help='GFF file containing annotations of CDS and mRNA reigons')
251+
parser_extractnc.add_argument('gff_file', help='GFF file containing annotations of CDS and mRNA regions')
217252

218253
parser_unitas = subparsers.add_parser('unitas', help='Run unitas on split files and merge results')
219254
parser_unitas.add_argument('-d', '--cds', help='Optional CDS region, passed to unitas')
@@ -230,7 +265,7 @@ def main():
230265

231266
parser_all = subparsers.add_parser('all', help='Run process, sort and unitas one after the other')
232267
parser_all.add_argument('-a', '--adapter', help='Sequence of the adapter to remove from the 3\' end')
233-
parser_all.add_argument('-d', '--cds', help='Optional CDS region, also align this to the CDS reigon as well as the genome')
268+
parser_all.add_argument('-d', '--cds', help='Optional CDS region, also align this to the CDS region as well as the genome')
234269
parser_all.add_argument('-g', '--front', help='Sequence of the adapter to remove from the 5\' end')
235270
parser_all.add_argument('-b', '--anywhere', help='Sequence of the adapters to remove from both ends')
236271
parser_all.add_argument('-c', '--cutoff', help='Quality cutoff to trin RNA sequences at', default=20, type=int)
@@ -239,8 +274,10 @@ def main():
239274
parser_all.add_argument('-r', '--refseq', help='References for use with unitas', nargs='*', default=None)
240275
parser_all.add_argument('-s', '--species', help='Species to set in unitas arguments', default='x')
241276
parser_all.add_argument('-u', '--unspliced-transcriptome', help='Optional, unspliced transcriptome, passed to unitas')
277+
parser_all.add_argument('-m', '--ref-mismatches', type=int, default=None, help='Number of mismatches to use in bowtie2 when aligning to the genome, None for default behaviour')
278+
parser_all.add_argument('--disable-alignment', action='store_true', help='Skip the alignment to the reference genome step')
242279
parser_all.add_argument('small_rna', help='Path to FASTQ containing the small RNA')
243-
parser_all.add_argument('genome', help='Genome to align against')
280+
parser_all.add_argument('genome', nargs='?', default=None, help='Genome to align against')
244281

245282
args = parser.parse_args()
246283

@@ -262,6 +299,7 @@ def get_command_args(name):
262299
return None
263300

264301
mkdir_if_not_exists(get_config_key('general', 'output_directory'))
302+
num_threads = get_config_key('general', 'threads')
265303

266304
if args.command == 'process':
267305
process_command(
@@ -280,6 +318,9 @@ def get_command_args(name):
280318
get_command_args('cds'),
281319
get_command_args('min_length'),
282320
get_command_args('max_length'),
321+
get_command_args('ref_mismatches'),
322+
get_command_args('disable_alignment'),
323+
num_threads,
283324
get_command_args('quiet')
284325
)
285326

@@ -297,6 +338,7 @@ def get_command_args(name):
297338
get_command_args('refseq'),
298339
get_command_args('cds'),
299340
get_command_args('unspliced_transcriptome'),
341+
num_threads,
300342
get_command_args('quiet')
301343
)
302344

@@ -306,6 +348,7 @@ def get_command_args(name):
306348
get_command_args('target_files'),
307349
get_command_args('min_seq_length'),
308350
get_command_args('num_mismatches'),
351+
num_threads,
309352
get_command_args('quiet')
310353
)
311354

@@ -328,16 +371,22 @@ def get_command_args(name):
328371
get_command_args('cds'),
329372
get_command_args('min_length'),
330373
get_command_args('max_length'),
374+
get_command_args('ref_mismatches'),
375+
get_command_args('disable_alignment'),
376+
num_threads,
331377
get_command_args('quiet')
332378
)
333379

334380
if out_code is not None:
335381
return
336-
382+
337383
unitas_command(
338384
os.path.join(get_config_key('general', 'output_directory'), 'binned_rna'),
339385
get_command_args('species'),
340386
get_command_args('refseq'),
387+
get_command_args('cds'),
388+
get_command_args('unspliced_transcriptome'),
389+
num_threads,
341390
get_command_args('quiet')
342391
)
343392

hlsmallrna/genome_align.py

+93-21
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,16 @@ def make_fastq_overlap_only(fastq1, fastq2, output):
8484

8585
SeqIO.write(into_seqrecord(result_seqs), output, 'fastq')
8686

87-
def align_to_genome(genome, small_rnas, cds, quiet=0):
87+
def remove_symbols_from_header(fasta):
    '''
    Yield each record of a FASTA file with '@' replaced by '_' in its header

    Used to clean the small RNA headers before doing the alignment.

    fasta -- FASTA input, passed straight to SeqIO.parse
    '''
    for seq in SeqIO.parse(fasta, 'fasta'):
        seq.id = seq.id.replace('@', '_')
        # Biopython's FASTA writer builds the header line from the description
        # as well as the id, so clean both; otherwise the original '@'-bearing
        # id would be written out again after the cleaned one.
        seq.description = seq.description.replace('@', '_')

        yield seq
95+
96+
def align_to_genome(genome, small_rnas, cds, quiet=0, threads=4, small_rna_filetype='fastq', mismatches=None):
8897
'''
8998
Align the small RNAs to the genome and filter out any that are unsuccessful
9099
'''
@@ -107,8 +116,15 @@ def align_to_genome(genome, small_rnas, cds, quiet=0):
107116

108117
mkdir_if_not_exists(INDEX_DIRECTORY)
109118

119+
if small_rna_filetype == 'fasta':
120+
NEW_SMALL_RNAS = os.path.join(get_config_key('general', 'output_directory'), 'corrected_headers.fasta')
121+
SeqIO.write(remove_symbols_from_header(small_rnas), NEW_SMALL_RNAS, 'fasta')
122+
123+
small_rnas = NEW_SMALL_RNAS
124+
110125
bbmap_build_index = [
111126
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2_build'),
127+
'--threads', str(threads),
112128
genome,
113129
os.path.join(INDEX_DIRECTORY, 'genome_index')
114130
]
@@ -117,29 +133,98 @@ def align_to_genome(genome, small_rnas, cds, quiet=0):
117133

118134
cds_build_index = [
119135
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2_build'),
136+
'--threads', str(threads),
120137
cds,
121138
os.path.join(INDEX_DIRECTORY, 'cds_index')
122139
]
123140

124141
cds_build_index = cds_build_index + get_config_key('cli-tools', 'bowtie2', 'bowtie2_build_params')
125142

126-
bbmap_align_reads = [
143+
if mismatches is not None:
144+
bbmap_align_reads = [
145+
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
146+
'--threads', str(threads),
147+
'-L', '18',
148+
'--no-1mm-upfront',
149+
'--score-min', 'L,' + str(-mismatches) + ',0',
150+
'--end-to-end',
151+
'--mp', '1,1',
152+
'--ignore-quals',
153+
'--rdg', '9,1',
154+
'--rfg', '9,1',
155+
'-x', os.path.join(INDEX_DIRECTORY, 'genome_index'),
156+
'-U', small_rnas,
157+
'-S', INTERMEDIATE_SAM
158+
]
159+
elif mismatches == 0:
160+
bbmap_align_reads = [
127161
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
162+
'--threads', str(threads),
128163
'-L', '18',
164+
'--no-1mm-upfront',
165+
'--score-min', 'L,0,0',
166+
'--end-to-end',
167+
'-M', '0',
129168
'-x', os.path.join(INDEX_DIRECTORY, 'genome_index'),
130169
'-U', small_rnas,
131170
'-S', INTERMEDIATE_SAM
132-
]
171+
]
172+
else:
173+
bbmap_align_reads = [
174+
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
175+
'--threads', str(threads),
176+
'-L', '18',
177+
'-x', os.path.join(INDEX_DIRECTORY, 'genome_index'),
178+
'-U', small_rnas,
179+
'-S', INTERMEDIATE_SAM
180+
]
181+
182+
if small_rna_filetype == 'fasta':
183+
bbmap_align_reads.append('-f')
133184

134185
bbmap_align_reads = bbmap_align_reads + get_config_key('cli-tools', 'bowtie2', 'bowtie2_params')
135186

136-
cds_align_reads = [
187+
if mismatches is not None:
188+
cds_align_reads = [
189+
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
190+
'--threads', str(threads),
191+
'-L', '18',
192+
'--no-1mm-upfront',
193+
'--score-min', 'L,' + str(-mismatches) + ',0',
194+
'--end-to-end',
195+
'--mp', '1,1',
196+
'--ignore-quals',
197+
'--rdg', '9,1',
198+
'--rfg', '9,1',
199+
'-x', os.path.join(INDEX_DIRECTORY, 'cds_index'),
200+
'-U', small_rnas,
201+
'-S', CDS_INTERMEDIATE_SAM
202+
]
203+
elif mismatches == 0:
204+
cds_align_reads = [
137205
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
206+
'--threads', str(threads),
138207
'-L', '18',
208+
'--no-1mm-upfront',
209+
'--score-min', 'L,0,0',
210+
'--end-to-end',
211+
'-M', '0',
139212
'-x', os.path.join(INDEX_DIRECTORY, 'cds_index'),
140213
'-U', small_rnas,
141214
'-S', CDS_INTERMEDIATE_SAM
142-
]
215+
]
216+
else:
217+
cds_align_reads = [
218+
get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'),
219+
'--threads', str(threads),
220+
'-L', '18',
221+
'-x', os.path.join(INDEX_DIRECTORY, 'cds_index'),
222+
'-U', small_rnas,
223+
'-S', CDS_INTERMEDIATE_SAM
224+
]
225+
226+
if small_rna_filetype == 'fasta':
227+
cds_align_reads.append('-f')
143228

144229
cds_align_reads = cds_align_reads + get_config_key('cli-tools', 'bowtie2', 'bowtie2_params')
145230

@@ -219,19 +304,6 @@ def align_to_genome(genome, small_rnas, cds, quiet=0):
219304
'-0', CDS_UNMAPPED_FASTQ
220305
]
221306

222-
if get_config_key('cli-tools', 'bowtie2', 'bowtie2_pass_threads'):
223-
threads = get_config_key('general', 'threads')
224-
225-
bbmap_build_index.append('--threads')
226-
bbmap_build_index.append(str(threads))
227-
bbmap_align_reads.append('--threads')
228-
bbmap_align_reads.append(str(threads))
229-
230-
cds_build_index.append('--threads')
231-
cds_build_index.append(str(threads))
232-
cds_align_reads.append('--threads')
233-
cds_align_reads.append(str(threads))
234-
235307
do_log(quiet, '====> Building BBMap Index')
236308
run(bbmap_build_index, capture_output=(quiet != 0))
237309
if cds is not None:
@@ -256,17 +328,17 @@ def align_to_genome(genome, small_rnas, cds, quiet=0):
256328
make_fastqs_unique(RESULT_FASTQ, CDS_RESULT_FASTQ, FINAL_FASTQ)
257329
make_fastq_overlap_only(RESULT_UNMAPPED_FASTQ, CDS_UNMAPPED_FASTQ, FINAL_UNMAPPED_FASTQ)
258330

259-
create_stats_table(small_rnas, get_config_key('general', 'output_directory'))
331+
create_stats_table(small_rnas, get_config_key('general', 'output_directory'), small_rna_filetype=small_rna_filetype)
260332

261333
return FINAL_FASTQ
262334

263-
def create_stats_table(smallrna, output_dir):
335+
def create_stats_table(smallrna, output_dir, small_rna_filetype='fastq'):
264336
'''
265337
Count sequences to create the statistics file
266338
'''
267339

268340
input_reads = 0
269-
for read in SeqIO.parse(smallrna, 'fastq'):
341+
for read in SeqIO.parse(smallrna, small_rna_filetype):
270342
input_reads += 1
271343

272344
overall_mapped = 0

0 commit comments

Comments
 (0)