-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPIA.pl
404 lines (325 loc) · 21.6 KB
/
PIA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
#!/usr/bin/perl
######################################################
####################### PIA.pl #######################
######################################################
## split and run Phylogenetic Intersection Analysis ##
############## Roselyn ware, UoW 2018 ################
######################################################
############## Version 5.6, 2021-05-07 ###############
######################################################
my $PIA_version = '5.6'; # String instead of numeric because otherwise perl can 'helpfully' remove '.0'.
# Edited by Roselyn Ware, UoW 2018
# Further edited by Becky Cribdon, UoW 2021
# A method of metagenomic phylogenetic assignation, designed to be robust to partial representation of organisms in the database
# Please report any problems to [email protected].
use strict;
use warnings;
use Getopt::Std;
use File::Copy qw(copy);
use File::Path;
use Data::Dumper qw(Dumper); # Only used for testing
# Run as follows:
# perl PIA.pl -f [FASTA file] -b [BLAST file] -t [number of threads]
# Summary:
# - Check arguments and inputs.
# - Generate a command file that will run PIA_inner.pl on n threads.
# - Collate the n sets of output.
######################################################
########### Check arguments and Input Data ###########
######################################################
##### Get arguments from command line #####
my %options=();
getopts('hf:b:c:C:s:t:', \%options);
##### Display help text and exit if -h flag called #####
if ($options{h}){
print "Usage: perl PIA.pl -H [header file] -b [BLAST file] [other options]
Main Arguments
Option Description Input Explanation
-f FASTA filename Y FASTA of reads to analyse.
-b BLAST filename Y BLAST filename containing entries for all reads in the FASTA (can contain other entries too).
Optional
Option Description Input Explanation
-c cap Optional Maximum unique BLAST taxa examined. Impacts taxonomic diversity score. Default is 100.
-C min % coverage Optional Minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
-h help N Print this help text.
-s min diversity score Optional Minimum taxonomic diversity score for a read to make it to Summary_Basic.txt and Summary_Reads.txt. Depends on cap. Default is 0.1.
-t threads Optional PIA.pl only. Split the header file into x subfiles and run PIA_inner.pl on each one. Default is 1.
";
exit;
}
##### Check FASTA file path #####
# The header file is the read names (headers) extracted from a FASTA file. PIA.pl makes the header file.
my $fasta_filepath = '';
if ($options{f}) {
$fasta_filepath = $options{f};
} else {
print "\nSpecify a FASTA file with -f.\n\n";
exit;
}
##### Check BLAST file path #####
my $blast_filepath = '';
if ($options{b}) {
$blast_filepath = $options{b};
} else {
print "\nSpecify a BLAST file with -b.\n\n";
exit;
}
##### Check names.dmp.dbm and nodes.dmp.dbm file paths #####
unless (-e 'Reference_files/names.dmp.dbm') {
print "\nCannot find names DBM at Reference_files/names.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
exit;
}
unless (-e 'Reference_files/nodes.dmp.dbm') {
print "\nCannot find nodes DBM at Reference_files/nodes.dmp.dbm. You may need to run PIA_index_maker.pl.\n\n";
exit;
}
##### See if cap input #####
# BLAST hits are all assigned to taxa. $cap is the maximum number of taxa to be looked at. If $cap is hit, no more BLAST hits will be considered. $cap is used to calculate the taxonomic diversity score, so affects the minimum score threshold (option s). Default is 100.
my $cap = 100;
if ($options{c}) { # If there is an option, overwrite the default.
$cap = $options{c};
}
print "\nMax taxa to consider per read: $cap\n";
##### See if min % coverage input #####
# The minimum percentage coverage a top BLAST hit must have for a read to be taken forward. Default is 95.
my $min_coverage_perc = 95;
if ($options{C}) { # If there is an option, overwrite the default.
$min_coverage_perc = $options{C};
}
print "Min coverage %: $min_coverage_perc\n";
##### See if min taxonomic diversity score input #####
# The minimum taxonomic diversity score a read must have to make it to Summary_Basic.txt. Depends on $cap. Defaults to 0.1.
my $min_taxdiv_score = 0.1;
if ($options{s}) { # If there is an option, overwrite the default.
$min_taxdiv_score = $options{s};
}
print "Min taxonomic diversity score: $min_taxdiv_score\n\n";
##### Check see if thread number input #####
my $threads = 1; # Set to 1 by default.
if ($options{t}) { $threads = $options{t};}
# Start a timer
#--------------
my $timestamp_start = localtime();
# Generate a read info file
#--------------------------
# For every read in the FASTA, this will contain its name and sequence length.
my $read_info_filepath = $fasta_filepath . '.read_info';
open (my $read_info_filehandle, '>', $read_info_filepath) or die "Cannot open $read_info_filepath for writing: $!\n";
open (my $fasta_filehandle, $fasta_filepath) or die "Cannot open FASTA $fasta_filepath: $!\n";
$/ = '>'; # Set the input record separator to '>', the first character of a FASTA record.
while (1) { # Run this loop until "last" is called.
my $record = readline($fasta_filehandle); # Read the next line from the names file.
if (! defined $record) { last }; # If there is no next line, exit the loop. You've processed the whole file.
my @record = split("\n", $record); # Split the record into its two lines.
if ($record[1]) { # Is there ever a record without a sequence? Whoever first wrote PIA thought so.
my @sequence_name = split (' ', $record[0]); # The 0th record is the sequence name (qseqid in BLAST lingo). If there's a space BLAST only takes what comes before the space, so we must do the same.
my $sequence_length = length $record[1]; # The sequence is the 1st line.
print $read_info_filehandle $sequence_name[0] . "\t$sequence_length\n";
}
}
close $read_info_filehandle;
close $fasta_filehandle;
$/ = "\n"; # Set it back to default.
# Generate a command file to run PIA_inner.pl on x threads
#---------------------------------------------------------
# How many reads will each thread have to process?
my $line_count =`wc -l $read_info_filepath`; # wc returns the number of lines in the read_info file followed by the name of the read_info file.
$line_count =~ s/^\s+//; # Remove any leading whitespace.
my @line_count = split (' ', $line_count); # Then split on whitespace. So, there will be two elements: the number of lines and the filename.
$line_count = "$line_count[0]"; # Redefine $line_count to be just the number of lines. This is equivalent to the number of reads.
my $line_count_per_split_read_info = $line_count/$threads; # Redefine $line_count to be the number of reads to process per thread.
$line_count_per_split_read_info = $line_count_per_split_read_info + 1; # Add 1. Does this ensure it's never 0-point-something?
$line_count_per_split_read_info =~ s/\.\d+$//; # It's probably now a decimal. Remove any "." characters followed by digits. So, $line_count_per_split_read_info gains 1 but loses whatever fraction it had. It's rounded up to the nearest integer.
# Split the read_info file accordingly:
system("split -a 3 -l $line_count_per_split_read_info $read_info_filepath $read_info_filepath"); # Split the read_info file such that each new file has $line_count lines. The new files will be named [read_info file].aaa, [read_info file].aab, [read_info file].aac...
# Collect the new read_info file names.
my $read_info_filename = $read_info_filepath; # Default the name to the path; assume there isn't a path.
my $read_info_parentpath = ''; # Default the path to empty.
if (index($read_info_filepath, '/') != -1) { # If there was in fact a path:
my @read_info_filepath_elements = split('/', $read_info_filepath); # Split $read_info_filepath on "/" symbols to separate elements of the path.
$read_info_filename = pop @read_info_filepath_elements; # Separate the read_info file name.
$read_info_parentpath = join('/', @read_info_filepath_elements); # Put the rest of the path back together.
}
my $grep_search_expression = '^' . $read_info_filename . '[a-z]{3}$';
my $list_of_split_read_infos = `ls $read_info_parentpath . | grep -E $grep_search_expression`; # "ls ." lists files and directories in the current directory. The grep -E narrows this list to files and directories starting with the read_info filename, then three lower-case letters. It ends up being a list of the split read_info files.
my @split_read_infos = split("\n", $list_of_split_read_infos); # Save that list.
# Make the command file.
my $shellscript_filename = 'shellscript_' . $read_info_filename . '.txt';
open (my $shellscript_filehandle, '>'."$shellscript_filename") or die "\nUnable to create $shellscript_filename: $!\n";
foreach my $split_read_info_filename (@split_read_infos) { # For each split file, print to the shell script the command to run the PIA_inner.pl script with the relevant options we checked earlier.
my $split_read_info_filepath = $split_read_info_filename; # Default to no parent path.
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$split_read_info_filepath = $read_info_parentpath . '/' . $split_read_info_filename;
}
print $shellscript_filehandle "perl PIA_inner.pl -r $split_read_info_filepath -b $blast_filepath -c $cap -C $min_coverage_perc -s $min_taxdiv_score &\n";
} # Ending in "&" means that the command is told to run in the background, so new commands can be started on top. The split files will be processed alongside each other. That's threading.
close $shellscript_filehandle; # Close the shell script file. We're finished editing its contents.
`bash $shellscript_filename`; # Run it.
# Set up to collate the PIA_inner.pl outputs from each thread
#------------------------------------------------------------
$grep_search_expression = '^' . $read_info_filename . '[a-z]{3}.PIA_output$'; # Add '_out' to pick up only the output directories from each split read_info file.
my @split_directories = `ls $read_info_parentpath . | grep -E $grep_search_expression `;
my @outFulls;
my @outSBs;
my @outSRs;
my @logs;
foreach my $split_directory (@split_directories){ # For every directory that PIA_inner.pl just made, look at its files:
chomp $split_directory;
my $base_output_filepath = $split_directory . '/' . $split_directory; # Default to no parent path.
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$base_output_filepath = $read_info_parentpath . '/' . $base_output_filepath;
}
$base_output_filepath = substr($base_output_filepath, 0, -11); # Remove the '.PIA_output' from the very end. The files inside the output directory don't have this.
push (@outFulls, $base_output_filepath . '.Full.txt');
push (@outSBs, $base_output_filepath . '.Summary_Basic.txt');
push (@outSRs, $base_output_filepath . '.Summary_Reads.txt');
push (@logs, $base_output_filepath . '.PIA_inner_log.txt');
}
# Make a master output folder:
my @read_info_filename_elements = split('\.', $read_info_filename);
my $sample_name = join('.', @read_info_filename_elements[0..(scalar(@read_info_filename_elements)-3)]); # The sample name is $read_info_filename minus 'read_info' and whatever FASTA extension there was: "test_sample.fa.read_infoaaa" becomes "test_sample".
my $collated_output_directory_path = $sample_name . '.PIA_output'; # Default to no parent path.
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$collated_output_directory_path = $read_info_parentpath . '/' . $collated_output_directory_path;
}
mkdir $collated_output_directory_path;
# Make a variable for the filename of each of the PIA_inner.pl output files.
my $output_full_filepath = $collated_output_directory_path . '/' . $sample_name . '.Full.txt';
my $output_SB_filepath = $collated_output_directory_path . '/' . $sample_name . '.Summary_Basic.txt';
my $output_SR_filepath = $collated_output_directory_path . '/' . $sample_name . '.Summary_Reads.txt';
my $output_SRM_filepath = $collated_output_directory_path . '/' . $sample_name . '.Summary_Reads_MEGAN.txt'; # To be filled in later.
my $output_fasta_filepath = $collated_output_directory_path . '/' . $sample_name . '.Post-PIA.fasta'; # To be filled in later.
my $log_filepath = $collated_output_directory_path . '/' . $sample_name . '.PIA_inner_logs.txt';
# Read and merge the Full output files:
open(my $output_full_filehandle, '>', $output_full_filepath) or die "Could not open final Full output file $output_full_filepath for writing: $!\n";
# Print the header again:
print $output_full_filehandle "Read_name\tCoverage\tTop_hit_name\tTop_hit_ID\tExpect_value\tIdentities_value\tNext_hit_name\tNext_hit_ID\tLast_hit_name\tLast_hit_ID\tPhylogenetic_range_name\tPhylogenetic_range_ID\tRaw_hit_count\tTaxonomic_diversity_(up_to_cap_if_met)\tTaxonomic_diversity_score\tPhylogenetic_intersection_name\tPhylogenetic_intersection_ID\n";
foreach my $split_full_filepath (@outFulls) {
open(my $split_full_filehandle, $split_full_filepath) or die "Could not open split Full output file $split_full_filepath: $!\n";
readline($split_full_filehandle); # Skip the header.
while(1) {
my $line = readline($split_full_filehandle);
if (! defined $line) { last }; # If there is no next line, exit the loop. You've processed the whole file.
print $output_full_filehandle $line; # Print other lines to the output file.
}
close $split_full_filehandle;
}
close $output_full_filehandle;
# Simply concatenate the logs, but overwrite any old file first by writing a simple header:
my $timestamp_end = localtime();
`printf "# Start: $timestamp_start\tEnd: $timestamp_end\n# PIA version:\t$PIA_version\n# Input FASTA:\t$fasta_filepath\n# Input BLAST:\t$blast_filepath\n" > $log_filepath`;
foreach my $split_log_filepath (@logs) {
chomp($split_log_filepath);
system("cat $split_log_filepath >> $log_filepath");
}
# Quickly scan the log for any warning messages about not being able to identify an ID. If one is found, stop and print a warning message.
open(my $log_filehandle, $log_filepath) or die "Could not open log file $log_filepath: $!\n";
readline($log_filehandle); # Skip the header.
while(1) {
my $line = readline($log_filehandle);
if (! defined $line) { last; } # If there is no next line, exit the loop. You've processed the whole file.
if (index($line, 'was not found in nodes file') != -1) {
print "WARNING: At least one taxonomic ID was not found in the names or nodes DBM. Check they have a similar date to your BLAST database and maybe run PIA again.\n\n";
last; # That's enough.
}
if (index($line, 'Cannot identify top hit') != -1) {
print "WARNING: At least one taxonomic ID was not found in the names or nodes DBM. Check they have a similar date to your BLAST database and maybe run PIA again.\n\n";
last; # That's enough.
}
}
# Collate the Summary Basics
#---------------------------
my %taxa_and_hits = ();
foreach my $split_SB_filepath (@outSBs) {
open(my $split_SB_filehandle, $split_SB_filepath) or die "Could not open split Summary Basic $split_SB_filepath: $!\n";
readline($split_SB_filehandle); # Skip the header.
while(1) {
my $line = readline($split_SB_filehandle);
if (! defined $line) { last; } # If you've reached the end of the file, exit the loop.
chomp $line;
my @line = split("\t", $line);
if ($line[0] == 1) { next; } # Skip any lines for taxon ID 1 (root).
my $name_and_ID = $line[1] . "\t" . $line[0]; # The first two columns are ID and name.
my $hit_count = $line[2]; # The final column is count.
if (exists $taxa_and_hits{$name_and_ID}) {
$taxa_and_hits{$name_and_ID} = $taxa_and_hits{$name_and_ID} + $hit_count;
} else {
$taxa_and_hits{$name_and_ID} = $hit_count;
}
}
close $split_SB_filehandle;
}
my $total_reads = 0;
foreach my $name_and_ID (keys %taxa_and_hits) {
$total_reads = $total_reads + $taxa_and_hits{$name_and_ID};
}
open(my $output_SB_filehandle, '>', $output_SB_filepath) or die "Could not open final Summary Basic $output_SB_filepath for writing: $!\n";
# Print a new header section including the end time. Preface every new line with '#' to make ignoring them easier.
print $output_SB_filehandle "# Start: $timestamp_start\tEnd: $timestamp_end\n# PIA version:\t$PIA_version\n# Input FASTA:\t$fasta_filepath\n# Input BLAST:\t$blast_filepath\n# Minimum coverage for top BLAST hit:\t$min_coverage_perc %\n# Cap of BLAST taxa to examine:\t\t$cap\n# Minimum taxonomic diversity score:\t$min_taxdiv_score\n# Number of threads:\t$threads\n#\n# Total passed reads: $total_reads\n#\n# ID\tName\tReads\n";
foreach my $taxon_information (sort {lc $a cmp lc $b} keys %taxa_and_hits) { # Print the hits in alphabetical order of taxon name (not just ASCII order).
my @taxon_information = split("\t", $taxon_information);
print $output_SB_filehandle "$taxon_information[1]\t$taxon_information[0]\t$taxa_and_hits{$taxon_information}\n"; # If there weren't any hits in any Summary Basic, this hash will be empty. But that shouldn't throw an error.
}
close $output_SB_filehandle;
# Read and merge the Summary Reads files, and generate from them a CSV that MEGAN can read
#-----------------------------------------------------------------------------------------
open(my $output_SR_filehandle, '>', $output_SR_filepath) or die "Could not open final Summary Reads $output_SR_filepath for writing: $!\n"; # Open the collated Summary Reads file.
open (my $output_SRM_filehandle, '>', $output_SRM_filepath) or die "Could not open $output_SRM_filepath: $!\n"; # Open the MEGAN CSV.
my %SR_data = (); # Also store the data in a hash with which to filter the input FASTA. Keys are read names, values are taxonomic IDs.
# Print the same new header section as for the new Summary Basic file.
print $output_SR_filehandle "# Start: $timestamp_start\tEnd: $timestamp_end\n# PIA version:\t$PIA_version\n# Input FASTA:\t$fasta_filepath\n# Input BLAST:\t$blast_filepath\n# Minimum coverage for top BLAST hit:\t$min_coverage_perc %\n# Cap of BLAST taxa to examine:\t\t$cap\n# Minimum taxonomic diversity score:\t$min_taxdiv_score\n# Number of threads:\t$threads\n#\n# Total passed reads: $total_reads\n#\n# Read\tID\tName\n";
foreach my $split_SR_filepath (@outSRs) {
open(my $split_SR_filehandle, $split_SR_filepath) or die "Could not open split Summary Reads $split_SR_filepath: $!\n";
readline($split_SR_filehandle); # Skip the header.
while(1) {
my $line = readline($split_SR_filehandle);
if (! defined $line) { last }; # If there is no next line, exit the loop. You've processed the whole file.
my @line = split("\t", $line); # Split the line on tabs.
if ($line[1] == 1) { next; } # Skip any lines for taxon ID 1 (root).
print $output_SR_filehandle $line; # Print straight to the collated Summary Reads file.
print $output_SRM_filehandle "$line[0],$line[1],50\n"; # Print to the MEGAN CSV the read name, taxonomic ID, and a stand-in bitscore of 50 (the default minimum pass score for the LCA).
$SR_data{$line[0]} = $line[1]; # Store the read name and taxonomic ID to use with the FASTA.
}
}
close $output_SR_filehandle;
close $output_SRM_filehandle;
# Use the Summary Reads data to produce a post-PIA FASTA
#-------------------------------------------------------
# Filter the input FASTA.
open($fasta_filehandle, $fasta_filepath) or die "Could not open original FASTA file $fasta_filepath: $!\n";
open(my $output_fasta_filehandle, '>', $output_fasta_filepath) or die "Could not open post-PIA FASTA file $output_fasta_filepath: $!\n";
$/ = '>'; # Set the record separator to '>', which separates FASTA records.
while (1) { # Run this loop until "last" is called.
my $record = readline($fasta_filehandle);
if (! defined $record) { last }; # If there is no next record, exit the loop. You've processed the whole file.
my @record = split ("\n", $record); # Split the record into lines.
my $read_name = $record[0];
if (exists $SR_data{$read_name} ) {
$read_name = $read_name . '_ID' . $SR_data{$read_name}; # Add the taxonomic ID on the end of the read name.
$record = join("\n", $read_name, $record[1]); # Slot the new name back into the record.
$record =~ s/>//g; # Remove the '>' character off the end so we can add a new one on the front.
print $output_fasta_filehandle ">$record\n";
}
}
close $fasta_filehandle;
close $output_fasta_filehandle;
$/ = "\n"; # Set the record separator back to default.
# Tidy up
#--------
foreach my $split_directory_filepath (@split_directories) { # Delete the split output directories.
chomp $split_directory_filepath;
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$split_directory_filepath = $read_info_parentpath . '/' . $split_directory_filepath;
}
rmtree $split_directory_filepath or warn "Could not rmtree $split_directory_filepath: $!";
}
foreach my $split_read_info_filepath (@split_read_infos) { # Delete the split read_info files.
chomp $split_read_info_filepath;
if ($read_info_parentpath ne '') { # If there was a parent path, add it on now.
$split_read_info_filepath = $read_info_parentpath . '/' . $split_read_info_filepath;
}
unlink $split_read_info_filepath or warn "Could not unlink $split_read_info_filepath: $!";
}
unlink $shellscript_filename; # Delete the list of PIA_inner.pl commands.
unlink $read_info_filepath; # Delete the original read_info file.
exit;