-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPIA_index_maker.pl
114 lines (89 loc) · 4.11 KB
/
PIA_index_maker.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/perl
######################################################
################# PIA_index_maker.pl #################
######################################################
####### Creates DBM index files for later use ########
############## Becky Cribdon, UoW 2019################
######################################################
############## Version 4.9, 2019-09-13 ###############
######################################################
use strict;
use warnings;
use Getopt::Std;
use DB_File;
use Fcntl;
use Data::Dumper qw(Dumper);
# Run as follows:
# perl PIA_index_maker.pl [optional reference file locations]
# It will take a few minutes.
# Summary:
# - Makes an index of the names file where keys are IDs and values are the matching scientific names.
# - Makes an index of the nodes file where keys are IDs and values are parent nodes and taxonomic ranks.
my %options=();
getopts('n:N:"', \%options);
##### Locate reference files #####
# If they aren't in their default location (Reference_files/), use the options to specify where they are.
my $nodes_filename = 'Reference_files/nodes.dmp';
if ($options{n}){
$nodes_filename = ($options{n});
}
my $names_filename = 'Reference_files/names.dmp';
if ($options{N}){
$names_filename = ($options{N});
}
##############################################
########### Set up DBM index files ###########
##############################################
print "\n\n****Setting up DBM index files****\n";
# Nodes index where keys are IDs
#-------------------------------
my $nodesfileDBM = $nodes_filename . '.dbm';
if (-e $nodesfileDBM) {
print "$nodesfileDBM already exists. Overwriting.\n";
unlink $nodesfileDBM;
}
my %nodesfileDBM = (); # Keys are taxonomic IDs and values are parent nodes and taxonomic ranks.
tie (%nodesfileDBM, "DB_File", $nodesfileDBM, O_RDWR|O_CREAT, 0666, $DB_BTREE) or die "Can't open $nodesfileDBM: $!\n";
# Now populate the hash:
open (my $nodes_filehandle, $nodes_filename) or die "Could not open $nodes_filename: $!\n";
while (1) { # Run this loop until "last" is called.
my $line = <$nodes_filehandle>; # Read the next line from the nodes file.
if (! defined $line) { last }; # If there is no next line, exit the loop. You've processed the whole file.
my @line = split(/\|/, $line); # Split the line by | characters.
my $ID = $line[0];
$ID =~ s/^\s+|\s+$//g; # Trim whitespace off the start and end.
my $parent_node = $line[1];
$parent_node =~ s/^\s+|\s+$//g;
my $rank = $line[2];
$rank =~ s/^\s+|\s+$//g;
$nodesfileDBM{$ID} = $parent_node . "\t" . $rank; # Store in the hash. Tab-separate the parent node and rank.
}
close $nodes_filehandle;
untie %nodesfileDBM;
# Names index where keys are IDs
#-------------------------------
# This will be used to look up the scientific names of taxa using their IDs.
my $namesfileDBM = $names_filename . '.dbm';
if (-e $namesfileDBM) {
print "$namesfileDBM already exists. Overwriting.\n";
unlink $namesfileDBM;
}
my %namesfileDBM = (); # Keys are taxonomic IDs and values are scientific names.
tie (%namesfileDBM, "DB_File", $namesfileDBM, O_RDWR|O_CREAT, 0666, $DB_BTREE) or die "Can't open $namesfileDBM: $!\n";
# Now populate the hash:
open (my $names_filehandle, $names_filename) or die "Could not open $names_filename: $!\n";
while (1) { # Run this loop until "last" is called.
my $line = <$names_filehandle>; # Read the next line from the names file.
if (! defined $line) { last }; # If there is no next line, exit the loop. You've processed the whole file.
my @line = split(/\|/, $line); # Split the line by | characters.
if ($line[3] eq "\tscientific name\t") {
my $name = $line[1];
$name =~ s/^\s+|\s+$//g; # Trim whitespace off the start and end.
my $ID = $line[0];
$ID =~ s/^\s+|\s+$//g;
$namesfileDBM{$ID} = $name; # Store in the hash.
} # If that line is not for a scientific name, ignore it and move on.
}
close $names_filehandle;
untie %namesfileDBM;
print "Finished indexing.\n\n";