-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathBCModules.pm
executable file
·106 lines (97 loc) · 2.63 KB
/
BCModules.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package BCModules;
# Name: BCModules.pm
# Description: Biocluster specific sequence analysis modules
# Author: Steven Ahrendt
# email: [email protected]
# Date: 03.18.2014
#######################
use strict;
use Bio::Seq;
use Bio::SeqIO;
use base 'Exporter'; # to export our subroutines
our @EXPORT = qw(indexProteomes
indexTaxonomy
indexPFAM
); # export always
our $dbDir = "/rhome/sahrendt/bigdata/Genomes";
our $protDir = "$dbDir/Protein";
our $funcDir = "$dbDir/Functional";
our $PFAMDir = "$funcDir/PFAM";
#######
## Subroutine: indexPFAM
# Input:
# Returns:
###############
sub indexPFAM
{
my %hash;
my $pfamFile = shift @_;
$pfamFile = join("/",$PFAMDir,$pfamFile);
my @key_names = qw(PROTEIN_NAME LOCUS GENE_CONTIG PFAM_ACC PFAM_NAME PFAM_DESCRIPTION PFAM_START PFAM_STOP LENGTH PFAM_SCORE PFAM_EXPECTED);
open(my $fh, "<", $pfamFile) or die "Can't open $pfamFile: $!\n";
my $lc = -1;
while (my $line = <$fh>)
{
$lc++;
chomp $line;
next if($line =~ /^#/);
next if($lc == 0);
my @data = split(/\t/,$line);
$hash{$data[1]}{$key_names[0]} = $data[0];
$hash{$data[1]}{$key_names[2]} = $data[2];
push( @{$hash{$data[1]}{$key_names[3]}}, $data[3]);
push( @{$hash{$data[1]}{$key_names[4]}}, $data[4]);
push( @{$hash{$data[1]}{$key_names[5]}}, $data[5]);
push( @{$hash{$data[1]}{$key_names[6]}}, $data[6]);
push( @{$hash{$data[1]}{$key_names[7]}}, $data[7]);
push( @{$hash{$data[1]}{$key_names[8]}}, $data[8]);
push( @{$hash{$data[1]}{$key_names[9]}}, $data[9]);
push( @{$hash{$data[1]}{$key_names[10]}}, $data[10]);
}
close($fh);
return %hash;
}
#######
## Subroutine: indexTaxonomy
# Input: none
# Returns: a hash with rudimentary taxonomy
##############
sub indexTaxonomy
{
my %hash;
my $taxfile = "$dbDir/taxonlist";
open(TAX,"<$taxfile") or die "Can't open $taxfile: $!\n";
while (my $line = <TAX>)
{
next if ($line =~ /^#/);
chomp $line;
my @data = split(/\t/,$line);
$hash{$data[0]}{"Class"} = $data[1];
$hash{$data[0]}{"FullName"} = $data[2];
}
close(TAX);
return %hash;
}
#######
## Subroutine: indexProteomes
# Input: none
# Returns: a hash with display ID keys storing sequence strings
##############
sub indexProteomes
{
my %hash;
opendir(DIR,$protDir);
my @files = grep { /\.fasta$/ } readdir(DIR);
closedir(DIR);
foreach my $file (@files)
{
my $fasta_in = Bio::SeqIO->new(-file => "$protDir/$file",
-format => "fasta");
while (my $seq_obj = $fasta_in->next_seq)
{
$hash{$seq_obj->display_id} = $seq_obj;
}
}
return %hash;
}
1;