-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_parallel_files.pl~
110 lines (95 loc) · 3.06 KB
/
create_parallel_files.pl~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#! /usr/bin/perl -w
#
# Use this program as part of the eOTU pipeline
#
# Name of this script, used as the prefix of its own error messages.
our $programname = "create_parallel_files.pl";

# With no arguments at all, print the full help text and stop.
# The five positional arguments are:
#   full_list - OTU list, one OTU per line, member names separated by tabs
#   fasta     - fasta file whose header lines match the names in the list
#   full_mat  - tab-delimited matrix; the first non-empty line is the header
#               and the first column holds names matching the list
#   UNIQUE    - prefix of every output file name
#   AFTER     - tag placed after the OTU number in every output file name
# Output files are named UNIQUE.<OTU number>.AFTER.fa and
# UNIQUE.<OTU number>.AFTER.mat.
die "Input the following:
full_list- file name of the complete OTU list
fasta- full fasta file with sequences and names that match those in the list
full_mat- tab-delimited matrix file (header line first) with row names that match those in the list
UNIQUE- output prefix; what the output files will begin with
AFTER- output suffix; placed after the OTU number in each output file name
Usage: full_list full_fasta full_mat UNIQUE AFTER\n" unless (@ARGV);

# One chomp over the whole argument list covers every argument.
chomp(@ARGV);

# These stay package globals because the rest of the script reads them
# by their bare names.
our ($full_list, $fullfasta, $fullmat, $BEFORE, $AFTER) = @ARGV;

# All five arguments are required. Test for "defined and non-empty" rather
# than plain truth so that a suffix of "0" is accepted.
die "$programname: Please follow the command line arguments\n"
    unless (defined $AFTER && length $AFTER);
# Read the OTU list file. Every line is one OTU and its 1-based line number
# becomes the OTU number. Two package-global hashes are filled for the later
# stages of the script:
#   %allotunum - keyed by every OTU number seen in the list
#   %otuhash   - maps each tab-separated name on a line to that line's OTU number
# Dies if the list cannot be opened, if any output file for an OTU already
# exists, or if the list yielded no names at all.
our (%otuhash, %allotunum);

# Three-argument open: the file name is taken literally and can never be
# interpreted as part of the open mode.
open(my $list_fh, '<', $full_list) or die "Can't open ${full_list}: $!\n";
my $curline = 0;
while (my $list_line = <$list_fh>) {
    chomp($list_line);
    $curline++;
    $allotunum{$curline}++;

    # Output files are only ever appended to later on, so refuse to run when
    # one of them is already present (matrix file checked first, then fasta).
    for my $ext (qw(mat fa)) {
        my $outfile = "${BEFORE}.${curline}.${AFTER}.${ext}";
        die "Please provide a unique name. ${outfile} exists\n" if (-e $outfile);
    }

    # A name that appears on more than one line keeps the last line's number.
    $otuhash{$_} = $curline for split("\t", $list_line);
}
close($list_fh);

die "${programname}: Missing OTUhash\n" unless (%otuhash);
# Reminder: %otuhash contains all of the names that we want to evaluate;
# nothing else should be retained.
#
# Walk the full fasta file and append every sequence whose header line is a
# key of %otuhash to BEFORE.<OTU number>.AFTER.fa. The whole header line
# (everything after ">" up to the first newline) is used as the lookup name.
# Dies if the fasta file or an output file cannot be opened, or if closing an
# output file reports a write error.
{
    # Read one fasta record per <> by using ">" as the record separator.
    # "local" puts the previous separator back when this block is left.
    local $/ = ">";

    open(my $fasta_fh, '<', $fullfasta) or die "Can't open $fullfasta: $!\n";
    while (my $record = <$fasta_fh>) {
        chomp($record);    # strips the trailing ">" separator
        next unless ($record);

        # First line is the header; the remaining lines are sequence chunks.
        my ($info, @seq_lines) = split("\n", $record);

        # A record made only of newlines has no header line; skip it instead
        # of looking up an undefined name.
        next unless (defined $info && length $info);

        # Filter out all entries that are not part of any listed OTU.
        my $otunum = $otuhash{$info};
        next unless ($otunum);

        # Merge the sequence from its separate lines into one line.
        my $sequence = join("", @seq_lines);

        # The output is opened and closed per record, so only one output
        # handle is ever open at a time.
        my $fafile = "${BEFORE}.${otunum}.${AFTER}.fa";
        open(my $fa_fh, '>>', $fafile) or die "Can't open $fafile: $!\n";
        print {$fa_fh} ">$info\n$sequence\n";
        # Buffered write errors surface at close, so check it.
        close($fa_fh) or die "Can't close $fafile: $!\n";
    }
    close($fasta_fh);
}
# Append $text to the file $path, creating the file if it does not exist.
# Dies if the file cannot be opened or if closing it reports a write error.
sub _append_mat_line {
    my ($path, $text) = @_;
    open(my $out_fh, '>>', $path) or die "Can't open $path: $!\n";
    print {$out_fh} $text;
    # Buffered write errors surface at close, so check it.
    close($out_fh) or die "Can't close $path: $!\n";
    return;
}

# Split the full matrix into one matrix file per OTU.
# The first non-empty line is the header and is copied to the .mat file of
# every OTU number in %allotunum. Each later line is copied to the .mat file
# of the OTU that its first tab-separated field maps to in %otuhash; lines
# whose first field is not in %otuhash are dropped.
open(my $mat_fh, '<', $fullmat) or die "Can't open ${fullmat}: $!\n";

# An explicit flag marks the header as consumed. Testing the header's column
# list instead would treat every row as a header when the header line has
# only a single column.
my $seen_header = 0;

while (my $mat_line = <$mat_fh>) {
    chomp($mat_line);
    next unless ($mat_line);

    if ($seen_header) {
        # Only the row name is needed for the lookup. The row is written out
        # verbatim, so trailing empty cells are kept and every row retains
        # the same number of columns as in the input.
        my ($first) = split("\t", $mat_line, 2);
        my $num = $otuhash{$first};
        next unless ($num);
        _append_mat_line("${BEFORE}.${num}.${AFTER}.mat", "$mat_line\n");
    }
    else {
        # Header line: every per-OTU matrix file starts with it.
        for my $num (keys %allotunum) {
            _append_mat_line("${BEFORE}.${num}.${AFTER}.mat", "$mat_line\n");
        }
        $seen_header = 1;
    }
}
close($mat_fh);