-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpick_common_miRNA_in_multiple_samples.pl
126 lines (106 loc) · 3.99 KB
/
pick_common_miRNA_in_multiple_samples.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/perl
use strict;
use warnings;
no warnings qw(uninitialized);
use Cwd 'abs_path';
use Getopt::Long;
use File::Basename;
#Variable setup
my %opt;
use vars qw/%opt/;
my $version="1.0 version";
my $usage=<<USAGE; #******* Instruction of this program *********#
Author: Wanfei Liu & Chengqi Xin
Email: <liuwf\@big.ac.cn> & <xincq\@big.ac.cn>
Date: Jun 8, 2013
Version: $version
This program can pick common miRNA according to the genome postion file of miRNA hairpin or mature sequence in gff format. Before you run it, you must make sure that different sample has different accession number for miRNA. Then you should cat all gff files of candidate miRNAs in different samples into one single file as input file.
Usage: perl pick_common_miRNA_in_multiple_samples.pl --gff|-g <*.gff> --output|-o <*>
The following options are necessary.
--gff|-g <*.gff> :the gff file containing all miRNA in different samples
--output|-o <*> :the output file
The following options are optional.
--overpercent|-hop <hairoverpercent=50> :minimum overlap percent(%), default is 50
--help|-h :print the usage information
USAGE
#Gather input
&GetOptions(
"gff|s=s" =>\$opt{gff},
"output|o=s" =>\$opt{output},
"overpercent|op=i" =>\$opt{overpercent},
"help|h" =>\$opt{help},
);
#Verify input
if (!defined $opt{gff} or !defined $opt{output} or defined $opt{help}) {
die "$usage\n";
}
#Default parameters
$opt{overpercent} = 50 unless defined $opt{overpercent};
# Absolute path
$opt{gff} = abs_path $opt{gff};
$opt{output} = abs_path $opt{output};
#Build commond line
my $cmdline = $0;
$cmdline .= " -g $opt{gff}" if defined $opt{gff};
$cmdline .= " -o $opt{output}" if defined $opt{output};
$cmdline .= " -op $opt{overpercent}" if defined $opt{overpercent};
print `date`, "\n>>> pick_common_miRNA_in_multiple_samples.pl <<<\n\n";
print $cmdline,"\n\n";
my $infile=$opt{gff};
my $outfile=$opt{output};
print "Read the input file.\n";
my (%repeat,);
open (IN,"<$infile")||die("fail to open $infile.\n");
while (<IN>) {
chomp;
#S091595 . miRNA 194 211 . - . ACC="MI1"; ID="ath-miR5021";
my @list=split /\t/,$_;
if ($list[8]=~/ACC\=\"([^\"]+)\"/) {
$list[8]=~s/ACC\=//;
$list[8]=~s/ID\=//;
$list[8]=~s/EXPRESS\=//;
$list[8]=~s/\"//g;
$list[8]=~s/\;//g;
my @feature=split /\s+/,$list[8];
$repeat{$list[0]}{$list[6]}{"$list[3]\t$list[4]"}[0]=$list[3];
$repeat{$list[0]}{$list[6]}{"$list[3]\t$list[4]"}[1].=$feature[0].",";
}
}
close IN;
my $time=localtime;
print "The input file has been readed: $time.\n\n";
print "Compare the miRNA according to the genome position.\n";
open (OUT,">$outfile")||die("fail to open $outfile.\n");
foreach my $chr (keys %repeat) {
foreach my $strand (keys %{$repeat{$chr}}) {
my @record=sort {$repeat{$chr}{$strand}{$a}[0]<=>$repeat{$chr}{$strand}{$b}[0]} keys %{$repeat{$chr}{$strand}};
for (my $i=0;$i<@record-1;$i++) {
my ($start1,$end1)=split /\t/,$record[$i];
my ($start2,$end2)=split /\t/,$record[$i+1];
if (($end1>=$start2 and $end1<=$end2) or ($end2>=$start1 and $end2<=$end1)) {
my $start=$start1;
my $end=$end1;
if ($start<$start2) {
$start=$start2;
}
if ($end>$end2) {
$end=$end2;
}
if (($end-$start+1)/($end1-$start1+1)>=$opt{overpercent}/100 or ($end-$start+1)/($end2-$start2+1)>=$opt{overpercent}/100) {
$repeat{$chr}{$strand}{$record[$i]}[1].=$repeat{$chr}{$strand}{$record[$i+1]}[1];
delete $repeat{$chr}{$strand}{$record[$i+1]};
$record[$i+1]=$record[$i];
}
}
}
@record=sort {$repeat{$chr}{$strand}{$a}[0]<=>$repeat{$chr}{$strand}{$b}[0]} keys %{$repeat{$chr}{$strand}};
for (my $i=0;$i<@record;$i++) {
my @id=split /\,+/,$repeat{$chr}{$strand}{$record[$i]}[1];
my $idnum=@id;
print OUT "$idnum\t$repeat{$chr}{$strand}{$record[$i]}[1]\n";
}
}
}
close OUT;
$time=localtime;
print "The miRNA comparison has been finished: $time.\n\n";