Skip to content

Commit 805b93e

Browse files
committed
generate file with unique genes per sample
1 parent f8ed5c4 commit 805b93e

7 files changed

+238
-1
lines changed

bin/roary-unique_genes_per_sample

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/usr/bin/env perl
2+
3+
package Bio::Roary::Main::UniqueGenesPerSample;
4+
5+
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
6+
# PODNAME: roary-unique_genes_per_sample
7+
8+
=head1 SYNOPSIS
9+
10+
Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
11+
12+
=cut
13+
14+
# Put a local checkout's module directories at the front of @INC (resolved to
# absolute paths at compile time). ./t/lib ends up ahead of ./lib.
use Cwd qw(abs_path);

BEGIN {
    unshift( @INC, abs_path('./lib') );
    unshift( @INC, abs_path('./t/lib') );
}

use Bio::Roary::CommandLine::UniqueGenesPerSample;

# Hand the raw command line to the application class and run it.
my $app = Bio::Roary::CommandLine::UniqueGenesPerSample->new(
    args        => \@ARGV,
    script_name => $0,
);
$app->run;

dist.ini

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name = Bio-Roary
2-
version = 3.8.2
2+
version = 3.9.0
33
author = Andrew J. Page <[email protected]>
44
license = GPL_3
55
copyright_holder = Wellcome Trust Sanger Institute
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
undef $VERSION;
2+
3+
package Bio::Roary::CommandLine::UniqueGenesPerSample;
4+
5+
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
6+
7+
=head1 SYNOPSIS
8+
9+
Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
10+
11+
=cut
12+
13+
use Moose;
14+
use Getopt::Long qw(GetOptionsFromArray);
15+
extends 'Bio::Roary::CommandLine::Common';
16+
17+
# Raw command-line arguments (parsed in BUILD) and the invoking script's name.
has 'args' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);
has 'script_name' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Set from -h/--help; when true BUILD dies with the usage text.
has 'help' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# Input cluster file (-c) and output table (-o).
has 'clustered_proteins' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'clustered_proteins',
);
has 'output_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'unique_genes_per_sample.tsv',
);

# Set from -v/--verbose.
has 'verbose' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# Problem found while parsing arguments; reported by run().
has '_error_message' => (
    is  => 'rw',
    isa => 'Str',
);
25+
26+
# Construction hook: parse the command-line switches held in `args` and copy
# them onto the object's attributes.
#
# Recognised switches: -o/--output, -c/--clustered_proteins, -v/--verbose and
# -h/--help. Dies with the usage text when help is requested. If the clustered
# proteins file does not exist, the problem is stored in `_error_message`
# (to be reported by run()) instead of being raised here.
sub BUILD {
    my ($self) = @_;

    my ( $clustered_proteins, $output_filename, $verbose, $help );

    GetOptionsFromArray(
        $self->args,
        'o|output=s'             => \$output_filename,
        'c|clustered_proteins=s' => \$clustered_proteins,
        'v|verbose'              => \$verbose,
        'h|help'                 => \$help,
    );

    if ( defined($verbose) ) {
        $self->verbose($verbose);
        $self->logger->level(10000);
    }

    $self->help($help) if ( defined($help) );
    ( !$self->help ) or die $self->usage_text;

    $self->output_filename($output_filename) if ( defined($output_filename) );

    # When -c is omitted, fall back to the attribute's current (default) value
    # so that the default advertised in the usage text is actually usable.
    # The attribute is only overwritten with a path that exists.
    my $candidate = defined($clustered_proteins) ? $clustered_proteins : $self->clustered_proteins;
    if ( -e $candidate ) {
        $self->clustered_proteins($candidate);
    }
    else {
        $self->_error_message("Error: Cant access the clustered proteins file");
    }
}
55+
56+
# Execute the command.
#
# If argument parsing recorded a problem in `_error_message`, print it and die
# with the usage text. Otherwise build a Bio::Roary::UniqueGenesPerSample from
# this object's `clustered_proteins` and `output_filename` and write the
# frequency file. Returns the result of write_unique_frequency.
sub run {
    my ($self) = @_;

    if ( defined( $self->_error_message ) ) {
        print $self->_error_message . "\n";
        die $self->usage_text;
    }

    # Loaded here so this method does not rely on some other part of the
    # program having pulled the module in already.
    require Bio::Roary::UniqueGenesPerSample;

    my $obj = Bio::Roary::UniqueGenesPerSample->new(
        clustered_proteins => $self->clustered_proteins,
        output_filename    => $self->output_filename,
    );
    return $obj->write_unique_frequency;
}
72+
73+
# Return the static command-line help text.
sub usage_text {
    return <<'USAGE';
Usage: roary-unique_genes_per_sample [options] -c clustered_proteins
Take in the clustered file and produce a sorted file with the frequency of each samples unique genes

Options: -o STR output filename [unique_genes_per_sample.tsv]
-c STR clusters filename [clustered_proteins]
-v verbose output to STDOUT
-h this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
88+
89+
__PACKAGE__->meta->make_immutable;
90+
no Moose;
91+
1;

lib/Bio/Roary/UniqueGenesPerSample.pm

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package Bio::Roary::UniqueGenesPerSample;
2+
3+
# ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
4+
5+
=head1 SYNOPSIS
6+
7+
Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
8+
use Bio::Roary::UniqueGenesPerSample;
9+
10+
my $obj = Bio::Roary::UniqueGenesPerSample->new(
11+
clustered_proteins => 'clustered_proteins',
12+
output_filename => 'output_filename',
13+
);
14+
$obj->write_unique_frequency;
15+
16+
=cut
17+
18+
use Moose;
19+
use Bio::Roary::Exceptions;
20+
21+
# Path of the cluster file to read.
has 'clustered_proteins' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'clustered_proteins',
);

# Path of the tab-separated table to write.
has 'output_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'unique_genes_per_sample.tsv',
);

# Write handle for output_filename, opened on first use by its builder.
has '_output_fh' => (
    is      => 'ro',
    lazy    => 1,
    builder => '_build__output_fh',
);
25+
26+
# Builder for the lazy `_output_fh` attribute: open the file named by
# `output_filename` for writing and return the handle.
# Throws Bio::Roary::Exceptions::CouldntWriteToFile if the open fails.
sub _build__output_fh {
    my ($self) = @_;

    my $destination = $self->output_filename;
    my $handle;
    unless ( open( $handle, '>', $destination ) ) {
        Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $destination );
    }
    return $handle;
}
32+
33+
# Count, per sample, the clusters that consist of exactly one gene.
#
# A counted line looks like:
#   group_17585: 14520_6#21_00645
# i.e. a cluster name, a colon and a space, then a single gene id. The sample
# name is the gene id with its trailing "_<digits>" suffix removed. Lines
# shorter than 6 characters, lines that do not match that shape (including
# clusters listing several genes) and gene ids without the numeric suffix are
# ignored.
#
# Returns a hash reference mapping sample name => number of such clusters.
# Throws Bio::Roary::Exceptions::FileNotFound if the file cannot be opened.
sub _sample_to_gene_freq {
    my ($self) = @_;

    # Explicit read mode: with a two-argument open, characters in the file
    # name (such as a leading '>' or a trailing '|') would change what
    # open() does.
    open( my $input_fh, '<', $self->clustered_proteins )
      or Bio::Roary::Exceptions::FileNotFound->throw( error => "Couldnt read input file:" . $self->clustered_proteins );

    my %sample_to_gene_freq;

    # A lexical line variable keeps the caller's $_ intact.
    while ( my $line = <$input_fh> ) {
        chomp $line;
        next if ( length($line) < 6 );

        # Exactly one whitespace-free token after "name: " => single-gene cluster.
        next unless $line =~ /^.+: ([^\s]+)$/;
        my $gene_id = $1;

        # Without a trailing _<digits> the sample name cannot be derived.
        next unless $gene_id =~ /^(.+)_[\d]+$/;
        my $sample_name = $1;

        $sample_to_gene_freq{$sample_name}++;
    }
    close($input_fh);

    return \%sample_to_gene_freq;
}
64+
65+
# Write one "sample<TAB>count" line per sample to the output file, ordered by
# count (highest first) and then by sample name.
#
# Returns 1 on success. Throws Bio::Roary::Exceptions::CouldntWriteToFile if
# closing the output handle fails, since buffered write errors only surface
# at close time.
sub write_unique_frequency {
    my ($self) = @_;

    my $freq_for = $self->_sample_to_gene_freq;
    my $out_fh   = $self->_output_fh;

    my @ordered_samples =
      sort { $freq_for->{$b} <=> $freq_for->{$a} || $a cmp $b } keys %{$freq_for};

    for my $sample (@ordered_samples) {
        print {$out_fh} $sample . "\t" . $freq_for->{$sample} . "\n";
    }

    close($out_fh)
      or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename );
    return 1;
}
76+
77+
no Moose;
78+
__PACKAGE__->meta->make_immutable;
79+
80+
1;

t/Bio/Roary/UniqueGenesPerSample.t

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/usr/bin/env perl
2+
use strict;
3+
use warnings;
4+
use Test::Files;
5+
use Data::Dumper;
6+
7+
BEGIN { unshift( @INC, './lib' ) }
8+
$ENV{PATH} .= ":./bin";
9+
10+
BEGIN {
11+
use Test::Most;
12+
use_ok('Bio::Roary::UniqueGenesPerSample');
13+
}
14+
15+
# Build the object under test from the fixture cluster file.
my $obj = Bio::Roary::UniqueGenesPerSample->new(
    clustered_proteins => 't/data/unique_genes_per_sample/clustered_proteins_valid',
);
ok( $obj, 'Initialise object' );

# Only single-gene clusters contribute to a sample's count.
my %expected_freq = (
    '11111_4#44' => 1,
    '123_4#5'    => 2,
    '999_4#5'    => 1,
    '22222_6#21' => 1,
);
is_deeply( $obj->_sample_to_gene_freq, \%expected_freq, 'sample frequencies' );

ok( $obj->write_unique_frequency, 'create output file' );
ok( -e $obj->output_filename,     'output file exists' );

compare_ok(
    $obj->output_filename,
    't/data/unique_genes_per_sample/expected_unique_genes_per_sample.tsv',
    'got expected unique gene frequency'
);

# Remove the generated file so repeated runs start clean.
unlink( $obj->output_filename );

done_testing();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
group_2: 123_4#5_02659 999_4#5_02659
2+
group_2: 123_4#5_02654
3+
group_8: 999_4#5_02651
4+
group_7: 123_4#5_02674
5+
nagK: 11111_4#44_01973
6+
dnaA: 22222_6#21_00645
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
123_4#5 2
2+
11111_4#44 1
3+
22222_6#21 1
4+
999_4#5 1

0 commit comments

Comments
 (0)