Skip to content

Commit

Permalink
Merge pull request #3494 from solgenomics/fix_vcf_format
Browse files Browse the repository at this point in the history
Fix vcf format in the VCF download. Transpose introduces some issues, also trailing tabs.
  • Loading branch information
lukasmueller authored Apr 22, 2021
2 parents 13bd312 + c4e5c86 commit cd86ef8
Show file tree
Hide file tree
Showing 6 changed files with 398 additions and 9 deletions.
138 changes: 138 additions & 0 deletions bin/load_marker_dbxref.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@

=head1 NAME
load_dbxref.pl - a script to link marker_names with cvterm for external database link, this script uses the nd_protocolprop table
with a typdef_id
=head1 SYNOPSYS
load_locus_publications.pl -p [person_id] -H [hostname] -D [database name] -c dbxref -j protocol_id file
where file contains a column with marker names
if marker_name does not exist then do not insert
nd_protocolprop format { dbxref: dbxref_name, marker: {array of marker_names} }
To use an existing protocol (not create a new nd_protocol name entry), use -j protocol_id
=head1 COMMAND-LINE OPTIONS
ARGUMENTS
-H host name (required) e.g. "localhost"
-D database name (required) e.g. "cxgn_cassava"
-j protocol_id (Will associate genotype data to an existing nd_protocol_id)
=head1 AUTHOR
Clay Birkett <[email protected]>
=cut

use strict;
use warnings;

use Getopt::Std;
use File::Slurp qw | slurp |;
use CXGN::DB::InsertDBH;

our %opts;
getopts('p:H:D:j:', \%opts);

my $file = shift;

my @lines = slurp($file);
chomp(@lines);

my $dbh = CXGN::DB::InsertDBH->new( { dbname => $opts{D},
dbhost => $opts{H},
});


my $sth;
my @row;
my $alias;
my $count;
my $count_add;
my $marker_id;
my %alias_list;
my %marker_list;
my %unique_list;

my $protocol_id = $opt_j;

#if protocol_id provided, a new one will not be created
if ($protocol_id){
my $protocol = CXGN::Genotype::Protocol->new({
bcs_schema => $schema,
nd_protocol_id => $protocol_id
});
$organism_species = $protocol->species_name;
$obs_type = $protocol->sample_observation_unit_type_name if !$obs_type;
}

##get list of current alias entries
##use (marker_id, alias) as unique key so we don't duplicate entries
my $key;
$count = 0;
$sth = $dbh->prepare("SELECT alias, marker_id, preferred from marker_alias");
$sth->execute();
while (@row = $sth->fetchrow_array()) {
$count++;
$alias = $row[0];
if ($row[2]) {
$marker_list{$alias} = $row[1];
} else {
$alias_list{$alias} = $row[1];
}
$key = $row[0] . $row[1];
$unique_list{$key} = 1;
}
print "$count from marker_alias\n";

$count = 0;
$sth = $dbh->prepare("INSERT into marker_alias (marker_id, alias, preferred) values (?, ?, ?)");
foreach my $l (@lines) {
$count++;
push (@marker_list, $alias);
}

my ($marker_name, $alias) = split /\t/, $l;
my @alias_list = split /\|/, $alias;
if (exists($marker_list{$marker_name})) {
$marker_id = $marker_list{$marker_name};
foreach (@alias_list) {
$key = $_ . $marker_id;
if (exists($unique_list{$key})) {
} else {
$count_add++;
$sth->execute($marker_id, $_, 0);
$alias_list{$_} = $marker_id;
}
}
} else {
$sth2 = $dbh->prepare("INSERT into marker (dummy_field) values (null) RETURNING marker_id");
$sth2->execute();
($marker_id) = $sth2->fetchrow_array();
$sth2 = $dbh->prepare("INSERT into marker_alias (marker_id, alias, preferred) values (?, ?, ?)");
$sth2->execute($marker_id, $marker_name, 1);
print "added marker $marker_name $marker_id\n";
foreach (@alias_list) {
$key = $_ . $marker_id;
if (exists($unique_list{$key})) {
} else {
$count_add++;
$sth->execute($marker_id, $_, 0);
$alias_list{$_} = $marker_id;
}
print "added alias $_\n";
}
last;
}
}
print "$count total $count_add added\n";
$dbh->commit();
6 changes: 5 additions & 1 deletion bin/transpose_matrix.pl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
$outline[$i] = "\t" x $oldlastcol;
}
for (my $i=0; $i <=$lastcol; $i++) {
$outline[$i] .= "$line[$i]\t"
if ($outline[$i] =~ /\w/) {
$outline[$i] .= "\t$line[$i]";
} else {
$outline[$i] = $line[$i];
}
}
}
for (my $i=0; $i <= $lastcol; $i++) {
Expand Down
74 changes: 74 additions & 0 deletions db/00141/AddVcfType.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env perl


=head1 NAME
AddVcfType.pm
=head1 SYNOPSIS
mx-run ThisPackageName [options] -H hostname -D dbname -u username [-F]
This patch add new cvterm vcf_snp_dbxref to nd_protocol
=head1 DESCRIPTION
This dbpatch adds vcf_snp_dbxref to nd_protocol and nd_protocolprop to provide external links to marker_names
=head1 AUTHOR
Clay Birkett<[email protected]>
=head1 COPYRIGHT & LICENSE
Copyright 2010 Boyce Thompson Institute for Plant Research
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut


package AddVcfType;

use Moose;
use Bio::Chado::Schema;
use Try::Tiny;
extends 'CXGN::Metadata::Dbpatch';

has '+description' => ( default => <<'' );
This patch add new cvterm vcf_snp_dbxref to nd_protocol
has '+prereq' => (
default => sub {
[],
},
);

sub patch {
my $self=shift;

print STDOUT "Executing the patch:\n " . $self->name . ".\n\nDescription:\n ". $self->description . ".\n\nExecuted by:\n " . $self->username . " .";

print STDOUT "\nChecking if this db_patch was executed before or if previous db_patches have been executed.\n";

print STDOUT "\nExecuting the SQL commands.\n";

my $schema = Bio::Chado::Schema->connect( sub { $self->dbh->clone } );

my $term = 'vcf_snp_dbxref';

$schema->resultset("Cv::Cvterm")->create_with( {
name => $term,
cv => 'protocol_property', }
);


print "You're done!\n";
}


####
1; #
####
20 changes: 12 additions & 8 deletions lib/CXGN/Genotype/Search.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1613,7 +1613,7 @@ sub get_cached_file_VCF {
my $time = DateTime->now();
my $timestamp = $time->ymd()."_".$time->hms();

my @all_protocol_info_lines = ("##INFO=<ID=VCFDownload, Description='VCFv4.2 FILE GENERATED BY BREEDBASE AT ".$timestamp."'>");
my @all_protocol_info_lines;

#Get all marker information for the protocol(s) requested. this is important if they are requesting subsets of markers or if they are querying more than one protocol at once. Also important for ordering VCF output. Old genotypes did not have protocolprop marker info so markers are taken from first genotypeprop return below.
my @all_marker_objects;
Expand All @@ -1631,6 +1631,7 @@ sub get_cached_file_VCF {
push @all_protocol_info_lines, @{$protocol->header_information_lines};
push @all_marker_objects, values %$markers;
}
push @all_protocol_info_lines, "##INFO=<ID=VCFDownload,Description='VCFv4.2 FILE GENERATED BY BREEDBASE AT ".$timestamp."'>";

foreach (@all_marker_objects) {
$self->_filtered_markers()->{$_->{name}}++;
Expand Down Expand Up @@ -1660,12 +1661,12 @@ sub get_cached_file_VCF {
if ($counter == 0) {
$genotype_string .= "#CHROM\t";
foreach my $m (@all_marker_objects) {
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . " \t";
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . "\t";
}
$genotype_string .= "\n";
$genotype_string .= "POS\t";
foreach my $m (@all_marker_objects) {
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . " \t";
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . "\t";
}
$genotype_string .= "\n";
$genotype_string .= "ID\t";
Expand Down Expand Up @@ -1799,7 +1800,7 @@ sub get_cached_file_VCF {
my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
my @accession_ids = keys %unique_germplasm;
my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
my $synonym_string = "## Synonyms of accessions: ";
my $synonym_string = "##SynonymsOfAccessions=\"";
while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
if(scalar(@{$synonym_list})>0){
if(not length($synonym_string)<1){
Expand All @@ -1809,6 +1810,7 @@ sub get_cached_file_VCF {
$synonym_string.= (join ", ", @{$synonym_list}).")";
}
}
$synonym_string .= "\"";
push @all_protocol_info_lines, $synonym_string;

my $vcf_header = join "\n", @all_protocol_info_lines;
Expand Down Expand Up @@ -1906,7 +1908,7 @@ sub get_cached_file_VCF_compute_from_parents {
my $time = DateTime->now();
my $timestamp = $time->ymd()."_".$time->hms();

my @all_protocol_info_lines = ("##INFO=<ID=VCFDownload, Description='VCFv4.2 FILE GENERATED BY BREEDBASE AT ".$timestamp."'>");
my @all_protocol_info_lines;

my %unique_germplasm;
my $protocol = CXGN::Genotype::Protocol->new({
Expand All @@ -1920,6 +1922,7 @@ sub get_cached_file_VCF_compute_from_parents {
my $markers = $protocol->markers;
my @all_marker_objects = values %$markers;
push @all_protocol_info_lines, @{$protocol->header_information_lines};
push @all_protocol_info_lines, "##INFO=<ID=VCFDownload,Description='VCFv4.2 FILE GENERATED BY BREEDBASE AT ".$timestamp."'>";

foreach (@all_marker_objects) {
$self->_filtered_markers()->{$_->{name}}++;
Expand Down Expand Up @@ -1959,12 +1962,12 @@ sub get_cached_file_VCF_compute_from_parents {
if ($counter == 0) {
$genotype_string .= "#CHROM\t";
foreach my $m (@all_marker_objects) {
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . " \t";
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{chrom} . "\t";
}
$genotype_string .= "\n";
$genotype_string .= "POS\t";
foreach my $m (@all_marker_objects) {
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . " \t";
$genotype_string .= $geno->{selected_protocol_hash}->{markers}->{$m->{name}}->{pos} . "\t";
}
$genotype_string .= "\n";
$genotype_string .= "ID\t";
Expand Down Expand Up @@ -2059,7 +2062,7 @@ sub get_cached_file_VCF_compute_from_parents {
my $stocklookup = CXGN::Stock::StockLookup->new({schema => $self->bcs_schema});
my @accession_ids = keys %unique_germplasm;
my $synonym_hash = $stocklookup->get_stock_synonyms('stock_id', 'accession', \@accession_ids);
my $synonym_string = "## Synonyms of accessions: ";
my $synonym_string = "##SynonymsOfAccessions=\"";
while( my( $uniquename, $synonym_list ) = each %{$synonym_hash}){
if(scalar(@{$synonym_list})>0){
if(not length($synonym_string)<1){
Expand All @@ -2069,6 +2072,7 @@ sub get_cached_file_VCF_compute_from_parents {
$synonym_string.= (join ", ", @{$synonym_list}).")";
}
}
$synonym_string .= "\"";
push @all_protocol_info_lines, $synonym_string;

my $vcf_header = join "\n", @all_protocol_info_lines;
Expand Down
Loading

0 comments on commit cd86ef8

Please sign in to comment.