Skip to content

Commit

Permalink
Merge pull request #5329 from solgenomics/topic/fix_missing_0s_in_csv…
Browse files Browse the repository at this point in the history
…_parsing

the value zero is not stored correctly in certain phenotype uploads
  • Loading branch information
lukasmueller authored Feb 19, 2025
2 parents 498fb6b + 7773686 commit 20c98e7
Show file tree
Hide file tree
Showing 12 changed files with 851 additions and 714 deletions.
4 changes: 3 additions & 1 deletion bin/delete_nd_experiment_entries.pl
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,13 @@ =head1 AUTHOR
close($fh);

eval {
if ( scalar(@nd_experiment_ids) > 0 ) {
my $nd_experiment_ids_string = join ",", @nd_experiment_ids;
my $q = "DELETE FROM nd_experiment WHERE nd_experiment_id IN ($nd_experiment_ids_string)";
my $h = $dbh->prepare($q);
$h->execute();
print STDERR "DELETED ".scalar(@nd_experiment_ids)." Nd Experiment Entries\n";
}
print STDERR "DELETED ".scalar(@nd_experiment_ids)." Nd Experiment Entries\n";
};

if ($@) {
Expand Down
120 changes: 120 additions & 0 deletions bin/extract_pheno_zeros.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/usr/bin/perl

=head1 NAME
extract_pheno_zeros.pl - find all 0 phenotype values in archived uploads and generate a CSV file to re-upload
=head1 DESCRIPTION
extract_pheno_zeros.pl -H [database host] -D [database name] -U [database user] -P [database pass] -s [start date YYYY-MM-DD] -o [output csv file]
Options:
-H the database host
-D the database name
-U username
-P password
-s start date YYYY-MM-DD (default = 2024-06-11)
-o output .csv file
=head1 AUTHOR
David Waring <[email protected]>
=cut

use strict;
use warnings;
use DBI;
use Try::Tiny;
use Getopt::Long;
use Data::Dumper;
use CXGN::File::Parse;

# Command-line options: the database connection settings (-H, -D, -U, -P),
# the earliest upload date to scan (-s) and the CSV file to write (-o).
my ( $dbhost, $dbname, $username, $password, $date, $output );
GetOptions(
    'H=s' => \$dbhost,
    'D=s' => \$dbname,
    'U=s' => \$username,
    'P=s' => \$password,
    's=s' => \$date,
    'o=s' => \$output,
) or do {
    # GetOptions has already printed what was wrong with the arguments;
    # stop here instead of running with a silently ignored option.
    print STDERR "ERROR: Could not parse the command line options\n";
    exit 1;
};

# All four database settings are mandatory
if ( !$dbhost || !$dbname || !$username || !$password ) {
    print STDERR "ERROR: Missing either -H dbhost -D dbname -U username or -P password\n";
    exit 1;
}

# The script writes a CSV file, so ask for one
if ( !$output ) {
    print STDERR "ERROR: Missing output .csv file\n";
    exit 1;
}

# Fall back to the default start date when -s is missing or empty
if ( !$date ) {
    $date = "2024-06-11";
}

# Open a connection to the Postgres database with AutoCommit switched off
# and errors raised as exceptions.
print STDERR "Connecting to database...\n";
my $connection_string = "dbi:Pg:database=$dbname;host=$dbhost;port=5432";
my %connection_attrs = ( RaiseError => 1, AutoCommit => 0 );
my $dbh = DBI->connect( $connection_string, $username, $password, \%connection_attrs );

# Select the full path (dirname/basename) of every file recorded as a
# 'spreadsheet phenotype file' whose metadata create_date is later than the
# start date, oldest first.
my $files_query = join(
    "\n",
    "SELECT dirname || '/' || basename",
    "FROM metadata.md_files",
    "LEFT JOIN metadata.md_metadata ON (md_files.metadata_id = md_metadata.metadata_id)",
    "WHERE md_files.filetype = 'spreadsheet phenotype file' AND md_metadata.create_date > ?",
    "ORDER BY create_date ASC;"
);

# $h is read row by row by the file-checking loop further down
my $h = $dbh->prepare($files_query);
$h->execute($date);
$dbh->commit();

# Zero values found so far: $data{observation unit name}{trait column} = '0'
my %data;
# Every trait column that held at least one zero value
my %traits;
# Files that could not be parsed at all, reported once the scan is finished
my @skipped_files;

# Check each file for 0s
while ( my ($file) = $h->fetchrow_array() ) {
    print STDERR "==> Checking File: $file\n";

    # One broken archived file must not abort the whole scan: any exception
    # raised while building the parser or parsing is reported and the file
    # is skipped. (try/catch bodies are subs, so the result is returned from
    # the block and 'next' is issued afterwards.)
    my $parsed = try {
        my $parser = CXGN::File::Parse->new(
            file => $file,
            required_columns => [ 'observationunit_name' ],
            column_aliases => {
                'observationunit_name' => [ 'plot_name', 'subplot_name', 'plant_name', 'observationUnitName', 'plotName', 'subplotName', 'plantName' ]
            }
        );
        $parser->parse();
    }
    catch {
        print STDERR "WARNING: Could not parse $file: $_\n";
        undef;
    };
    if ( !$parsed ) {
        push @skipped_files, $file;
        next;
    }

    # Report problems the parser recorded instead of silently ignoring them
    my $errors = $parsed->{errors};
    if ( ref($errors) eq 'ARRAY' ) {
        foreach my $error (@$errors) {
            print STDERR "WARNING: $file: $error\n";
        }
    }

    # Every column other than the required one is treated as a trait column
    my $parsed_data = $parsed->{data} || [];
    my $trait_columns = $parsed->{optional_columns} || [];

    foreach my $row (@$parsed_data) {
        my $ou = $row->{'observationunit_name'};

        # A zero cannot be re-uploaded without the unit it belongs to
        next if !defined($ou) || $ou eq '';

        foreach my $trait (@$trait_columns) {
            my $value = $row->{$trait};

            # Only the exact string '0' is collected
            if ( defined($value) && $value eq '0' ) {
                print STDERR "$ou | $trait = 0\n";
                $traits{$trait} = 1;
                $data{$ou}{$trait} = '0';
            }
        }
    }
}

if (@skipped_files) {
    print STDERR "WARNING: Skipped ".scalar(@skipped_files)." file(s) that could not be parsed\n";
}

# Every row has been fetched and the database is not used below this point,
# so close the connection explicitly.
$dbh->disconnect();

# Generate output CSV data
#
# The result is @output: a header line followed by one line per observation
# unit, with '0' in each trait column where a zero was found and an empty
# field everywhere else.

# Fix the column order once so the header and every data line agree and the
# file is identical between runs.
my @trait_names = sort keys %traits;

# Format one CSV field: an undefined value becomes an empty field, and a
# value containing a comma, double quote or line break is wrapped in double
# quotes with embedded quotes doubled.
my $csv_field = sub {
    my ($field) = @_;
    $field = '' if !defined($field);
    if ( $field =~ /[",\r\n]/ ) {
        $field =~ s/"/""/g;
        $field = '"' . $field . '"';
    }
    return $field;
};

my @output;
push @output, join(',', map { $csv_field->($_) } 'observationunit_name', @trait_names);
foreach my $ou (sort keys %data) {
    my @line = ($ou);
    foreach my $trait (@trait_names) {
        # undefined when this unit had no zero for the trait
        push @line, $data{$ou}{$trait};
    }
    push @output, join(',', map { $csv_field->($_) } @line);
}


# Write CSV to file
# Each element of @output becomes one newline-terminated line of $output.
open my $fh, '>', $output or die "Cannot open output file: $!";
foreach my $line (@output) {
    print {$fh} "$line\n";
}
# Output is buffered, so a failed write (e.g. a full disk) may only be
# reported when the handle is closed.
close $fh or die "Cannot write output file: $!";
5 changes: 3 additions & 2 deletions lib/CXGN/File/Parse.pm
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ be treated as management factors / treatments.
use Moose;
use Try::Tiny;
use Module::Pluggable require => 1;
use Data::Dumper;

# Path to the file that is being parsed
has 'file' => (
Expand Down Expand Up @@ -391,7 +392,7 @@ sub parse {
foreach my $d (@$data) {
foreach my $c ( @{$parsed->{required_columns}} ) {
my $v = $d->{$c};
if ( !$v || $v eq '' ) {
if ( !defined($v) || $v eq '' ) {
my $r = $d->{_row};
push @{$parsed->{errors}}, "Required column $c does not have a value in row $r";
}
Expand Down Expand Up @@ -463,7 +464,7 @@ sub clean_value {
my $column_arrays = $self->column_arrays();

# trim whitespace
if ( $value && $value ne '' ) {
if ( defined($value) && $value ne '' ) {
$value =~ s/^\s+|\s+$//g;
}

Expand Down
5 changes: 4 additions & 1 deletion lib/CXGN/File/Parse/Plugin/Plain.pm
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package CXGN::File::Parse::Plugin::Plain;

use strict;

use Data::Dumper;
use CXGN::File::Parse;
use Text::CSV;

Expand Down Expand Up @@ -87,7 +90,7 @@ sub parse {
$v = $super->clean_value($v, $h);
$row_info{$h} = $v;

if ( $v && $v ne '' ) {
if ( defined($v) && $v ne '' ) {
if ( ref($v) eq 'ARRAY' ) {
if ( scalar(@$v) > 0 ) {
foreach (@$v) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ sub parse {
$cell_val = $worksheet->get_cell(0,$col)->value();
$cell_val =~ s/^\s+|\s+$//g;
}
if ($cell_val) {
if ($cell_val || $cell_val == 0) {
$header_column_info{$cell_val} = $col;
$traits_seen{$cell_val} = 1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ sub parse {
if ($trait_name) {
$traits_seen{$trait_name} = 1;
my $value_string = '';
if ($columns[$col_num]){
if ($columns[$col_num] || $columns[$col_num] == 0){
$value_string = $columns[$col_num];
}
#print STDERR $value_string."\n";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,9 @@ sub parse {
my $observationunit_name = $row->{'observationunit_name'};

for my $trait_name (@$trait_columns) {
my $value_string = $row->{$trait_name} || '';
my $value_string = defined($row->{$trait_name}) ? $row->{$trait_name} : '';
my $timestamp = '';
my $trait_value = '';
my $trait_value = undef;
if ($timestamp_included){
($trait_value, $timestamp) = split /,/, $value_string;
} else {
Expand Down
Loading

0 comments on commit 20c98e7

Please sign in to comment.