Skip to content

Commit ecd74d0

Browse files
committed
align from_csv()'s behavior on its : parameter with R's read.csv()
1 parent 5cf69bd commit ecd74d0

File tree

4 files changed

+102
-45
lines changed

4 files changed

+102
-45
lines changed

lib/Data/Frame/IO/CSV.pm

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,17 @@ Create a data frame object from a CSV file. For example,
3737
Some of the parameters are explained below,
3838
3939
=for :list
40-
* C<$file> can be a file name string, a Path::Tiny object, or an opened file
41-
handle.
42-
* C<$dtype> is a hashref associating column names to their types. Types
43-
can be the PDL type names like C<"long">, C<"double">, or names of some PDL's
40+
* C<$file>
41+
This can be a file name string, a Path::Tiny object, or an opened file handle.
42+
* C<$dtype>
43+
A hashref associating column names to their types.
44+
Types can be the PDL type names like C<"long">, C<"double">, or names of some PDL's
4445
derived class like C<"PDL::SV">, C<"PDL::Factor">, C<"PDL::DateTime">. If a
4546
column is not specified in C<$dtype>, its type would be automatically
4647
decided.
48+
* C<$na>
49+
An arrayref of strings which are to be interpreted as C<BAD> values.
50+
Blank fields are also considered to be missing value in logical and numeric fields.
4751
4852
=cut
4953

lib/Data/Frame/Util.pm

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use PDL::Factor ();
1010
use PDL::SV ();
1111
use PDL::Logical ();
1212

13-
use List::AllUtils;
13+
use List::AllUtils qw(uniq);
1414
use Scalar::Util qw(looks_like_number);
1515
use Type::Params;
1616
use Types::PDL qw(PiddleFromAny);
@@ -25,7 +25,6 @@ our @EXPORT_OK = (
2525
BAD NA
2626
ifelse is_discrete
2727
guess_and_convert_to_pdl
28-
2928
dataframe factor logical
3029
),
3130
);
@@ -56,8 +55,8 @@ sub dataframe {
5655
require Data::Frame; # to avoid circular use
5756
Data::Frame->new( columns => \@_ );
5857
}
59-
sub factor { PDL::Factor->new(@_); }
60-
sub logical { PDL::Logical->new(@_); }
58+
sub factor { PDL::Factor->new(@_); }
59+
sub logical { PDL::Logical->new(@_); }
6160

6261
=func BAD
6362
@@ -134,12 +133,18 @@ fun is_discrete (ColumnLike $x) {
134133
=cut
135134

136135
fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
137-
:$strings_as_factors=false, :$test_count=1000, :$na=[qw(BAD NA)]) {
136+
:$strings_as_factors=false, :$test_count=1000, :$na=[qw(NA BAD)]) {
138137
return $x if ( $x->$_DOES('PDL') );
139138

140139
# see utils/benchmarks/is_na.pl for why grep is used here
141-
my @na = (@$na, '');
142-
my $is_na = sub { scalar(grep { $_[0] eq $_ } @na) };
140+
my @na = @$na; # non-numerical
141+
my @na0 = ( @$na, '' ); # numerical
142+
my $is_na = sub {
143+
scalar( grep { $_[0] eq $_ } @na );
144+
};
145+
my $is_na0 = sub {
146+
scalar( grep { $_[0] eq $_ } @na0 );
147+
};
143148

144149
my $like_number;
145150
if ( !ref $x ) {
@@ -148,25 +153,43 @@ fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
148153
}
149154
else {
150155
$like_number = List::AllUtils::all {
151-
looks_like_number($_) or &$is_na($_);
156+
looks_like_number($_) or &$is_na0($_);
152157
}
153158
@$x[ 0 .. List::AllUtils::min( $test_count - 1, $#$x ) ];
154159
}
155160

156-
my $piddle;
161+
# The $na parameter is only effective for logical and numeric columns.
162+
# This is in align with R's from_csv behavior.
157163
if ($like_number) {
158-
local $SIG{__WARN__} = sub {};
159-
$piddle = pdl($x);
164+
my $piddle = do {
165+
local $SIG{__WARN__} = sub { };
166+
pdl($x);
167+
};
168+
my $isbad = pdl( [ map { &$is_na0($_) } @$x ] );
169+
return $piddle->setbadif($isbad);
160170
}
161171
else {
162-
$piddle =
163-
$strings_as_factors
164-
? PDL::Factor->new($x)
165-
: PDL::SV->new($x);
172+
my $piddle;
173+
my $isbad = pdl( [ map { &$is_na($_) } @$x ] );
174+
if ($strings_as_factors) {
175+
if ( $isbad->any ) { # remove $na from levels
176+
my $levels = [
177+
sort grep {
178+
my $s = $_;
179+
!( grep { $s eq $_ } @na )
180+
} uniq(@$x)
181+
];
182+
$piddle = PDL::Factor->new( $x, levels => $levels );
183+
}
184+
else {
185+
$piddle = PDL::Factor->new($x);
186+
}
187+
}
188+
else {
189+
$piddle = PDL::SV->new($x);
190+
}
191+
return $piddle->setbadif($isbad);
166192
}
167-
my $isbad = pdl( [ map { &$is_na($_) } @$x ] );
168-
$piddle = $piddle->setbadif($isbad);
169-
return $piddle;
170193
}
171194

172195
1;
@@ -175,5 +198,5 @@ __END__
175198
176199
=head1 DESCRIPTION
177200
178-
This module provides some utility functions used by the Data::Frame project.
201+
This module provides some utility functions used by the L<Data::Frame> project.
179202

t/26-util.t

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,32 @@ subtest factor => sub {
5656
is($f2->unpdl, [qw(1 1 2 1 0 1 0 2 2 1)], 'unpdl');
5757
};
5858

59+
subtest guess_and_convert_to_pdl => sub {
60+
my $x1 = ['foo', 'bar', '', 'NA', 'BAD'];
61+
62+
pdl_is(
63+
guess_and_convert_to_pdl($x1),
64+
PDL::SV->new( [ qw(foo bar), '', '', '' ] )
65+
->setbadif( pdl( [ 0, 0, 0, 1, 1 ] ) ),
66+
'default params on strings'
67+
);
68+
pdl_is(
69+
guess_and_convert_to_pdl( $x1, strings_as_factors => 1 ),
70+
PDL::Factor->new( [ qw(foo bar), '', '', '' ],
71+
levels => [ '', qw(bar foo) ] )
72+
->setbadif( pdl( [ 0, 0, 0, 1, 1 ] ) ),
73+
'strings_as_factors'
74+
);
75+
76+
local $Test2::Tools::PDL::TOLERANCE = 1e-8;
77+
my $x2 = [1, 2.01, -3, '', 'NA', 'BAD'];
78+
79+
pdl_is(
80+
guess_and_convert_to_pdl($x2),
81+
pdl( [ 1, 2.01, -3, 0, 0, 0 ] )
82+
->setbadif( pdl( [ 0, 0, 0, 1, 1, 1 ] ) ),
83+
'numeric'
84+
);
85+
};
86+
5987
done_testing;

t/40-csv.t

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,29 @@ use Test2::Tools::DataFrame;
1010

1111
use Data::Frame;
1212

13-
my $path_test_data = path( "$FindBin::RealBin/../data-raw" );
14-
15-
my $mtcars_csv = path( $path_test_data, 'mtcars.csv' );
16-
my $df = Data::Frame->from_csv( $mtcars_csv, row_names => 0 );
17-
ok( $df, 'Data::Frame->from_csv' );
18-
is( $df->number_of_rows, 32, 'number_of_rows()' );
19-
is( $df->number_of_columns, 11, 'number_of_columns()' );
20-
is( $df->nrow, $df->number_of_rows, 'nrow() is same as number_of_rows()' );
21-
is( $df->ncol, $df->number_of_columns,
22-
'ncol() is same as number_of_columns()' );
23-
24-
is( $df->column_names, [qw(mpg cyl disp hp drat wt qsec vs am gear carb)],
25-
'column_names()' );
26-
is( $df->column_names, $df->column_names,
27-
'column_names() is same as column_names()' );
28-
diag( $df->string );
29-
30-
my $tempfile = Path::Tiny->tempfile;
31-
$df->to_csv($tempfile);
32-
33-
my $df_recovered = Data::Frame->from_csv( $tempfile, row_names => 0 );
34-
dataframe_is( $df_recovered, $df, '$df->to_csv' );
13+
my $path_test_data = path("$FindBin::RealBin/../data-raw");
14+
15+
subtest mtcars => sub {
16+
my $mtcars_csv = path( $path_test_data, 'mtcars.csv' );
17+
my $df = Data::Frame->from_csv( $mtcars_csv, row_names => 0 );
18+
ok( $df, 'Data::Frame->from_csv' );
19+
is( $df->number_of_rows, 32, 'number_of_rows()' );
20+
is( $df->number_of_columns, 11, 'number_of_columns()' );
21+
is( $df->nrow, $df->number_of_rows, 'nrow() is same as number_of_rows()' );
22+
is( $df->ncol, $df->number_of_columns,
23+
'ncol() is same as number_of_columns()' );
24+
25+
is( $df->column_names, [qw(mpg cyl disp hp drat wt qsec vs am gear carb)],
26+
'column_names()' );
27+
is( $df->column_names, $df->column_names,
28+
'column_names() is same as column_names()' );
29+
diag( $df->string );
30+
31+
my $tempfile = Path::Tiny->tempfile;
32+
$df->to_csv($tempfile);
33+
34+
my $df_recovered = Data::Frame->from_csv( $tempfile, row_names => 0 );
35+
dataframe_is( $df_recovered, $df, '$df->to_csv' );
36+
};
3537

3638
done_testing;

0 commit comments

Comments
 (0)