Skip to content

Commit 5cf69bd

Browse files
committed
speed-up Data::Frame's read_csv a little bit
1 parent 6ff4106 commit 5cf69bd

File tree

2 files changed

+67
-11
lines changed

2 files changed

+67
-11
lines changed

lib/Data/Frame/Util.pm

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,9 @@ fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
137137
:$strings_as_factors=false, :$test_count=1000, :$na=[qw(BAD NA)]) {
138138
return $x if ( $x->$_DOES('PDL') );
139139

140-
my $is_na = sub {
141-
length( $_[0] ) == 0 or List::AllUtils::any { $_[0] eq $_ } @$na;
142-
};
140+
# see utils/benchmarks/is_na.pl for why grep is used here
141+
my @na = (@$na, '');
142+
my $is_na = sub { scalar(grep { $_[0] eq $_ } @na) };
143143

144144
my $like_number;
145145
if ( !ref $x ) {
@@ -153,21 +153,20 @@ fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
153153
@$x[ 0 .. List::AllUtils::min( $test_count - 1, $#$x ) ];
154154
}
155155

156+
my $piddle;
156157
if ($like_number) {
157-
my @data = map { &$is_na($_) ? 'nan' : $_ } @$x;
158-
my $piddle = pdl( \@data );
159-
$piddle->inplace->setnantobad;
160-
return $piddle;
158+
local $SIG{__WARN__} = sub {};
159+
$piddle = pdl($x);
161160
}
162161
else {
163-
my $piddle =
162+
$piddle =
164163
$strings_as_factors
165164
? PDL::Factor->new($x)
166165
: PDL::SV->new($x);
167-
my $is_bad = pdl( [ map { &$is_na($_) } @$x ] );
168-
$piddle = $piddle->setbadif($is_bad);
169-
return $piddle;
170166
}
167+
my $isbad = pdl( [ map { &$is_na($_) } @$x ] );
168+
$piddle = $piddle->setbadif($isbad);
169+
return $piddle;
171170
}
172171

173172
1;

utils/benchmarks/is_na.pl

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#!/usr/bin/env perl
2+
3+
# A "is_na" function is used in Data::Frame::Util's
4+
# guess_and_convert_to_pdl() function.
5+
#
6+
# The result indicates that for a small length of @na, grep could be the
7+
# most practical way for performance. It looks like that the overhead from
8+
# closure and Perl operators is obvious in this case.
9+
10+
use 5.016;
11+
use warnings;
12+
13+
use List::Util qw(any);
14+
use Benchmark qw(:all);
15+
16+
my @na = ( qw(NA BAD), '' );
17+
18+
# array of string with 100k data
19+
my @s = (qw(foo bar baz quux)) x 25000;
20+
21+
sub is_na_any {
22+
any { $_[0] eq $_ } @na;
23+
}
24+
25+
sub is_na_grep {
26+
scalar(grep { $_[0] eq $_ } @na);
27+
}
28+
29+
my $re = qr/^(?:NA|BAD|)$/;
30+
31+
sub is_na_regex {
32+
$_[0] =~ $re;
33+
}
34+
35+
sub is_na_regex2 {
36+
# this is for comparison with above is_na_regex()
37+
$_[0] =~ /^(?:NA|BAD|)$/;
38+
}
39+
40+
cmpthese(
41+
100,
42+
{
43+
'is_na_any' => sub {
44+
my @x = map { is_na_any($_) } @s;
45+
},
46+
'is_na_grep' => sub {
47+
my @x = map { is_na_grep($_) } @s;
48+
},
49+
'is_na_regex' => sub {
50+
my @x = map { is_na_regex($_) } @s;
51+
},
52+
'is_na_regex2' => sub {
53+
my @x = map { is_na_regex2($_) } @s;
54+
},
55+
},
56+
);
57+

0 commit comments

Comments
 (0)