Speed up Data::Frame's read_csv a little bit

iynehz · iynehz · commit 5cf69bd79043 · 2021-11-27T21:45:23.000+08:00
diff --git a/lib/Data/Frame/Util.pm b/lib/Data/Frame/Util.pm
@@ -137,9 +137,9 @@ fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
         :$strings_as_factors=false, :$test_count=1000, :$na=[qw(BAD NA)]) {
     return $x if ( $x->$_DOES('PDL') );
 
-    my $is_na = sub {
-        length( $_[0] ) == 0 or List::AllUtils::any { $_[0] eq $_ } @$na;
-    };
+    # see utils/benchmarks/is_na.pl for why grep is used here
+    my @na = (@$na, '');
+    my $is_na = sub { scalar(grep { $_[0] eq $_ } @na) };
 
     my $like_number;
     if ( !ref $x ) {
@@ -153,21 +153,20 @@ fun guess_and_convert_to_pdl ( (ArrayRef | Value | ColumnLike) $x,
         @$x[ 0 .. List::AllUtils::min( $test_count - 1, $#$x ) ];
     }
 
+    my $piddle;
     if ($like_number) {
-        my @data   = map { &$is_na($_) ? 'nan' : $_ } @$x;
-        my $piddle = pdl( \@data );
-        $piddle->inplace->setnantobad;
-        return $piddle;
+        local $SIG{__WARN__} = sub {};
+        $piddle = pdl($x);
     }
     else {
-        my $piddle =
+        $piddle =
           $strings_as_factors
           ? PDL::Factor->new($x)
           : PDL::SV->new($x);
-        my $is_bad = pdl( [ map { &$is_na($_) } @$x ] );
-        $piddle = $piddle->setbadif($is_bad);
-        return $piddle;
     }
+    my $isbad = pdl( [ map { &$is_na($_) } @$x ] );
+    $piddle = $piddle->setbadif($isbad);
+    return $piddle;
 }
 
 1;
diff --git a/utils/benchmarks/is_na.pl b/utils/benchmarks/is_na.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/env perl
+
+# An "is_na" function is used in Data::Frame::Util's
+#  guess_and_convert_to_pdl() function.
+# 
+# The result indicates that for a short @na list, grep is likely the
+#  most practical choice for performance. It appears that the overhead of
+#  closures and Perl operators is noticeable in this case.
+
+use 5.016;
+use warnings;
+
+use List::Util qw(any);
+use Benchmark qw(:all);
+
+my @na = ( qw(NA BAD), '' );
+
+# array of 100k strings
+my @s = (qw(foo bar baz quux)) x 25000;
+
+sub is_na_any {
+    any { $_[0] eq $_ } @na;
+}
+
+sub is_na_grep {
+    scalar(grep { $_[0] eq $_ } @na);
+}
+
+my $re = qr/^(?:NA|BAD|)$/;
+
+sub is_na_regex {
+    $_[0] =~ $re;
+}
+
+sub is_na_regex2 {
+    # this is for comparison with is_na_regex() above
+    $_[0] =~ /^(?:NA|BAD|)$/;
+}
+
+cmpthese(
+    100,
+    {
+        'is_na_any' => sub {
+            my @x = map { is_na_any($_) } @s;
+        },
+        'is_na_grep' => sub {
+            my @x = map { is_na_grep($_) } @s;
+        },
+        'is_na_regex' => sub {
+            my @x = map { is_na_regex($_) } @s;
+        },
+        'is_na_regex2' => sub {
+            my @x = map { is_na_regex2($_) } @s;
+        },
+    },
+);
+