-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTokenize_a_string_with_escaping
executable file
·48 lines (35 loc) · 1.33 KB
/
Tokenize_a_string_with_escaping
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/local/bin/perl
#u# http://rosettacode.org/wiki/Tokenize_a_string_with_escaping
#c# 2018-08-10 <RC
#p# OK
use strict;
use warnings;
use feature 'say';
my($result1,$result2);
# The built-in split function can be used with a regex that matches the delimiter (although advanced backtracking control verbs are needed to skip escaped delimiters):
# tokenize1($string, $sep, $esc)
#
# Split $string on every occurrence of the separator $sep that is not
# preceded by the escape sequence $esc, then remove one level of escaping
# from each resulting field.
#
#   $string - text to tokenize
#   $sep    - separator, taken literally (quotemeta is applied)
#   $esc    - escape sequence, taken literally (quotemeta is applied)
#
# Returns the list of unescaped fields. Empty fields, including trailing
# ones, are kept, and an empty $string yields a single empty field.
sub tokenize1 {
    my ($string, $sep, $esc) = (shift, quotemeta shift, quotemeta shift);

    # split() produces zero fields for an empty string regardless of LIMIT,
    # so handle that case explicitly: no separators still means one field.
    my @fields = length($string)
        ? split(
            # First alternative: consume an escape plus the character it
            # protects, then (*SKIP)(*FAIL) rejects the match and stops the
            # engine from retrying inside the consumed pair. Only an
            # unescaped separator can therefore match the second alternative.
            /$esc . (*SKIP)(*FAIL) | $sep/sx,
            $string,
            -1,    # negative LIMIT keeps trailing empty fields
          )
        : ('');

    # Drop each escape sequence, keeping the character that followed it.
    return map { s/$esc(.)/$1/gsr } @fields;
}
# A more traditional approach is to parse the input string step by step (using a repeatedly-matching regex of the form /\G.../g), and throw away the separators (which can be done implicitly using \K):
# Sample input passed to both tokenizers below, with '|' as the separator and
# '^' as the escape character.
my $string = 'one^|uno||three^^^^|four^^^|^cuatro|';
# tokenize2($string, $sep, $esc)
#
# Scan $string from left to right with a repeatedly-matching \G regex,
# collecting one field per match, then remove one level of escaping from
# each field.
#
#   $string - text to tokenize
#   $sep    - separator, taken literally (quotemeta is applied); may be
#             longer than one character
#   $esc    - escape sequence, taken literally (quotemeta is applied)
#
# Returns the list of unescaped fields; empty fields are kept. An escape
# with nothing after it at the very end of the input is kept literally.
sub tokenize2 {
    my ($string, $sep, $esc) = (shift, quotemeta shift, quotemeta shift);

    my @fields = $string =~ m{
        \G (?: ^ | $sep )      # resume where the last field ended: at the
                               # start of the string or right after a separator
        \K                     # ...and leave that separator out of the match
        (?:
            $esc .             # an escape together with the character it protects
          |
            (?! $sep ) .       # or any single character that does not begin a
                               # separator (a lookahead, unlike a character
                               # class, also works for multi-character separators)
        )*
    }gsx;

    # Drop each escape sequence, keeping the character that followed it.
    return map { s/$esc(.)/$1/gsr } @fields;
}
# In both cases, stripping the escape characters happens as a separate step.
# Render every token on its own line, wrapped in single quotes, so that empty
# fields remain visible when the output is compared with the reference text.
$result1 .= "'$_'\n" for tokenize1($string, '|', '^');
$result2 .= "'$_'\n" for tokenize2($string, '|', '^');

# Expected rendering of the sample input; both implementations must produce it.
my $ref = <<'EOD';
'one|uno'
''
'three^^'
'four^|cuatro'
''
EOD

use Test::More;

# Each check carries a description so a failure names the implementation
# that produced the wrong output.
is($result1, $ref, 'tokenize1 (split-based) produces the reference output');
is($result2, $ref, 'tokenize2 (regex-scanning) produces the reference output');
done_testing;