Skip to content

Commit 5002ffb

Browse files
committed
Strings: added support for UTF8 offsets in regexp [WIP]
1 parent 8015d13 commit 5002ffb

File tree

5 files changed

+102
-20
lines changed

5 files changed

+102
-20
lines changed

src/Utils/Strings.php

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -474,11 +474,16 @@ public static function split(
474474
string $pattern,
475475
bool|int $captureOffset = false,
476476
bool $skipEmpty = false,
477+
bool $utf8Offset = false,
477478
): array {
478479
$flags = is_int($captureOffset) // back compatibility
479480
? $captureOffset
480481
: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);
481-
return self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]);
482+
$m = self::pcre('preg_split', [$pattern, $subject, -1, $flags | PREG_SPLIT_DELIM_CAPTURE]);
483+
if ($utf8Offset && ($flags & PREG_SPLIT_OFFSET_CAPTURE)) {
484+
return self::bytesToChars($subject, [$m])[0];
485+
}
486+
return $m;
482487
}
483488

484489

@@ -491,16 +496,24 @@ public static function match(
491496
bool|int $captureOffset = false,
492497
int $offset = 0,
493498
bool $unmatchedAsNull = false,
499+
bool $utf8Offset = false,
494500
): ?array {
495501
$flags = is_int($captureOffset) // back compatibility
496502
? $captureOffset
497503
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
504+
if ($utf8Offset) {
505+
$offset = strlen(self::substring($subject, 0, $offset));
506+
}
498507
if ($offset > strlen($subject)) {
499508
return null;
500509
}
501-
return self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
502-
? $m
503-
: null;
510+
if (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
511+
return null;
512+
}
513+
if ($utf8Offset && ($flags & PREG_OFFSET_CAPTURE)) {
514+
return self::bytesToChars($subject, [$m])[0];
515+
}
516+
return $m;
504517
}
505518

506519

@@ -515,10 +528,14 @@ public static function matchAll(
515528
int $offset = 0,
516529
bool $unmatchedAsNull = false,
517530
bool $patternOrder = false,
531+
bool $utf8Offset = false,
518532
): array {
519533
$flags = is_int($captureOffset) // back compatibility
520534
? $captureOffset
521535
: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);
536+
if ($utf8Offset) {
537+
$offset = strlen(self::substring($subject, 0, $offset));
538+
}
522539
if ($offset > strlen($subject)) {
523540
return [];
524541
}
@@ -527,6 +544,9 @@ public static function matchAll(
527544
($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
528545
$offset,
529546
]);
547+
if ($utf8Offset && ($flags & PREG_OFFSET_CAPTURE)) {
548+
return self::bytesToChars($subject, $m);
549+
}
530550
return $m;
531551
}
532552

@@ -541,12 +561,16 @@ public static function replace(
541561
int $limit = -1,
542562
bool $captureOffset = false,
543563
bool $unmatchedAsNull = false,
564+
bool $utf8Offset = false,
544565
): string {
545566
if (is_object($replacement) || is_array($replacement)) {
546567
if (!is_callable($replacement, false, $textual)) {
547568
throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
548569
}
549570
$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
571+
if ($utf8Offset && $captureOffset) {
572+
$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
573+
}
550574
return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);
551575

552576
} elseif (is_array($pattern) && is_string(key($pattern))) {
@@ -558,6 +582,22 @@ public static function replace(
558582
}
559583

560584

585+
/**
 * Converts the byte offsets reported by PCRE into character offsets.
 *
 * Every element of $groups is a list of [text, byteOffset] pairs, i.e. the
 * shape produced by the preg_* functions when offset capturing is enabled.
 * The offsets are rewritten in place on a copy and the copy is returned.
 *
 * Offsets are not required to be ascending: with PREG_PATTERN_ORDER each
 * capture group starts again near the beginning of the subject, so the
 * running byte/character cursor has to be able to move backwards as well
 * as forwards. Entries with a negative offset (PCRE reports -1 for
 * sub-patterns that did not participate in the match) are left untouched.
 *
 * @param  string  $s       subject the offsets refer to
 * @param  array   $groups  list of lists of [text, byteOffset] pairs
 * @return array   same structure with character offsets
 */
private static function bytesToChars(string $s, array $groups): array
{
    // Cursor: $lastBytes is a byte position in $s, $lastChars the matching character position.
    $lastBytes = $lastChars = 0;
    foreach ($groups as &$matches) {
        foreach ($matches as &$match) {
            $bytes = $match[1];
            if ($bytes < 0) {
                continue; // unmatched sub-pattern, keep the -1 sentinel
            }

            if ($bytes > $lastBytes) {
                // move the cursor forward, counting only the newly covered slice
                $lastChars += self::length(substr($s, $lastBytes, $bytes - $lastBytes));
            } elseif ($bytes < $lastBytes) {
                // move the cursor backward (non-monotonic offsets, e.g. PREG_PATTERN_ORDER)
                $lastChars -= self::length(substr($s, $bytes, $lastBytes - $bytes));
            }

            $lastBytes = $bytes;
            $match[1] = $lastChars;
        }
        unset($match); // break the reference before the next iteration reuses the name
    }
    unset($matches);

    return $groups;
}
599+
600+
561601
/** @internal */
562602
public static function pcre(string $func, array $args)
563603
{

tests/Utils/Strings.match().phpt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,16 @@ Assert::same(['hell', 'l'], Strings::match('hello world!', '#([e-l])+#'));
1919

2020
Assert::same(['hell'], Strings::match('hello world!', '#[e-l]+#'));
2121

22-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', PREG_OFFSET_CAPTURE));
23-
Assert::same([['hell', 0]], Strings::match('hello world!', '#[e-l]+#', captureOffset: true));
22+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', PREG_OFFSET_CAPTURE));
23+
Assert::same([['l', 2]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true));
2424

25-
Assert::same(['ll'], Strings::match('hello world!', '#[e-l]+#', 0, 2));
25+
Assert::same([['l', 1]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8Offset: true));
26+
27+
Assert::same(['l'], Strings::match('žluťoučký kůň', '#[e-l]+#u', 0, 2));
28+
29+
Assert::same(['k'], Strings::match('žluťoučký kůň', '#[e-l]+#u', utf8Offset: true, offset: 2));
30+
31+
Assert::same([['k', 7]], Strings::match('žluťoučký kůň', '#[e-l]+#u', captureOffset: true, utf8Offset: true, offset: 2));
2632

2733
Assert::null(Strings::match('hello world!', '', 0, 50));
2834
Assert::null(Strings::match('', '', 0, 1));

tests/Utils/Strings.matchAll().phpt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,29 @@ Assert::same([
4545
[['u', 3], ['u', 7], ['', 11], ['', 15]],
4646
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', PREG_OFFSET_CAPTURE | PREG_PATTERN_ORDER));
4747

48+
Assert::same([
49+
[['lu', 1], ['l', 1], ['u', 2]],
50+
[['ou', 4], ['o', 4], ['u', 5]],
51+
[['k', 7], ['k', 7], ['', 8]],
52+
[['k', 10], ['k', 10], ['', 11]],
53+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8Offset: true));
54+
4855
Assert::same([
4956
[['lu', 2], ['ou', 6], ['k', 10], ['k', 14]],
5057
[['l', 2], ['o', 6], ['k', 10], ['k', 14]],
5158
[['u', 3], ['u', 7], ['', 11], ['', 15]],
5259
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, patternOrder: true));
5360

61+
Assert::same([
62+
[['lu', 1], ['ou', 4], ['k', 7], ['k', 10]],
63+
[['l', 10], ['o', 10], ['k', 10], ['k', 10]],
64+
[['u', 10], ['u', 10], ['', 10], ['', 11]],
65+
], Strings::matchAll('žluťoučký kůň!', '#([a-z])([a-z]*)#u', captureOffset: true, utf8Offset: true, patternOrder: true));
66+
5467
Assert::same([['l'], ['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', 0, 2));
5568

69+
Assert::same([['k'], ['k']], Strings::matchAll('žluťoučký kůň', '#[e-l]+#u', utf8Offset: true, offset: 2));
70+
5671
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', PREG_PATTERN_ORDER, 2));
5772
Assert::same([['ll', 'l']], Strings::matchAll('hello world!', '#[e-l]+#', patternOrder: true, offset: 2));
5873

tests/Utils/Strings.replace().phpt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,6 @@ Assert::same('#@ @@@#d!', Strings::replace('hello world!', [
3434
]));
3535
Assert::same(' !', Strings::replace('hello world!', '#\w#'));
3636
Assert::same(' !', Strings::replace('hello world!', ['#\w#']));
37-
Assert::same('hell0o worl9d!', Strings::replace('hello world!', '#[e-l]+#', fn($m) => implode($m[0]), captureOffset: true));
37+
Assert::same('žl2uťoučk10ý k14ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true));
38+
Assert::same('žl1uťoučk7ý k10ůň!', Strings::replace('žluťoučký kůň!', '#[e-l]+#u', fn($m) => implode($m[0]), captureOffset: true, utf8Offset: true));
3839
Strings::replace('hello world!', '#e(x)*#', fn($m) => Assert::null($m[1]), unmatchedAsNull: true);

tests/Utils/Strings.split().phpt

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,37 @@ Assert::same([
3838
], Strings::split('a, b, c', '#(,)\s*#', skipEmpty: true));
3939

4040
Assert::same([
41-
['a', 0],
42-
[',', 1],
43-
['b', 3],
44-
[',', 4],
45-
['c', 6],
46-
], Strings::split('a, b, c', '#(,)\s*#', PREG_SPLIT_OFFSET_CAPTURE));
41+
['ž', 0],
42+
['lu', 2],
43+
['ť', 4],
44+
['ou', 6],
45+
['č', 8],
46+
['k', 10],
47+
['ý ', 11],
48+
['k', 14],
49+
['ůň', 15],
50+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', PREG_SPLIT_OFFSET_CAPTURE));
4751

4852
Assert::same([
49-
['a', 0],
50-
[',', 1],
51-
['b', 3],
52-
[',', 4],
53-
['c', 6],
54-
], Strings::split('a, b, c', '#(,)\s*#', captureOffset: true));
53+
['ž', 0],
54+
['lu', 2],
55+
['ť', 4],
56+
['ou', 6],
57+
['č', 8],
58+
['k', 10],
59+
['ý ', 11],
60+
['k', 14],
61+
['ůň', 15],
62+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true));
63+
64+
Assert::same([
65+
['ž', 0],
66+
['lu', 1],
67+
['ť', 3],
68+
['ou', 4],
69+
['č', 6],
70+
['k', 7],
71+
['ý ', 8],
72+
['k', 10],
73+
['ůň', 11],
74+
], Strings::split('žluťoučký kůň', '#([a-z]+)\s*#u', captureOffset: true, utf8Offset: true));

0 commit comments

Comments
 (0)