Skip to content

Commit a14fc43

Browse files
authored
Process non ascii symbols (#6)
1 parent b2f8e09 commit a14fc43

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

src/Levenshtein.php

+24-5
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,32 @@ class Levenshtein
66
{
77
/**
 * Computes the Levenshtein distance between two UTF-8 strings.
 *
 * PHP's levenshtein() compares strings byte by byte, so a multibyte character
 * would be counted as several edits. Both inputs are therefore first passed
 * through self::utf8_to_extended_ascii(), which substitutes each multibyte
 * character with a single byte, and only then handed to levenshtein().
 *
 * @param string $string1         First string.
 * @param string $string2         Second string.
 * @param int    $insertionCost   Cost of inserting one character.
 * @param int    $replacementCost Cost of replacing one character.
 * @param int    $deletionCost    Cost of deleting one character.
 *
 * @return int The weighted edit distance between the two strings.
 *
 * @throws \InvalidArgumentException When the two strings together contain more
 *                                   than 128 distinct multibyte characters.
 */
public static function distance(string $string1, string $string2, int $insertionCost = 1, int $replacementCost = 1, int $deletionCost = 1): int
{
    // One map is shared by both conversions so that the same multibyte
    // character is replaced by the same single byte in both strings.
    $map = [];
    $string1 = self::utf8_to_extended_ascii($string1, $map);
    $string2 = self::utf8_to_extended_ascii($string2, $map);

    return levenshtein($string1, $string2, $insertionCost, $replacementCost, $deletionCost);
}
15+
16+
/**
 * Replaces every multibyte sequence of $str with a single byte in the range
 * 0x80-0xFF, so that byte-oriented functions see one byte per character.
 *
 * The substitution table is passed by reference and extended in place: a
 * sequence already present in $map keeps its byte, a new sequence receives the
 * next free byte (0x80 + current map size). Reusing the same $map across calls
 * keeps the substitution consistent between several strings.
 *
 * @param string                $str String to convert.
 * @param array<string, string> $map Multibyte sequence => replacement byte; updated in place.
 *
 * @return string $str with each multibyte sequence replaced by its mapped byte,
 *                or $str unchanged when it contains no multibyte sequence.
 *
 * @throws \InvalidArgumentException When a new sequence is met while the map
 *                                   already holds 128 entries.
 */
private static function utf8_to_extended_ascii(string $str, array &$map): string
{
    // find all multibyte characters (cf. utf-8 encoding specs):
    // one lead byte followed by one or more continuation bytes
    $matches = [];
    if (!preg_match_all('/[\xC0-\xF7][\x80-\xBF]+/', $str, $matches)) {
        return $str; // plain ascii string
    }

    // update the encoding map with the characters not already met
    foreach ($matches[0] as $mbc) {
        if (isset($map[$mbc])) {
            continue;
        }

        // only the 128 byte values 0x80-0xFF are available as substitutes
        if (\count($map) >= 128) {
            throw new \InvalidArgumentException('Strings with more than 128 individual unicode characters are not supported.');
        }

        $map[$mbc] = \chr(128 + \count($map));
    }

    // finally remap non-ascii characters
    return strtr($str, $map);
}
1837
}

tests/LevenshteinTest.php

+5
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@ public function testLevenshtein(): void
1212
$this->assertSame(1, Levenshtein::distance('hello', 'helo'));
1313
$this->assertSame(2, Levenshtein::distance('hello', 'heo'));
1414
$this->assertSame(1, Levenshtein::distance('héllo', 'hello'));
15+
$this->assertSame(2, Levenshtein::distance('Ñörbärm', 'Üörbarm'));
1516
$this->assertSame(2, Levenshtein::distance('garçonnière', 'garconniere'));
1617
$this->assertSame(1, Levenshtein::distance('garçonnière', 'garçonniere'));
18+
$this->assertSame(1, Levenshtein::distance('пожар', 'пажар'));
19+
$this->assertSame(1, Levenshtein::distance('пожар', 'пожаr'));
20+
$this->assertSame(2, Levenshtein::distance('слово', 'слива'));
21+
$this->assertNotSame(0, Levenshtein::distance('стул', 'вода'));
1722
}
1823
}

0 commit comments

Comments
 (0)