@@ -6,13 +6,32 @@ class Levenshtein
6
6
{
7
7
/**
 * Computes the Levenshtein distance between two UTF-8 strings.
 *
 * PHP's native levenshtein() compares strings byte by byte, so both inputs are
 * first passed through self::utf8_to_extended_ascii(), which replaces every
 * multibyte character with a single byte. The result is then handed to
 * levenshtein() together with the three cost arguments.
 *
 * @param string $string1         first string to compare
 * @param string $string2         second string to compare
 * @param int    $insertionCost   forwarded to levenshtein() as the insertion cost
 * @param int    $replacementCost forwarded to levenshtein() as the replacement cost
 * @param int    $deletionCost    forwarded to levenshtein() as the deletion cost
 *
 * @return int the value returned by levenshtein() for the converted strings
 *
 * @throws \InvalidArgumentException propagated from the conversion helper when the
 *                                   two strings together need more than 128 replacement bytes
 */
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1): int
{
    // A single map is shared by both conversions so that a character occurring
    // in both strings is replaced by the same byte in each of them.
    $map = [];
    $string1 = self::utf8_to_extended_ascii($string1, $map);
    $string2 = self::utf8_to_extended_ascii($string2, $map);

    return levenshtein($string1, $string2, $insertionCost, $replacementCost, $deletionCost);
}
15
+
16
/**
 * Rewrites a string so that every non-ASCII unit occupies exactly one byte.
 *
 * Each distinct multibyte sequence (a lead byte followed by its continuation
 * bytes) is assigned one byte in the range 128-255 and substituted with it.
 * Stray bytes >= 0x80 that do not belong to such a sequence are assigned a
 * byte of their own as well; left untouched they could be identical to a byte
 * already handed out for a real character and make two different inputs look
 * equal.
 *
 * @param string                $str the string to convert
 * @param array<string, string> $map non-ASCII unit => replacement byte; taken by
 *                                   reference and extended in place so several
 *                                   strings can share one consistent mapping
 *
 * @return string $str with every non-ASCII unit replaced by its mapped byte, or
 *                $str itself when it contains no byte >= 0x80
 *
 * @throws \InvalidArgumentException when a new unit is met and the map already
 *                                   holds 128 entries
 */
private static function utf8_to_extended_ascii(string $str, array &$map): string
{
    // Find all multibyte characters (cf. UTF-8 encoding specs). The second
    // alternative picks up any remaining high byte so nothing >= 0x80 survives
    // unmapped.
    $matches = [];
    if (!preg_match_all('/[\xC0-\xF7][\x80-\xBF]+|[\x80-\xFF]/', $str, $matches)) {
        return $str; // plain ascii string
    }

    // Update the encoding map with the units not already met.
    foreach ($matches[0] as $mbc) {
        if (!isset($map[$mbc])) {
            // Only bytes 128..255 are available as replacements.
            if (\count($map) >= 128) {
                throw new \InvalidArgumentException('Strings with more than 128 individual unicode characters are not supported.');
            }
            $map[$mbc] = \chr(128 + \count($map));
        }
    }

    // Finally remap non-ascii characters; strtr() prefers the longest matching
    // key, which keeps whole sequences ahead of any single-byte entry.
    return strtr($str, $map);
}
18
37
}
0 commit comments