Skip to content

Commit cbf92f7

Browse files
authored
Further improvements and fixes for DamerauLevenshtein (#10)
1 parent b9b89b4 commit cbf92f7

File tree

2 files changed

+106
-24
lines changed

2 files changed

+106
-24
lines changed

src/DamerauLevenshtein.php

+44-18
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ class DamerauLevenshtein
2828
*/
2929
public static function distance(string $string1, string $string2, int $maxDistance = PHP_INT_MAX, int $insertionCost = 1, int $replacementCost = 1, int $deletionCost = 1, int $transpositionCost = 1): int
3030
{
31-
if ($string1 === $string2) {
32-
return 0;
31+
if ($insertionCost < 1 || $replacementCost < 1 || $deletionCost < 1 || $transpositionCost < 1) {
32+
throw new \InvalidArgumentException('Cost values below 1 are not supported');
3333
}
3434

3535
// Strip common prefix
@@ -41,62 +41,87 @@ public static function distance(string $string1, string $string2, int $maxDistan
4141

4242
// Strip common suffix
4343
$xorRight = substr($string1, -\strlen($string2)) ^ substr($string2, -\strlen($string1));
44-
if (\strlen($string1) === \strlen($string2) && $commonSuffixLength = \strlen($xorRight) - \strlen(rtrim($xorRight, "\0"))) {
44+
if ($commonSuffixLength = \strlen($xorRight) - \strlen(rtrim($xorRight, "\0"))) {
4545
$suffix = mb_strcut($string1, -$commonSuffixLength);
4646
if (\strlen($suffix) > $commonSuffixLength) {
4747
$suffix = mb_substr($suffix, 1);
4848
}
49-
$string1 = substr($string1, 0, -\strlen($suffix));
50-
$string2 = substr($string2, 0, -\strlen($suffix));
49+
$string1 = substr($string1, 0, -\strlen($suffix) ?: null);
50+
$string2 = substr($string2, 0, -\strlen($suffix) ?: null);
5151
}
5252

5353
$chars1 = mb_str_split($string1);
5454
$chars2 = mb_str_split($string2);
55-
5655
$string1Length = \count($chars1);
5756
$string2Length = \count($chars2);
58-
$maxLength = max($string1Length, $string2Length);
59-
$maxDistance = min($maxDistance, $maxLength);
6057

61-
$maxDeletions = floor(($maxDistance - ($string1Length - $string2Length)) / 2);
62-
$maxInsertions = floor(($maxDistance + ($string1Length - $string2Length)) / 2);
58+
if ($string1Length === 0) {
59+
return min($maxDistance, $string2Length * $insertionCost);
60+
}
6361

64-
$matrixSize = 1 + $maxDeletions + $maxInsertions;
62+
if ($string2Length === 0) {
63+
return min($maxDistance, $string1Length * $deletionCost);
64+
}
65+
66+
// Distance can never be higher than deleting string1 and inserting string2
67+
$maxDistance = min($maxDistance, $string1Length * $deletionCost + $string2Length * $insertionCost);
68+
69+
$requiredDeletions = max(0, $string1Length - $string2Length);
70+
$requiredInsertions = max(0, $string2Length - $string1Length);
71+
72+
// Distance required to bring both strings to the same length
73+
$lengthDistance = $requiredInsertions * $insertionCost + $requiredDeletions * $deletionCost;
6574

6675
// Length difference is too big
67-
if ($matrixSize <= 1 || $maxDistance <= abs($string1Length - $string2Length)) {
76+
if ($maxDistance <= $lengthDistance) {
6877
return $maxDistance;
6978
}
7079

80+
// After length correction, the maximum number of deletion/insertion pairs that still fit within the remaining distance budget
81+
$maxDeletionInsertionPairs = max(0, floor(($maxDistance - $lengthDistance) / ($deletionCost + $insertionCost)));
82+
83+
$maxDeletions = $requiredDeletions + $maxDeletionInsertionPairs;
84+
$maxInsertions = $requiredInsertions + $maxDeletionInsertionPairs;
85+
$matrixSize = 1 + $maxDeletions + $maxInsertions;
86+
7187
// We only store the latest two rows and flip the access between them.
7288
$matrix = [
7389
array_fill(0, $matrixSize, $maxDistance),
7490
array_fill(0, $matrixSize, $maxDistance),
7591
];
7692

77-
for ($i = $maxInsertions; $i < $matrixSize; ++$i) {
78-
$matrix[0][$i] = $i - $maxInsertions;
93+
// Fill the row before the first one starting with 0
94+
for ($i = $maxDeletions; $i < $matrixSize; ++$i) {
95+
$matrix[0][$i] = ($i - $maxDeletions) * $insertionCost;
7996
}
8097

98+
// Iterate through string1 (rows)
8199
for ($i = 0; $i < $string1Length; ++$i) {
82100
$currentRow = ($i + 1) % 2;
83101
$lastRow = $i % 2;
102+
103+
// Iterate through string2 (columns)
84104
for ($j = 0; $j < $matrixSize; ++$j) {
85-
$col = $j - $maxInsertions + $i;
105+
$col = $j - $maxDeletions + $i;
106+
107+
// Fill the column before the first one starting with 0
86108
if ($col < 0) {
87-
$matrix[$currentRow][$j] = $i - $col;
109+
$matrix[$currentRow][$j] = ($i - $col) * $deletionCost;
88110
continue;
89111
}
112+
90113
if ($col >= $string2Length) {
91114
continue;
92115
}
116+
93117
if ($i && ($chars1[$i] ?? '') === ($chars2[$col - 1] ?? '') && ($chars1[$i - 1] ?? '') === ($chars2[$col] ?? '')) {
94118
// In this case $matrix[$currentRow][$j] refers to the value
95119
// two rows above and two columns to the left in the matrix.
96120
$transpositioned = $matrix[$currentRow][$j] + $transpositionCost;
97121
} else {
98122
$transpositioned = $maxDistance;
99123
}
124+
100125
$matrix[$currentRow][$j] = min(
101126
$transpositioned,
102127
($matrix[$lastRow][$j + 1] ?? $maxDistance) + $deletionCost,
@@ -105,11 +130,12 @@ public static function distance(string $string1, string $string2, int $maxDistan
105130
);
106131
}
107132

108-
if (min($matrix[$currentRow]) >= $maxDistance) {
133+
if (min($matrix[$currentRow]) >= $maxDistance && min($matrix[$lastRow]) + $transpositionCost >= $maxDistance) {
109134
return $maxDistance;
110135
}
111136
}
112137

113-
return min($maxDistance, $matrix[$currentRow ?? 0][$maxInsertions - ($string1Length - $string2Length)]);
138+
// Return the distance value found in the last row in the last column
139+
return min($maxDistance, $matrix[$currentRow ?? 0][$maxDeletions - ($string1Length - $string2Length)]);
114140
}
115141
}

tests/DamerauLevenshteinTest.php

+62-6
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,11 @@ public function testDistance(int $expected, string $a, string $b, int $maxDistan
2424
$this->assertSame($expected - 1, DamerauLevenshtein::distance($a, $b, $expected - 1));
2525
$this->assertSame($expected - 1, DamerauLevenshtein::distance($b, $a, $expected - 1));
2626
}
27+
28+
$this->assertSame($expected * 2, DamerauLevenshtein::distance($a, $b, $maxDistance === PHP_INT_MAX ? PHP_INT_MAX : $maxDistance * 2, 2, 2, 2, 2));
29+
$this->assertSame($expected * 2, DamerauLevenshtein::distance($b, $a, $maxDistance === PHP_INT_MAX ? PHP_INT_MAX : $maxDistance * 2, 2, 2, 2, 2));
30+
$this->assertSame($expected * 4, DamerauLevenshtein::distance($a, $b, $maxDistance === PHP_INT_MAX ? PHP_INT_MAX : $maxDistance * 4, 4, 4, 4, 4));
31+
$this->assertSame($expected * 4, DamerauLevenshtein::distance($b, $a, $maxDistance === PHP_INT_MAX ? PHP_INT_MAX : $maxDistance * 4, 4, 4, 4, 4));
2732
}
2833

2934
public static function distanceProvider(): \Generator {
@@ -123,12 +128,63 @@ public static function distanceProvider(): \Generator {
123128
yield [4, 'стул', 'вода'];
124129

125130
yield [1, 'aaäaa', 'aaöaa'];
126-
yield [1, 'prefix\xF0\x9F\x92\xA9', 'prefix\xF0\x9F\x92\xAF'];
127-
yield [1, 'prefix\xF0\x9F\x92\xA9', 'prefix\xF0\x9F\x93\xA9'];
128-
yield [1, '\xF0\x9F\x92\xA9suffix', '\xF0\x9F\x92\xAFsuffix'];
129-
yield [1, '\xF0\x9F\x92\xA9suffix', '\xF0\x9F\x93\xA9suffix'];
130-
yield [1, 'prefix\xF0\x9F\x92\xA9suffix', 'prefix\xF0\x9F\x92\xAFsuffix'];
131-
yield [1, 'prefix\xF0\x9F\x92\xA9suffix', 'prefix\xF0\x9F\x93\xA9suffix'];
131+
yield [1, "prefix\xF0\x9F\x92\xA9", "prefix\xF0\x9F\x92\xAF"];
132+
yield [1, "prefix\xF0\x9F\x92\xA9", "prefix\xF0\x9F\x93\xA9"];
133+
yield [1, "\xF0\x9F\x92\xA9suffix", "\xF0\x9F\x92\xAFsuffix"];
134+
yield [1, "\xF0\x9F\x92\xA9suffix", "\xF0\x9F\x93\xA9suffix"];
135+
yield [1, "prefix\xF0\x9F\x92\xA9suffix", "prefix\xF0\x9F\x92\xAFsuffix"];
136+
yield [1, "prefix\xF0\x9F\x92\xA9suffix", "prefix\xF0\x9F\x93\xA9suffix"];
137+
138+
}
139+
140+
/**
141+
* @dataProvider costsProvider
142+
*/
143+
public function testDifferentCosts(int $expected, string $a, string $b, int $insertionCost = 1, int $replacementCost = 1, int $deletionCost = 1, int $transpositionCost = 1): void
144+
{
145+
$this->assertSame($expected, DamerauLevenshtein::distance($a, $b, PHP_INT_MAX, $insertionCost, $replacementCost, $deletionCost, $transpositionCost));
146+
$this->assertSame($expected, DamerauLevenshtein::distance($b, $a, PHP_INT_MAX, $deletionCost, $replacementCost, $insertionCost, $transpositionCost));
147+
$this->assertSame($expected, DamerauLevenshtein::distance($a, $b, $expected, $insertionCost, $replacementCost, $deletionCost, $transpositionCost));
148+
$this->assertSame($expected, DamerauLevenshtein::distance($b, $a, $expected, $deletionCost, $replacementCost, $insertionCost, $transpositionCost));
149+
$this->assertSame($expected, DamerauLevenshtein::distance($a, $b, $expected + 1, $insertionCost, $replacementCost, $deletionCost, $transpositionCost));
150+
$this->assertSame($expected, DamerauLevenshtein::distance($b, $a, $expected + 1, $deletionCost, $replacementCost, $insertionCost, $transpositionCost));
151+
152+
if ($expected > 0) {
153+
$this->assertSame($expected - 1, DamerauLevenshtein::distance($a, $b, $expected - 1, $insertionCost, $replacementCost, $deletionCost, $transpositionCost));
154+
$this->assertSame($expected - 1, DamerauLevenshtein::distance($b, $a, $expected - 1, $deletionCost, $replacementCost, $insertionCost, $transpositionCost));
155+
}
156+
}
132157

158+
public static function costsProvider(): \Generator
159+
{
160+
yield [7, 'abc', 'bcd', 3, 8, 4];
161+
yield [3, 'abc', 'bcd', 2, 1, 3];
162+
yield [4, 'abcd', 'acbd', 1, 2, 3, 4];
163+
yield [4, 'abcd', 'acbd', 2, 2, 3, 5];
164+
yield [4, 'abcd', 'acbd', 1, 3, 3, 5];
165+
yield [4, 'abcd', 'acbd', 2, 3, 3, 4];
166+
yield [5, 'abcd', 'acbd', 2, 3, 3, 5];
167+
yield [5, 'abcd', 'acbd', 2, 3, 3, 6];
168+
yield [6, 'abcd', 'acbd', 2, 3, 4, 6];
169+
yield [1, 'abcd', 'abcde', 1, 2, 2, 2];
170+
yield [1, 'abcd', 'abcde', 1, 99, 99, 99];
171+
yield [1, 'abcd', 'aXcd', 2, 1, 2, 2];
172+
yield [1, 'abcd', 'aXcd', 99, 1, 99, 99];
173+
yield [1, 'abcd', 'abc', 2, 2, 1, 2];
174+
yield [1, 'abcd', 'abc', 99, 99, 1, 99];
175+
yield [1, 'abcd', 'acbd', 2, 2, 2, 1];
176+
yield [1, 'abcd', 'acbd', 99, 99, 99, 1];
177+
yield [2, 'abcd', 'abcde', 2, 3, 3, 3];
178+
yield [2, 'abcd', 'abcde', 2, 99, 99, 99];
179+
yield [2, 'abcd', 'aXcd', 3, 2, 3, 3];
180+
yield [2, 'abcd', 'aXcd', 99, 2, 99, 99];
181+
yield [2, 'abcd', 'abc', 3, 3, 2, 3];
182+
yield [2, 'abcd', 'abc', 99, 99, 2, 99];
183+
yield [2, 'abcd', 'acbd', 3, 3, 3, 2];
184+
yield [2, 'abcd', 'acbd', 99, 99, 99, 2];
185+
yield [13, 'aaaa', 'bbbbb', 1, 99, 2, 99];
186+
yield [14, 'aaaaa', 'bbbb', 1, 99, 2, 99];
187+
yield [14, 'aaaa', 'bbbbb', 2, 99, 1, 99];
188+
yield [13, 'aaaaa', 'bbbb', 2, 99, 1, 99];
133189
}
134190
}

0 commit comments

Comments
 (0)