Skip to content

Commit 31a60b0

Browse files
authored
Damerau–Levenshtein distance (#4)
1 parent 9b85aa8 commit 31a60b0

5 files changed

+252
-20
lines changed

src/DamerauLevenshtein.php

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex;
4+
5+
class DamerauLevenshtein
{
    /**
     * Damerau-Levenshtein distance algorithm optimized for a specified maximum.
     *
     * We only use a diagonal corridor of the full matrix, the top right and
     * bottom left area is ignored as they would always be guaranteed to reach
     * the maximum.
     *
     * For a maximum distance of 2, the matrix looks like this and the algorithm
     * would return early with a distance of 2 after calculating row d:
     *
     * ```
     *       a, c, e, d, f, g, h
     *  : 0, 1,  ,  ,  ,  ,  ,
     * a: 1, 0, 1,  ,  ,  ,  ,
     * b:  , 1, 1, 2,  ,  ,  ,
     * c:  ,  , 1, 2, 3,  ,  ,
     * d:  ,  ,  , 2, 2, 3,  ,
     * e:  ,  ,  ,  , 2, 3, 3,
     * f:  ,  ,  ,  ,  , 2, 3, 3
     * g:  ,  ,  ,  ,  ,  , 2, 3
     * ```
     *
     * Both strings are compared character by character (multibyte aware), not
     * byte by byte.
     *
     * @param string $string1           First string (rows of the matrix)
     * @param string $string2           Second string (columns of the matrix)
     * @param int    $maxDistance       Upper bound for the result; it is additionally capped
     *                                  to the character length of the longer string
     * @param int    $insertionCost     Cost added for an insertion step
     * @param int    $replacementCost   Cost added for replacing a character with a different one
     * @param int    $deletionCost      Cost added for a deletion step
     * @param int    $transpositionCost Cost added for swapping two adjacent characters
     *
     * @return int The calculated distance, or the (capped) maximum distance as soon
     *             as the algorithm knows that the maximum is reached
     */
    public static function distance(string $string1, string $string2, int $maxDistance = PHP_INT_MAX, int $insertionCost = 1, int $replacementCost = 1, int $deletionCost = 1, int $transpositionCost = 1): int
    {
        if ($string1 === $string2) {
            return 0;
        }

        // Split once up front: calling mb_substr() for every cell would rescan
        // the strings over and over again inside the inner loop.
        $chars1 = mb_str_split($string1);
        $chars2 = mb_str_split($string2);
        $string1Length = count($chars1);
        $string2Length = count($chars2);
        $lengthDifference = $string1Length - $string2Length;
        $maxDistance = min($maxDistance, max($string1Length, $string2Length));

        // floor() returns a float, cast explicitly so that all sizes, array
        // offsets and stored cell values are real integers.
        $maxDeletions = (int) floor(($maxDistance - $lengthDifference) / 2);
        $maxInsertions = (int) floor(($maxDistance + $lengthDifference) / 2);

        // Width of the diagonal corridor
        $matrixSize = 1 + $maxDeletions + $maxInsertions;

        // Length difference is too big
        if ($matrixSize <= 1 || $maxDistance <= abs($lengthDifference)) {
            return $maxDistance;
        }

        // We only store the latest two rows and flip the access between them.
        $matrix = [
            array_fill(0, $matrixSize, $maxDistance),
            array_fill(0, $matrixSize, $maxDistance),
        ];

        // First row: distance from the empty prefix of $string1
        for ($i = $maxInsertions; $i < $matrixSize; ++$i) {
            $matrix[0][$i] = $i - $maxInsertions;
        }

        for ($i = 0; $i < $string1Length; ++$i) {
            $currentRow = ($i + 1) % 2;
            $lastRow = $i % 2;
            $char1 = $chars1[$i];

            for ($j = 0; $j < $matrixSize; ++$j) {
                // Translate the corridor offset into a character index of $string2
                $col = $j - $maxInsertions + $i;

                if ($col < 0) {
                    // Left of the first character of $string2
                    $matrix[$currentRow][$j] = $i - $col;
                    continue;
                }

                if ($col >= $string2Length) {
                    // Right of the last character, this cell is never read again
                    continue;
                }

                $char2 = $chars2[$col];

                // A transposition needs a previous character in both strings.
                // The explicit $col > 0 check is required, otherwise the
                // lookup of $col - 1 would refer to a non-existing character.
                if ($i > 0 && $col > 0 && $char1 === $chars2[$col - 1] && $chars1[$i - 1] === $char2) {
                    // In this case $matrix[$currentRow][$j] refers to the value
                    // two rows above and two columns to the left in the matrix.
                    $transpositioned = $matrix[$currentRow][$j] + $transpositionCost;
                } else {
                    $transpositioned = $maxDistance;
                }

                $matrix[$currentRow][$j] = min(
                    $transpositioned,
                    ($matrix[$lastRow][$j + 1] ?? $maxDistance) + $deletionCost,
                    ($matrix[$currentRow][$j - 1] ?? $maxDistance) + $insertionCost,
                    ($matrix[$lastRow][$j] ?? $maxDistance) + ($char1 === $char2 ? 0 : $replacementCost),
                );
            }

            // Every cell of this row reached the maximum already, stop early
            if (min($matrix[$currentRow]) >= $maxDistance) {
                return $maxDistance;
            }
        }

        return $matrix[$currentRow ?? 0][$maxInsertions - $lengthDifference];
    }
}

src/DataStore/InMemoryDataStore.php

-4
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@ public function all(): array
3232

3333
public function getForStates(array $states = []): array
3434
{
35-
if ([] === $states) {
36-
return $this->data;
37-
}
38-
3935
return array_intersect_key($this->data, array_flip($states));
4036
}
4137
}

src/StateSetIndex.php

+28-8
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ public function __construct(
3232
*
3333
* @return array<string>
3434
*/
35-
public function find(string $string, int $editDistance): array
35+
public function find(string $string, int $editDistance, int $transpositionCost = 1): array
3636
{
37-
$acceptedStringsPerState = $this->findAcceptedStrings($string, $editDistance);
37+
$acceptedStringsPerState = $this->findAcceptedStrings($string, $editDistance, $transpositionCost);
3838
$stringLength = mb_strlen($string);
3939
$filtered = [];
4040

@@ -45,7 +45,7 @@ public function find(string $string, int $editDistance): array
4545
continue;
4646
}
4747

48-
if (Levenshtein::distance($string, $acceptedString) <= $editDistance) {
48+
if (DamerauLevenshtein::distance($string, $acceptedString, $editDistance + 1, 1, 1, 1, $transpositionCost) <= $editDistance) {
4949
$filtered[] = $acceptedString;
5050
}
5151
}
@@ -60,19 +60,19 @@ public function find(string $string, int $editDistance): array
6060
*
6161
* @return array<int,array<string>>
6262
*/
63-
public function findAcceptedStrings(string $string, int $editDistance): array
63+
public function findAcceptedStrings(string $string, int $editDistance, int $transpositionCost): array
6464
{
65-
return $this->dataStore->getForStates($this->findMatchingStates($string, $editDistance));
65+
return $this->dataStore->getForStates($this->findMatchingStates($string, $editDistance, $transpositionCost));
6666
}
6767

6868
/**
6969
* Returns the matching states.
7070
*
7171
* @return array<int>
7272
*/
73-
public function findMatchingStates(string $string, int $editDistance): array
73+
public function findMatchingStates(string $string, int $editDistance, int $transpositionCost): array
7474
{
75-
$cacheKey = $string . ';' . $editDistance;
75+
$cacheKey = $string . ';' . $editDistance . ';' . $transpositionCost;
7676

7777
// Seen this already, skip
7878
if (isset($this->matchingStatesCache[$cacheKey])) {
@@ -81,9 +81,12 @@ public function findMatchingStates(string $string, int $editDistance): array
8181

8282
// Initial states
8383
$states = $this->getReachableStates(0, $editDistance);
84+
$lastSubstitutions = [];
85+
$lastMappedChar = null;
8486

85-
$this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$states, $editDistance) {
87+
$this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$states, &$lastSubstitutions, &$lastMappedChar, $editDistance, $transpositionCost) {
8688
$statesStar = new CostAnnotatedStateSet(); // This is S∗ in the paper
89+
$substitutionStates = [];
8790

8891
foreach ($states->all() as $state => $cost) {
8992
$statesStarC = new CostAnnotatedStateSet(); // This is S∗c in the paper
@@ -104,6 +107,8 @@ public function findMatchingStates(string $string, int $editDistance): array
104107
} elseif ($cost + 1 <= $editDistance) {
105108
// Substitution
106109
$statesStarC->add($newState, $cost + 1);
110+
$substitutionStates[$i] ??= new CostAnnotatedStateSet();
111+
$substitutionStates[$i]->add($newState, $cost + 1);
107112
}
108113
}
109114
}
@@ -118,7 +123,22 @@ public function findMatchingStates(string $string, int $editDistance): array
118123
}
119124
}
120125

126+
// Transposition
127+
// Takes all substitution states from the previous step that matched
128+
// the current char and adds a followup substitution state using the
129+
// previous char and assigns a combined cost of $transpositionCost.
130+
foreach (($lastSubstitutions[$mappedChar] ?? null)?->all() ?? [] as $state => $cost) {
131+
$newState = (int) ($state * $this->config->getAlphabetSize() + $lastMappedChar);
132+
$statesStar = $statesStar->mergeWith($this->getReachableStates(
133+
$newState,
134+
$editDistance,
135+
$cost - 1 + $transpositionCost,
136+
));
137+
}
138+
121139
$states = $statesStar;
140+
$lastMappedChar = $mappedChar;
141+
$lastSubstitutions = $substitutionStates;
122142
});
123143

124144
return $this->matchingStatesCache[$cacheKey] = $states->states();

tests/DamerauLevenshteinTest.php

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex\Test;
4+
5+
use PHPUnit\Framework\TestCase;
6+
use Toflar\StateSetIndex\DamerauLevenshtein;
7+
use Toflar\StateSetIndex\Levenshtein;
8+
9+
class DamerauLevenshteinTest extends TestCase
{
    /**
     * The distance has to be the same regardless of the argument order, so
     * every data set is checked in both directions.
     *
     * @dataProvider distanceProvider
     */
    public function testDistance(int $expected, string $a, string $b, int $maxDistance = PHP_INT_MAX): void
    {
        foreach ([[$a, $b], [$b, $a]] as [$first, $second]) {
            $this->assertSame($expected, DamerauLevenshtein::distance($first, $second, $maxDistance));
        }
    }

    /**
     * Data sets in the form [expected distance, string a, string b, optional maximum distance].
     */
    public static function distanceProvider(): \Generator
    {
        $dataSets = [
            // Identical strings
            [0, 'abc', 'abc'],

            // Replacements
            [1, 'abcd', 'abcx'],
            [2, 'abcd', 'abxx'],
            [3, 'abcd', 'axxx'],
            [4, 'abcd', 'xxxx'],

            // Shorter second string
            [1, 'abcd', 'abc'],
            [2, 'abcd', 'ab'],
            [3, 'abcd', 'a'],
            [4, 'abcd', ''],

            // Longer second string
            [1, 'abcd', 'abcdx'],
            [2, 'abcd', 'xxabcd'],
            [3, 'abcd', 'xxxabcd'],
            [4, 'abcd', 'xxxxabcd'],
            [5, 'abcd', 'xxxxxabcd'],
            [6, 'abcd', 'xxxxxxabcd'],

            [6, 'abcd', 'xxxxxabcdx'],
            [6, 'abcd', 'xxxxabcdxx'],
            [6, 'abcd', 'xxxabcdxxx'],
            [6, 'abcd', 'xxabcdxxxx'],
            [6, 'abcd', 'xabcdxxxxx'],

            // Longer first string
            [1, 'abcdx', 'abcd'],
            [2, 'xxabcd', 'abcd'],
            [3, 'xxxabcd', 'abcd'],
            [4, 'xxxxabcd', 'abcd'],
            [5, 'xxxxxabcd', 'abcd'],
            [6, 'xxxxxxabcd', 'abcd'],

            [6, 'xxxxxabcdx', 'abcd'],
            [6, 'xxxxabcdxx', 'abcd'],
            [6, 'xxxabcdxxx', 'abcd'],
            [6, 'xxabcdxxxx', 'abcd'],
            [6, 'xabcdxxxxx', 'abcd'],

            // Single transposition at every position
            [1, 'abcdefg', 'bacdefg'],
            [1, 'abcdefg', 'acbdefg'],
            [1, 'abcdefg', 'abdcefg'],
            [1, 'abcdefg', 'abcedfg'],
            [1, 'abcdefg', 'abcdfeg'],
            [1, 'abcdefg', 'abcdegf'],

            // Transposition combined with an insertion
            [1, 'ab', 'ba'],
            [2, 'ab', 'xba'],
            [2, 'ab', 'bax'],

            // Multiple transpositions
            [2, 'abab', 'baba'],
            [2, 'abba', 'baab'],
            [3, 'abba', 'xbaab'],
            [3, 'abba', 'baabx'],
            [3, 'abab', 'baxba'],
            [3, 'abba', 'baxab'],
            [4, 'abba', 'bxaab'],
            [4, 'abba', 'baaxb'],

            // Maximum distance equal to the length difference
            [1, 'abc', 'abcd', 1],
            [2, 'abc', 'abcde', 2],
            [3, 'abc', 'abcdef', 3],
            [4, 'abc', 'abcdefg', 4],

            // Maximum distance lower than the real distance
            [3, 'aaaaaaaaaa', 'bbbbbbbbbb', 3],
            [2, 'aaaaaaaaaa', 'bbbbbbbbbb', 2],
            [1, 'aaaaaaaaaa', 'bbbbbbbbbb', 1],
            [0, 'aaaaaaaaaa', 'bbbbbbbbbb', 0],

            // Maximum distance higher than the real distance
            [1, 'xxxxxxxxxx', 'xxxxxxxxx_', 2],
            [2, 'xxxxxxxxxx', 'xxxxxxxx__', 3],
            [3, 'xxxxxxxxxx', 'xxxxxxx___', 4],

            // Long strings
            [1, str_repeat('x', 1024), str_repeat('x', 1023).'_', 2],
            [2, str_repeat('x', 1024), str_repeat('x', 1022).'__', 3],
            [3, str_repeat('x', 1024), str_repeat('x', 1021).'___', 4],

            // Empty strings
            [1, '', 'a'],
            [1, 'a', ''],

            // Multibyte characters
            [1, 'héllo', 'hello'],
            [2, 'garçonnière', 'garconniere'],
            [1, 'garçonnière', 'garçonniere'],
        ];

        foreach ($dataSets as $dataSet) {
            yield $dataSet;
        }
    }
}

tests/StateSetIndexTest.php

+25-8
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,18 @@ public function testResultsMatchResearchPaper(): void
3535

3636
$stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);
3737

38-
$this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2));
39-
$this->assertSame([1811 => ['Mueller'], 1677 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2));
40-
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2));
38+
$this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2, 2));
39+
$this->assertSame([1811 => ['Mueller'], 1677 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2, 2));
40+
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2, 2));
4141
}
4242

4343
public function testWithUtf8Alphabet(): void
4444
{
4545
$stateSetIndex = new StateSetIndex(new Config(6, 4), new Utf8Alphabet(), new InMemoryStateSet(), new InMemoryDataStore());
4646
$stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);
4747

48-
$this->assertSame([177, 710, 2710, 2843], $stateSetIndex->findMatchingStates('Mustre', 2));
49-
$this->assertSame([2710 => ['Mueller'], 2843 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2));
48+
$this->assertSame([177, 710, 2710, 2843], $stateSetIndex->findMatchingStates('Mustre', 2, 2));
49+
$this->assertSame([2710 => ['Mueller'], 2843 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2, 2));
5050
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2));
5151
}
5252

@@ -58,9 +58,26 @@ public function testAssassinCanBeFound(): void
5858
$stateSetIndex = new StateSetIndex(new Config(14, 4), new Utf8Alphabet(), new InMemoryStateSet(), new InMemoryDataStore());
5959
$stateSetIndex->index(['assassin']);
6060

61-
$this->assertSame([844, 3380, 13522, 54091], $stateSetIndex->findMatchingStates('assasin', 2));
62-
$this->assertSame([54091 => ['assassin']], $stateSetIndex->findAcceptedStrings('assasin', 2));
63-
$this->assertSame(['assassin'], $stateSetIndex->find('assasin', 2));
61+
$this->assertSame([844, 3380, 13522, 54091], $stateSetIndex->findMatchingStates('assasin', 2, 2));
62+
$this->assertSame([54091 => ['assassin']], $stateSetIndex->findAcceptedStrings('assasin', 2, 2));
63+
$this->assertSame(['assassin'], $stateSetIndex->find('assasin', 2, 2));
64+
}
65+
66+
public function testTranspositionsCanBeFound(): void
67+
{
68+
$dataStore = new InMemoryDataStore();
69+
$stateSetIndex = new StateSetIndex(new Config(14, 6), new Utf8Alphabet(), new InMemoryStateSet(), $dataStore);
70+
$stateSetIndex->index(['abcdefg']);
71+
72+
$this->assertSame([123128 => ['abcdefg']], $stateSetIndex->findAcceptedStrings('abdcefg', 1, 1));
73+
$this->assertSame([123128 => ['abcdefg']], $stateSetIndex->findAcceptedStrings('bacdegf', 2, 1));
74+
$this->assertSame([], $stateSetIndex->findAcceptedStrings('abdcefg', 0, 1));
75+
$this->assertSame([], $stateSetIndex->findAcceptedStrings('bacdegf', 1, 1));
76+
77+
$this->assertSame(['abcdefg'], $stateSetIndex->find('abdcefg', 1));
78+
$this->assertSame(['abcdefg'], $stateSetIndex->find('bacdegf', 2));
79+
$this->assertSame([], $stateSetIndex->find('abdcefg', 0));
80+
$this->assertSame([], $stateSetIndex->find('bacdegf', 1));
6481
}
6582

6683
public function testRemoveFromIndex(): void

0 commit comments

Comments
 (0)