Skip to content

Commit 7ff7662

Browse files
authored
Version 2 (#1)
* Added an Utf8Alphabet implementation * Completely revamped implementation. Dropped hierarchical logic to reduce storage requirements * Added tests for the Levenshtein implementation * Fixed Utf8Alphabet * Added support for Damerau-Levenshtein * Fixed algorithm * Removed Damerau-Levenshtein attempt * CS
1 parent a4277c0 commit 7ff7662

16 files changed

+309
-191
lines changed

README.md

+15-8
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ composer require toflar/state-set-index
2121
```php
2222
namespace App;
2323

24-
use Toflar\StateSetIndex\Alphabet\InMemoryAlphabet;
24+
use Toflar\StateSetIndex\Alphabet\Utf8Alphabet;
25+
use Toflar\StateSetIndex\DataStore\InMemoryDataStore;
2526
use Toflar\StateSetIndex\StateSet\InMemoryStateSet;
2627
use Toflar\StateSetIndex\StateSetIndex;
2728

2829
$stateSetIndex = new StateSetIndex(
2930
new Config(6, 4),
30-
new InMemoryAlphabet(),
31-
new InMemoryStateSet()
31+
new Utf8Alphabet(),
32+
new InMemoryStateSet(),
33+
new InMemoryDataStore()
3234
);
3335

3436
$stateSetIndex->index(['Mueller', 'Müller', 'Muentner', 'Muster', 'Mustermann']);
@@ -44,15 +46,20 @@ you want to index and or search.
4446
## Customization
4547

4648
This library ships with the algorithm readily prepared for you to use. The main customization areas will be
47-
the alphabet (both the way it maps characters to labels) as well as the state set storage, if you want to make the index
49+
the alphabet (both the way it maps characters to labels) and the state set storage, if you want to make the index
4850
persistent. Hence, the following interfaces allow you to implement your own logic:
4951

5052
* The `AlphabetInterface` is very straightforward. It only consists of a `map(string $char, int $alphabetSize)` method
5153
which the library needs to map characters to an internal label. Whether you load/store the alphabet in some
52-
database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases.
53-
* The `StateSetInterface` is more complex but is essentially responsible to load and store information about the
54-
state set of your index. Again, whether you load/store the state set in some
55-
database is up to you. The library ships with an `InMemoryStateSet` for reference and simple use cases.
54+
database is up to you. The library ships with an `InMemoryAlphabet` for reference and simple use cases. You don't
55+
even need to store the alphabet as we already have one with the UTF-8 codepoints; that's what `Utf8Alphabet` is
56+
for. In case you don't want to customize the labels, use `Utf8Alphabet`.
57+
* The `StateSetInterface` is responsible for loading and storing information about the state set of your index. Again,
58+
how you load/store the state set in some database is up to you. The library ships with an `InMemoryStateSet`
59+
for reference and simple use cases and tests.
60+
* The `DataStoreInterface` is responsible for storing the string you index alongside its assigned state. Sometimes
61+
you want to completely customize storage, in which case you can use the `NullDataStore` and only use the
62+
assignments you get as a return value from calling `$stateSetIndex->index()`.
5663

5764
You can not only ask for the final matching results using `$stateSetIndex->findMatchingStates('Mustre', 2)` which is
5865
already filtered using a multibyte implementation of the Levenshtein algorithm, but you can also access intermediary

ecs.php

+6-3
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
declare(strict_types=1);
44

5-
use PhpCsFixer\Fixer\Import\NoUnusedImportsFixer;
6-
use PhpCsFixer\Fixer\Phpdoc\NoSuperfluousPhpdocTagsFixer;
5+
use PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer;
6+
use PhpCsFixer\Fixer\FunctionNotation\NativeFunctionInvocationFixer;
7+
use PhpCsFixer\Fixer\Operator\NotOperatorWithSuccessorSpaceFixer;
78
use Symplify\EasyCodingStandard\Config\ECSConfig;
89
use Symplify\EasyCodingStandard\ValueObject\Set\SetList;
910

@@ -23,5 +24,7 @@
2324
]);
2425

2526
// Always move private elements to the bottom
26-
$ecsConfig->rule(\PhpCsFixer\Fixer\ClassNotation\OrderedClassElementsFixer::class);
27+
$ecsConfig->ruleWithConfiguration(OrderedClassElementsFixer::class, ['sort_algorithm' => 'alpha']);
28+
$ecsConfig->rule(NativeFunctionInvocationFixer::class);
29+
$ecsConfig->skip([NotOperatorWithSuccessorSpaceFixer::class]);
2730
};

src/Alphabet/InMemoryAlphabet.php

+9-9
Original file line numberDiff line numberDiff line change
@@ -14,31 +14,31 @@ public function __construct(
1414
) {
1515
}
1616

17-
public function all(): array
17+
public function add(string $char, int $label): self
1818
{
19-
return $this->alphabet;
19+
$this->alphabet[$char] = $label;
20+
21+
return $this;
2022
}
2123

22-
public function has(string $char): bool
24+
public function all(): array
2325
{
24-
return isset($this->alphabet[$char]);
26+
return $this->alphabet;
2527
}
2628

2729
public function count(): int
2830
{
29-
return count($this->alphabet);
31+
return \count($this->alphabet);
3032
}
3133

3234
public function get(string $char): ?int
3335
{
3436
return $this->alphabet[$char] ?? null;
3537
}
3638

37-
public function add(string $char, int $label): self
39+
public function has(string $char): bool
3840
{
39-
$this->alphabet[$char] = $label;
40-
41-
return $this;
41+
return isset($this->alphabet[$char]);
4242
}
4343

4444
public function map(string $char, int $alphabetSize): int

src/Alphabet/Utf8Alphabet.php

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex\Alphabet;
4+
5+
class Utf8Alphabet implements AlphabetInterface
6+
{
7+
/**
8+
* @var array<int, array<string, int>>
9+
*/
10+
private array $cache = [];
11+
12+
public function map(string $char, int $alphabetSize): int
13+
{
14+
if (!isset($this->cache[$alphabetSize][$char])) {
15+
// +1 in order to never assign 0
16+
$this->cache[$alphabetSize][$char] = (mb_ord($char, 'UTF-8') % $alphabetSize) + 1;
17+
}
18+
19+
return $this->cache[$alphabetSize][$char];
20+
}
21+
}

src/Config.php

+4-4
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ public function __construct(
1010
) {
1111
}
1212

13-
public function getIndexLength(): int
13+
public function getAlphabetSize(): int
1414
{
15-
return $this->indexLength;
15+
return $this->alphabetSize;
1616
}
1717

18-
public function getAlphabetSize(): int
18+
public function getIndexLength(): int
1919
{
20-
return $this->alphabetSize;
20+
return $this->indexLength;
2121
}
2222
}

src/DataStore/DataStoreInterface.php

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex\DataStore;
4+
5+
interface DataStoreInterface
6+
{
7+
public function add(int $state, string $string): void;
8+
9+
/**
10+
* Returns the matching strings per state. Key is the state and the value is an array of matching strings
11+
* for that state.
12+
*
13+
* @return array<int,array<string>>
14+
*/
15+
public function getForStates(array $states = []): array;
16+
}

src/DataStore/InMemoryDataStore.php

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex\DataStore;
4+
5+
class InMemoryDataStore implements DataStoreInterface
6+
{
7+
/**
8+
* @var array<int, array<string>>
9+
*/
10+
private array $data = [];
11+
12+
public function add(int $state, string $string): void
13+
{
14+
$this->data[$state][] = $string;
15+
}
16+
17+
public function all(): array
18+
{
19+
return $this->data;
20+
}
21+
22+
public function getForStates(array $states = []): array
23+
{
24+
if ([] === $states) {
25+
return $this->data;
26+
}
27+
28+
return array_intersect_key($this->data, array_flip($states));
29+
}
30+
}

src/DataStore/NullDataStore.php

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?php
2+
3+
namespace Toflar\StateSetIndex\DataStore;
4+
5+
class NullDataStore implements DataStoreInterface
6+
{
7+
public function add(int $state, string $string): void
8+
{
9+
// noop
10+
}
11+
12+
public function getForStates(array $states = []): array
13+
{
14+
return [];
15+
}
16+
}

src/Levenshtein.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
class Levenshtein
66
{
7-
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1)
7+
public static function distance(string $string1, string $string2, int $insertionCost = 1, $replacementCost = 1, $deletionCost = 1): int
88
{
99
$string1 = mb_convert_encoding($string1, 'ASCII', 'utf8');
1010
$string2 = mb_convert_encoding($string2, 'ASCII', 'utf8');

src/StateSet/CostAnnotatedStateSet.php

+18-16
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,13 @@ class CostAnnotatedStateSet
1111
*/
1212
private array $set = [];
1313

14-
/**
15-
* Key: State
16-
* Value: Cost
17-
* @return array<int, int>
18-
*/
19-
public function all(): array
20-
{
21-
return $this->set;
22-
}
23-
24-
public function states(): array
25-
{
26-
return array_values(array_keys($this->set));
27-
}
28-
2914
/**
3015
* Adds a state with a cost to this set.
3116
* If this set already contains the given state with a higher cost, replaces it.
3217
*/
3318
public function add(int $state, int $cost): void
3419
{
35-
if (! isset($this->set[$state])) {
20+
if (!isset($this->set[$state])) {
3621
$this->set[$state] = $cost;
3722
return;
3823
}
@@ -43,6 +28,16 @@ public function add(int $state, int $cost): void
4328
}
4429
}
4530

31+
/**
32+
* Key: State
33+
* Value: Cost
34+
* @return array<int, int>
35+
*/
36+
public function all(): array
37+
{
38+
return $this->set;
39+
}
40+
4641
public function mergeWith(CostAnnotatedStateSet $stateSet): self
4742
{
4843
$clone = clone $this;
@@ -53,4 +48,11 @@ public function mergeWith(CostAnnotatedStateSet $stateSet): self
5348

5449
return $clone;
5550
}
51+
52+
public function states(): array
53+
{
54+
$states = array_values(array_keys($this->set));
55+
sort($states);
56+
return $states;
57+
}
5658
}

src/StateSet/InMemoryStateSet.php

+9-61
Original file line numberDiff line numberDiff line change
@@ -5,77 +5,25 @@
55
class InMemoryStateSet implements StateSetInterface
66
{
77
/**
8-
* Key: State
9-
* Value: array<parent,mappedChar>
10-
*
11-
* @var array<int, array<int,int>>
8+
* @param array<int, bool> $states
129
*/
13-
private array $states = [];
14-
15-
/**
16-
* @var array<int, array<int>>
17-
*/
18-
private array $children = [];
19-
20-
/**
21-
* Key: State
22-
* Value: Mapped char
23-
* @var array<int, int>
24-
*/
25-
private array $mappedChars = [];
26-
27-
/**
28-
* Key: State
29-
* Value: Matching strings
30-
* @var array<int, array<string>>
31-
*/
32-
private array $acceptedStrings = [];
10+
public function __construct(
11+
private array $states = []
12+
) {
13+
}
3314

34-
public function add(int $state, int $parentState, int $mappedChar): self
15+
public function add(int $state): void
3516
{
36-
$this->states[$state] = [$parentState, $mappedChar];
37-
$this->mappedChars[$state] = $mappedChar;
38-
$this->children[$parentState][$state] = true;
39-
40-
return $this;
17+
$this->states[$state] = true;
4118
}
4219

4320
public function all(): array
4421
{
4522
return $this->states;
4623
}
4724

48-
public function getChildrenOfState(int $state): array
49-
{
50-
if (! isset($this->children[$state])) {
51-
return [];
52-
}
53-
54-
return array_keys($this->children[$state]);
55-
}
56-
57-
public function getCharForState(int $state): int
58-
{
59-
if (! isset($this->mappedChars[$state])) {
60-
throw new \LogicException('No mapped char for state. Check your alphabet!');
61-
}
62-
63-
return $this->mappedChars[$state];
64-
}
65-
66-
public function acceptString(int $state, string $string): self
67-
{
68-
$this->acceptedStrings[$state][] = $string;
69-
70-
return $this;
71-
}
72-
73-
public function getAcceptedStrings(array $matchingStates = []): array
25+
public function has(int $state): bool
7426
{
75-
if ([] === $matchingStates) {
76-
return $this->acceptedStrings;
77-
}
78-
79-
return array_intersect_key($this->acceptedStrings, array_flip($matchingStates));
27+
return isset($this->states[$state]);
8028
}
8129
}

src/StateSet/StateSetInterface.php

+4-14
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,12 @@
44

55
interface StateSetInterface
66
{
7-
public function add(int $state, int $parentState, int $mappedChar): self;
8-
9-
public function getChildrenOfState(int $state): array;
10-
11-
public function getCharForState(int $state): int;
7+
public function add(int $state): void;
128

139
/**
14-
* Accept a string with a given state.
10+
* @return array<int>
1511
*/
16-
public function acceptString(int $state, string $string): self;
12+
public function all(): array;
1713

18-
/**
19-
* Returns the matching strings per state. Key is the state and the value is an array of matching strings
20-
* for that state. If no argument is passed, the entire accepted strings dataset is returned.
21-
*
22-
* @return array<int,array<string>>
23-
*/
24-
public function getAcceptedStrings(array $matchingStates = []): array;
14+
public function has(int $state): bool;
2515
}

0 commit comments

Comments
 (0)