
Commit 80251a8

Initial commit of FuzzyMatching complete example
Signed-off-by: Dan S. Camper <[email protected]>
1 parent a3fec34 commit 80251a8

15 files changed: +1220 -0 lines
BWRs/01_CreateStopwords.ecl: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
IMPORT Std;

#WORKUNIT('name', 'Create Fuzzy Business Name Stopwords');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Note that the record definition for the raw file does not read all of the
// fields in, so while this looks like a whole-record deduplication, it really
// looks at only the first few fields
dedupedRawData := DEDUP(SORT(rawData, WHOLE RECORD), WHOLE RECORD);

cleanedFullNames := PROJECT
    (
        dedupedRawData(entity_guid != '' AND name != ''),
        TRANSFORM
            (
                {
                    RECORDOF(LEFT),
                    UTF8 full_name,
                    Root.Files.NAMEID_t name_id
                },
                SELF.name := Home.CleanBusinessName(LEFT.name),
                SELF.full_name := LEFT.name,
                SELF.name_id := COUNTER,
                SELF := LEFT
            )
    );

// Minimize the fields we use for performance
trimmedCleanedFullNames := TABLE(cleanedFullNames, {name, name_id});

// Make sure file is relatively evenly spread across Thor workers
distCleanedFullNames := DISTRIBUTE(trimmedCleanedFullNames, SKEW(0.05));

// Break (full) name value into words, noting their name_id origin
cleanedNames := NORMALIZE
    (
        distCleanedFullNames,
        Root.FuzzyNameMatch.MakeWordDS(LEFT.name),
        TRANSFORM
            (
                {
                    UTF8 name,
                    UNSIGNED4 name_id
                },
                SELF.name := IF(Root.FuzzyNameMatch.IsValidWord(RIGHT.word), RIGHT.word, SKIP),
                SELF.name_id := LEFT.name_id
            )
    );

// For each unique name word, count the number of names in which that word appears
nameFrequency := TABLE
    (
        cleanedNames,
        {
            UTF8 word := name,
            UNSIGNED4 name_count := COUNT(GROUP)
        },
        name,
        MERGE
    );

indexStopwords := nameFrequency(name_count >= Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF);
nonStopwords := nameFrequency(name_count < Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF);

// Debug output
OUTPUT(COUNT(nameFrequency), NAMED('word_count'));
OUTPUT(Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF, NAMED('word_freq_cutoff'));
OUTPUT(TOPN(nonStopwords, 1000, -name_count), NAMED('other_words_sample'), ALL);

// Files
OUTPUT(indexStopwords, {indexStopwords}, Home.Constants.STOPWORD_PATH, COMPRESSED, OVERWRITE);
BWRs/02_CreateIndexes.ecl: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
IMPORT Std;

#WORKUNIT('name', 'Fuzzy Business Name Index Build');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Map a word's length to the Levenshtein edit distance used when fuzzing it;
// very short (and very long) words get no fuzzing at all
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 9     =>  1,
            textLen < 13    =>  2,
            textLen < 21    =>  3,
            0
        );
END;

//-----------------------------------------------------------------------------

Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          stopwordPath := Home.Constants.STOPWORD_PATH,
                          CleanNameFunction := Home.CleanBusinessName,
                          AdaptedDistanceFunction := AdaptedDistance);
CleanBusinessName.ecl: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
IMPORT Useful_ECL;

EXPORT UTF8 CleanBusinessName(UTF8 s) := FUNCTION
    cleanedName := Useful_ECL.CleanBusinessName(s);
    // Strip bare SQL keywords; the alternation is grouped so the word
    // boundaries apply to every keyword in the list
    removeKeywords := REGEXREPLACE(u'\\b(?:SELECT|FROM|WHERE|TABLE|DELETE|CREATE|UPDATE|DROP)\\b', cleanedName, '');
    RETURN removeKeywords;
END;
Constants.ecl: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
EXPORT Constants := MODULE
    EXPORT PATH_PREFIX := '~fuzzy_match';

    EXPORT STOPWORD_PATH := PATH_PREFIX + '::business::stopwords';
    EXPORT NAME_INDEX_PATH := PATH_PREFIX + '::business::name.idx';
    EXPORT NAME_ID_INDEX_PATH := PATH_PREFIX + '::business::nameid.idx';
    EXPORT ENTITY_ID_INDEX_PATH := PATH_PREFIX + '::business::entityid.idx';

    // Words appearing in at least this number of names will be considered stopwords
    EXPORT INDEX_STOPWORD_WORD_FREQ_CUTOFF := 5000;
END;
Queries/Search.ecl: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
IMPORT Std;

#WORKUNIT('name', 'fuzzy_business_name_match');

//-----------------------------------------------------------------------------
// This code is intended to be compiled and published under Roxie
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

// Query parameters; FORMAT(SEQUENCE(n)) controls the order in which the
// fields appear in the WsECL form
UTF8 businessName := '' : STORED('business_name', FORMAT(SEQUENCE(100)));
INTEGER1 minScore := 0 : STORED('min_score', FORMAT(SEQUENCE(200)));
BOOLEAN onlyDirect := FALSE : STORED('only_direct_matches', FORMAT(SEQUENCE(300)));
INTEGER2 pageNum := 1 : STORED('page_num', FORMAT(SEQUENCE(400)));
INTEGER2 pageSize := 100 : STORED('page_size', FORMAT(SEQUENCE(500)));

clampedMinScore := MIN(MAX(minScore, 0), 100);
clampedPageNum := MAX(pageNum, 1);
clampedPageSize := MAX(pageSize, 1);

params := DATASET
    (
        [
            {'business_name', businessName},
            {'only_direct_matches', IF(onlyDirect, u8'true', u8'false')},
            {'min_score', (UTF8)clampedMinScore},
            {'page_num', (UTF8)clampedPageNum},
            {'page_size', (UTF8)clampedPageSize}
        ],
        {STRING parameter, UTF8 value}
    );
OUTPUT(params, NAMED('echo'));

// Map a word's length to the Levenshtein edit distance used when fuzzing it;
// note that this is less fuzzy than the function used at index build time
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 21    =>  1,
            0
        );
END;

WordsOnStopList(STRING stopwordIndexPath, UTF8 queryStr) := FUNCTION
    RETURN JOIN
        (
            Root.Files.StopwordDS(stopwordIndexPath),
            Root.FuzzyNameMatch.MakeWordDS(Home.CleanBusinessName(queryStr)),
            LEFT.word = RIGHT.word,
            TRANSFORM(LEFT)
        );
END;

OUTPUT(WordsOnStopList(Home.Constants.STOPWORD_PATH, businessName), NAMED('query_words_on_index_stoplist'));

rawResults := Root.FuzzyNameMatch.BestMatches(businessName,
                                              Home.Constants.NAME_INDEX_PATH,
                                              Home.Constants.NAME_ID_INDEX_PATH,
                                              Home.Constants.ENTITY_ID_INDEX_PATH,
                                              CleanNameFunction := Home.CleanBusinessName,
                                              AdaptedDistanceFunction := AdaptedDistance,
                                              stopwordPath := Home.Constants.STOPWORD_PATH);

rawResults2 := rawResults(score >= clampedMinScore AND (NOT(onlyDirect) OR is_match));
OUTPUT(COUNT(rawResults2), NAMED('total_found'));

sortedResults := TOPN(rawResults2, (clampedPageNum * clampedPageSize), -score, entity_guid, -is_match);

firstRec := (clampedPageNum - 1) * clampedPageSize + 1;
OUTPUT(CHOOSEN(sortedResults, clampedPageSize, firstRec), NAMED('matches'), ALL);
README.md: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# Fuzzy Searching for Business Names

## What Is This?

The code in this directory is a complete example of one way to fuzzy search for business names. It demonstrates the following techniques:

- Normalizing and cleaning business names
- Passing a function for name cleaning to the underlying library code (a way of factoring the code that makes it more flexible)
- Passing a function to dynamically choose a Levenshtein edit distance to use for an individual word, making the matching somewhat adaptive
- Creating fuzzy search indexes for individual words within a name, runnable under Thor so it can handle large datasets
- Creating a search query, runnable under Roxie, to search the indexes and score the results, supporting paginated results

## Code Layout

```bash
├── BWRs
│   ├── 01_CreateStopwords.ecl
│   └── 02_CreateIndexes.ecl
├── CleanBusinessName.ecl
├── Constants.ecl
├── Queries
│   └── Search.ecl
└── README.md
```

The two files at the top level are used by other code:

- [CleanBusinessName.ecl](CleanBusinessName.ecl): A function that accepts a UTF-8 string representing a business name and returns a normalized and "cleaned" version of that same string. The function is used during both indexing and querying (see the short usage sketch after this list). Note that the heavy lifting is performed by an external function found in the [Useful_ECL](https://github.com/dcamper/Useful_ECL) repo.
- [Constants.ecl](Constants.ecl): Constants used in other parts of the code. Most of them have to do with file naming, but one -- ``INDEX_STOPWORD_WORD_FREQ_CUTOFF`` -- is crucial to creating the stopword dataset. The idea behind that constant is basically, "if a user searched for a single common word and it returned too many results, what number is 'too many results'?" The example code uses 5000, but it should be adjusted for your use case. ``INDEX_STOPWORD_WORD_FREQ_CUTOFF`` is used to create the final stopword list that is loaded by the index build code. The implication is that modifying the constant changes the contents of the stopword list, which in turn means that you will have to rebuild the search index as well.
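
A minimal usage sketch of the cleaning function (the input value is invented; the import mirrors the BWRs in this example, which live one directory below this module):

```ecl
IMPORT $.^ AS Home;  // as written from a BWR in the BWRs subdirectory

OUTPUT(Home.CleanBusinessName(u8'Acme Widgets, Inc.'), NAMED('cleaned_name'));
```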
30+
31+
## Creating a stopword list
32+
33+
Support for a stopword list is provided by the toplevel code, but it should be noted that the stopword list is optional. In that toplevel code, if the logical pathname for the stopword list is not provided, or if the pathname is provided but no data is found, then the stopword functionality is simply ignored.
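
For instance, skipping stopword support during an index build is just a matter of omitting the optional ``stopwordPath`` named parameter (a sketch based on the ``Build()`` call in [BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl), with ``rawData`` and ``AdaptedDistance`` defined as in that file):

```ecl
// Same call as in BWRs/02_CreateIndexes.ecl, minus the optional stopwordPath
Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          CleanNameFunction := Home.CleanBusinessName,
                          AdaptedDistanceFunction := AdaptedDistance);
```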

To create the stopword list, you will want to execute [BWRs/01_CreateStopwords.ecl](BWRs/01_CreateStopwords.ecl) in Thor. That file loads the raw data using these two lines near the top:

```ecl
RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));
```

This example code assumes that the file has already been sprayed and has the full logical filename of ``~fuzzy_match::business_namelist.csv`` (definitions from [Constants.ecl](Constants.ecl) come into play here). The raw file's first three fields are what we need, in the right order, and the first line of the data does not contain field names. For your own data, the easiest thing to do is to read it in and project it to the right format, then make sure the result is assigned to the attribute ``rawData`` at that same location; the rest of the code should Just Work.
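
That projection could look something like the sketch below. The source path, ``MyLayout``, and its field names are hypothetical; ``entity_guid`` and ``name`` are the ``Root.Files.CommonRawDataLayout`` fields the BWR filters on, and any remaining fields are simply cleared:

```ecl
// Hypothetical source layout and logical filename -- substitute your own
MyLayout := RECORD
    UTF8 my_id;
    UTF8 my_company_name;
END;

myData := DATASET('~my_scope::my_business_names.csv', MyLayout, CSV(UNICODE));

// Reshape into the layout the rest of the code expects
rawData := PROJECT
    (
        myData,
        TRANSFORM
            (
                Root.Files.CommonRawDataLayout,
                SELF.entity_guid := LEFT.my_id,
                SELF.name := LEFT.my_company_name,
                SELF := []
            )
    );
```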

The stopword dataset that is created is written to the logical file named by the ``STOPWORD_PATH`` constant, defined within [Constants.ecl](Constants.ecl). The workunit creates some other outputs as well, such as the number of words processed and a sample of words that were *not* included in the stopword list but were closest to the cutoff.

## Building the search index

[BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl) is the code that actually creates the indexes. It uses the same raw data file declaration as when building the stopword list, so you will have to make the same changes here to reference your own data.

This file contains a function that is passed to the index-building code: ``AdaptedDistance()`` returns the edit distance that should be used for a given string. This allows you to specify different "fuzziness" for different-length words, which is a handy feature; a stricter variation is sketched below.
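
For example (the thresholds here are invented; only the shape of the function matters):

```ecl
// Hypothetical stricter mapping: fuzz only mid-length words, and never
// by more than one edit
UNSIGNED1 StrictAdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 5     =>  0,  // short words must match exactly
            textLen < 21    =>  1,  // allow a single edit for typical words
            0                       // very long strings: exact match only
        );
END;
```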

The bulk of the work is performed by the top-level ``Build()`` function. This BWR should run under Thor for performance reasons (and to handle actual big data scenarios).

## Creating and publishing the search query

[Queries/Search.ecl](Queries/Search.ecl) is the code for searching against the indexes created by [BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl). The search code should be compiled -- not executed -- under Roxie, then published.

Most of the search code is concerned with handling the query parameters or echoing things back to the caller via multiple results. Result pagination is also supported; the arithmetic is sketched below.
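
The pagination mirrors the last few lines of [Queries/Search.ecl](Queries/Search.ecl) (the page values here are examples, and the ``TOPN`` sort criteria are simplified to score alone):

```ecl
// For page 3 with a page size of 100: keep only the 300 best-scoring rows,
// then return rows 201 through 300
requestedPageNum := 3;
requestedPageSize := 100;
topRows := TOPN(rawResults2, requestedPageNum * requestedPageSize, -score);
firstRec := (requestedPageNum - 1) * requestedPageSize + 1;  // = 201
OUTPUT(CHOOSEN(topRows, requestedPageSize, firstRec), NAMED('page_of_matches'));
```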
59+
60+
Note that an ``AdaptedDistance()`` function is defined here, like it was when creating the index. It is not necessary to use the exactly same function for index creation and searching (as is shown in this example). If both the original data is indexed with an edit distance of 1, then a user's query is also fuzzed with an edit distance of 1, the net effect could be retrieving data that is actually an edit distance of 2 away from the query. Some experimentation may be needed to determine which values best meet the needs of your use case.
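
A tiny demonstration of that compounding, using the standard library's edit distance function (the name values are invented):

```ecl
IMPORT Std;

// 'SMYTH' is 1 edit from the indexed word 'SMITH'; a query of 'SMITHE'
// fuzzed by 1 edit also reaches 'SMITH' -- so searching for 'SMITHE' can
// surface 'SMYTH', which is 2 edits away from the query
OUTPUT(Std.Uni.EditDistance(u'SMITH', u'SMYTH'), NAMED('index_side_distance'));   // 1
OUTPUT(Std.Uni.EditDistance(u'SMITH', u'SMITHE'), NAMED('query_side_distance'));  // 1
OUTPUT(Std.Uni.EditDistance(u'SMYTH', u'SMITHE'), NAMED('net_distance'));         // 2
```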
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
IMPORT Std;

#WORKUNIT('name', 'Fuzzy Person Name Index Build');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::person_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Map a word's length to the Levenshtein edit distance used when fuzzing it
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 9     =>  1,
            textLen < 13    =>  2,
            textLen < 21    =>  3,
            0
        );
END;

//-----------------------------------------------------------------------------

// Note: no stopwordPath is supplied here, so the optional stopword
// functionality is skipped for person names
Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          CleanNameFunction := Home.CleanPersonName,
                          AdaptedDistanceFunction := AdaptedDistance);
