
Commit 80251a8

Initial commit of FuzzyMatching complete example
Signed-off-by: Dan S. Camper <[email protected]>
1 parent a3fec34 commit 80251a8

15 files changed: +1220 -0 lines
BWRs/01_CreateStopwords.ecl: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
IMPORT Std;

#WORKUNIT('name', 'Create Fuzzy Business Name Stopwords');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Note that the record definition for the raw file does not read all of the
// fields in, so while this looks like a whole-record deduplication, it really
// looks at only the first few fields
dedupedRawData := DEDUP(SORT(rawData, WHOLE RECORD), WHOLE RECORD);

cleanedFullNames := PROJECT
    (
        dedupedRawData(entity_guid != '' AND name != ''),
        TRANSFORM
            (
                {
                    RECORDOF(LEFT),
                    UTF8 full_name,
                    Root.Files.NAMEID_t name_id
                },
                SELF.name := Home.CleanBusinessName(LEFT.name),
                SELF.full_name := LEFT.name,
                SELF.name_id := COUNTER,
                SELF := LEFT
            )
    );

// Minimize the fields we use for performance
trimmedCleanedFullNames := TABLE(cleanedFullNames, {name, name_id});

// Make sure file is relatively evenly spread across Thor workers
distCleanedFullNames := DISTRIBUTE(trimmedCleanedFullNames, SKEW(0.05));

// Break (full) name value into words, noting their name_id origin
cleanedNames := NORMALIZE
    (
        distCleanedFullNames,
        Root.FuzzyNameMatch.MakeWordDS(LEFT.name),
        TRANSFORM
            (
                {
                    UTF8 name,
                    UNSIGNED4 name_id
                },
                SELF.name := IF(Root.FuzzyNameMatch.IsValidWord(RIGHT.word), RIGHT.word, SKIP),
                SELF.name_id := LEFT.name_id
            )
    );

// For each unique name word, count the number of names in which that word appears
nameFrequency := TABLE
    (
        cleanedNames,
        {
            UTF8 word := name,
            UNSIGNED4 name_count := COUNT(GROUP)
        },
        name,
        MERGE
    );

indexStopwords := nameFrequency(name_count >= Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF);
nonStopwords := nameFrequency(name_count < Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF);

// Debug output
OUTPUT(COUNT(nameFrequency), NAMED('word_count'));
OUTPUT(Home.Constants.INDEX_STOPWORD_WORD_FREQ_CUTOFF, NAMED('word_freq_cutoff'));
OUTPUT(TOPN(nonStopwords, 1000, -name_count), NAMED('other_words_sample'), ALL);

// Files
OUTPUT(indexStopwords, {indexStopwords}, Home.Constants.STOPWORD_PATH, COMPRESSED, OVERWRITE);
BWRs/02_CreateIndexes.ecl: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
IMPORT Std;

#WORKUNIT('name', 'Fuzzy Business Name Index Build');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Map a word's length to the Levenshtein edit distance used when fuzzing it;
// very short (and very long) words get no fuzzing at all
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 9     =>  1,
            textLen < 13    =>  2,
            textLen < 21    =>  3,
            0
        );
END;

//-----------------------------------------------------------------------------

Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          stopwordPath := Home.Constants.STOPWORD_PATH,
                          CleanNameFunction := Home.CleanBusinessName,
                          AdaptedDistanceFunction := AdaptedDistance);
CleanBusinessName.ecl: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
IMPORT Useful_ECL;

EXPORT UTF8 CleanBusinessName(UTF8 s) := FUNCTION
    cleanedName := Useful_ECL.CleanBusinessName(s);
    // Strip bare SQL keywords; the alternation is grouped so the word
    // boundaries apply to every keyword in the list
    removeKeywords := REGEXREPLACE(u'\\b(?:SELECT|FROM|WHERE|TABLE|DELETE|CREATE|UPDATE|DROP)\\b', cleanedName, '');
    RETURN removeKeywords;
END;
Constants.ecl: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
EXPORT Constants := MODULE
    EXPORT PATH_PREFIX := '~fuzzy_match';

    EXPORT STOPWORD_PATH := PATH_PREFIX + '::business::stopwords';
    EXPORT NAME_INDEX_PATH := PATH_PREFIX + '::business::name.idx';
    EXPORT NAME_ID_INDEX_PATH := PATH_PREFIX + '::business::nameid.idx';
    EXPORT ENTITY_ID_INDEX_PATH := PATH_PREFIX + '::business::entityid.idx';

    // Words appearing in at least this number of names will be considered stopwords
    EXPORT INDEX_STOPWORD_WORD_FREQ_CUTOFF := 5000;
END;
Queries/Search.ecl: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
IMPORT Std;

#WORKUNIT('name', 'fuzzy_business_name_match');

//-----------------------------------------------------------------------------
// This code is intended to be compiled and published under Roxie
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

// Query parameters; FORMAT(SEQUENCE(n)) controls the order in which the
// fields appear in the WsECL form
UTF8 businessName := '' : STORED('business_name', FORMAT(SEQUENCE(100)));
INTEGER1 minScore := 0 : STORED('min_score', FORMAT(SEQUENCE(200)));
BOOLEAN onlyDirect := FALSE : STORED('only_direct_matches', FORMAT(SEQUENCE(300)));
INTEGER2 pageNum := 1 : STORED('page_num', FORMAT(SEQUENCE(400)));
INTEGER2 pageSize := 100 : STORED('page_size', FORMAT(SEQUENCE(500)));

clampedMinScore := MIN(MAX(minScore, 0), 100);
clampedPageNum := MAX(pageNum, 1);
clampedPageSize := MAX(pageSize, 1);

params := DATASET
    (
        [
            {'business_name', businessName},
            {'only_direct_matches', IF(onlyDirect, u8'true', u8'false')},
            {'min_score', (UTF8)clampedMinScore},
            {'page_num', (UTF8)clampedPageNum},
            {'page_size', (UTF8)clampedPageSize}
        ],
        {STRING parameter, UTF8 value}
    );
OUTPUT(params, NAMED('echo'));

// Map a word's length to the Levenshtein edit distance used when fuzzing it;
// note that this is less fuzzy than the function used at index build time
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 21    =>  1,
            0
        );
END;

WordsOnStopList(STRING stopwordIndexPath, UTF8 queryStr) := FUNCTION
    RETURN JOIN
        (
            Root.Files.StopwordDS(stopwordIndexPath),
            Root.FuzzyNameMatch.MakeWordDS(Home.CleanBusinessName(queryStr)),
            LEFT.word = RIGHT.word,
            TRANSFORM(LEFT)
        );
END;

OUTPUT(WordsOnStopList(Home.Constants.STOPWORD_PATH, businessName), NAMED('query_words_on_index_stoplist'));

rawResults := Root.FuzzyNameMatch.BestMatches(businessName,
                                              Home.Constants.NAME_INDEX_PATH,
                                              Home.Constants.NAME_ID_INDEX_PATH,
                                              Home.Constants.ENTITY_ID_INDEX_PATH,
                                              CleanNameFunction := Home.CleanBusinessName,
                                              AdaptedDistanceFunction := AdaptedDistance,
                                              stopwordPath := Home.Constants.STOPWORD_PATH);

rawResults2 := rawResults(score >= clampedMinScore AND (NOT(onlyDirect) OR is_match));
OUTPUT(COUNT(rawResults2), NAMED('total_found'));

sortedResults := TOPN(rawResults2, (clampedPageNum * clampedPageSize), -score, entity_guid, -is_match);

firstRec := (clampedPageNum - 1) * clampedPageSize + 1;
OUTPUT(CHOOSEN(sortedResults, clampedPageSize, firstRec), NAMED('matches'), ALL);
README.md: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
# Fuzzy Searching for Business Names

## What Is This?

The code in this directory is a complete example of one way to fuzzy search for business names. It demonstrates the following techniques:

- Normalizing and cleaning business names
- Passing a function for name cleaning to the underlying library code (a way of factoring the code that makes it more flexible)
- Passing a function to dynamically choose a Levenshtein edit distance to use for an individual word, making the matching somewhat adaptive
- Creating fuzzy search indexes for individual words within a name, runnable under Thor so it can handle large datasets
- Creating a search query, runnable under Roxie, to search the indexes and score the results, supporting paginated results

## Code Layout

```bash
├── BWRs
│   ├── 01_CreateStopwords.ecl
│   └── 02_CreateIndexes.ecl
├── CleanBusinessName.ecl
├── Constants.ecl
├── Queries
│   └── Search.ecl
└── README.md
```

The two files at the top level are used by other code:

- [CleanBusinessName.ecl](CleanBusinessName.ecl): A function that accepts a UTF-8 string representing a business name and returns a normalized and "cleaned" version of that same string. The function is used during both indexing and querying (see the short usage sketch after this list). Note that the heavy lifting is performed by an external function found in the [Useful_ECL](https://github.com/dcamper/Useful_ECL) repo.
- [Constants.ecl](Constants.ecl): Constants used in other parts of the code. Most of them have to do with file naming, but one -- ``INDEX_STOPWORD_WORD_FREQ_CUTOFF`` -- is crucial to creating the stopword dataset. The idea behind that constant is basically, "if a user searched for a single common word and it returned too many results, what number is 'too many results'?" The example code uses 5000, but it should be adjusted for your use case. ``INDEX_STOPWORD_WORD_FREQ_CUTOFF`` is used to create the final stopword list that is loaded by the index build code. The implication is that modifying the constant changes the contents of the stopword list, which in turn means that you will have to rebuild the search index as well.
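
A minimal usage sketch of the cleaning function (the input value is invented; the import mirrors the BWRs in this example, which live one directory below this module):

```ecl
IMPORT $.^ AS Home;  // as written from a BWR in the BWRs subdirectory

OUTPUT(Home.CleanBusinessName(u8'Acme Widgets, Inc.'), NAMED('cleaned_name'));
```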
30+
31+
## Creating a stopword list
32+
33+
Support for a stopword list is provided by the toplevel code, but it should be noted that the stopword list is optional. In that toplevel code, if the logical pathname for the stopword list is not provided, or if the pathname is provided but no data is found, then the stopword functionality is simply ignored.
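
For instance, skipping stopword support during an index build is just a matter of omitting the optional ``stopwordPath`` named parameter (a sketch based on the ``Build()`` call in [BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl), with ``rawData`` and ``AdaptedDistance`` defined as in that file):

```ecl
// Same call as in BWRs/02_CreateIndexes.ecl, minus the optional stopwordPath
Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          CleanNameFunction := Home.CleanBusinessName,
                          AdaptedDistanceFunction := AdaptedDistance);
```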

To create the stopword list, you will want to execute [BWRs/01_CreateStopwords.ecl](BWRs/01_CreateStopwords.ecl) in Thor. That file loads the raw data using these two lines near the top:

```ecl
RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::business_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));
```

This example code assumes that the file has already been sprayed and has the full logical filename of ``~fuzzy_match::business_namelist.csv`` (definitions from [Constants.ecl](Constants.ecl) come into play here). The raw file's first three fields are what we need, in the right order, and the first line of the data does not contain field names. For your own data, the easiest thing to do is to read it in and project it to the right format, then make sure the result is assigned to the attribute ``rawData`` at that same location; the rest of the code should Just Work.
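
That projection could look something like the sketch below. The source path, ``MyLayout``, and its field names are hypothetical; ``entity_guid`` and ``name`` are the ``Root.Files.CommonRawDataLayout`` fields the BWR filters on, and any remaining fields are simply cleared:

```ecl
// Hypothetical source layout and logical filename -- substitute your own
MyLayout := RECORD
    UTF8 my_id;
    UTF8 my_company_name;
END;

myData := DATASET('~my_scope::my_business_names.csv', MyLayout, CSV(UNICODE));

// Reshape into the layout the rest of the code expects
rawData := PROJECT
    (
        myData,
        TRANSFORM
            (
                Root.Files.CommonRawDataLayout,
                SELF.entity_guid := LEFT.my_id,
                SELF.name := LEFT.my_company_name,
                SELF := []
            )
    );
```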

The stopword dataset that is created is written to the logical file named by the ``STOPWORD_PATH`` constant, defined within [Constants.ecl](Constants.ecl). The workunit creates some other outputs as well, such as the number of words processed and a sample of words that were *not* included in the stopword list but were closest to the cutoff.

## Building the search index

[BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl) is the code that actually creates the indexes. It uses the same raw data file declaration as when building the stopword list, so you will have to make the same changes here to reference your own data.

This file contains a function that is passed to the index-building code: ``AdaptedDistance()`` returns the edit distance that should be used for a given string. This allows you to specify different "fuzziness" for different-length words, which is a handy feature; a stricter variation is sketched below.
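
For example (the thresholds here are invented; only the shape of the function matters):

```ecl
// Hypothetical stricter mapping: fuzz only mid-length words, and never
// by more than one edit
UNSIGNED1 StrictAdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 5     =>  0,  // short words must match exactly
            textLen < 21    =>  1,  // allow a single edit for typical words
            0                       // very long strings: exact match only
        );
END;
```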

The bulk of the work is performed by the top-level ``Build()`` function. This BWR should run under Thor for performance reasons (and to handle actual big data scenarios).

## Creating and publishing the search query

[Queries/Search.ecl](Queries/Search.ecl) is the code for searching against the indexes created by [BWRs/02_CreateIndexes.ecl](BWRs/02_CreateIndexes.ecl). The search code should be compiled -- not executed -- under Roxie, then published.

Most of the search code is concerned with handling the query parameters or echoing things back to the caller via multiple results. Result pagination is also supported; the arithmetic is sketched below.
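
The pagination mirrors the last few lines of [Queries/Search.ecl](Queries/Search.ecl) (the page values here are examples, and the ``TOPN`` sort criteria are simplified to score alone):

```ecl
// For page 3 with a page size of 100: keep only the 300 best-scoring rows,
// then return rows 201 through 300
requestedPageNum := 3;
requestedPageSize := 100;
topRows := TOPN(rawResults2, requestedPageNum * requestedPageSize, -score);
firstRec := (requestedPageNum - 1) * requestedPageSize + 1;  // = 201
OUTPUT(CHOOSEN(topRows, requestedPageSize, firstRec), NAMED('page_of_matches'));
```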
59+
60+
Note that an ``AdaptedDistance()`` function is defined here, like it was when creating the index. It is not necessary to use the exactly same function for index creation and searching (as is shown in this example). If both the original data is indexed with an edit distance of 1, then a user's query is also fuzzed with an edit distance of 1, the net effect could be retrieving data that is actually an edit distance of 2 away from the query. Some experimentation may be needed to determine which values best meet the needs of your use case.
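
A tiny demonstration of that compounding, using the standard library's edit distance function (the name values are invented):

```ecl
IMPORT Std;

// 'SMYTH' is 1 edit from the indexed word 'SMITH'; a query of 'SMITHE'
// fuzzed by 1 edit also reaches 'SMITH' -- so searching for 'SMITHE' can
// surface 'SMYTH', which is 2 edits away from the query
OUTPUT(Std.Uni.EditDistance(u'SMITH', u'SMYTH'), NAMED('index_side_distance'));   // 1
OUTPUT(Std.Uni.EditDistance(u'SMITH', u'SMITHE'), NAMED('query_side_distance'));  // 1
OUTPUT(Std.Uni.EditDistance(u'SMYTH', u'SMITHE'), NAMED('net_distance'));         // 2
```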
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
IMPORT Std;

#WORKUNIT('name', 'Fuzzy Person Name Index Build');

//-----------------------------------------------------------------------------
// This code is intended to be executed under Thor
//-----------------------------------------------------------------------------

IMPORT $.^.^ AS Root;
IMPORT $.^ AS Home;

//-----------------------------------------------------------------------------

RAW_DATA_PATH := Home.Constants.PATH_PREFIX + '::person_namelist.csv';
rawData := DATASET(RAW_DATA_PATH, Root.Files.CommonRawDataLayout, CSV(UNICODE));

//-----------------------------------------------------------------------------

// Map a word's length to the Levenshtein edit distance used when fuzzing it
UNSIGNED1 AdaptedDistance(UTF8 s) := FUNCTION
    textLen := LENGTH(s);
    RETURN MAP
        (
            textLen < 3     =>  0,
            textLen < 9     =>  1,
            textLen < 13    =>  2,
            textLen < 21    =>  3,
            0
        );
END;

//-----------------------------------------------------------------------------

// Note: no stopwordPath is supplied here, so the optional stopword
// functionality is skipped for person names
Root.FuzzyNameMatch.Build(rawData,
                          Home.Constants.NAME_INDEX_PATH,
                          Home.Constants.NAME_ID_INDEX_PATH,
                          Home.Constants.ENTITY_ID_INDEX_PATH,
                          CleanNameFunction := Home.CleanPersonName,
                          AdaptedDistanceFunction := AdaptedDistance);
