-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathBWR_AnagramFun.ecl
190 lines (162 loc) · 6.11 KB
/
BWR_AnagramFun.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/**
* This is a fun example involving finding anagrams within a list of words
* using ECL. Several ECL techniques are demonstrated, including embedded
* C++ code, reading a "plain" text file from disk, explicit data distribution
* (with local execution), deduplication, sorting, and computing some simple
* statistics.
*
* The code uses Linux's built-in dictionary file, located at
* /usr/share/dict/words. The assumption is that there will be one word per
* line. You can use a different file by changing the value of the
* WORD_FILE_PATH attribute.
*
* Origin: https://github.com/hpccsystems-solutions-lab/Useful_ECL
*/
IMPORT Std;
/**
* Helper function for computing a 'prime factor product' of a text string.
* The idea here is to map letters to prime numbers, then multiply all prime
* numbers together to create a unique unsigned integer value representing
* the string. The order in which the letters appear in the string do not
* change the result, which makes this an effective way of grouping anagrams
* together.
*
* @param s The word to convert to a prime factor product
*
* @return An UNSIGNED8 value. Will return zero if the string does not
* contain any ASCII characters.
*/
UNSIGNED8 StringToPrimeFactorProduct(CONST VARSTRING s) := EMBED(C++)
#option pure;
__uint64 res = 1;
while (*s)
{
switch (toupper(*s))
{
case 'E': res *= 2; break;
case 'A': res *= 3; break;
case 'R': res *= 5; break;
case 'I': res *= 7; break;
case 'O': res *= 11; break;
case 'T': res *= 13; break;
case 'N': res *= 17; break;
case 'S': res *= 19; break;
case 'L': res *= 23; break;
case 'C': res *= 29; break;
case 'U': res *= 31; break;
case 'D': res *= 37; break;
case 'P': res *= 41; break;
case 'M': res *= 43; break;
case 'H': res *= 47; break;
case 'G': res *= 53; break;
case 'B': res *= 59; break;
case 'F': res *= 61; break;
case 'Y': res *= 67; break;
case 'W': res *= 71; break;
case 'K': res *= 73; break;
case 'V': res *= 79; break;
case 'X': res *= 83; break;
case 'Z': res *= 89; break;
case 'J': res *= 97; break;
case 'Q': res *= 101; break;
}
++s;
}
return res > 1 ? res : 0;
ENDEMBED;
//------------------------------------------------------------------------------
// We will use the dictionary supplied with Linux
WORD_FILE_PATH := '/usr/share/dict/words';
WordRec := RECORD
STRING word;
END;
// Reads the file, mapping one line of input to its own record
words0 := DATASET
(
Std.File.ExternalLogicalFileName('127.0.0.1', WORD_FILE_PATH),
WordRec,
CSV(QUOTE(''), SEPARATOR(''))
);
OUTPUT(COUNT(words0), NAMED('original_word_count'));
// Convert each word to uppercase and remove apostrophes (found via inspection);
// while we don't have to convert to uppercase for the prime factor product
// computation, we do need to do that in order to deduplicate the words (a
// necessary step since we remove some characters)
words1 := PROJECT
(
words0,
TRANSFORM
(
RECORDOF(LEFT),
SELF.word := Std.Str.ToUpperCase(Std.Str.FilterOut(LEFT.word, '\''))
)
);
// Make sure we have at least two characters in each word
words2 := words1(LENGTH(word) > 1);
// Remove the duplicates
words3 := DEDUP(SORT(words2, word), word);
words := words3;
OUTPUT(COUNT(words), NAMED('final_word_count'));
// Compute the prime factor product (PFP) for each word
wordsWithPrimeFactorProduct := PROJECT
(
words,
TRANSFORM
(
{
UNSIGNED8 pfp,
WordRec
},
myPFP := StringToPrimeFactorProduct(LEFT.word);
SELF.pfp := IF(myPFP > 0, myPFP, SKIP),
SELF := LEFT
)
);
// Redistribute our data so that words with the same PFP wind up on the same
// Thor nodes
distributedWords := DISTRIBUTE(wordsWithPrimeFactorProduct, pfp);
// Create dataset containing unique PFP values
uniquePFPs := TABLE(distributedWords, {pfp}, pfp, LOCAL);
AnagramRec := RECORD
UNSIGNED8 pfp;
UNSIGNED2 word_count;
DATASET(WordRec) anagrams;
END;
// Group words with the same PFP value together -- by definition, they will be
// anagrams; skip records that contain only one word in the word list
groupedAnagrams := DENORMALIZE
(
uniquePFPs,
distributedWords,
LEFT.pfp = RIGHT.pfp,
GROUP,
TRANSFORM
(
AnagramRec,
wordList := SORT(PROJECT(ROWS(RIGHT), TRANSFORM(WordRec, SELF := LEFT)), word);
SELF.pfp := LEFT.pfp,
SELF.word_count := IF(COUNT(ROWS(RIGHT)) > 1, COUNT(ROWS(RIGHT)), SKIP),
SELF.anagrams := wordList
),
LOCAL
);
// Show the found anagrams; explicitly sort by word count, the length of the
// anagram, and the PFP value (the last item is merely to make the result
// stable)
OUTPUT(SORT(groupedAnagrams, -word_count, -LENGTH(anagrams[1].word), pfp), NAMED('anagram_sample'));
// Compute some stats about the anagrams we found
anagramStats := TABLE
(
groupedAnagrams,
{
UNSIGNED4 num_words_within_anagram := word_count,
UNSIGNED4 occurs := COUNT(GROUP)
},
word_count,
MERGE
);
// Show the computed stats, sorting (descending) by the number of words in
// an anagram group
OUTPUT(SORT(anagramStats, -num_words_within_anagram), NAMED('anagram_stats'));
numOfWordsInAnagrams := SUM(anagramStats, num_words_within_anagram * occurs);
OUTPUT(numOfWordsInAnagrams, NAMED('words_that_form_anagrams_count'));