-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBernoulliSubstring.java.txt
321 lines (277 loc) · 12 KB
/
BernoulliSubstring.java.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.apache.commons.math3.distribution.NormalDistribution;
/**
* A substring that can compute the exact expectation and variance of the number of its
* occurrences in a string generated by a given IID source, as well as scores of
* statistical surprise, using information from its suffixes. See
* \cite{Efficient_detection_of_unusual_words} for notation and algorithmics, and
* \cite{Monotony_of_surprise_and_large-scale_quest_for_unusual_words} for statistics.
*
* Remark: we don't use Apache Commons' $FastMath$ routines for elementary math
* operations, because $FastMath$ is never faster than $Math$ in experiments (likely due
* to internalization). We use $StrictMath.scalb$ rather than $Math.scalb$ because the
* former is approximately twice as fast as the latter. See comments in
* $CompareFastMath.java$.
*/
public class BernoulliSubstring extends BorderSubstring {
/**
* Scores are defined in method $getScore$
*/
private static final byte SCORE_ID = 8;
/**
* TRUE=use a tighter upper bound for the Poisson error in method
* $getCumulativeProbability$.
*/
private static final boolean TIGHT_POISSON_ERROR = true;
public double pHatLog, capitalT, capitalB;
public double expectation, variance;
public BernoulliSubstring(int alphabetLength, int log2alphabetLength, int textLength, int log2textLength) {
super(alphabetLength,log2alphabetLength,textLength,log2textLength);
}
public static int serializedSize(int alphabetLength, int log2alphabetLength) {
return BorderSubstring.serializedSize(alphabetLength,log2alphabetLength)+64*(SCORE_ID<=7?1:3);
}
public void serialize(Stack stack) {
super.serialize(stack);
stack.push(Double.doubleToLongBits(pHatLog),64);
if (SCORE_ID<=7) return;
stack.push(Double.doubleToLongBits(capitalT),64);
stack.push(Double.doubleToLongBits(capitalB),64);
}
public void deserialize(Stack stack) {
super.deserialize(stack);
pHatLog=Double.longBitsToDouble(stack.read(64));
if (SCORE_ID<=7) return;
capitalT=Double.longBitsToDouble(stack.read(64));
capitalB=Double.longBitsToDouble(stack.read(64));
}
public int skip(Stack stack) {
final int log2address = super.skip(stack);
stack.skip(64);
if (SCORE_ID>7) { stack.skip(64); stack.skip(64); }
return log2address;
}
/**
* Remark: the recursive computations in this method hold only for
* $length<=HALF_TEXT_LENGTH$.
*
* @param logProbA $\log(\mathbb{P}[a])$;
* @param buffer1 scratch space; at the end of the procedure it contains $bord(v)$;
* @param buffer2 scratch space; at the end of the procedure it contains
* $bord(bord(v))$.
*/
public final void initFromSuffix(long a, double logProbA, BernoulliSubstring suffix, Stack stack, Stack characterStack, BernoulliSubstring buffer1, BernoulliSubstring buffer2) {
final boolean hasBorder;
final long maxOccurrences;
final double pHat, longestBorderPHatLog, longestBorderLongestBorderPHatLog, longestBorderCapitalB, longestBorderCapitalT, ratio;
super.initFromSuffix(a,suffix,stack,characterStack,buffer1);
// Expectation and first term of the variance
maxOccurrences=TEXT_LENGTH-length+1;
pHatLog=suffix.pHatLog+logProbA;
pHat=Math.exp(pHatLog);
expectation=maxOccurrences*pHat;
if (SCORE_ID<=6) return;
variance=expectation*(1-pHat);
if (SCORE_ID==7) return;
// Second and third terms of the variance
final double pHatSquare = pHat*pHat;
variance-=pHatSquare*((TEXT_LENGTH<<1)-3*length+2)*(length-1);
if (rightLength==0) {
capitalT=0; capitalB=0;
return;
}
longestBorderPHatLog=buffer1.pHatLog; // $buffer1$ contains $bord(v)$ at this point
longestBorderCapitalB=buffer1.capitalB;
longestBorderCapitalT=buffer1.capitalT;
ratio=Math.exp(pHatLog-longestBorderPHatLog);
hasBorder=buffer1.getLongestBorder(stack,buffer2);
if (!hasBorder) capitalT=0;
else {
longestBorderLongestBorderPHatLog=buffer2.pHatLog;
capitalT=longestBorderCapitalT*ratio +
Math.exp(pHatLog-longestBorderLongestBorderPHatLog);
}
capitalB=(TEXT_LENGTH-(length<<1)+1+longestBorderLength)*ratio +
((longestBorderLength-length)<<1)*capitalT +
ratio*longestBorderCapitalB;
variance+=StrictMath.scalb(pHat,1)*capitalB;
}
// --------------------------- SCORING PROCEDURES ------------------------------------
/**
* Determines which nodes of the suffix tree of the text to explore in order to
* detect surprising substrings of a given type. When the score is non-monotonic
* inside a left-equivalence class, or when it is not a convex function of a monotonic
* score, or when it is not a convex and increasing function of a convex function of a
* monotonic score, we cannot limit the search to nodes of the suffix tree and to
* their one-character extensions, thus the whole approach breaks down.
*
* @param stringType 0: over-represented; 1: under-represented; 2: both;
* @param maxProbability $\max\{\mathbb{P}(a) : a \in \Sigma\}$;
* @return 0: suffix-tree nodes; 1: one-character extensions of suffix-tree nodes;
* 2: both; 3: don't explore any node, i.e. the whole approach fails.
*/
public static final byte nodesToExplore(byte stringType, double maxProbability) {
if (SCORE_ID<=5) return stringType;
if (SCORE_ID==6) return 2;
if (SCORE_ID==7) {
if (maxProbability<0.5) return stringType;
else return 3;
}
if (SCORE_ID==8) {
// These three upper bounds refer to function $f(m)=1/((4m)^{1/m})$ evaluated at
// $m=1,2$, and to $\sqrt{2}-1$. Just these two values of $m$ need to be
// considered since $f(m)>\sqrt{2}-1$ at $m \geq 3$.
if (maxProbability<Math.min(Math.min(Math.sqrt(2)-1,1d/Math.sqrt(8)),0.25)) return stringType;
else return 3;
}
if (SCORE_ID==9) {
// These three upper bounds refer to function $f(m)=1/((4m)^{1/m})$ evaluated at
// $m=1,2$, and to $\sqrt{2}-1$. Just these two values of $m$ need to be
// considered since $f(m)>\sqrt{2}-1$ at $m \geq 3$.
if (maxProbability<Math.min(Math.min(Math.sqrt(2)-1,1d/Math.sqrt(8)),0.25)) return 2;
else return 3;
}
return 3;
}
public final double getScore(double frequency) {
switch (SCORE_ID) {
// Expectation only
case 1: return frequency-expectation;
case 2: return frequency/expectation;
case 3: return (frequency-expectation)/expectation;
case 4: return (frequency-expectation)/Math.sqrt(expectation);
case 5: return Math.abs(frequency-expectation)/Math.sqrt(expectation);
case 6: return (frequency-expectation)*(frequency-expectation)/expectation;
case 7: return (frequency-expectation)/Math.sqrt(variance); // Assumes that $variance$ is the approximation $expectation*(1-pHat)$
// Expectation and variance
case 8: return (frequency-expectation)/Math.sqrt(variance);
case 9: return Math.abs(frequency-expectation)/Math.sqrt(variance);
}
return 0;
}
/**
* Computes the probability that the number of occurrences $occ$ of $v$ in an IID text
* of length $TEXT_LENGTH$ is less than or equal to $observedOccurrences$.
*
* Remark: this method uses some potentially slow procedures in the Apache Commons
* Math library to compute cumulative probabilities for normal and Poisson
* distribution.
*
* @return $out[0]=\mathbb{P}[occ \leq observedOccurrences]$; $out[1]$: upper bound on
* the absolute error in cell 0 when the Poisson distribution is used to model $occ$.
*/
public final void getCumulativeProbability(int observedOccurrences, double[] out) {
final int SIGNIFICANTLY_GREATER_THAN = 100;
final double pHat, pHatSquare, shortestPeriod;
final double probability;
double poissonError = 0;
shortestPeriod=length-longestBorderLength;
pHat=Math.exp(pHatLog);
pHatSquare=pHat*pHat;
if (shortestPeriod/length>SIGNIFICANTLY_GREATER_THAN*ONE_OVER_LOG_TEXT_LENGTH && TEXT_LENGTH>SIGNIFICANTLY_GREATER_THAN*length) {
// Using Poisson distribution with mean equal to $expectation$.
// b1=pHatSquare*(-3*length*length+(length<<2)+((length*TEXT_LENGTH)<<1)-TEXT_LENGTH-1);
// b2=b1+variance-expectation;
poissonError=variance-expectation+StrictMath.scalb(pHatSquare,1)*(-3*length*length+(length<<2)+((length*TEXT_LENGTH)<<1)-TEXT_LENGTH-1);
if (TIGHT_POISSON_ERROR) poissonError*=-Math.expm1(0d-expectation)/expectation;
probability=(new PoissonDistribution(expectation)).cumulativeProbability(observedOccurrences);
}
else probability=(new NormalDistribution(expectation,Math.sqrt(variance))).cumulativeProbability(observedOccurrences);
out[0]=probability; out[1]=poissonError;
}
// ----------------------------------- TESTING ---------------------------------------
public static void main(String[] args) {
testCorrectness();
}
private static final void testCorrectness() {
final int TEXT_LENGTH = 10000;
final int N_ITERATIONS = 10;
final int STRINGS_PER_REGION = 30;
boolean longestBorder, isBorder;
int i, j, t, length, nBorders;
long[] borders = new long[4];
long a;
double p;
BernoulliSubstring top, suffix, buffer1, buffer2, tmp;
for (t=0; t<N_ITERATIONS; t++) {
String text = "";
for (i=0; i<TEXT_LENGTH; i++) {
p=Math.random();
if (p<0.25) text+='0';
else if (p<0.5) text+='1';
else if (p<0.75) text+='2';
else text+='3';
}
// Testing
Stack stack = new Stack((BernoulliSubstring.serializedSize(4,2)*STRINGS_PER_REGION)>>>6);
Stack characterStack = new Stack(STRINGS_PER_REGION);
top = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
suffix = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
buffer1 = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
buffer2 = new BernoulliSubstring(4,2,TEXT_LENGTH,32);
top.serialize(stack);
//top.print();
for (i=text.length()-1; i>=0; i--) { // Inserting $test$ from right to left
if (text.charAt(i)=='0') a=0L;
else if (text.charAt(i)=='1') a=1L;
else if (text.charAt(i)=='2') a=2L;
else a=3L;
characterStack.push(a,2);
// Computing borders using brute force
for (j=0; j<4; j++) borders[j]=-1;
for (length=1; length<text.length()-i; length++) {
isBorder=true;
for (j=0; j<length; j++) {
if (text.charAt(i+j)!=text.charAt(text.length()-length+j)) {
isBorder=false;
break;
}
}
if (isBorder) {
if (text.charAt(text.length()-length-1)=='0') borders[0]=length;
else if (text.charAt(text.length()-length-1)=='1') borders[1]=length;
else if (text.charAt(text.length()-length-1)=='2') borders[2]=length;
else borders[3]=length;
}
}
// Computing borders using recursion
tmp=suffix; suffix=top; top=tmp;
top.initFromSuffix(a,-1.386294361119891,suffix,stack,characterStack,buffer1,buffer2); // Overwrites all variables in $top$
top.serialize(stack); // Overwrites $address$
//top.print();
// Checking correctness
nBorders=0;
for (j=0; j<4; j++) {
if (borders[j]!=-1) nBorders++;
}
if (top.rightLength!=nBorders) {
System.err.println("ERROR at suffix <"+text.substring(i)+">:");
System.err.println("rightLength="+top.rightLength+" (should be "+nBorders+")");
System.exit(1);
}
for (j=0; j<top.rightLength; j++) {
if (borders[(int)top.rightCharacters[j]]==-1) {
System.err.println("ERROR at suffix <"+text.substring(i)+">:");
System.err.println("character "+top.rightCharacters[j]+" should not have a border");
System.exit(1);
}
stack.setPosition(top.rightPointers[j]);
buffer1.deserialize(stack);
if (buffer1.length!=borders[(int)top.rightCharacters[j]]) {
System.err.println("ERROR at suffix <"+text.substring(i)+">:");
System.err.println("character "+top.rightCharacters[j]+" should have a border of length "+borders[(int)top.rightCharacters[j]]+" rather than "+buffer1.length);
System.exit(1);
}
}
}
}
System.out.println("No errors found");
}
public void print() {
super.print();
System.out.println("pHatLog="+pHatLog);
System.out.println("capitalT="+capitalT);
System.out.println("capitalB="+capitalB);
}
}