-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMarkovSubstring.java.txt
114 lines (96 loc) · 3.76 KB
/
MarkovSubstring.java.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/**
* A substring that can compute a maximum-likelihood estimator of the expectation of the
* number of its occurrences in a string generated by an order-$k$ Markov chain, as well
* as scores of statistical surprise, using information from its suffix. See
* \cite{Monotony_of_surprise_and_large-scale_quest_for_unusual_words} for algorithmics
* and statistics.
*/
public class MarkovSubstring extends Substring {
/**
* Scores are defined in method $getScore$
*/
private static final byte SCORE_ID = 4;
private final byte MARKOV_ORDER;
/**
* $\log(f(v[0,i+MARKOV_ORDER]))+\sum_{i=1}^{|v|-MARKOV_ORDER-1}\log(f(v[i,i+MARKOV_ORDER]))-\log(f(v[i,i+MARKOV_ORDER-1]))$,
* where $f(w)$ is the number of occurrences of string $w$ in the text.
*/
public double expectationLog;
public double expectation;
public MarkovSubstring(byte markovOrder, int alphabetLength, int log2alphabetLength, int textLength, int log2textLength) {
super(alphabetLength,log2alphabetLength,textLength,log2textLength);
MARKOV_ORDER=markovOrder;
}
public void serialize(Stack stack) {
super.serialize(stack);
stack.push(Double.doubleToLongBits(expectationLog),64);
}
public void deserialize(Stack stack) {
super.deserialize(stack);
expectationLog=Double.longBitsToDouble(stack.read(64));
}
public int skip(Stack stack) {
final int log2address = super.skip(stack);
stack.skip(64);
return log2address;
}
/**
* For speed, the procedure assumes $(MARKOV_ORDER+1)*LOG2_ALPHABET_LENGTH<64$.
*
* @param frequency array indexed by all possible strings of length $MARKOV_ORDER$;
* cell $w$ contains the number of occurrences of string $reverse(w)$ in the text;
* @param plusOneFrequency array indexed by all possible strings of length
* $MARKOV_ORDER+1$; cell $w$ contains the number of occurrences of string
* $reverse(w)$ in the text.
*/
public void initFromSuffix(MarkovSubstring suffix, Stack characterStack, double[] frequency, double[] plusOneFrequency) {
final double numerator, denominator;
final long index;
this.suffix=suffix.address;
length=suffix.length+1;
if (length<MARKOV_ORDER) return;
if (length==MARKOV_ORDER) {
characterStack.setPosition(0);
index=characterStack.read(MARKOV_ORDER<<LOG2_LOG2_ALPHABET_LENGTH);
expectation=frequency[(int)index];
expectationLog=Math.log(expectation);
}
else if (length==MARKOV_ORDER+1) {
characterStack.setPosition(0);
index=characterStack.read((MARKOV_ORDER+1)<<LOG2_LOG2_ALPHABET_LENGTH);
expectation=plusOneFrequency[(int)index];
expectationLog=Math.log(expectation);
}
else {
characterStack.setPosition(length-MARKOV_ORDER-1);
index=characterStack.read((MARKOV_ORDER+1)<<LOG2_LOG2_ALPHABET_LENGTH);
numerator=plusOneFrequency[(int)index];
denominator=frequency[(int)(index>>LOG2_ALPHABET_LENGTH)];
expectationLog=suffix.expectation+numerator-denominator;
expectation=Math.exp(expectationLog);
}
}
/**
* Remark: score is not defined if $|v|<MARKOV_ORDER$.
*/
public final double getScore(double frequency) {
switch (SCORE_ID) {
case 1: return frequency-expectation;
case 2: return frequency/expectation;
case 3: return (frequency-expectation)/expectation;
case 4: return (frequency-expectation)/Math.sqrt(expectation);
case 5: return Math.abs(frequency-expectation)/Math.sqrt(expectation);
case 6: return (frequency-expectation)*(frequency-expectation)/expectation;
}
return 0;
}
/**
* @param stringType 0: over-represented; 1: under-represented; 2: both;
* @return 0: suffix-tree nodes; 1: one-character extensions of suffix-tree nodes;
* 2: both; 3: don't explore any node, i.e. the whole approach fails.
*/
public static final byte nodesToExplore(byte stringType) {
if (SCORE_ID<=4) return stringType;
else return 2;
}
}