-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathHuffmanWaveletTree.java
322 lines (295 loc) · 11.4 KB
/
HuffmanWaveletTree.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
import java.util.Arrays;
/**
* Simplistic implementation of a Huffman-shaped wavelet tree with pointers, using
* Sebastiano Vigna's $Rank9$ data structure \cite{vigna2008broadword} to support rank
* operations inside a node.
*
* Remark: We try to optimize time rather than space. We allow construction to use
* $|s|(H_0(s)+1)$ bits of additional space, where $s$ is the input string, i.e. we don't
* implement in-place algorithms like \cite{tischler2011wavelet}.
*
* Remark: We assume that the alphabet is small. In this case, the Huffman-shaped wavelet
* tree with pointers is among the best solutions both in space and in time for general
* strings \cite{claude2009practical}, and it should behave even better for blocks of the
* BWT. We don't implement the wavelet matrix \cite{claude2012wavelet} because it is
* useful in practice only for large alphabets.
*/
public class HuffmanWaveletTree {
/*
* The symbols that occur in $string$, sorted lexicographically.
*/
private final int[] alphabet;
private final int alphabetLength, log2AlphabetLength;
/**
* Tree topology and rank data structures. A nonnegative value in $leftChild[i]$,
* $rightChild[i]$ and $*Parent[i]$ is the position of the corresponding internal node
* in the arrays. A negative value $c$ identifies position $-1-c$ in $alphabet$.
*/
private int[] leftChild, rightChild, nodeParent, leafParent;
protected Rank9[] rankDataStructures;
/**
* Maximum number of bits in the Huffman code
*/
public final int maxCodeLength;
/**
* @param alphabet only the distinct symbols that occur in $string$, sorted
* lexicographically;
* @param counts number of occurrences in $string$ of each symbol in $alphabet$.
*/
public HuffmanWaveletTree(IntArray string, int[] alphabet, IntArray counts) {
final long stringLength, suffixArrayLength;
int i, j, k, node, code, length;
long il;
boolean[][] codes;
int[] codeLengths;
long[] nBits;
float[] frequencies;
IntArray[] bitVectors;
// Building tree topology and bit vectors
stringLength=string.length();
this.alphabet=alphabet;
alphabetLength=alphabet.length;
if (alphabetLength==1) { // No need for a wavelet tree in this case
log2AlphabetLength=0;
codes=null;
codeLengths=null;
frequencies=null;
maxCodeLength=0;
bitVectors=null;
nBits=null;
return;
}
log2AlphabetLength=Utils.log2(alphabetLength);
codes = new boolean[alphabetLength][alphabetLength-1];
codeLengths = new int[alphabetLength];
frequencies = new float[alphabetLength];
for (i=0; i<alphabetLength; i++) frequencies[i]=((float)counts.getElementAt(i))/stringLength;
maxCodeLength=buildHuffmanCodes(frequencies,codes,codeLengths);
frequencies=null;
bitVectors = new IntArray[alphabetLength-1];
nBits = new long[alphabetLength-1];
for (i=0; i<alphabetLength; i++) {
node=leafParent[i];
while (node!=-1) {
nBits[node]+=counts.getElementAt(i);
node=nodeParent[node];
}
}
for (i=0; i<alphabetLength-1; i++) bitVectors[i] = new IntArray(nBits[i],1);
nBits=null;
// Pushing bits from $string$
for (il=0; il<stringLength; il++) {
j=Arrays.binarySearch(alphabet,(int)( string.getElementAt(il) ));
length=codeLengths[j];
node=alphabetLength-2;
for (k=length-1; k>=0; k--) {
if (codes[j][k]) {
bitVectors[node].pushFromRight(1); // $Rank9$ stores bits from right to left
node=rightChild[node];
}
else {
bitVectors[node].pushFromRight(0); // $Rank9$ stores bits from right to left
node=leftChild[node];
}
}
}
codes=null; codeLengths=null;
rankDataStructures = new Rank9[alphabetLength-1];
for (i=0; i<alphabetLength-1; i++) {
rankDataStructures[i] = new Rank9(bitVectors[i]);
bitVectors[i]=null;
}
bitVectors=null;
}
/**
* @param frequencies relative frequency of each symbol in $alphabet$; they are
* assumed to sum to one;
* @param codes output array that stores the Huffman code corresponding to each
* element of $alphabet$ at the end of the procedure, encoded as a *reversed* sequence
* of booleans; all cells of $codes$ are assumed to be FALSE at the beginning;
* representing a code as a sequence of booleans increases space (which is irrelevant
* under the assumption that $alphabet$ is small), but makes access to bits faster;
* @param codeLengths output array that stores the length in bits of each element
* of $codes$ at the end of the procedure; all cells of $codeLengths$ are assumed to
* be zero at the beginning;
* @return the maximum number in $codeLengths$.
*/
private final int buildHuffmanCodes(float[] frequencies, boolean[][] codes, int[] codeLengths) {
int i, j, leafQueueFront, nodeQueueFront, nodeQueueBack, address, node, child, maxCodeLength;
float nodeFrequency;
int[] stack, sortedAlphabet;
float[] sortedFrequencies, nodeFrequencies;
long[] tmpArray;
// Sorting alphabet by frequency
tmpArray = new long[alphabetLength];
for (i=0; i<alphabetLength; i++) tmpArray[i]=(((long)Float.floatToIntBits(frequencies[i]))<<32)|alphabet[i];
Arrays.sort(tmpArray);
sortedAlphabet = new int[alphabetLength];
sortedFrequencies = new float[alphabetLength];
for (i=0; i<alphabetLength; i++) {
sortedFrequencies[i]=Float.intBitsToFloat((int)((tmpArray[i]&Utils.shiftOnesLeft[32])>>>32));
sortedAlphabet[i]=(int)(tmpArray[i]&Utils.shiftOnesRight[32]);
}
tmpArray=null;
// Building tree topology
leftChild = new int[alphabetLength-1];
rightChild = new int[alphabetLength-1];
nodeParent = new int[alphabetLength-1];
for (i=0; i<alphabetLength-1; i++) nodeParent[i]=-1;
leafParent = new int[alphabetLength];
for (i=0; i<alphabetLength; i++) leafParent[i]=-1;
nodeFrequencies = new float[alphabetLength-1];
for (i=0; i<alphabetLength-1; i++) nodeFrequencies[i]=1f;
nodeQueueFront=0; nodeQueueBack=0; leafQueueFront=0;
while (alphabetLength-leafQueueFront+nodeQueueBack-nodeQueueFront>1) {
nodeFrequency=0f;
if (leafQueueFront<alphabetLength && sortedFrequencies[leafQueueFront]<nodeFrequencies[nodeQueueFront]) {
address=Arrays.binarySearch(alphabet,sortedAlphabet[leafQueueFront]);
leftChild[nodeQueueBack]=-1-address;
leafParent[address]=nodeQueueBack;
nodeFrequency=sortedFrequencies[leafQueueFront];
leafQueueFront++;
}
else {
address=nodeQueueFront;
leftChild[nodeQueueBack]=address;
nodeParent[address]=nodeQueueBack;
nodeFrequency=nodeFrequencies[nodeQueueFront];
nodeQueueFront++;
}
if (leafQueueFront<alphabetLength && sortedFrequencies[leafQueueFront]<nodeFrequencies[nodeQueueFront]) {
address=Arrays.binarySearch(alphabet,sortedAlphabet[leafQueueFront]);
rightChild[nodeQueueBack]=-1-address;
leafParent[address]=nodeQueueBack;
nodeFrequency+=sortedFrequencies[leafQueueFront];
leafQueueFront++;
}
else {
address=nodeQueueFront;
rightChild[nodeQueueBack]=address;
nodeParent[address]=nodeQueueBack;
nodeFrequency+=nodeFrequencies[nodeQueueFront];
nodeQueueFront++;
}
nodeFrequencies[nodeQueueBack]=nodeFrequency;
nodeQueueBack++;
}
sortedAlphabet=null;
sortedFrequencies=null;
nodeFrequencies=null;
// Assigning codes
for (i=0; i<alphabetLength; i++) {
codeLengths[i]=0;
node=leafParent[i];
child=-1-i;
while (node!=-1) {
if (child==rightChild[node]) codes[i][codeLengths[i]]=true;
codeLengths[i]++;
child=node;
node=nodeParent[node];
}
}
// Returning the length of the longest code
maxCodeLength=0;
for (i=0; i<alphabetLength; i++) {
if (codeLengths[i]>maxCodeLength) maxCodeLength=codeLengths[i];
}
return maxCodeLength;
}
/**
* @param position a valid position in $string$.
* @return the character in $alphabet$ that equals $string[position]$.
*/
public final int access(long position) {
if (alphabetLength==1) return alphabet[0];
int node = alphabetLength-2;
long effectivePosition;
while (node>=0) {
effectivePosition=(position/64)*64 + (64-(position%64)-1);
if (rankDataStructures[node].bitVector.getElementAt(effectivePosition)==0) {
position-=rankDataStructures[node].rank(position);
node=leftChild[node];
if (node<0) return alphabet[-1-node];
}
else {
position=rankDataStructures[node].rank(position);
node=rightChild[node];
if (node<0) return alphabet[-1-node];
}
}
return -1;
}
/**
* Computes the number of occurrences of every symbol in $[0..fullAlphabetLength]$
* before each position of a list of distinct positions relative to $string$.
* The procedure touches all nodes of the wavelet tree. At each node, all input
* positions are ranked, to limit cache misses.
*
* Remark: $[0..fullAlphabetLength]$ is a superset of the alphabet of $string$ (stored
* in $alphabet$), since $string$ could be a substring of a longer string, with
* reduced alphabet.
*
* @param nPositions number of positions in $string$ to rank, assumed to be at most
* $Integer.MAX_VALUE-1$;
* @param stack a temporary matrix with at least $fullAlphabetLength-1$ rows and
* $1+nPositions$ columns; the procedure assumes that the positions to be ranked are
* written in increasing order in row 0, starting from index 1 (one, not zero); the
* content of this matrix is altered by the procedure;
* @param output output matrix with at least $fullAlphabetLength$ rows and
* $nPositions$ columns; the alphabet is assumed to be sorted lexicographically; all
* cell of $output$ are assumed to be zero;
* @param ones a temporary array with at least $nPositions$ cells; the
* content of this array is altered by the procedure.
*/
public final void multirank(int fullAlphabetLength, int nPositions, long[][] stack, long[][] output, long[] ones) {
int i, j, currentBlock, lastBlock, node, address;
// Characters that do not belong to $alphabet$ are already handled by the
// assumption that all cells of $output$ are zero.
// Handling characters that belong to $alphabet$
if (alphabetLength==1) {
for (j=0; j<nPositions; j++) output[alphabet[0]][j]=stack[0][j+1];
return;
}
stack[0][0]=alphabetLength-2;
currentBlock=0; lastBlock=0;
while (currentBlock<=lastBlock) {
node=(int)stack[currentBlock][0];
for (i=0; i<nPositions; i++) ones[i]=rankDataStructures[node].rank(stack[currentBlock][1+i]);
address=leftChild[node];
if (address<0) {
for (i=0; i<nPositions; i++) output[alphabet[-1-address]][i]=stack[currentBlock][1+i]-ones[i];
}
else {
lastBlock++;
stack[lastBlock][0]=address;
for (i=0; i<nPositions; i++) stack[lastBlock][1+i]=stack[currentBlock][1+i]-ones[i];
}
address=rightChild[node];
if (address<0) System.arraycopy(ones,0,output[alphabet[-1-address]],0,nPositions);
else {
lastBlock++;
stack[lastBlock][0]=address;
System.arraycopy(ones,0,stack[lastBlock],1,nPositions);
}
currentBlock++;
}
}
public void print() {
System.out.print("leftChild: ");
for (int i=0; i<alphabetLength-1; i++) System.out.print(leftChild[i]+" ");
System.out.println();
System.out.print("rightChild: ");
for (int i=0; i<alphabetLength-1; i++) System.out.print(rightChild[i]+" ");
System.out.println();
System.out.print("nodeParent: ");
for (int i=0; i<alphabetLength-1; i++) System.out.print(nodeParent[i]+" ");
System.out.println();
System.out.print("leafParent: ");
for (int i=0; i<alphabetLength; i++) System.out.print(leafParent[i]+" ");
System.out.println();
System.out.println("bitvectors: ");
for (int i=0; i<alphabetLength-1; i++) {
rankDataStructures[i].bitVector.printBits(); System.out.println();
}
}
}