Skip to content

Commit 4d90b1c

Browse files
authored
Merge pull request #72 from raunaqmorarka/vector-binary-packing
Add vectorized BinaryPacking int codec
2 parents acc89e2 + 7d7c36b commit 4d90b1c

4 files changed

Lines changed: 220 additions & 0 deletions

File tree

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
/**
2+
* This code is released under the
3+
* Apache License Version 2.0 http://www.apache.org/licenses/.
4+
*
5+
* (c) Daniel Lemire, http://lemire.me/en/
6+
* (c) Intel Corp. (for Vector implementation)
7+
*/
8+
package me.lemire.integercompression.vector;
9+
10+
import jdk.incubator.vector.IntVector;
11+
import jdk.incubator.vector.VectorOperators;
12+
import jdk.incubator.vector.VectorSpecies;
13+
import me.lemire.integercompression.IntWrapper;
14+
import me.lemire.integercompression.IntegerCODEC;
15+
import me.lemire.integercompression.SkippableIntegerCODEC;
16+
import me.lemire.integercompression.Util;
17+
import me.lemire.integercompression.vector.VectorBitPackerKernels.LaneWidth;
18+
19+
/**
20+
* BinaryPacking using the Vector API pack/unpack kernels: each block is packed at
21+
* its own maximum bit width with no exceptions, so encoding is a single maxbits
22+
* pass plus a vectorized pack. It encodes integers in blocks of BLOCK_SIZE
23+
* integers. For arrays containing an arbitrary number of integers, you should use
24+
* it in conjunction with another CODEC:
25+
*
26+
* <pre>IntegerCODEC ic =
27+
* new Composition(new VectorBinaryPacking(), new VariableByte()).</pre>
28+
*
29+
* Note that this does not use differential coding: if you are working on sorted
30+
* lists, use IntegratedBinaryPacking instead.
31+
*
32+
* Blocks are packed in a vectorized layout that differs by hardware vector lane
33+
* width. The lane width is fixed at construction and not stored on the wire, so a
34+
* stream must be decoded at the same lane width it was encoded at. The default
35+
* constructor packs at this machine's preferred width; the {@code (LaneWidth)}
36+
* constructor pins a width so a heterogeneous cluster can decode on its narrowest
37+
* node.
38+
*
39+
* @author Daniel Lemire
40+
*/
41+
public final class VectorBinaryPacking implements IntegerCODEC, SkippableIntegerCODEC {
42+
public final static int BLOCK_SIZE = 256;
43+
private static final int MAX_BIT_WIDTH = Integer.SIZE;
44+
// Output words a packed block occupies per bit of width (BLOCK_SIZE / Integer.SIZE).
45+
private static final int WORDS_PER_BLOCK_BIT = BLOCK_SIZE / Integer.SIZE;
46+
// Blocks sharing one packed header word (four max-bit values, one byte each).
47+
private static final int GROUP_SIZE_IN_BLOCKS = 4;
48+
// The OR-reduction result is independent of vector width, so it uses the widest
49+
// available species regardless of the wire lane width.
50+
private static final VectorSpecies<Integer> MAXBITS_SPECIES = IntVector.SPECIES_PREFERRED;
51+
52+
private final VectorBitPackerKernels kernel;
53+
54+
/** Packs at this machine's preferred vector lane width. */
55+
public VectorBinaryPacking() {
56+
this(LaneWidth.PREFERRED);
57+
}
58+
59+
/** Pins the lane width so a heterogeneous cluster can decode on its narrowest node. */
60+
public VectorBinaryPacking(LaneWidth laneWidth) {
61+
this.kernel = laneWidth.kernel;
62+
}
63+
64+
@Override
65+
public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
66+
inlength = Util.greatestMultiple(inlength, BLOCK_SIZE);
67+
if (inlength == 0)
68+
return;
69+
out[outpos.get()] = inlength;
70+
outpos.increment();
71+
headlessCompress(in, inpos, inlength, out, outpos);
72+
}
73+
74+
@Override
75+
public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
76+
inlength = Util.greatestMultiple(inlength, BLOCK_SIZE);
77+
int tmpoutpos = outpos.get();
78+
int s = inpos.get();
79+
for (; s + BLOCK_SIZE * 4 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 4) {
80+
final int mbits1 = maxbits(in, s);
81+
final int mbits2 = maxbits(in, s + BLOCK_SIZE);
82+
final int mbits3 = maxbits(in, s + 2 * BLOCK_SIZE);
83+
final int mbits4 = maxbits(in, s + 3 * BLOCK_SIZE);
84+
out[tmpoutpos++] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | (mbits4);
85+
kernel.fastpackNoMask(in, s, out, tmpoutpos, mbits1);
86+
tmpoutpos += WORDS_PER_BLOCK_BIT * mbits1;
87+
kernel.fastpackNoMask(in, s + BLOCK_SIZE, out, tmpoutpos, mbits2);
88+
tmpoutpos += WORDS_PER_BLOCK_BIT * mbits2;
89+
kernel.fastpackNoMask(in, s + 2 * BLOCK_SIZE, out, tmpoutpos, mbits3);
90+
tmpoutpos += WORDS_PER_BLOCK_BIT * mbits3;
91+
kernel.fastpackNoMask(in, s + 3 * BLOCK_SIZE, out, tmpoutpos, mbits4);
92+
tmpoutpos += WORDS_PER_BLOCK_BIT * mbits4;
93+
}
94+
for (; s < inpos.get() + inlength; s += BLOCK_SIZE) {
95+
final int mbits = maxbits(in, s);
96+
out[tmpoutpos++] = mbits;
97+
kernel.fastpackNoMask(in, s, out, tmpoutpos, mbits);
98+
tmpoutpos += WORDS_PER_BLOCK_BIT * mbits;
99+
}
100+
inpos.add(inlength);
101+
outpos.set(tmpoutpos);
102+
}
103+
104+
@Override
105+
public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) {
106+
if (inlength == 0)
107+
return;
108+
final int outlength = in[inpos.get()];
109+
inpos.increment();
110+
headlessUncompress(in, inpos, inlength, out, outpos, outlength);
111+
}
112+
113+
@Override
114+
public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) {
115+
final int outlength = Util.greatestMultiple(num, BLOCK_SIZE);
116+
int tmpinpos = inpos.get();
117+
int s = outpos.get();
118+
for (; s + BLOCK_SIZE * 4 - 1 < outpos.get() + outlength; s += BLOCK_SIZE * 4) {
119+
final int mbits1 = (in[tmpinpos] >>> 24);
120+
final int mbits2 = (in[tmpinpos] >>> 16) & 0xFF;
121+
final int mbits3 = (in[tmpinpos] >>> 8) & 0xFF;
122+
final int mbits4 = (in[tmpinpos]) & 0xFF;
123+
++tmpinpos;
124+
kernel.fastunpack(in, tmpinpos, out, s, mbits1);
125+
tmpinpos += WORDS_PER_BLOCK_BIT * mbits1;
126+
kernel.fastunpack(in, tmpinpos, out, s + BLOCK_SIZE, mbits2);
127+
tmpinpos += WORDS_PER_BLOCK_BIT * mbits2;
128+
kernel.fastunpack(in, tmpinpos, out, s + 2 * BLOCK_SIZE, mbits3);
129+
tmpinpos += WORDS_PER_BLOCK_BIT * mbits3;
130+
kernel.fastunpack(in, tmpinpos, out, s + 3 * BLOCK_SIZE, mbits4);
131+
tmpinpos += WORDS_PER_BLOCK_BIT * mbits4;
132+
}
133+
for (; s < outpos.get() + outlength; s += BLOCK_SIZE) {
134+
final int mbits = in[tmpinpos];
135+
++tmpinpos;
136+
kernel.fastunpack(in, tmpinpos, out, s, mbits);
137+
tmpinpos += WORDS_PER_BLOCK_BIT * mbits;
138+
}
139+
outpos.add(outlength);
140+
inpos.set(tmpinpos);
141+
}
142+
143+
@Override
144+
public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) {
145+
int blockCount = inlength / BLOCK_SIZE;
146+
int headersSizeInInts = blockCount / GROUP_SIZE_IN_BLOCKS + (blockCount % GROUP_SIZE_IN_BLOCKS);
147+
int blocksSizeInInts = blockCount * MAX_BIT_WIDTH * WORDS_PER_BLOCK_BIT;
148+
compressedPositions.add(blockCount * BLOCK_SIZE);
149+
return headersSizeInInts + blocksSizeInInts;
150+
}
151+
152+
// Maximum bit width needed for a BLOCK_SIZE-value block: OR-reduce the values, then count significant bits.
153+
private static int maxbits(int[] in, int pos) {
154+
IntVector accumulator = IntVector.zero(MAXBITS_SPECIES);
155+
for (int offset = 0; offset < BLOCK_SIZE; offset += MAXBITS_SPECIES.length()) {
156+
accumulator = accumulator.or(IntVector.fromArray(MAXBITS_SPECIES, in, pos + offset));
157+
}
158+
int mask = accumulator.reduceLanes(VectorOperators.OR);
159+
return Integer.SIZE - Integer.numberOfLeadingZeros(mask);
160+
}
161+
162+
@Override
163+
public String toString() {
164+
return this.getClass().getSimpleName() + "(" + kernel.getClass().getSimpleName() + ")";
165+
}
166+
}

src/test/java/me/lemire/integercompression/BasicTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import me.lemire.integercompression.differential.IntegratedVariableByte;
1818
import me.lemire.integercompression.differential.XorBinaryPacking;
1919
import me.lemire.integercompression.synth.ClusteredDataGenerator;
20+
import me.lemire.integercompression.vector.VectorBinaryPacking;
2021
import me.lemire.integercompression.vector.VectorFastPFOR;
2122

2223
import org.junit.Test;
@@ -45,6 +46,7 @@ public class BasicTest {
4546
new Composition(new FastPFOR128(), new VariableByte()),
4647
new Composition(new FastPFOR(), new VariableByte()),
4748
new Composition(new VectorFastPFOR(), new VariableByte()),
49+
new Composition(new VectorBinaryPacking(), new VariableByte()),
4850
new Simple9(),
4951
new Simple16(),
5052
new GroupSimple9(),

src/test/java/me/lemire/integercompression/SkippableBasicTest.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import me.lemire.integercompression.differential.IntegratedVariableByte;
1414
import me.lemire.integercompression.differential.SkippableIntegratedComposition;
1515
import me.lemire.integercompression.differential.SkippableIntegratedIntegerCODEC;
16+
import me.lemire.integercompression.vector.VectorBinaryPacking;
1617
import me.lemire.integercompression.vector.VectorFastPFOR;
1718
import org.junit.Test;
1819

@@ -39,6 +40,7 @@ public class SkippableBasicTest {
3940
new SkippableComposition(new FastPFOR128(), new VariableByte()),
4041
new SkippableComposition(new FastPFOR(), new VariableByte()),
4142
new SkippableComposition(new VectorFastPFOR(), new VariableByte()),
43+
new SkippableComposition(new VectorBinaryPacking(), new VariableByte()),
4244
new Simple9(),
4345
new Simple16() };
4446

@@ -165,6 +167,8 @@ public void testMaxHeadlessCompressedLength() {
165167
testMaxHeadlessCompressedLength(new BinaryPacking(), 16 * BinaryPacking.BLOCK_SIZE, 32);
166168
testMaxHeadlessCompressedLength(new VariableByte(), 128, 32);
167169
testMaxHeadlessCompressedLength(new SkippableComposition(new BinaryPacking(), new VariableByte()), 16 * BinaryPacking.BLOCK_SIZE + 10, 32);
170+
testMaxHeadlessCompressedLength(new VectorBinaryPacking(), 4 * VectorBinaryPacking.BLOCK_SIZE, 32);
171+
testMaxHeadlessCompressedLength(new SkippableComposition(new VectorBinaryPacking(), new VariableByte()), 4 * VectorBinaryPacking.BLOCK_SIZE + 10, 32);
168172
testMaxHeadlessCompressedLength(new JustCopy(), 128, 32);
169173
testMaxHeadlessCompressedLength(new Simple9(), 128, 28);
170174
testMaxHeadlessCompressedLength(new Simple16(), 128, 28);
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/**
2+
* This code is released under the
3+
* Apache License Version 2.0 http://www.apache.org/licenses/.
4+
*
5+
* (c) Daniel Lemire, http://lemire.me/en/
6+
*/
7+
package me.lemire.integercompression.vector;
8+
9+
import static org.junit.Assert.assertArrayEquals;
10+
11+
import org.junit.Test;
12+
13+
import me.lemire.integercompression.IntWrapper;
14+
import me.lemire.integercompression.vector.VectorBitPackerKernels.LaneWidth;
15+
16+
/**
17+
* Tests for the vectorized BinaryPacking codec.
18+
*/
19+
public class VectorBinaryPackingTest {
20+
21+
/** Every lane width packs and unpacks back to the original values. */
22+
@Test
23+
public void roundTripAcrossLaneWidths() {
24+
for (LaneWidth laneWidth : LaneWidth.values()) {
25+
roundTrip(new VectorBinaryPacking(laneWidth));
26+
}
27+
}
28+
29+
private static void roundTrip(VectorBinaryPacking codec) {
30+
int[] data = new int[3 * VectorBinaryPacking.BLOCK_SIZE];
31+
for (int i = 0; i < data.length; i++) {
32+
data[i] = i % 8; // mix of bit widths across blocks
33+
}
34+
data[5] = 1 << 20;
35+
data[600] = 1 << 30;
36+
37+
int[] compressed = new int[2 * data.length];
38+
IntWrapper inpos = new IntWrapper(0);
39+
IntWrapper outpos = new IntWrapper(0);
40+
codec.headlessCompress(data, inpos, data.length, compressed, outpos);
41+
42+
int[] recovered = new int[data.length];
43+
codec.headlessUncompress(compressed, new IntWrapper(0), outpos.get(),
44+
recovered, new IntWrapper(0), data.length);
45+
46+
assertArrayEquals(data, recovered);
47+
}
48+
}

0 commit comments

Comments
 (0)