|
| 1 | +/** |
| 2 | + * This code is released under the |
| 3 | + * Apache License Version 2.0 http://www.apache.org/licenses/. |
| 4 | + * |
| 5 | + * (c) Daniel Lemire, http://lemire.me/en/ |
| 6 | + * (c) Intel Corp. (for Vector implementation) |
| 7 | + */ |
| 8 | +package me.lemire.integercompression.vector; |
| 9 | + |
| 10 | +import jdk.incubator.vector.IntVector; |
| 11 | +import jdk.incubator.vector.VectorOperators; |
| 12 | +import jdk.incubator.vector.VectorSpecies; |
| 13 | +import me.lemire.integercompression.IntWrapper; |
| 14 | +import me.lemire.integercompression.IntegerCODEC; |
| 15 | +import me.lemire.integercompression.SkippableIntegerCODEC; |
| 16 | +import me.lemire.integercompression.Util; |
| 17 | +import me.lemire.integercompression.vector.VectorBitPackerKernels.LaneWidth; |
| 18 | + |
| 19 | +/** |
| 20 | + * BinaryPacking using the Vector API pack/unpack kernels: each block is packed at |
| 21 | + * its own maximum bit width with no exceptions, so encoding is a single maxbits |
| 22 | + * pass plus a vectorized pack. It encodes integers in blocks of BLOCK_SIZE |
| 23 | + * integers. For arrays containing an arbitrary number of integers, you should use |
| 24 | + * it in conjunction with another CODEC: |
| 25 | + * |
| 26 | + * <pre>IntegerCODEC ic = |
| 27 | + * new Composition(new VectorBinaryPacking(), new VariableByte()).</pre> |
| 28 | + * |
| 29 | + * Note that this does not use differential coding: if you are working on sorted |
| 30 | + * lists, use IntegratedBinaryPacking instead. |
| 31 | + * |
| 32 | + * Blocks are packed in a vectorized layout that differs by hardware vector lane |
| 33 | + * width. The lane width is fixed at construction and not stored on the wire, so a |
| 34 | + * stream must be decoded at the same lane width it was encoded at. The default |
| 35 | + * constructor packs at this machine's preferred width; the {@code (LaneWidth)} |
| 36 | + * constructor pins a width so a heterogeneous cluster can decode on its narrowest |
| 37 | + * node. |
| 38 | + * |
| 39 | + * @author Daniel Lemire |
| 40 | + */ |
| 41 | +public final class VectorBinaryPacking implements IntegerCODEC, SkippableIntegerCODEC { |
| 42 | + public final static int BLOCK_SIZE = 256; |
| 43 | + private static final int MAX_BIT_WIDTH = Integer.SIZE; |
| 44 | + // Output words a packed block occupies per bit of width (BLOCK_SIZE / Integer.SIZE). |
| 45 | + private static final int WORDS_PER_BLOCK_BIT = BLOCK_SIZE / Integer.SIZE; |
| 46 | + // Blocks sharing one packed header word (four max-bit values, one byte each). |
| 47 | + private static final int GROUP_SIZE_IN_BLOCKS = 4; |
| 48 | + // The OR-reduction result is independent of vector width, so it uses the widest |
| 49 | + // available species regardless of the wire lane width. |
| 50 | + private static final VectorSpecies<Integer> MAXBITS_SPECIES = IntVector.SPECIES_PREFERRED; |
| 51 | + |
| 52 | + private final VectorBitPackerKernels kernel; |
| 53 | + |
| 54 | + /** Packs at this machine's preferred vector lane width. */ |
| 55 | + public VectorBinaryPacking() { |
| 56 | + this(LaneWidth.PREFERRED); |
| 57 | + } |
| 58 | + |
| 59 | + /** Pins the lane width so a heterogeneous cluster can decode on its narrowest node. */ |
| 60 | + public VectorBinaryPacking(LaneWidth laneWidth) { |
| 61 | + this.kernel = laneWidth.kernel; |
| 62 | + } |
| 63 | + |
| 64 | + @Override |
| 65 | + public void compress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) { |
| 66 | + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); |
| 67 | + if (inlength == 0) |
| 68 | + return; |
| 69 | + out[outpos.get()] = inlength; |
| 70 | + outpos.increment(); |
| 71 | + headlessCompress(in, inpos, inlength, out, outpos); |
| 72 | + } |
| 73 | + |
| 74 | + @Override |
| 75 | + public void headlessCompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) { |
| 76 | + inlength = Util.greatestMultiple(inlength, BLOCK_SIZE); |
| 77 | + int tmpoutpos = outpos.get(); |
| 78 | + int s = inpos.get(); |
| 79 | + for (; s + BLOCK_SIZE * 4 - 1 < inpos.get() + inlength; s += BLOCK_SIZE * 4) { |
| 80 | + final int mbits1 = maxbits(in, s); |
| 81 | + final int mbits2 = maxbits(in, s + BLOCK_SIZE); |
| 82 | + final int mbits3 = maxbits(in, s + 2 * BLOCK_SIZE); |
| 83 | + final int mbits4 = maxbits(in, s + 3 * BLOCK_SIZE); |
| 84 | + out[tmpoutpos++] = (mbits1 << 24) | (mbits2 << 16) | (mbits3 << 8) | (mbits4); |
| 85 | + kernel.fastpackNoMask(in, s, out, tmpoutpos, mbits1); |
| 86 | + tmpoutpos += WORDS_PER_BLOCK_BIT * mbits1; |
| 87 | + kernel.fastpackNoMask(in, s + BLOCK_SIZE, out, tmpoutpos, mbits2); |
| 88 | + tmpoutpos += WORDS_PER_BLOCK_BIT * mbits2; |
| 89 | + kernel.fastpackNoMask(in, s + 2 * BLOCK_SIZE, out, tmpoutpos, mbits3); |
| 90 | + tmpoutpos += WORDS_PER_BLOCK_BIT * mbits3; |
| 91 | + kernel.fastpackNoMask(in, s + 3 * BLOCK_SIZE, out, tmpoutpos, mbits4); |
| 92 | + tmpoutpos += WORDS_PER_BLOCK_BIT * mbits4; |
| 93 | + } |
| 94 | + for (; s < inpos.get() + inlength; s += BLOCK_SIZE) { |
| 95 | + final int mbits = maxbits(in, s); |
| 96 | + out[tmpoutpos++] = mbits; |
| 97 | + kernel.fastpackNoMask(in, s, out, tmpoutpos, mbits); |
| 98 | + tmpoutpos += WORDS_PER_BLOCK_BIT * mbits; |
| 99 | + } |
| 100 | + inpos.add(inlength); |
| 101 | + outpos.set(tmpoutpos); |
| 102 | + } |
| 103 | + |
| 104 | + @Override |
| 105 | + public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos) { |
| 106 | + if (inlength == 0) |
| 107 | + return; |
| 108 | + final int outlength = in[inpos.get()]; |
| 109 | + inpos.increment(); |
| 110 | + headlessUncompress(in, inpos, inlength, out, outpos, outlength); |
| 111 | + } |
| 112 | + |
| 113 | + @Override |
| 114 | + public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] out, IntWrapper outpos, int num) { |
| 115 | + final int outlength = Util.greatestMultiple(num, BLOCK_SIZE); |
| 116 | + int tmpinpos = inpos.get(); |
| 117 | + int s = outpos.get(); |
| 118 | + for (; s + BLOCK_SIZE * 4 - 1 < outpos.get() + outlength; s += BLOCK_SIZE * 4) { |
| 119 | + final int mbits1 = (in[tmpinpos] >>> 24); |
| 120 | + final int mbits2 = (in[tmpinpos] >>> 16) & 0xFF; |
| 121 | + final int mbits3 = (in[tmpinpos] >>> 8) & 0xFF; |
| 122 | + final int mbits4 = (in[tmpinpos]) & 0xFF; |
| 123 | + ++tmpinpos; |
| 124 | + kernel.fastunpack(in, tmpinpos, out, s, mbits1); |
| 125 | + tmpinpos += WORDS_PER_BLOCK_BIT * mbits1; |
| 126 | + kernel.fastunpack(in, tmpinpos, out, s + BLOCK_SIZE, mbits2); |
| 127 | + tmpinpos += WORDS_PER_BLOCK_BIT * mbits2; |
| 128 | + kernel.fastunpack(in, tmpinpos, out, s + 2 * BLOCK_SIZE, mbits3); |
| 129 | + tmpinpos += WORDS_PER_BLOCK_BIT * mbits3; |
| 130 | + kernel.fastunpack(in, tmpinpos, out, s + 3 * BLOCK_SIZE, mbits4); |
| 131 | + tmpinpos += WORDS_PER_BLOCK_BIT * mbits4; |
| 132 | + } |
| 133 | + for (; s < outpos.get() + outlength; s += BLOCK_SIZE) { |
| 134 | + final int mbits = in[tmpinpos]; |
| 135 | + ++tmpinpos; |
| 136 | + kernel.fastunpack(in, tmpinpos, out, s, mbits); |
| 137 | + tmpinpos += WORDS_PER_BLOCK_BIT * mbits; |
| 138 | + } |
| 139 | + outpos.add(outlength); |
| 140 | + inpos.set(tmpinpos); |
| 141 | + } |
| 142 | + |
| 143 | + @Override |
| 144 | + public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { |
| 145 | + int blockCount = inlength / BLOCK_SIZE; |
| 146 | + int headersSizeInInts = blockCount / GROUP_SIZE_IN_BLOCKS + (blockCount % GROUP_SIZE_IN_BLOCKS); |
| 147 | + int blocksSizeInInts = blockCount * MAX_BIT_WIDTH * WORDS_PER_BLOCK_BIT; |
| 148 | + compressedPositions.add(blockCount * BLOCK_SIZE); |
| 149 | + return headersSizeInInts + blocksSizeInInts; |
| 150 | + } |
| 151 | + |
| 152 | + // Maximum bit width needed for a BLOCK_SIZE-value block: OR-reduce the values, then count significant bits. |
| 153 | + private static int maxbits(int[] in, int pos) { |
| 154 | + IntVector accumulator = IntVector.zero(MAXBITS_SPECIES); |
| 155 | + for (int offset = 0; offset < BLOCK_SIZE; offset += MAXBITS_SPECIES.length()) { |
| 156 | + accumulator = accumulator.or(IntVector.fromArray(MAXBITS_SPECIES, in, pos + offset)); |
| 157 | + } |
| 158 | + int mask = accumulator.reduceLanes(VectorOperators.OR); |
| 159 | + return Integer.SIZE - Integer.numberOfLeadingZeros(mask); |
| 160 | + } |
| 161 | + |
| 162 | + @Override |
| 163 | + public String toString() { |
| 164 | + return this.getClass().getSimpleName() + "(" + kernel.getClass().getSimpleName() + ")"; |
| 165 | + } |
| 166 | +} |
0 commit comments