Skip to content

Commit

Permalink
Merge pull request #408 from tsenart/to-dense
Browse files Browse the repository at this point in the history
roaring: implement ToDense and FromDense
  • Loading branch information
lemire authored Dec 18, 2023
2 parents c79926f + 3d497d5 commit cceddf2
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
162 changes: 162 additions & 0 deletions roaring.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,168 @@ func (rb *Bitmap) ToBytes() ([]byte, error) {
return rb.highlowcontainer.toBytes()
}

// Constants shared by the dense bitmap conversion routines.
const (
	// wordSize is the number of bits held by one word of a dense bitmap.
	wordSize = uint64(64)

	// log2WordSize is log2(wordSize); shifting a bit index right by it
	// yields the index of the word containing that bit.
	log2WordSize = uint64(6)

	// capacity is the largest value representable by a uint64.
	capacity = ^uint64(0)

	// bitmapContainerSize is the bitmap size in words: 1<<16 bits stored
	// as 64-bit words.
	bitmapContainerSize = (1 << 16) / 64
)

// DenseSize returns the size of the bitmap when stored as a dense bitmap.
func (rb *Bitmap) DenseSize() int {
if rb.highlowcontainer.size() == 0 {
return 0
}

maximum := 1 + uint64(rb.Maximum())
if maximum > (capacity - wordSize + 1) {
return int(capacity >> log2WordSize)

Check failure on line 69 in roaring.go

View workflow job for this annotation

GitHub Actions / test (1.17.x, ubuntu-latest)

constant 288230376151711743 overflows int

Check failure on line 69 in roaring.go

View workflow job for this annotation

GitHub Actions / test (1.14.x, ubuntu-latest)

constant 288230376151711743 overflows int
}

return int((maximum + (wordSize - 1)) >> log2WordSize)
}

// ToDense returns the bitmap as a newly allocated dense bitmap made of uint64 words.
// The slice has DenseSize() words; nil is returned when that size is 0.
//
// This is handy for handing a roaring bitmap over to libraries that work on dense
// bitmaps, such as https://github.com/bits-and-blooms/bitset or
// https://github.com/kelindar/bitmap
func (rb *Bitmap) ToDense() []uint64 {
	if n := rb.DenseSize(); n != 0 {
		dense := make([]uint64, n)
		rb.WriteDenseTo(dense)
		return dense
	}
	return nil
}

// FromDense builds a new bitmap out of a dense bitmap given as a slice of uint64 words.
// It is a fast and convenient way to turn bitmaps coming from libraries such as
// https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
// into roaring bitmaps.
//
// Only array and bitmap containers are produced, never run containers. Callers who want
// runs of consecutive values compressed further should call RunOptimize themselves.
//
// With doCopy set, every bitmap container gets its own copy of the words it covers. Use it
// when the input slice will be modified after this function returns, or when keeping
// references into a large input (which the GC could then not collect) is undesirable.
// Even with doCopy unset, one copy can still happen if the input length isn't divisible
// by bitmapContainerSize.
func FromDense(bitmap []uint64, doCopy bool) *Bitmap {
	// One container per bitmapContainerSize words, counting a trailing partial chunk.
	n := len(bitmap) / bitmapContainerSize
	if len(bitmap)%bitmapContainerSize != 0 {
		n++
	}

	rb := new(Bitmap)
	rb.highlowcontainer.containers = make([]container, 0, n)
	rb.highlowcontainer.keys = make([]uint16, 0, n)
	rb.highlowcontainer.needCopyOnWrite = make([]bool, 0, n)

	rb.FromDense(bitmap, doCopy)
	return rb
}

// FromDense unmarshalls from a slice of uint64s representing the bitmap as a dense bitmap.
// Useful to convert bitmaps from libraries like https://github.com/bits-and-blooms/bitset or
// https://github.com/kelindar/bitmap into roaring bitmaps fast and with convenience.
// Callers are responsible for ensuring that the bitmap is empty before calling this function.
//
// This function won't create any run containers, only array and bitmap containers. It's up to
// the caller to call RunOptimize if they want to further compress the runs of consecutive values.
//
// When doCopy is true, the bitmap is copied into a new slice for each bitmap container.
// This is useful when the bitmap is going to be modified after this function returns or if it's
// undesirable to hold references to large bitmaps which the GC wouldn't be able to collect.
// One copy can still happen even when doCopy is false if the bitmap length isn't divisible by bitmapContainerSize.
//
// Container keys are 16 bits wide, so only the first (1<<16)*bitmapContainerSize words of the
// input can be represented. Any words beyond that are ignored.
func (rb *Bitmap) FromDense(bitmap []uint64, doCopy bool) {
	const size = bitmapContainerSize

	// Cap the input at the addressable range. Without this the uint16 key counter
	// below would wrap around to 0 on longer inputs and containers would be
	// appended under keys that have already been used.
	const maxWords = (1 << 16) * size
	if len(bitmap) > maxWords {
		bitmap = bitmap[:maxWords]
	}

	// k is the key of the container built from the current chunk of words.
	var k uint16

	for len(bitmap) > 0 {
		// Take one container's worth of words, or whatever is left.
		hi := size
		if len(bitmap) < size {
			hi = len(bitmap)
		}

		words := bitmap[:hi]
		count := int(popcntSlice(words))

		switch {
		case count > arrayDefaultMaxSize:
			// Too many values for an array container: use a bitmap container,
			// sharing the caller's words unless a copy is requested or needed.
			c := &bitmapContainer{cardinality: count, bitmap: words}
			cow := true

			// A trailing partial chunk is always copied so that the container
			// holds a full-size word slice.
			if doCopy || len(words) < size {
				c.bitmap = make([]uint64, size)
				copy(c.bitmap, words)
				cow = false
			}

			rb.highlowcontainer.appendContainer(k, c, cow)

		case count > 0:
			// Few values: list them explicitly in an array container.
			c := &arrayContainer{content: make([]uint16, count)}
			var pos, base int
			for _, w := range words {
				for w != 0 {
					// t isolates the lowest set bit of w; the number of set
					// bits in t-1 is that bit's position within the word.
					t := w & -w
					c.content[pos] = uint16(base + int(popcount(t-1)))
					pos++
					w ^= t
				}
				base += 64
			}
			rb.highlowcontainer.appendContainer(k, c, false)
		}

		// Chunks with no set bits produce no container but still consume a key.
		bitmap = bitmap[hi:]
		k++
	}
}

// WriteDenseTo writes to a slice of uint64s representing the bitmap as a dense bitmap.
// Callers are responsible for allocating enough space in the bitmap using DenseSize.
// Useful to convert a roaring bitmap to a format that can be used by other libraries
// like https://github.com/bits-and-blooms/bitset or https://github.com/kelindar/bitmap
//
// Array and run containers are OR-ed into the destination, while bitmap containers
// overwrite the words they cover. It panics on a container type other than array,
// bitmap or run.
func (rb *Bitmap) WriteDenseTo(bitmap []uint64) {
	for i, ct := range rb.highlowcontainer.containers {
		// hb holds the container key in the high 16 bits of the 32-bit value.
		hb := uint32(rb.highlowcontainer.keys[i]) << 16

		// Word indexes below are computed by shifting while the value is still
		// an unsigned 32-bit integer. Converting to int first would produce a
		// negative index for values >= 1<<31 on platforms where int is 32 bits.
		switch c := ct.(type) {
		case *arrayContainer:
			for _, x := range c.content {
				n := int((hb | uint32(x)) >> log2WordSize)
				bitmap[n] |= uint64(1) << uint(x%64)
			}

		case *bitmapContainer:
			copy(bitmap[int(hb>>log2WordSize):], c.bitmap)

		case *runContainer16:
			for j := range c.iv {
				// The run covers [start, end), with end exclusive.
				start := uint32(c.iv[j].start)
				end := start + uint32(c.iv[j].length) + 1
				lo := int((hb | start) >> log2WordSize)
				hi := int((hb | (end - 1)) >> log2WordSize)

				// uint(-end) % 64 is the number of high bits to clear in the
				// last word; it is 0 when end falls on a word boundary.
				if lo == hi {
					// The whole run lives in a single word.
					bitmap[lo] |= (^uint64(0) << uint(start%64)) &
						(^uint64(0) >> (uint(-end) % 64))
					continue
				}

				// Partial first word, full middle words, partial last word.
				bitmap[lo] |= ^uint64(0) << uint(start%64)
				for n := lo + 1; n < hi; n++ {
					bitmap[n] = ^uint64(0)
				}
				bitmap[hi] |= ^uint64(0) >> (uint(-end) % 64)
			}
		default:
			panic("unsupported container type")
		}
	}
}

// Checksum computes a hash (currently FNV-1a) for a bitmap that is suitable for
// using bitmaps as elements in hash sets or as keys in hash maps, as well as
// generally quicker comparisons.
Expand Down
99 changes: 99 additions & 0 deletions roaring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package roaring

import (
"bytes"
"fmt"
"math"
"math/rand"
"strconv"
Expand Down Expand Up @@ -2543,6 +2544,104 @@ func TestIterateHalt(t *testing.T) {
assert.Equal(t, expected, values)
}

// testDense builds a set of fixture bitmaps and calls fn once per fixture, passing a
// name of the form "<label>-<cardinality>" along with the bitmap.
func testDense(fn func(string, *Bitmap)) {
	// arrayDefaultMaxSize+1 values, all placed just above MaxUint16.
	bitmapFixture := New()
	for i := 0; i <= arrayDefaultMaxSize; i++ {
		bitmapFixture.Add(uint32(1 + MaxUint16 + i*2))
	}

	// Two ranges; the second one is derived from the cardinality of the fixture above.
	runFixture := New()
	runFixture.AddRange(1, 2)
	runFixture.AddRange(bitmapFixture.GetCardinality(), bitmapFixture.GetCardinality()*2)

	// arrayDefaultMaxSize values, all placed just above MaxUint16.
	arrayFixture := New()
	for i := 1; i <= arrayDefaultMaxSize; i++ {
		arrayFixture.Add(uint32(MaxUint16 + i*2))
	}

	// A long stretch of consecutive values.
	mixedFixture := New()
	for v := 150000; v < 450000; v++ {
		mixedFixture.Add(uint32(v))
	}

	type fixture struct {
		name string
		rb   *Bitmap
	}
	fixtures := []fixture{
		{name: "bitmap", rb: bitmapFixture},
		{name: "run", rb: runFixture},
		{name: "array", rb: arrayFixture},
		{name: "bitmaps-and-runs", rb: mixedFixture},
	}

	for i := range fixtures {
		f := fixtures[i]
		label := f.name + "-" + strconv.FormatUint(f.rb.GetCardinality(), 10)
		fn(label, f.rb)
	}
}

func TestToDense(t *testing.T) {
testDense(func(name string, rb *Bitmap) {
t.Run(name, func(t *testing.T) {
bm := bitset.From(rb.ToDense())
assert.EqualValues(t, rb.GetCardinality(), uint64(bm.Count()))
rb.Iterate(func(x uint32) bool {
return assert.True(t, bm.Test(uint(x)), "value %d should be set", x)
})
})
})
}

func TestFromDense(t *testing.T) {
testDense(func(name string, rb *Bitmap) {
for _, doCopy := range []bool{false, true} {
t.Run(fmt.Sprintf("%s,doCopy=%t", name, doCopy), func(t *testing.T) {
dense := rb.ToDense()
cp := FromDense(dense, doCopy)
if doCopy {
// Clear the original dense slice to ensure we don't have any
// references to it
for i := range dense {
dense[i] = 0
}
}
assert.True(t, rb.Equals(cp))
})
}
})
}

// BenchmarkFromDense measures (*Bitmap).FromDense on every testDense fixture, with and
// without copying, reusing a single destination bitmap that is cleared after each call.
func BenchmarkFromDense(b *testing.B) {
	testDense(func(name string, rb *Bitmap) {
		dense := make([]uint64, rb.DenseSize())
		rb.WriteDenseTo(dense)

		// Build the destination once, then empty it: (*Bitmap).FromDense requires an
		// empty receiver, and without this the very first iteration would append onto
		// the containers created here.
		// NOTE(review): presumably built from dense to pre-size its internal slices;
		// confirm that Clear keeps their capacity.
		cp := FromDense(dense, false)
		cp.Clear()

		for _, doCopy := range []bool{false, true} {
			b.Run(fmt.Sprintf("%s,doCopy=%t", name, doCopy), func(b *testing.B) {
				b.ReportAllocs()
				b.SetBytes(int64(len(dense) * 8)) // 8 bytes per uint64 word
				b.ResetTimer()
				for i := 0; i < b.N; i++ {
					cp.FromDense(dense, doCopy)
					cp.Clear()
				}
			})
		}
	})
}

func BenchmarkWriteDenseTo(b *testing.B) {
testDense(func(name string, rb *Bitmap) {
b.Run(name, func(b *testing.B) {
dense := make([]uint64, rb.DenseSize())
b.ReportAllocs()
b.SetBytes(int64(len(dense) * 8))
b.ResetTimer()
for i := 0; i < b.N; i++ {
rb.WriteDenseTo(dense)
}
})
})
}

func BenchmarkEvenIntervalArrayUnions(b *testing.B) {
inputBitmaps := make([]*Bitmap, 40)
for i := 0; i < 40; i++ {
Expand Down

0 comments on commit cceddf2

Please sign in to comment.