Skip to content

Commit 154aa2a

Browse files
committed
db: rework merging iterator range deletion handling
Rework the merging iterator's handling of range deletions to use the interleaved range deletion boundary keys to determine when the iterator is positioned within a level's range deletion. This removes the direct manipulation of a range deletion keyspan.FragmentIterator from the mergingIter, delegating that to the child iterator's keyspan.InterleavingIter. This factoring is a little cleaner and decouples the mergingIter from the details of range deletion iterators, and in particular, the levelIter's individual sstables. It also should reduce key comparisons, especially during scans, by avoiding unnecessary key comparisons with range deletions that are loaded but outside the keyspace being iterated over. Close #2863.
1 parent 182f936 commit 154aa2a

17 files changed

+500
-658
lines changed

db.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/cockroachdb/pebble/internal/keyspan/keyspanimpl"
2424
"github.com/cockroachdb/pebble/internal/manifest"
2525
"github.com/cockroachdb/pebble/internal/manual"
26+
"github.com/cockroachdb/pebble/internal/rangedel"
2627
"github.com/cockroachdb/pebble/objstorage"
2728
"github.com/cockroachdb/pebble/objstorage/remote"
2829
"github.com/cockroachdb/pebble/rangekey"
@@ -1431,30 +1432,26 @@ func (i *Iterator) constructPointIter(
14311432
} else {
14321433
i.batch.initInternalIter(&i.opts, &i.batchPointIter)
14331434
i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum)
1435+
mil := mergingIterLevel{iter: &i.batchPointIter, getTombstone: nil}
14341436
// Only include the batch's rangedel iterator if it's non-empty.
14351437
// This requires some subtle logic in the case a rangedel is later
14361438
// written to the batch and the view of the batch is refreshed
14371439
// during a call to SetOptions—in this case, we need to reconstruct
14381440
// the point iterator to add the batch rangedel iterator.
1439-
var rangeDelIter keyspan.FragmentIterator
14401441
if i.batchRangeDelIter.Count() > 0 {
1441-
rangeDelIter = &i.batchRangeDelIter
1442+
mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, &i.batchPointIter, &i.batchRangeDelIter)
14421443
}
1443-
mlevels = append(mlevels, mergingIterLevel{
1444-
iter: &i.batchPointIter,
1445-
rangeDelIter: rangeDelIter,
1446-
})
1444+
mlevels = append(mlevels, mil)
14471445
}
14481446
}
14491447

14501448
if !i.batchOnlyIter {
14511449
// Next are the memtables.
14521450
for j := len(memtables) - 1; j >= 0; j-- {
14531451
mem := memtables[j]
1454-
mlevels = append(mlevels, mergingIterLevel{
1455-
iter: mem.newIter(&i.opts),
1456-
rangeDelIter: mem.newRangeDelIter(&i.opts),
1457-
})
1452+
mil := mergingIterLevel{}
1453+
mil.iter, mil.getTombstone = rangedel.Interleave(&i.comparer, mem.newIter(&i.opts), mem.newRangeDelIter(&i.opts))
1454+
mlevels = append(mlevels, mil)
14581455
}
14591456

14601457
// Next are the file levels: L0 sub-levels followed by lower levels.
@@ -1467,10 +1464,11 @@ func (i *Iterator) constructPointIter(
14671464
li := &levels[levelsIndex]
14681465

14691466
li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts)
1470-
li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter)
1467+
li.interleaveRangeDeletions = true
14711468
li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState)
14721469
mlevels[mlevelsIndex].levelIter = li
14731470
mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li)
1471+
mlevels[mlevelsIndex].getTombstone = li.getTombstone
14741472

14751473
levelsIndex++
14761474
mlevelsIndex++

external_iterator.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/cockroachdb/pebble/internal/base"
1212
"github.com/cockroachdb/pebble/internal/keyspan"
1313
"github.com/cockroachdb/pebble/internal/manifest"
14+
"github.com/cockroachdb/pebble/internal/rangedel"
1415
"github.com/cockroachdb/pebble/sstable"
1516
)
1617

@@ -212,10 +213,11 @@ func createExternalPointIter(ctx context.Context, it *Iterator) (topLevelIterato
212213
if err != nil {
213214
return nil, err
214215
}
215-
mlevels = append(mlevels, mergingIterLevel{
216-
iter: pointIter,
217-
rangeDelIter: rangeDelIter,
218-
})
216+
mil := mergingIterLevel{iter: pointIter, getTombstone: nil}
217+
if rangeDelIter != nil {
218+
mil.iter, mil.getTombstone = rangedel.Interleave(&it.comparer, mil.iter, rangeDelIter)
219+
}
220+
mlevels = append(mlevels, mil)
219221
}
220222
}
221223

internal/base/internal.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,3 +574,14 @@ func (kv *InternalKV) Visible(snapshot, batchSnapshot uint64) bool {
574574
func (kv *InternalKV) IsExclusiveSentinel() bool {
575575
return kv.K.IsExclusiveSentinel()
576576
}
577+
578+
// String returns a string representation of the kv pair.
579+
func (kv *InternalKV) String() string {
580+
if kv == nil {
581+
return "<nil>"
582+
}
583+
if kv.V.Fetcher != nil {
584+
return fmt.Sprintf("%s=<valblk:%x>", kv.K, kv.V.ValueOrHandle)
585+
}
586+
return fmt.Sprintf("%s:%s", kv.K, FormatBytes(kv.V.ValueOrHandle))
587+
}

internal/keyspan/span.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -298,10 +298,11 @@ func (s Span) Visible(snapshot uint64) Span {
298298
// VisibleAt requires the Span's keys be in ByTrailerDesc order. It panics if
299299
// the span's keys are sorted in a different order.
300300
func (s *Span) VisibleAt(snapshot uint64) bool {
301-
if s.KeysOrder != ByTrailerDesc {
301+
if s == nil {
302+
return false
303+
} else if s.KeysOrder != ByTrailerDesc {
302304
panic("pebble: span's keys unexpectedly not in trailer order")
303-
}
304-
if len(s.Keys) == 0 {
305+
} else if len(s.Keys) == 0 {
305306
return false
306307
} else if first := s.Keys[0].SeqNum(); first&base.InternalKeySeqNumBatch != 0 {
307308
// Only visible batch keys are included when an Iterator's batch spans

internal/rangedel/rangedel.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
package rangedel
66

77
import (
8+
"sync"
9+
810
"github.com/cockroachdb/pebble/internal/base"
911
"github.com/cockroachdb/pebble/internal/keyspan"
1012
)
@@ -41,3 +43,49 @@ func Decode(ik base.InternalKey, v []byte, keysDst []keyspan.Key) keyspan.Span {
4143
}),
4244
}
4345
}
46+
47+
// Interleave takes a point iterator and a range deletion iterator, returning an
48+
// iterator that interleaves range deletion boundary keys at the maximal
49+
// sequence number among the stream of point keys with SPANSTART and SPANEND key
50+
// kinds.
51+
//
52+
// In addition, Interleave returns a function that may be used to retrieve the
53+
// range tombstone overlapping the current iterator position, if any.
54+
//
55+
// The returned iterator must only be closed once.
56+
func Interleave(
57+
comparer *base.Comparer, iter base.InternalIterator, rangeDelIter keyspan.FragmentIterator,
58+
) (base.InternalIterator, func() *keyspan.Span) {
59+
// If there is no range deletion iterator, don't bother using an interleaving
60+
// iterator. We can return iter verbatim and a func that unconditionally
61+
// returns nil.
62+
if rangeDelIter == nil {
63+
return iter, nil
64+
}
65+
66+
ii := interleavingIterPool.Get().(*interleavingIter)
67+
ii.Init(comparer, iter, rangeDelIter, keyspan.InterleavingIterOpts{
68+
InterleaveEndKeys: true,
69+
UseBoundaryKeyKinds: true,
70+
})
71+
return ii, ii.Span
72+
}
73+
74+
var interleavingIterPool = sync.Pool{
75+
New: func() interface{} {
76+
return &interleavingIter{}
77+
},
78+
}
79+
80+
type interleavingIter struct {
81+
keyspan.InterleavingIter
82+
}
83+
84+
// Close closes the interleaving iterator and returns the interleaving iterator
85+
// to the pool.
86+
func (i *interleavingIter) Close() error {
87+
err := i.InterleavingIter.Close()
88+
*i = interleavingIter{}
89+
interleavingIterPool.Put(i)
90+
return err
91+
}

iterator.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1820,8 +1820,8 @@ func (i *Iterator) nextPrefix() IterValidityState {
18201820
return i.iterValidityState
18211821
}
18221822
if invariants.Enabled && !i.equal(i.iterKV.K.UserKey, i.key) {
1823-
i.opts.getLogger().Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1824-
i.iterKV.K.UserKey, i.key)
1823+
panic(errors.AssertionFailedf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1824+
i.iterKV.K.UserKey, i.key))
18251825
}
18261826
}
18271827
// The internal iterator is now positioned at i.key. Advance to the next

level_checker.go

Lines changed: 21 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"github.com/cockroachdb/pebble/internal/base"
1515
"github.com/cockroachdb/pebble/internal/keyspan"
1616
"github.com/cockroachdb/pebble/internal/manifest"
17+
"github.com/cockroachdb/pebble/internal/rangedel"
1718
)
1819

1920
// This file implements DB.CheckLevels() which checks that every entry in the
@@ -47,11 +48,10 @@ import (
4748

4849
// The per-level structure used by simpleMergingIter.
4950
type simpleMergingIterLevel struct {
50-
iter internalIterator
51-
rangeDelIter keyspan.FragmentIterator
52-
53-
iterKV *base.InternalKV
54-
tombstone *keyspan.Span
51+
iter internalIterator
52+
getTombstone func() *keyspan.Span
53+
iterKV *base.InternalKV
54+
iterTombstone *keyspan.Span
5555
}
5656

5757
type simpleMergingIter struct {
@@ -102,22 +102,6 @@ func (m *simpleMergingIter) init(
102102
if m.heap.len() == 0 {
103103
return
104104
}
105-
m.positionRangeDels()
106-
}
107-
108-
// Positions all the rangedel iterators at or past the current top of the
109-
// heap, using SeekGE().
110-
func (m *simpleMergingIter) positionRangeDels() {
111-
item := &m.heap.items[0]
112-
for i := range m.levels {
113-
l := &m.levels[i]
114-
if l.rangeDelIter == nil {
115-
continue
116-
}
117-
t, err := l.rangeDelIter.SeekGE(item.key.UserKey)
118-
m.err = firstError(m.err, err)
119-
l.tombstone = t
120-
}
121105
}
122106

123107
// Returns true if not yet done.
@@ -128,7 +112,12 @@ func (m *simpleMergingIter) step() bool {
128112
item := &m.heap.items[0]
129113
l := &m.levels[item.index]
130114
// Sentinels are not relevant for this point checking.
131-
if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
115+
if item.key.IsExclusiveSentinel() {
116+
l.iterTombstone = nil
117+
if item.key.Kind() != base.InternalKeyKindSpanEnd {
118+
l.iterTombstone = l.getTombstone()
119+
}
120+
} else if item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) {
132121
// This is a visible point key.
133122
if !m.handleVisiblePoint(item, l) {
134123
return false
@@ -187,7 +176,6 @@ func (m *simpleMergingIter) step() bool {
187176
}
188177
return false
189178
}
190-
m.positionRangeDels()
191179
return true
192180
}
193181

@@ -269,12 +257,12 @@ func (m *simpleMergingIter) handleVisiblePoint(
269257
// iterators must be positioned at a key > item.key.
270258
for level := item.index + 1; level < len(m.levels); level++ {
271259
lvl := &m.levels[level]
272-
if lvl.rangeDelIter == nil || lvl.tombstone.Empty() {
260+
if lvl.iterTombstone.Empty() {
273261
continue
274262
}
275-
if lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) {
263+
if lvl.iterTombstone.Contains(m.heap.cmp, item.key.UserKey) && lvl.iterTombstone.CoversAt(m.snapshot, item.key.SeqNum()) {
276264
m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s",
277-
lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey),
265+
lvl.iterTombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey),
278266
l.iter)
279267
return false
280268
}
@@ -593,20 +581,15 @@ func checkLevelsInternal(c *checkConfig) (err error) {
593581
err = firstError(err, l.iter.Close())
594582
l.iter = nil
595583
}
596-
if l.rangeDelIter != nil {
597-
err = firstError(err, l.rangeDelIter.Close())
598-
l.rangeDelIter = nil
599-
}
600584
}
601585
}()
602586

603587
memtables := c.readState.memtables
604588
for i := len(memtables) - 1; i >= 0; i-- {
605589
mem := memtables[i]
606-
mlevels = append(mlevels, simpleMergingIterLevel{
607-
iter: mem.newIter(nil),
608-
rangeDelIter: mem.newRangeDelIter(nil),
609-
})
590+
var smil simpleMergingIterLevel
591+
smil.iter, smil.getTombstone = rangedel.Interleave(c.comparer, mem.newIter(nil), mem.newRangeDelIter(nil))
592+
mlevels = append(mlevels, smil)
610593
}
611594

612595
current := c.readState.current
@@ -636,8 +619,9 @@ func checkLevelsInternal(c *checkConfig) (err error) {
636619
li := &levelIter{}
637620
li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter,
638621
manifest.L0Sublevel(sublevel), internalIterOpts{})
639-
li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
622+
li.interleaveRangeDeletions = true
640623
mlevelAlloc[0].iter = li
624+
mlevelAlloc[0].getTombstone = li.getTombstone
641625
mlevelAlloc = mlevelAlloc[1:]
642626
}
643627
for level := 1; level < len(current.Levels); level++ {
@@ -649,8 +633,9 @@ func checkLevelsInternal(c *checkConfig) (err error) {
649633
li := &levelIter{}
650634
li.init(context.Background(), iterOpts, c.comparer, c.newIters,
651635
current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{})
652-
li.initRangeDel(&mlevelAlloc[0].rangeDelIter)
636+
li.interleaveRangeDeletions = true
653637
mlevelAlloc[0].iter = li
638+
mlevelAlloc[0].getTombstone = li.getTombstone
654639
mlevelAlloc = mlevelAlloc[1:]
655640
}
656641

0 commit comments

Comments
 (0)