Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 31 additions & 48 deletions pkg/kv/kvserver/client_merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3852,7 +3852,7 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
// NOTE: There are no range-local keys or lock table keys, in [d,
// /Max) in the store we're sending a snapshot to, so we aren't
// expecting SSTs to clear those keys.
expectedSSTCount := 9
expectedSSTCount := 12
if len(sstNames) != expectedSSTCount {
return errors.Errorf("expected to ingest %d SSTs, got %d SSTs",
expectedSSTCount, len(sstNames))
Expand Down Expand Up @@ -3963,69 +3963,52 @@ func TestStoreRangeMergeRaftSnapshot(t *testing.T) {
// Keep the last one which contains the user keys.
expectedSSTs = expectedSSTs[len(expectedSSTs)-1:]

// Construct SSTs for the range-id local keys of the subsumed
// replicas. with RangeIDs 3 and 4. Note that this also targets the
// unreplicated rangeID-based keys because we're effectively
// replicaGC'ing these replicas (while absorbing their user keys
// into the LHS).
// Construct SSTs for the RangeID-local keys of the subsumed replicas,
// with range IDs 3 and 4. Note that this also targets the unreplicated
// rangeID-based keys because we're effectively replicaGC'ing these
// replicas (while absorbing their user keys into the LHS).
for _, k := range []roachpb.Key{keyB, keyC} {
rangeID := rangeIds[string(k)]
sstFile := &storage.MemObject{}
sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
defer sst.Close()
{
// The snapshot code will use ClearRangeWithHeuristic with a
// threshold of 1 to clear the range, but this will truncate
// the range tombstone to the first key. In this case, the
// first key is RangeGCThresholdKey, which doesn't yet exist
// in the engine, so we write the Pebble range tombstone
// manually.
sl := rditer.Select(rangeID, rditer.SelectOpts{
ReplicatedByRangeID: true,
})
require.Len(t, sl, 1)
s := sl[0]
require.NoError(t, sst.ClearRawRange(keys.RangeGCThresholdKey(rangeID), s.EndKey, true, false))
}
{
// Ditto for the unreplicated version, where the first key
// happens to be the HardState.
sl := rditer.Select(rangeID, rditer.SelectOpts{
UnreplicatedByRangeID: true,
})
require.Len(t, sl, 1)
s := sl[0]
require.NoError(t, sst.ClearRawRange(keys.RaftHardStateKey(rangeID), s.EndKey, true, false))
spans := rditer.Select(rangeID, rditer.SelectOpts{
ReplicatedByRangeID: true,
UnreplicatedByRangeID: true,
})
require.Len(t, spans, 2)
for _, span := range spans {
require.NoError(t, sst.ClearRawRange(span.Key, span.EndKey, true, true))
}

if err := kvstorage.MakeStateLoader(rangeID).SetRangeTombstone(
require.NoError(t, kvstorage.MakeStateLoader(rangeID).SetRangeTombstone(
context.Background(), &sst,
kvserverpb.RangeTombstone{NextReplicaID: math.MaxInt32},
); err != nil {
return err
}
if err := sst.Finish(); err != nil {
return err
}
kvserverpb.RangeTombstone{NextReplicaID: kvstorage.MergedTombstoneReplicaID},
))
require.NoError(t, sst.Finish())
expectedSSTs = append(expectedSSTs, sstFile.Data())
}

// Construct an SST for the user key range of the subsumed replicas.
sstFile := &storage.MemObject{}
sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
defer sst.Close()
desc := roachpb.RangeDescriptor{
StartKey: roachpb.RKey(keyD),
EndKey: roachpb.RKey(keyEnd),
}
if err := storage.ClearRangeWithHeuristic(
ctx, receivingEng, &sst, desc.StartKey.AsRawKey(), desc.EndKey.AsRawKey(), 64,
); err != nil {
return err
} else if err := sst.Finish(); err != nil {
return err
spans := rditer.Select(0 /* unused */, rditer.SelectOpts{
Ranged: rditer.SelectRangedOptions{
RSpan: desc.RSpan(),
SystemKeys: true,
LockTable: true,
UserKeys: true,
}})
require.Len(t, spans, 4)
for _, span := range spans {
sstFile := &storage.MemObject{}
sst := storage.MakeIngestionSSTWriter(ctx, st, sstFile)
defer sst.Close()
require.NoError(t, sst.ClearRawRange(span.Key, span.EndKey, true, true))
require.NoError(t, sst.Finish())
expectedSSTs = append(expectedSSTs, sstFile.Data())
}
expectedSSTs = append(expectedSSTs, sstFile.Data())

// Iterate over all the tested SSTs and check that they're
// byte-by-byte equal.
Expand Down
68 changes: 15 additions & 53 deletions pkg/kv/kvserver/kvstorage/destroy.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,13 @@ import (
"github.com/cockroachdb/errors"
)

const (
// ClearRangeThresholdPointKeys is the threshold (as number of point keys)
// beyond which we'll clear range data using a Pebble range tombstone rather
// than individual Pebble point tombstones.
//
// It is expensive for there to be many Pebble range tombstones in the same
// sstable because all of the tombstones in an sstable are loaded whenever the
// sstable is accessed. So we avoid using range deletion unless there is some
// minimum number of keys. The value here was pulled out of thin air. It might
// be better to make this dependent on the size of the data being deleted. Or
// perhaps we should fix Pebble to handle large numbers of range tombstones in
// an sstable better.
ClearRangeThresholdPointKeys = 64

// MergedTombstoneReplicaID is the replica ID written into the RangeTombstone
// for replicas of a range which is known to have been merged. This value
// should prevent any messages from stale replicas of that range from ever
// resurrecting merged replicas. Whenever merging or subsuming a replica we
// know new replicas can never be created so this value is used even if we
// don't know the current replica ID.
MergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32
)
// MergedTombstoneReplicaID is the replica ID written into the RangeTombstone
// for replicas of a range which is known to have been merged. This value should
// prevent any messages from stale replicas of that range from ever resurrecting
// merged replicas. Whenever merging or subsuming a replica we know new replicas
// can never be created so this value is used even if we don't know the current
// replica ID.
const MergedTombstoneReplicaID roachpb.ReplicaID = math.MaxInt32

// clearRangeDataOptions specify which parts of a Replica are to be destroyed.
type clearRangeDataOptions struct {
Expand All @@ -51,13 +36,6 @@ type clearRangeDataOptions struct {
// for the given RSpan) that is key-addressable (i.e. range descriptor, user keys,
// locks) to be removed. No data is removed if this is the zero span.
clearReplicatedBySpan roachpb.RSpan

// If mustUseClearRange is true, a Pebble range tombstone will always be used
// to clear the key spans (unless empty). This is typically used when we need
// to write additional keys to an SST after this clear, e.g. a replica
// tombstone, since keys must be written in order. When this is false, a
// heuristic will be used instead.
mustUseClearRange bool
}

// clearRangeData clears the data associated with a range descriptor selected
Expand All @@ -67,13 +45,9 @@ type clearRangeDataOptions struct {
// "CRDB Range" and "storage.ClearRange" context in the setting of this method could
// be confusing.
func clearRangeData(
ctx context.Context,
rangeID roachpb.RangeID,
reader storage.Reader,
writer storage.Writer,
opts clearRangeDataOptions,
rangeID roachpb.RangeID, writer storage.Writer, opts clearRangeDataOptions,
) error {
keySpans := rditer.Select(rangeID, rditer.SelectOpts{
for _, span := range rditer.Select(rangeID, rditer.SelectOpts{
Ranged: rditer.SelectRangedOptions{
RSpan: opts.clearReplicatedBySpan,
SystemKeys: true,
Expand All @@ -82,16 +56,9 @@ func clearRangeData(
},
ReplicatedByRangeID: opts.clearReplicatedByRangeID,
UnreplicatedByRangeID: opts.clearUnreplicatedByRangeID,
Copy link
Collaborator Author

@pav-kv pav-kv Oct 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: in the unreplicated keyspace, we don't need to clear "range keys" since they don't exist.

TODO: check if the same applies to the replicated RangeID-local space.

TODO: we can also not clear the replicated space for uninitialized replicas, since it is empty.

})

pointKeyThreshold := ClearRangeThresholdPointKeys
if opts.mustUseClearRange {
pointKeyThreshold = 1
}

for _, keySpan := range keySpans {
if err := storage.ClearRangeWithHeuristic(
ctx, reader, writer, keySpan.Key, keySpan.EndKey, pointKeyThreshold,
}) {
if err := writer.ClearRawRange(
span.Key, span.EndKey, true /* pointKeys */, true, /* rangeKeys */
); err != nil {
return err
}
Expand Down Expand Up @@ -173,7 +140,7 @@ func destroyReplicaImpl(
}

_ = DestroyReplicaTODO // 2.1 + 2.2 + 3.1
if err := clearRangeData(ctx, info.RangeID, reader, writer, opts); err != nil {
if err := clearRangeData(info.RangeID, writer, opts); err != nil {
return err
}
// Save a tombstone to ensure that replica IDs never get reused.
Expand Down Expand Up @@ -208,7 +175,6 @@ func SubsumeReplica(
opts := clearRangeDataOptions{
clearReplicatedByRangeID: true,
clearUnreplicatedByRangeID: true,
mustUseClearRange: forceSortedKeys,
}
return rditer.SelectOpts{
ReplicatedByRangeID: opts.clearReplicatedByRangeID,
Expand All @@ -224,13 +190,9 @@ func SubsumeReplica(
// TODO(#152199): do not remove the unreplicated state which can belong to a
// newer (uninitialized) replica.
func RemoveStaleRHSFromSplit(
ctx context.Context,
reader storage.Reader,
writer storage.Writer,
rangeID roachpb.RangeID,
keys roachpb.RSpan,
writer storage.Writer, rangeID roachpb.RangeID, keys roachpb.RSpan,
) error {
return clearRangeData(ctx, rangeID, reader, writer, clearRangeDataOptions{
return clearRangeData(rangeID, writer, clearRangeDataOptions{
// Since the RHS replica is uninitalized, we know there isn't anything in
// the two replicated spans below, before the current batch. Setting these
// options will in effect only clear the writes to the RHS replicated state
Expand Down
28 changes: 12 additions & 16 deletions pkg/kv/kvserver/kvstorage/testdata/TestDestroyReplica.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,16 @@ Put: "a"/Intent/00000000-0000-0000-0000-000000000000:
Put: 0.000000001,0 "y\xff" (0x79ff00000000000000000109): ""
Put: "y\xff"/Intent/00000000-0000-0000-0000-000000000000:
>> destroy:
Delete (Sized at 31): 0,0 /Local/RangeID/123/u/RangeTombstone (0x0169f67b757266746200):
Delete (Sized at 39): 0,0 /Local/RangeID/123/u/RaftHardState (0x0169f67b757266746800):
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:11 (0x0169f67b757266746c000000000000000b00):
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:12 (0x0169f67b757266746c000000000000000c00):
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:13 (0x0169f67b757266746c000000000000000d00):
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:14 (0x0169f67b757266746c000000000000000e00):
Delete (Sized at 43): 0,0 /Local/RangeID/123/u/RaftLog/logIndex:15 (0x0169f67b757266746c000000000000000f00):
Delete (Sized at 31): 0,0 /Local/RangeID/123/u/RaftReplicaID (0x0169f67b757266747200):
Delete (Sized at 33): 0,0 /Local/RangeID/123/u/RaftTruncatedState (0x0169f67b757266747400):
Delete (Sized at 34): 0,0 /Local/RangeID/123/u/RangeLastReplicaGCTimestamp (0x0169f67b75726c727400):
Delete (Sized at 20): 0.000000001,0 /Local/Range"a"/RangeDescriptor (0x016b126100017264736300000000000000000109):
Delete (Sized at 36): /Local/Lock/Local/Range"a"/RangeDescriptor 0300000000000000000000000000000000 (0x017a6b12016b126100ff0172647363000100030000000000000000000000000000000012):
Delete (Sized at 26): /Local/Lock"a" 0300000000000000000000000000000000 (0x017a6b1261000100030000000000000000000000000000000012):
Delete (Sized at 27): /Local/Lock"y\xff" 0300000000000000000000000000000000 (0x017a6b1279ff000100030000000000000000000000000000000012):
Delete (Sized at 11): 0.000000001,0 "a" (0x6100000000000000000109):
Delete (Sized at 12): 0.000000001,0 "y\xff" (0x79ff00000000000000000109):
Delete Range: [0,0 /Local/RangeID/123/r"" (0x0169f67b7200): , 0,0 /Local/RangeID/123/s"" (0x0169f67b7300): )
Delete Range Keys: /Local/RangeID/123/{r""-s""} (0x0169f67b7200-0x0169f67b7300)
Delete Range: [0,0 /Local/RangeID/123/u"" (0x0169f67b7500): , 0,0 /Local/RangeID/123/v"" (0x0169f67b7600): )
Delete Range Keys: /Local/RangeID/123/{u""-v""} (0x0169f67b7500-0x0169f67b7600)
Delete Range: [0,0 /Local/Range"a" (0x016b1261000100): , 0,0 /Local/Range"z" (0x016b127a000100): )
Delete Range Keys: /Local/Range"{a"-z"} (0x016b1261000100-0x016b127a000100)
Delete Range: [0,0 /Local/Lock/Local/Range"a" (0x017a6b12016b126100ff01000100): , 0,0 /Local/Lock/Local/Range"z" (0x017a6b12016b127a00ff01000100): )
Delete Range Keys: /Local/Lock/Local/Range"{a"-z"} (0x017a6b12016b126100ff01000100-0x017a6b12016b127a00ff01000100)
Delete Range: [0,0 /Local/Lock"a" (0x017a6b1261000100): , 0,0 /Local/Lock"z" (0x017a6b127a000100): )
Delete Range Keys: /Local/Lock"{a"-z"} (0x017a6b1261000100-0x017a6b127a000100)
Delete Range: [0,0 "a" (0x6100): , 0,0 "z" (0x7a00): )
Delete Range Keys: {a-z} (0x6100-0x7a00)
Put: 0,0 /Local/RangeID/123/u/RangeTombstone (0x0169f67b757266746200): next_replica_id:4
6 changes: 1 addition & 5 deletions pkg/kv/kvserver/snapshot_apply_prepare.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,6 @@ func (s *snapWriteBuilder) clearResidualDataOnNarrowSnapshot(ctx context.Context
return nil // we aren't narrowing anything; no-op
}

// TODO(sep-raft-log): read from the state machine engine here.
reader := storage.Reader(s.todoEng)
for _, span := range rditer.Select(0, rditer.SelectOpts{
Ranged: rditer.SelectRangedOptions{
RSpan: roachpb.RSpan{Key: s.desc.EndKey, EndKey: endKey},
Expand All @@ -224,9 +222,7 @@ func (s *snapWriteBuilder) clearResidualDataOnNarrowSnapshot(ctx context.Context
},
}) {
if err := s.writeSST(ctx, func(ctx context.Context, w storage.Writer) error {
return storage.ClearRangeWithHeuristic(
ctx, reader, w, span.Key, span.EndKey, kvstorage.ClearRangeThresholdPointKeys,
)
return w.ClearRawRange(span.Key, span.EndKey, true /* pointKeys */, true /* rangeKeys */)
}); err != nil {
return err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/store_split.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func splitPreApply(
// update its HardState. Here, we can accidentally clear the HardState of
// that new replica.
if err := kvstorage.RemoveStaleRHSFromSplit(
ctx, readWriter, readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(),
readWriter, split.RightDesc.RangeID, split.RightDesc.RSpan(),
); err != nil {
log.KvExec.Fatalf(ctx, "failed to clear range data for removed rhs: %v", err)
}
Expand Down
22 changes: 18 additions & 4 deletions pkg/kv/kvserver/testdata/TestPrepareSnapApply.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,29 @@ Put: 0,0 /Local/RangeID/123/u/RaftHardState (0x0169f67b757266746800): term:20 vo
Put: 0,0 /Local/RangeID/123/u/RaftReplicaID (0x0169f67b757266747200): replica_id:4
Put: 0,0 /Local/RangeID/123/u/RaftTruncatedState (0x0169f67b757266747400): index:100 term:20
>> sst:
Delete Range: [0,0 /Local/RangeID/101/u/RaftReplicaID (0x0169ed757266747200): , 0,0 /Local/RangeID/101/v"" (0x0169ed7600): )
Delete Range: [0,0 /Local/RangeID/101/r"" (0x0169ed7200): , 0,0 /Local/RangeID/101/s"" (0x0169ed7300): )
Delete Range Keys: /Local/RangeID/101/{r""-s""} (0x0169ed7200-0x0169ed7300)
Delete Range: [0,0 /Local/RangeID/101/u"" (0x0169ed7500): , 0,0 /Local/RangeID/101/v"" (0x0169ed7600): )
Delete Range Keys: /Local/RangeID/101/{u""-v""} (0x0169ed7500-0x0169ed7600)
Put: 0,0 /Local/RangeID/101/u/RangeTombstone (0x0169ed757266746200): next_replica_id:2147483647
>> sst:
Delete Range: [0,0 /Local/RangeID/102/u/RaftReplicaID (0x0169ee757266747200): , 0,0 /Local/RangeID/102/v"" (0x0169ee7600): )
Delete Range: [0,0 /Local/RangeID/102/r"" (0x0169ee7200): , 0,0 /Local/RangeID/102/s"" (0x0169ee7300): )
Delete Range Keys: /Local/RangeID/102/{r""-s""} (0x0169ee7200-0x0169ee7300)
Delete Range: [0,0 /Local/RangeID/102/u"" (0x0169ee7500): , 0,0 /Local/RangeID/102/v"" (0x0169ee7600): )
Delete Range Keys: /Local/RangeID/102/{u""-v""} (0x0169ee7500-0x0169ee7600)
Put: 0,0 /Local/RangeID/102/u/RangeTombstone (0x0169ee757266746200): next_replica_id:2147483647
>> sst:
Delete (Sized at 27): /Local/Lock"y\xff" 0300000000000000000000000000000000 (0x017a6b1279ff000100030000000000000000000000000000000012):
Delete Range: [0,0 /Local/Range"k" (0x016b126b000100): , 0,0 /Local/Range"z" (0x016b127a000100): )
Delete Range Keys: /Local/Range"{k"-z"} (0x016b126b000100-0x016b127a000100)
>> sst:
Delete (Sized at 12): 0.000000001,0 "y\xff" (0x79ff00000000000000000109):
Delete Range: [0,0 /Local/Lock/Local/Range"k" (0x017a6b12016b126b00ff01000100): , 0,0 /Local/Lock/Local/Range"z" (0x017a6b12016b127a00ff01000100): )
Delete Range Keys: /Local/Lock/Local/Range"{k"-z"} (0x017a6b12016b126b00ff01000100-0x017a6b12016b127a00ff01000100)
>> sst:
Delete Range: [0,0 /Local/Lock"k" (0x017a6b126b000100): , 0,0 /Local/Lock"z" (0x017a6b127a000100): )
Delete Range Keys: /Local/Lock"{k"-z"} (0x017a6b126b000100-0x017a6b127a000100)
>> sst:
Delete Range: [0,0 "k" (0x6b00): , 0,0 "z" (0x7a00): )
Delete Range Keys: {k-z} (0x6b00-0x7a00)
>> repl: /Local/RangeID/123/{r""-s""}
>> repl: /Local/Range"{a"-k"}
>> repl: /Local/Lock/Local/Range"{a"-k"}
Expand Down