// Copyright 2024, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#include "server/snapshot.h"
#include <absl/strings/match.h>
#include <absl/strings/str_cat.h>
#include <mutex>
#include "base/flags.h"
#include "base/logging.h"
#include "core/heap_size.h"
#include "server/db_slice.h"
#include "server/engine_shard_set.h"
#include "server/journal/journal.h"
#include "server/rdb_extensions.h"
#include "server/rdb_save.h"
#include "server/server_state.h"
#include "server/tiered_storage.h"
#include "util/fibers/synchronization.h"
namespace dfly {
using namespace std;
using namespace util;
using namespace chrono_literals;
using facade::operator""_MB;
using facade::operator""_KB;
namespace {
thread_local absl::flat_hash_set<SliceSnapshot*> tl_slice_snapshots;
constexpr size_t kMinBlobSize = 32_KB;
} // namespace
SliceSnapshot::SliceSnapshot(CompressionMode compression_mode, DbSlice* slice,
SnapshotDataConsumerInterface* consumer, ExecutionState* cntx)
: db_slice_(slice),
db_array_(slice->databases()),
compression_mode_(compression_mode),
consumer_(consumer),
cntx_(cntx) {
tl_slice_snapshots.insert(this);
}
SliceSnapshot::~SliceSnapshot() {
DCHECK(db_slice_->shard_owner()->IsMyThread());
tl_slice_snapshots.erase(this);
}
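
// Sums the serialization buffer capacity of all snapshots registered on this thread.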
size_t SliceSnapshot::GetThreadLocalMemoryUsage() {
size_t mem = 0;
for (SliceSnapshot* snapshot : tl_slice_snapshots) {
mem += snapshot->GetBufferCapacity();
}
return mem;
}
bool SliceSnapshot::IsSnaphotInProgress() {
return tl_slice_snapshots.size() > 0;
}
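
// Starts the snapshot fiber. Registers a bucket-change callback on the db slice and, when
// stream_journal is set, a journal callback so that concurrent mutations are captured as well.
// If flushing is allowed, large serialized chunks are pushed out incrementally to bound memory.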
void SliceSnapshot::Start(bool stream_journal, SnapshotFlush allow_flush) {
DCHECK(!snapshot_fb_.IsJoinable());
auto db_cb = [this](DbIndex db_index, const DbSlice::ChangeReq& req) {
OnDbChange(db_index, req);
};
snapshot_version_ = db_slice_->RegisterOnChange(std::move(db_cb));
if (stream_journal) {
auto* journal = db_slice_->shard_owner()->journal();
DCHECK(journal);
auto journal_cb = [this](const journal::JournalItem& item, bool await) {
OnJournalEntry(item, await);
};
journal_cb_id_ = journal->RegisterOnChange(std::move(journal_cb));
}
const auto flush_threshold = ServerState::tlocal()->serialization_max_chunk_size;
std::function<void(size_t, RdbSerializer::FlushState)> flush_fun;
if (flush_threshold != 0 && allow_flush == SnapshotFlush::kAllow) {
flush_fun = [this, flush_threshold](size_t bytes_serialized,
RdbSerializer::FlushState flush_state) {
if (bytes_serialized > flush_threshold) {
size_t serialized = FlushSerialized(flush_state);
VLOG(2) << "FlushSerialized " << serialized << " bytes";
auto& stats = ServerState::tlocal()->stats;
++stats.big_value_preemptions;
}
};
}
serializer_ = std::make_unique<RdbSerializer>(compression_mode_, flush_fun);
VLOG(1) << "DbSaver::Start - saving entries with version less than " << snapshot_version_;
snapshot_fb_ = fb2::Fiber("snapshot", [this, stream_journal] {
this->IterateBucketsFb(stream_journal);
db_slice_->UnregisterOnChange(snapshot_version_);
consumer_->Finalize();
});
}
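
// Starts an incremental snapshot that replays journal entries beginning at start_lsn instead of
// iterating over the database tables.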
void SliceSnapshot::StartIncremental(LSN start_lsn) {
serializer_ = std::make_unique<RdbSerializer>(compression_mode_);
snapshot_fb_ = fb2::Fiber("incremental_snapshot",
[start_lsn, this] { this->SwitchIncrementalFb(start_lsn); });
}
// Called only for replication use-case.
void SliceSnapshot::FinalizeJournalStream(bool cancel) {
DVLOG(1) << "Finalize Snapshot";
DCHECK(db_slice_->shard_owner()->IsMyThread());
if (!journal_cb_id_) { // Finalize only once.
return;
}
uint32_t cb_id = journal_cb_id_;
journal_cb_id_ = 0;
// Wait for serialization to finish in any case.
snapshot_fb_.JoinIfNeeded();
auto* journal = db_slice_->shard_owner()->journal();
journal->UnregisterOnChange(cb_id);
if (!cancel) {
serializer_->SendJournalOffset(journal->GetLsn());
PushSerialized(true);
}
}
// The algorithm is to go over all the buckets and serialize those with
// version < snapshot_version_. In order to serialize each physical bucket exactly once, we update
// the bucket version to snapshot_version_ once it has been serialized.
// We handle serialization at physical bucket granularity.
// To further complicate things, Table::Traverse covers a logical bucket that may consist of
// several physical buckets in the dash table. For example, items belonging to logical bucket 0
// can reside in buckets 0, 1 and stash buckets 56-59.
// PrimeTable::Traverse guarantees an atomic traversal of a single logical bucket.
// It also guarantees 100% coverage of all items that existed when the traversal started
// and survived until it finished.
// Serializes all the entries with version less than snapshot_version_.
void SliceSnapshot::IterateBucketsFb(bool send_full_sync_cut) {
{
auto fiber_name = absl::StrCat("SliceSnapshot-", ProactorBase::me()->GetPoolIndex());
ThisFiber::SetName(std::move(fiber_name));
}
PrimeTable::Cursor cursor;
for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
stats_.keys_total += db_slice_->DbSize(db_indx);
}
for (DbIndex db_indx = 0; db_indx < db_array_.size(); ++db_indx) {
if (!cntx_->IsRunning())
return;
if (!db_array_[db_indx])
continue;
uint64_t last_yield = 0;
PrimeTable* pt = &db_array_[db_indx]->prime;
VLOG(1) << "Start traversing " << pt->size() << " items for index " << db_indx;
do {
if (!cntx_->IsRunning()) {
return;
}
PrimeTable::Cursor next = pt->TraverseBuckets(
cursor, [this, &db_indx](auto it) { return BucketSaveCb(db_indx, it); });
cursor = next;
PushSerialized(false);
if (stats_.loop_serialized >= last_yield + 100) {
DVLOG(2) << "Before sleep " << ThisFiber::GetName();
ThisFiber::Yield();
DVLOG(2) << "After sleep";
last_yield = stats_.loop_serialized;
// Push in case other fibers (write commands that pushed previous values)
// filled the buffer.
PushSerialized(false);
}
} while (cursor);
DVLOG(2) << "after loop " << ThisFiber::GetName();
PushSerialized(true);
} // for (dbindex)
CHECK(!serialize_bucket_running_);
if (send_full_sync_cut) {
CHECK(!serializer_->SendFullSyncCut());
PushSerialized(true);
}
// serialized + side_saved must be equal to the total saved.
VLOG(1) << "Exit SnapshotSerializer (loop_serialized/side_saved/cbcalls): "
<< stats_.loop_serialized << "/" << stats_.side_saved << "/" << stats_.savecb_calls;
}
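
// Streams journal entries starting at lsn for as long as they are still present in the journal's
// in-memory buffer. If we catch up with the journal head, we emit a full-sync cut and switch to
// the streaming callback; otherwise the partial sync is reported as failed.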
void SliceSnapshot::SwitchIncrementalFb(LSN lsn) {
auto* journal = db_slice_->shard_owner()->journal();
DCHECK(journal);
DCHECK_LE(lsn, journal->GetLsn()) << "The replica tried to sync from the future.";
VLOG(1) << "Starting incremental snapshot from lsn=" << lsn;
// The replica sends the LSN of the next entry it wants to receive.
while (cntx_->IsRunning() && journal->IsLSNInBuffer(lsn)) {
serializer_->WriteJournalEntry(journal->GetEntry(lsn));
PushSerialized(false);
lsn++;
}
VLOG(1) << "Last LSN sent in incremental snapshot was " << (lsn - 1);
// This check is safe, but it is not trivially safe.
// We rely here on the fact that JournalSlice::AddLogRecord can
// only preempt while holding the callback lock.
// That guarantees that if we have processed the last LSN the callback
// will only be added after JournalSlice::AddLogRecord has finished
// iterating its callbacks and we won't process the record twice.
// We have to make sure we don't preempt ourselves before registering the callback!
// GetLsn() is always the next lsn that we expect to create.
if (journal->GetLsn() == lsn) {
{
FiberAtomicGuard fg;
serializer_->SendFullSyncCut();
}
auto journal_cb = [this](const journal::JournalItem& item, bool await) {
OnJournalEntry(item, await);
};
journal_cb_id_ = journal->RegisterOnChange(std::move(journal_cb));
PushSerialized(true);
} else {
// We stopped but we didn't manage to send the whole stream.
cntx_->ReportError(
std::make_error_code(errc::state_not_recoverable),
absl::StrCat("Partial sync was unsuccessful because entry #", lsn,
" was dropped from the buffer. Current lsn=", journal->GetLsn()));
FinalizeJournalStream(true);
}
}
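
// Traversal callback, invoked for every physical bucket. Serializes the bucket unless its
// version shows it was already serialized or created after the snapshot started.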
bool SliceSnapshot::BucketSaveCb(DbIndex db_index, PrimeTable::bucket_iterator it) {
std::lock_guard guard(big_value_mu_);
++stats_.savecb_calls;
auto check = [&](uint64_t v) {
if (v >= snapshot_version_) {
// The bucket has either already been serialized or was added after snapshotting started.
DVLOG(3) << "Skipped " << it.segment_id() << ":" << it.bucket_id() << " at " << v;
++stats_.skipped;
return false;
}
return true;
};
if (!check(it.GetVersion())) {
return false;
}
db_slice_->FlushChangeToEarlierCallbacks(db_index, DbSlice::Iterator::FromPrime(it),
snapshot_version_);
auto* blocking_counter = db_slice_->GetLatch();
// Locking this never preempts. We merely increment the underlying counter so that
// if SerializeBucket preempts, Heartbeat() won't run because the blocking counter is not
// zero.
std::lock_guard blocking_counter_guard(*blocking_counter);
stats_.loop_serialized += SerializeBucket(db_index, it);
return false;
}
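
// Bumps the bucket version to snapshot_version_ and serializes every occupied slot in it.
// Returns the number of entries serialized; may preempt on big values.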
unsigned SliceSnapshot::SerializeBucket(DbIndex db_index, PrimeTable::bucket_iterator it) {
DCHECK_LT(it.GetVersion(), snapshot_version_);
// Traverse the physical bucket and write it into the string file.
serialize_bucket_running_ = true;
it.SetVersion(snapshot_version_);
unsigned result = 0;
for (it.AdvanceIfNotOccupied(); !it.is_done(); ++it) {
++result;
// might preempt due to big value serialization.
SerializeEntry(db_index, it->first, it->second);
}
serialize_bucket_running_ = false;
return result;
}
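
// Serializes a single key/value pair. Cool external values are serialized from their in-memory
// copy; other external values are handed to SerializeExternal for an asynchronous tiered read.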
void SliceSnapshot::SerializeEntry(DbIndex db_indx, const PrimeKey& pk, const PrimeValue& pv) {
if (pv.IsExternal() && pv.IsCool())
return SerializeEntry(db_indx, pk, pv.GetCool().record->value);
time_t expire_time = 0;
if (pv.HasExpire()) {
auto eit = db_array_[db_indx]->expire.Find(pk);
expire_time = db_slice_->ExpireTime(eit);
}
uint32_t mc_flags = pv.HasFlag() ? db_slice_->GetMCFlag(db_indx, pk) : 0;
if (pv.IsExternal()) {
// TODO: we lose the stickiness attribute by cloning the PrimeKey like this.
SerializeExternal(db_indx, PrimeKey{pk.ToString()}, pv, expire_time, mc_flags);
} else {
io::Result<uint8_t> res = serializer_->SaveEntry(pk, pv, expire_time, mc_flags, db_indx);
CHECK(res);
++type_freq_map_[*res];
}
}
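
// Flushes the serializer into a string blob and hands it to the consumer. Blobs are pushed
// strictly in id order even when several fibers flush concurrently.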
size_t SliceSnapshot::FlushSerialized(SerializerBase::FlushState flush_state) {
io::StringFile sfile;
serializer_->FlushToSink(&sfile, flush_state);
size_t serialized = sfile.val.size();
if (serialized == 0)
return 0;
uint64_t id = rec_id_++;
DVLOG(2) << "Pushing " << id;
fb2::NoOpLock lk;
// We create a critical section here that ensures that records are pushed in sequential order.
// As a result, it is not possible for two fiber producers to push concurrently.
// If A.id = 5, and then B.id = 6, and both are blocked here, it means that last_pushed_id_ < 4.
// Once last_pushed_id_ = 4, A will be unblocked, while B will wait until A finishes pushing and
// updates last_pushed_id_ to 5.
seq_cond_.wait(lk, [&] { return id == this->last_pushed_id_ + 1; });
// Blocking point.
consumer_->ConsumeData(std::move(sfile.val), cntx_);
DCHECK_EQ(last_pushed_id_ + 1, id);
last_pushed_id_ = id;
seq_cond_.notify_all();
VLOG(2) << "Pushed with Serialize() " << serialized;
return serialized;
}
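
// Pushes accumulated data to the consumer once enough has been serialized, or unconditionally
// when force is set. Resolves any delayed tiered reads and serializes them as well.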
bool SliceSnapshot::PushSerialized(bool force) {
if (!force && serializer_->SerializedLen() < kMinBlobSize && delayed_entries_.size() < 32)
return false;
// Flush any leftovers to avoid interleaving.
size_t serialized = FlushSerialized(FlushState::kFlushEndEntry);
if (!delayed_entries_.empty()) {
// Async bucket serialization might have accumulated some delayed values.
// Because we can finally block in this function, we'll await and serialize them
do {
auto& entry = delayed_entries_.back();
// TODO: https://github.com/dragonflydb/dragonfly/issues/4654
// there are a few problems with how we serialize external values.
// 1. We may block here too frequently, slowing down the process.
// 2. For small bin values, we issue multiple reads for the same page, creating
// read amplification that can reach a factor of ~60.
PrimeValue pv{entry.value.Get()}; // Might block until the future resolves.
// TODO: introduce RdbSerializer::SaveString that can accept a string value directly.
serializer_->SaveEntry(entry.key, pv, entry.expire, entry.mc_flags, entry.dbid);
delayed_entries_.pop_back();
} while (!delayed_entries_.empty());
// blocking point.
serialized += FlushSerialized(FlushState::kFlushEndEntry);
}
return serialized > 0;
}
void SliceSnapshot::SerializeExternal(DbIndex db_index, PrimeKey key, const PrimeValue& pv,
time_t expire_time, uint32_t mc_flags) {
// We prefer to avoid blocking, so we just schedule a tiered read and append
// it to the delayed entries.
util::fb2::Future<string> future =
EngineShard::tlocal()->tiered_storage()->Read(db_index, key.ToString(), pv);
delayed_entries_.push_back({db_index, std::move(key), std::move(future), expire_time, mc_flags});
++type_freq_map_[RDB_TYPE_STRING];
}
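
// DbSlice change callback. Serializes a bucket that is about to be modified while it still holds
// its pre-snapshot version, so the snapshot captures the old values.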
void SliceSnapshot::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) {
std::lock_guard guard(big_value_mu_);
PrimeTable* table = db_slice_->GetTables(db_index).first;
const PrimeTable::bucket_iterator* bit = req.update();
if (bit) {
if (!bit->is_done() && bit->GetVersion() < snapshot_version_) {
stats_.side_saved += SerializeBucket(db_index, *bit);
}
} else {
string_view key = get<string_view>(req.change);
table->CVCUponInsert(snapshot_version_, key, [this, db_index](PrimeTable::bucket_iterator it) {
DCHECK_LT(it.GetVersion(), snapshot_version_);
stats_.side_saved += SerializeBucket(db_index, it);
});
}
}
// For any key, any journal entry must arrive at the replica strictly after its first original rdb
// value. This is guaranteed by the fact that OnJournalEntry always runs after OnDbChange, and
// no database switch can be performed between those two calls, because they are part of one
// transaction.
void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await) {
// To enable journal flushing to sync after a non-auto-journal command is executed, we call
// TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
// additional journal change to serialize, it simply invokes PushSerialized.
{
// We scope the lock so that it is released before PushSerialized below, which may preempt.
std::lock_guard guard(big_value_mu_);
if (item.opcode != journal::Op::NOOP) {
serializer_->WriteJournalEntry(item.data);
}
}
if (await) {
// This is the only place that flushes in streaming mode
// once the iterate buckets fiber finished.
PushSerialized(false);
}
}
size_t SliceSnapshot::GetBufferCapacity() const {
if (serializer_ == nullptr) {
return 0;
}
return serializer_->GetBufferCapacity();
}
size_t SliceSnapshot::GetTempBuffersSize() const {
if (serializer_ == nullptr) {
return 0;
}
return serializer_->GetTempBufferSize();
}
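
// Returns {entries serialized so far, total keys counted when the snapshot started}.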
RdbSaver::SnapshotStats SliceSnapshot::GetCurrentSnapshotProgress() const {
return {stats_.loop_serialized + stats_.side_saved, stats_.keys_total};
}
} // namespace dfly