Skip to content

Commit 2f0f86a

Browse files
committed
Fixed protocol 23 upgrade bug on snapshot boundary
1 parent 59001f5 commit 2f0f86a

10 files changed

+79
-40
lines changed

src/bucket/BucketManager.cpp

+9-7
Original file line numberDiff line numberDiff line change
@@ -1079,15 +1079,17 @@ BucketManager::startBackgroundEvictionScan(uint32_t ledgerSeq,
10791079
mSnapshotManager->copySearchableLiveBucketListSnapshot();
10801080
auto const& sas = cfg.stateArchivalSettings();
10811081

1082-
using task_t = std::packaged_task<EvictionResultCandidates()>;
1082+
using task_t =
1083+
std::packaged_task<std::unique_ptr<EvictionResultCandidates>()>;
10831084
// MSVC gotcha: searchableBL has to be shared_ptr because MSVC wants to
10841085
// copy this lambda, otherwise we could use unique_ptr.
10851086
auto task = std::make_shared<task_t>(
10861087
[bl = std::move(searchableBL), iter = cfg.evictionIterator(), ledgerSeq,
10871088
ledgerVers, sas, &counters = mBucketListEvictionCounters,
10881089
stats = mEvictionStatistics] {
1089-
return bl->scanForEviction(ledgerSeq, counters, iter, stats, sas,
1090-
ledgerVers);
1090+
return std::make_unique<EvictionResultCandidates>(
1091+
bl->scanForEviction(ledgerSeq, counters, iter, stats, sas,
1092+
ledgerVers));
10911093
});
10921094

10931095
mEvictionFuture = task->get_future();
@@ -1114,14 +1116,14 @@ BucketManager::resolveBackgroundEvictionScan(
11141116

11151117
// If eviction related settings changed during the ledger, we have to
11161118
// restart the scan
1117-
if (!evictionCandidates.isValid(ledgerSeq,
1119+
if (!evictionCandidates->isValid(ledgerSeq, ledgerVers,
11181120
networkConfig.stateArchivalSettings()))
11191121
{
11201122
startBackgroundEvictionScan(ledgerSeq, ledgerVers, networkConfig);
11211123
evictionCandidates = mEvictionFuture.get();
11221124
}
11231125

1124-
auto& eligibleEntries = evictionCandidates.eligibleEntries;
1126+
auto& eligibleEntries = evictionCandidates->eligibleEntries;
11251127

11261128
for (auto iter = eligibleEntries.begin(); iter != eligibleEntries.end();)
11271129
{
@@ -1139,7 +1141,7 @@ BucketManager::resolveBackgroundEvictionScan(
11391141
auto remainingEntriesToEvict =
11401142
networkConfig.stateArchivalSettings().maxEntriesToArchive;
11411143
auto entryToEvictIter = eligibleEntries.begin();
1142-
auto newEvictionIterator = evictionCandidates.endOfRegionIterator;
1144+
auto newEvictionIterator = evictionCandidates->endOfRegionIterator;
11431145

11441146
// Return vectors include both evicted entry and associated TTL
11451147
std::vector<LedgerKey> deletedKeys;
@@ -1179,7 +1181,7 @@ BucketManager::resolveBackgroundEvictionScan(
11791181
// region
11801182
if (remainingEntriesToEvict != 0)
11811183
{
1182-
newEvictionIterator = evictionCandidates.endOfRegionIterator;
1184+
newEvictionIterator = evictionCandidates->endOfRegionIterator;
11831185
}
11841186

11851187
networkConfig.updateEvictionIterator(ltx, newEvictionIterator);

src/bucket/BucketManager.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ class BucketManager : NonMovableOrCopyable
111111
std::map<LedgerEntryTypeAndDurability, medida::Counter&>
112112
mBucketListEntrySizeCounters;
113113

114-
std::future<EvictionResultCandidates> mEvictionFuture{};
114+
std::future<std::unique_ptr<EvictionResultCandidates>> mEvictionFuture{};
115115

116116
// Copy app's config for thread-safe access
117117
Config const mConfig;

src/bucket/BucketUtils.cpp

+20-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
// of this distribution or at http://www.apache.org/licenses/LICENSE-2.0
44

55
#include "bucket/BucketUtils.h"
6+
#include "bucket/BucketBase.h"
7+
#include "util/ProtocolVersion.h"
68
#include <medida/metrics_registry.h>
79

810
namespace stellar
@@ -94,10 +96,26 @@ MergeCounters::operator==(MergeCounters const& other) const
9496
// Check that eviction scan is based off of current ledger snapshot and that
9597
// archival settings have not changed
9698
bool
97-
EvictionResultCandidates::isValid(uint32_t currLedger,
99+
EvictionResultCandidates::isValid(uint32_t currLedgerSeq,
100+
uint32_t currLedgerVers,
98101
StateArchivalSettings const& currSas) const
99102
{
100-
return initialLedger == currLedger &&
103+
// If the eviction scan started before a protocol upgrade, and the protocol
104+
// upgrade changes eviction scan behavior during the scan, we need
105+
// to restart with the new protocol version. We only care about
106+
// `FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION`, other upgrades don't
107+
// affect evictions scans.
108+
if (protocolVersionIsBefore(
109+
initialLedgerVers,
110+
BucketBase::FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION) &&
111+
protocolVersionStartsFrom(
112+
currLedgerVers,
113+
BucketBase::FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION))
114+
{
115+
return false;
116+
}
117+
118+
return initialLedgerSeq == currLedgerSeq &&
101119
initialSas.maxEntriesToArchive == currSas.maxEntriesToArchive &&
102120
initialSas.evictionScanSize == currSas.evictionScanSize &&
103121
initialSas.startingEvictionScanLevel ==

src/bucket/BucketUtils.h

+13-9
Original file line numberDiff line numberDiff line change
@@ -95,24 +95,28 @@ struct EvictionResultCandidates
9595
// Eviction iterator at the end of the scan region
9696
EvictionIterator endOfRegionIterator;
9797

98-
// LedgerSeq which this scan is based on
99-
uint32_t initialLedger{};
98+
// LedgerSeq and ledger version which this scan is based on
99+
uint32_t const initialLedgerSeq{};
100+
uint32_t const initialLedgerVers{};
100101

101102
// State archival settings that this scan is based on
102-
StateArchivalSettings initialSas;
103+
StateArchivalSettings const initialSas;
103104

104-
EvictionResultCandidates(StateArchivalSettings const& sas) : initialSas(sas)
105+
EvictionResultCandidates(StateArchivalSettings const& sas,
106+
uint32_t initialLedger, uint32_t initialLedgerVers)
107+
: initialLedgerSeq(initialLedger)
108+
, initialLedgerVers(initialLedgerVers)
109+
, initialSas(sas)
105110
{
106111
}
107112

108113
// Returns true if this is a valid archival scan for the current ledger
109114
// and archival settings. This is necessary because we start the scan
110115
// for ledger N immediately after N - 1 closes. However, ledger N may
111-
// contain a network upgrade changing eviction scan settings. Legacy SQL
112-
// scans will run based on the changes that occurred during ledger N,
113-
// meaning the scan we started at ledger N - 1 is invalid since it was based
114-
// off of older settings.
115-
bool isValid(uint32_t currLedger,
116+
// contain a network upgrade changing eviction scan settings. Eviction scans
117+
// must be based on the network settings after applying any upgrades, so if
118+
// this occurs we must restart the scan.
119+
bool isValid(uint32_t currLedgerSeq, uint32_t currLedgerVers,
116120
StateArchivalSettings const& currSas) const;
117121
};
118122

src/bucket/SearchableBucketList.cpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ SearchableLiveBucketListSnapshot::scanForEviction(
3131
LiveBucketList::updateStartingEvictionIterator(
3232
evictionIter, sas.startingEvictionScanLevel, ledgerSeq);
3333

34-
EvictionResultCandidates result(sas);
34+
EvictionResultCandidates result(sas, ledgerSeq, ledgerVers);
3535
auto startIter = evictionIter;
3636
auto scanSize = sas.evictionScanSize;
3737

@@ -59,7 +59,6 @@ SearchableLiveBucketListSnapshot::scanForEviction(
5959
}
6060

6161
result.endOfRegionIterator = evictionIter;
62-
result.initialLedger = ledgerSeq;
6362
return result;
6463
}
6564

src/history/HistoryManager.h

+7-4
Original file line numberDiff line numberDiff line change
@@ -319,14 +319,17 @@ class HistoryManager
319319
// Calls queueCurrentHistory() if the current ledger is a multiple of
320320
// getCheckpointFrequency() -- equivalently, the LCL is one _less_ than
321321
// a multiple of getCheckpointFrequency(). Returns true if checkpoint
322-
// publication of the LCL was queued, otherwise false.
323-
virtual bool maybeQueueHistoryCheckpoint(uint32_t lcl) = 0;
322+
// publication of the LCL was queued, otherwise false. ledgerVers must align
323+
// with lcl.
324+
virtual bool maybeQueueHistoryCheckpoint(uint32_t lcl,
325+
uint32_t ledgerVers) = 0;
324326

325327
// Checkpoint the LCL -- both the log of history from the previous
326328
// checkpoint to it, as well as the bucketlist of its state -- to a
327329
// publication-queue in the database. This should be followed shortly
328-
// (typically after commit) with a call to publishQueuedHistory.
329-
virtual void queueCurrentHistory(uint32_t lcl) = 0;
330+
// (typically after commit) with a call to publishQueuedHistory. ledgerVers
331+
// must align with lcl.
332+
virtual void queueCurrentHistory(uint32_t lcl, uint32_t ledgerVers) = 0;
330333

331334
// Return the youngest ledger still in the outgoing publish queue;
332335
// returns 0 if the publish queue has nothing in it.

src/history/HistoryManagerImpl.cpp

+4-6
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,8 @@ HistoryManager::getMaxLedgerQueuedToPublish(Config const& cfg)
375375
}
376376

377377
bool
378-
HistoryManagerImpl::maybeQueueHistoryCheckpoint(uint32_t lcl)
378+
HistoryManagerImpl::maybeQueueHistoryCheckpoint(uint32_t lcl,
379+
uint32_t ledgerVers)
379380
{
380381
if (!publishCheckpointOnLedgerClose(lcl, mApp.getConfig()))
381382
{
@@ -389,12 +390,12 @@ HistoryManagerImpl::maybeQueueHistoryCheckpoint(uint32_t lcl)
389390
return false;
390391
}
391392

392-
queueCurrentHistory(lcl);
393+
queueCurrentHistory(lcl, ledgerVers);
393394
return true;
394395
}
395396

396397
void
397-
HistoryManagerImpl::queueCurrentHistory(uint32_t ledger)
398+
HistoryManagerImpl::queueCurrentHistory(uint32_t ledger, uint32_t ledgerVers)
398399
{
399400
ZoneScoped;
400401

@@ -407,9 +408,6 @@ HistoryManagerImpl::queueCurrentHistory(uint32_t ledger)
407408
}
408409

409410
HistoryArchiveState has;
410-
auto ledgerVers = mApp.getLedgerManager()
411-
.getLastClosedLedgerHeader()
412-
.header.ledgerVersion;
413411
if (protocolVersionStartsFrom(
414412
ledgerVers,
415413
BucketBase::FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION))

src/history/HistoryManagerImpl.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,10 @@ class HistoryManagerImpl : public HistoryManager
4646

4747
void logAndUpdatePublishStatus() override;
4848

49-
bool maybeQueueHistoryCheckpoint(uint32_t lcl) override;
49+
bool maybeQueueHistoryCheckpoint(uint32_t lcl,
50+
uint32_t ledgerVers) override;
5051

51-
void queueCurrentHistory(uint32_t lcl) override;
52+
void queueCurrentHistory(uint32_t lcl, uint32_t ledgerVers) override;
5253

5354
void takeSnapshotAndPublish(HistoryArchiveState const& has);
5455

src/history/test/HistoryTests.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -1246,6 +1246,16 @@ TEST_CASE("Catchup with protocol upgrade", "[catchup][history]")
12461246
testUpgrade(SOROBAN_PROTOCOL_VERSION);
12471247
}
12481248
}
1249+
SECTION("hot archive bucket upgrade")
1250+
{
1251+
if (protocolVersionEquals(
1252+
Config::CURRENT_LEDGER_PROTOCOL_VERSION,
1253+
BucketBase::FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION))
1254+
{
1255+
testUpgrade(
1256+
BucketBase::FIRST_PROTOCOL_SUPPORTING_PERSISTENT_EVICTION);
1257+
}
1258+
}
12491259
}
12501260

12511261
TEST_CASE("Catchup fatal failure", "[catchup][history]")

src/ledger/LedgerManagerImpl.cpp

+11-7
Original file line numberDiff line numberDiff line change
@@ -1054,9 +1054,15 @@ LedgerManagerImpl::closeLedger(LedgerCloseData const& ledgerData,
10541054

10551055
// Step 1. Maybe queue the current checkpoint file for publishing; this
10561056
// should not race with main, since publish on main begins strictly _after_
1057-
// this call.
1057+
// this call. There is a bug in the upgrade path where the initial
1058+
// ledgerVers is used in some places during ledgerClose, and the upgraded
1059+
// ledgerVers is used in other places (see comment in ledgerClosed).
1060+
// On the ledger when an upgrade occurs, the ledger header will contain the
1061+
// newly incremented ledgerVers. Because the history checkpoint must be
1062+
// consistent with the ledger header, we must base checkpoints off the new
1063+
// ledgerVers here and not the initial ledgerVers.
10581064
auto& hm = mApp.getHistoryManager();
1059-
hm.maybeQueueHistoryCheckpoint(ledgerSeq);
1065+
hm.maybeQueueHistoryCheckpoint(ledgerSeq, maybeNewVersion);
10601066

10611067
// step 2
10621068
ltx.commit();
@@ -1856,12 +1862,10 @@ LedgerManagerImpl::ledgerClosed(
18561862
// there are two different assumptions in different parts of the
18571863
// ledger-close path:
18581864
// - In closeLedger we mostly treat the ledger as being on vN, eg.
1859-
// during
1860-
// tx apply and LCM construction.
1865+
// during tx apply and LCM construction.
18611866
// - In the final stage, when we call ledgerClosed, we pass vN+1
1862-
// because
1863-
// the upgrade completed and modified the ltx header, and we fish
1864-
// the protocol out of the ltx header
1867+
// because the upgrade completed and modified the ltx header, and we
1868+
// fish the protocol out of the ltx header
18651869
// Before LedgerCloseMetaV1, this inconsistency was mostly harmless
18661870
// since LedgerCloseMeta was not modified after the LTX header was
18671871
// modified. However, starting with protocol 20, LedgerCloseMeta is

0 commit comments

Comments
 (0)