Merge bitcoin#30611: validation: write chainstate to disk every hour

achow101 · achow101 · commit 5b8046a6e893 · 2025-05-01T12:11:55.000-07:00
e976bd3 validation: add randomness to periodic write interval (Andrew Toth) 2e2f410 refactor: replace m_last_write with m_next_write (Andrew Toth) b557fa7 refactor: rename fDoFullFlush to should_write (Andrew Toth) d73bd9f validation: write chainstate to disk every hour (Andrew Toth) 0ad7d7a test: chainstate write test for periodic chainstate flush (Andrew Toth) Pull request description: Since bitcoin#28233, periodically writing the chainstate to disk every 24 hours does not clear the dbcache. Since bitcoin#28280, periodically writing the chainstate to disk is proportional only to the amount of dirty entries in the cache. Due to these changes, it is no longer beneficial to only write the chainstate to disk every 24 hours. The periodic flush interval was necessary because every write of the chainstate would clear the dbcache. Now, we can get rid of the periodic flush interval and simply write the chainstate along with blocks and block index at least every hour. Three benefits of doing this: 1. For IBD or reindex-chainstate with a combination of large dbcache setting, slow CPU, slow internet speed/unreliable peers, it could be up to 24 hours until the chainstate is persisted to disk. A power outage or crash could potentially lose up to 24 hours of progress. If there is a very large amount of dirty cache entries, writing to disk when a flush finally does occur will take a very long time. Crashing during this window of writing can cause bitcoin#11600. By syncing every hour in unison with the block index we avoid this problem. Only a maximum of one hour of progress can be lost, and the window for crashing during writing is much smaller. For IBD with lower dbcache settings, faster CPU, or better internet speed/reliable peers, chainstate writes are already triggered more often than every hour so this change will have no effect on IBD. 2. Based on discussion in bitcoin#28280, writing only once every 24 hours during long running operation of a node causes IO spikes. Writing smaller chainstate changes every hour like we do with blocks and block index will reduce IO spikes. 3. Faster shutdown speeds. All dirty chainstate entries must be persisted to disk on shutdown. If we have a lot of dirty entries, such as when close to 24 hours or if we sync with a large dbcache, it can take a long time to shutdown. By keeping the chainstate clean we avoid this problem. Inspired by [this comment](bitcoin#28280 (comment)). Resolves bitcoin#11600 ACKs for top commit: achow101: ACK e976bd3 davidgumberg: utACK bitcoin@e976bd3 sipa: utACK e976bd3 l0rinc: ACK e976bd3 Tree-SHA512: 5bccd8f1dea47f9820a3fd32fe3bb6841c0167b3d6870cc8f3f7e2368f124af1a914bca6acb06889cd7183638a8dbdbace54d3237c3683f2b567eb7355e015ee
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
@@ -25,6 +25,7 @@ add_executable(test_bitcoin
   blockmanager_tests.cpp
   bloom_tests.cpp
   bswap_tests.cpp
+  chainstate_write_tests.cpp
   checkqueue_tests.cpp
   cluster_linearize_tests.cpp
   coins_tests.cpp
diff --git a/src/test/chainstate_write_tests.cpp b/src/test/chainstate_write_tests.cpp
@@ -0,0 +1,45 @@
+// Copyright (c) The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <test/util/setup_common.h>
+#include <validation.h>
+#include <validationinterface.h>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(chainstate_write_tests)
+
+BOOST_FIXTURE_TEST_CASE(chainstate_write_interval, TestingSetup)
+{
+    struct TestSubscriber final : CValidationInterface {
+        bool m_did_flush{false};
+        void ChainStateFlushed(ChainstateRole, const CBlockLocator&) override
+        {
+            m_did_flush = true;
+        }
+    };
+
+    const auto sub{std::make_shared<TestSubscriber>()};
+    m_node.validation_signals->RegisterSharedValidationInterface(sub);
+    auto& chainstate{Assert(m_node.chainman)->ActiveChainstate()};
+    BlockValidationState state_dummy{};
+
+    // The first periodic flush sets m_next_write and does not flush
+    chainstate.FlushStateToDisk(state_dummy, FlushStateMode::PERIODIC);
+    m_node.validation_signals->SyncWithValidationInterfaceQueue();
+    BOOST_CHECK(!sub->m_did_flush);
+
+    // The periodic flush interval is between 50 and 70 minutes (inclusive)
+    SetMockTime(GetTime<std::chrono::minutes>() + 49min);
+    chainstate.FlushStateToDisk(state_dummy, FlushStateMode::PERIODIC);
+    m_node.validation_signals->SyncWithValidationInterfaceQueue();
+    BOOST_CHECK(!sub->m_did_flush);
+
+    SetMockTime(GetTime<std::chrono::minutes>() + 70min);
+    chainstate.FlushStateToDisk(state_dummy, FlushStateMode::PERIODIC);
+    m_node.validation_signals->SyncWithValidationInterfaceQueue();
+    BOOST_CHECK(sub->m_did_flush);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/validation.cpp b/src/validation.cpp
@@ -90,10 +90,12 @@ using node::SnapshotMetadata;
 
 /** Size threshold for warning about slow UTXO set flush to disk. */
 static constexpr size_t WARN_FLUSH_COINS_SIZE = 1 << 30; // 1 GiB
-/** Time to wait between writing blocks/block index to disk. */
-static constexpr std::chrono::hours DATABASE_WRITE_INTERVAL{1};
-/** Time to wait between flushing chainstate to disk. */
-static constexpr std::chrono::hours DATABASE_FLUSH_INTERVAL{24};
+/** Time window to wait between writing blocks/block index and chainstate to disk.
+ *  Randomize writing time inside the window to prevent a situation where the
+ *  network over time settles into a few cohorts of synchronized writers.
+*/
+static constexpr auto DATABASE_WRITE_INTERVAL_MIN{50min};
+static constexpr auto DATABASE_WRITE_INTERVAL_MAX{70min};
 /** Maximum age of our tip for us to be considered current for fee estimation */
 static constexpr std::chrono::hours MAX_FEE_ESTIMATION_TIP_AGE{3};
 const std::vector<std::string> CHECKLEVEL_DOC {
@@ -2807,7 +2809,6 @@ bool Chainstate::FlushStateToDisk(
     try {
     {
         bool fFlushForPrune = false;
-        bool fDoFullFlush = false;
 
         CoinsCacheSizeState cache_state = GetCoinsCacheSizeState();
         LOCK(m_blockman.cs_LastBlockFile);
@@ -2852,26 +2853,17 @@ bool Chainstate::FlushStateToDisk(
                 }
             }
         }
-        const auto nNow{SteadyClock::now()};
-        // Avoid writing/flushing immediately after startup.
-        if (m_last_write == decltype(m_last_write){}) {
-            m_last_write = nNow;
-        }
-        if (m_last_flush == decltype(m_last_flush){}) {
-            m_last_flush = nNow;
-        }
+        const auto nNow{NodeClock::now()};
         // The cache is large and we're within 10% and 10 MiB of the limit, but we have time now (not in the middle of a block processing).
         bool fCacheLarge = mode == FlushStateMode::PERIODIC && cache_state >= CoinsCacheSizeState::LARGE;
         // The cache is over the limit, we have to write now.
         bool fCacheCritical = mode == FlushStateMode::IF_NEEDED && cache_state >= CoinsCacheSizeState::CRITICAL;
-        // It's been a while since we wrote the block index to disk. Do this frequently, so we don't need to redownload after a crash.
-        bool fPeriodicWrite = mode == FlushStateMode::PERIODIC && nNow > m_last_write + DATABASE_WRITE_INTERVAL;
-        // It's been very long since we flushed the cache. Do this infrequently, to optimize cache usage.
-        bool fPeriodicFlush = mode == FlushStateMode::PERIODIC && nNow > m_last_flush + DATABASE_FLUSH_INTERVAL;
-        // Combine all conditions that result in a full cache flush.
-        fDoFullFlush = (mode == FlushStateMode::ALWAYS) || fCacheLarge || fCacheCritical || fPeriodicFlush || fFlushForPrune;
-        // Write blocks and block index to disk.
-        if (fDoFullFlush || fPeriodicWrite) {
+        // It's been a while since we wrote the block index and chain state to disk. Do this frequently, so we don't need to redownload or reindex after a crash.
+        bool fPeriodicWrite = mode == FlushStateMode::PERIODIC && nNow >= m_next_write;
+        // Combine all conditions that result in a write to disk.
+        bool should_write = (mode == FlushStateMode::ALWAYS) || fCacheLarge || fCacheCritical || fPeriodicWrite || fFlushForPrune;
+        // Write blocks, block index and best chain related state to disk.
+        if (should_write) {
             // Ensure we can write block index
             if (!CheckDiskSpace(m_blockman.m_opts.blocks_dir)) {
                 return FatalError(m_chainman.GetNotifications(), state, _("Disk space is too low!"));
@@ -2901,35 +2893,38 @@ bool Chainstate::FlushStateToDisk(
 
                 m_blockman.UnlinkPrunedFiles(setFilesToPrune);
             }
-            m_last_write = nNow;
-        }
-        // Flush best chain related state. This can only be done if the blocks / block index write was also done.
-        if (fDoFullFlush && !CoinsTip().GetBestBlock().IsNull()) {
-            if (coins_mem_usage >= WARN_FLUSH_COINS_SIZE) LogWarning("Flushing large (%d GiB) UTXO set to disk, it may take several minutes", coins_mem_usage >> 30);
-            LOG_TIME_MILLIS_WITH_CATEGORY(strprintf("write coins cache to disk (%d coins, %.2fKiB)",
-                coins_count, coins_mem_usage >> 10), BCLog::BENCH);
-
-            // Typical Coin structures on disk are around 48 bytes in size.
-            // Pushing a new one to the database can cause it to be written
-            // twice (once in the log, and once in the tables). This is already
-            // an overestimation, as most will delete an existing entry or
-            // overwrite one. Still, use a conservative safety factor of 2.
-            if (!CheckDiskSpace(m_chainman.m_options.datadir, 48 * 2 * 2 * CoinsTip().GetCacheSize())) {
-                return FatalError(m_chainman.GetNotifications(), state, _("Disk space is too low!"));
-            }
-            // Flush the chainstate (which may refer to block index entries).
-            const auto empty_cache{(mode == FlushStateMode::ALWAYS) || fCacheLarge || fCacheCritical};
-            if (empty_cache ? !CoinsTip().Flush() : !CoinsTip().Sync()) {
-                return FatalError(m_chainman.GetNotifications(), state, _("Failed to write to coin database."));
+
+            if (!CoinsTip().GetBestBlock().IsNull()) {
+                if (coins_mem_usage >= WARN_FLUSH_COINS_SIZE) LogWarning("Flushing large (%d GiB) UTXO set to disk, it may take several minutes", coins_mem_usage >> 30);
+                LOG_TIME_MILLIS_WITH_CATEGORY(strprintf("write coins cache to disk (%d coins, %.2fKiB)",
+                    coins_count, coins_mem_usage >> 10), BCLog::BENCH);
+
+                // Typical Coin structures on disk are around 48 bytes in size.
+                // Pushing a new one to the database can cause it to be written
+                // twice (once in the log, and once in the tables). This is already
+                // an overestimation, as most will delete an existing entry or
+                // overwrite one. Still, use a conservative safety factor of 2.
+                if (!CheckDiskSpace(m_chainman.m_options.datadir, 48 * 2 * 2 * CoinsTip().GetCacheSize())) {
+                    return FatalError(m_chainman.GetNotifications(), state, _("Disk space is too low!"));
+                }
+                // Flush the chainstate (which may refer to block index entries).
+                const auto empty_cache{(mode == FlushStateMode::ALWAYS) || fCacheLarge || fCacheCritical};
+                if (empty_cache ? !CoinsTip().Flush() : !CoinsTip().Sync()) {
+                    return FatalError(m_chainman.GetNotifications(), state, _("Failed to write to coin database."));
+                }
+                full_flush_completed = true;
+                TRACEPOINT(utxocache, flush,
+                    int64_t{Ticks<std::chrono::microseconds>(NodeClock::now() - nNow)},
+                    (uint32_t)mode,
+                    (uint64_t)coins_count,
+                    (uint64_t)coins_mem_usage,
+                    (bool)fFlushForPrune);
             }
-            m_last_flush = nNow;
-            full_flush_completed = true;
-            TRACEPOINT(utxocache, flush,
-                   int64_t{Ticks<std::chrono::microseconds>(SteadyClock::now() - nNow)},
-                   (uint32_t)mode,
-                   (uint64_t)coins_count,
-                   (uint64_t)coins_mem_usage,
-                   (bool)fFlushForPrune);
+        }
+
+        if (should_write || m_next_write == NodeClock::time_point::max()) {
+            constexpr auto range{DATABASE_WRITE_INTERVAL_MAX - DATABASE_WRITE_INTERVAL_MIN};
+            m_next_write = FastRandomContext().rand_uniform_delay(NodeClock::now() + DATABASE_WRITE_INTERVAL_MIN, range);
         }
     }
     if (full_flush_completed && m_chainman.m_options.signals) {
diff --git a/src/validation.h b/src/validation.h
@@ -31,6 +31,7 @@
 #include <util/fs.h>
 #include <util/hasher.h>
 #include <util/result.h>
+#include <util/time.h>
 #include <util/translation.h>
 #include <versionbits.h>
 
@@ -802,8 +803,7 @@ class Chainstate
     void UpdateTip(const CBlockIndex* pindexNew)
         EXCLUSIVE_LOCKS_REQUIRED(::cs_main);
 
-    SteadyClock::time_point m_last_write{};
-    SteadyClock::time_point m_last_flush{};
+    NodeClock::time_point m_next_write{NodeClock::time_point::max()};
 
     /**
      * In case of an invalid snapshot, rename the coins leveldb directory so