Fix HTTP state API bug and add --epochs-per-migration (#4236)

## Issue Addressed

Fix an issue observed by `@zlan` on Discord where Lighthouse would sometimes return this error when looking up states via the API:

> {"code":500,"message":"UNHANDLED_ERROR: ForkChoiceError(MissingProtoArrayBlock(0xc9cf1495421b6ef3215d82253b388d77321176a1dcef0db0e71a0cd0ffc8cdb7))","stacktraces":[]}

## Proposed Changes

The error stems from a faulty assumption in the HTTP API logic: that any state in the hot database must have its block in fork choice. This isn't true because the hot database may be updated much less frequently than the fork choice store, e.g. if reconstructing states (where freezer migration pauses), or if the freezer migration runs slowly. There could also be a race between loading the hot state and checking fork choice, e.g. even if the finalization migration of DB+fork choice were atomic, the update could happen between the 1st and 2nd calls.

To address this I've changed the HTTP API logic to use the finalized block's execution status as a fallback where it is safe to do so. In the case where a block is non-canonical and prior to finalization (permanently orphaned) we default `execution_optimistic` to `true`.

## Additional Info

I've also added a new CLI flag to reduce the frequency of the finalization migration as this is useful for several purposes:

- Spacing out database writes (less frequent, larger batches)
- Keeping a limited chain history with high availability, e.g. the last month in the hot database.

This new flag made it _substantially_ easier to test this change. It was extracted from `tree-states` (where it's called `--db-migration-period`), which is why this PR also carries the `tree-states` label.
2026-04-19 13:58:28 +00:00 · 2023-07-17 00:14:12 +00:00
parent 0c7eed5e58
commit 6c375205fb
11 changed files with 240 additions and 16 deletions
--- a/beacon_node/beacon_chain/src/migrate.rs
+++ b/beacon_node/beacon_chain/src/migrate.rs
@@ -25,10 +25,15 @@ const MIN_COMPACTION_PERIOD_SECONDS: u64 = 7200;
 /// Compact after a large finality gap, if we respect `MIN_COMPACTION_PERIOD_SECONDS`.
 const COMPACTION_FINALITY_DISTANCE: u64 = 1024;

+/// Default number of epochs to wait between finalization migrations.
+pub const DEFAULT_EPOCHS_PER_MIGRATION: u64 = 1;
+
 /// The background migrator runs a thread to perform pruning and migrate state from the hot
 /// to the cold database.
 pub struct BackgroundMigrator<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> {
    db: Arc<HotColdDB<E, Hot, Cold>>,
+    /// Record of when the last migration ran, for enforcing `epochs_per_migration`.
+    prev_migration: Arc<Mutex<PrevMigration>>,
    #[allow(clippy::type_complexity)]
    tx_thread: Option<Mutex<(mpsc::Sender<Notification>, thread::JoinHandle<()>)>>,
    /// Genesis block root, for persisting the `PersistedBeaconChain`.
@@ -36,9 +41,22 @@ pub struct BackgroundMigrator<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>>
    log: Logger,
 }

-#[derive(Debug, Default, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct MigratorConfig {
    pub blocking: bool,
+    /// Run migrations at most once per `epochs_per_migration`.
+    ///
+    /// If set to 0 or 1, then run every finalization.
+    pub epochs_per_migration: u64,
+}
+
+impl Default for MigratorConfig {
+    fn default() -> Self {
+        Self {
+            blocking: false,
+            epochs_per_migration: DEFAULT_EPOCHS_PER_MIGRATION,
+        }
+    }
 }

 impl MigratorConfig {
@@ -46,6 +64,19 @@ impl MigratorConfig {
        self.blocking = true;
        self
    }
+
+    pub fn epochs_per_migration(mut self, epochs_per_migration: u64) -> Self {
+        self.epochs_per_migration = epochs_per_migration;
+        self
+    }
+}
+
+/// Record of when the last migration ran, shared behind a mutex to rate-limit migrations.
+pub struct PrevMigration {
+    /// Finalized epoch of the last migration run (initially estimated from the DB split slot).
+    epoch: Epoch,
+    /// Minimum number of epochs that must elapse between migration runs.
+    epochs_per_migration: u64,
 }

 /// Pruning can be successful, or in rare cases deferred to a later point.
@@ -92,6 +123,7 @@ pub struct FinalizationNotification {
    finalized_state_root: BeaconStateHash,
    finalized_checkpoint: Checkpoint,
    head_tracker: Arc<HeadTracker>,
+    prev_migration: Arc<Mutex<PrevMigration>>,
    genesis_block_root: Hash256,
 }

@@ -103,6 +135,11 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
        genesis_block_root: Hash256,
        log: Logger,
    ) -> Self {
+        // Estimate last migration run from DB split slot.
+        let prev_migration = Arc::new(Mutex::new(PrevMigration {
+            epoch: db.get_split_slot().epoch(E::slots_per_epoch()),
+            epochs_per_migration: config.epochs_per_migration,
+        }));
        let tx_thread = if config.blocking {
            None
        } else {
@@ -111,6 +148,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
        Self {
            db,
            tx_thread,
+            prev_migration,
            genesis_block_root,
            log,
        }
@@ -131,6 +169,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
            finalized_state_root,
            finalized_checkpoint,
            head_tracker,
+            prev_migration: self.prev_migration.clone(),
            genesis_block_root: self.genesis_block_root,
        };

@@ -204,6 +243,26 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> BackgroundMigrator<E, Ho
        notif: FinalizationNotification,
        log: &Logger,
    ) {
+        // Do not run too frequently.
+        let epoch = notif.finalized_checkpoint.epoch;
+        let mut prev_migration = notif.prev_migration.lock();
+        if epoch < prev_migration.epoch + prev_migration.epochs_per_migration {
+            debug!(
+                log,
+                "Database consolidation deferred";
+                "last_finalized_epoch" => prev_migration.epoch,
+                "new_finalized_epoch" => epoch,
+                "epochs_per_migration" => prev_migration.epochs_per_migration,
+            );
+            return;
+        }
+
+        // Record this epoch as the previous migration epoch immediately so that the lock is not
+        // held for the duration of the migration. If the migration fails, it will not be retried
+        // until the next scheduled run.
+        prev_migration.epoch = epoch;
+        drop(prev_migration);
+
        debug!(log, "Database consolidation started");

        let finalized_state_root = notif.finalized_state_root;