From 6194dddc5b9ea176fc38796fb5de6c7fac8a8143 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Mon, 2 Mar 2026 17:43:51 +1100 Subject: [PATCH] Persist custody context more readily (#8921) We received a bug report of a node restarting custody backfill unnecessarily after upgrading to Lighthouse v8.1.1. What happened is: - User started LH v8.0.1 many months ago, CGC updated 0 -> N but the CGC was not eagerly persisted. - LH experienced an unclean shutdown (not sure of what type). - Upon restarting (still running v8.0.1), the custody context read from disk contains CGC=0: `DEBUG Loaded persisted custody context custody_context: CustodyContext { validator_custody_count: 0, ...`). - CGC updates again to N, retriggering custody backfill: `DEBUG Validator count at head updated old_count: 0, new_count: N`. - Custody backfill does a bunch of downloading for no gain: `DEBUG Imported historical data columns epoch: Epoch(428433), total_imported: 0` - While custody backfill is running user updated to v8.1.1, and we see logs for the CGC=N being peristed upon clean shutdown, and then correctly read on startup with v8.1.1. - Custody backfill keeps running and downloading due to the CGC change still being considered in progress. - Call `persist_custody_context` inside the `register_validators` handler so that it is written to disk eagerly whenever it changes. The performance impact of this should be minimal as the amount of data is very small and this call can only happen at most ~128 times (once for each change) in the entire life of a beacon node. - Call `persist_custody_context` inside `BeaconChainBuilder::build` so that changes caused by CLI flags are persisted (otherwise starting a node with `--semi-supernode` and no validators, then shutting it down uncleanly would cause use to forget the CGC). These changes greatly reduce the timespan during which an unclean shutdown can create inconsistency. In the worst case, we only lose backfill progress that runs concurrently with the `register_validators` handler (should be extremely minimal, nigh impossible). Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 13 ++++++++++++- beacon_node/beacon_chain/src/builder.rs | 5 +++++ beacon_node/http_api/src/validator/mod.rs | 12 ++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 9d204ac7f2..703ed24420 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -662,7 +662,18 @@ impl BeaconChain { .custody_context() .as_ref() .into(); - debug!(?custody_context, "Persisting custody context to store"); + + // Pattern match to avoid accidentally missing fields and to ignore deprecated fields. + let CustodyContextSsz { + validator_custody_at_head, + epoch_validator_custody_requirements, + persisted_is_supernode: _, + } = &custody_context; + debug!( + validator_custody_at_head, + ?epoch_validator_custody_requirements, + "Persisting custody context to store" + ); persist_custody_context::( self.store.clone(), diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 2c1dae9215..66a54d46e8 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -1083,6 +1083,11 @@ where let cgc_change_effective_slot = cgc_changed.effective_epoch.start_slot(E::slots_per_epoch()); beacon_chain.update_data_column_custody_info(Some(cgc_change_effective_slot)); + + // Persist change to disk. + beacon_chain + .persist_custody_context() + .map_err(|e| format!("Failed writing updated CGC: {e:?}"))?; } info!( diff --git a/beacon_node/http_api/src/validator/mod.rs b/beacon_node/http_api/src/validator/mod.rs index df237d9f9b..a9082df715 100644 --- a/beacon_node/http_api/src/validator/mod.rs +++ b/beacon_node/http_api/src/validator/mod.rs @@ -727,6 +727,18 @@ pub fn post_validator_prepare_beacon_proposer( debug!(error = %e, "Could not send message to the network service. \ Likely shutdown") }); + + // Write the updated custody context to disk. This happens at most 128 + // times ever, so the I/O burden should be extremely minimal. Without a + // write here we risk forgetting custody backfill progress upon an + // unclean shutdown. The custody context is otherwise only persisted in + // `BeaconChain::drop`. + if let Err(error) = chain.persist_custody_context() { + error!( + ?error, + "Failed to persist custody context after CGC update" + ); + } } }