Fix custody context initialization race condition that caused panic (#8391)

Take 2 of #8390.

Fixes the race condition properly instead of propagating the error. I think this is the better alternative, and the resulting change doesn't look that bad.


  * Lift node id loading or generation from `NetworkService` startup to the `ClientBuilder`, so that it can be used to compute custody columns for the beacon chain without waiting for network bootstrap.

I've considered and implemented a few alternatives:
1. Passing `node_id` to the beacon chain builder and computing columns when creating `CustodyContext`. This approach isn't good for separation of concerns and isn't great for testability.
2. Passing `ordered_custody_groups` to the beacon chain. `CustodyContext` only uses this to compute ordered custody columns, so we might as well lift this logic out, so that we don't have to do error handling in `CustodyContext` construction. Fewer tests to update.


Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com>
This commit is contained in:
Jimmy Chen
2025-11-17 16:23:12 +11:00
committed by GitHub
parent f2b945a5b5
commit af1d9b9991
15 changed files with 230 additions and 196 deletions

View File

@@ -28,6 +28,7 @@ use execution_layer::ExecutionLayer;
use execution_layer::test_utils::generate_genesis_header;
use futures::channel::mpsc::Receiver;
use genesis::{DEFAULT_ETH1_BLOCK_HASH, interop_genesis_state};
use lighthouse_network::identity::Keypair;
use lighthouse_network::{NetworkGlobals, prometheus_client::registry::Registry};
use monitoring_api::{MonitoringHttpClient, ProcessType};
use network::{NetworkConfig, NetworkSenders, NetworkService};
@@ -42,7 +43,7 @@ use std::time::{SystemTime, UNIX_EPOCH};
use store::database::interface::BeaconNodeBackend;
use timer::spawn_timer;
use tracing::{debug, info, warn};
use types::data_column_custody_group::get_custody_groups_ordered;
use types::data_column_custody_group::compute_ordered_custody_column_indices;
use types::{
BeaconState, BlobSidecarList, ChainSpec, EthSpec, ExecutionBlockHash, Hash256,
SignedBeaconBlock, test_utils::generate_deterministic_keypairs,
@@ -154,6 +155,7 @@ where
mut self,
client_genesis: ClientGenesis,
config: ClientConfig,
node_id: [u8; 32],
) -> Result<Self, String> {
let store = self.store.clone();
let chain_spec = self.chain_spec.clone();
@@ -191,6 +193,11 @@ where
Kzg::new_from_trusted_setup_no_precomp(&config.trusted_setup).map_err(kzg_err_msg)?
};
let ordered_custody_column_indices =
compute_ordered_custody_column_indices::<E>(node_id, &spec).map_err(|e| {
format!("Failed to compute ordered custody column indices: {:?}", e)
})?;
let builder = BeaconChainBuilder::new(eth_spec_instance, Arc::new(kzg))
.store(store)
.task_executor(context.executor.clone())
@@ -203,6 +210,7 @@ where
.event_handler(event_handler)
.execution_layer(execution_layer)
.node_custody_type(config.chain.node_custody_type)
.ordered_custody_column_indices(ordered_custody_column_indices)
.validator_monitor_config(config.validator_monitor.clone())
.rng(Box::new(
StdRng::try_from_rng(&mut OsRng)
@@ -463,7 +471,11 @@ where
}
/// Starts the networking stack.
pub async fn network(mut self, config: Arc<NetworkConfig>) -> Result<Self, String> {
pub async fn network(
mut self,
config: Arc<NetworkConfig>,
local_keypair: Keypair,
) -> Result<Self, String> {
let beacon_chain = self
.beacon_chain
.clone()
@@ -491,12 +503,11 @@ where
context.executor,
libp2p_registry.as_mut(),
beacon_processor_channels.beacon_processor_tx.clone(),
local_keypair,
)
.await
.map_err(|e| format!("Failed to start network: {:?}", e))?;
init_custody_context(beacon_chain, &network_globals)?;
self.network_globals = Some(network_globals);
self.network_senders = Some(network_senders);
self.libp2p_registry = libp2p_registry;
@@ -798,21 +809,6 @@ where
}
}
/// Initialises the chain's custody context with ordered data columns derived
/// from the local node id.
///
/// The node id is read from the local ENR held in `network_globals`, so this
/// can only run once the network globals exist.
///
/// # Errors
///
/// Returns `Err(String)` if `get_custody_groups_ordered` fails (the error is
/// formatted with `{:?}`), or whatever error
/// `init_ordered_data_columns_from_custody_groups` returns.
fn init_custody_context<T: BeaconChainTypes>(
chain: Arc<BeaconChain<T>>,
network_globals: &NetworkGlobals<T::EthSpec>,
) -> Result<(), String> {
// Raw node id taken from the local ENR.
let node_id = network_globals.local_enr().node_id().raw();
let spec = &chain.spec;
// Request `spec.number_of_custody_groups` groups for this node id.
// NOTE(review): presumably this is the full set of groups, so the complete
// ordering is known up front — confirm against `get_custody_groups_ordered`.
let custody_groups_ordered =
get_custody_groups_ordered(node_id, spec.number_of_custody_groups, spec)
.map_err(|e| format!("Failed to compute custody groups: {:?}", e))?;
// Hand the ordered groups to the custody context owned by the data
// availability checker; its result is returned to the caller unchanged.
chain
.data_availability_checker
.custody_context()
.init_ordered_data_columns_from_custody_groups(custody_groups_ordered, spec)
}
impl<TSlotClock, E, THotStore, TColdStore>
ClientBuilder<Witness<TSlotClock, E, THotStore, TColdStore>>
where