Fix custody context initialization race condition that caused panic (#8391)

Take 2 of #8390.

Fixes the race condition properly instead of propagating the error. I think this is a better alternative, and doesn't seem to look that bad.


  * Lift node id loading or generation from `NetworkService ` startup to the `ClientBuilder`, so that it can be used to compute custody columns for the beacon chain without waiting for Network bootstrap.

I've considered and implemented a few alternatives:
1. passing `node_id` to beacon chain builder and compute columns when creating `CustodyContext`. This approach isn't good for separation of concerns and isn't great for testability
2. passing `ordered_custody_groups` to beacon chain. `CustodyContext` only uses this to compute ordered custody columns, so we might as well lift this logic out, so we don't have to do error handling in `CustodyContext` construction. Less tests to update;.


Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com>
This commit is contained in:
Jimmy Chen
2025-11-17 16:23:12 +11:00
committed by GitHub
parent f2b945a5b5
commit af1d9b9991
15 changed files with 230 additions and 196 deletions

View File

@@ -2,13 +2,11 @@ use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use ssz_derive::{Decode, Encode};
use std::marker::PhantomData;
use std::sync::OnceLock;
use std::{
collections::{BTreeMap, HashMap},
sync::atomic::{AtomicU64, Ordering},
};
use tracing::{debug, warn};
use types::data_column_custody_group::{CustodyIndex, compute_columns_for_custody_group};
use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, Slot};
/// A delay before making the CGC change effective to the data availability checker.
@@ -206,7 +204,7 @@ fn get_validators_custody_requirement(validator_custody_units: u64, spec: &Chain
/// Therefore, the custody count at any point in time is calculated as the max of
/// the validator custody at that time and the current cli params.
///
/// Choosing the max ensures that we always have the minimum required columns and
/// Choosing the max ensures that we always have the minimum required columns, and
/// we can adjust the `status.earliest_available_slot` value to indicate to our peers
/// the columns that we can guarantee to serve.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, Deserialize, Serialize)]
@@ -218,7 +216,7 @@ pub enum NodeCustodyType {
/// wants to subscribe to the minimum number of columns to enable
/// reconstruction (50%) of the full blob data on demand.
SemiSupernode,
/// The node isn't running with with any explicit cli parameters
/// The node isn't running with any explicit cli parameters
/// or is running with cli parameters to indicate that it wants
/// to only subscribe to the minimal custody requirements.
#[default]
@@ -248,9 +246,9 @@ pub struct CustodyContext<E: EthSpec> {
validator_custody_count: AtomicU64,
/// Maintains all the validators that this node is connected to currently
validator_registrations: RwLock<ValidatorRegistrations>,
/// Stores an immutable, ordered list of all custody columns as determined by the node's NodeID
/// on startup.
all_custody_columns_ordered: OnceLock<Box<[ColumnIndex]>>,
/// Stores an immutable, ordered list of all data column indices as determined by the node's NodeID
/// on startup. This used to determine the node's custody columns.
ordered_custody_column_indices: Vec<ColumnIndex>,
_phantom_data: PhantomData<E>,
}
@@ -259,7 +257,11 @@ impl<E: EthSpec> CustodyContext<E> {
/// exists.
///
/// The `node_custody_type` value is based on current cli parameters.
pub fn new(node_custody_type: NodeCustodyType, spec: &ChainSpec) -> Self {
pub fn new(
node_custody_type: NodeCustodyType,
ordered_custody_column_indices: Vec<ColumnIndex>,
spec: &ChainSpec,
) -> Self {
let cgc_override = node_custody_type.get_custody_count_override(spec);
// If there's no override, we initialise `validator_custody_count` to 0. This has been the
// existing behaviour and we maintain this for now to avoid a semantic schema change until
@@ -267,7 +269,7 @@ impl<E: EthSpec> CustodyContext<E> {
Self {
validator_custody_count: AtomicU64::new(cgc_override.unwrap_or(0)),
validator_registrations: RwLock::new(ValidatorRegistrations::new(cgc_override)),
all_custody_columns_ordered: OnceLock::new(),
ordered_custody_column_indices,
_phantom_data: PhantomData,
}
}
@@ -290,6 +292,7 @@ impl<E: EthSpec> CustodyContext<E> {
ssz_context: CustodyContextSsz,
node_custody_type: NodeCustodyType,
head_epoch: Epoch,
ordered_custody_column_indices: Vec<ColumnIndex>,
spec: &ChainSpec,
) -> (Self, Option<CustodyCountChanged>) {
let CustodyContextSsz {
@@ -355,39 +358,13 @@ impl<E: EthSpec> CustodyContext<E> {
.into_iter()
.collect(),
}),
all_custody_columns_ordered: OnceLock::new(),
ordered_custody_column_indices,
_phantom_data: PhantomData,
};
(custody_context, custody_count_changed)
}
/// Initializes an ordered list of data columns based on provided custody groups.
///
/// # Arguments
/// * `all_custody_groups_ordered` - Vector of custody group indices to map to columns
/// * `spec` - Chain specification containing custody parameters
///
/// # Returns
/// Ok(()) if initialization succeeds, Err with description string if it fails
pub fn init_ordered_data_columns_from_custody_groups(
&self,
all_custody_groups_ordered: Vec<CustodyIndex>,
spec: &ChainSpec,
) -> Result<(), String> {
let mut ordered_custody_columns = vec![];
for custody_index in all_custody_groups_ordered {
let columns = compute_columns_for_custody_group::<E>(custody_index, spec)
.map_err(|e| format!("Failed to compute columns for custody group {e:?}"))?;
ordered_custody_columns.extend(columns);
}
self.all_custody_columns_ordered
.set(ordered_custody_columns.into_boxed_slice())
.map_err(|_| {
"Failed to initialise CustodyContext with computed custody columns".to_string()
})
}
/// Register a new validator index and updates the list of validators if required.
///
/// Also modifies the internal structures if the validator custody has changed to
@@ -497,11 +474,7 @@ impl<E: EthSpec> CustodyContext<E> {
/// A slice of ordered column indices that should be sampled for this epoch based on the node's custody configuration
pub fn sampling_columns_for_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> &[ColumnIndex] {
let num_of_columns_to_sample = self.num_of_data_columns_to_sample(epoch, spec);
let all_columns_ordered = self
.all_custody_columns_ordered
.get()
.expect("all_custody_columns_ordered should be initialized");
&all_columns_ordered[..num_of_columns_to_sample]
&self.ordered_custody_column_indices[..num_of_columns_to_sample]
}
/// Returns the ordered list of column indices that the node is assigned to custody
@@ -528,12 +501,11 @@ impl<E: EthSpec> CustodyContext<E> {
self.custody_group_count_at_head(spec) as usize
};
let all_columns_ordered = self
.all_custody_columns_ordered
.get()
.expect("all_custody_columns_ordered should be initialized");
// This is an unnecessary conversion for spec compliance, basically just multiplying by 1.
let columns_per_custody_group = spec.data_columns_per_group::<E>() as usize;
let custody_column_count = columns_per_custody_group * custody_group_count;
&all_columns_ordered[..custody_group_count]
&self.ordered_custody_column_indices[..custody_column_count]
}
/// The node has completed backfill for this epoch. Update the internal records so the function
@@ -599,11 +571,9 @@ impl<E: EthSpec> From<&CustodyContext<E>> for CustodyContextSsz {
#[cfg(test)]
mod tests {
use rand::rng;
use rand::seq::SliceRandom;
use types::MainnetEthSpec;
use super::*;
use crate::test_utils::generate_data_column_indices_rand_order;
use types::MainnetEthSpec;
type E = MainnetEthSpec;
@@ -623,13 +593,10 @@ mod tests {
ssz_context,
NodeCustodyType::Fullnode,
head_epoch,
generate_data_column_indices_rand_order::<E>(),
spec,
);
let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
custody_context
.init_ordered_data_columns_from_custody_groups(all_custody_groups_ordered, spec)
.expect("should initialise ordered data columns");
custody_context
}
@@ -668,6 +635,7 @@ mod tests {
ssz_context,
target_node_custody_type,
head_epoch,
generate_data_column_indices_rand_order::<E>(),
spec,
);
@@ -738,6 +706,7 @@ mod tests {
ssz_context,
target_node_custody_type,
head_epoch,
generate_data_column_indices_rand_order::<E>(),
spec,
);
@@ -759,7 +728,11 @@ mod tests {
#[test]
fn no_validators_supernode_default() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Supernode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Supernode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
assert_eq!(
custody_context.custody_group_count_at_head(&spec),
spec.number_of_custody_groups
@@ -773,7 +746,11 @@ mod tests {
#[test]
fn no_validators_semi_supernode_default() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::SemiSupernode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::SemiSupernode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
assert_eq!(
custody_context.custody_group_count_at_head(&spec),
spec.number_of_custody_groups / 2
@@ -787,7 +764,11 @@ mod tests {
#[test]
fn no_validators_fullnode_default() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
assert_eq!(
custody_context.custody_group_count_at_head(&spec),
spec.custody_requirement,
@@ -802,7 +783,11 @@ mod tests {
#[test]
fn register_single_validator_should_update_cgc() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let bal_per_additional_group = spec.balance_per_additional_custody_group;
let min_val_custody_requirement = spec.validator_custody_requirement;
// One single node increases its balance over 3 epochs.
@@ -826,7 +811,11 @@ mod tests {
#[test]
fn register_multiple_validators_should_update_cgc() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let bal_per_additional_group = spec.balance_per_additional_custody_group;
let min_val_custody_requirement = spec.validator_custody_requirement;
// Add 3 validators over 3 epochs.
@@ -863,7 +852,11 @@ mod tests {
#[test]
fn register_validators_should_not_update_cgc_for_supernode() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Supernode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Supernode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let bal_per_additional_group = spec.balance_per_additional_custody_group;
// Add 3 validators over 3 epochs.
@@ -901,7 +894,11 @@ mod tests {
#[test]
fn cgc_change_should_be_effective_to_sampling_after_delay() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let current_slot = Slot::new(10);
let current_epoch = current_slot.epoch(E::slots_per_epoch());
let default_sampling_size =
@@ -932,7 +929,11 @@ mod tests {
#[test]
fn validator_dropped_after_no_registrations_within_expiry_should_not_reduce_cgc() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let current_slot = Slot::new(10);
let val_custody_units_1 = 10;
let val_custody_units_2 = 5;
@@ -974,7 +975,11 @@ mod tests {
#[test]
fn validator_dropped_after_no_registrations_within_expiry() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
let current_slot = Slot::new(10);
let val_custody_units_1 = 10;
let val_custody_units_2 = 5;
@@ -1021,37 +1026,6 @@ mod tests {
);
}
#[test]
fn should_init_ordered_data_columns_and_return_sampling_columns() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let sampling_size = custody_context.num_of_data_columns_to_sample(Epoch::new(0), &spec);
// initialise ordered columns
let mut all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
all_custody_groups_ordered.shuffle(&mut rng());
custody_context
.init_ordered_data_columns_from_custody_groups(
all_custody_groups_ordered.clone(),
&spec,
)
.expect("should initialise ordered data columns");
let actual_sampling_columns =
custody_context.sampling_columns_for_epoch(Epoch::new(0), &spec);
let expected_sampling_columns = &all_custody_groups_ordered
.iter()
.flat_map(|custody_index| {
compute_columns_for_custody_group::<E>(*custody_index, &spec)
.expect("should compute columns for custody group")
})
.collect::<Vec<_>>()[0..sampling_size];
assert_eq!(actual_sampling_columns, expected_sampling_columns)
}
/// Update the validator every epoch and assert cgc against expected values.
fn register_validators_and_assert_cgc<E: EthSpec>(
custody_context: &CustodyContext<E>,
@@ -1077,12 +1051,12 @@ mod tests {
#[test]
fn custody_columns_for_epoch_no_validators_fullnode() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
custody_context
.init_ordered_data_columns_from_custody_groups(all_custody_groups_ordered, &spec)
.expect("should initialise ordered data columns");
let ordered_custody_column_indices = generate_data_column_indices_rand_order::<E>();
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
ordered_custody_column_indices,
&spec,
);
assert_eq!(
custody_context.custody_columns_for_epoch(None, &spec).len(),
@@ -1093,12 +1067,12 @@ mod tests {
#[test]
fn custody_columns_for_epoch_no_validators_supernode() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Supernode, &spec);
let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
custody_context
.init_ordered_data_columns_from_custody_groups(all_custody_groups_ordered, &spec)
.expect("should initialise ordered data columns");
let ordered_custody_column_indices = generate_data_column_indices_rand_order::<E>();
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Supernode,
ordered_custody_column_indices,
&spec,
);
assert_eq!(
custody_context.custody_columns_for_epoch(None, &spec).len(),
@@ -1109,14 +1083,14 @@ mod tests {
#[test]
fn custody_columns_for_epoch_with_validators_should_match_cgc() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
let ordered_custody_column_indices = generate_data_column_indices_rand_order::<E>();
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
ordered_custody_column_indices,
&spec,
);
let val_custody_units = 10;
custody_context
.init_ordered_data_columns_from_custody_groups(all_custody_groups_ordered, &spec)
.expect("should initialise ordered data columns");
let _ = custody_context.register_validators(
vec![(
0,
@@ -1135,14 +1109,14 @@ mod tests {
#[test]
fn custody_columns_for_epoch_specific_epoch_uses_epoch_cgc() {
let spec = E::default_spec();
let custody_context = CustodyContext::<E>::new(NodeCustodyType::Fullnode, &spec);
let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::<Vec<_>>();
let ordered_custody_column_indices = generate_data_column_indices_rand_order::<E>();
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::Fullnode,
ordered_custody_column_indices,
&spec,
);
let test_epoch = Epoch::new(5);
custody_context
.init_ordered_data_columns_from_custody_groups(all_custody_groups_ordered, &spec)
.expect("should initialise ordered data columns");
let expected_cgc = custody_context.custody_group_count_at_epoch(test_epoch, &spec);
assert_eq!(
custody_context
@@ -1165,6 +1139,7 @@ mod tests {
ssz_context,
NodeCustodyType::Fullnode,
Epoch::new(0),
generate_data_column_indices_rand_order::<E>(),
&spec,
);
@@ -1198,7 +1173,11 @@ mod tests {
fn restore_semi_supernode_with_validators_can_exceed_64() {
let spec = E::default_spec();
let semi_supernode_cgc = spec.number_of_custody_groups / 2; // 64
let custody_context = CustodyContext::<E>::new(NodeCustodyType::SemiSupernode, &spec);
let custody_context = CustodyContext::<E>::new(
NodeCustodyType::SemiSupernode,
generate_data_column_indices_rand_order::<E>(),
&spec,
);
// Verify initial CGC is 64 (semi-supernode)
assert_eq!(
@@ -1348,6 +1327,7 @@ mod tests {
ssz_context,
NodeCustodyType::Fullnode,
Epoch::new(20),
generate_data_column_indices_rand_order::<E>(),
&spec,
);