Fix wrong columns getting processed on a CGC change (#7792)

This PR fixes a bug where wrong columns could get processed immediately after a CGC increase.

Scenario:
- The node's CGC increased due to additional validators attached to it (let's say from 10 to 11)
- The new CGC is advertised and new subnets are subscribed immediately; however, the change won't be effective in the data availability check until the next epoch (See [this](ab0e8870b4/beacon_node/beacon_chain/src/validator_custody.rs (L93-L99))). The data availability checker still only requires 10 columns for the current epoch.
- During this time, data columns for the additional custody column (let's say column 11) may arrive via gossip as we're already subscribed to the topic, and may be incorrectly used to satisfy the existing data availability requirement (10 columns). This additional column (instead of a required one) then gets persisted, causing database inconsistency.
This commit is contained in:
Jimmy Chen
2025-08-07 10:45:04 +10:00
committed by GitHub
parent 9c972201bc
commit 8bc6693dac
27 changed files with 577 additions and 277 deletions

View File

@@ -898,9 +898,9 @@ impl<E: EthSpec> Network<E> {
name = "libp2p",
skip_all
)]
pub fn subscribe_new_data_column_subnets(&mut self, custody_column_count: u64) {
pub fn subscribe_new_data_column_subnets(&mut self, sampling_column_count: u64) {
self.network_globals
.update_data_column_subnets(custody_column_count);
.update_data_column_subnets(sampling_column_count);
for column in self.network_globals.sampling_subnets() {
let kind = GossipKind::DataColumnSidecar(column);

View File

@@ -8,9 +8,7 @@ use parking_lot::RwLock;
use std::collections::HashSet;
use std::sync::Arc;
use tracing::error;
use types::data_column_custody_group::{
compute_columns_for_custody_group, compute_subnets_from_custody_group, get_custody_groups,
};
use types::data_column_custody_group::{compute_subnets_from_custody_group, get_custody_groups};
use types::{ChainSpec, ColumnIndex, DataColumnSubnetId, EthSpec};
pub struct NetworkGlobals<E: EthSpec> {
@@ -32,7 +30,6 @@ pub struct NetworkGlobals<E: EthSpec> {
pub backfill_state: RwLock<BackFillState>,
/// The computed sampling subnets and columns is stored to avoid re-computing.
pub sampling_subnets: RwLock<HashSet<DataColumnSubnetId>>,
pub sampling_columns: RwLock<HashSet<ColumnIndex>>,
/// Network-related configuration. Immutable after initialization.
pub config: Arc<NetworkConfig>,
/// Ethereum chain configuration. Immutable after initialization.
@@ -78,16 +75,8 @@ impl<E: EthSpec> NetworkGlobals<E> {
sampling_subnets.extend(subnets);
}
let mut sampling_columns = HashSet::new();
for custody_index in &custody_groups {
let columns = compute_columns_for_custody_group(*custody_index, &spec)
.expect("should compute custody columns for node");
sampling_columns.extend(columns);
}
tracing::debug!(
cgc = custody_group_count,
?sampling_columns,
?sampling_subnets,
"Starting node with custody params"
);
@@ -102,20 +91,15 @@ impl<E: EthSpec> NetworkGlobals<E> {
sync_state: RwLock::new(SyncState::Stalled),
backfill_state: RwLock::new(BackFillState::Paused),
sampling_subnets: RwLock::new(sampling_subnets),
sampling_columns: RwLock::new(sampling_columns),
config,
spec,
}
}
/// Update the sampling subnets based on an updated cgc.
pub fn update_data_column_subnets(&self, custody_group_count: u64) {
pub fn update_data_column_subnets(&self, sampling_size: u64) {
// The below `expect` calls will panic on start up if the chain spec config values used
// are invalid
let sampling_size = self
.spec
.sampling_size_custody_groups(custody_group_count)
.expect("should compute node sampling size from valid chain spec");
let custody_groups =
get_custody_groups(self.local_enr().node_id().raw(), sampling_size, &self.spec)
.expect("should compute node custody groups");
@@ -126,13 +110,6 @@ impl<E: EthSpec> NetworkGlobals<E> {
.expect("should compute custody subnets for node");
sampling_subnets.extend(subnets);
}
let mut sampling_columns = self.sampling_columns.write();
for custody_index in &custody_groups {
let columns = compute_columns_for_custody_group(*custody_index, &self.spec)
.expect("should compute custody columns for node");
sampling_columns.extend(columns);
}
}
/// Returns the local ENR from the underlying Discv5 behaviour that external peers may connect
@@ -248,10 +225,6 @@ impl<E: EthSpec> NetworkGlobals<E> {
}
}
/// Returns a clone of the node's cached sampling column indices.
/// NOTE(review): clones the whole set under the read lock; callers get a snapshot,
/// not a live view — subsequent CGC updates are not reflected in the returned value.
pub fn sampling_columns(&self) -> HashSet<ColumnIndex> {
self.sampling_columns.read().clone()
}
/// Returns a clone of the node's cached sampling subnet ids.
/// NOTE(review): snapshot semantics — the set is cloned under the read lock, so the
/// caller does not observe later updates made via `update_data_column_subnets`.
pub fn sampling_subnets(&self) -> HashSet<DataColumnSubnetId> {
self.sampling_subnets.read().clone()
}
@@ -320,29 +293,6 @@ mod test {
);
}
// Verifies that `NetworkGlobals` computes the expected number of sampling columns
// for a node whose custody group count is half the configured number of custody groups,
// with the Fulu fork active from genesis.
#[test]
fn test_sampling_columns() {
create_test_tracing_subscriber();
let mut spec = E::default_spec();
// Activate the Fulu fork at epoch 0 so PeerDAS custody parameters apply.
spec.fulu_fork_epoch = Some(Epoch::new(0));
let custody_group_count = spec.number_of_custody_groups / 2;
// Expected column count derived directly from the spec's sampling-size formula.
let expected_sampling_columns = spec.sampling_size_columns(custody_group_count).unwrap();
let metadata = get_metadata(custody_group_count);
let config = Arc::new(NetworkConfig::default());
let globals = NetworkGlobals::<E>::new_test_globals_with_metadata(
vec![],
metadata,
config,
Arc::new(spec),
);
// The globals' cached sampling column set must match the spec-derived expectation.
assert_eq!(
globals.sampling_columns.read().len(),
expected_sampling_columns as usize
);
}
fn get_metadata(custody_group_count: u64) -> MetaData<E> {
MetaData::V3(MetaDataV3 {
seq_number: 0,

View File

@@ -188,8 +188,8 @@ impl std::fmt::Display for GossipKind {
GossipKind::BlobSidecar(blob_index) => {
write!(f, "{}{}", BLOB_SIDECAR_PREFIX, blob_index)
}
GossipKind::DataColumnSidecar(column_index) => {
write!(f, "{}{}", DATA_COLUMN_SIDECAR_PREFIX, **column_index)
GossipKind::DataColumnSidecar(column_subnet_id) => {
write!(f, "{}{}", DATA_COLUMN_SIDECAR_PREFIX, **column_subnet_id)
}
x => f.write_str(x.as_ref()),
}
@@ -317,8 +317,8 @@ impl std::fmt::Display for GossipTopic {
GossipKind::BlobSidecar(blob_index) => {
format!("{}{}", BLOB_SIDECAR_PREFIX, blob_index)
}
GossipKind::DataColumnSidecar(index) => {
format!("{}{}", DATA_COLUMN_SIDECAR_PREFIX, *index)
GossipKind::DataColumnSidecar(column_subnet_id) => {
format!("{}{}", DATA_COLUMN_SIDECAR_PREFIX, *column_subnet_id)
}
GossipKind::BlsToExecutionChange => BLS_TO_EXECUTION_CHANGE_TOPIC.into(),
GossipKind::LightClientFinalityUpdate => LIGHT_CLIENT_FINALITY_UPDATE.into(),