Fix CGC backfill race condition (#8267)

During custody backfill sync there is an edge case where the CGC can be updated at the same time as a batch of columns is being imported, which may cause `backfill_validator_custody_requirements` to incorrectly overwrite values. To prevent this race condition, the expected CGC is now passed into that function and checked against the current validator CGC. If the values aren't equal, a CGC change has most likely occurred very recently, so we do not prune or update values in the `epoch_validator_custody_requirements` map.
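For illustration, a minimal, self-contained sketch of the compare-before-prune guard (assumptions: epochs modeled as plain `u64`, a bare `BTreeMap` standing in for `epoch_validator_custody_requirements`, and the latest requirement taken as the value at the highest epoch; the real logic lives in `ValidatorRegistrations`):

```rust
use std::collections::BTreeMap;

/// Simplified stand-in for `ValidatorRegistrations`.
struct Registrations {
    /// epoch -> validator custody requirement (CGC).
    epoch_validator_custody_requirements: BTreeMap<u64, u64>,
}

impl Registrations {
    fn latest_validator_custody_requirement(&self) -> Option<u64> {
        // Assumed model: the latest requirement is the value at the highest epoch.
        self.epoch_validator_custody_requirements.values().last().copied()
    }

    fn backfill_validator_custody_requirements(&mut self, effective_epoch: u64, expected_cgc: u64) {
        if let Some(latest) = self.latest_validator_custody_requirement() {
            // The guard: if the CGC observed when the batch was requested no longer
            // matches the latest requirement, a concurrent CGC update won the race,
            // so skip the prune/update instead of overwriting the newer value.
            if expected_cgc != latest {
                return;
            }
            // Prune entries on/after `effective_epoch` whose CGC matches the latest
            // requirement, then record the latest value at `effective_epoch`.
            self.epoch_validator_custody_requirements
                .retain(|&epoch, &mut cgc| epoch < effective_epoch || cgc != latest);
            self.epoch_validator_custody_requirements
                .insert(effective_epoch, latest);
        }
    }
}

fn main() {
    let mut regs = Registrations {
        epoch_validator_custody_requirements: BTreeMap::from([(0, 8), (10, 16), (20, 32)]),
    };
    // Stale caller: it observed CGC = 16 before a concurrent update to 32 landed.
    regs.backfill_validator_custody_requirements(15, 16);
    assert_eq!(regs.epoch_validator_custody_requirements.get(&20), Some(&32));
    // A caller holding the current CGC succeeds and backfills epoch 15.
    regs.backfill_validator_custody_requirements(15, 32);
    assert_eq!(regs.epoch_validator_custody_requirements.get(&15), Some(&32));
}
```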
Co-Authored-By: Eitan Seri-Levi <eserilev@ucsc.edu>
Author: Eitan Seri-Levi
Date:   2025-11-02 16:51:42 -08:00 (committed by GitHub)
parent 55588f7789
commit b57d046c4a
6 changed files with 103 additions and 91 deletions


@@ -134,8 +134,17 @@ impl ValidatorRegistrations {
     ///
     /// This is done by pruning all values on/after `effective_epoch` and updating the map to store
     /// the latest validator custody requirements for the `effective_epoch`.
-    pub fn backfill_validator_custody_requirements(&mut self, effective_epoch: Epoch) {
+    pub fn backfill_validator_custody_requirements(
+        &mut self,
+        effective_epoch: Epoch,
+        expected_cgc: u64,
+    ) {
         if let Some(latest_validator_custody) = self.latest_validator_custody_requirement() {
+            // If the expected CGC isn't equal to the latest validator custody requirement, a very
+            // recent CGC change may have occurred. We should not update the mapping.
+            if expected_cgc != latest_validator_custody {
+                return;
+            }
             // Delete records if
             // 1. The epoch is greater than or equal to `effective_epoch`
             // 2. The CGC requirements match the latest validator custody requirements
@@ -517,10 +526,14 @@ impl<E: EthSpec> CustodyContext<E> {
     /// The node has completed backfill for this epoch. Update the internal records so the function
     /// [`Self::custody_columns_for_epoch()`] returns up-to-date results.
-    pub fn update_and_backfill_custody_count_at_epoch(&self, effective_epoch: Epoch) {
+    pub fn update_and_backfill_custody_count_at_epoch(
+        &self,
+        effective_epoch: Epoch,
+        expected_cgc: u64,
+    ) {
         self.validator_registrations
             .write()
-            .backfill_validator_custody_requirements(effective_epoch);
+            .backfill_validator_custody_requirements(effective_epoch, expected_cgc);
     }
 }
@@ -604,11 +617,13 @@ mod tests {
         custody_context: &CustodyContext<E>,
         start_epoch: Epoch,
        end_epoch: Epoch,
+        expected_cgc: u64,
     ) {
         assert!(start_epoch >= end_epoch);
         // Call from end_epoch down to start_epoch (inclusive), simulating backfill
         for epoch in (end_epoch.as_u64()..=start_epoch.as_u64()).rev() {
-            custody_context.update_and_backfill_custody_count_at_epoch(Epoch::new(epoch));
+            custody_context
+                .update_and_backfill_custody_count_at_epoch(Epoch::new(epoch), expected_cgc);
         }
     }
@@ -1368,7 +1383,7 @@ mod tests {
         );

         // Backfill from epoch 20 down to 15 (simulating backfill)
-        complete_backfill_for_epochs(&custody_context, head_epoch, Epoch::new(15));
+        complete_backfill_for_epochs(&custody_context, head_epoch, Epoch::new(15), final_cgc);

         // After backfilling to epoch 15, it should use latest CGC (32)
         assert_eq!(
@@ -1406,7 +1421,7 @@ mod tests {
         let custody_context = setup_custody_context(&spec, head_epoch, epoch_and_cgc_tuples);

         // Backfill to epoch 15 (between the two CGC increases)
-        complete_backfill_for_epochs(&custody_context, Epoch::new(20), Epoch::new(15));
+        complete_backfill_for_epochs(&custody_context, Epoch::new(20), Epoch::new(15), final_cgc);

         // Verify epochs 15 - 20 return latest CGC (32)
         for epoch in 15..=20 {
@@ -1424,4 +1439,56 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn attempt_backfill_with_invalid_cgc() {
+        let spec = E::default_spec();
+        let initial_cgc = 8u64;
+        let mid_cgc = 16u64;
+        let final_cgc = 32u64;
+
+        // Setup: node restart after multiple validator registrations causing CGC increases
+        let head_epoch = Epoch::new(20);
+        let epoch_and_cgc_tuples = vec![
+            (Epoch::new(0), initial_cgc),
+            (Epoch::new(10), mid_cgc),
+            (head_epoch, final_cgc),
+        ];
+        let custody_context = setup_custody_context(&spec, head_epoch, epoch_and_cgc_tuples);
+
+        // Backfill to epoch 15 (between the two CGC increases)
+        complete_backfill_for_epochs(&custody_context, Epoch::new(20), Epoch::new(15), final_cgc);
+
+        // Verify epochs 15 - 20 return latest CGC (32)
+        for epoch in 15..=20 {
+            assert_eq!(
+                custody_context.custody_group_count_at_epoch(Epoch::new(epoch), &spec),
+                final_cgc,
+            );
+        }
+
+        // Attempt backfill with an incorrect CGC value
+        complete_backfill_for_epochs(
+            &custody_context,
+            Epoch::new(20),
+            Epoch::new(15),
+            initial_cgc,
+        );
+
+        // Verify epochs 15 - 20 still return latest CGC (32)
+        for epoch in 15..=20 {
+            assert_eq!(
+                custody_context.custody_group_count_at_epoch(Epoch::new(epoch), &spec),
+                final_cgc,
+            );
+        }
+
+        // Verify epochs 10 - 14 still return mid_cgc (16)
+        for epoch in 10..=14 {
+            assert_eq!(
+                custody_context.custody_group_count_at_epoch(Epoch::new(epoch), &spec),
+                mid_cgc,
+            );
+        }
+    }
 }


@@ -54,6 +54,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         &self,
         epoch: Epoch,
         historical_data_column_sidecar_list: DataColumnSidecarList<T::EthSpec>,
+        expected_cgc: u64,
     ) -> Result<usize, HistoricalDataColumnError> {
         let mut total_imported = 0;
         let mut ops = vec![];
@@ -136,7 +137,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         self.data_availability_checker
             .custody_context()
-            .update_and_backfill_custody_count_at_epoch(epoch);
+            .update_and_backfill_custody_count_at_epoch(epoch, expected_cgc);
         self.safely_backfill_data_column_custody_info(epoch)
             .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?;

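For context, the intent is that the caller snapshots the CGC when the backfill batch is requested and threads that snapshot through to the import. A hypothetical sketch of such a call site (the function name, the `current_cgc`/`columns` parameters, and this exact wiring are assumptions for illustration; only `import_historical_data_column_batch` and its new `expected_cgc` argument come from this commit, and imports are elided):

```rust
// Hypothetical call site in the custody backfill sync path.
fn backfill_one_epoch<T: BeaconChainTypes>(
    chain: &BeaconChain<T>,
    epoch: Epoch,
    current_cgc: u64, // assumed: snapshotted by the sync layer when the batch was requested
    columns: DataColumnSidecarList<T::EthSpec>,
) -> Result<usize, HistoricalDataColumnError> {
    // If a validator registration bumps the CGC while this batch is in flight,
    // `current_cgc` goes stale and the guard inside
    // `backfill_validator_custody_requirements` skips the prune/update.
    chain.import_historical_data_column_batch(epoch, columns, current_cgc)
}
```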

@@ -3182,6 +3182,8 @@ async fn weak_subjectivity_sync_test(
     assert_eq!(store.get_anchor_info().state_upper_limit, Slot::new(0));
 }

+// This test prunes data columns from epoch 0 and then tries to re-import them via
+// the same code paths that custody backfill sync uses to import data columns.
 #[tokio::test]
 async fn test_import_historical_data_columns_batch() {
     let spec = ForkName::Fulu.make_genesis_spec(E::default_spec());
@@ -3189,6 +3191,7 @@ async fn test_import_historical_data_columns_batch() {
     let store = get_store_generic(&db_path, StoreConfig::default(), spec);
     let start_slot = Epoch::new(0).start_slot(E::slots_per_epoch()) + 1;
     let end_slot = Epoch::new(0).end_slot(E::slots_per_epoch());
+    let cgc = 128;

     let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT);
@@ -3208,6 +3211,7 @@ async fn test_import_historical_data_columns_batch() {
     let mut data_columns_list = vec![];

+    // Get all data columns for epoch 0
     for block in block_root_iter {
         let (block_root, _) = block.unwrap();
         let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
@@ -3227,6 +3231,7 @@ async fn test_import_historical_data_columns_batch() {
     harness.advance_slot();

+    // Prune data columns
     harness
         .chain
         .store
@@ -3238,21 +3243,25 @@ async fn test_import_historical_data_columns_batch() {
         .forwards_iter_block_roots_until(start_slot, end_slot)
         .unwrap();

+    // Assert that data columns no longer exist for epoch 0
     for block in block_root_iter {
         let (block_root, _) = block.unwrap();
         let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
         assert!(data_columns.is_none())
     }

+    // Re-import deleted data columns
     harness
         .chain
-        .import_historical_data_column_batch(Epoch::new(0), data_columns_list)
+        .import_historical_data_column_batch(Epoch::new(0), data_columns_list, cgc)
         .unwrap();

     let block_root_iter = harness
         .chain
         .forwards_iter_block_roots_until(start_slot, end_slot)
         .unwrap();

+    // Assert that data columns now exist for epoch 0
     for block in block_root_iter {
         let (block_root, _) = block.unwrap();
         let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
@@ -3261,6 +3270,7 @@ async fn test_import_historical_data_columns_batch() {
 }

+// This verifies that a data column sidecar containing mismatched block roots fails to be imported.
+// This also covers any test cases related to data columns with incorrect/invalid/mismatched block roots.
 #[tokio::test]
 async fn test_import_historical_data_columns_batch_mismatched_block_root() {
     let spec = ForkName::Fulu.make_genesis_spec(E::default_spec());
@@ -3268,6 +3278,7 @@ async fn test_import_historical_data_columns_batch_mismatched_block_root() {
     let store = get_store_generic(&db_path, StoreConfig::default(), spec);
     let start_slot = Slot::new(1);
     let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1);
+    let cgc = 128;

     let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT);
@@ -3287,6 +3298,8 @@ async fn test_import_historical_data_columns_batch_mismatched_block_root() {
     let mut data_columns_list = vec![];

+    // Get all data columns from start_slot to end_slot
+    // and mutate the data columns with an invalid block root
     for block in block_root_iter {
         let (block_root, _) = block.unwrap();
         let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
@@ -3312,6 +3325,7 @@ async fn test_import_historical_data_columns_batch_mismatched_block_root() {
     harness.advance_slot();

+    // Prune blobs
     harness
         .chain
         .store
@@ -3323,17 +3337,20 @@ async fn test_import_historical_data_columns_batch_mismatched_block_root() {
         .forwards_iter_block_roots_until(start_slot, end_slot)
         .unwrap();

+    // Assert there are no columns between start_slot and end_slot
     for block in block_root_iter {
         let (block_root, _) = block.unwrap();
         let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
         assert!(data_columns.is_none())
     }

+    // Attempt to import data columns with invalid block roots and expect a failure
     let error = harness
         .chain
         .import_historical_data_column_batch(
             start_slot.epoch(E::slots_per_epoch()),
             data_columns_list,
+            cgc,
         )
         .unwrap_err();
@@ -3343,84 +3360,6 @@ async fn test_import_historical_data_columns_batch_mismatched_block_root() {
     ));
 }

-// This should verify that a data column sidecar associated to a block root that doesn't exist in the store cannot
-// be imported.
-#[tokio::test]
-async fn test_import_historical_data_columns_batch_no_block_found() {
-    let spec = ForkName::Fulu.make_genesis_spec(E::default_spec());
-    let db_path = tempdir().unwrap();
-    let store = get_store_generic(&db_path, StoreConfig::default(), spec);
-    let start_slot = Slot::new(1);
-    let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1);
-
-    let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT);
-
-    harness
-        .extend_chain(
-            (E::slots_per_epoch() * 2) as usize,
-            BlockStrategy::OnCanonicalHead,
-            AttestationStrategy::AllValidators,
-        )
-        .await;
-    harness.advance_slot();
-
-    let block_root_iter = harness
-        .chain
-        .forwards_iter_block_roots_until(start_slot, end_slot)
-        .unwrap();
-
-    let mut data_columns_list = vec![];
-
-    for block in block_root_iter {
-        let (block_root, _) = block.unwrap();
-        let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
-        assert!(data_columns.is_some());
-        for data_column in data_columns.unwrap() {
-            let mut data_column = (*data_column).clone();
-            data_column.signed_block_header.message.body_root = Hash256::ZERO;
-            data_columns_list.push(Arc::new(data_column));
-        }
-    }
-
-    harness
-        .extend_chain(
-            (E::slots_per_epoch() * 4) as usize,
-            BlockStrategy::OnCanonicalHead,
-            AttestationStrategy::AllValidators,
-        )
-        .await;
-    harness.advance_slot();
-
-    harness
-        .chain
-        .store
-        .try_prune_blobs(true, Epoch::new(2))
-        .unwrap();
-
-    let block_root_iter = harness
-        .chain
-        .forwards_iter_block_roots_until(start_slot, end_slot)
-        .unwrap();
-
-    for block in block_root_iter {
-        let (block_root, _) = block.unwrap();
-        let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap();
-        assert!(data_columns.is_none())
-    }
-
-    let error = harness
-        .chain
-        .import_historical_data_column_batch(Epoch::new(0), data_columns_list)
-        .unwrap_err();
-
-    assert!(matches!(
-        error,
-        HistoricalDataColumnError::NoBlockFound { .. }
-    ));
-}

 /// Test that blocks and attestations that refer to states around an unaligned split state are
 /// processed correctly.
 #[tokio::test]