From 73e75e3e69f4bfe9d2703a581ece70fd3f3637dd Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Thu, 16 Oct 2025 02:25:44 -0700 Subject: [PATCH 01/16] Ignore extra columns in da cache (#8201) N/A Found this issue in sepolia. Note: the custody requirement for this node is 100. ``` Oct 14 11:25:40.053 DEBUG Reconstructed columns count: 28, block_root: 0x4d7946dec0ab59f2afd46610d7c54af555cb4c2851d9eea7d83dd17cf6e96aae, slot: 8725628 Oct 14 11:25:45.568 WARN Internal availability check failure block_root: 0x4d7946dec0ab59f2afd46610d7c54af555cb4c2851d9eea7d83dd17cf6e96aae, error: Unexpected("too many columns got 128 expected 100") ``` So if any of the block components arrives late, then we reconstruct all 128 columns and try to add it to da cache and have more columns than needed for availability in the cache. There are 2 ways I can think of fixing this: 1. pass only the required columns to the da cache after reconstruction here https://github.com/sigp/lighthouse/blob/60df5f4ab609362711f4f518eb8f03df447bfedb/beacon_node/beacon_chain/src/data_availability_checker.rs#L647-L648 2. Ensure that we add only columns that we need to sample in the da cache. I think this is safer since we can add columns to the cache from multiple code paths and this fixes it at the source. 
~~This PR implements (2).~~ Thought more about it, I think (1) is cleaner since we filter gossip and rpc columns also before calling `put_kzg_verified_data_columns`/ Co-Authored-By: Pawan Dhananjay --- .../src/data_availability_checker.rs | 100 +++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 43b7d8f7ea..c937c32c68 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -644,8 +644,17 @@ impl DataAvailabilityChecker { "Reconstructed columns" ); + let columns_to_sample = self + .custody_context() + .sampling_columns_for_epoch(slot.epoch(T::EthSpec::slots_per_epoch()), &self.spec); + let data_columns_to_import: Vec<_> = data_columns_to_publish + .iter() + .filter(|column| columns_to_sample.contains(&column.index())) + .cloned() + .collect(); + self.availability_cache - .put_kzg_verified_data_columns(*block_root, data_columns_to_publish.clone()) + .put_kzg_verified_data_columns(*block_root, data_columns_to_import) .map(|availability| { DataColumnReconstructionResult::Success(( availability, @@ -1082,6 +1091,95 @@ mod test { verification_result.expect_err("should have failed to verify blocks"); } + #[test] + fn should_exclude_reconstructed_columns_not_required_for_sampling() { + // SETUP + let spec = Arc::new(ForkName::Fulu.make_genesis_spec(E::default_spec())); + let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); + + let da_checker = new_da_checker(spec.clone()); + let custody_context = &da_checker.custody_context; + let all_column_indices_ordered = + init_custody_context_with_ordered_columns(custody_context, &mut rng, &spec); + + // Set custody requirement to 65 columns (enough to trigger reconstruction) + let epoch = Epoch::new(1); + custody_context.register_validators( + vec![(0, 2_048_000_000_000), (1, 32_000_000_000)], // 
64 + 1 + Slot::new(0), + &spec, + ); + let sampling_requirement = custody_context.num_of_data_columns_to_sample(epoch, &spec); + assert_eq!( + sampling_requirement, 65, + "sampling requirement should be 65" + ); + + let (block, data_columns) = generate_rand_block_and_data_columns::( + ForkName::Fulu, + NumBlobs::Number(1), + &mut rng, + &spec, + ); + let block_root = Hash256::random(); + // Add the block to the DA checker + da_checker + .availability_cache + .put_pre_execution_block(block_root, Arc::new(block), BlockImportSource::Gossip) + .expect("should put block"); + + // Add 64 columns to the da checker (enough to be able to reconstruct) + // Order by all_column_indices_ordered, then take first 64 + let custody_columns = all_column_indices_ordered + .iter() + .filter_map(|&col_idx| data_columns.iter().find(|d| d.index == col_idx).cloned()) + .take(64) + .map(|d| { + KzgVerifiedCustodyDataColumn::from_asserted_custody( + KzgVerifiedDataColumn::__new_for_testing(d), + ) + }) + .collect::>(); + + da_checker + .availability_cache + .put_kzg_verified_data_columns(block_root, custody_columns) + .expect("should put custody columns"); + + // Try reconstrucing + let reconstruction_result = da_checker + .reconstruct_data_columns(&block_root) + .expect("should reconstruct columns"); + + // Reconstruction should succeed + let (_availability, reconstructed_columns) = match reconstruction_result { + DataColumnReconstructionResult::Success(result) => result, + e => { + panic!("Expected successful reconstruction {:?}", e); + } + }; + + // Remaining 64 columns should be reconstructed + assert_eq!( + reconstructed_columns.len(), + 64, + "should reconstruct the remaining 64 columns" + ); + + // Only the columns required for custody (65) should be imported into the cache + let sampling_columns = custody_context.sampling_columns_for_epoch(epoch, &spec); + let actual_cached: HashSet = da_checker + .cached_data_column_indexes(&block_root) + .expect("should have cached data columns") 
+ .into_iter() + .collect(); + let expected_sampling_columns = sampling_columns.iter().copied().collect::>(); + assert_eq!( + actual_cached, expected_sampling_columns, + "should cache only the required custody columns, not all reconstructed columns" + ); + } + fn init_custody_context_with_ordered_columns( custody_context: &Arc>, mut rng: &mut StdRng, From d1e06dc40da07221db7611e648313794313f1922 Mon Sep 17 00:00:00 2001 From: SunnysidedJ Date: Thu, 16 Oct 2025 16:20:26 +0100 Subject: [PATCH 02/16] #6853 Adding store tests for data column pruning (#7228) #6853 Update store tests to cover data column pruning Created a helper function `check_data_column_existence` which is a copy of `check_blob_existence` but checking data columns instead. The helper function is then used to check whether data columns are also pruned when blobs are pruned if PeerDAS is enabled. Co-Authored-By: SunnysidedJ Co-Authored-By: Eitan Seri-Levi Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/tests/store_tests.rs | 444 +++++++++++++++++- .../store/src/database/leveldb_impl.rs | 3 +- beacon_node/store/src/hot_cold_store.rs | 41 +- 3 files changed, 468 insertions(+), 20 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 449b5dd043..f2a506ec57 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -3606,9 +3606,10 @@ async fn deneb_prune_blobs_happy_case() { let store = get_store(&db_path); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. 
return; @@ -3657,9 +3658,10 @@ async fn deneb_prune_blobs_no_finalization() { let store = get_store(&db_path); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. return; @@ -3716,29 +3718,39 @@ async fn deneb_prune_blobs_no_finalization() { /// Check that blob pruning does not fail trying to prune across the fork boundary. #[tokio::test] -async fn deneb_prune_blobs_fork_boundary() { - let deneb_fork_epoch = Epoch::new(4); +async fn prune_blobs_across_fork_boundary() { let mut spec = ForkName::Capella.make_genesis_spec(E::default_spec()); + + let deneb_fork_epoch = Epoch::new(4); spec.deneb_fork_epoch = Some(deneb_fork_epoch); let deneb_fork_slot = deneb_fork_epoch.start_slot(E::slots_per_epoch()); + let electra_fork_epoch = Epoch::new(8); + spec.electra_fork_epoch = Some(electra_fork_epoch); + + let fulu_fork_epoch = Epoch::new(12); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let db_path = tempdir().unwrap(); let store = get_store_generic(&db_path, StoreConfig::default(), spec); let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); - let num_blocks = E::slots_per_epoch() * 7; + let blocks_to_deneb_finalization = E::slots_per_epoch() * 7; + let blocks_to_electra_finalization = E::slots_per_epoch() * 4; + let blocks_to_fulu_finalization = E::slots_per_epoch() * 4; - // Finalize to epoch 5. + // Extend the chain to epoch 7 + // Finalize to epoch 5 (Deneb). harness .extend_chain( - num_blocks as usize, + blocks_to_deneb_finalization as usize, BlockStrategy::OnCanonicalHead, AttestationStrategy::AllValidators, ) .await; - // Finalization should be at epoch 5. + // Finalization should be at epoch 5 (Deneb). 
let finalized_epoch = Epoch::new(5); let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); assert_eq!( @@ -3777,6 +3789,116 @@ async fn deneb_prune_blobs_fork_boundary() { assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); check_blob_existence(&harness, pruned_slot, harness.head_slot(), true); + + // Extend the chain to epoch 11 + // Finalize to epoch 9 (Electra) + harness.advance_slot(); + harness + .extend_chain( + blocks_to_electra_finalization as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 9 (Electra). + let finalized_epoch = Epoch::new(9); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All blobs since last pruning during Deneb should still be available. 
+ assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); + + let electra_first_slot = electra_fork_epoch.start_slot(E::slots_per_epoch()); + // Check that blobs exist from the pruned slot to electra + check_blob_existence(&harness, pruned_slot, electra_first_slot - 1, true); + + // Trigger pruning on Electra + let pruned_slot = (electra_fork_epoch + 1).start_slot(E::slots_per_epoch()); + + store.try_prune_blobs(true, finalized_epoch).unwrap(); + assert_eq!(store.get_blob_info().oldest_blob_slot, Some(finalized_slot)); + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + check_blob_existence(&harness, pruned_slot, harness.head_slot(), true); + + // Check that blobs have been pruned up to the pruned slot + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + // Check that blobs exist from electra to the current head + check_blob_existence(&harness, electra_first_slot, harness.head_slot(), true); + + // Extend the chain to epoch 15 + // Finalize to epoch 13 (Fulu) + harness.advance_slot(); + harness + .extend_chain( + blocks_to_fulu_finalization as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 13 (Fulu). + let finalized_epoch = Epoch::new(13); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All blobs since last pruning during Electra should still be available. 
+ assert_eq!(store.get_blob_info().oldest_blob_slot, Some(pruned_slot)); + + let fulu_first_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + // Check that blobs have been pruned up to the pruned slot + check_blob_existence(&harness, Slot::new(0), pruned_slot - 1, false); + // Check that blobs exist from the pruned slot to Fulu + check_blob_existence(&harness, pruned_slot, fulu_first_slot - 1, true); + // Check that blobs do not exist from Fulu to the current head + check_blob_existence(&harness, fulu_first_slot, harness.head_slot(), false); + + // Attempt pruning with at different epochs. No pruning should occur for epochs + // preceding Fulu, as we have already triggered pruning pre-Fulu. Pruning should occur + // for epochs after Fulu. + assert!(fulu_fork_epoch < finalized_epoch); + for data_availability_boundary in [ + Epoch::new(7), + electra_fork_epoch, + Epoch::new(9), + Epoch::new(11), + fulu_fork_epoch, + Epoch::new(15), + ] { + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + let oldest_slot = data_availability_boundary.start_slot(E::slots_per_epoch()); + + if data_availability_boundary < fulu_fork_epoch { + // Pre Fulu fork epochs + // Check oldest blob slot is not updated. 
+ assert!(store.get_blob_info().oldest_blob_slot >= Some(oldest_slot)); + check_blob_existence(&harness, Slot::new(0), oldest_slot - 1, false); + // Blobs should exist + check_blob_existence(&harness, oldest_slot, harness.head_slot(), true); + } else { + // Fulu fork epochs + // Pruning should have been triggered + assert!(store.get_blob_info().oldest_blob_slot <= Some(oldest_slot)); + // Oldest blost slot should never be greater than the first fulu slot + let fulu_first_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + assert!(store.get_blob_info().oldest_blob_slot <= Some(fulu_first_slot)); + // Blobs should not exist post-Fulu + check_blob_existence(&harness, oldest_slot, harness.head_slot(), false); + // Data columns should exist post-Fulu + check_data_column_existence(&harness, oldest_slot, harness.head_slot(), true); + }; + } } /// Check that blob pruning prunes blobs older than the data availability boundary with margin @@ -3805,9 +3927,10 @@ async fn deneb_prune_blobs_margin_test(margin: u64) { let store = get_store_generic(&db_path, config, test_spec::()); if store.get_chain_spec().is_peer_das_scheduled() { - // TODO(fulu): add prune tests for Fulu / PeerDAS data columns. + // Blob pruning no longer needed since Fulu / PeerDAS return; } + let Some(deneb_fork_epoch) = store.get_chain_spec().deneb_fork_epoch else { // No-op prior to Deneb. return; @@ -3917,6 +4040,309 @@ fn check_blob_existence( } } +/// Check that blob pruning prunes data columns older than the data availability boundary. +#[tokio::test] +async fn fulu_prune_data_columns_happy_case() { + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + return; + } + let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else { + // No-op prior to Fulu. 
+ return; + }; + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let num_blocks_produced = E::slots_per_epoch() * 8; + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + num_blocks_produced as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Prior to manual pruning with an artifically low data availability boundary all data columns + // should be stored. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(1), harness.head_slot(), true); + + // Trigger pruning of data columns older than epoch 2. + let data_availability_boundary = Epoch::new(2); + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest data column slot is updated accordingly and prior data columns have been + // deleted. + let oldest_data_column_slot = store + .get_data_column_info() + .oldest_data_column_slot + .unwrap(); + assert_eq!( + oldest_data_column_slot, + data_availability_boundary.start_slot(E::slots_per_epoch()) + ); + check_data_column_existence(&harness, Slot::new(0), oldest_data_column_slot - 1, false); + check_data_column_existence(&harness, oldest_data_column_slot, harness.head_slot(), true); +} + +/// Check that blob pruning does not prune data columns without finalization. +#[tokio::test] +async fn fulu_prune_data_columns_no_finalization() { + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + return; + } + let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else { + // No-op prior to Fulu. 
+ return; + }; + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let initial_num_blocks = E::slots_per_epoch() * 5; + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // Finalize to epoch 3. + harness + .extend_chain( + initial_num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Extend the chain for another few epochs without attestations. + let unfinalized_num_blocks = E::slots_per_epoch() * 3; + harness.advance_slot(); + harness + .extend_chain( + unfinalized_num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::SomeValidators(vec![]), + ) + .await; + + // Finalization should be at epoch 3. + let finalized_slot = Slot::new(E::slots_per_epoch() * 3); + assert_eq!(harness.get_current_state().finalized_checkpoint().epoch, 3); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All data columns should still be available. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Attempt pruning of data columns older than epoch 4, which is newer than finalization. + let data_availability_boundary = Epoch::new(4); + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest data column slot is only updated to finalization, and NOT to the DAB. + let oldest_data_column_slot = store + .get_data_column_info() + .oldest_data_column_slot + .unwrap(); + assert_eq!(oldest_data_column_slot, finalized_slot); + check_data_column_existence(&harness, Slot::new(0), finalized_slot - 1, false); + check_data_column_existence(&harness, finalized_slot, harness.head_slot(), true); +} + +/// Check that data column pruning does not fail trying to prune across the fork boundary. 
+#[tokio::test] +async fn fulu_prune_data_columns_fork_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + let fulu_fork_epoch = Epoch::new(4); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + panic!("PeerDAS not scheduled"); + //return; + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + let num_blocks = E::slots_per_epoch() * 7; + + // Finalize to epoch 5. + harness + .extend_chain( + num_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Finalization should be at epoch 5. + let finalized_epoch = Epoch::new(5); + let finalized_slot = finalized_epoch.start_slot(E::slots_per_epoch()); + assert_eq!( + harness.get_current_state().finalized_checkpoint().epoch, + finalized_epoch + ); + assert_eq!(store.get_split_slot(), finalized_slot); + + // All data columns should still be available. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Attempt pruning with data availability epochs that precede the fork epoch. + // No pruning should occur. + assert!(fulu_fork_epoch < finalized_epoch); + for data_availability_boundary in [Epoch::new(0), Epoch::new(3), fulu_fork_epoch] { + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest data column slot is not updated. + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + } + // All data columns should still be available. 
+ check_data_column_existence(&harness, Slot::new(0), harness.head_slot(), true); + + // Prune one epoch past the fork. + let pruned_slot = (fulu_fork_epoch + 1).start_slot(E::slots_per_epoch()); + store.try_prune_blobs(true, fulu_fork_epoch + 1).unwrap(); + assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(pruned_slot) + ); + check_data_column_existence(&harness, Slot::new(0), pruned_slot - 1, false); + check_data_column_existence(&harness, pruned_slot, harness.head_slot(), true); +} + +/// Check that blob pruning prunes data columns older than the data availability boundary with +/// margin applied. +#[tokio::test] +async fn fulu_prune_data_columns_margin1() { + fulu_prune_data_columns_margin_test(1).await; +} + +#[tokio::test] +async fn fulu_prune_data_columns_margin3() { + fulu_prune_data_columns_margin_test(3).await; +} + +#[tokio::test] +async fn fulu_prune_data_columns_margin4() { + fulu_prune_data_columns_margin_test(4).await; +} + +async fn fulu_prune_data_columns_margin_test(margin: u64) { + let config = StoreConfig { + blob_prune_margin_epochs: margin, + ..StoreConfig::default() + }; + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, config, test_spec::()); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + return; + } + let Some(fulu_fork_epoch) = store.get_chain_spec().fulu_fork_epoch else { + // No-op prior to Fulu. + return; + }; + let fulu_fork_slot = fulu_fork_epoch.start_slot(E::slots_per_epoch()); + + let num_blocks_produced = E::slots_per_epoch() * 8; + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + num_blocks_produced as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Prior to manual pruning with an artifically low data availability boundary all blobs should + // be stored. 
+ assert_eq!( + store.get_data_column_info().oldest_data_column_slot, + Some(fulu_fork_slot) + ); + check_data_column_existence(&harness, Slot::new(1), harness.head_slot(), true); + + // Trigger blob pruning of blobs older than epoch 6 - margin (6 is the minimum, due to + // finalization). + let data_availability_boundary = Epoch::new(6); + let effective_data_availability_boundary = + data_availability_boundary - store.get_config().blob_prune_margin_epochs; + assert!( + effective_data_availability_boundary > 0, + "must be > 0 because epoch 0 won't get pruned alone" + ); + store + .try_prune_blobs(true, data_availability_boundary) + .unwrap(); + + // Check oldest blob slot is updated accordingly and prior blobs have been deleted. + let oldest_data_column_slot = store + .get_data_column_info() + .oldest_data_column_slot + .unwrap(); + assert_eq!( + oldest_data_column_slot, + effective_data_availability_boundary.start_slot(E::slots_per_epoch()) + ); + check_data_column_existence(&harness, Slot::new(0), oldest_data_column_slot - 1, false); + check_data_column_existence(&harness, oldest_data_column_slot, harness.head_slot(), true); +} + +/// Check tat there are data column sidecars (or not) at every slot in the range. +fn check_data_column_existence( + harness: &TestHarness, + start_slot: Slot, + end_slot: Slot, + should_exist: bool, +) { + let mut columns_seen = 0; + for (block_root, slot) in harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap() + .map(Result::unwrap) + { + if let Some(columns) = harness.chain.store.get_data_columns(&block_root).unwrap() { + assert!(should_exist, "columns at slot {slot} exist but should not"); + columns_seen += columns.len(); + } else { + // We don't actually store empty columns, so unfortunately we can't assert anything + // meaningful here (like asserting that the column should not exist). 
+ } + } + if should_exist { + assert_ne!(columns_seen, 0, "expected non-zero number of columns"); + } +} + #[tokio::test] async fn prune_historic_states() { let num_blocks_produced = E::slots_per_epoch() * 5; diff --git a/beacon_node/store/src/database/leveldb_impl.rs b/beacon_node/store/src/database/leveldb_impl.rs index 385f35a33d..8fdd5812ea 100644 --- a/beacon_node/store/src/database/leveldb_impl.rs +++ b/beacon_node/store/src/database/leveldb_impl.rs @@ -282,7 +282,8 @@ impl LevelDB { ) -> Result<(), Error> { let mut leveldb_batch = Writebatch::new(); let iter = self.db.iter(self.read_options()); - + let start_key = BytesKey::from_vec(column.as_bytes().to_vec()); + iter.seek(&start_key); iter.take_while(move |(key, _)| key.matches_column(column)) .for_each(|(key, value)| { if f(&value).unwrap_or(false) { diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 0d8a65e064..d58cf2e731 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -3178,13 +3178,14 @@ impl, Cold: ItemStore> HotColdDB self.try_prune_blobs(force, min_data_availability_boundary) } - /// Try to prune blobs older than the data availability boundary. + /// Try to prune blobs and data columns older than the data availability boundary. /// /// Blobs from the epoch `data_availability_boundary - blob_prune_margin_epochs` are retained. /// This epoch is an _exclusive_ endpoint for the pruning process. /// - /// This function only supports pruning blobs older than the split point, which is older than - /// (or equal to) finalization. Pruning blobs newer than finalization is not supported. + /// This function only supports pruning blobs and data columns older than the split point, + /// which is older than (or equal to) finalization. Pruning blobs and data columns newer than + /// finalization is not supported. /// /// This function also assumes that the split is stationary while it runs. 
It should only be /// run from the migrator thread (where `migrate_database` runs) or the database manager. @@ -3208,6 +3209,7 @@ impl, Cold: ItemStore> HotColdDB } let blob_info = self.get_blob_info(); + let data_column_info = self.get_data_column_info(); let Some(oldest_blob_slot) = blob_info.oldest_blob_slot else { error!("Slot of oldest blob is not known"); return Err(HotColdDBError::BlobPruneLogicError.into()); @@ -3306,13 +3308,7 @@ impl, Cold: ItemStore> HotColdDB } } - let new_blob_info = BlobInfo { - oldest_blob_slot: Some(end_slot + 1), - blobs_db: blob_info.blobs_db, - }; - - let op = self.compare_and_set_blob_info(blob_info, new_blob_info)?; - self.do_atomically_with_block_and_blobs_cache(vec![StoreOp::KeyValueOp(op)])?; + self.update_blob_or_data_column_info(start_epoch, end_slot, blob_info, data_column_info)?; debug!("Blob pruning complete"); @@ -3379,6 +3375,31 @@ impl, Cold: ItemStore> HotColdDB Ok(()) } + + fn update_blob_or_data_column_info( + &self, + start_epoch: Epoch, + end_slot: Slot, + blob_info: BlobInfo, + data_column_info: DataColumnInfo, + ) -> Result<(), Error> { + let op = if self.spec.is_peer_das_enabled_for_epoch(start_epoch) { + let new_data_column_info = DataColumnInfo { + oldest_data_column_slot: Some(end_slot + 1), + }; + self.compare_and_set_data_column_info(data_column_info, new_data_column_info)? + } else { + let new_blob_info = BlobInfo { + oldest_blob_slot: Some(end_slot + 1), + blobs_db: blob_info.blobs_db, + }; + self.compare_and_set_blob_info(blob_info, new_blob_info)? + }; + + self.do_atomically_with_block_and_blobs_cache(vec![StoreOp::KeyValueOp(op)])?; + + Ok(()) + } } /// Advance the split point of the store, copying new finalized states to the freezer. 
From f13d0615fdeabbce3889e56c2d31df903dc0f495 Mon Sep 17 00:00:00 2001 From: Mac L Date: Thu, 16 Oct 2025 20:10:42 +0400 Subject: [PATCH 03/16] Add `eip_3076` crate (#8206) #7894 Moves the `Interchange` format from `slashing_protection` and thus removes the dependency on `slashing_protection` from `eth2` which can now just depend on the slimmer `eip_3076` crate. Co-Authored-By: Mac L --- Cargo.lock | 15 ++- Cargo.toml | 2 + common/eip_3076/Cargo.toml | 20 ++++ .../eip_3076/src/lib.rs | 107 +++++++++++++++++- common/eth2/Cargo.toml | 2 +- common/eth2/src/lighthouse_vc/std_types.rs | 2 +- consensus/types/src/contribution_and_proof.rs | 1 - .../slashing_protection/Cargo.toml | 3 +- .../src/bin/test_generator.rs | 4 +- .../src/interchange_test.rs | 2 +- .../slashing_protection/src/lib.rs | 5 +- .../src/slashing_database.rs | 10 +- 12 files changed, 155 insertions(+), 18 deletions(-) create mode 100644 common/eip_3076/Cargo.toml rename validator_client/slashing_protection/src/interchange.rs => common/eip_3076/src/lib.rs (64%) diff --git a/Cargo.lock b/Cargo.lock index 31cccc6a98..59e7bda170 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2581,6 +2581,18 @@ dependencies = [ "sha2 0.10.8", ] +[[package]] +name = "eip_3076" +version = "0.1.0" +dependencies = [ + "arbitrary", + "ethereum_serde_utils", + "serde", + "serde_json", + "tempfile", + "types", +] + [[package]] name = "either" version = "1.15.0" @@ -2848,6 +2860,7 @@ name = "eth2" version = "0.1.0" dependencies = [ "derivative", + "eip_3076", "either", "enr", "eth2_keystore", @@ -2867,7 +2880,6 @@ dependencies = [ "sensitive_url", "serde", "serde_json", - "slashing_protection", "ssz_types", "test_random_derive", "tokio", @@ -8832,6 +8844,7 @@ name = "slashing_protection" version = "0.1.0" dependencies = [ "arbitrary", + "eip_3076", "ethereum_serde_utils", "filesystem", "r2d2", diff --git a/Cargo.toml b/Cargo.toml index a46dc355e7..ae84d645bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ 
"common/compare_fields_derive", "common/deposit_contract", "common/directory", + "common/eip_3076", "common/eth2", "common/eth2_config", "common/eth2_interop_keypairs", @@ -135,6 +136,7 @@ directory = { path = "common/directory" } dirs = "3" discv5 = { version = "0.10", features = ["libp2p"] } doppelganger_service = { path = "validator_client/doppelganger_service" } +eip_3076 = { path = "common/eip_3076" } either = "1.9" environment = { path = "lighthouse/environment" } eth2 = { path = "common/eth2" } diff --git a/common/eip_3076/Cargo.toml b/common/eip_3076/Cargo.toml new file mode 100644 index 0000000000..851ef26238 --- /dev/null +++ b/common/eip_3076/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "eip_3076" +version = "0.1.0" +authors = ["Sigma Prime "] +edition = { workspace = true } + +[features] +default = [] +arbitrary-fuzz = ["dep:arbitrary", "types/arbitrary"] +json = ["dep:serde_json"] + +[dependencies] +arbitrary = { workspace = true, features = ["derive"], optional = true } +ethereum_serde_utils = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true, optional = true } +types = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/validator_client/slashing_protection/src/interchange.rs b/common/eip_3076/src/lib.rs similarity index 64% rename from validator_client/slashing_protection/src/interchange.rs rename to common/eip_3076/src/lib.rs index 95a39c50e4..2d47a77de4 100644 --- a/validator_client/slashing_protection/src/interchange.rs +++ b/common/eip_3076/src/lib.rs @@ -1,10 +1,15 @@ -use crate::InterchangeError; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::collections::{HashMap, HashSet}; +#[cfg(feature = "json")] use std::io; use types::{Epoch, Hash256, PublicKeyBytes, Slot}; +#[derive(Debug)] +pub enum Error { + MaxInconsistent, +} + #[derive(Debug, Clone, PartialEq, Deserialize, Serialize)] #[serde(deny_unknown_fields)] #[cfg_attr(feature = "arbitrary-fuzz", 
derive(arbitrary::Arbitrary))] @@ -53,10 +58,12 @@ pub struct Interchange { } impl Interchange { + #[cfg(feature = "json")] pub fn from_json_str(json: &str) -> Result { serde_json::from_str(json) } + #[cfg(feature = "json")] pub fn from_json_reader(mut reader: impl std::io::Read) -> Result { // We read the entire file into memory first, as this is *a lot* faster than using // `serde_json::from_reader`. See https://github.com/serde-rs/json/issues/160 @@ -65,6 +72,7 @@ impl Interchange { Ok(Interchange::from_json_str(&json_str)?) } + #[cfg(feature = "json")] pub fn write_to(&self, writer: impl std::io::Write) -> Result<(), serde_json::Error> { serde_json::to_writer(writer, self) } @@ -87,7 +95,7 @@ impl Interchange { } /// Minify an interchange by constructing a synthetic block & attestation for each validator. - pub fn minify(&self) -> Result { + pub fn minify(&self) -> Result { // Map from pubkey to optional max block and max attestation. let mut validator_data = HashMap::, Option)>::new(); @@ -124,7 +132,7 @@ impl Interchange { } } (None, None) => {} - _ => return Err(InterchangeError::MaxInconsistent), + _ => return Err(Error::MaxInconsistent), }; // Find maximum block slot. 
@@ -157,3 +165,96 @@ impl Interchange { }) } } + +#[cfg(feature = "json")] +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + use tempfile::tempdir; + use types::FixedBytesExtended; + + fn get_interchange() -> Interchange { + Interchange { + metadata: InterchangeMetadata { + interchange_format_version: 5, + genesis_validators_root: Hash256::from_low_u64_be(555), + }, + data: vec![ + InterchangeData { + pubkey: PublicKeyBytes::deserialize(&[1u8; 48]).unwrap(), + signed_blocks: vec![SignedBlock { + slot: Slot::new(100), + signing_root: Some(Hash256::from_low_u64_be(1)), + }], + signed_attestations: vec![SignedAttestation { + source_epoch: Epoch::new(0), + target_epoch: Epoch::new(5), + signing_root: Some(Hash256::from_low_u64_be(2)), + }], + }, + InterchangeData { + pubkey: PublicKeyBytes::deserialize(&[2u8; 48]).unwrap(), + signed_blocks: vec![], + signed_attestations: vec![], + }, + ], + } + } + + #[test] + fn test_roundtrip() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("interchange.json"); + + let interchange = get_interchange(); + + let mut file = File::create(&file_path).unwrap(); + interchange.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(interchange, from_file); + } + + #[test] + fn test_empty_roundtrip() { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("empty.json"); + + let empty = Interchange { + metadata: InterchangeMetadata { + interchange_format_version: 5, + genesis_validators_root: Hash256::zero(), + }, + data: vec![], + }; + + let mut file = File::create(&file_path).unwrap(); + empty.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(empty, from_file); + } + + #[test] + fn test_minify_roundtrip() { + let interchange = get_interchange(); + + let minified = 
interchange.minify().unwrap(); + + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("minified.json"); + + let mut file = File::create(&file_path).unwrap(); + minified.write_to(&mut file).unwrap(); + + let file = File::open(&file_path).unwrap(); + let from_file = Interchange::from_json_reader(file).unwrap(); + + assert_eq!(minified, from_file); + } +} diff --git a/common/eth2/Cargo.toml b/common/eth2/Cargo.toml index 81666a6421..46066a559f 100644 --- a/common/eth2/Cargo.toml +++ b/common/eth2/Cargo.toml @@ -10,6 +10,7 @@ lighthouse = [] [dependencies] derivative = { workspace = true } +eip_3076 = { workspace = true } either = { workspace = true } enr = { version = "0.13.0", features = ["ed25519"] } eth2_keystore = { workspace = true } @@ -29,7 +30,6 @@ reqwest-eventsource = "0.5.0" sensitive_url = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -slashing_protection = { workspace = true } ssz_types = { workspace = true } test_random_derive = { path = "../../common/test_random_derive" } types = { workspace = true } diff --git a/common/eth2/src/lighthouse_vc/std_types.rs b/common/eth2/src/lighthouse_vc/std_types.rs index ae192312bd..0290bdd0b7 100644 --- a/common/eth2/src/lighthouse_vc/std_types.rs +++ b/common/eth2/src/lighthouse_vc/std_types.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use types::{Address, Graffiti, PublicKeyBytes}; use zeroize::Zeroizing; -pub use slashing_protection::interchange::Interchange; +pub use eip_3076::Interchange; #[derive(Debug, Deserialize, Serialize, PartialEq)] pub struct GetFeeRecipientResponse { diff --git a/consensus/types/src/contribution_and_proof.rs b/consensus/types/src/contribution_and_proof.rs index 85c9ac15fb..4d70cd1f8a 100644 --- a/consensus/types/src/contribution_and_proof.rs +++ b/consensus/types/src/contribution_and_proof.rs @@ -10,7 +10,6 @@ use test_random_derive::TestRandom; use tree_hash_derive::TreeHash; /// A Validators aggregate sync 
committee contribution and selection proof. - #[cfg_attr( feature = "arbitrary", derive(arbitrary::Arbitrary), diff --git a/validator_client/slashing_protection/Cargo.toml b/validator_client/slashing_protection/Cargo.toml index 3860af514d..6a778c5de3 100644 --- a/validator_client/slashing_protection/Cargo.toml +++ b/validator_client/slashing_protection/Cargo.toml @@ -6,11 +6,12 @@ edition = { workspace = true } autotests = false [features] -arbitrary-fuzz = ["types/arbitrary-fuzz"] +arbitrary-fuzz = ["types/arbitrary-fuzz", "eip_3076/arbitrary-fuzz"] portable = ["types/portable"] [dependencies] arbitrary = { workspace = true, features = ["derive"] } +eip_3076 = { workspace = true, features = ["json"] } ethereum_serde_utils = { workspace = true } filesystem = { workspace = true } r2d2 = { workspace = true } diff --git a/validator_client/slashing_protection/src/bin/test_generator.rs b/validator_client/slashing_protection/src/bin/test_generator.rs index 4576231b7b..dfda7983f7 100644 --- a/validator_client/slashing_protection/src/bin/test_generator.rs +++ b/validator_client/slashing_protection/src/bin/test_generator.rs @@ -1,7 +1,5 @@ +use eip_3076::{Interchange, InterchangeData, InterchangeMetadata, SignedAttestation, SignedBlock}; use slashing_protection::SUPPORTED_INTERCHANGE_FORMAT_VERSION; -use slashing_protection::interchange::{ - Interchange, InterchangeData, InterchangeMetadata, SignedAttestation, SignedBlock, -}; use slashing_protection::interchange_test::{MultiTestCase, TestCase}; use slashing_protection::test_utils::{DEFAULT_GENESIS_VALIDATORS_ROOT, pubkey}; use std::fs::{self, File}; diff --git a/validator_client/slashing_protection/src/interchange_test.rs b/validator_client/slashing_protection/src/interchange_test.rs index 1bc4326b4f..ebe0105f24 100644 --- a/validator_client/slashing_protection/src/interchange_test.rs +++ b/validator_client/slashing_protection/src/interchange_test.rs @@ -1,8 +1,8 @@ use crate::{ SigningRoot, SlashingDatabase, - 
interchange::{Interchange, SignedAttestation, SignedBlock}, test_utils::{DEFAULT_GENESIS_VALIDATORS_ROOT, pubkey}, }; +use eip_3076::{Interchange, SignedAttestation, SignedBlock}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use tempfile::tempdir; diff --git a/validator_client/slashing_protection/src/lib.rs b/validator_client/slashing_protection/src/lib.rs index ded64adb49..917d51d38b 100644 --- a/validator_client/slashing_protection/src/lib.rs +++ b/validator_client/slashing_protection/src/lib.rs @@ -1,7 +1,6 @@ mod attestation_tests; mod block_tests; mod extra_interchange_tests; -pub mod interchange; pub mod interchange_test; mod parallel_tests; mod registration_tests; @@ -10,6 +9,10 @@ mod signed_block; mod slashing_database; pub mod test_utils; +pub mod interchange { + pub use eip_3076::{Interchange, InterchangeMetadata}; +} + pub use crate::signed_attestation::{InvalidAttestation, SignedAttestation}; pub use crate::signed_block::{InvalidBlock, SignedBlock}; pub use crate::slashing_database::{ diff --git a/validator_client/slashing_protection/src/slashing_database.rs b/validator_client/slashing_protection/src/slashing_database.rs index 7d8947a584..ce32299a51 100644 --- a/validator_client/slashing_protection/src/slashing_database.rs +++ b/validator_client/slashing_protection/src/slashing_database.rs @@ -1,10 +1,10 @@ -use crate::interchange::{ - Interchange, InterchangeData, InterchangeMetadata, SignedAttestation as InterchangeAttestation, - SignedBlock as InterchangeBlock, -}; use crate::signed_attestation::InvalidAttestation; use crate::signed_block::InvalidBlock; use crate::{NotSafe, Safe, SignedAttestation, SignedBlock, SigningRoot, signing_root_from_row}; +use eip_3076::{ + Interchange, InterchangeData, InterchangeMetadata, SignedAttestation as InterchangeAttestation, + SignedBlock as InterchangeBlock, +}; use filesystem::restrict_file_permissions; use r2d2_sqlite::SqliteConnectionManager; use rusqlite::{OptionalExtension, 
Transaction, TransactionBehavior, params}; @@ -1219,7 +1219,7 @@ pub enum InterchangeError { interchange_file: Hash256, client: Hash256, }, - MaxInconsistent, + Eip3076(eip_3076::Error), SummaryInconsistent, SQLError(String), SQLPoolError(r2d2::Error), From 76a37a0aef2c260399dcf5d8081b452df1190350 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 17 Oct 2025 10:25:30 +1100 Subject: [PATCH 04/16] Revert incorrect fix made in #8179 (#8215) This PR reverts #8179. It turns out that the fix was invalid because an unknown root is always not a finalized descendant: https://github.com/sigp/lighthouse/blob/522bd9e9c6ac167f2231525e937c9ebbcb86cf6e/consensus/proto_array/src/proto_array.rs#L976-L979 so for any data columns with unknown parents, it will always penalise the gossip peer and disconnect it pretty quickly. On a small network, the node may lose all of its peers. The impact is pretty obvious when the peer count is small and sync speed is slow, and is therefore easily reproducible by running a fresh supernode on devnet-3. This isn't as obvious on a live testnet like holesky / sepolia, we haven't noticed this, probably due to its high peer count and sync speed - the nodes might be able to reach head quickly before losing too many peers. The previous behaviour isn't ideal but safe: triggering unknown parent lookup and penalise the bad peer if it happens to be malicious or faulty. So for now it's safer to revert the change and plan for a proper fix after the v8 release. 
Co-Authored-By: Jimmy Chen --- .../beacon_chain/src/data_column_verification.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 01e79c49aa..07f85b045a 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -626,21 +626,22 @@ fn verify_parent_block_and_finalized_descendant( chain: &BeaconChain, ) -> Result { let fork_choice = chain.canonical_head.fork_choice_read_lock(); - let block_parent_root = data_column.block_parent_root(); - - // Do not process a column that does not descend from the finalized root. - if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { - return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); - } // We have already verified that the column is past finalization, so we can // just check fork choice for the block's parent. + let block_parent_root = data_column.block_parent_root(); let Some(parent_block) = fork_choice.get_block(&block_parent_root) else { return Err(GossipDataColumnError::ParentUnknown { parent_root: block_parent_root, }); }; + // Do not process a column that does not descend from the finalized root. + // We just loaded the parent_block, so we can be sure that it exists in fork choice. 
+ if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { + return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); + } + Ok(parent_block) } From 79716f6ec1e42d1ee861cf68217ad5b9626cff17 Mon Sep 17 00:00:00 2001 From: Odinson Date: Fri, 17 Oct 2025 14:19:13 +0530 Subject: [PATCH 05/16] Max reconstruction delay as a function of slot time (#8067) Fixes #8054 Co-Authored-By: PoulavBhowmick03 --- .../src/scheduler/work_reprocessing_queue.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 8c33cf5869..9ff26e7841 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -84,8 +84,9 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; -/// Trigger reconstruction if we are this many seconds into the current slot -pub const RECONSTRUCTION_DEADLINE: Duration = Duration::from_millis(3000); +/// Fraction of slot duration after which column reconstruction is triggered, makes it easier for +/// different slot timings to have a generalised deadline +pub const RECONSTRUCTION_DEADLINE: (u64, u64) = (1, 4); /// Messages that the scheduler can receive. 
#[derive(AsRefStr)] @@ -756,13 +757,17 @@ impl ReprocessQueue { } InboundEvent::Msg(DelayColumnReconstruction(request)) => { let mut reconstruction_delay = QUEUED_RECONSTRUCTION_DELAY; + let slot_duration = self.slot_clock.slot_duration().as_millis() as u64; + let reconstruction_deadline_millis = + (slot_duration * RECONSTRUCTION_DEADLINE.0) / RECONSTRUCTION_DEADLINE.1; + let reconstruction_deadline = Duration::from_millis(reconstruction_deadline_millis); if let Some(seconds_from_current_slot) = self.slot_clock.seconds_from_current_slot_start() && let Some(current_slot) = self.slot_clock.now() - && seconds_from_current_slot >= RECONSTRUCTION_DEADLINE + && seconds_from_current_slot >= reconstruction_deadline && current_slot == request.slot { - // If we are at least `RECONSTRUCTION_DEADLINE` seconds into the current slot, + // If we are at least `reconstruction_deadline` seconds into the current slot, // and the reconstruction request is for the current slot, process reconstruction immediately. reconstruction_delay = Duration::from_secs(0); } From 2f8587301d92c076c9811ac6fde3aaac2cf10418 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Mon, 20 Oct 2025 14:14:14 +1100 Subject: [PATCH 06/16] More proposer shuffling cleanup (#8130) Addressing more review comments from: - https://github.com/sigp/lighthouse/pull/8101 I've also tweaked a few more things that I think are minor bugs. - Instrument `ensure_state_can_determine_proposers_for_epoch` - Fix `block_root` usage in `compute_proposer_duties_from_head`. This was a regression introduced in 8101 :grimacing: . - Update the `state_advance_timer` to prime the next-epoch proposer cache post-Fulu. 
Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 48 ++++-- .../beacon_chain/src/beacon_proposer_cache.rs | 19 +-- .../beacon_chain/src/block_verification.rs | 2 - .../beacon_chain/src/state_advance_timer.rs | 66 +++++--- beacon_node/beacon_chain/tests/store_tests.rs | 141 +++++++++++++++++- beacon_node/http_api/src/proposer_duties.rs | 17 ++- .../http_api/tests/interactive_tests.rs | 107 +++++++++++++ consensus/types/src/chain_spec.rs | 18 ++- 8 files changed, 359 insertions(+), 59 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 85ccb96f69..e8db154a9b 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -4726,6 +4726,11 @@ impl BeaconChain { // efficient packing of execution blocks. Err(Error::SkipProposerPreparation) } else { + debug!( + ?shuffling_decision_root, + epoch = %proposal_epoch, + "Proposer shuffling cache miss for proposer prep" + ); let head = self.canonical_head.cached_head(); Ok(( head.head_state_root(), @@ -6557,6 +6562,26 @@ impl BeaconChain { } } + /// This function provides safe and efficient multi-threaded access to the beacon proposer cache. + /// + /// The arguments are: + /// + /// - `shuffling_decision_block`: The block root of the decision block for the desired proposer + /// shuffling. This should be computed using one of the methods for computing proposer + /// shuffling decision roots, e.g. `BeaconState::proposer_shuffling_decision_root_at_epoch`. + /// - `proposal_epoch`: The epoch at which the proposer shuffling is required. + /// - `accessor`: A closure to run against the proposers for the selected epoch. Usually this + /// closure just grabs a single proposer, or takes the vec of proposers for the epoch. + /// - `state_provider`: A closure to compute a state suitable for determining the shuffling. 
+ /// This closure is evaluated lazily ONLY in the case that a cache miss occurs. It is + /// recommended for code that wants to keep track of cache misses to produce a log and/or + /// increment a metric inside this closure. + /// + /// This function makes use of closures in order to efficiently handle concurrent accesses to + /// the cache. + /// + /// The error type is polymorphic, if in doubt you can use `BeaconChainError`. You might need + /// to use a turbofish if type inference can't work it out. pub fn with_proposer_cache + From>( &self, shuffling_decision_block: Hash256, @@ -6575,12 +6600,6 @@ impl BeaconChain { // If it is already initialised, then `get_or_try_init` will return immediately without // executing the initialisation code at all. let epoch_block_proposers = cache_entry.get_or_try_init(|| { - debug!( - ?shuffling_decision_block, - %proposal_epoch, - "Proposer shuffling cache miss" - ); - // Fetch the state on-demand if the required epoch was missing from the cache. // If the caller wants to not compute the state they must return an error here and then // catch it at the call site. @@ -6610,11 +6629,18 @@ impl BeaconChain { } let proposers = state.get_beacon_proposer_indices(proposal_epoch, &self.spec)?; - Ok::<_, E>(EpochBlockProposers::new( - proposal_epoch, - state.fork(), - proposers, - )) + + // Use fork_at_epoch rather than the state's fork, because post-Fulu we may not have + // advanced the state completely into the new epoch. + let fork = self.spec.fork_at_epoch(proposal_epoch); + + debug!( + ?shuffling_decision_block, + epoch = %proposal_epoch, + "Priming proposer shuffling cache" + ); + + Ok::<_, E>(EpochBlockProposers::new(proposal_epoch, fork, proposers)) })?; // Run the accessor function on the computed epoch proposers. 
diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index a64b4981cc..6effce49f8 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -17,6 +17,7 @@ use smallvec::SmallVec; use state_processing::state_advance::partial_state_advance; use std::num::NonZeroUsize; use std::sync::Arc; +use tracing::instrument; use types::non_zero_usize::new_non_zero_usize; use types::{ BeaconState, BeaconStateError, ChainSpec, Epoch, EthSpec, Fork, Hash256, Slot, Unsigned, @@ -199,11 +200,14 @@ pub fn compute_proposer_duties_from_head( .map_err(BeaconChainError::from)?; let dependent_root = state - // The only block which decides its own shuffling is the genesis block. - .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) + .proposer_shuffling_decision_root_at_epoch(request_epoch, head_block_root, &chain.spec) .map_err(BeaconChainError::from)?; - Ok((indices, dependent_root, execution_status, state.fork())) + // Use fork_at_epoch rather than the state's fork, because post-Fulu we may not have advanced + // the state completely into the new epoch. + let fork = chain.spec.fork_at_epoch(request_epoch); + + Ok((indices, dependent_root, execution_status, fork)) } /// If required, advance `state` to the epoch required to determine proposer indices in `target_epoch`. @@ -214,6 +218,7 @@ pub fn compute_proposer_duties_from_head( /// - No-op if `state.current_epoch() == target_epoch`. /// - It must be the case that `state.canonical_root() == state_root`, but this function will not /// check that. 
+#[instrument(skip_all, fields(?state_root, %target_epoch, state_slot = %state.slot()), level = "debug")] pub fn ensure_state_can_determine_proposers_for_epoch( state: &mut BeaconState, state_root: Hash256, @@ -234,14 +239,6 @@ pub fn ensure_state_can_determine_proposers_for_epoch( if state.current_epoch() > maximum_epoch { Err(BeaconStateError::SlotOutOfBounds.into()) } else if state.current_epoch() >= minimum_epoch { - if target_epoch > state.current_epoch() { - let target_slot = target_epoch.start_slot(E::slots_per_epoch()); - - // Advance the state into the same epoch as the block. Use the "partial" method since state - // roots are not important for proposer/attester shuffling. - partial_state_advance(state, Some(state_root), target_slot, spec) - .map_err(BeaconChainError::from)?; - } Ok(()) } else { // State's current epoch is less than the minimum epoch. diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index d0ed8258e5..691293b200 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -950,8 +950,6 @@ impl GossipVerifiedBlock { let proposer_shuffling_decision_block = parent_block.proposer_shuffling_root_for_child_block(block_epoch, &chain.spec); - // We assign to a variable instead of using `if let Some` directly to ensure we drop the - // write lock before trying to acquire it again in the `else` clause. 
let block_slot = block.slot(); let mut opt_parent = None; let proposer = chain.with_proposer_cache::<_, BlockError>( diff --git a/beacon_node/beacon_chain/src/state_advance_timer.rs b/beacon_node/beacon_chain/src/state_advance_timer.rs index 87348cb01b..b10edf2336 100644 --- a/beacon_node/beacon_chain/src/state_advance_timer.rs +++ b/beacon_node/beacon_chain/src/state_advance_timer.rs @@ -333,25 +333,54 @@ fn advance_head(beacon_chain: &Arc>) -> Resu .build_committee_cache(RelativeEpoch::Next, &beacon_chain.spec) .map_err(BeaconChainError::from)?; - // If the `pre_state` is in a later epoch than `state`, pre-emptively add the proposer shuffling - // for the state's current epoch and the committee cache for the state's next epoch. + // The state root is required to prime the proposer cache AND for writing it to disk. + let advanced_state_root = state.update_tree_hash_cache()?; + + // If the `pre_state` is in a later epoch than `state`, pre-emptively update the proposer + // shuffling and attester shuffling caches. if initial_epoch < state.current_epoch() { - // Update the proposer cache. - // - // We supply the `head_block_root` as the decision block since the prior `if` statement guarantees - // the head root is the latest block from the prior epoch. - beacon_chain - .beacon_proposer_cache - .lock() - .insert( - state.current_epoch(), - head_block_root, - state - .get_beacon_proposer_indices(state.current_epoch(), &beacon_chain.spec) - .map_err(BeaconChainError::from)?, - state.fork(), - ) - .map_err(BeaconChainError::from)?; + // Include the proposer shuffling from the current epoch, which is likely to be useful + // pre-Fulu, and probably redundant post-Fulu (it should already have been in the cache). 
+ let current_epoch_decision_root = state.proposer_shuffling_decision_root_at_epoch( + state.current_epoch(), + head_block_root, + &beacon_chain.spec, + )?; + beacon_chain.with_proposer_cache( + current_epoch_decision_root, + state.current_epoch(), + |_| Ok(()), + || { + debug!( + shuffling_decision_root = ?current_epoch_decision_root, + epoch = %state.current_epoch(), + "Computing current epoch proposer shuffling in state advance" + ); + Ok::<_, Error>((advanced_state_root, state.clone())) + }, + )?; + + // For epochs *greater than* the Fulu fork epoch, we have also determined the proposer + // shuffling for the next epoch. + let next_epoch = state.next_epoch()?; + let next_epoch_decision_root = state.proposer_shuffling_decision_root_at_epoch( + next_epoch, + head_block_root, + &beacon_chain.spec, + )?; + beacon_chain.with_proposer_cache( + next_epoch_decision_root, + next_epoch, + |_| Ok(()), + || { + debug!( + shuffling_decision_root = ?next_epoch_decision_root, + epoch = %next_epoch, + "Computing next epoch proposer shuffling in state advance" + ); + Ok::<_, Error>((advanced_state_root, state.clone())) + }, + )?; // Update the attester cache. let shuffling_id = @@ -406,7 +435,6 @@ fn advance_head(beacon_chain: &Arc>) -> Resu // even if we race with the deletion of this state by the finalization pruning code, the worst // case is we end up with a finalized state stored, that will get pruned the next time pruning // runs. 
- let advanced_state_root = state.update_tree_hash_cache()?; beacon_chain.store.put_state(&advanced_state_root, &state)?; debug!( diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index f2a506ec57..7940902d4c 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -13,7 +13,11 @@ use beacon_chain::test_utils::{ use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, BlockError, ChainConfig, NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, - data_availability_checker::MaybeAvailableBlock, historical_blocks::HistoricalBlockError, + beacon_proposer_cache::{ + compute_proposer_duties_from_head, ensure_state_can_determine_proposers_for_epoch, + }, + data_availability_checker::MaybeAvailableBlock, + historical_blocks::HistoricalBlockError, migrate::MigratorConfig, }; use logging::create_test_tracing_subscriber; @@ -1273,19 +1277,34 @@ async fn proposer_shuffling_root_consistency_test( #[tokio::test] async fn proposer_shuffling_root_consistency_same_epoch() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 39).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 5 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] async fn proposer_shuffling_root_consistency_next_epoch() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 47).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 6 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] async fn proposer_shuffling_root_consistency_two_epochs() { let spec = test_spec::(); - proposer_shuffling_root_consistency_test(spec, 32, 55).await; + proposer_shuffling_root_consistency_test( + spec, + 4 * E::slots_per_epoch(), + 7 * E::slots_per_epoch() - 1, + ) + .await; } #[tokio::test] @@ -1501,6 +1520,120 @@ async fn 
proposer_shuffling_changing_with_lookahead() { ); } +#[tokio::test] +async fn proposer_duties_from_head_fulu() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + let initial_blocks = E::slots_per_epoch() * 3; + + // Build chain out to parent block. + let initial_slots: Vec = (1..=initial_blocks).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, head_block_root, head_state) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + + // Compute the proposer duties at the next epoch from the head + let next_epoch = head_state.next_epoch().unwrap(); + let (_indices, dependent_root, _, fork) = + compute_proposer_duties_from_head(next_epoch, &harness.chain).unwrap(); + + assert_eq!( + dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch(next_epoch, head_block_root.into(), spec) + .unwrap() + ); + assert_eq!(fork, head_state.fork()); +} + +/// Test that we can compute the proposer shuffling for the Gloas fork epoch itself using lookahead! 
+#[tokio::test] +async fn proposer_lookahead_gloas_fork_epoch() { + let gloas_fork_epoch = Epoch::new(4); + let mut spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + spec.gloas_fork_epoch = Some(gloas_fork_epoch); + + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(E::default()) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + let initial_blocks = (gloas_fork_epoch - 1) + .start_slot(E::slots_per_epoch()) + .as_u64(); + + // Build chain out to parent block. + let initial_slots: Vec = (1..=initial_blocks).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, head_block_root, mut head_state) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + let head_state_root = head_state.canonical_root().unwrap(); + + // Check that we have access to the next epoch shuffling according to + // `ensure_state_can_determine_proposers_for_epoch`. + ensure_state_can_determine_proposers_for_epoch( + &mut head_state, + head_state_root, + gloas_fork_epoch, + spec, + ) + .unwrap(); + assert_eq!(head_state.current_epoch(), gloas_fork_epoch - 1); + + // Compute the proposer duties at the fork epoch from the head. 
+ let (indices, dependent_root, _, fork) = + compute_proposer_duties_from_head(gloas_fork_epoch, &harness.chain).unwrap(); + + assert_eq!( + dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch( + gloas_fork_epoch, + head_block_root.into(), + spec + ) + .unwrap() + ); + assert_ne!(fork, head_state.fork()); + assert_eq!(fork, spec.fork_at_epoch(gloas_fork_epoch)); + + // Build a block in the Gloas fork epoch and assert that the shuffling does not change. + let gloas_slots = vec![gloas_fork_epoch.start_slot(E::slots_per_epoch())]; + let (_, _, _, _) = harness + .add_attested_blocks_at_slots(head_state, head_state_root, &gloas_slots, &all_validators) + .await; + + let (no_lookahead_indices, no_lookahead_dependent_root, _, no_lookahead_fork) = + compute_proposer_duties_from_head(gloas_fork_epoch, &harness.chain).unwrap(); + + assert_eq!(no_lookahead_indices, indices); + assert_eq!(no_lookahead_dependent_root, dependent_root); + assert_eq!(no_lookahead_fork, fork); +} + // Ensure blocks from abandoned forks are pruned from the Hot DB #[tokio::test] async fn prunes_abandoned_fork_between_two_finalized_checkpoints() { diff --git a/beacon_node/http_api/src/proposer_duties.rs b/beacon_node/http_api/src/proposer_duties.rs index ceac60cbad..78f99c475c 100644 --- a/beacon_node/http_api/src/proposer_duties.rs +++ b/beacon_node/http_api/src/proposer_duties.rs @@ -103,14 +103,6 @@ fn try_proposer_duties_from_cache( let head_block = &head.snapshot.beacon_block; let head_block_root = head.head_block_root(); let head_epoch = head_block.slot().epoch(T::EthSpec::slots_per_epoch()); - let head_decision_root = head - .snapshot - .beacon_state - .proposer_shuffling_decision_root(head_block_root, &chain.spec) - .map_err(warp_utils::reject::beacon_state_error)?; - let execution_optimistic = chain - .is_optimistic_or_invalid_head_block(head_block) - .map_err(warp_utils::reject::unhandled_error)?; // This code path can't handle requests for past epochs. 
if head_epoch > request_epoch { @@ -119,6 +111,15 @@ fn try_proposer_duties_from_cache( ))); } + let head_decision_root = head + .snapshot + .beacon_state + .proposer_shuffling_decision_root_at_epoch(request_epoch, head_block_root, &chain.spec) + .map_err(warp_utils::reject::beacon_state_error)?; + let execution_optimistic = chain + .is_optimistic_or_invalid_head_block(head_block) + .map_err(warp_utils::reject::unhandled_error)?; + chain .beacon_proposer_cache .lock() diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index 1398d8c72f..94b773c32d 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -940,3 +940,110 @@ async fn queue_attestations_from_http() { attestation_future.await.unwrap(); } + +// Test that a request for next epoch proposer duties succeeds when the current slot clock is within +// gossip clock disparity (500ms) of the new epoch. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn proposer_duties_with_gossip_tolerance() { + let validator_count = 24; + + let tester = InteractiveTester::::new(None, validator_count).await; + let harness = &tester.harness; + let spec = &harness.spec; + let client = &tester.client; + + let num_initial = 4 * E::slots_per_epoch() - 1; + let next_epoch_start_slot = Slot::new(num_initial + 1); + + harness.advance_slot(); + harness + .extend_chain_with_sync( + num_initial as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + SyncCommitteeStrategy::NoValidators, + LightClientStrategy::Disabled, + ) + .await; + + assert_eq!(harness.chain.slot().unwrap(), num_initial); + + // Set the clock to just before the next epoch. 
+ harness.chain.slot_clock.advance_time( + Duration::from_secs(spec.seconds_per_slot) - spec.maximum_gossip_clock_disparity(), + ); + assert_eq!( + harness + .chain + .slot_clock + .now_with_future_tolerance(spec.maximum_gossip_clock_disparity()) + .unwrap(), + next_epoch_start_slot + ); + + let head_state = harness.get_current_state(); + let head_block_root = harness.head_block_root(); + let tolerant_current_epoch = next_epoch_start_slot.epoch(E::slots_per_epoch()); + + // This is a regression test for the bug described here: + // https://github.com/sigp/lighthouse/pull/8130/files#r2386594566 + // + // To trigger it, we need to prime the proposer shuffling cache with an incorrect entry which + // the previous code would be liable to lookup due to the bugs in its decision root calculation. + let wrong_decision_root = head_state + .proposer_shuffling_decision_root(head_block_root, spec) + .unwrap(); + let wrong_proposer_indices = vec![0; E::slots_per_epoch() as usize]; + harness + .chain + .beacon_proposer_cache + .lock() + .insert( + tolerant_current_epoch, + wrong_decision_root, + wrong_proposer_indices.clone(), + head_state.fork(), + ) + .unwrap(); + + // Request the proposer duties. + let proposer_duties_tolerant_current_epoch = client + .get_validator_duties_proposer(tolerant_current_epoch) + .await + .unwrap(); + + assert_eq!( + proposer_duties_tolerant_current_epoch.dependent_root, + head_state + .proposer_shuffling_decision_root_at_epoch( + tolerant_current_epoch, + head_block_root, + spec + ) + .unwrap() + ); + assert_ne!( + proposer_duties_tolerant_current_epoch + .data + .iter() + .map(|data| data.validator_index as usize) + .collect::>(), + wrong_proposer_indices, + ); + + // We should get the exact same result after properly advancing into the epoch. 
+ harness + .chain + .slot_clock + .advance_time(spec.maximum_gossip_clock_disparity()); + assert_eq!(harness.chain.slot().unwrap(), next_epoch_start_slot); + let proposer_duties_current_epoch = client + .get_validator_duties_proposer(tolerant_current_epoch) + .await + .unwrap(); + + assert_eq!( + proposer_duties_tolerant_current_epoch, + proposer_duties_current_epoch + ); +} diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index 50a2f268e0..421655777e 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -476,15 +476,23 @@ impl ChainSpec { /// Returns a full `Fork` struct for a given epoch. pub fn fork_at_epoch(&self, epoch: Epoch) -> Fork { let current_fork_name = self.fork_name_at_epoch(epoch); - let previous_fork_name = current_fork_name.previous_fork().unwrap_or(ForkName::Base); - let epoch = self + + let fork_epoch = self .fork_epoch(current_fork_name) .unwrap_or_else(|| Epoch::new(0)); + // At genesis the Fork is initialised with two copies of the same value for both + // `previous_version` and `current_version` (see `initialize_beacon_state_from_eth1`). + let previous_fork_name = if fork_epoch == 0 { + current_fork_name + } else { + current_fork_name.previous_fork().unwrap_or(ForkName::Base) + }; + Fork { previous_version: self.fork_version_for_name(previous_fork_name), current_version: self.fork_version_for_name(current_fork_name), - epoch, + epoch: fork_epoch, } } @@ -3010,9 +3018,11 @@ mod yaml_tests { fn proposer_shuffling_decision_root_around_epoch_boundary() { type E = MainnetEthSpec; let fulu_fork_epoch = 5; + let gloas_fork_epoch = 10; let spec = { let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); spec.fulu_fork_epoch = Some(Epoch::new(fulu_fork_epoch)); + spec.gloas_fork_epoch = Some(Epoch::new(gloas_fork_epoch)); Arc::new(spec) }; @@ -3026,7 +3036,7 @@ mod yaml_tests { } // For epochs after Fulu, the decision slot is the end of the epoch two epochs prior. 
- for epoch in ((fulu_fork_epoch + 1)..(fulu_fork_epoch + 10)).map(Epoch::new) { + for epoch in ((fulu_fork_epoch + 1)..=(gloas_fork_epoch + 1)).map(Epoch::new) { assert_eq!( spec.proposer_shuffling_decision_slot::(epoch), (epoch - 1).start_slot(E::slots_per_epoch()) - 1 From da93b89e902f9fdb3cebf6bd7eb48d996c843a0f Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 20 Oct 2025 14:14:16 +1100 Subject: [PATCH 07/16] Feature gate test CLI flags (#8231) Closes #6980 I think these flags may be useful in future peerdas / das testing, and would be useful to keep. Hence I've gated them behind a `testing` feature flag. Co-Authored-By: Jimmy Chen --- beacon_node/Cargo.toml | 1 + beacon_node/src/cli.rs | 22 +++++++++++----------- beacon_node/src/config.rs | 8 ++++++-- lighthouse/Cargo.toml | 1 + 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index 8e2c598fd4..985f4c1752 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -15,6 +15,7 @@ path = "src/lib.rs" write_ssz_files = [ "beacon_chain/write_ssz_files", ] # Writes debugging .ssz files to /tmp during block processing. +testing = [] # Enables testing-only CLI flags [dependencies] account_utils = { workspace = true } diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 2e3b3fde4b..28f355151d 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -60,22 +60,22 @@ pub fn cli_app() -> Command { .display_order(0) ) .arg( - // TODO(das): remove this before PeerDAS release Arg::new("malicious-withhold-count") .long("malicious-withhold-count") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY do not use this") + .help("TESTING ONLY: Withholds a subset of data columns during publishing. \ + Do not use in production. 
Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) .arg( - // TODO(das): remove this before PeerDAS release Arg::new("advertise-false-custody-group-count") .long("advertise-false-custody-group-count") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("Advertises a false CGC for testing PeerDAS. Do NOT use in production.") + .help("TESTING ONLY: Advertises a false custody group count for testing PeerDAS. \ + Do not use in production. Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) @@ -1594,9 +1594,9 @@ pub fn cli_app() -> Command { .value_name("SECONDS") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY: Artificially delay block publishing by the specified number of seconds. \ - This only works for if `BroadcastValidation::Gossip` is used (default). \ - DO NOT USE IN PRODUCTION.") + .help("TESTING ONLY: Artificially delays block publishing by the specified number of seconds. \ + This only works if BroadcastValidation::Gossip is used (default). \ + Do not use in production. Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) @@ -1606,10 +1606,10 @@ pub fn cli_app() -> Command { .value_name("SECONDS") .action(ArgAction::Set) .help_heading(FLAG_HEADER) - .help("TESTING ONLY: Artificially delay data column publishing by the specified number of seconds. \ - Limitation: If `delay-block-publishing` is also used, data columns will be delayed for a \ - minimum of `delay-block-publishing` seconds. - DO NOT USE IN PRODUCTION.") + .help("TESTING ONLY: Artificially delays data column publishing by the specified number of seconds. \ + Limitation: If delay-block-publishing is also used, data columns will be delayed for a \ + minimum of delay-block-publishing seconds. \ + Do not use in production. 
Requires the 'testing' feature to be enabled.") .hide(true) .display_order(0) ) diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index c2599ec0cd..acb392779f 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -7,7 +7,7 @@ use beacon_chain::chain_config::{ use beacon_chain::graffiti_calculator::GraffitiOrigin; use clap::{ArgMatches, Id, parser::ValueSource}; use clap_utils::flags::DISABLE_MALLOC_TUNING_FLAG; -use clap_utils::{parse_flag, parse_optional, parse_required}; +use clap_utils::{parse_flag, parse_required}; use client::{ClientConfig, ClientGenesis}; use directory::{DEFAULT_BEACON_NODE_DIR, DEFAULT_NETWORK_DIR, DEFAULT_ROOT_DIR}; use environment::RuntimeContext; @@ -421,6 +421,7 @@ pub fn get_config( client_config.store.blob_prune_margin_epochs = blob_prune_margin_epochs; } + #[cfg(feature = "testing")] if let Some(malicious_withhold_count) = clap_utils::parse_optional(cli_args, "malicious-withhold-count")? { @@ -835,10 +836,12 @@ pub fn get_config( .max_gossip_aggregate_batch_size = clap_utils::parse_required(cli_args, "beacon-processor-aggregate-batch-size")?; + #[cfg(feature = "testing")] if let Some(delay) = clap_utils::parse_optional(cli_args, "delay-block-publishing")? { client_config.chain.block_publishing_delay = Some(Duration::from_secs_f64(delay)); } + #[cfg(feature = "testing")] if let Some(delay) = clap_utils::parse_optional(cli_args, "delay-data-column-publishing")? { client_config.chain.data_column_publishing_delay = Some(Duration::from_secs_f64(delay)); } @@ -1145,8 +1148,9 @@ pub fn set_network_config( config.import_all_attestations = true; } + #[cfg(feature = "testing")] if let Some(advertise_false_custody_group_count) = - parse_optional(cli_args, "advertise-false-custody-group-count")? + clap_utils::parse_optional(cli_args, "advertise-false-custody-group-count")? 
{ config.advertise_false_custody_group_count = Some(advertise_false_custody_group_count); } diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index ef680c9b96..82bfc5056e 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -81,6 +81,7 @@ malloc_utils = { workspace = true, features = ["jemalloc"] } malloc_utils = { workspace = true, features = [] } [dev-dependencies] +beacon_node = { workspace = true, features = ["testing"] } beacon_node_fallback = { workspace = true } beacon_processor = { workspace = true } eth2 = { workspace = true } From 2b30c96f16fa52b44198768fce6a226861674da7 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:29:21 +0800 Subject: [PATCH 08/16] Avoid attempting to serve blobs after Fulu fork (#7756) * #7122 Co-Authored-By: Tan Chee Keong Co-Authored-By: chonghe <44791194+chong-he@users.noreply.github.com> --- .../network_beacon_processor/rpc_methods.rs | 114 +++++++--- .../src/network_beacon_processor/tests.rs | 197 +++++++++++++++++- 2 files changed, 269 insertions(+), 42 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 58e02ffe00..0fcd67dbf1 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -3,7 +3,7 @@ use crate::network_beacon_processor::{FUTURE_SLOT_TOLERANCE, NetworkBeaconProces use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::SyncMessage; -use beacon_chain::{BeaconChainError, BeaconChainTypes, WhenSlotSkipped}; +use beacon_chain::{BeaconChainError, BeaconChainTypes, BlockProcessStatus, WhenSlotSkipped}; use itertools::{Itertools, process_results}; use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, @@ -293,21 +293,49 @@ impl 
NetworkBeaconProcessor { inbound_request_id: InboundRequestId, request: BlobsByRootRequest, ) -> Result<(), (RpcErrorResponse, &'static str)> { - let Some(requested_root) = request.blob_ids.as_slice().first().map(|id| id.block_root) - else { - // No blob ids requested. - return Ok(()); - }; - let requested_indices = request - .blob_ids - .as_slice() - .iter() - .map(|id| id.index) - .collect::>(); let mut send_blob_count = 0; + let fulu_start_slot = self + .chain + .spec + .fulu_fork_epoch + .map(|epoch| epoch.start_slot(T::EthSpec::slots_per_epoch())); + let mut blob_list_results = HashMap::new(); + + let slots_by_block_root: HashMap = request + .blob_ids + .iter() + .flat_map(|blob_id| { + let block_root = blob_id.block_root; + self.chain + .data_availability_checker + .get_cached_block(&block_root) + .and_then(|status| match status { + BlockProcessStatus::NotValidated(block, _source) => Some(block), + BlockProcessStatus::ExecutionValidated(block) => Some(block), + BlockProcessStatus::Unknown => None, + }) + .or_else(|| self.chain.early_attester_cache.get_block(block_root)) + .map(|block| (block_root, block.slot())) + }) + .collect(); + for id in request.blob_ids.as_slice() { + let BlobIdentifier { + block_root: root, + index, + } = id; + + let slot = slots_by_block_root.get(root); + + // Skip if slot is >= fulu_start_slot + if let (Some(slot), Some(fulu_slot)) = (slot, fulu_start_slot) + && *slot >= fulu_slot + { + continue; + } + // First attempt to get the blobs from the RPC cache. 
if let Ok(Some(blob)) = self.chain.data_availability_checker.get_blob(id) { self.send_response( @@ -317,11 +345,6 @@ impl NetworkBeaconProcessor { ); send_blob_count += 1; } else { - let BlobIdentifier { - block_root: root, - index, - } = id; - let blob_list_result = match blob_list_results.entry(root) { Entry::Vacant(entry) => { entry.insert(self.chain.get_blobs_checking_early_attester_cache(root)) @@ -331,16 +354,15 @@ impl NetworkBeaconProcessor { match blob_list_result.as_ref() { Ok(blobs_sidecar_list) => { - 'inner: for blob_sidecar in blobs_sidecar_list.iter() { - if blob_sidecar.index == *index { - self.send_response( - peer_id, - inbound_request_id, - Response::BlobsByRoot(Some(blob_sidecar.clone())), - ); - send_blob_count += 1; - break 'inner; - } + if let Some(blob_sidecar) = + blobs_sidecar_list.iter().find(|b| b.index == *index) + { + self.send_response( + peer_id, + inbound_request_id, + Response::BlobsByRoot(Some(blob_sidecar.clone())), + ); + send_blob_count += 1; } } Err(e) => { @@ -354,10 +376,10 @@ impl NetworkBeaconProcessor { } } } + debug!( %peer_id, - %requested_root, - ?requested_indices, + block_root = ?slots_by_block_root.keys(), returned = send_blob_count, "BlobsByRoot outgoing response processed" ); @@ -1003,6 +1025,34 @@ impl NetworkBeaconProcessor { ); let request_start_slot = Slot::from(req.start_slot); + let request_start_epoch = request_start_slot.epoch(T::EthSpec::slots_per_epoch()); + let fork_name = self.chain.spec.fork_name_at_epoch(request_start_epoch); + // Should not send more than max request blob sidecars + if req.max_blobs_requested(request_start_epoch, &self.chain.spec) + > self.chain.spec.max_request_blob_sidecars(fork_name) as u64 + { + return Err(( + RpcErrorResponse::InvalidRequest, + "Request exceeded `MAX_REQUEST_BLOBS_SIDECARS`", + )); + } + + let effective_count = if let Some(fulu_epoch) = self.chain.spec.fulu_fork_epoch { + let fulu_start_slot = fulu_epoch.start_slot(T::EthSpec::slots_per_epoch()); + let 
request_end_slot = request_start_slot.saturating_add(req.count) - 1; + + // If the request_start_slot is at or after a Fulu slot, return an empty response + if request_start_slot >= fulu_start_slot { + return Ok(()); + // For the case that the request slots spans across the Fulu fork slot + } else if request_end_slot >= fulu_start_slot { + (fulu_start_slot - request_start_slot).as_u64() + } else { + req.count + } + } else { + req.count + }; let data_availability_boundary_slot = match self.chain.data_availability_boundary() { Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), @@ -1040,7 +1090,7 @@ impl NetworkBeaconProcessor { } let block_roots = - self.get_block_roots_for_slot_range(req.start_slot, req.count, "BlobsByRange")?; + self.get_block_roots_for_slot_range(req.start_slot, effective_count, "BlobsByRange")?; let current_slot = self .chain @@ -1067,7 +1117,7 @@ impl NetworkBeaconProcessor { // Due to skip slots, blobs could be out of the range, we ensure they // are in the range before sending if blob_sidecar.slot() >= request_start_slot - && blob_sidecar.slot() < request_start_slot + req.count + && blob_sidecar.slot() < request_start_slot + effective_count { blobs_sent += 1; self.send_network_message(NetworkMessage::SendResponse { @@ -1148,7 +1198,7 @@ impl NetworkBeaconProcessor { if req.max_requested::() > self.chain.spec.max_request_data_column_sidecars { return Err(( RpcErrorResponse::InvalidRequest, - "Request exceeded `MAX_REQUEST_BLOBS_SIDECARS`", + "Request exceeded `MAX_REQUEST_DATA_COLUMN_SIDECARS`", )); } diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 4137c974bf..a3aef8f802 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -22,7 +22,7 @@ use gossipsub::MessageAcceptance; use itertools::Itertools; use lighthouse_network::rpc::InboundRequestId; use 
lighthouse_network::rpc::methods::{ - BlobsByRangeRequest, DataColumnsByRangeRequest, MetaDataV3, + BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, MetaDataV3, }; use lighthouse_network::{ Client, MessageId, NetworkConfig, NetworkGlobals, PeerId, Response, @@ -37,12 +37,12 @@ use std::iter::Iterator; use std::sync::Arc; use std::time::Duration; use tokio::sync::mpsc; -use types::blob_sidecar::FixedBlobSidecarList; +use types::blob_sidecar::{BlobIdentifier, FixedBlobSidecarList}; use types::{ AttesterSlashing, BlobSidecar, BlobSidecarList, ChainSpec, DataColumnSidecarList, DataColumnSubnetId, Epoch, EthSpec, Hash256, MainnetEthSpec, ProposerSlashing, - SignedAggregateAndProof, SignedBeaconBlock, SignedVoluntaryExit, SingleAttestation, Slot, - SubnetId, + RuntimeVariableList, SignedAggregateAndProof, SignedBeaconBlock, SignedVoluntaryExit, + SingleAttestation, Slot, SubnetId, }; type E = MainnetEthSpec; @@ -431,15 +431,22 @@ impl TestRig { } } - pub fn enqueue_blobs_by_range_request(&self, count: u64) { + pub fn enqueue_blobs_by_range_request(&self, start_slot: u64, count: u64) { self.network_beacon_processor .send_blobs_by_range_request( PeerId::random(), InboundRequestId::new_unchecked(42, 24), - BlobsByRangeRequest { - start_slot: 0, - count, - }, + BlobsByRangeRequest { start_slot, count }, + ) + .unwrap(); + } + + pub fn enqueue_blobs_by_root_request(&self, blob_ids: RuntimeVariableList) { + self.network_beacon_processor + .send_blobs_by_roots_request( + PeerId::random(), + InboundRequestId::new_unchecked(42, 24), + BlobsByRootRequest { blob_ids }, ) .unwrap(); } @@ -1632,8 +1639,9 @@ async fn test_blobs_by_range() { return; }; let mut rig = TestRig::new(64).await; + let start_slot = 0; let slot_count = 32; - rig.enqueue_blobs_by_range_request(slot_count); + rig.enqueue_blobs_by_range_request(start_slot, slot_count); let mut blob_count = 0; for slot in 0..slot_count { @@ -1651,6 +1659,65 @@ async fn test_blobs_by_range() { .unwrap_or(0); } 
let mut actual_count = 0; + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRange(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + if test_spec::().fulu_fork_epoch.is_some() { + assert_eq!(0, actual_count, "Post-Fulu should return 0 blobs"); + } else { + assert_eq!(blob_count, actual_count); + } +} + +#[tokio::test] +async fn test_blobs_by_range_spans_fulu_fork() { + // Only test for Electra & Fulu fork transition + if test_spec::().electra_fork_epoch.is_none() { + return; + }; + let mut spec = test_spec::(); + spec.fulu_fork_epoch = Some(Epoch::new(1)); + spec.gloas_fork_epoch = Some(Epoch::new(2)); + + let mut rig = TestRig::new_parametric(64, BeaconProcessorConfig::default(), false, spec).await; + + let start_slot = 16; + // This will span from epoch 0 (Electra) to epoch 1 (Fulu) + let slot_count = 32; + + rig.enqueue_blobs_by_range_request(start_slot, slot_count); + + let mut blob_count = 0; + for slot in start_slot..slot_count { + let root = rig + .chain + .block_root_at_slot(Slot::new(slot), WhenSlotSkipped::None) + .unwrap(); + blob_count += root + .map(|root| { + rig.chain + .get_blobs(&root) + .map(|list| list.len()) + .unwrap_or(0) + }) + .unwrap_or(0); + } + + let mut actual_count = 0; + while let Some(next) = rig.network_rx.recv().await { if let NetworkMessage::SendResponse { peer_id: _, @@ -1670,6 +1737,116 @@ async fn test_blobs_by_range() { assert_eq!(blob_count, actual_count); } +#[tokio::test] +async fn test_blobs_by_root() { + if test_spec::().deneb_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new(64).await; + + // Get the block root of a sample slot, e.g., slot 1 + let block_root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap() + .unwrap(); + + let blobs = 
rig.chain.get_blobs(&block_root).unwrap(); + let blob_count = blobs.len(); + + let blob_ids: Vec = (0..blob_count) + .map(|index| BlobIdentifier { + block_root, + index: index as u64, + }) + .collect(); + + let blob_ids_list = RuntimeVariableList::new(blob_ids, blob_count).unwrap(); + + rig.enqueue_blobs_by_root_request(blob_ids_list); + + let mut blob_count = 0; + let root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap(); + blob_count += root + .map(|root| { + rig.chain + .get_blobs(&root) + .map(|list| list.len()) + .unwrap_or(0) + }) + .unwrap_or(0); + + let mut actual_count = 0; + + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRoot(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + assert_eq!(blob_count, actual_count); +} + +#[tokio::test] +async fn test_blobs_by_root_post_fulu_should_return_empty() { + // Only test for Fulu fork + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new(64).await; + + let block_root = rig + .chain + .block_root_at_slot(Slot::new(1), WhenSlotSkipped::None) + .unwrap() + .unwrap(); + + let blob_ids = vec![BlobIdentifier { + block_root, + index: 0, + }]; + + let blob_ids_list = RuntimeVariableList::new(blob_ids, 1).unwrap(); + + rig.enqueue_blobs_by_root_request(blob_ids_list); + + let mut actual_count = 0; + + while let Some(next) = rig.network_rx.recv().await { + if let NetworkMessage::SendResponse { + peer_id: _, + response: Response::BlobsByRoot(blob), + inbound_request_id: _, + } = next + { + if blob.is_some() { + actual_count += 1; + } else { + break; + } + } else { + panic!("unexpected message {:?}", next); + } + } + // Post-Fulu should return 0 blobs + assert_eq!(0, actual_count); +} + /// Ensure that data column processing that results 
in block import sends a sync notification #[tokio::test] async fn test_data_column_import_notifies_sync() { From c012f46cb911672700247e3c849ec496bbbc6292 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 20 Oct 2025 18:10:40 +1100 Subject: [PATCH 09/16] Fix `get_header` JSON deserialization. (#8228) #8224 Please list or describe the changes introduced by this PR. Co-Authored-By: Jimmy Chen --- Cargo.lock | 2 + beacon_node/builder_client/Cargo.toml | 4 + beacon_node/builder_client/src/lib.rs | 158 ++++++++++++++++++++++++-- 3 files changed, 155 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 59e7bda170..516d0df358 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1235,10 +1235,12 @@ dependencies = [ "eth2", "ethereum_ssz", "lighthouse_version", + "mockito", "reqwest 0.11.27", "sensitive_url", "serde", "serde_json", + "tokio", ] [[package]] diff --git a/beacon_node/builder_client/Cargo.toml b/beacon_node/builder_client/Cargo.toml index 1920bd0ebb..9b1f86360d 100644 --- a/beacon_node/builder_client/Cargo.toml +++ b/beacon_node/builder_client/Cargo.toml @@ -12,3 +12,7 @@ reqwest = { workspace = true } sensitive_url = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } + +[dev-dependencies] +mockito = { workspace = true } +tokio = { workspace = true } diff --git a/beacon_node/builder_client/src/lib.rs b/beacon_node/builder_client/src/lib.rs index 2c83e34755..6b993542f3 100644 --- a/beacon_node/builder_client/src/lib.rs +++ b/beacon_node/builder_client/src/lib.rs @@ -155,15 +155,7 @@ impl BuilderHttpClient { } ContentType::Json => { self.ssz_available.store(false, Ordering::SeqCst); - let mut de = serde_json::Deserializer::from_slice(&response_bytes); - let data = - T::context_deserialize(&mut de, fork_name).map_err(Error::InvalidJson)?; - - Ok(ForkVersionedResponse { - version: fork_name, - metadata: EmptyMetadata {}, - data, - }) + serde_json::from_slice(&response_bytes).map_err(Error::InvalidJson) } } } @@ -546,6 
+538,12 @@ impl BuilderHttpClient { #[cfg(test)] mod tests { use super::*; + use eth2::types::builder_bid::{BuilderBid, BuilderBidFulu}; + use eth2::types::test_utils::{SeedableRng, TestRandom, XorShiftRng}; + use eth2::types::{MainnetEthSpec, Signature}; + use mockito::{Matcher, Server, ServerGuard}; + + type E = MainnetEthSpec; #[test] fn test_headers_no_panic() { @@ -556,4 +554,146 @@ mod tests { assert!(HeaderValue::from_str(JSON_ACCEPT_VALUE).is_ok()); assert!(HeaderValue::from_str(JSON_CONTENT_TYPE_HEADER).is_ok()); } + + #[tokio::test] + async fn test_get_builder_header_ssz_response() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + Some("fulu"), + ContentType::Ssz, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, mock_response_body); + } + + #[tokio::test] + async fn test_get_builder_header_json_response() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + None, + ContentType::Json, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, 
mock_response_body); + } + + #[tokio::test] + async fn test_get_builder_header_no_version_header_fallback_json() { + // Set up mock server + let mut server = Server::new_async().await; + let mock_response_body = fulu_signed_builder_bid(); + mock_get_header_response( + &mut server, + Some("fulu"), + ContentType::Json, + mock_response_body.clone(), + ); + + let builder_client = BuilderHttpClient::new( + SensitiveUrl::from_str(&server.url()).unwrap(), + None, + None, + false, + ) + .unwrap(); + + let response = builder_client + .get_builder_header( + Slot::new(1), + ExecutionBlockHash::repeat_byte(1), + &PublicKeyBytes::empty(), + ) + .await + .expect("should succeed in get_builder_header") + .expect("should have response body"); + + assert_eq!(response, mock_response_body); + } + + fn mock_get_header_response( + server: &mut ServerGuard, + header_version_opt: Option<&str>, + content_type: ContentType, + response_body: ForkVersionedResponse>, + ) { + let mut mock = server.mock( + "GET", + Matcher::Regex(r"^/eth/v1/builder/header/\d+/.+/.+$".to_string()), + ); + + if let Some(version) = header_version_opt { + mock = mock.with_header(CONSENSUS_VERSION_HEADER, version); + } + + match content_type { + ContentType::Json => { + mock = mock + .with_header(CONTENT_TYPE_HEADER, JSON_CONTENT_TYPE_HEADER) + .with_body(serde_json::to_string(&response_body).unwrap()); + } + ContentType::Ssz => { + mock = mock + .with_header(CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER) + .with_body(response_body.data.as_ssz_bytes()); + } + } + + mock.with_status(200).create(); + } + + fn fulu_signed_builder_bid() -> ForkVersionedResponse> { + let rng = &mut XorShiftRng::from_seed([42; 16]); + ForkVersionedResponse { + version: ForkName::Fulu, + metadata: EmptyMetadata {}, + data: SignedBuilderBid { + message: BuilderBid::Fulu(BuilderBidFulu::random_for_test(rng)), + signature: Signature::empty(), + }, + } + } } From 092aaae9610e15175fffd4445ce96fda199857a2 Mon Sep 17 00:00:00 2001 From: Pawan 
Dhananjay Date: Mon, 20 Oct 2025 04:50:00 -0700 Subject: [PATCH 10/16] Sync cleanups (#8230) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit N/A 1. In the batch retry logic, we were failing to set the batch state to `AwaitingDownload` before attempting a retry. This PR sets it to `AwaitingDownload` before the retry and sets it back to `Downloading` if the retry succeeded in sending out a request 2. Remove all peer scoring logic from retrying and rely on just deprioritizing the failed peer. I finally concede the point to @dapplion 😄 3. Changes `block_components_by_range_request` to accept `block_peers` and `column_peers`. This is to ensure that we use the full synced peerset for requesting columns in order to avoid splitting the column peers among multiple head chains. During forward sync, we want the block peers to be the peers from the syncing chain and column peers to be all synced peers from the peerdb. Also, fixes a typo and calls `attempt_send_awaiting_download_batches` from more places Co-Authored-By: Pawan Dhananjay --- .../src/peer_manager/peerdb.rs | 15 ++----- .../network/src/sync/backfill_sync/mod.rs | 13 +++--- .../src/sync/block_sidecar_coupling.rs | 12 +---- .../network/src/sync/network_context.rs | 11 ++--- .../network/src/sync/range_sync/batch.rs | 25 +++++++++++ .../network/src/sync/range_sync/chain.rs | 45 ++++++++++++++----- 6 files changed, 77 insertions(+), 44 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 0ccad8d042..87337cafcf 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -247,23 +247,16 @@ impl PeerDB { .map(|(peer_id, _)| peer_id) } - /// Returns all the synced peers from the list of allowed peers that claim to have the block + /// Returns all the synced peers from the peer db that claim to have the block
/// components for the given epoch based on `status.earliest_available_slot`. /// /// If `earliest_available_slot` info is not available, then return peer anyway assuming it has the /// required data. - /// - /// If `allowed_peers` is `Some`, then filters for the epoch only for those peers. - pub fn synced_peers_for_epoch<'a>( - &'a self, - epoch: Epoch, - allowed_peers: Option<&'a HashSet>, - ) -> impl Iterator { + pub fn synced_peers_for_epoch(&self, epoch: Epoch) -> impl Iterator { self.peers .iter() - .filter(move |(peer_id, info)| { - allowed_peers.is_none_or(|allowed| allowed.contains(peer_id)) - && info.is_connected() + .filter(move |(_, info)| { + info.is_connected() && match info.sync_status() { SyncStatus::Synced { info } => { info.has_slot(epoch.end_slot(E::slots_per_epoch())) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index d5a4e9b73a..00597586b8 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -210,7 +210,7 @@ impl BackFillSync { .network_globals .peers .read() - .synced_peers_for_epoch(self.to_be_downloaded, None) + .synced_peers_for_epoch(self.to_be_downloaded) .next() .is_some() // backfill can't progress if we do not have peers in the required subnets post peerdas. 
@@ -313,7 +313,6 @@ impl BackFillSync { CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, } => { debug!(?batch_id, error, "Block components coupling error"); @@ -325,11 +324,8 @@ impl BackFillSync { failed_columns.insert(*column); failed_peers.insert(*peer); } - for peer in failed_peers.iter() { - network.report_peer(*peer, *action, "failed to return columns"); - } - // Only retry if peer failure **and** retries have been exceeded + // Only retry if peer failure **and** retries haven't been exceeded if !*exceeded_retries { return self.retry_partial_batch( network, @@ -888,7 +884,7 @@ impl BackFillSync { .network_globals .peers .read() - .synced_peers_for_epoch(batch_id, None) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -899,6 +895,7 @@ impl BackFillSync { request, RangeRequestId::BackfillSync { batch_id }, &synced_peers, + &synced_peers, // All synced peers have imported up to the finalized slot so they must have their custody columns available &failed_peers, ) { Ok(request_id) => { @@ -964,7 +961,7 @@ impl BackFillSync { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, None) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index ffc79c1550..ba89d11225 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -2,7 +2,7 @@ use beacon_chain::{ block_verification_types::RpcBlock, data_column_verification::CustodyDataColumn, get_block_root, }; use lighthouse_network::{ - PeerAction, PeerId, + PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, DataColumnsByRangeRequestId, }, @@ -63,7 +63,6 @@ pub(crate) enum CouplingError { DataColumnPeerFailure { error: String, faulty_peers: Vec<(ColumnIndex, PeerId)>, - action: PeerAction, exceeded_retries: bool, }, 
BlobPeerFailure(String), @@ -253,7 +252,6 @@ impl RangeBlockComponentsRequest { if let Err(CouplingError::DataColumnPeerFailure { error: _, faulty_peers, - action: _, exceeded_retries: _, }) = &resp { @@ -377,7 +375,6 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("No columns for block {block_root:?} with data"), faulty_peers: responsible_peers, - action: PeerAction::LowToleranceError, exceeded_retries, }); @@ -402,7 +399,6 @@ impl RangeBlockComponentsRequest { return Err(CouplingError::DataColumnPeerFailure { error: format!("Peers did not return column for block_root {block_root:?} {naughty_peers:?}"), faulty_peers: naughty_peers, - action: PeerAction::LowToleranceError, exceeded_retries }); } @@ -468,7 +464,7 @@ mod tests { NumBlobs, generate_rand_block_and_blobs, generate_rand_block_and_data_columns, test_spec, }; use lighthouse_network::{ - PeerAction, PeerId, + PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, DataColumnsByRangeRequestId, Id, RangeRequestId, @@ -785,7 +781,6 @@ mod tests { if let Err(super::CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, }) = result { @@ -793,7 +788,6 @@ mod tests { assert_eq!(faulty_peers.len(), 2); // columns 3 and 4 missing assert_eq!(faulty_peers[0].0, 3); // column index 3 assert_eq!(faulty_peers[1].0, 4); // column index 4 - assert!(matches!(action, PeerAction::LowToleranceError)); assert!(!exceeded_retries); // First attempt, should be false } else { panic!("Expected PeerFailure error"); @@ -957,13 +951,11 @@ mod tests { if let Err(super::CouplingError::DataColumnPeerFailure { error: _, faulty_peers, - action, exceeded_retries, }) = result { assert_eq!(faulty_peers.len(), 1); // column 2 missing assert_eq!(faulty_peers[0].0, 2); // column index 2 - assert!(matches!(action, PeerAction::LowToleranceError)); assert!(exceeded_retries); // Should be true after max retries } 
else { panic!("Expected PeerFailure error with exceeded_retries=true"); diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index ac2991c147..1d119cb2de 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -533,19 +533,21 @@ impl SyncNetworkContext { batch_type: ByRangeRequestType, request: BlocksByRangeRequest, requester: RangeRequestId, - peers: &HashSet, + block_peers: &HashSet, + column_peers: &HashSet, peers_to_deprioritize: &HashSet, ) -> Result { let range_request_span = debug_span!( parent: None, SPAN_OUTGOING_RANGE_REQUEST, range_req_id = %requester, - peers = peers.len() + block_peers = block_peers.len(), + column_peers = column_peers.len() ); let _guard = range_request_span.clone().entered(); let active_request_count_by_peer = self.active_request_count_by_peer(); - let Some(block_peer) = peers + let Some(block_peer) = block_peers .iter() .map(|peer| { ( @@ -579,7 +581,7 @@ impl SyncNetworkContext { .collect(); Some(self.select_columns_by_range_peers_to_request( &column_indexes, - peers, + column_peers, active_request_count_by_peer, peers_to_deprioritize, )?) @@ -770,7 +772,6 @@ impl SyncNetworkContext { let range_req = entry.get_mut(); if let Some(blocks_result) = range_req.responses(&self.chain.spec) { if let Err(CouplingError::DataColumnPeerFailure { - action: _, error, faulty_peers: _, exceeded_retries, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 31e6594139..c79800bfbe 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -334,6 +334,31 @@ impl BatchInfo { } } + /// Change the batch state from `Self::Downloading` to `Self::AwaitingDownload` without + /// registering a failed attempt. 
+ /// + /// Note: must use this cautiously with some level of retry protection + /// as not registering a failed attempt could lead to requesting in a loop. + #[must_use = "Batch may have failed"] + pub fn downloading_to_awaiting_download( + &mut self, + ) -> Result { + match self.state.poison() { + BatchState::Downloading(_) => { + self.state = BatchState::AwaitingDownload; + Ok(self.outcome()) + } + BatchState::Poisoned => unreachable!("Poisoned batch"), + other => { + self.state = other; + Err(WrongState(format!( + "Download failed for batch in wrong state {:?}", + self.state + ))) + } + } + } + pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { match self.state.poison() { BatchState::AwaitingDownload => { diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 3b816c0922..ab5b8bee5e 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -871,7 +871,6 @@ impl SyncingChain { CouplingError::DataColumnPeerFailure { error, faulty_peers, - action, exceeded_retries, } => { debug!(?batch_id, error, "Block components coupling error"); @@ -883,12 +882,22 @@ impl SyncingChain { failed_columns.insert(*column); failed_peers.insert(*peer); } - for peer in failed_peers.iter() { - network.report_peer(*peer, *action, "failed to return columns"); - } // Retry the failed columns if the column requests haven't exceeded the // max retries. Otherwise, remove treat it as a failed batch below. if !*exceeded_retries { + // Set the batch back to `AwaitingDownload` before retrying. + // This is to ensure that the batch doesn't get stuck in `Downloading` state. + // + // DataColumn retries has a retry limit so calling `downloading_to_awaiting_download` + // is safe. + if let BatchOperationOutcome::Failed { blacklist } = + batch.downloading_to_awaiting_download()? 
+ { + return Err(RemoveChain::ChainFailed { + blacklist, + failing_batch: batch_id, + }); + } return self.retry_partial_batch( network, batch_id, @@ -936,7 +945,10 @@ impl SyncingChain { failing_batch: batch_id, }); } - self.send_batch(network, batch_id) + // The errored batch is set to AwaitingDownload above. + // We now just attempt to download all batches stuck in `AwaitingDownload` + // state in the right order. + self.attempt_send_awaiting_download_batches(network, "injecting error") } else { debug!( batch_epoch = %batch_id, @@ -969,7 +981,7 @@ impl SyncingChain { .collect(); debug!( ?awaiting_downloads, - src, "Attempting to send batches awaiting downlaod" + src, "Attempting to send batches awaiting download" ); for batch_id in awaiting_downloads { @@ -998,11 +1010,11 @@ impl SyncingChain { let (request, batch_type) = batch.to_blocks_by_range_request(); let failed_peers = batch.failed_peers(); - let synced_peers = network + let synced_column_peers = network .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -1013,7 +1025,13 @@ impl SyncingChain { chain_id: self.id, batch_id, }, - &synced_peers, + // Request blocks only from peers of this specific chain + &self.peers, + // Request column from all synced peers, even if they are not part of this chain. + // This is to avoid splitting of good column peers across many head chains in a heavy forking + // environment. If the column peers and block peer are on different chains, then we return + // a coupling error and retry only the columns that failed to couple. See `Self::retry_partial_batch`. 
+ &synced_column_peers, &failed_peers, ) { Ok(request_id) => { @@ -1081,7 +1099,7 @@ impl SyncingChain { .network_globals() .peers .read() - .synced_peers_for_epoch(batch_id, Some(&self.peers)) + .synced_peers_for_epoch(batch_id) .cloned() .collect::>(); @@ -1093,6 +1111,8 @@ impl SyncingChain { &failed_columns, ) { Ok(_) => { + // inform the batch about the new request + batch.start_downloading(id)?; debug!( ?batch_id, id, "Retried column requests from different peers" @@ -1100,6 +1120,8 @@ impl SyncingChain { return Ok(KeepChain); } Err(e) => { + // No need to explicitly fail the batch since its in `AwaitingDownload` state + // before we attempted to retry. debug!(?batch_id, id, e, "Failed to retry partial batch"); } } @@ -1123,6 +1145,9 @@ impl SyncingChain { ) -> Result { let _guard = self.span.clone().entered(); debug!("Resuming chain"); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers before. + self.attempt_send_awaiting_download_batches(network, "resume")?; // Request more batches if needed. self.request_batches(network)?; // If there is any batch ready for processing, send it. From 66f88f6bb4ee6628f9f41d55b32ffc4f3e5dcd5d Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 21 Oct 2025 13:24:43 +1100 Subject: [PATCH 11/16] Use `millis_from_slot_start` when comparing against reconstruction deadline (#8246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This recent PR below changes the max reconstruction delay to be a function of slot time. However it uses `seconds_from_slot_start` when comparing (and dropping `nano`), so it might delay reconstruction on networks where the slot time isn’t a multiple of 4, e.g. on gnosis this only happens at 2s instead of 1.25s.: - https://github.com/sigp/lighthouse/pull/8067#discussion_r2443875068 Use `millis_from_slot_start` when comparing against reconstruction deadline Also added some tests for reconstruction delay. 
Co-Authored-By: Jimmy Chen --- .../src/scheduler/work_reprocessing_queue.rs | 118 +++++++++++++++++- 1 file changed, 115 insertions(+), 3 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 9ff26e7841..c99388287c 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -761,10 +761,10 @@ impl ReprocessQueue { let reconstruction_deadline_millis = (slot_duration * RECONSTRUCTION_DEADLINE.0) / RECONSTRUCTION_DEADLINE.1; let reconstruction_deadline = Duration::from_millis(reconstruction_deadline_millis); - if let Some(seconds_from_current_slot) = - self.slot_clock.seconds_from_current_slot_start() + if let Some(duration_from_current_slot) = + self.slot_clock.millis_from_current_slot_start() && let Some(current_slot) = self.slot_clock.now() - && seconds_from_current_slot >= reconstruction_deadline + && duration_from_current_slot >= reconstruction_deadline && current_slot == request.slot { // If we are at least `reconstruction_deadline` seconds into the current slot, @@ -1227,4 +1227,116 @@ mod tests { // The entry for the block root should be gone. 
assert!(queue.awaiting_lc_updates_per_parent_root.is_empty()); } + + async fn test_reconstruction_immediate_at_deadline(slot_duration_secs: u64) { + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, _) = mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(slot_duration_secs)); + let mut queue = ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock); + + let slot_duration = queue.slot_clock.slot_duration(); + let reconstruction_deadline_millis = (slot_duration.as_millis() as u64 + * RECONSTRUCTION_DEADLINE.0) + / RECONSTRUCTION_DEADLINE.1; + let reconstruction_deadline = Duration::from_millis(reconstruction_deadline_millis); + + // Advance time to just after the deadline + advance_time( + &queue.slot_clock, + reconstruction_deadline + Duration::from_millis(10), + ) + .await; + + let current_slot = queue.slot_clock.now().unwrap(); + let block_root = Hash256::repeat_byte(0xaa); + + // Queue a reconstruction for the current slot after the deadline + let reconstruction_request = QueuedColumnReconstruction { + block_root, + slot: current_slot, + process_fn: Box::pin(async {}), + }; + queue.handle_message(InboundEvent::Msg( + ReprocessQueueMessage::DelayColumnReconstruction(reconstruction_request), + )); + + assert_eq!(queue.queued_column_reconstructions.len(), 1); + + // Should be immediately ready (0 delay since we're past deadline) + let ready_msg = queue.next().await.unwrap(); + assert!(matches!( + ready_msg, + InboundEvent::ReadyColumnReconstruction(_) + )); + + if let InboundEvent::ReadyColumnReconstruction(reconstruction) = ready_msg { + assert_eq!(reconstruction.block_root, block_root); + queue.handle_message(InboundEvent::ReadyColumnReconstruction(reconstruction)); + } + + assert!(queue.queued_column_reconstructions.is_empty()); + } + + /// Tests that column reconstruction queued after the deadline 
is triggered immediately + /// on mainnet (12s slots). + /// + /// When a reconstruction for the current slot is queued after the reconstruction deadline + /// (1/4 of slot duration = 3s for mainnet), it should be processed immediately with 0 delay. + #[tokio::test] + async fn column_reconstruction_immediate_processing_at_deadline_mainnet() { + tokio::time::pause(); + test_reconstruction_immediate_at_deadline(12).await; + } + + /// Tests that column reconstruction queued after the deadline is triggered immediately + /// on Gnosis (5s slots). + /// + /// When a reconstruction for the current slot is queued after the reconstruction deadline + /// (1/4 of slot duration = 1.25s for Gnosis), it should be processed immediately with 0 delay. + #[tokio::test] + async fn column_reconstruction_immediate_processing_at_deadline_gnosis() { + tokio::time::pause(); + test_reconstruction_immediate_at_deadline(5).await; + } + + /// Tests that column reconstruction uses the standard delay when queued before the deadline. + /// + /// When a reconstruction for the current slot is queued before the deadline, it should wait + /// for the standard QUEUED_RECONSTRUCTION_DELAY (150ms) before being triggered. 
+ #[tokio::test] + async fn column_reconstruction_uses_standard_delay() { + tokio::time::pause(); + + let mut queue = test_queue(); + let current_slot = queue.slot_clock.now().unwrap(); + let block_root = Hash256::repeat_byte(0xcc); + + // Queue a reconstruction at the start of the slot (before deadline) + let reconstruction_request = QueuedColumnReconstruction { + block_root, + slot: current_slot, + process_fn: Box::pin(async {}), + }; + queue.handle_message(InboundEvent::Msg( + ReprocessQueueMessage::DelayColumnReconstruction(reconstruction_request), + )); + + assert_eq!(queue.queued_column_reconstructions.len(), 1); + + // Advance time by QUEUED_RECONSTRUCTION_DELAY + advance_time(&queue.slot_clock, QUEUED_RECONSTRUCTION_DELAY).await; + + // Should be ready after the standard delay + let ready_msg = queue.next().await.unwrap(); + assert!(matches!( + ready_msg, + InboundEvent::ReadyColumnReconstruction(_) + )); + + if let InboundEvent::ReadyColumnReconstruction(reconstruction) = ready_msg { + assert_eq!(reconstruction.block_root, block_root); + } + } } From 040d992132a62eafb4c80108f5eed77da97894bc Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:58:10 +0800 Subject: [PATCH 12/16] Add `version` to the response of beacon API `getPendingConsolidations` (#8251) * #7440 Co-Authored-By: Tan Chee Keong --- beacon_node/http_api/src/lib.rs | 22 +++++++++++++++------- beacon_node/http_api/tests/tests.rs | 15 +++++++++++---- common/eth2/src/lib.rs | 6 ++++-- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 7f6c97a0f8..170012b04b 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -1236,8 +1236,8 @@ pub fn serve( |state_id: StateId, task_spawner: TaskSpawner, chain: Arc>| { - task_spawner.blocking_json_task(Priority::P1, move || { - let (data, execution_optimistic, finalized) = state_id + 
task_spawner.blocking_response_task(Priority::P1, move || { + let (data, execution_optimistic, finalized, fork_name) = state_id .map_state_and_execution_optimistic_and_finalized( &chain, |state, execution_optimistic, finalized| { @@ -1247,15 +1247,23 @@ pub fn serve( )); }; - Ok((consolidations.clone(), execution_optimistic, finalized)) + Ok(( + consolidations.clone(), + execution_optimistic, + finalized, + state.fork_name_unchecked(), + )) }, )?; - Ok(api_types::ExecutionOptimisticFinalizedResponse { + execution_optimistic_finalized_beacon_response( + ResponseIncludesVersion::Yes(fork_name), + execution_optimistic, + finalized, data, - execution_optimistic: Some(execution_optimistic), - finalized: Some(finalized), - }) + ) + .map(|res| warp::reply::json(&res).into_response()) + .map(|resp| add_consensus_version_header(resp, fork_name)) }) }, ); diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 9c18a7c1e8..7c2282a488 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -1369,12 +1369,14 @@ impl ApiTester { .ok() .map(|(state, _execution_optimistic, _finalized)| state); - let result = self + let result = match self .client .get_beacon_states_pending_consolidations(state_id.0) .await - .unwrap() - .map(|res| res.data); + { + Ok(response) => response, + Err(e) => panic!("query failed incorrectly: {e:?}"), + }; if result.is_none() && state_opt.is_none() { continue; @@ -1383,7 +1385,12 @@ impl ApiTester { let state = state_opt.as_mut().expect("result should be none"); let expected = state.pending_consolidations().unwrap(); - assert_eq!(result.unwrap(), expected.to_vec()); + let response = result.unwrap(); + assert_eq!(response.data(), &expected.to_vec()); + + // Check that the version header is returned in the response + let fork_name = state.fork_name(&self.chain.spec).unwrap(); + assert_eq!(response.version(), Some(fork_name),); } self diff --git a/common/eth2/src/lib.rs 
b/common/eth2/src/lib.rs index 0423794d0d..995e6966ea 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -943,7 +943,7 @@ impl BeaconNodeHttpClient { pub async fn get_beacon_states_pending_consolidations( &self, state_id: StateId, - ) -> Result>>, Error> + ) -> Result>>, Error> { let mut path = self.eth_path(V1)?; @@ -954,7 +954,9 @@ impl BeaconNodeHttpClient { .push(&state_id.to_string()) .push("pending_consolidations"); - self.get_opt(path).await + self.get_fork_contextual(path, |fork| fork) + .await + .map(|opt| opt.map(BeaconResponse::ForkVersioned)) } /// `GET beacon/light_client/updates` From 21bab0899a5fc2201abfbc0be30e17bb81472cf6 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Wed, 22 Oct 2025 00:58:12 +1100 Subject: [PATCH 13/16] Improve block header signature handling (#8253) Closes: - https://github.com/sigp/lighthouse/issues/7650 Reject blob and data column sidecars from RPC with invalid signatures. Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 68 ++++++---- beacon_node/beacon_chain/src/test_utils.rs | 2 +- .../beacon_chain/tests/blob_verification.rs | 120 ++++++++++++++++++ .../beacon_chain/tests/column_verification.rs | 117 +++++++++++++++++ beacon_node/beacon_chain/tests/events.rs | 53 ++++---- beacon_node/beacon_chain/tests/main.rs | 2 + 6 files changed, 307 insertions(+), 55 deletions(-) create mode 100644 beacon_node/beacon_chain/tests/blob_verification.rs create mode 100644 beacon_node/beacon_chain/tests/column_verification.rs diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index e8db154a9b..ab157163f9 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -3564,7 +3564,7 @@ impl BeaconChain { .await } - fn check_blobs_for_slashability<'a>( + fn check_blob_header_signature_and_slashability<'a>( self: &Arc, block_root: Hash256, blobs: impl IntoIterator>, @@ -3575,17 +3575,20 @@ 
impl BeaconChain { .map(|b| b.signed_block_header.clone()) .unique() { - if verify_header_signature::(self, &header).is_ok() { - slashable_cache - .observe_slashable( - header.message.slot, - header.message.proposer_index, - block_root, - ) - .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; - if let Some(slasher) = self.slasher.as_ref() { - slasher.accept_block_header(header); - } + // Return an error if *any* header signature is invalid, we do not want to import this + // list of blobs into the DA checker. However, we will process any valid headers prior + // to the first invalid header in the slashable cache & slasher. + verify_header_signature::(self, &header)?; + + slashable_cache + .observe_slashable( + header.message.slot, + header.message.proposer_index, + block_root, + ) + .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; + if let Some(slasher) = self.slasher.as_ref() { + slasher.accept_block_header(header); } } Ok(()) @@ -3599,7 +3602,10 @@ impl BeaconChain { block_root: Hash256, blobs: FixedBlobSidecarList, ) -> Result { - self.check_blobs_for_slashability(block_root, blobs.iter().flatten().map(Arc::as_ref))?; + self.check_blob_header_signature_and_slashability( + block_root, + blobs.iter().flatten().map(Arc::as_ref), + )?; let availability = self .data_availability_checker .put_rpc_blobs(block_root, blobs)?; @@ -3616,12 +3622,15 @@ impl BeaconChain { ) -> Result { let availability = match engine_get_blobs_output { EngineGetBlobsOutput::Blobs(blobs) => { - self.check_blobs_for_slashability(block_root, blobs.iter().map(|b| b.as_blob()))?; + self.check_blob_header_signature_and_slashability( + block_root, + blobs.iter().map(|b| b.as_blob()), + )?; self.data_availability_checker .put_kzg_verified_blobs(block_root, blobs)? 
} EngineGetBlobsOutput::CustodyColumns(data_columns) => { - self.check_columns_for_slashability( + self.check_data_column_sidecar_header_signature_and_slashability( block_root, data_columns.iter().map(|c| c.as_data_column()), )?; @@ -3642,7 +3651,7 @@ impl BeaconChain { block_root: Hash256, custody_columns: DataColumnSidecarList, ) -> Result { - self.check_columns_for_slashability( + self.check_data_column_sidecar_header_signature_and_slashability( block_root, custody_columns.iter().map(|c| c.as_ref()), )?; @@ -3659,7 +3668,7 @@ impl BeaconChain { .await } - fn check_columns_for_slashability<'a>( + fn check_data_column_sidecar_header_signature_and_slashability<'a>( self: &Arc, block_root: Hash256, custody_columns: impl IntoIterator>, @@ -3673,17 +3682,20 @@ impl BeaconChain { .map(|c| c.signed_block_header.clone()) .unique() { - if verify_header_signature::(self, &header).is_ok() { - slashable_cache - .observe_slashable( - header.message.slot, - header.message.proposer_index, - block_root, - ) - .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; - if let Some(slasher) = self.slasher.as_ref() { - slasher.accept_block_header(header); - } + // Return an error if *any* header signature is invalid, we do not want to import this + // list of blobs into the DA checker. However, we will process any valid headers prior + // to the first invalid header in the slashable cache & slasher. 
+ verify_header_signature::(self, &header)?; + + slashable_cache + .observe_slashable( + header.message.slot, + header.message.proposer_index, + block_root, + ) + .map_err(|e| BlockError::BeaconChainError(Box::new(e.into())))?; + if let Some(slasher) = self.slasher.as_ref() { + slasher.accept_block_header(header); } } Ok(()) diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 1d57550156..0b125efa32 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -2437,7 +2437,7 @@ where } /// Builds an `RpcBlock` from a `SignedBeaconBlock` and `BlobsList`. - fn build_rpc_block_from_blobs( + pub fn build_rpc_block_from_blobs( &self, block_root: Hash256, block: Arc>>, diff --git a/beacon_node/beacon_chain/tests/blob_verification.rs b/beacon_node/beacon_chain/tests/blob_verification.rs new file mode 100644 index 0000000000..c42a2828c0 --- /dev/null +++ b/beacon_node/beacon_chain/tests/blob_verification.rs @@ -0,0 +1,120 @@ +#![cfg(not(debug_assertions))] + +use beacon_chain::test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, +}; +use beacon_chain::{ + AvailabilityProcessingStatus, BlockError, ChainConfig, InvalidSignature, NotifyExecutionLayer, + block_verification_types::AsBlock, +}; +use logging::create_test_tracing_subscriber; +use std::sync::{Arc, LazyLock}; +use types::{blob_sidecar::FixedBlobSidecarList, *}; + +type E = MainnetEthSpec; + +// Should ideally be divisible by 3. +const VALIDATOR_COUNT: usize = 24; + +/// A cached set of keys. 
+static KEYPAIRS: LazyLock> = + LazyLock::new(|| types::test_utils::generate_deterministic_keypairs(VALIDATOR_COUNT)); + +fn get_harness( + validator_count: usize, + spec: Arc, +) -> BeaconChainHarness> { + create_test_tracing_subscriber(); + let harness = BeaconChainHarness::builder(MainnetEthSpec) + .spec(spec) + .chain_config(ChainConfig { + reconstruct_historic_states: true, + ..ChainConfig::default() + }) + .keypairs(KEYPAIRS[0..validator_count].to_vec()) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + harness.advance_slot(); + + harness +} + +// Regression test for https://github.com/sigp/lighthouse/issues/7650 +#[tokio::test] +async fn rpc_blobs_with_invalid_header_signature() { + let spec = Arc::new(test_spec::()); + + // Only run this test if blobs are enabled and columns are disabled. + if spec.deneb_fork_epoch.is_none() || spec.is_fulu_scheduled() { + return; + } + + let harness = get_harness(VALIDATOR_COUNT, spec); + + let num_blocks = E::slots_per_epoch() as usize; + + // Add some chain depth. + harness + .extend_chain( + num_blocks, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Produce a block with blobs. + harness.execution_block_generator().set_min_blob_count(1); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (kzg_proofs, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + let block_root = signed_block.canonical_root(); + + // Process the block without blobs so that it doesn't become available. 
+ harness.advance_slot(); + let rpc_block = harness + .build_rpc_block_from_blobs(block_root, signed_block.clone(), None) + .unwrap(); + let availability = harness + .chain + .process_block( + block_root, + rpc_block, + NotifyExecutionLayer::Yes, + BlockImportSource::RangeSync, + || Ok(()), + ) + .await + .unwrap(); + assert_eq!( + availability, + AvailabilityProcessingStatus::MissingComponents(slot, block_root) + ); + + // Build blob sidecars with invalid signatures in the block header. + let mut corrupt_block = (*signed_block).clone(); + *corrupt_block.signature_mut() = Signature::infinity().unwrap(); + + let max_len = harness + .chain + .spec + .max_blobs_per_block(slot.epoch(E::slots_per_epoch())) as usize; + let mut blob_sidecars = FixedBlobSidecarList::new(vec![None; max_len]); + for (i, (kzg_proof, blob)) in kzg_proofs.into_iter().zip(blobs).enumerate() { + let blob_sidecar = BlobSidecar::new(i, blob, &corrupt_block, kzg_proof).unwrap(); + blob_sidecars[i] = Some(Arc::new(blob_sidecar)); + } + + let err = harness + .chain + .process_rpc_blobs(slot, block_root, blob_sidecars) + .await + .unwrap_err(); + assert!(matches!( + err, + BlockError::InvalidSignature(InvalidSignature::ProposerSignature) + )); +} diff --git a/beacon_node/beacon_chain/tests/column_verification.rs b/beacon_node/beacon_chain/tests/column_verification.rs new file mode 100644 index 0000000000..5cd3811ea5 --- /dev/null +++ b/beacon_node/beacon_chain/tests/column_verification.rs @@ -0,0 +1,117 @@ +#![cfg(not(debug_assertions))] + +use beacon_chain::test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, + generate_data_column_sidecars_from_block, test_spec, +}; +use beacon_chain::{ + AvailabilityProcessingStatus, BlockError, ChainConfig, InvalidSignature, NotifyExecutionLayer, + block_verification_types::AsBlock, +}; +use logging::create_test_tracing_subscriber; +use std::sync::{Arc, LazyLock}; +use types::*; + +type E = MainnetEthSpec; + +// Should 
ideally be divisible by 3. +const VALIDATOR_COUNT: usize = 24; + +/// A cached set of keys. +static KEYPAIRS: LazyLock> = + LazyLock::new(|| types::test_utils::generate_deterministic_keypairs(VALIDATOR_COUNT)); + +fn get_harness( + validator_count: usize, + spec: Arc, + supernode: bool, +) -> BeaconChainHarness> { + create_test_tracing_subscriber(); + let harness = BeaconChainHarness::builder(MainnetEthSpec) + .spec(spec) + .chain_config(ChainConfig { + reconstruct_historic_states: true, + ..ChainConfig::default() + }) + .keypairs(KEYPAIRS[0..validator_count].to_vec()) + .import_all_data_columns(supernode) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + harness.advance_slot(); + + harness +} + +// Regression test for https://github.com/sigp/lighthouse/issues/7650 +#[tokio::test] +async fn rpc_columns_with_invalid_header_signature() { + let spec = Arc::new(test_spec::()); + + // Only run this test if columns are enabled. + if !spec.is_fulu_scheduled() { + return; + } + + let supernode = true; + let harness = get_harness(VALIDATOR_COUNT, spec, supernode); + + let num_blocks = E::slots_per_epoch() as usize; + + // Add some chain depth. + harness + .extend_chain( + num_blocks, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Produce a block with blobs. + harness.execution_block_generator().set_min_blob_count(1); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (_, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + let block_root = signed_block.canonical_root(); + + // Process the block without blobs so that it doesn't become available. 
+ harness.advance_slot(); + let rpc_block = harness + .build_rpc_block_from_blobs(block_root, signed_block.clone(), None) + .unwrap(); + let availability = harness + .chain + .process_block( + block_root, + rpc_block, + NotifyExecutionLayer::Yes, + BlockImportSource::RangeSync, + || Ok(()), + ) + .await + .unwrap(); + assert_eq!( + availability, + AvailabilityProcessingStatus::MissingComponents(slot, block_root) + ); + + // Build blob sidecars with invalid signatures in the block header. + let mut corrupt_block = (*signed_block).clone(); + *corrupt_block.signature_mut() = Signature::infinity().unwrap(); + + let data_column_sidecars = + generate_data_column_sidecars_from_block(&corrupt_block, &harness.chain.spec); + + let err = harness + .chain + .process_rpc_custody_columns(data_column_sidecars) + .await + .unwrap_err(); + assert!(matches!( + err, + BlockError::InvalidSignature(InvalidSignature::ProposerSignature) + )); +} diff --git a/beacon_node/beacon_chain/tests/events.rs b/beacon_node/beacon_chain/tests/events.rs index 0fc097ae8f..466058eea3 100644 --- a/beacon_node/beacon_chain/tests/events.rs +++ b/beacon_node/beacon_chain/tests/events.rs @@ -1,15 +1,13 @@ use beacon_chain::blob_verification::GossipVerifiedBlob; use beacon_chain::data_column_verification::GossipVerifiedDataColumn; -use beacon_chain::test_utils::{BeaconChainHarness, TEST_DATA_COLUMN_SIDECARS_SSZ}; +use beacon_chain::test_utils::{BeaconChainHarness, generate_data_column_sidecars_from_block}; use eth2::types::{EventKind, SseBlobSidecar, SseDataColumnSidecar}; use rand::SeedableRng; use rand::rngs::StdRng; use std::sync::Arc; use types::blob_sidecar::FixedBlobSidecarList; use types::test_utils::TestRandom; -use types::{ - BlobSidecar, DataColumnSidecar, EthSpec, ForkName, MinimalEthSpec, RuntimeVariableList, Slot, -}; +use types::{BlobSidecar, DataColumnSidecar, EthSpec, ForkName, MinimalEthSpec, Slot}; type E = MinimalEthSpec; @@ -108,19 +106,18 @@ async fn 
blob_sidecar_event_on_process_rpc_blobs() { let mut blob_event_receiver = event_handler.subscribe_blob_sidecar(); // build and process multiple rpc blobs - let kzg = harness.chain.kzg.as_ref(); - let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); + harness.execution_block_generator().set_min_blob_count(2); - let mut blob_1 = BlobSidecar::random_valid(&mut rng, kzg).unwrap(); - let mut blob_2 = BlobSidecar { - index: 1, - ..BlobSidecar::random_valid(&mut rng, kzg).unwrap() - }; - let parent_root = harness.chain.head().head_block_root(); - blob_1.signed_block_header.message.parent_root = parent_root; - blob_2.signed_block_header.message.parent_root = parent_root; - let blob_1 = Arc::new(blob_1); - let blob_2 = Arc::new(blob_2); + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = harness.make_block(head_state, slot).await; + let (kzg_proofs, blobs) = opt_blobs.unwrap(); + assert!(blobs.len() > 2); + + let blob_1 = + Arc::new(BlobSidecar::new(0, blobs[0].clone(), &signed_block, kzg_proofs[0]).unwrap()); + let blob_2 = + Arc::new(BlobSidecar::new(1, blobs[1].clone(), &signed_block, kzg_proofs[1]).unwrap()); let blobs = FixedBlobSidecarList::new(vec![Some(blob_1.clone()), Some(blob_2.clone())]); let expected_sse_blobs = vec![ @@ -130,7 +127,7 @@ async fn blob_sidecar_event_on_process_rpc_blobs() { let _ = harness .chain - .process_rpc_blobs(blob_1.slot(), blob_1.block_root(), blobs) + .process_rpc_blobs(slot, blob_1.block_root(), blobs) .await .unwrap(); @@ -159,20 +156,24 @@ async fn data_column_sidecar_event_on_process_rpc_columns() { let event_handler = harness.chain.event_handler.as_ref().unwrap(); let mut data_column_event_receiver = event_handler.subscribe_data_column_sidecar(); + // build a valid block + harness.execution_block_generator().set_min_blob_count(1); + + let head_state = harness.get_current_state(); + let slot = head_state.slot() + 1; + let ((signed_block, opt_blobs), _) = 
harness.make_block(head_state, slot).await; + let (_, blobs) = opt_blobs.unwrap(); + assert!(!blobs.is_empty()); + // load the precomputed column sidecar to avoid computing them for every block in the tests. - let mut sidecar = RuntimeVariableList::>::from_ssz_bytes( - TEST_DATA_COLUMN_SIDECARS_SSZ, - E::number_of_columns(), - ) - .unwrap()[0] - .clone(); - let parent_root = harness.chain.head().head_block_root(); - sidecar.signed_block_header.message.parent_root = parent_root; + let data_column_sidecars = + generate_data_column_sidecars_from_block(&signed_block, &harness.chain.spec); + let sidecar = data_column_sidecars[0].clone(); let expected_sse_data_column = SseDataColumnSidecar::from_data_column_sidecar(&sidecar); let _ = harness .chain - .process_rpc_custody_columns(vec![Arc::new(sidecar)]) + .process_rpc_custody_columns(vec![sidecar]) .await .unwrap(); diff --git a/beacon_node/beacon_chain/tests/main.rs b/beacon_node/beacon_chain/tests/main.rs index f0978c5f05..aec4416419 100644 --- a/beacon_node/beacon_chain/tests/main.rs +++ b/beacon_node/beacon_chain/tests/main.rs @@ -1,8 +1,10 @@ mod attestation_production; mod attestation_verification; mod bellatrix; +mod blob_verification; mod block_verification; mod capella; +mod column_verification; mod events; mod op_verification; mod payload_invalidation; From 46dde9afee389b3389c804b63ea8a061f8e9bf3d Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 21 Oct 2025 16:54:35 -0700 Subject: [PATCH 14/16] Fix data column rpc request (#8247) Fixes an issue mentioned in this comment regarding data column rpc requests: https://github.com/sigp/lighthouse/issues/6572#issuecomment-3400076236 Co-Authored-By: Eitan Seri-Levi Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 46 ++++++++++++++- beacon_node/beacon_chain/tests/store_tests.rs | 59 +++++++++++++++++++ .../network_beacon_processor/rpc_methods.rs | 47 +++++++++------ 3 files changed, 130 insertions(+), 22 deletions(-) diff --git 
a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index ab157163f9..e299bea2da 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -6946,9 +6946,49 @@ impl BeaconChain { pub fn update_data_column_custody_info(&self, slot: Option) { self.store .put_data_column_custody_info(slot) - .unwrap_or_else( - |e| tracing::error!(error = ?e, "Failed to update data column custody info"), - ); + .unwrap_or_else(|e| error!(error = ?e, "Failed to update data column custody info")); + } + + /// Get the earliest epoch in which the node has met its custody requirements. + /// A `None` response indicates that we've met our custody requirements up to the + /// column data availability window + pub fn earliest_custodied_data_column_epoch(&self) -> Option { + self.store + .get_data_column_custody_info() + .inspect_err( + |e| error!(error=?e, "Failed to get data column custody info from the store"), + ) + .ok() + .flatten() + .and_then(|info| info.earliest_data_column_slot) + .map(|slot| { + let mut epoch = slot.epoch(T::EthSpec::slots_per_epoch()); + // If the earliest custodied slot isn't the first slot in the epoch + // The node has only met its custody requirements for the next epoch. + if slot > epoch.start_slot(T::EthSpec::slots_per_epoch()) { + epoch += 1; + } + epoch + }) + } + + /// The data availability boundary for custodying columns. It will just be the + /// regular data availability boundary unless we are near the Fulu fork epoch. 
+ pub fn column_data_availability_boundary(&self) -> Option { + match self.data_availability_boundary() { + Some(da_boundary_epoch) => { + if let Some(fulu_fork_epoch) = self.spec.fulu_fork_epoch { + if da_boundary_epoch < fulu_fork_epoch { + Some(fulu_fork_epoch) + } else { + Some(da_boundary_epoch) + } + } else { + None // Fulu hasn't been enabled + } + } + None => None, // Deneb hasn't been enabled + } } /// This method serves to get a sense of the current chain health. It is used in block proposal diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 7940902d4c..ec5c1c90db 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -4369,6 +4369,65 @@ async fn fulu_prune_data_columns_fork_boundary() { check_data_column_existence(&harness, pruned_slot, harness.head_slot(), true); } +#[tokio::test] +async fn test_column_da_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + let fulu_fork_epoch = Epoch::new(4); + spec.fulu_fork_epoch = Some(fulu_fork_epoch); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. + panic!("PeerDAS not scheduled"); + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // The column da boundary should be the fulu fork epoch + assert_eq!( + harness.chain.column_data_availability_boundary(), + Some(fulu_fork_epoch) + ); +} + +#[tokio::test] +async fn test_earliest_custodied_data_column_epoch() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let custody_info_epoch = Epoch::new(4); + + if !store.get_chain_spec().is_peer_das_scheduled() { + // No-op if PeerDAS not scheduled. 
+ panic!("PeerDAS not scheduled"); + } + + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // earliest custody info is set to the last slot in `custody_info_epoch` + harness + .chain + .update_data_column_custody_info(Some(custody_info_epoch.end_slot(E::slots_per_epoch()))); + + // earliest custodied data column epoch should be `custody_info_epoch` + 1 + assert_eq!( + harness.chain.earliest_custodied_data_column_epoch(), + Some(custody_info_epoch + 1) + ); + + // earliest custody info is set to the first slot in `custody_info_epoch` + harness + .chain + .update_data_column_custody_info(Some(custody_info_epoch.start_slot(E::slots_per_epoch()))); + + // earliest custodied data column epoch should be `custody_info_epoch` + assert_eq!( + harness.chain.earliest_custodied_data_column_epoch(), + Some(custody_info_epoch) + ); +} + /// Check that blob pruning prunes data columns older than the data availability boundary with /// margin applied. #[tokio::test] diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 0fcd67dbf1..a81595322b 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -1204,33 +1204,42 @@ impl NetworkBeaconProcessor { let request_start_slot = Slot::from(req.start_slot); - let data_availability_boundary_slot = match self.chain.data_availability_boundary() { - Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), - None => { - debug!("Deneb fork is disabled"); - return Err((RpcErrorResponse::InvalidRequest, "Deneb fork is disabled")); - } - }; + let column_data_availability_boundary_slot = + match self.chain.column_data_availability_boundary() { + Some(boundary) => boundary.start_slot(T::EthSpec::slots_per_epoch()), + None => { + debug!("Fulu fork is disabled"); + return Err((RpcErrorResponse::InvalidRequest, "Fulu fork is 
disabled")); + } + }; - let oldest_data_column_slot = self - .chain - .store - .get_data_column_info() - .oldest_data_column_slot - .unwrap_or(data_availability_boundary_slot); + let earliest_custodied_data_column_slot = + match self.chain.earliest_custodied_data_column_epoch() { + Some(earliest_custodied_epoch) => { + let earliest_custodied_slot = + earliest_custodied_epoch.start_slot(T::EthSpec::slots_per_epoch()); + // Ensure the earliest columns we serve are within the data availability window + if earliest_custodied_slot < column_data_availability_boundary_slot { + column_data_availability_boundary_slot + } else { + earliest_custodied_slot + } + } + None => column_data_availability_boundary_slot, + }; - if request_start_slot < oldest_data_column_slot { + if request_start_slot < earliest_custodied_data_column_slot { debug!( %request_start_slot, - %oldest_data_column_slot, - %data_availability_boundary_slot, - "Range request start slot is older than data availability boundary." + %earliest_custodied_data_column_slot, + %column_data_availability_boundary_slot, + "Range request start slot is older than the earliest custodied data column slot." ); - return if data_availability_boundary_slot < oldest_data_column_slot { + return if earliest_custodied_data_column_slot > column_data_availability_boundary_slot { Err(( RpcErrorResponse::ResourceUnavailable, - "blobs pruned within boundary", + "columns pruned within boundary", )) } else { Err(( From 33e21634cb8842f396e4537342b4c84dc5095649 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 21 Oct 2025 20:51:34 -0700 Subject: [PATCH 15/16] Custody backfill sync (#7907) #7603 #### Custody backfill sync service Similar in many ways to the current backfill service. There may be ways to unify the two services. The difficulty there is that the current backfill service tightly couples blocks and their associated blobs/data columns. Any attempts to unify the two services should be left to a separate PR in my opinion. 
#### `SyncNeworkContext` `SyncNetworkContext` manages custody sync data columns by range requests separetly from other sync RPC requests. I think this is a nice separation considering that custody backfill is its own service. #### Data column import logic The import logic verifies KZG committments and that the data columns block root matches the block root in the nodes store before importing columns #### New channel to send messages to `SyncManager` Now external services can communicate with the `SyncManager`. In this PR this channel is used to trigger a custody sync. Alternatively we may be able to use the existing `mpsc` channel that the `SyncNetworkContext` uses to communicate with the `SyncManager`. I will spend some time reviewing this. Co-Authored-By: Eitan Seri-Levi Co-Authored-By: Eitan Seri- Levi Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> --- beacon_node/beacon_chain/src/beacon_chain.rs | 89 ++ beacon_node/beacon_chain/src/errors.rs | 1 + .../src/historical_data_columns.rs | 151 +++ beacon_node/beacon_chain/src/lib.rs | 1 + .../beacon_chain/src/validator_custody.rs | 38 +- beacon_node/beacon_chain/tests/store_tests.rs | 401 ++++++ beacon_node/client/src/notifier.rs | 95 ++ beacon_node/http_api/src/lib.rs | 4 +- .../src/service/api_types.rs | 55 +- .../lighthouse_network/src/types/globals.rs | 6 + .../lighthouse_network/src/types/mod.rs | 2 +- beacon_node/lighthouse_tracing/src/lib.rs | 4 + beacon_node/network/src/metrics.rs | 16 + .../src/network_beacon_processor/mod.rs | 17 + .../network_beacon_processor/sync_methods.rs | 107 +- .../network/src/sync/backfill_sync/mod.rs | 40 +- .../src/sync/{range_sync => }/batch.rs | 209 +-- .../src/sync/block_sidecar_coupling.rs | 65 +- .../src/sync/custody_backfill_sync/mod.rs | 1126 +++++++++++++++++ beacon_node/network/src/sync/manager.rs | 175 ++- beacon_node/network/src/sync/mod.rs | 5 +- .../network/src/sync/network_context.rs | 126 +- .../sync/range_data_column_batch_request.rs | 297 
+++++ .../network/src/sync/range_sync/chain.rs | 52 +- .../network/src/sync/range_sync/mod.rs | 8 +- .../network/src/sync/range_sync/range.rs | 3 +- beacon_node/store/src/hot_cold_store.rs | 13 + common/eth2/src/lighthouse/sync_state.rs | 35 +- consensus/types/src/slot_epoch.rs | 7 + scripts/tests/checkpoint-sync.sh | 10 +- 30 files changed, 2958 insertions(+), 200 deletions(-) create mode 100644 beacon_node/beacon_chain/src/historical_data_columns.rs rename beacon_node/network/src/sync/{range_sync => }/batch.rs (82%) create mode 100644 beacon_node/network/src/sync/custody_backfill_sync/mod.rs create mode 100644 beacon_node/network/src/sync/range_data_column_batch_request.rs diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index e299bea2da..152de1a20b 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -6991,6 +6991,95 @@ impl BeaconChain { } } + /// Safely update data column custody info by ensuring that: + /// - cgc values at the updated epoch and the earliest custodied column epoch are equal + /// - we are only decrementing the earliest custodied data column epoch by one epoch + /// - the new earliest data column slot is set to the first slot in `effective_epoch`. 
+ pub fn safely_backfill_data_column_custody_info( + &self, + effective_epoch: Epoch, + ) -> Result<(), Error> { + let Some(earliest_data_column_epoch) = self.earliest_custodied_data_column_epoch() else { + return Ok(()); + }; + + if effective_epoch >= earliest_data_column_epoch { + return Ok(()); + } + + let cgc_at_effective_epoch = self + .data_availability_checker + .custody_context() + .custody_group_count_at_epoch(effective_epoch, &self.spec); + + let cgc_at_earliest_data_colum_epoch = self + .data_availability_checker + .custody_context() + .custody_group_count_at_epoch(earliest_data_column_epoch, &self.spec); + + let can_update_data_column_custody_info = cgc_at_effective_epoch + == cgc_at_earliest_data_colum_epoch + && effective_epoch == earliest_data_column_epoch - 1; + + if can_update_data_column_custody_info { + self.store.put_data_column_custody_info(Some( + effective_epoch.start_slot(T::EthSpec::slots_per_epoch()), + ))?; + } else { + error!( + ?cgc_at_effective_epoch, + ?cgc_at_earliest_data_colum_epoch, + ?effective_epoch, + ?earliest_data_column_epoch, + "Couldn't update data column custody info" + ); + return Err(Error::FailedColumnCustodyInfoUpdate); + } + + Ok(()) + } + + /// Compare columns custodied for `epoch` versus columns custodied for the head of the chain + /// and return any column indices that are missing. + pub fn get_missing_columns_for_epoch(&self, epoch: Epoch) -> HashSet { + let custody_context = self.data_availability_checker.custody_context(); + + let columns_required = custody_context + .custody_columns_for_epoch(None, &self.spec) + .iter() + .cloned() + .collect::>(); + + let current_columns_at_epoch = custody_context + .custody_columns_for_epoch(Some(epoch), &self.spec) + .iter() + .cloned() + .collect::>(); + + columns_required + .difference(¤t_columns_at_epoch) + .cloned() + .collect::>() + } + + /// The da boundary for custodying columns. It will just be the DA boundary unless we are near the Fulu fork epoch. 
+ pub fn get_column_da_boundary(&self) -> Option { + match self.data_availability_boundary() { + Some(da_boundary_epoch) => { + if let Some(fulu_fork_epoch) = self.spec.fulu_fork_epoch { + if da_boundary_epoch < fulu_fork_epoch { + Some(fulu_fork_epoch) + } else { + Some(da_boundary_epoch) + } + } else { + None + } + } + None => None, // If no DA boundary set, dont try to custody backfill + } + } + /// This method serves to get a sense of the current chain health. It is used in block proposal /// to determine whether we should outsource payload production duties. /// diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index 7b04a36fae..d4eba2b0ea 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -247,6 +247,7 @@ pub enum BeaconChainError { cache_epoch: Epoch, }, SkipProposerPreparation, + FailedColumnCustodyInfoUpdate, } easy_from_to!(SlotProcessingError, BeaconChainError); diff --git a/beacon_node/beacon_chain/src/historical_data_columns.rs b/beacon_node/beacon_chain/src/historical_data_columns.rs new file mode 100644 index 0000000000..7e196eb75e --- /dev/null +++ b/beacon_node/beacon_chain/src/historical_data_columns.rs @@ -0,0 +1,151 @@ +use std::collections::{HashMap, HashSet}; + +use crate::{ + BeaconChain, BeaconChainError, BeaconChainTypes, + data_column_verification::verify_kzg_for_data_column_list, +}; +use store::{Error as StoreError, KeyValueStore}; +use tracing::{Span, debug, instrument}; +use types::{ColumnIndex, DataColumnSidecarList, Epoch, EthSpec, Hash256, Slot}; + +#[derive(Debug)] +pub enum HistoricalDataColumnError { + // The provided data column sidecar pertains to a block that doesn't exist in the database. + NoBlockFound { + data_column_block_root: Hash256, + expected_block_root: Hash256, + }, + + /// Logic error: should never occur. 
+ IndexOutOfBounds, + + /// The provided data column sidecar list doesn't contain columns for the full range of slots for the given epoch. + MissingDataColumns { + missing_slots_and_data_columns: Vec<(Slot, ColumnIndex)>, + }, + + /// The provided data column sidecar list contains at least one column with an invalid kzg commitment. + InvalidKzg, + + /// Internal store error + StoreError(StoreError), + + /// Internal beacon chain error + BeaconChainError(Box), +} + +impl From for HistoricalDataColumnError { + fn from(e: StoreError) -> Self { + Self::StoreError(e) + } +} + +impl BeaconChain { + /// Store a batch of historical data columns in the database. + /// + /// The data columns block roots and proposer signatures are verified with the existing + /// block stored in the DB. This function also verifies the columns KZG committments. + /// + /// This function requires that the data column sidecar list contains columns for a full epoch. + /// + /// Return the number of `data_columns` successfully imported. 
+ #[instrument(skip_all, fields(columns_imported_count = tracing::field::Empty ))] + pub fn import_historical_data_column_batch( + &self, + epoch: Epoch, + historical_data_column_sidecar_list: DataColumnSidecarList, + ) -> Result { + let mut total_imported = 0; + let mut ops = vec![]; + + let unique_column_indices = historical_data_column_sidecar_list + .iter() + .map(|item| item.index) + .collect::>(); + + let mut slot_and_column_index_to_data_columns = historical_data_column_sidecar_list + .iter() + .map(|data_column| ((data_column.slot(), data_column.index), data_column)) + .collect::>(); + + let forward_blocks_iter = self + .forwards_iter_block_roots_until( + epoch.start_slot(T::EthSpec::slots_per_epoch()), + epoch.end_slot(T::EthSpec::slots_per_epoch()), + ) + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + for block_iter_result in forward_blocks_iter { + let (block_root, slot) = block_iter_result + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + for column_index in unique_column_indices.clone() { + if let Some(data_column) = + slot_and_column_index_to_data_columns.remove(&(slot, column_index)) + { + if self + .store + .get_data_column(&block_root, &data_column.index)? + .is_some() + { + debug!( + block_root = ?block_root, + column_index = data_column.index, + "Skipping data column import as identical data column exists" + ); + continue; + } + if block_root != data_column.block_root() { + return Err(HistoricalDataColumnError::NoBlockFound { + data_column_block_root: data_column.block_root(), + expected_block_root: block_root, + }); + } + self.store.data_column_as_kv_store_ops( + &block_root, + data_column.clone(), + &mut ops, + ); + total_imported += 1; + } + } + } + + // If we've made it to here with no columns to import, this means there are no blobs for this epoch. 
+ // `RangeDataColumnBatchRequest` logic should have caught any bad peers withholding columns + if historical_data_column_sidecar_list.is_empty() { + if !ops.is_empty() { + // This shouldn't be a valid case. If there are no columns to import, + // there should be no generated db operations. + return Err(HistoricalDataColumnError::IndexOutOfBounds); + } + } else { + verify_kzg_for_data_column_list(historical_data_column_sidecar_list.iter(), &self.kzg) + .map_err(|_| HistoricalDataColumnError::InvalidKzg)?; + + self.store.blobs_db.do_atomically(ops)?; + } + + if !slot_and_column_index_to_data_columns.is_empty() { + debug!( + ?epoch, + extra_data = ?slot_and_column_index_to_data_columns.keys().map(|(slot, _)| slot), + "We've received unexpected extra data columns, these will not be imported" + ); + } + + self.data_availability_checker + .custody_context() + .update_and_backfill_custody_count_at_epoch(epoch); + + self.safely_backfill_data_column_custody_info(epoch) + .map_err(|e| HistoricalDataColumnError::BeaconChainError(Box::new(e)))?; + + debug!(?epoch, total_imported, "Imported historical data columns"); + + let current_span = Span::current(); + current_span.record("columns_imported_count", total_imported); + + Ok(total_imported) + } +} diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index 9d8c3dba38..fd2162e7d3 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -28,6 +28,7 @@ pub mod fork_choice_signal; pub mod fork_revert; pub mod graffiti_calculator; pub mod historical_blocks; +pub mod historical_data_columns; pub mod kzg_utils; pub mod light_client_finality_update_verification; pub mod light_client_optimistic_update_verification; diff --git a/beacon_node/beacon_chain/src/validator_custody.rs b/beacon_node/beacon_chain/src/validator_custody.rs index 3ab76828c9..ea1dfdaae0 100644 --- a/beacon_node/beacon_chain/src/validator_custody.rs +++ 
b/beacon_node/beacon_chain/src/validator_custody.rs @@ -10,7 +10,7 @@ use types::data_column_custody_group::{CustodyIndex, compute_columns_for_custody use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, Slot}; /// A delay before making the CGC change effective to the data availability checker. -const CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS: u64 = 30; +pub const CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS: u64 = 30; /// Number of slots after which a validator's registration is removed if it has not re-registered. const VALIDATOR_REGISTRATION_EXPIRY_SLOTS: Slot = Slot::new(256); @@ -30,8 +30,10 @@ struct ValidatorRegistrations { /// /// Note: Only stores the epoch value when there's a change in custody requirement. /// So if epoch 10 and 11 has the same custody requirement, only 10 is stored. - /// This map is never pruned, because currently we never decrease custody requirement, so this - /// map size is contained at 128. + /// This map is only pruned during custody backfill. If epoch 11 has custody requirements + /// that are then backfilled to epoch 10, the value at epoch 11 will be removed and epoch 10 + /// will be added to the map instead. This should keep map size constrained to a maximum + /// value of 128. epoch_validator_custody_requirements: BTreeMap, } @@ -99,6 +101,25 @@ impl ValidatorRegistrations { None } } + + /// Updates the `epoch_validator_custody_requirements` map by pruning all values on/after `effective_epoch` + /// and updating the map to store the latest validator custody requirements for the `effective_epoch`. + pub fn backfill_validator_custody_requirements(&mut self, effective_epoch: Epoch) { + if let Some(latest_validator_custody) = self.latest_validator_custody_requirement() { + // Delete records if + // 1. The epoch is greater than or equal than `effective_epoch` + // 2. 
the cgc requirements match the latest validator custody requirements + self.epoch_validator_custody_requirements + .retain(|&epoch, custody_requirement| { + !(epoch >= effective_epoch && *custody_requirement == latest_validator_custody) + }); + + self.epoch_validator_custody_requirements + .entry(effective_epoch) + .and_modify(|old_custody| *old_custody = latest_validator_custody) + .or_insert(latest_validator_custody); + } + } } /// Given the `validator_custody_units`, return the custody requirement based on @@ -250,6 +271,7 @@ impl CustodyContext { ); return Some(CustodyCountChanged { new_custody_group_count: updated_cgc, + old_custody_group_count: current_cgc, sampling_count: self.num_of_custody_groups_to_sample(effective_epoch, spec), effective_epoch, }); @@ -282,7 +304,7 @@ impl CustodyContext { /// minimum sampling size which may exceed the custody group count (CGC). /// /// See also: [`Self::num_of_custody_groups_to_sample`]. - fn custody_group_count_at_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> u64 { + pub fn custody_group_count_at_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> u64 { if self.current_is_supernode { spec.number_of_custody_groups } else { @@ -360,14 +382,22 @@ impl CustodyContext { .all_custody_columns_ordered .get() .expect("all_custody_columns_ordered should be initialized"); + &all_columns_ordered[..custody_group_count] } + + pub fn update_and_backfill_custody_count_at_epoch(&self, effective_epoch: Epoch) { + self.validator_registrations + .write() + .backfill_validator_custody_requirements(effective_epoch); + } } /// The custody count changed because of a change in the /// number of validators being managed. 
pub struct CustodyCountChanged { pub new_custody_group_count: u64, + pub old_custody_group_count: u64, pub sampling_count: u64, pub effective_epoch: Epoch, } diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index ec5c1c90db..69d16b3071 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -4,12 +4,14 @@ use beacon_chain::attestation_verification::Error as AttnError; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::builder::BeaconChainBuilder; use beacon_chain::data_availability_checker::AvailableBlock; +use beacon_chain::historical_data_columns::HistoricalDataColumnError; use beacon_chain::schema_change::migrate_schema; use beacon_chain::test_utils::SyncCommitteeStrategy; use beacon_chain::test_utils::{ AttestationStrategy, BeaconChainHarness, BlockStrategy, DiskHarnessType, get_kzg, mock_execution_layer_from_parts, test_spec, }; +use beacon_chain::validator_custody::CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS; use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, BlockError, ChainConfig, NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, @@ -3169,6 +3171,245 @@ async fn weak_subjectivity_sync_test( assert_eq!(store.get_anchor_info().state_upper_limit, Slot::new(0)); } +#[tokio::test] +async fn test_import_historical_data_columns_batch() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Epoch::new(0).start_slot(E::slots_per_epoch()) + 1; + let end_slot = Epoch::new(0).end_slot(E::slots_per_epoch()); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + 
harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + for data_column in data_columns.unwrap() { + data_columns_list.push(data_column); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + harness + .chain + .import_historical_data_column_batch(Epoch::new(0), data_columns_list) + .unwrap(); + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()) + } +} + +// This should verify that a data column sidecar containing mismatched block roots should fail to be imported. 
+#[tokio::test] +async fn test_import_historical_data_columns_batch_mismatched_block_root() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Slot::new(1); + let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + + for data_column in data_columns.unwrap() { + let mut data_column = (*data_column).clone(); + if data_column.index % 2 == 0 { + data_column.signed_block_header.message.body_root = Hash256::ZERO; + } + + data_columns_list.push(Arc::new(data_column)); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + let error = harness + .chain + .import_historical_data_column_batch( + start_slot.epoch(E::slots_per_epoch()), + data_columns_list, + ) + .unwrap_err(); + + assert!(matches!( + error, + 
HistoricalDataColumnError::NoBlockFound { .. } + )); +} + +// This should verify that a data column sidecar associated to a block root that doesn't exist in the store cannot +// be imported. +#[tokio::test] +async fn test_import_historical_data_columns_batch_no_block_found() { + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, StoreConfig::default(), spec); + let start_slot = Slot::new(1); + let end_slot = Slot::new(E::slots_per_epoch() * 2 - 1); + + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + harness + .extend_chain( + (E::slots_per_epoch() * 2) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + harness.advance_slot(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + let mut data_columns_list = vec![]; + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_some()); + + for data_column in data_columns.unwrap() { + let mut data_column = (*data_column).clone(); + data_column.signed_block_header.message.body_root = Hash256::ZERO; + data_columns_list.push(Arc::new(data_column)); + } + } + + harness + .extend_chain( + (E::slots_per_epoch() * 4) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + harness.advance_slot(); + + harness + .chain + .store + .try_prune_blobs(true, Epoch::new(2)) + .unwrap(); + + let block_root_iter = harness + .chain + .forwards_iter_block_roots_until(start_slot, end_slot) + .unwrap(); + + for block in block_root_iter { + let (block_root, _) = block.unwrap(); + let data_columns = harness.chain.store.get_data_columns(&block_root).unwrap(); + assert!(data_columns.is_none()) + } + + let error = harness + .chain + 
.import_historical_data_column_batch(Epoch::new(0), data_columns_list) + .unwrap_err(); + + assert!(matches!( + error, + HistoricalDataColumnError::NoBlockFound { .. } + )); +} + /// Test that blocks and attestations that refer to states around an unaligned split state are /// processed correctly. #[tokio::test] @@ -4845,6 +5086,166 @@ async fn test_custody_column_filtering_supernode() { ); } +#[tokio::test] +async fn test_missing_columns_after_cgc_change() { + let spec = test_spec::(); + + let num_validators = 8; + + let num_epochs_before_increase = 4; + + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.clone().into()) + .deterministic_keypairs(num_validators) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + let state = harness.chain.head_beacon_state_cloned(); + + if !state.fork_name_unchecked().fulu_enabled() { + return; + } + + let custody_context = harness.chain.data_availability_checker.custody_context(); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * num_epochs_before_increase) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let epoch_before_increase = Epoch::new(num_epochs_before_increase); + + let missing_columns = harness + .chain + .get_missing_columns_for_epoch(epoch_before_increase); + + // We should have no missing columns + assert_eq!(missing_columns.len(), 0); + + let epoch_after_increase = Epoch::new(num_epochs_before_increase + 2); + + let cgc_change_slot = epoch_before_increase.end_slot(E::slots_per_epoch()); + custody_context.register_validators(vec![(1, 32_000_000_000 * 9)], cgc_change_slot, &spec); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * 5) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // We should have missing columns from before the cgc increase + let missing_columns = harness + .chain + 
.get_missing_columns_for_epoch(epoch_before_increase); + + assert!(!missing_columns.is_empty()); + + // We should have no missing columns after the cgc increase + let missing_columns = harness + .chain + .get_missing_columns_for_epoch(epoch_after_increase); + + assert!(missing_columns.is_empty()); +} + +#[tokio::test] +async fn test_safely_backfill_data_column_custody_info() { + let spec = test_spec::(); + + let num_validators = 8; + + let start_epochs = 4; + + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.clone().into()) + .deterministic_keypairs(num_validators) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + + let state = harness.chain.head_beacon_state_cloned(); + + if !state.fork_name_unchecked().fulu_enabled() { + return; + } + + let custody_context = harness.chain.data_availability_checker.custody_context(); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * start_epochs) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let epoch_before_increase = Epoch::new(start_epochs); + let effective_delay_slots = + CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS / harness.chain.spec.seconds_per_slot; + + let cgc_change_slot = epoch_before_increase.end_slot(E::slots_per_epoch()); + + custody_context.register_validators(vec![(1, 32_000_000_000 * 16)], cgc_change_slot, &spec); + + let epoch_after_increase = + (cgc_change_slot + effective_delay_slots).epoch(E::slots_per_epoch()); + + harness.advance_slot(); + harness + .extend_chain( + (E::slots_per_epoch() * 5) as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let head_slot = harness.chain.head().snapshot.beacon_block.slot(); + + harness + .chain + .update_data_column_custody_info(Some(head_slot)); + + // We can only safely update custody column info 1 epoch at a time + // Skipping an epoch should return an error + harness + .chain + 
.safely_backfill_data_column_custody_info(head_slot.epoch(E::slots_per_epoch()) - 2) + .unwrap_err(); + + // Iterate from the head epoch back to 0 and try to backfill data column custody info + for epoch in (0..head_slot.epoch(E::slots_per_epoch()).into()).rev() { + // This is an epoch before the cgc change took effect, we shouldn't be able to update + // without performing custody backfill sync + if epoch <= epoch_after_increase.into() { + harness + .chain + .safely_backfill_data_column_custody_info(Epoch::new(epoch)) + .unwrap_err(); + } else { + // This is an epoch after the cgc change took effect, we should be able to update + // as long as we iterate epoch by epoch + harness + .chain + .safely_backfill_data_column_custody_info(Epoch::new(epoch)) + .unwrap(); + let earliest_available_epoch = harness + .chain + .earliest_custodied_data_column_epoch() + .unwrap(); + assert_eq!(Epoch::new(epoch), earliest_available_epoch); + } + } +} + /// Checks that two chains are the same, for the purpose of these tests. /// /// Several fields that are hard/impossible to check are ignored (e.g., the store). diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index c83cdad7e0..10d9587ccc 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -57,6 +57,9 @@ pub fn spawn_notifier( // Store info if we are required to do a backfill sync. let original_oldest_block_slot = beacon_chain.store.get_anchor_info().oldest_block_slot; + // Use this info during custody backfill sync. + let mut original_earliest_data_column_slot = None; + let interval_future = async move { // Perform pre-genesis logging. loop { @@ -80,6 +83,7 @@ pub fn spawn_notifier( // Perform post-genesis logging. let mut last_backfill_log_slot = None; + let mut last_custody_backfill_log_slot = None; loop { // Run the notifier half way through each slot. 
@@ -112,6 +116,18 @@ pub fn spawn_notifier( let mut speedo = speedo.lock().await; speedo.clear(); } + (_, SyncState::CustodyBackFillSyncing { .. }) => { + // We have transitioned to a custody backfill sync. Reset the speedo. + let mut speedo = speedo.lock().await; + last_custody_backfill_log_slot = None; + speedo.clear(); + } + (SyncState::CustodyBackFillSyncing { .. }, _) => { + // We have transitioned from a custody backfill sync, reset the speedo + let mut speedo = speedo.lock().await; + last_custody_backfill_log_slot = None; + speedo.clear(); + } (_, _) => {} } current_sync_state = sync_state; @@ -154,6 +170,38 @@ pub fn spawn_notifier( Instant::now(), ); } + SyncState::CustodyBackFillSyncing { .. } => { + match beacon_chain.store.get_data_column_custody_info() { + Ok(data_column_custody_info) => { + if let Some(earliest_data_column_slot) = data_column_custody_info + .and_then(|info| info.earliest_data_column_slot) + && let Some(da_boundary) = beacon_chain.get_column_da_boundary() + { + sync_distance = earliest_data_column_slot.saturating_sub( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()), + ); + + // We keep track of our starting point for custody backfill sync + // so we can measure our speed of progress. + if original_earliest_data_column_slot.is_none() { + original_earliest_data_column_slot = + Some(earliest_data_column_slot) + } + + if let Some(original_earliest_data_column_slot) = + original_earliest_data_column_slot + { + speedo.observe( + original_earliest_data_column_slot + .saturating_sub(earliest_data_column_slot), + Instant::now(), + ); + } + } + } + Err(e) => error!(error=?e, "Unable to get data column custody info"), + } + } SyncState::SyncingFinalized { .. } | SyncState::SyncingHead { .. } | SyncState::SyncTransition => { @@ -190,6 +238,8 @@ pub fn spawn_notifier( // Log if we are backfilling. let is_backfilling = matches!(current_sync_state, SyncState::BackFillSyncing { .. 
}); + let is_custody_backfilling = + matches!(current_sync_state, SyncState::CustodyBackFillSyncing { .. }); if is_backfilling && last_backfill_log_slot .is_none_or(|slot| slot + BACKFILL_LOG_INTERVAL <= current_slot) @@ -234,6 +284,51 @@ pub fn spawn_notifier( info!("Historical block download complete"); } + if is_custody_backfilling + && last_custody_backfill_log_slot + .is_none_or(|slot| slot + BACKFILL_LOG_INTERVAL <= current_slot) + { + last_custody_backfill_log_slot = Some(current_slot); + + let distance = format!( + "{} slots ({})", + sync_distance.as_u64(), + slot_distance_pretty(sync_distance, slot_duration) + ); + + let speed = speedo.slots_per_second(); + let display_speed = speed.is_some_and(|speed| speed != 0.0); + + if display_speed { + info!( + distance, + speed = sync_speed_pretty(speed), + est_time = + estimated_time_pretty(beacon_chain.get_column_da_boundary().and_then( + |da_boundary| speedo.estimated_time_till_slot( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()) + ) + )), + "Downloading historical data columns" + ); + } else { + info!( + distance, + est_time = + estimated_time_pretty(beacon_chain.get_column_da_boundary().and_then( + |da_boundary| speedo.estimated_time_till_slot( + da_boundary.start_slot(T::EthSpec::slots_per_epoch()) + ) + )), + "Downloading historical data columns" + ); + } + } else if !is_custody_backfilling && last_custody_backfill_log_slot.is_some() { + last_custody_backfill_log_slot = None; + original_earliest_data_column_slot = None; + info!("Historical data column download complete"); + } + // Log if we are syncing if current_sync_state.is_syncing() { metrics::set_gauge(&metrics::IS_SYNCED, 0); diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 170012b04b..f6d8dbc157 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -478,7 +478,9 @@ pub fn serve( ))) } } - SyncState::SyncTransition | SyncState::BackFillSyncing { .. 
} => Ok(()), + SyncState::SyncTransition + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. } => Ok(()), SyncState::Synced => Ok(()), SyncState::Stalled => Ok(()), } diff --git a/beacon_node/lighthouse_network/src/service/api_types.rs b/beacon_node/lighthouse_network/src/service/api_types.rs index 0f5fd99c27..f1a4d87de7 100644 --- a/beacon_node/lighthouse_network/src/service/api_types.rs +++ b/beacon_node/lighthouse_network/src/service/api_types.rs @@ -60,8 +60,8 @@ pub struct BlobsByRangeRequestId { pub struct DataColumnsByRangeRequestId { /// Id to identify this attempt at a data_columns_by_range request for `parent_request_id` pub id: Id, - /// The Id of the overall By Range request for block components. - pub parent_request_id: ComponentsByRangeRequestId, + /// The Id of the overall By Range request for either a components by range request or a custody backfill request. + pub parent_request_id: DataColumnsByRangeRequester, /// The peer id associated with the request. /// /// This is useful to penalize the peer at a later point if it returned data columns that @@ -69,6 +69,12 @@ pub struct DataColumnsByRangeRequestId { pub peer: PeerId, } +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub enum DataColumnsByRangeRequester { + ComponentsByRange(ComponentsByRangeRequestId), + CustodyBackfillSync(CustodyBackFillBatchRequestId), +} + /// Block components by range request for range sync. Includes an ID for downstream consumers to /// handle retries and tie all their sub requests together. #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] @@ -80,6 +86,24 @@ pub struct ComponentsByRangeRequestId { pub requester: RangeRequestId, } +/// A batch of data columns by range request for custody sync. Includes an ID for downstream consumers to +/// handle retries and tie all the range requests for the given epoch together. 
+#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyBackFillBatchRequestId { + /// For each `epoch` we may request the same data in a later retry. This Id identifies the + /// current attempt. + pub id: Id, + pub batch_id: CustodyBackfillBatchId, +} + +/// Custody backfill may be restarted and sync each epoch multiple times in different runs. Identify +/// each batch by epoch and run_id for uniqueness. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct CustodyBackfillBatchId { + pub epoch: Epoch, + pub run_id: u64, +} + /// Range sync chain or backfill batch #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] pub enum RangeRequestId { @@ -217,6 +241,8 @@ impl_display!(ComponentsByRangeRequestId, "{}/{}", id, requester); impl_display!(DataColumnsByRootRequestId, "{}/{}", id, requester); impl_display!(SingleLookupReqId, "{}/Lookup/{}", req_id, lookup_id); impl_display!(CustodyId, "{}", requester); +impl_display!(CustodyBackFillBatchRequestId, "{}/{}", id, batch_id); +impl_display!(CustodyBackfillBatchId, "{}/{}", epoch, run_id); impl Display for DataColumnsByRootRequester { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { @@ -241,6 +267,15 @@ impl Display for RangeRequestId { } } +impl Display for DataColumnsByRangeRequester { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::ComponentsByRange(id) => write!(f, "ByRange/{id}"), + Self::CustodyBackfillSync(id) => write!(f, "CustodyBackfill/{id}"), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -263,15 +298,17 @@ mod tests { fn display_id_data_columns_by_range() { let id = DataColumnsByRangeRequestId { id: 123, - parent_request_id: ComponentsByRangeRequestId { - id: 122, - requester: RangeRequestId::RangeSync { - chain_id: 54, - batch_id: Epoch::new(0), + parent_request_id: DataColumnsByRangeRequester::ComponentsByRange( + ComponentsByRangeRequestId { + id: 122, + requester: RangeRequestId::RangeSync { + chain_id: 54, + batch_id: 
Epoch::new(0), + }, }, - }, + ), peer: PeerId::random(), }; - assert_eq!(format!("{id}"), "123/122/RangeSync/0/54"); + assert_eq!(format!("{id}"), "123/ByRange/122/RangeSync/0/54"); } } diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index b8c34f8392..2a3571c3b7 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -4,6 +4,7 @@ use crate::peer_manager::peerdb::PeerDB; use crate::rpc::{MetaData, MetaDataV3}; use crate::types::{BackFillState, SyncState}; use crate::{Client, Enr, GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use eth2::lighthouse::sync_state::CustodyBackFillState; use network_utils::enr_ext::EnrExt; use parking_lot::RwLock; use std::collections::HashSet; @@ -29,6 +30,8 @@ pub struct NetworkGlobals { pub sync_state: RwLock, /// The current state of the backfill sync. pub backfill_state: RwLock, + /// The current state of custody sync. + pub custody_sync_state: RwLock, /// The computed sampling subnets and columns is stored to avoid re-computing. pub sampling_subnets: RwLock>, /// Network-related configuration. Immutable after initialization. 
@@ -91,6 +94,9 @@ impl NetworkGlobals { gossipsub_subscriptions: RwLock::new(HashSet::new()), sync_state: RwLock::new(SyncState::Stalled), backfill_state: RwLock::new(BackFillState::Paused), + custody_sync_state: RwLock::new(CustodyBackFillState::Pending( + "Custody backfill sync initialized".to_string(), + )), sampling_subnets: RwLock::new(sampling_subnets), config, spec, diff --git a/beacon_node/lighthouse_network/src/types/mod.rs b/beacon_node/lighthouse_network/src/types/mod.rs index 0bbbcebaf2..3f57406fc7 100644 --- a/beacon_node/lighthouse_network/src/types/mod.rs +++ b/beacon_node/lighthouse_network/src/types/mod.rs @@ -10,7 +10,7 @@ pub type EnrSyncCommitteeBitfield = BitVector<::SyncCommitteeSu pub type Enr = discv5::enr::Enr; -pub use eth2::lighthouse::sync_state::{BackFillState, SyncState}; +pub use eth2::lighthouse::sync_state::{BackFillState, CustodyBackFillState, SyncState}; pub use globals::NetworkGlobals; pub use pubsub::{PubsubMessage, SnappyTransform}; pub use subnet::{Subnet, SubnetDiscovery}; diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index 18a9874252..56dccadaa9 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -26,7 +26,9 @@ pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block"; pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns"; pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment"; +pub const SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST: &str = "custody_backfill_sync_batch_request"; pub const SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL: &str = "process_chain_segment_backfill"; +pub const SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS: &str = "custody_backfill_sync_import_columns"; /// Fork choice root spans pub const SPAN_RECOMPUTE_HEAD: &str = "recompute_head_at_slot"; @@ -73,4 +75,6 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ 
SPAN_HANDLE_LIGHT_CLIENT_BOOTSTRAP, SPAN_HANDLE_LIGHT_CLIENT_OPTIMISTIC_UPDATE, SPAN_HANDLE_LIGHT_CLIENT_FINALITY_UPDATE, + SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST, + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, ]; diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index a2b5af8b08..cea06a28c8 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -212,6 +212,22 @@ pub static BEACON_PROCESSOR_RPC_BLOCK_IMPORTED_TOTAL: LazyLock, +> = LazyLock::new(|| { + try_create_int_counter( + "beacon_processor_custody_backfill_column_import_success_total", + "Total number of custody backfill sync columns successfully processed.", + ) +}); +pub static BEACON_PROCESSOR_CUSTODY_BACKFILL_BATCH_FAILED_TOTAL: LazyLock> = + LazyLock::new(|| { + try_create_int_counter( + "beacon_processor_custody_backfill_batch_failed_total", + "Total number of custody backfill batches that failed to be processed.", + ) + }); // Chain segments. pub static BEACON_PROCESSOR_CHAIN_SEGMENT_SUCCESS_TOTAL: LazyLock> = LazyLock::new(|| { diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 85ccde1d59..7441e92871 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -16,6 +16,7 @@ use lighthouse_network::rpc::methods::{ BlobsByRangeRequest, BlobsByRootRequest, DataColumnsByRangeRequest, DataColumnsByRootRequest, LightClientUpdatesByRangeRequest, }; +use lighthouse_network::service::api_types::CustodyBackfillBatchId; use lighthouse_network::{ Client, MessageId, NetworkGlobals, PeerId, PubsubMessage, rpc::{BlocksByRangeRequest, BlocksByRootRequest, LightClientBootstrapRequest, StatusMessage}, @@ -492,6 +493,22 @@ impl NetworkBeaconProcessor { }) } + pub fn send_historic_data_columns( + self: &Arc, + batch_id: CustodyBackfillBatchId, + data_columns: DataColumnSidecarList, + ) -> 
Result<(), Error> { + let processor = self.clone(); + let process_fn = move || processor.process_historic_data_columns(batch_id, data_columns); + + let work = Work::ChainSegmentBackfill(Box::new(process_fn)); + + self.try_send(BeaconWorkEvent { + drop_during_sync: true, + work, + }) + } + /// Create a new work event to import `blocks` as a beacon chain segment. pub fn send_chain_segment( self: &Arc, diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 1d99540c29..41b12fa01b 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -1,6 +1,7 @@ use crate::metrics::{self, register_process_result_metrics}; use crate::network_beacon_processor::{FUTURE_SLOT_TOLERANCE, NetworkBeaconProcessor}; use crate::sync::BatchProcessResult; +use crate::sync::manager::CustodyBatchProcessResult; use crate::sync::{ ChainId, manager::{BlockProcessType, SyncMessage}, @@ -8,6 +9,7 @@ use crate::sync::{ use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; use beacon_chain::data_availability_checker::AvailabilityCheckError; use beacon_chain::data_availability_checker::MaybeAvailableBlock; +use beacon_chain::historical_data_columns::HistoricalDataColumnError; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChainTypes, BlockError, ChainSegmentResult, HistoricalBlockError, NotifyExecutionLayer, validator_monitor::get_slot_delay_ms, @@ -18,15 +20,17 @@ use beacon_processor::{ }; use beacon_processor::{Work, WorkEvent}; use lighthouse_network::PeerAction; +use lighthouse_network::service::api_types::CustodyBackfillBatchId; use lighthouse_tracing::{ - SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, - SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, SPAN_PROCESS_CHAIN_SEGMENT, + 
SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK, + SPAN_PROCESS_RPC_CUSTODY_COLUMNS, }; use logging::crit; use std::sync::Arc; use std::time::Duration; use store::KzgCommitment; -use tracing::{debug, error, info, instrument, warn}; +use tracing::{debug, debug_span, error, info, instrument, warn}; use types::beacon_block_body::format_kzg_commitments; use types::blob_sidecar::FixedBlobSidecarList; use types::{BlockImportSource, DataColumnSidecarList, Epoch, Hash256}; @@ -418,6 +422,103 @@ impl NetworkBeaconProcessor { }); } + pub fn process_historic_data_columns( + &self, + batch_id: CustodyBackfillBatchId, + downloaded_columns: DataColumnSidecarList, + ) { + let _guard = debug_span!( + SPAN_CUSTODY_BACKFILL_SYNC_IMPORT_COLUMNS, + epoch = %batch_id.epoch, + columns_received_count = downloaded_columns.len() + ) + .entered(); + + let sent_columns = downloaded_columns.len(); + let result = match self + .chain + .import_historical_data_column_batch(batch_id.epoch, downloaded_columns) + { + Ok(imported_columns) => { + metrics::inc_counter_by( + &metrics::BEACON_PROCESSOR_CUSTODY_BACKFILL_COLUMN_IMPORT_SUCCESS_TOTAL, + imported_columns as u64, + ); + CustodyBatchProcessResult::Success { + sent_columns, + imported_columns, + } + } + Err(e) => { + metrics::inc_counter( + &metrics::BEACON_PROCESSOR_CUSTODY_BACKFILL_BATCH_FAILED_TOTAL, + ); + let peer_action: Option = match &e { + HistoricalDataColumnError::NoBlockFound { + data_column_block_root, + expected_block_root, + } => { + debug!( + error = "no_block_found", + ?data_column_block_root, + ?expected_block_root, + "Custody backfill batch processing error" + ); + // The peer is faulty if they send blocks with bad roots. + Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::MissingDataColumns { .. } => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + // The peer is faulty if they don't return data columns + // that they advertised as available. 
+ Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::InvalidKzg => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + // The peer is faulty if they don't return data columns + // with valid kzg commitments. + Some(PeerAction::LowToleranceError) + } + HistoricalDataColumnError::BeaconChainError(e) => { + match &**e { + beacon_chain::BeaconChainError::FailedColumnCustodyInfoUpdate => {} + _ => { + warn!( + error = ?e, + "Custody backfill batch processing error", + ); + } + } + + // This is an internal error, don't penalize the peer + None + } + HistoricalDataColumnError::IndexOutOfBounds => { + error!( + error = ?e, + "Custody backfill batch out of bounds error" + ); + // This should never occur, don't penalize the peer. + None + } + HistoricalDataColumnError::StoreError(e) => { + warn!(error = ?e, "Custody backfill batch processing error"); + // This is an internal error, don't penalize the peer. + None + } + }; + CustodyBatchProcessResult::Error { peer_action } + } + }; + self.send_sync_message(SyncMessage::CustodyBatchProcessed { result, batch_id }); + } + /// Attempt to import the chain segment (`blocks`) to the beacon chain, informing the sync /// thread if more blocks are needed to process it. #[instrument( diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 00597586b8..6c0cbd7e55 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -9,24 +9,27 @@ //! sync as failed, log an error and attempt to retry once a new peer joins the node. 
use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::batch::{ + BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, +}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::manager::BatchProcessResult; use crate::sync::network_context::{ RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, }; -use crate::sync::range_sync::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, -}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; +use std::collections::hash_map::DefaultHasher; use std::collections::{ HashSet, btree_map::{BTreeMap, Entry}, }; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; use std::sync::Arc; use tracing::{debug, error, info, warn}; use types::{ColumnIndex, Epoch, EthSpec}; @@ -49,21 +52,27 @@ const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 10; /// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 10; -/// Custom configuration for the batch object. -struct BackFillBatchConfig {} +type RpcBlocks = Vec>; -impl BatchConfig for BackFillBatchConfig { +type BackFillBatchInfo = BatchInfo, RpcBlocks>; + +type BackFillSyncBatches = BTreeMap>; + +/// Custom configuration for the batch object. 
+struct BackFillBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for BackFillBatchConfig { fn max_batch_download_attempts() -> u8 { MAX_BATCH_DOWNLOAD_ATTEMPTS } fn max_batch_processing_attempts() -> u8 { MAX_BATCH_PROCESSING_ATTEMPTS } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; + fn batch_attempt_hash(data: &D) -> u64 { let mut hasher = DefaultHasher::new(); - blocks.hash(&mut hasher); + data.hash(&mut hasher); hasher.finish() } } @@ -121,7 +130,7 @@ pub struct BackFillSync { last_batch_downloaded: bool, /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, + batches: BackFillSyncBatches, /// The current processing batch, if any. current_processing_batch: Option, @@ -349,7 +358,7 @@ impl BackFillSync { // reasons. Check that this block belongs to the expected peer // TODO(das): removed peer_id matching as the node may request a different peer for data // columns. - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); @@ -393,12 +402,13 @@ impl BackFillSync { // sending an error /timeout) if the peer is removed from the chain for other // reasons. 
Check that this block belongs to the expected peer, and that the // request_id matches - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(ProcessResult::Successful); } + let received = blocks.len(); match batch.download_completed(blocks, *peer_id) { - Ok(received) => { + Ok(_) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; debug!( @@ -1050,7 +1060,7 @@ impl BackFillSync { // only request batches up to the buffer size limit // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { + let in_buffer = |batch: &BackFillBatchInfo| { matches!( batch.state(), BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/batch.rs similarity index 82% rename from beacon_node/network/src/sync/range_sync/batch.rs rename to beacon_node/network/src/sync/batch.rs index c79800bfbe..ea0ef15f4b 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/batch.rs @@ -2,29 +2,28 @@ use beacon_chain::block_verification_types::RpcBlock; use derivative::Derivative; use lighthouse_network::PeerId; use lighthouse_network::rpc::methods::BlocksByRangeRequest; +use lighthouse_network::rpc::methods::DataColumnsByRangeRequest; use lighthouse_network::service::api_types::Id; use std::collections::HashSet; -use std::fmt; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; +use std::marker::PhantomData; use std::ops::Sub; -use std::time::{Duration, Instant}; +use std::time::Duration; +use std::time::Instant; use strum::Display; -use types::{Epoch, EthSpec, Slot}; +use types::Slot; +use types::{DataColumnSidecarList, Epoch, EthSpec}; -/// The number of times to retry a batch before it is considered 
failed. -const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; - -/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed -/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. -const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; +pub type BatchId = Epoch; /// Type of expected batch. -#[derive(Debug, Copy, Clone, Display)] +#[derive(Debug, Clone, Display)] #[strum(serialize_all = "snake_case")] pub enum ByRangeRequestType { BlocksAndColumns, BlocksAndBlobs, Blocks, + Columns(HashSet), } /// Allows customisation of the above constants used in other sync methods such as BackFillSync. @@ -60,28 +59,10 @@ pub trait BatchConfig { /// Note that simpler hashing functions considered in the past (hash of first block, hash of last /// block, number of received blocks) are not good enough to differentiate attempts. For this /// reason, we hash the complete set of blocks both in RangeSync and BackFillSync. - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64; + fn batch_attempt_hash(data: &D) -> u64; } #[derive(Debug)] -pub struct RangeSyncBatchConfig {} - -impl BatchConfig for RangeSyncBatchConfig { - fn max_batch_download_attempts() -> u8 { - MAX_BATCH_DOWNLOAD_ATTEMPTS - } - fn max_batch_processing_attempts() -> u8 { - MAX_BATCH_PROCESSING_ATTEMPTS - } - fn batch_attempt_hash(blocks: &[RpcBlock]) -> u64 { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - blocks.hash(&mut hasher); - hasher.finish() - } -} - -/// Error type of a batch in a wrong state. -// Such errors should never be encountered. pub struct WrongState(pub(crate) String); /// After batch operations, we use this to communicate whether a batch can continue or not @@ -100,28 +81,30 @@ pub enum BatchProcessingResult { #[derive(Derivative)] #[derivative(Debug)] /// A segment of a chain. -pub struct BatchInfo { +pub struct BatchInfo { /// Start slot of the batch. start_slot: Slot, /// End slot of the batch. 
end_slot: Slot, /// The `Attempts` that have been made and failed to send us this batch. - failed_processing_attempts: Vec, + failed_processing_attempts: Vec>, /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. failed_download_attempts: Vec>, /// State of the batch. - state: BatchState, + state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. batch_type: ByRangeRequestType, /// Pin the generic #[derivative(Debug = "ignore")] - marker: std::marker::PhantomData, + marker: std::marker::PhantomData<(E, B)>, } -impl fmt::Display for BatchInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display + for BatchInfo +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, "Start Slot: {}, End Slot: {}, State: {}", @@ -132,21 +115,21 @@ impl fmt::Display for BatchInfo { #[derive(Display)] /// Current state of a batch -pub enum BatchState { +pub enum BatchState { /// The batch has failed either downloading or processing, but can be requested again. AwaitingDownload, /// The batch is being downloaded. Downloading(Id), /// The batch has been completely downloaded and is ready for processing. - AwaitingProcessing(PeerId, Vec>, Instant), + AwaitingProcessing(PeerId, D, Instant), /// The batch is being processed. - Processing(Attempt), + Processing(Attempt), /// The batch was successfully processed and is waiting to be validated. /// /// It is not sufficient to process a batch successfully to consider it correct. This is /// because batches could be erroneously empty, or incomplete. Therefore, a batch is considered /// valid, only if the next sequential batch imports at least a block. - AwaitingValidation(Attempt), + AwaitingValidation(Attempt), /// Intermediate state for inner state handling. 
Poisoned, /// The batch has maxed out the allowed attempts for either downloading or processing. It @@ -154,14 +137,14 @@ pub enum BatchState { Failed, } -impl BatchState { +impl BatchState { /// Helper function for poisoning a state. - pub fn poison(&mut self) -> BatchState { + pub fn poison(&mut self) -> BatchState { std::mem::replace(self, BatchState::Poisoned) } } -impl BatchInfo { +impl BatchInfo { /// Batches are downloaded excluding the first block of the epoch assuming it has already been /// downloaded. /// @@ -178,13 +161,13 @@ impl BatchInfo { pub fn new(start_epoch: &Epoch, num_of_epochs: u64, batch_type: ByRangeRequestType) -> Self { let start_slot = start_epoch.start_slot(E::slots_per_epoch()); let end_slot = start_slot + num_of_epochs * E::slots_per_epoch(); - BatchInfo { + Self { start_slot, end_slot, failed_processing_attempts: Vec::new(), failed_download_attempts: Vec::new(), non_faulty_processing_attempts: 0, - state: BatchState::AwaitingDownload, + state: BatchState::::AwaitingDownload, batch_type, marker: std::marker::PhantomData, } @@ -208,8 +191,8 @@ impl BatchInfo { peers } - /// Verifies if an incoming block belongs to this batch. - pub fn is_expecting_block(&self, request_id: &Id) -> bool { + /// Verifies if an incoming request id to this batch. + pub fn is_expecting_request_id(&self, request_id: &Id) -> bool { if let BatchState::Downloading(expected_id) = &self.state { return expected_id == request_id; } @@ -227,30 +210,6 @@ impl BatchInfo { } } - /// Returns the count of stored pending blocks if in awaiting processing state - pub fn pending_blocks(&self) -> usize { - match &self.state { - BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), - BatchState::AwaitingDownload - | BatchState::Downloading { .. } - | BatchState::Processing { .. } - | BatchState::AwaitingValidation { .. } - | BatchState::Poisoned - | BatchState::Failed => 0, - } - } - - /// Returns a BlocksByRange request associated with the batch. 
- pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) { - ( - BlocksByRangeRequest::new( - self.start_slot.into(), - self.end_slot.sub(self.start_slot).into(), - ), - self.batch_type, - ) - } - /// After different operations over a batch, this could be in a state that allows it to /// continue, or in failed state. When the batch has failed, we check if it did mainly due to /// processing failures. In this case the batch is considered failed and faulty. @@ -265,27 +224,22 @@ impl BatchInfo { } } - pub fn state(&self) -> &BatchState { + pub fn state(&self) -> &BatchState { &self.state } - pub fn attempts(&self) -> &[Attempt] { + pub fn attempts(&self) -> &[Attempt] { &self.failed_processing_attempts } - /// Marks the batch as ready to be processed if the blocks are in the range. The number of - /// received blocks is returned, or the wrong batch end on failure + /// Marks the batch as ready to be processed if the data columns are in the range. The number of + /// received columns is returned, or the wrong batch end on failure #[must_use = "Batch may have failed"] - pub fn download_completed( - &mut self, - blocks: Vec>, - peer: PeerId, - ) -> Result { + pub fn download_completed(&mut self, data_columns: D, peer: PeerId) -> Result<(), WrongState> { match self.state.poison() { BatchState::Downloading(_) => { - let received = blocks.len(); - self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); - Ok(received) + self.state = BatchState::AwaitingProcessing(peer, data_columns, Instant::now()); + Ok(()) } BatchState::Poisoned => unreachable!("Poisoned batch"), other => { @@ -376,17 +330,17 @@ impl BatchInfo { } } - pub fn start_processing(&mut self) -> Result<(Vec>, Duration), WrongState> { + pub fn start_processing(&mut self) -> Result<(D, Duration), WrongState> { match self.state.poison() { - BatchState::AwaitingProcessing(peer, blocks, start_instant) => { - self.state = BatchState::Processing(Attempt::new::(peer, 
&blocks)); - Ok((blocks, start_instant.elapsed())) + BatchState::AwaitingProcessing(peer, data_columns, start_instant) => { + self.state = BatchState::Processing(Attempt::new::(peer, &data_columns)); + Ok((data_columns, start_instant.elapsed())) } BatchState::Poisoned => unreachable!("Poisoned batch"), other => { self.state = other; Err(WrongState(format!( - "Starting procesing batch in wrong state {:?}", + "Starting processing batch in wrong state {:?}", self.state ))) } @@ -466,37 +420,86 @@ impl BatchInfo { } } -/// Represents a peer's attempt and providing the result for this batch. -/// -/// Invalid attempts will downscore a peer. -#[derive(PartialEq, Debug)] -pub struct Attempt { +// BatchInfo implementations for RangeSync +impl BatchInfo>> { + /// Returns a BlocksByRange request associated with the batch. + pub fn to_blocks_by_range_request(&self) -> (BlocksByRangeRequest, ByRangeRequestType) { + ( + BlocksByRangeRequest::new( + self.start_slot.into(), + self.end_slot.sub(self.start_slot).into(), + ), + self.batch_type.clone(), + ) + } + + /// Returns the count of stored pending blocks if in awaiting processing state + pub fn pending_blocks(&self) -> usize { + match &self.state { + BatchState::AwaitingProcessing(_, blocks, _) => blocks.len(), + BatchState::AwaitingDownload + | BatchState::Downloading { .. } + | BatchState::Processing { .. } + | BatchState::AwaitingValidation { .. } + | BatchState::Poisoned + | BatchState::Failed => 0, + } + } +} + +// BatchInfo implementation for CustodyBackFillSync +impl BatchInfo> { + /// Returns a DataColumnsByRange request associated with the batch. 
+ pub fn to_data_columns_by_range_request( + &self, + ) -> Result { + match &self.batch_type { + ByRangeRequestType::Columns(columns) => Ok(DataColumnsByRangeRequest { + start_slot: self.start_slot.into(), + count: self.end_slot.sub(self.start_slot).into(), + columns: columns.clone().into_iter().collect(), + }), + _ => Err(WrongState( + "Custody backfill sync can only make data columns by range requests.".to_string(), + )), + } + } +} + +#[derive(Debug)] +pub struct Attempt { /// The peer that made the attempt. pub peer_id: PeerId, /// The hash of the blocks of the attempt. pub hash: u64, + /// Pin the generic. + marker: PhantomData, } -impl Attempt { - fn new(peer_id: PeerId, blocks: &[RpcBlock]) -> Self { - let hash = B::batch_attempt_hash(blocks); - Attempt { peer_id, hash } +impl Attempt { + fn new(peer_id: PeerId, data: &D) -> Self { + let hash = B::batch_attempt_hash(data); + Attempt { + peer_id, + hash, + marker: PhantomData, + } } } -impl std::fmt::Debug for BatchState { +impl std::fmt::Debug for BatchState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - BatchState::Processing(Attempt { peer_id, hash: _ }) => { + BatchState::Processing(Attempt { peer_id, .. }) => { write!(f, "Processing({})", peer_id) } - BatchState::AwaitingValidation(Attempt { peer_id, hash: _ }) => { + BatchState::AwaitingValidation(Attempt { peer_id, .. }) => { write!(f, "AwaitingValidation({})", peer_id) } BatchState::AwaitingDownload => f.write_str("AwaitingDownload"), BatchState::Failed => f.write_str("Failed"), - BatchState::AwaitingProcessing(peer, blocks, _) => { - write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) + BatchState::AwaitingProcessing(peer, ..) 
=> { + write!(f, "AwaitingProcessing({})", peer) } BatchState::Downloading(request_id) => { write!(f, "Downloading({})", request_id) @@ -506,7 +509,7 @@ impl std::fmt::Debug for BatchState { } } -impl BatchState { +impl BatchState { /// Creates a character representation/visualization for the batch state to display in logs for quicker and /// easier recognition fn visualize(&self) -> char { diff --git a/beacon_node/network/src/sync/block_sidecar_coupling.rs b/beacon_node/network/src/sync/block_sidecar_coupling.rs index ba89d11225..cd9276f7e3 100644 --- a/beacon_node/network/src/sync/block_sidecar_coupling.rs +++ b/beacon_node/network/src/sync/block_sidecar_coupling.rs @@ -36,7 +36,7 @@ pub struct RangeBlockComponentsRequest { pub(crate) request_span: Span, } -enum ByRangeRequest { +pub enum ByRangeRequest { Active(I), Complete(T), } @@ -435,7 +435,7 @@ impl RangeBlockComponentsRequest { } impl ByRangeRequest { - fn finish(&mut self, id: I, data: T) -> Result<(), String> { + pub fn finish(&mut self, id: I, data: T) -> Result<(), String> { match self { Self::Active(expected_id) => { if expected_id != &id { @@ -448,7 +448,7 @@ impl ByRangeRequest { } } - fn to_finished(&self) -> Option<&T> { + pub fn to_finished(&self) -> Option<&T> { match self { Self::Active(_) => None, Self::Complete(data) => Some(data), @@ -467,7 +467,7 @@ mod tests { PeerId, service::api_types::{ BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - DataColumnsByRangeRequestId, Id, RangeRequestId, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, Id, RangeRequestId, }, }; use rand::SeedableRng; @@ -501,7 +501,7 @@ mod tests { fn columns_id( id: Id, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: DataColumnsByRangeRequester, ) -> DataColumnsByRangeRequestId { DataColumnsByRangeRequestId { id, @@ -598,7 +598,15 @@ mod tests { let columns_req_id = expects_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, 
components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -657,7 +665,15 @@ mod tests { let columns_req_id = batched_column_requests .iter() .enumerate() - .map(|(i, columns)| (columns_id(i as Id, components_id), columns.clone())) + .map(|(i, columns)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + columns.clone(), + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( @@ -738,7 +754,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -816,7 +840,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - .map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, @@ -852,7 +884,10 @@ mod tests { assert!(result.is_err()); // AND: We retry with a new peer for the failed column - let new_columns_req_id = columns_id(10 as Id, components_id); + let new_columns_req_id = columns_id( + 10 as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ); let failed_column_requests = vec![(new_columns_req_id, vec![2])]; info.reinsert_failed_column_requests(failed_column_requests) .unwrap(); @@ -898,7 +933,15 @@ mod tests { let columns_req_id = expected_custody_columns .iter() .enumerate() - 
.map(|(i, column)| (columns_id(i as Id, components_id), vec![*column])) + .map(|(i, column)| { + ( + columns_id( + i as Id, + DataColumnsByRangeRequester::ComponentsByRange(components_id), + ), + vec![*column], + ) + }) .collect::>(); let mut info = RangeBlockComponentsRequest::::new( blocks_req_id, diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs new file mode 100644 index 0000000000..69df3422e6 --- /dev/null +++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs @@ -0,0 +1,1126 @@ +use std::{ + collections::{BTreeMap, HashSet, btree_map::Entry}, + marker::PhantomData, + sync::Arc, +}; + +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use lighthouse_network::{ + NetworkGlobals, PeerAction, PeerId, + service::api_types::{CustodyBackFillBatchRequestId, CustodyBackfillBatchId}, + types::CustodyBackFillState, +}; +use lighthouse_tracing::SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST; +use logging::crit; +use std::hash::{DefaultHasher, Hash, Hasher}; +use tracing::{debug, error, info, info_span, warn}; +use types::{DataColumnSidecarList, Epoch, EthSpec}; + +use crate::sync::{ + backfill_sync::{BACKFILL_EPOCHS_PER_BATCH, ProcessResult, SyncStart}, + batch::{ + BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + ByRangeRequestType, + }, + block_sidecar_coupling::CouplingError, + manager::CustodyBatchProcessResult, + network_context::{RpcResponseError, SyncNetworkContext}, +}; + +/// The maximum number of batches to queue before requesting more. +const BACKFILL_BATCH_BUFFER_SIZE: u8 = 5; + +/// Columns are downloaded in batches from peers. This constant specifies how many epochs worth of +/// columns per batch are requested _at most_. A batch may request less columns to account for +/// already requested columns. There is a timeout for each batch request. If this value is too high, +/// we will negatively report peers with poor bandwidth. 
This can be set arbitrarily high, in which +/// case the responder will fill the response up to the max request size, assuming they have the +/// bandwidth to do so. +pub const CUSTODY_BACKFILL_EPOCHS_PER_BATCH: u64 = 1; + +type CustodyBackFillBatchInfo = + BatchInfo, DataColumnSidecarList>; +type CustodyBackFillBatches = BTreeMap>; + +#[derive(Debug)] +pub struct CustodyBackFillBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for CustodyBackFillBatchConfig { + fn max_batch_download_attempts() -> u8 { + 5 + } + fn max_batch_processing_attempts() -> u8 { + 5 + } + fn batch_attempt_hash(data: &D) -> u64 { + let mut hasher = DefaultHasher::new(); + data.hash(&mut hasher); + hasher.finish() + } +} + +/// The ways a custody backfill sync can fail. +// The info in the enum variants is displayed in logging, clippy thinks it's dead code. +#[derive(Debug)] +pub enum CustodyBackfillError { + /// A batch failed to be downloaded. + BatchDownloadFailed(#[allow(dead_code)] BatchId), + /// A batch could not be processed. + BatchProcessingFailed(#[allow(dead_code)] BatchId), + /// A batch entered an invalid state. + BatchInvalidState(#[allow(dead_code)] BatchId, #[allow(dead_code)] String), + /// The sync algorithm entered an invalid state. + InvalidSyncState(#[allow(dead_code)] String), + /// The chain became paused. + Paused, +} + +pub struct CustodyBackFillSync { + /// Keeps track of the current progress of the custody backfill. + /// This only gets refreshed from the beacon chain if we enter a failed state. + current_start: BatchId, + + /// Starting epoch of the batch that needs to be processed next. + /// This is incremented as the chain advances. + processing_target: BatchId, + + /// The custody group count we are trying to fulfill up to the DA window. + /// This is used as an indicator to restart custody backfill sync if the cgc + /// was changed in the middle of a currently active sync. + cgc: u64, + + /// Run ID of this backfill process. 
Increments if sync restarts. Used to differentiate batch + /// results from different runs. + run_id: u64, + + /// Starting epoch of the next batch that needs to be downloaded. + to_be_downloaded: BatchId, + + /// Keeps track if we have requested the final batch. + last_batch_downloaded: bool, + + /// Sorted map of batches undergoing some kind of processing. + batches: CustodyBackFillBatches, + + /// The current processing batch, if any. + current_processing_batch: Option, + + /// Batches validated. + validated_batches: u64, + + /// These are batches that we've skipped because we have no columns to fetch for the epoch. + skipped_batches: HashSet, + + /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined. + /// This signifies that we are able to attempt to restart a failed chain. + restart_failed_sync: bool, + + /// Reference to the beacon chain to obtain initial starting points for custody backfill sync. + beacon_chain: Arc>, + + /// Reference to the network globals in order to obtain valid peers to backfill columns from + /// (i.e synced peers). + network_globals: Arc>, +} + +impl CustodyBackFillSync { + pub fn new( + beacon_chain: Arc>, + network_globals: Arc>, + ) -> Self { + Self { + current_start: Epoch::new(0), + processing_target: Epoch::new(0), + cgc: 0, + run_id: 0, + to_be_downloaded: Epoch::new(0), + last_batch_downloaded: false, + batches: BTreeMap::new(), + skipped_batches: HashSet::new(), + current_processing_batch: None, + validated_batches: 0, + restart_failed_sync: false, + beacon_chain, + network_globals, + } + } + + /// Pauses the custody sync if it's currently syncing. 
+ pub fn pause(&mut self, reason: String) { + if let CustodyBackFillState::Syncing = self.state() { + debug!(processed_epochs = %self.validated_batches, to_be_processed = %self.current_start,"Custody backfill sync paused"); + self.set_state(CustodyBackFillState::Pending(reason)); + } + } + + /// Checks if custody backfill sync should start and sets the missing columns + /// custody backfill sync will attempt to fetch. + /// The criteria to start custody sync is: + /// - The earliest data column epoch's custodied columns != previous epoch's custodied columns + /// - The earliest data column epoch is a finalied epoch + pub fn should_start_custody_backfill_sync(&mut self) -> bool { + let Some(da_boundary_epoch) = self.beacon_chain.get_column_da_boundary() else { + return false; + }; + + // This is the epoch in which we have met our current custody requirements + let Some(earliest_data_column_epoch) = + self.beacon_chain.earliest_custodied_data_column_epoch() + else { + return false; + }; + + // Check if we have missing columns between the da boundary and `earliest_data_column_epoch` + let missing_columns = self + .beacon_chain + .get_missing_columns_for_epoch(da_boundary_epoch); + + if !missing_columns.is_empty() { + let latest_finalized_epoch = self + .beacon_chain + .canonical_head + .cached_head() + .finalized_checkpoint() + .epoch; + + // Check that the earliest data column epoch is a finalized epoch. + return earliest_data_column_epoch <= latest_finalized_epoch; + } + + false + } + + fn restart_sync(&mut self) { + // Set state to paused + self.set_state(CustodyBackFillState::Pending( + "CGC count has changed and custody backfill sync needs to restart".to_string(), + )); + + // Remove all batches and active requests. 
+ self.batches.clear(); + self.skipped_batches.clear(); + self.restart_failed_sync = false; + + // Reset all downloading and processing targets + // NOTE: Lets keep validated_batches for posterity + self.processing_target = Epoch::new(0); + self.to_be_downloaded = Epoch::new(0); + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.validated_batches = 0; + self.run_id += 1; + + self.set_start_epoch(); + self.set_cgc(); + } + + fn restart_if_required(&mut self) -> bool { + let cgc_at_head = self + .beacon_chain + .data_availability_checker + .custody_context() + .custody_group_count_at_head(&self.beacon_chain.spec); + + if cgc_at_head != self.cgc { + self.restart_sync(); + return true; + } + + false + } + + /// Starts syncing. + #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn start( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + match self.state() { + CustodyBackFillState::Syncing => { + if self.restart_if_required() { + return Ok(SyncStart::NotSyncing); + } + + if self.check_completed() { + self.set_state(CustodyBackFillState::Completed); + return Ok(SyncStart::NotSyncing); + } + } + CustodyBackFillState::Pending(_) | CustodyBackFillState::Completed => { + if self.check_completed() { + self.set_state(CustodyBackFillState::Completed); + return Ok(SyncStart::NotSyncing); + } + self.set_cgc(); + + if !self.should_start_custody_backfill_sync() { + return Ok(SyncStart::NotSyncing); + } + self.set_start_epoch(); + if self + .network_globals + .peers + .read() + .synced_peers() + .next() + .is_some() + { + debug!( + run_id = self.run_id, + current_start = %self.current_start, + processing_target = %self.processing_target, + to_be_downloaded = %self.to_be_downloaded, + "Starting custody backfill sync" + ); + // If there are peers to resume with, begin the resume. 
+ self.set_state(CustodyBackFillState::Syncing); + // Resume any previously failed batches. + self.resume_batches(network)?; + // begin requesting blocks from the peer pool, until all peers are exhausted. + self.request_batches(network)?; + + // start processing batches if needed + self.process_completed_batches(network)?; + } else { + return Ok(SyncStart::NotSyncing); + } + } + } + + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return Ok(SyncStart::NotSyncing); + }; + + Ok(SyncStart::Syncing { + completed: (self.validated_batches + * CUSTODY_BACKFILL_EPOCHS_PER_BATCH + * T::EthSpec::slots_per_epoch()) as usize, + remaining: self + .current_start + .end_slot(T::EthSpec::slots_per_epoch()) + .saturating_sub(column_da_boundary.start_slot(T::EthSpec::slots_per_epoch())) + .as_usize(), + }) + } + + fn set_cgc(&mut self) { + self.cgc = self + .beacon_chain + .data_availability_checker + .custody_context() + .custody_group_count_at_head(&self.beacon_chain.spec); + } + + fn set_start_epoch(&mut self) { + let earliest_data_column_epoch = self + .beacon_chain + .earliest_custodied_data_column_epoch() + .unwrap_or(Epoch::new(0)); + + self.current_start = earliest_data_column_epoch + 1; + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + } + + /// Attempts to request the next required batches from the peer pool. It will exhaust the peer + /// pool and left over batches until the batch buffer is reached or all peers are exhausted. 
+ fn request_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), CustodyBackfillError> { + if !matches!(self.state(), CustodyBackFillState::Syncing) { + return Ok(()); + } + + // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. + while let Some(batch_id) = self.include_next_batch() { + // send the batch + self.send_batch(network, batch_id)?; + } + + // No more batches, simply stop + Ok(()) + } + + /// When resuming a chain, this function searches for batches that need to be re-downloaded and + /// transitions their state to redownload the batch. + fn resume_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result<(), CustodyBackfillError> { + let batch_ids_to_retry = self + .batches + .iter() + .filter_map(|(batch_id, batch)| { + // In principle there should only ever be on of these, and we could terminate the + // loop early, however the processing is negligible and we continue the search + // for robustness to handle potential future modification + if matches!(batch.state(), BatchState::AwaitingDownload) { + Some(*batch_id) + } else { + None + } + }) + .collect::>(); + + for batch_id in batch_ids_to_retry { + self.send_batch(network, batch_id)?; + } + Ok(()) + } + + /// Creates the next required batch from the chain. If there are no more batches required, + /// `None` is returned. + fn include_next_batch(&mut self) -> Option { + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return None; + }; + + let mut missing_columns = HashSet::new(); + + // Skip all batches (Epochs) that don't have missing columns. 
+ for epoch in Epoch::range_inclusive_rev(self.to_be_downloaded, column_da_boundary) { + missing_columns = self.beacon_chain.get_missing_columns_for_epoch(epoch); + + if !missing_columns.is_empty() { + self.to_be_downloaded = epoch; + break; + } + + // This batch is being skipped, insert it into the skipped batches mapping. + self.skipped_batches.insert(epoch); + + if epoch == column_da_boundary { + return None; + } + } + + // Don't request batches before the column da boundary + if self.to_be_downloaded < column_da_boundary { + return None; + } + + // Don't request batches beyond the DA window + if self.last_batch_downloaded { + return None; + } + + // Only request batches up to the buffer size limit + // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync + // if the current processing window is contained in a long range of skip slots. + let in_buffer = |batch: &CustodyBackFillBatchInfo| { + matches!( + batch.state(), + BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) 
+ ) + }; + if self + .batches + .iter() + .filter(|&(_epoch, batch)| in_buffer(batch)) + .count() + > BACKFILL_BATCH_BUFFER_SIZE as usize + { + return None; + } + + let batch_id = self.to_be_downloaded; + + match self.batches.entry(batch_id) { + Entry::Occupied(_) => { + // this batch doesn't need downloading, let this same function decide the next batch + if self.would_complete(batch_id) { + self.last_batch_downloaded = true; + } + + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + self.include_next_batch() + } + Entry::Vacant(entry) => { + entry.insert(BatchInfo::new( + &batch_id, + CUSTODY_BACKFILL_EPOCHS_PER_BATCH, + ByRangeRequestType::Columns(missing_columns), + )); + if self.would_complete(batch_id) { + self.last_batch_downloaded = true; + } + self.to_be_downloaded = self + .to_be_downloaded + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + Some(batch_id) + } + } + } + + /// Processes the batch with the given id. + /// The batch must exist and be ready for processing + fn process_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result { + // Check if we need to restart custody backfill sync due to a recent cgc change + if self.restart_if_required() { + return Ok(ProcessResult::Successful); + } + + if self.state() != CustodyBackFillState::Syncing || self.current_processing_batch.is_some() + { + return Ok(ProcessResult::Successful); + } + + let Some(batch) = self.batches.get_mut(&batch_id) else { + return self + .fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Trying to process a batch that does not exist: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + }; + + let (data_columns, _) = match batch.start_processing() { + Err(e) => { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)) + .map(|_| ProcessResult::Successful); + } + Ok(v) => v, + }; + + self.current_processing_batch = Some(batch_id); + + if let 
Err(e) = network.beacon_processor().send_historic_data_columns( + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + data_columns, + ) { + crit!( + msg = "process_batch", + error = %e, + batch = ?self.processing_target, + "Failed to send data columns to processor." + ); + // This is unlikely to happen but it would stall syncing since the batch now has no + // data columns to continue, and the chain is expecting a processing result that won't + // arrive. To mitigate this, (fake) fail this processing so that the batch is + // re-downloaded. + self.on_batch_process_result( + network, + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + &CustodyBatchProcessResult::Error { peer_action: None }, + ) + } else { + Ok(ProcessResult::Successful) + } + } + + /// A data column has been received for a batch. + /// If the column correctly completes the batch it will be processed if possible. + /// If this returns an error, custody sync has failed and will be restarted once new peers + /// join the system. + /// The sync manager should update the global sync state on failure. + #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn on_data_column_response( + &mut self, + network: &mut SyncNetworkContext, + req_id: CustodyBackFillBatchRequestId, + peer_id: &PeerId, + resp: Result, RpcResponseError>, + ) -> Result { + if req_id.batch_id.run_id != self.run_id { + debug!(%req_id, "Ignoring custody backfill download response from different run_id"); + return Ok(ProcessResult::Successful); + } + + let batch_id = req_id.batch_id.epoch; + // check if we have this batch + let Some(batch) = self.batches.get_mut(&batch_id) else { + if !matches!(self.state(), CustodyBackFillState::Pending(_)) { + // A batch might get removed when custody sync advances, so this is non fatal. 
+ debug!(epoch = %batch_id, "Received a column for unknown batch"); + } + return Ok(ProcessResult::Successful); + }; + + // A batch could be retried without the peer failing the request (disconnecting/ + // sending an error /timeout) if the peer is removed for other + // reasons. Check that this column belongs to the expected peer, and that the + // request_id matches + if !batch.is_expecting_request_id(&req_id.id) { + return Ok(ProcessResult::Successful); + } + + match resp { + Ok(data_columns) => { + let received = data_columns.len(); + + match batch.download_completed(data_columns, *peer_id) { + Ok(_) => { + let awaiting_batches = self.processing_target.saturating_sub(batch_id) + / CUSTODY_BACKFILL_EPOCHS_PER_BATCH; + debug!( + %req_id, + blocks = received, + %awaiting_batches, + "Completed batch received" + ); + + // pre-emptively request more columns from peers whilst we process current columns. + self.request_batches(network)?; + self.process_completed_batches(network) + } + Err(e) => { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + Ok(ProcessResult::Successful) + } + } + } + Err(err) => { + debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); + + // If there are any coupling errors, penalize the appropriate peers + if let RpcResponseError::BlockComponentCouplingError(coupling_error) = err + && let CouplingError::DataColumnPeerFailure { + error, + faulty_peers, + exceeded_retries: _, + } = coupling_error + { + for (column_index, faulty_peer) in faulty_peers { + debug!( + ?error, + ?column_index, + ?faulty_peer, + "Custody backfill sync penalizing peer" + ); + network.report_peer( + faulty_peer, + PeerAction::LowToleranceError, + "Peer failed to serve column", + ); + } + } + + match batch.download_failed(Some(*peer_id)) { + Err(e) => { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + 
self.fail_sync(CustodyBackfillError::BatchDownloadFailed(batch_id))?; + } + Ok(BatchOperationOutcome::Continue) => { + self.send_batch(network, batch_id)?; + } + } + Ok(ProcessResult::Successful) + } + } + } + + /// The beacon processor has completed processing a batch. This function handles the result + /// of the batch processor. + /// If an error is returned custody backfill sync has failed. + #[must_use = "A failure here indicates custody backfill sync has failed and the global sync state should be updated"] + pub fn on_batch_process_result( + &mut self, + network: &mut SyncNetworkContext, + custody_batch_id: CustodyBackfillBatchId, + result: &CustodyBatchProcessResult, + ) -> Result { + let batch_id = custody_batch_id.epoch; + if custody_batch_id.run_id != self.run_id { + debug!(batch = %custody_batch_id, "Ignoring custody backfill error from different run_id"); + return Ok(ProcessResult::Successful); + } + + // The first two cases are possible in regular sync, should not occur in custody backfill, but we + // keep this logic for handling potential processing race conditions. + // result + let batch = match &self.current_processing_batch { + Some(processing_id) if *processing_id != batch_id => { + debug!( + batch_epoch = %batch_id, + expected_batch_epoch = processing_id.as_u64(), + "Unexpected batch result" + ); + return Ok(ProcessResult::Successful); + } + None => { + debug!(%batch_id, "Chain was not expecting a batch result"); + return Ok(ProcessResult::Successful); + } + _ => { + // batch_id matches, continue + self.current_processing_batch = None; + + match self.batches.get_mut(&batch_id) { + Some(batch) => batch, + None => { + // This is an error. Fail the sync algorithm. 
+ return self + .fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Current processing batch not found: {}", + batch_id + ))) + .map(|_| ProcessResult::Successful); + } + } + } + }; + + let Some(peer) = batch.processing_peer() else { + self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, + String::from("Peer does not exist"), + ))?; + return Ok(ProcessResult::Successful); + }; + + debug!( + ?result, + batch_id = %custody_batch_id, + %peer, + client = %network.client_type(peer), + "Custody backfill batch processed" + ); + + match result { + CustodyBatchProcessResult::Success { + imported_columns, .. + } => { + if let Err(e) = batch.processing_completed(BatchProcessingResult::Success) { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + + debug!(imported_count=?imported_columns, "Succesfully imported historical data columns"); + + self.advance_custody_backfill_sync(batch_id); + + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return Err(CustodyBackfillError::InvalidSyncState( + "Can't calculate column data availability boundary".to_string(), + )); + }; + + if batch_id == self.processing_target { + // Advance processing target to the previous epoch + // If the current processing target is above the column DA boundary + if self.processing_target > column_da_boundary { + self.processing_target = self + .processing_target + .saturating_sub(CUSTODY_BACKFILL_EPOCHS_PER_BATCH); + } + } + + // check if custody sync has completed syncing up to the DA window + if self.check_completed() { + info!( + validated_epochs = ?self.validated_batches, + run_id = self.run_id, + "Custody backfill sync completed" + ); + self.batches.clear(); + self.restart_failed_sync = false; + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.validated_batches = 0; + 
self.skipped_batches.clear(); + self.set_state(CustodyBackFillState::Completed); + self.beacon_chain.update_data_column_custody_info(None); + Ok(ProcessResult::SyncCompleted) + } else { + // custody sync is not completed + // attempt to request more batches + self.request_batches(network)?; + // attempt to process more batches + self.process_completed_batches(network) + } + } + CustodyBatchProcessResult::Error { peer_action } => { + match peer_action { + // Faulty failure + Some(peer_action) => { + match batch.processing_completed(BatchProcessingResult::FaultyFailure) { + Err(e) => { + // Batch was in the wrong state + self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, e.0, + )) + .map(|_| ProcessResult::Successful) + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + warn!( + score_adjustment = ?peer_action, + batch_epoch = %batch_id, + "Custody backfill batch failed to download. Penalizing peers" + ); + self.fail_sync(CustodyBackfillError::BatchProcessingFailed( + batch_id, + )) + .map(|_| ProcessResult::Successful) + } + + Ok(BatchOperationOutcome::Continue) => { + self.advance_custody_backfill_sync(batch_id); + // Handle this invalid batch, that is within the re-process retries limit. + self.handle_invalid_batch(network, batch_id) + .map(|_| ProcessResult::Successful) + } + } + } + // Non faulty failure + None => { + if let Err(e) = + batch.processing_completed(BatchProcessingResult::NonFaultyFailure) + { + self.fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0))?; + } + self.send_batch(network, batch_id)?; + Ok(ProcessResult::Successful) + } + } + } + } + } + + /// Processes the next ready batch. 
+ fn process_completed_batches( + &mut self, + network: &mut SyncNetworkContext, + ) -> Result { + // Only process batches if custody backfill is syncing and only process one batch at a time + if self.state() != CustodyBackFillState::Syncing || self.current_processing_batch.is_some() + { + return Ok(ProcessResult::Successful); + } + + // Don't try to process batches before the Fulu fork epoch since data columns don't exist + if let Some(fulu_fork_epoch) = self.beacon_chain.spec.fulu_fork_epoch + && self.processing_target < fulu_fork_epoch + { + return Ok(ProcessResult::Successful); + } + + // Check if we need to restart custody backfill sync due to a cgc change. + if self.restart_if_required() { + return Ok(ProcessResult::Successful); + } + + while self.skipped_batches.contains(&self.processing_target) { + self.skipped_batches.remove(&self.processing_target); + // Update data column custody info with the skipped batch + if let Err(e) = self + .beacon_chain + .safely_backfill_data_column_custody_info(self.processing_target) + { + // I can't see a scenario where this could happen, but if we don't + // handle this edge case custody backfill sync could be stuck indefinitely. + error!( + error=?e, + "Unable to update data column custody info, restarting sync" + ); + self.restart_sync(); + }; + self.processing_target -= BACKFILL_EPOCHS_PER_BATCH; + } + + // Find the id of the batch we are going to process. + if let Some(batch) = self.batches.get(&self.processing_target) { + let state = batch.state(); + match state { + BatchState::AwaitingProcessing(..) => { + return self.process_batch(network, self.processing_target); + } + BatchState::Downloading(..) => { + // Batch is not ready, nothing to process + } + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(ProcessResult::Successful), + BatchState::AwaitingValidation(..) 
=> { + // The batch is validated + } + BatchState::Poisoned => unreachable!("Poisoned batch"), + BatchState::Failed | BatchState::Processing(_) => { + // these are all inconsistent states: + // - Failed -> non recoverable batch. Columns should have been removed + // - AwaitingDownload -> A recoverable failed batch should have been + // re-requested. + // - Processing -> `self.current_processing_batch` is None + self.fail_sync(CustodyBackfillError::InvalidSyncState(String::from( + "Invalid expected batch state", + )))?; + return Ok(ProcessResult::Successful); + } + } + } else { + self.fail_sync(CustodyBackfillError::InvalidSyncState(format!( + "Batch not found for current processing target {}", + self.processing_target + )))?; + return Ok(ProcessResult::Successful); + } + Ok(ProcessResult::Successful) + } + + /// Removes any batches previous to the given `validating_epoch` and advance custody backfill sync + /// to `validating_epoch`. + /// + /// The `validating_epoch` must align with batch boundaries. + fn advance_custody_backfill_sync(&mut self, validating_epoch: Epoch) { + let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else { + return; + }; + // make sure this epoch produces an advancement, unless its at the column DA boundary + if validating_epoch >= self.current_start && validating_epoch > column_da_boundary { + return; + } + + // We can now validate higher batches than the current batch. Here we remove all + // batches that are higher than the current batch. We add on an extra + // `BACKFILL_EPOCHS_PER_BATCH` as `split_off` is inclusive. + let removed_batches = self + .batches + .split_off(&(validating_epoch + CUSTODY_BACKFILL_EPOCHS_PER_BATCH)); + + for (id, batch) in removed_batches.into_iter() { + self.validated_batches = self.validated_batches.saturating_add(1); + match batch.state() { + BatchState::Downloading(..) | BatchState::AwaitingValidation(..) 
=> {} + BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { + crit!("Batch indicates inconsistent data columns while advancing custody sync") + } + BatchState::AwaitingProcessing(..) => {} + BatchState::Processing(_) => { + debug!(batch = %id, %batch, "Advancing custody sync while processing a batch"); + if let Some(processing_id) = self.current_processing_batch + && id >= processing_id + { + self.current_processing_batch = None; + } + } + } + } + + self.processing_target = self.processing_target.min(validating_epoch); + self.current_start = self.current_start.min(validating_epoch); + self.to_be_downloaded = self.to_be_downloaded.min(validating_epoch); + + if self.batches.contains_key(&self.to_be_downloaded) { + // if custody backfill sync is advanced by Range beyond the previous `self.to_be_downloaded`, we + // won't have this batch, so we need to request it. + self.to_be_downloaded -= CUSTODY_BACKFILL_EPOCHS_PER_BATCH; + } + debug!(?validating_epoch, processing_target = ?self.processing_target, "Custody backfill advanced"); + } + + /// An invalid batch has been received that could not be processed, but that can be retried. + /// + /// These events occur when a peer has successfully responded with columns, but the columns + /// received are incorrect or invalid. This indicates the peer has not performed as + /// intended and can result in down voting a peer. + fn handle_invalid_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), CustodyBackfillError> { + // The current batch could not be processed, indicating either the current or previous + // batches are invalid. + + // The previous batch could be incomplete due to the columns being too large to fit in + // a single RPC request or there could be consecutive empty batches which are not supposed + // to be there + + // The current (sub-optimal) strategy is to simply re-request all batches that could + // potentially be faulty. 
If a batch returns a different result than the original and
+        // results in successful processing, we downvote the original peer that sent us the batch.
+
+        // this is our robust `processing_target`. All previous batches must be awaiting
+        // validation
+        let mut redownload_queue = Vec::new();
+
+        for (id, _) in self.batches.iter_mut().filter(|&(&id, _)| id > batch_id) {
+            redownload_queue.push(*id);
+        }
+
+        // no batch maxed out its process attempts, so now the chain's volatile progress must be
+        // reset
+        self.processing_target = self.current_start;
+
+        for id in redownload_queue {
+            self.send_batch(network, id)?;
+        }
+        // finally, re-request the failed batch.
+        self.send_batch(network, batch_id)
+    }
+
+    /// Checks with the beacon chain if custody sync has completed.
+    fn check_completed(&mut self) -> bool {
+        if self.would_complete(self.current_start) {
+            // Check that the data column custody info `earliest_available_slot`
+            // is in an epoch that is less than or equal to the current DA boundary
+            let Some(earliest_data_column_epoch) =
+                self.beacon_chain.earliest_custodied_data_column_epoch()
+            else {
+                return false;
+            };
+
+            let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else {
+                return false;
+            };
+
+            return earliest_data_column_epoch <= column_da_boundary;
+        }
+        false
+    }
+
+    /// Checks if custody backfill would complete by syncing to `start_epoch`.
+    fn would_complete(&self, start_epoch: Epoch) -> bool {
+        let Some(column_da_boundary) = self.beacon_chain.get_column_da_boundary() else {
+            return false;
+        };
+        start_epoch <= column_da_boundary
+    }
+
+    /// Requests the batch assigned to the given id from a given peer.
+ fn send_batch( + &mut self, + network: &mut SyncNetworkContext, + batch_id: BatchId, + ) -> Result<(), CustodyBackfillError> { + let span = info_span!(SPAN_CUSTODY_BACKFILL_SYNC_BATCH_REQUEST); + let _enter = span.enter(); + + if let Some(batch) = self.batches.get_mut(&batch_id) { + let synced_peers = self + .network_globals + .peers + .read() + .synced_peers_for_epoch(batch_id) + .cloned() + .collect::>(); + + let request = batch.to_data_columns_by_range_request().map_err(|_| { + CustodyBackfillError::InvalidSyncState( + "Can't convert to data column by range request".to_string(), + ) + })?; + let failed_peers = batch.failed_peers(); + + match network.custody_backfill_data_columns_batch_request( + request, + CustodyBackfillBatchId { + epoch: batch_id, + run_id: self.run_id, + }, + &synced_peers, + &failed_peers, + ) { + Ok(request_id) => { + // inform the batch about the new request + if let Err(e) = batch.start_downloading(request_id.id) { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)); + } + debug!(epoch = %batch_id, %batch, "Requesting batch"); + + return Ok(()); + } + Err(e) => match e { + crate::sync::network_context::RpcRequestSendError::NoPeer(no_peer) => { + // If we are here we have no more synced peers + debug!( + "reason" = format!("insufficient_synced_peers({no_peer:?})"), + "Custody sync paused" + ); + self.pause("Insufficient peers".to_string()); + return Err(CustodyBackfillError::Paused); + } + crate::sync::network_context::RpcRequestSendError::InternalError(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); + // register the failed download and check if the batch can be retried + if let Err(e) = batch.start_downloading(1) { + return self + .fail_sync(CustodyBackfillError::BatchInvalidState(batch_id, e.0)); + } + + match batch.download_failed(None) { + Err(e) => 
self.fail_sync(CustodyBackfillError::BatchInvalidState( + batch_id, e.0, + ))?, + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + self.fail_sync(CustodyBackfillError::BatchDownloadFailed(batch_id))? + } + Ok(BatchOperationOutcome::Continue) => { + return self.send_batch(network, batch_id); + } + } + } + }, + } + } + + Ok(()) + } + + /// The syncing process has failed. + /// + /// This resets past variables, to allow for a fresh start when resuming. + fn fail_sync(&mut self, error: CustodyBackfillError) -> Result<(), CustodyBackfillError> { + // Some errors shouldn't cause failure. + if matches!(error, CustodyBackfillError::Paused) { + return Ok(()); + } + + // Set the state + self.pause("Sync has failed".to_string()); + // Remove all batches and active requests. + self.batches.clear(); + self.restart_failed_sync = false; + + // Reset all downloading and processing targets + // NOTE: Lets keep validated_batches for posterity + self.processing_target = self.current_start; + self.to_be_downloaded = self.current_start; + self.last_batch_downloaded = false; + self.current_processing_batch = None; + self.restart_sync(); + + Err(error) + } + + pub fn state(&self) -> CustodyBackFillState { + self.network_globals.custody_sync_state.read().clone() + } + + /// Updates the global network state indicating the current state of a backfill sync. + pub fn set_state(&self, state: CustodyBackFillState) { + *self.network_globals.custody_sync_state.write() = state; + } + + /// A fully synced peer has joined us. + /// If we are in a failed state, update a local variable to indicate we are able to restart + /// the failed sync on the next attempt. 
+ pub fn fully_synced_peer_joined(&mut self) { + if matches!(self.state(), CustodyBackFillState::Pending(_)) { + self.restart_failed_sync = true; + } + } +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index d7ba028054..338f21ce98 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -46,7 +46,8 @@ use crate::status::ToStatusMessage; use crate::sync::block_lookups::{ BlobRequestState, BlockComponent, BlockRequestState, CustodyRequestState, DownloadResult, }; -use crate::sync::network_context::PeerGroup; +use crate::sync::custody_backfill_sync::CustodyBackFillSync; +use crate::sync::network_context::{PeerGroup, RpcResponseResult}; use beacon_chain::block_verification_types::AsBlock; use beacon_chain::validator_monitor::timestamp_now; use beacon_chain::{ @@ -56,14 +57,16 @@ use futures::StreamExt; use lighthouse_network::SyncInfo; use lighthouse_network::rpc::RPCError; use lighthouse_network::service::api_types::{ - BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, CustodyRequester, - DataColumnsByRangeRequestId, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, - SingleLookupReqId, SyncRequestId, + BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, + CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyRequester, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId, + DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::types::{NetworkGlobals, SyncState}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; use lru_cache::LRUTimeCache; +use slot_clock::SlotClock; use std::ops::Sub; use std::sync::Arc; use std::time::Duration; @@ -158,6 +161,12 @@ pub enum SyncMessage { result: BatchProcessResult, }, + /// A custody batch has been processed by the processor thread. 
+ CustodyBatchProcessed { + batch_id: CustodyBackfillBatchId, + result: CustodyBatchProcessResult, + }, + /// Block processed BlockComponentProcessed { process_type: BlockProcessType, @@ -209,6 +218,19 @@ pub enum BatchProcessResult { NonFaultyFailure, } +/// The result of processing multiple data columns. +#[derive(Debug)] +pub enum CustodyBatchProcessResult { + /// The custody batch was completed successfully. It carries whether the sent batch contained data columns. + Success { + #[allow(dead_code)] + sent_columns: usize, + imported_columns: usize, + }, + /// The custody batch processing failed. + Error { peer_action: Option }, +} + /// The primary object for handling and driving all the current syncing logic. It maintains the /// current state of the syncing process, the number of useful peers, downloaded blocks and /// controls the logic behind both the long-range (batch) sync and the on-going potential parent @@ -229,6 +251,9 @@ pub struct SyncManager { /// Backfill syncing. backfill_sync: BackFillSync, + /// Custody syncing. + custody_backfill_sync: CustodyBackFillSync, + block_lookups: BlockLookups, /// debounce duplicated `UnknownBlockHashFromAttestation` for the same root peer tuple. A peer /// may forward us thousands of a attestations, each one triggering an individual event. Only @@ -288,7 +313,8 @@ impl SyncManager { fork_context.clone(), ), range_sync: RangeSync::new(beacon_chain.clone()), - backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals), + backfill_sync: BackFillSync::new(beacon_chain.clone(), network_globals.clone()), + custody_backfill_sync: CustodyBackFillSync::new(beacon_chain.clone(), network_globals), block_lookups: BlockLookups::new(), notified_unknown_roots: LRUTimeCache::new(Duration::from_secs( NOTIFIED_UNKNOWN_ROOT_EXPIRY_SECONDS, @@ -549,6 +575,7 @@ impl SyncManager { // inform the backfill sync that a new synced peer has joined us. 
if new_state.is_synced() { self.backfill_sync.fully_synced_peer_joined(); + self.custody_backfill_sync.fully_synced_peer_joined(); } } is_connected @@ -558,17 +585,18 @@ impl SyncManager { } } - /// Updates the global sync state, optionally instigating or pausing a backfill sync as well as + /// Updates the global sync state, optionally instigating or pausing a backfill or custody sync as well as /// logging any changes. /// /// The logic for which sync should be running is as follows: - /// - If there is a range-sync running (or required) pause any backfill and let range-sync + /// - If there is a range-sync running (or required) pause any backfill/custody sync and let range-sync /// complete. /// - If there is no current range sync, check for any requirement to backfill and either /// start/resume a backfill sync if required. The global state will be BackFillSync if a /// backfill sync is running. /// - If there is no range sync and no required backfill and we have synced up to the currently /// known peers, we consider ourselves synced. + /// - If there is no range sync and no required backfill we check if we need to execute a custody sync. fn update_sync_state(&mut self) { let new_state: SyncState = match self.range_sync.state() { Err(e) => { @@ -624,15 +652,51 @@ impl SyncManager { error!(error = ?e, "Backfill sync failed to start"); } } + + // If backfill is complete, check if we have a pending custody backfill to complete + let anchor_info = self.chain.store.get_anchor_info(); + if anchor_info.block_backfill_complete(self.chain.genesis_backfill_slot) { + match self.custody_backfill_sync.start(&mut self.network) { + Ok(SyncStart::Syncing { + completed, + remaining, + }) => { + sync_state = SyncState::CustodyBackFillSyncing { + completed, + remaining, + }; + } + Ok(SyncStart::NotSyncing) => {} // Ignore updating the state if custody sync state didn't start. 
+ Err(e) => { + use crate::sync::custody_backfill_sync::CustodyBackfillError; + + match &e { + CustodyBackfillError::BatchDownloadFailed(_) + | CustodyBackfillError::BatchProcessingFailed(_) => { + debug!(error=?e, "Custody backfill batch processing or downloading failed"); + } + CustodyBackfillError::BatchInvalidState(_, reason) => { + error!(error=?e, reason, "Custody backfill sync failed due to invalid batch state") + } + CustodyBackfillError::InvalidSyncState(reason) => { + error!(error=?e, reason, "Custody backfill sync failed due to invalid sync state") + } + CustodyBackfillError::Paused => {} + } + } + } + } } // Return the sync state if backfilling is not required. sync_state } Some((RangeSyncType::Finalized, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. + // Range sync is in progress. If there is a backfill or custody sync in progress pause it. #[cfg(not(feature = "disable-backfill"))] self.backfill_sync.pause(); + self.custody_backfill_sync + .pause("Range sync in progress".to_string()); SyncState::SyncingFinalized { start_slot, @@ -640,9 +704,12 @@ impl SyncManager { } } Some((RangeSyncType::Head, start_slot, target_slot)) => { - // If there is a backfill sync in progress pause it. + // Range sync is in progress. If there is a backfill or custody backfill sync + // in progress pause it. #[cfg(not(feature = "disable-backfill"))] self.backfill_sync.pause(); + self.custody_backfill_sync + .pause("Range sync in progress".to_string()); SyncState::SyncingHead { start_slot, @@ -662,7 +729,9 @@ impl SyncManager { if new_state.is_synced() && !matches!( old_state, - SyncState::Synced | SyncState::BackFillSyncing { .. } + SyncState::Synced + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. 
} ) { self.network.subscribe_core_topics(); @@ -693,6 +762,11 @@ impl SyncManager { let mut register_metrics_interval = tokio::time::interval(Duration::from_secs(5)); + // Trigger a sync state update every epoch. This helps check if we need to trigger a custody backfill sync. + let epoch_duration = + self.chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch(); + let mut epoch_interval = tokio::time::interval(Duration::from_secs(epoch_duration)); + // process any inbound messages loop { tokio::select! { @@ -711,6 +785,9 @@ impl SyncManager { _ = register_metrics_interval.tick() => { self.network.register_metrics(); } + _ = epoch_interval.tick() => { + self.update_sync_state(); + } } } } @@ -865,6 +942,21 @@ impl SyncManager { } } }, + SyncMessage::CustodyBatchProcessed { result, batch_id } => { + match self.custody_backfill_sync.on_batch_process_result( + &mut self.network, + batch_id, + &result, + ) { + Ok(ProcessResult::Successful) => {} + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Err(error) => { + error!(error = ?error, "Custody sync failed"); + // Update the global status + self.update_sync_state(); + } + } + } } } @@ -1081,11 +1173,13 @@ impl SyncManager { RpcEvent::from_chunk(data_column, seen_timestamp), ); } - SyncRequestId::DataColumnsByRange(id) => self.on_data_columns_by_range_response( - id, - peer_id, - RpcEvent::from_chunk(data_column, seen_timestamp), - ), + SyncRequestId::DataColumnsByRange(req_id) => { + self.on_data_columns_by_range_response( + req_id, + peer_id, + RpcEvent::from_chunk(data_column, seen_timestamp), + ); + } _ => { crit!(%peer_id, "bad request id for data_column"); } @@ -1173,11 +1267,22 @@ impl SyncManager { .network .on_data_columns_by_range_response(id, peer_id, data_column) { - self.on_range_components_response( - id.parent_request_id, - peer_id, - RangeBlockComponent::CustodyColumns(id, resp), - ); + match id.parent_request_id { + 
DataColumnsByRangeRequester::ComponentsByRange(components_by_range_req_id) => { + self.on_range_components_response( + components_by_range_req_id, + peer_id, + RangeBlockComponent::CustodyColumns(id, resp), + ); + } + DataColumnsByRangeRequester::CustodyBackfillSync(custody_backfill_req_id) => self + .on_custody_backfill_columns_response( + custody_backfill_req_id, + id, + peer_id, + resp, + ), + } } } @@ -1267,6 +1372,36 @@ impl SyncManager { } } } + + /// Handles receiving a response for a custody range sync request that has columns. + fn on_custody_backfill_columns_response( + &mut self, + custody_sync_request_id: CustodyBackFillBatchRequestId, + req_id: DataColumnsByRangeRequestId, + peer_id: PeerId, + data_columns: RpcResponseResult>>>, + ) { + if let Some(resp) = self.network.custody_backfill_data_columns_response( + custody_sync_request_id, + req_id, + data_columns, + ) { + match self.custody_backfill_sync.on_data_column_response( + &mut self.network, + custody_sync_request_id, + &peer_id, + resp, + ) { + Ok(ProcessResult::SyncCompleted) => self.update_sync_state(), + Ok(ProcessResult::Successful) => {} + Err(_e) => { + // The custody sync has failed, errors are reported + // within. + self.update_sync_state(); + } + } + } + } } impl From> for BlockProcessingResult { diff --git a/beacon_node/network/src/sync/mod.rs b/beacon_node/network/src/sync/mod.rs index 4dab2e17d3..054bab654c 100644 --- a/beacon_node/network/src/sync/mod.rs +++ b/beacon_node/network/src/sync/mod.rs @@ -2,14 +2,17 @@ //! //! Stores the various syncing methods for the beacon chain. 
mod backfill_sync; +mod batch; mod block_lookups; mod block_sidecar_coupling; +mod custody_backfill_sync; pub mod manager; mod network_context; mod peer_sync_info; +mod range_data_column_batch_request; mod range_sync; #[cfg(test)] mod tests; pub use manager::{BatchProcessResult, SyncMessage}; -pub use range_sync::{BatchOperationOutcome, ChainId}; +pub use range_sync::ChainId; diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 1d119cb2de..2e0c56db23 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -6,16 +6,17 @@ pub use self::requests::{BlocksByRootSingleRequest, DataColumnsByRootSingleBlock use super::SyncMessage; use super::block_sidecar_coupling::RangeBlockComponentsRequest; use super::manager::BlockProcessType; -use super::range_sync::ByRangeRequestType; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; #[cfg(test)] use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; +use crate::sync::batch::ByRangeRequestType; use crate::sync::block_lookups::SingleLookupId; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::requests::BlobsByRootSingleBlockRequest; +use crate::sync::range_data_column_batch_request::RangeDataColumnBatchRequest; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes, BlockProcessStatus, EngineState}; use custody::CustodyRequestResult; @@ -25,7 +26,8 @@ use lighthouse_network::rpc::{BlocksByRangeRequest, GoodbyeReason, RPCError, Req pub use lighthouse_network::service::api_types::RangeRequestId; use lighthouse_network::service::api_types::{ AppRequestId, BlobsByRangeRequestId, BlocksByRangeRequestId, ComponentsByRangeRequestId, - CustodyId, CustodyRequester, DataColumnsByRangeRequestId, DataColumnsByRootRequestId, + 
CustodyBackFillBatchRequestId, CustodyBackfillBatchId, CustodyId, CustodyRequester, + DataColumnsByRangeRequestId, DataColumnsByRangeRequester, DataColumnsByRootRequestId, DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; @@ -211,7 +213,6 @@ pub struct SyncNetworkContext { /// A mapping of active DataColumnsByRange requests data_columns_by_range_requests: ActiveRequests>, - /// Mapping of active custody column requests for a block root custody_by_root_requests: FnvHashMap>, @@ -219,6 +220,10 @@ pub struct SyncNetworkContext { components_by_range_requests: FnvHashMap>, + /// A batch of data columns by range request for custody sync + custody_backfill_data_column_batch_requests: + FnvHashMap>, + /// Whether the ee is online. If it's not, we don't allow access to the /// `beacon_processor_send`. execution_engine_state: EngineState, @@ -295,6 +300,7 @@ impl SyncNetworkContext { data_columns_by_range_requests: ActiveRequests::new("data_columns_by_range"), custody_by_root_requests: <_>::default(), components_by_range_requests: FnvHashMap::default(), + custody_backfill_data_column_batch_requests: FnvHashMap::default(), network_beacon_processor, chain, fork_context, @@ -324,6 +330,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: _, + custody_backfill_data_column_batch_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -354,7 +361,6 @@ impl SyncNetworkContext { .active_requests_of_peer(peer_id) .into_iter() .map(|req_id| SyncRequestId::DataColumnsByRange(*req_id)); - blocks_by_root_ids .chain(blobs_by_root_ids) .chain(data_column_by_root_ids) @@ -421,6 +427,7 @@ impl SyncNetworkContext { custody_by_root_requests: _, // components_by_range_requests is a meta request of various _by_range requests components_by_range_requests: 
_, + custody_backfill_data_column_batch_requests: _, execution_engine_state: _, network_beacon_processor: _, chain: _, @@ -503,7 +510,7 @@ impl SyncNetworkContext { count: *request.count(), columns, }, - id, + DataColumnsByRangeRequester::ComponentsByRange(id), new_range_request_span!( self, "outgoing_columns_by_range_retry", @@ -638,7 +645,7 @@ impl SyncNetworkContext { count: *request.count(), columns, }, - id, + DataColumnsByRangeRequester::ComponentsByRange(id), new_range_request_span!( self, "outgoing_columns_by_range", @@ -1238,7 +1245,7 @@ impl SyncNetworkContext { &mut self, peer_id: PeerId, request: DataColumnsByRangeRequest, - parent_request_id: ComponentsByRangeRequestId, + parent_request_id: DataColumnsByRangeRequester, request_span: Span, ) -> Result<(DataColumnsByRangeRequestId, Vec), RpcRequestSendError> { let requested_columns = request.columns.clone(); @@ -1679,6 +1686,111 @@ impl SyncNetworkContext { }) } + /// data column by range requests sent by the custody sync algorithm + pub fn custody_backfill_data_columns_batch_request( + &mut self, + request: DataColumnsByRangeRequest, + batch_id: CustodyBackfillBatchId, + peers: &HashSet, + peers_to_deprioritize: &HashSet, + ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); + // Attempt to find all required custody peers before sending any request or creating an ID + let columns_by_range_peers_to_request = { + let column_indexes = self + .chain + .sampling_columns_for_epoch(batch_id.epoch) + .iter() + .cloned() + .collect(); + + self.select_columns_by_range_peers_to_request( + &column_indexes, + peers, + active_request_count_by_peer, + peers_to_deprioritize, + )? 
+ }; + + // Create the overall `custody_by_range` request id + let id = CustodyBackFillBatchRequestId { + id: self.next_id(), + batch_id, + }; + + let result = columns_by_range_peers_to_request + .iter() + .filter_map(|(peer_id, _)| { + self.send_data_columns_by_range_request( + *peer_id, + request.clone(), + DataColumnsByRangeRequester::CustodyBackfillSync(id), + Span::none(), + ) + .ok() + }) + .collect::>(); + + let range_data_column_batch_request = + RangeDataColumnBatchRequest::new(result, self.chain.clone(), batch_id.epoch); + + self.custody_backfill_data_column_batch_requests + .insert(id, range_data_column_batch_request); + + Ok(id) + } + + /// Received a data columns by range response from a custody sync request which batches them. + pub fn custody_backfill_data_columns_response( + &mut self, + // Identifies the custody backfill request for all data columns on this epoch + custody_sync_request_id: CustodyBackFillBatchRequestId, + // Identifies a specific data_columns_by_range request for *some* columns in this epoch. We + // pass them separately as DataColumnsByRangeRequestId parent is an enum and would require + // matching again. 
+ req_id: DataColumnsByRangeRequestId, + data_columns: RpcResponseResult>, + ) -> Option, RpcResponseError>> { + let Entry::Occupied(mut entry) = self + .custody_backfill_data_column_batch_requests + .entry(custody_sync_request_id) + else { + metrics::inc_counter_vec( + &metrics::SYNC_UNKNOWN_NETWORK_REQUESTS, + &["range_data_columns"], + ); + return None; + }; + + if let Err(e) = { + let request = entry.get_mut(); + data_columns.and_then(|(data_columns, _)| { + request + .add_custody_columns(req_id, data_columns.clone()) + .map_err(|e| { + RpcResponseError::BlockComponentCouplingError(CouplingError::InternalError( + e, + )) + }) + }) + } { + entry.remove(); + return Some(Err(e)); + } + + if let Some(data_column_result) = entry.get_mut().responses() { + if data_column_result.is_ok() { + // remove the entry only if it coupled successfully with + // no errors + entry.remove(); + } + // If the request is finished, dequeue everything + Some(data_column_result.map_err(RpcResponseError::BlockComponentCouplingError)) + } else { + None + } + } + pub(crate) fn register_metrics(&self) { for (id, count) in [ ("blocks_by_root", self.blocks_by_root_requests.len()), diff --git a/beacon_node/network/src/sync/range_data_column_batch_request.rs b/beacon_node/network/src/sync/range_data_column_batch_request.rs new file mode 100644 index 0000000000..542d99d97c --- /dev/null +++ b/beacon_node/network/src/sync/range_data_column_batch_request.rs @@ -0,0 +1,297 @@ +use std::collections::{HashMap, HashSet}; + +use crate::sync::block_sidecar_coupling::{ByRangeRequest, CouplingError}; +use crate::sync::network_context::MAX_COLUMN_RETRIES; +use beacon_chain::{BeaconChain, BeaconChainTypes}; +use itertools::Itertools; +use lighthouse_network::PeerId; +use lighthouse_network::service::api_types::DataColumnsByRangeRequestId; +use std::sync::Arc; +use types::{ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Slot}; + +pub struct RangeDataColumnBatchRequest { + requests: 
HashMap< + DataColumnsByRangeRequestId, + ByRangeRequest>, + >, + /// The column indices corresponding to the request + column_peers: HashMap>, + expected_custody_columns: HashSet, + attempt: usize, + beacon_chain: Arc>, + epoch: Epoch, +} + +impl RangeDataColumnBatchRequest { + pub fn new( + by_range_requests: Vec<(DataColumnsByRangeRequestId, Vec)>, + beacon_chain: Arc>, + epoch: Epoch, + ) -> Self { + let requests = by_range_requests + .clone() + .into_iter() + .map(|(req, _)| (req, ByRangeRequest::Active(req))) + .collect::>(); + + let column_peers = by_range_requests.clone().into_iter().collect(); + + let expected_custody_columns = by_range_requests + .into_iter() + .flat_map(|(_, column_indices)| column_indices) + .collect(); + + Self { + requests, + column_peers, + expected_custody_columns, + beacon_chain, + epoch, + attempt: 0, + } + } + + pub fn add_custody_columns( + &mut self, + req_id: DataColumnsByRangeRequestId, + columns: Vec>>, + ) -> Result<(), String> { + let req = self + .requests + .get_mut(&req_id) + .ok_or(format!("unknown data columns by range req_id {req_id}"))?; + req.finish(req_id, columns) + } + + pub fn responses( + &mut self, + ) -> Option, CouplingError>> { + let mut received_columns_for_slot: HashMap> = + HashMap::new(); + let mut column_to_peer_id: HashMap = HashMap::new(); + + for column in self + .requests + .values() + .filter_map(|req| req.to_finished()) + .flatten() + { + received_columns_for_slot + .entry(column.slot()) + .or_default() + .push(column.clone()); + } + + // Note: this assumes that only 1 peer is responsible for a column + // with a batch. + for (id, columns) in self.column_peers.iter() { + for column in columns { + column_to_peer_id.insert(*column, id.peer); + } + } + + // An "attempt" is complete here after we have received a response for all the + // requests we made. i.e. `req.to_finished()` returns Some for all requests. 
+ self.attempt += 1; + + let resp = self.responses_with_custody_columns( + received_columns_for_slot, + column_to_peer_id, + &self.expected_custody_columns, + self.attempt, + ); + + if let Err(CouplingError::DataColumnPeerFailure { + error: _, + faulty_peers, + exceeded_retries: _, + }) = &resp + { + for (_, peer) in faulty_peers.iter() { + // find the req id associated with the peer and + // delete it from the entries as we are going to make + // a separate attempt for those components. + self.requests.retain(|&k, _| k.peer != *peer); + } + } + Some(resp) + } + + fn responses_with_custody_columns( + &self, + mut received_columns_for_slot: HashMap>, + column_to_peer: HashMap, + expected_custody_columns: &HashSet, + attempt: usize, + ) -> Result, CouplingError> { + let mut naughty_peers = vec![]; + let mut result: DataColumnSidecarList = vec![]; + + let forward_blocks_iter = self + .beacon_chain + .forwards_iter_block_roots_until( + self.epoch.start_slot(T::EthSpec::slots_per_epoch()), + self.epoch.end_slot(T::EthSpec::slots_per_epoch()), + ) + .map_err(|_| { + CouplingError::InternalError("Failed to fetch block root iterator".to_string()) + })?; + + for block_iter_result in forward_blocks_iter { + let (block_root, slot) = block_iter_result.map_err(|_| { + CouplingError::InternalError("Failed to iterate block roots".to_string()) + })?; + + let Some(block) = self + .beacon_chain + .get_blinded_block(&block_root) + .ok() + .flatten() + else { + // The block root we are fetching is from the forwards block root iterator. This doesn't seem like a possible scenario. + return Err(CouplingError::InternalError( + "Block root from forwards block iterator not found in db".to_string(), + )); + }; + + let Some(columns) = received_columns_for_slot.remove(&slot) else { + // If at least one blob is expected for this slot but none have been served, penalize all peers + // The slot check ensures we arent checking a skipped slot. 
+ if block.num_expected_blobs() != 0 && block.slot() == slot { + for column in expected_custody_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + } + } + } + continue; + }; + + // This is a skipped slot, skip to the next slot after we verify that peers + // didn't serve us columns for a skipped slot + if block.slot() != slot { + // If we received columns for a skipped slot, punish the peer + if !columns.is_empty() { + for column in expected_custody_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + } + } + } + + continue; + } + + let column_block_roots = columns + .iter() + .map(|column| column.block_root()) + .unique() + .collect::>(); + + let column_block_signatures = columns + .iter() + .map(|column| column.signed_block_header.signature.clone()) + .unique() + .collect::>(); + + let column_block_root = match column_block_roots.as_slice() { + // We expect a single unique block root + [column_block_root] => *column_block_root, + // If there are no block roots, penalize all peers + [] => { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + // If theres more than one unique block root penalize the peers serving the bad block roots. 
+ column_block_roots => { + for column in columns { + if column_block_roots.contains(&column.block_root()) + && block_root != column.block_root() + && let Some(naughty_peer) = column_to_peer.get(&column.index) + { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + }; + + let column_block_signature = match column_block_signatures.as_slice() { + // We expect a single unique block signature + [block_signature] => block_signature, + // If there are no block signatures, penalize all peers + [] => { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + // If theres more than one unique block signature, penalize the peers serving the + // invalid block signatures. + column_block_signatures => { + for column in columns { + if column_block_signatures.contains(&column.signed_block_header.signature) + && block.signature() != &column.signed_block_header.signature + && let Some(naughty_peer) = column_to_peer.get(&column.index) + { + naughty_peers.push((column.index, *naughty_peer)); + } + } + continue; + } + }; + + // if the block root doesn't match the columns block root, penalize the peers + if block_root != column_block_root { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + } + + // If the block signature doesn't match the columns block signature, penalize the peers + if block.signature() != column_block_signature { + for column in &columns { + if let Some(naughty_peer) = column_to_peer.get(&column.index) { + naughty_peers.push((column.index, *naughty_peer)); + } + } + } + + let received_columns = columns.iter().map(|c| c.index).collect::>(); + + let missing_columns = received_columns + .difference(expected_custody_columns) + .collect::>(); + + // blobs are expected for this slot but there is at least one missing columns + // 
penalize the peers responsible for those columns. + if block.num_expected_blobs() != 0 && !missing_columns.is_empty() { + for column in missing_columns { + if let Some(naughty_peer) = column_to_peer.get(column) { + naughty_peers.push((*column, *naughty_peer)); + }; + } + } + + result.extend(columns); + } + + if !naughty_peers.is_empty() { + return Err(CouplingError::DataColumnPeerFailure { + error: "Bad or missing columns for some slots".to_string(), + faulty_peers: naughty_peers, + exceeded_retries: attempt >= MAX_COLUMN_RETRIES, + }); + } + + Ok(result) + } +} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index ab5b8bee5e..014d728ffe 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,10 +1,13 @@ use super::RangeSyncType; -use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; +use crate::sync::batch::BatchId; +use crate::sync::batch::{ + BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, +}; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; -use crate::sync::{BatchOperationOutcome, BatchProcessResult, network_context::SyncNetworkContext}; +use crate::sync::{BatchProcessResult, network_context::SyncNetworkContext}; use beacon_chain::BeaconChainTypes; use beacon_chain::block_verification_types::RpcBlock; use lighthouse_network::service::api_types::Id; @@ -12,6 +15,8 @@ use lighthouse_network::{PeerAction, PeerId}; use lighthouse_tracing::SPAN_SYNCING_CHAIN; use logging::crit; use std::collections::{BTreeMap, HashSet, btree_map::Entry}; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; use strum::IntoStaticStr; use tracing::{Span, debug, instrument, warn}; use types::{ColumnIndex, Epoch, EthSpec, 
Hash256, Slot}; @@ -35,6 +40,35 @@ const BATCH_BUFFER_SIZE: u8 = 5; /// and continued is now in an inconsistent state. pub type ProcessingResult = Result; +type RpcBlocks = Vec>; +type RangeSyncBatchInfo = BatchInfo, RpcBlocks>; +type RangeSyncBatches = BTreeMap>; + +/// The number of times to retry a batch before it is considered failed. +const MAX_BATCH_DOWNLOAD_ATTEMPTS: u8 = 5; + +/// Invalid batches are attempted to be re-downloaded from other peers. If a batch cannot be processed +/// after `MAX_BATCH_PROCESSING_ATTEMPTS` times, it is considered faulty. +const MAX_BATCH_PROCESSING_ATTEMPTS: u8 = 3; + +pub struct RangeSyncBatchConfig { + marker: PhantomData, +} + +impl BatchConfig for RangeSyncBatchConfig { + fn max_batch_download_attempts() -> u8 { + MAX_BATCH_DOWNLOAD_ATTEMPTS + } + fn max_batch_processing_attempts() -> u8 { + MAX_BATCH_PROCESSING_ATTEMPTS + } + fn batch_attempt_hash(data: &D) -> u64 { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + data.hash(&mut hasher); + hasher.finish() + } +} + /// Reasons for removing a chain #[derive(Debug)] #[allow(dead_code)] @@ -55,7 +89,6 @@ pub struct KeepChain; /// A chain identifier pub type ChainId = Id; -pub type BatchId = Epoch; #[derive(Debug, Copy, Clone, IntoStaticStr)] pub enum SyncingChainType { @@ -85,7 +118,7 @@ pub struct SyncingChain { pub target_head_root: Hash256, /// Sorted map of batches undergoing some kind of processing. - batches: BTreeMap>, + batches: RangeSyncBatches, /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from, as well as the batches we are currently @@ -249,7 +282,7 @@ impl SyncingChain { // request_id matches // TODO(das): removed peer_id matching as the node may request a different peer for data // columns. 
- if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { return Ok(KeepChain); } batch @@ -260,7 +293,8 @@ impl SyncingChain { // Remove the request from the peer's active batches // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 - let received = batch.download_completed(blocks, *peer_id)?; + let received = blocks.len(); + batch.download_completed(blocks, *peer_id)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -918,7 +952,7 @@ impl SyncingChain { // A batch could be retried without the peer failing the request (disconnecting/ // sending an error /timeout) if the peer is removed from the chain for other // reasons. Check that this block belongs to the expected peer - if !batch.is_expecting_block(&request_id) { + if !batch.is_expecting_request_id(&request_id) { debug!( batch_epoch = %batch_id, batch_state = ?batch.state(), @@ -1233,7 +1267,7 @@ impl SyncingChain { // only request batches up to the buffer size limit // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync // if the current processing window is contained in a long range of skip slots. - let in_buffer = |batch: &BatchInfo| { + let in_buffer = |batch: &RangeSyncBatchInfo| { matches!( batch.state(), BatchState::Downloading(..) | BatchState::AwaitingProcessing(..) @@ -1320,7 +1354,7 @@ impl SyncingChain { } } -use super::batch::WrongState as WrongBatchState; +use crate::sync::batch::WrongState as WrongBatchState; impl From for RemoveChain { fn from(err: WrongBatchState) -> Self { RemoveChain::WrongBatchState(err.0) diff --git a/beacon_node/network/src/sync/range_sync/mod.rs b/beacon_node/network/src/sync/range_sync/mod.rs index 8f881fba90..dd9f17bfd1 100644 --- a/beacon_node/network/src/sync/range_sync/mod.rs +++ b/beacon_node/network/src/sync/range_sync/mod.rs @@ -1,17 +1,11 @@ //! 
This provides the logic for syncing a chain when the local node is far behind it's current //! peers. - -mod batch; mod chain; mod chain_collection; mod range; mod sync_type; -pub use batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, -}; -pub use chain::{BatchId, ChainId, EPOCHS_PER_BATCH}; +pub use chain::{ChainId, EPOCHS_PER_BATCH}; #[cfg(test)] pub use chain_collection::SyncChainStatus; pub use range::RangeSync; diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index 465edd3697..c9656ad1d0 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -39,12 +39,13 @@ //! Each chain is downloaded in batches of blocks. The batched blocks are processed sequentially //! and further batches are requested as current blocks are being processed. -use super::chain::{BatchId, ChainId, RemoveChain, SyncingChain}; +use super::chain::{ChainId, RemoveChain, SyncingChain}; use super::chain_collection::{ChainCollection, SyncChainStatus}; use super::sync_type::RangeSyncType; use crate::metrics; use crate::status::ToStatusMessage; use crate::sync::BatchProcessResult; +use crate::sync::batch::BatchId; use crate::sync::network_context::{RpcResponseError, SyncNetworkContext}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::{BeaconChain, BeaconChainTypes}; diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index d58cf2e731..895afa4f33 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -949,6 +949,19 @@ impl, Cold: ItemStore> HotColdDB )); } + pub fn data_column_as_kv_store_ops( + &self, + block_root: &Hash256, + data_column: Arc>, + ops: &mut Vec, + ) { + ops.push(KeyValueStoreOp::PutKeyValue( + DBColumn::BeaconDataColumn, + get_data_column_key(block_root, &data_column.index), + 
data_column.as_ssz_bytes(), + )); + } + pub fn put_data_column_custody_info( + &self, + earliest_data_column_slot: Option, diff --git a/common/eth2/src/lighthouse/sync_state.rs b/common/eth2/src/lighthouse/sync_state.rs index 0327f7073f..9f6f3b52e0 100644 --- a/common/eth2/src/lighthouse/sync_state.rs +++ b/common/eth2/src/lighthouse/sync_state.rs @@ -15,6 +15,10 @@ pub enum SyncState { /// specified by its peers. Once completed, the node enters this sync state and attempts to /// download all required historical blocks. BackFillSyncing { completed: usize, remaining: usize }, + /// The node is undertaking a custody backfill sync. This occurs for a node that has completed forward and + /// backfill sync and has undergone a custody count change. During custody backfill sync the node attempts + /// to backfill its new column custody requirements up to the data availability window. + CustodyBackFillSyncing { completed: usize, remaining: usize }, /// The node has completed syncing a finalized chain and is in the process of re-evaluating /// which sync state to progress to. SyncTransition, @@ -39,6 +43,17 @@ pub enum BackFillState { Failed, } +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +/// The state of the custody backfill sync. +pub enum CustodyBackFillState { + /// We are currently backfilling custody columns. + Syncing, + /// A custody backfill sync has completed. + Completed, + /// A custody sync is set to Pending for various reasons. + Pending(String), +} + impl PartialEq for SyncState { fn eq(&self, other: &Self) -> bool { matches!( @@ -54,6 +69,10 @@ impl PartialEq for SyncState { SyncState::BackFillSyncing { .. }, SyncState::BackFillSyncing { .. } ) + | ( + SyncState::CustodyBackFillSyncing { .. }, + SyncState::CustodyBackFillSyncing { .. } + ) ) } } @@ -65,8 +84,8 @@ impl SyncState { SyncState::SyncingFinalized { .. } => true, SyncState::SyncingHead { ..
} => true, SyncState::SyncTransition => true, - // Backfill doesn't effect any logic, we consider this state, not syncing. - SyncState::BackFillSyncing { .. } => false, + // Both backfill and custody backfill don't affect any logic, we consider this state, not syncing. + SyncState::BackFillSyncing { .. } | SyncState::CustodyBackFillSyncing { .. } => false, SyncState::Synced => false, SyncState::Stalled => false, } } @@ -77,7 +96,7 @@ impl SyncState { SyncState::SyncingFinalized { .. } => true, SyncState::SyncingHead { .. } => false, SyncState::SyncTransition => false, - SyncState::BackFillSyncing { .. } => false, + SyncState::BackFillSyncing { .. } | SyncState::CustodyBackFillSyncing { .. } => false, SyncState::Synced => false, SyncState::Stalled => false, } } @@ -87,7 +106,12 @@ impl SyncState { /// /// NOTE: We consider the node synced if it is fetching old historical blocks. pub fn is_synced(&self) -> bool { - matches!(self, SyncState::Synced | SyncState::BackFillSyncing { .. }) + matches!( + self, + SyncState::Synced + | SyncState::BackFillSyncing { .. } + | SyncState::CustodyBackFillSyncing { .. } + ) } /// Returns true if the node is *stalled*, i.e. has no synced peers. @@ -108,6 +132,9 @@ impl std::fmt::Display for SyncState { SyncState::Stalled => write!(f, "Stalled"), SyncState::SyncTransition => write!(f, "Evaluating known peers"), SyncState::BackFillSyncing { .. } => write!(f, "Syncing Historical Blocks"), + SyncState::CustodyBackFillSyncing { ..
} => { + write!(f, "Syncing Historical Data Columns") + } } } } diff --git a/consensus/types/src/slot_epoch.rs b/consensus/types/src/slot_epoch.rs index 857044f981..05af9c5232 100644 --- a/consensus/types/src/slot_epoch.rs +++ b/consensus/types/src/slot_epoch.rs @@ -33,6 +33,13 @@ pub struct Slot(#[serde(with = "serde_utils::quoted_u64")] u64); #[serde(transparent)] pub struct Epoch(#[serde(with = "serde_utils::quoted_u64")] u64); +impl Epoch { + /// Returns an iterator `(end..=start)` + pub fn range_inclusive_rev(start: Self, end: Self) -> impl Iterator { + (end.0..=start.0).rev().map(Epoch) + } +} + impl_common!(Slot); impl_common!(Epoch); diff --git a/scripts/tests/checkpoint-sync.sh b/scripts/tests/checkpoint-sync.sh index df03da042e..605dc504f5 100755 --- a/scripts/tests/checkpoint-sync.sh +++ b/scripts/tests/checkpoint-sync.sh @@ -102,7 +102,8 @@ node_completed["fullnode"]=false echo "Polling sync status until backfill reaches ${TARGET_BACKFILL_SLOTS} slots or timeout of ${TIMEOUT_MINS} mins" -while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do +# while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do +while [ "${node_completed[fullnode]}" = false ]; do current_time=$(date +%s) elapsed=$((current_time - start_time)) @@ -112,7 +113,8 @@ while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode fi # Poll each node that hasn't completed yet - for node in "supernode" "fullnode"; do + # for node in "supernode" "fullnode"; do + for node in "fullnode"; do if [ "${node_completed[$node]}" = false ]; then poll_node "$node" fi @@ -121,7 +123,7 @@ while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode sleep $POLL_INTERVAL_SECS done -echo "Sync test complete! Both supernode and fullnode have synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots." 
-echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds" +echo "Sync test complete! Fullnode has synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots." +# echo "Supernode time: $((node_complete_time[supernode] - start_time)) seconds" echo "Fullnode time: $((node_complete_time[fullnode] - start_time)) seconds" exit_and_dump_logs 0 \ No newline at end of file From 43c5e924d74003d9eab0ca7fdf58f1c8b5e5db7a Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 22 Oct 2025 16:23:17 +1100 Subject: [PATCH 16/16] Add `--semi-supernode` support (#8254) Addresses #8218 A simplified version of #8241 for the initial release. I've tried to minimise the logic change in this PR, although introducing the `NodeCustodyType` enum still result in quite a bit a of diff, but the actual logic change in `CustodyContext` is quite small. The main changes are in the `CustdoyContext` struct * ~~combining `validator_custody_count` and `current_is_supernode` fields into a single `custody_group_count_at_head` field. We persist the cgc of the initial cli values into the `custody_group_count_at_head` field and only allow for increase (same behaviour as before).~~ * I noticed the above approach caused a backward compatibility issue, I've [made a fix](https://github.com/sigp/lighthouse/pull/8254/commits/15569bc085657b2c9a0933a20c8323a525efd30b) and changed the approach slightly (which was actually what I had originally in mind): * when initialising, only override the `validator_custody_count` value if either flag `--supernode` or `--semi-supernode` is used; otherwise leave it as the existing default `0`. Most other logic remains unchanged. All existing validator custody unit tests are still all passing, and I've added additional tests to cover semi-supernode, and restoring `CustodyContext` from disk. 
Note: I've added a `WARN` if the user attempts to switch to a `--semi-supernode` or `--supernode` - this currently has no effect, but once @eserilev column backfill is merged, we should be able to support this quite easily. Things to test - [x] cgc in metadata / enr - [x] cgc in metrics - [x] subscribed subnets - [x] getBlobs endpoint Co-Authored-By: Jimmy Chen --- beacon_node/beacon_chain/src/beacon_chain.rs | 2 +- beacon_node/beacon_chain/src/builder.rs | 16 +- beacon_node/beacon_chain/src/chain_config.rs | 4 + ...alidator_custody.rs => custody_context.rs} | 312 +++++++++++++++--- .../src/data_availability_checker.rs | 3 +- .../overflow_lru_cache.rs | 3 +- beacon_node/beacon_chain/src/lib.rs | 4 +- .../beacon_chain/src/persisted_custody.rs | 2 +- .../src/schema_change/migration_schema_v26.rs | 2 +- beacon_node/beacon_chain/src/test_utils.rs | 11 +- .../beacon_chain/tests/block_verification.rs | 21 +- .../beacon_chain/tests/column_verification.rs | 8 +- beacon_node/beacon_chain/tests/store_tests.rs | 37 ++- beacon_node/client/src/builder.rs | 2 +- beacon_node/http_api/tests/tests.rs | 10 +- .../src/network_beacon_processor/tests.rs | 31 +- beacon_node/src/cli.rs | 12 + beacon_node/src/config.rs | 18 +- book/src/help_bn.md | 6 + lighthouse/tests/beacon_node.rs | 27 +- testing/ef_tests/src/cases/fork_choice.rs | 3 +- 21 files changed, 420 insertions(+), 114 deletions(-) rename beacon_node/beacon_chain/src/{validator_custody.rs => custody_context.rs} (72%) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 152de1a20b..3e02baf901 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -21,6 +21,7 @@ use crate::block_verification_types::{ }; pub use crate::canonical_head::CanonicalHead; use crate::chain_config::ChainConfig; +use crate::custody_context::CustodyContextSsz; use crate::data_availability_checker::{ Availability, AvailabilityCheckError, 
AvailableBlock, AvailableBlockData, DataAvailabilityChecker, DataColumnReconstructionResult, @@ -64,7 +65,6 @@ use crate::shuffling_cache::{BlockShufflingIds, ShufflingCache}; use crate::sync_committee_verification::{ Error as SyncCommitteeError, VerifiedSyncCommitteeMessage, VerifiedSyncContribution, }; -use crate::validator_custody::CustodyContextSsz; use crate::validator_monitor::{ HISTORIC_EPOCHS as VALIDATOR_MONITOR_HISTORIC_EPOCHS, ValidatorMonitor, get_slot_delay_ms, timestamp_now, diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 5564c7916f..750cde14ca 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -4,6 +4,7 @@ use crate::beacon_chain::{ BEACON_CHAIN_DB_KEY, CanonicalHead, LightClientProducerEvent, OP_POOL_DB_KEY, }; use crate::beacon_proposer_cache::BeaconProposerCache; +use crate::custody_context::NodeCustodyType; use crate::data_availability_checker::DataAvailabilityChecker; use crate::fork_choice_signal::ForkChoiceSignalTx; use crate::fork_revert::{reset_fork_choice_to_finalization, revert_to_fork_boundary}; @@ -100,7 +101,7 @@ pub struct BeaconChainBuilder { kzg: Arc, task_executor: Option, validator_monitor_config: Option, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, rng: Option>, } @@ -139,7 +140,7 @@ where kzg, task_executor: None, validator_monitor_config: None, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, rng: None, } } @@ -640,9 +641,9 @@ where self } - /// Sets whether to require and import all data columns when importing block. - pub fn import_all_data_columns(mut self, import_all_data_columns: bool) -> Self { - self.import_all_data_columns = import_all_data_columns; + /// Sets the node custody type for data column import. 
+ pub fn node_custody_type(mut self, node_custody_type: NodeCustodyType) -> Self { + self.node_custody_type = node_custody_type; self } @@ -935,10 +936,11 @@ where { Arc::new(CustodyContext::new_from_persisted_custody_context( custody, - self.import_all_data_columns, + self.node_custody_type, + &self.spec, )) } else { - Arc::new(CustodyContext::new(self.import_all_data_columns)) + Arc::new(CustodyContext::new(self.node_custody_type, &self.spec)) }; debug!(?custody_context, "Loading persisted custody context"); diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index a7defa9fa2..1f5abc4891 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -1,3 +1,4 @@ +use crate::custody_context::NodeCustodyType; pub use proto_array::{DisallowedReOrgOffsets, ReOrgThreshold}; use serde::{Deserialize, Serialize}; use std::str::FromStr; @@ -118,6 +119,8 @@ pub struct ChainConfig { pub invalid_block_roots: HashSet, /// Disable the getBlobs optimisation to fetch blobs from the EL mempool. pub disable_get_blobs: bool, + /// The node's custody type, determining how many data columns to custody and sample. 
+ pub node_custody_type: NodeCustodyType, } impl Default for ChainConfig { @@ -158,6 +161,7 @@ impl Default for ChainConfig { data_column_publishing_delay: None, invalid_block_roots: HashSet::new(), disable_get_blobs: false, + node_custody_type: NodeCustodyType::Fullnode, } } } diff --git a/beacon_node/beacon_chain/src/validator_custody.rs b/beacon_node/beacon_chain/src/custody_context.rs similarity index 72% rename from beacon_node/beacon_chain/src/validator_custody.rs rename to beacon_node/beacon_chain/src/custody_context.rs index ea1dfdaae0..7ec13a8b51 100644 --- a/beacon_node/beacon_chain/src/validator_custody.rs +++ b/beacon_node/beacon_chain/src/custody_context.rs @@ -1,4 +1,5 @@ use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; use ssz_derive::{Decode, Encode}; use std::marker::PhantomData; use std::sync::OnceLock; @@ -6,6 +7,7 @@ use std::{ collections::{BTreeMap, HashMap}, sync::atomic::{AtomicU64, Ordering}, }; +use tracing::warn; use types::data_column_custody_group::{CustodyIndex, compute_columns_for_custody_group}; use types::{ChainSpec, ColumnIndex, Epoch, EthSpec, Slot}; @@ -34,10 +36,32 @@ struct ValidatorRegistrations { /// that are then backfilled to epoch 10, the value at epoch 11 will be removed and epoch 10 /// will be added to the map instead. This should keep map size constrained to a maximum /// value of 128. + /// + /// If the node is started with a cgc override (i.e. supernode/semi-supernode flag), the cgc + /// value is inserted into this map on initialisation with epoch set to 0. For a semi-supernode, + /// this means the custody requirement can still be increased if validator custody exceeds + /// 64 columns. epoch_validator_custody_requirements: BTreeMap, } impl ValidatorRegistrations { + /// Initialise the validator registration with some default custody requirements.
+ /// + /// If a `cgc_override` value is specified, the cgc value is inserted into the registration map + /// and is equivalent to registering validator(s) with the same custody requirement. + fn new(cgc_override: Option) -> Self { + let mut registrations = ValidatorRegistrations { + validators: Default::default(), + epoch_validator_custody_requirements: Default::default(), + }; + if let Some(custody_count) = cgc_override { + registrations + .epoch_validator_custody_requirements + .insert(Epoch::new(0), custody_count); + } + registrations + } + /// Returns the validator custody requirement at the latest epoch. fn latest_validator_custody_requirement(&self) -> Option { self.epoch_validator_custody_requirements @@ -139,6 +163,51 @@ fn get_validators_custody_requirement(validator_custody_units: u64, spec: &Chain ) } +/// Indicates the different "modes" that a node can run based on the cli +/// parameters that are relevant for computing the custody count. +/// +/// The custody count is derived from 2 values: +/// 1. The number of validators attached to the node and the spec parameters +/// that attach custody weight to attached validators. +/// 2. The cli parameters that the current node is running with. +/// +/// We always persist the validator custody units to the db across restarts +/// such that we know the validator custody units at any given epoch in the past. +/// However, knowing the cli parameter at any given epoch is a pain to maintain +/// and unnecessary. +/// +/// Therefore, the custody count at any point in time is calculated as the max of +/// the validator custody at that time and the current cli params. +/// +/// Choosing the max ensures that we always have the minimum required columns and +/// we can adjust the `status.earliest_available_slot` value to indicate to our peers +/// the columns that we can guarantee to serve. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Default, Deserialize, Serialize)] +pub enum NodeCustodyType { + /// The node is running with cli parameters to indicate that it + /// wants to subscribe to all columns. + Supernode, + /// The node is running with cli parameters to indicate that it + /// wants to subscribe to the minimum number of columns to enable + /// reconstruction (50%) of the full blob data on demand. + SemiSupernode, + /// The node isn't running with any explicit cli parameters + /// or is running with cli parameters to indicate that it wants + /// to only subscribe to the minimal custody requirements. + #[default] + Fullnode, +} + +impl NodeCustodyType { + pub fn get_custody_count_override(&self, spec: &ChainSpec) -> Option { + match self { + Self::Fullnode => None, + Self::SemiSupernode => Some(spec.number_of_custody_groups / 2), + Self::Supernode => Some(spec.number_of_custody_groups), + } + } +} + /// Contains all the information the node requires to calculate the /// number of columns to be custodied when checking for DA. #[derive(Debug)] @@ -150,15 +219,6 @@ pub struct CustodyContext { /// we require for data availability check, and we use to advertise to our peers in the metadata /// and enr values. validator_custody_count: AtomicU64, - /// Is the node run as a supernode based on current cli parameters. - current_is_supernode: bool, - /// The persisted value for `is_supernode` based on the previous run of this node. - /// - /// Note: We require this value because if a user restarts the node with a higher cli custody - /// count value than in the previous run, then we should continue advertising the custody - /// count based on the old value than the new one since we haven't backfilled the required - /// columns.
- persisted_is_supernode: bool, /// Maintains all the validators that this node is connected to currently validator_registrations: RwLock, /// Stores an immutable, ordered list of all custody columns as determined by the node's NodeID @@ -171,26 +231,45 @@ impl CustodyContext { /// Create a new custody default custody context object when no persisted object /// exists. /// - /// The `is_supernode` value is based on current cli parameters. - pub fn new(is_supernode: bool) -> Self { + /// The `node_custody_type` value is based on current cli parameters. + pub fn new(node_custody_type: NodeCustodyType, spec: &ChainSpec) -> Self { + let cgc_override = node_custody_type.get_custody_count_override(spec); + // If there's no override, we initialise `validator_custody_count` to 0. This has been the + // existing behaviour and we maintain this for now to avoid a semantic schema change until + // a later release. Self { - validator_custody_count: AtomicU64::new(0), - current_is_supernode: is_supernode, - persisted_is_supernode: is_supernode, - validator_registrations: Default::default(), + validator_custody_count: AtomicU64::new(cgc_override.unwrap_or(0)), + validator_registrations: RwLock::new(ValidatorRegistrations::new(cgc_override)), all_custody_columns_ordered: OnceLock::new(), _phantom_data: PhantomData, } } + /// Restore the custody context from disk. + /// + /// * If NodeCustodyType::custody_count < validator_custody_at_head, it means the attached + /// validator stake has increased the node's CGC. We ignore the CLI input. + /// * If NodeCustodyType::custody_count > validator_custody_at_head, it means the user has + /// changed the node's custody type via either the --supernode or --semi-supernode flags, + /// and will require a resync until we implement column backfill for this scenario.
pub fn new_from_persisted_custody_context( ssz_context: CustodyContextSsz, - is_supernode: bool, + node_custody_type: NodeCustodyType, + spec: &ChainSpec, ) -> Self { + let cgc_override = node_custody_type.get_custody_count_override(spec); + if let Some(cgc_from_cli) = cgc_override + && cgc_from_cli > ssz_context.validator_custody_at_head + { + warn!( + info = "node will continue to run with the current custody count", + current_custody_count = ssz_context.validator_custody_at_head, + node_custody_type = ?node_custody_type, + "Changing node type is currently not supported without a resync and will have no effect", + ); + } CustodyContext { validator_custody_count: AtomicU64::new(ssz_context.validator_custody_at_head), - current_is_supernode: is_supernode, - persisted_is_supernode: ssz_context.persisted_is_supernode, validator_registrations: RwLock::new(ValidatorRegistrations { validators: Default::default(), epoch_validator_custody_requirements: ssz_context @@ -249,12 +328,11 @@ impl CustodyContext { return None; }; - let current_cgc = self.custody_group_count_at_head(spec); - let validator_custody_count_at_head = self.validator_custody_count.load(Ordering::Relaxed); + let current_cgc = self.validator_custody_count.load(Ordering::Relaxed); - if new_validator_custody != validator_custody_count_at_head { + if new_validator_custody != current_cgc { tracing::debug!( - old_count = validator_custody_count_at_head, + old_count = current_cgc, new_count = new_validator_custody, "Validator count at head updated" ); @@ -285,9 +363,6 @@ impl CustodyContext { /// Do NOT use this directly for data availability check, use `self.sampling_size` instead as /// CGC can change over epochs. 
pub fn custody_group_count_at_head(&self, spec: &ChainSpec) -> u64 { - if self.current_is_supernode { - return spec.number_of_custody_groups; - } let validator_custody_count_at_head = self.validator_custody_count.load(Ordering::Relaxed); // If there are no validators, return the minimum custody_requirement @@ -305,14 +380,10 @@ /// /// See also: [`Self::num_of_custody_groups_to_sample`]. pub fn custody_group_count_at_epoch(&self, epoch: Epoch, spec: &ChainSpec) -> u64 { - if self.current_is_supernode { - spec.number_of_custody_groups - } else { - self.validator_registrations - .read() - .custody_requirement_at_epoch(epoch) - .unwrap_or(spec.custody_requirement) - } + self.validator_registrations + .read() + .custody_requirement_at_epoch(epoch) + .unwrap_or(spec.custody_requirement) } /// Returns the count of custody groups this node must _sample_ for a block at `epoch` to import. @@ -406,6 +477,7 @@ pub struct CustodyCountChanged { #[derive(Debug, Encode, Decode, Clone)] pub struct CustodyContextSsz { pub validator_custody_at_head: u64, + /// DEPRECATED. This field is no longer in use and will be removed in a future release.
pub persisted_is_supernode: bool, pub epoch_validator_custody_requirements: Vec<(Epoch, u64)>, } @@ -414,7 +486,8 @@ impl From<&CustodyContext> for CustodyContextSsz { fn from(context: &CustodyContext) -> Self { CustodyContextSsz { validator_custody_at_head: context.validator_custody_count.load(Ordering::Relaxed), - persisted_is_supernode: context.persisted_is_supernode, + // This field is deprecated and has no effect + persisted_is_supernode: false, epoch_validator_custody_requirements: context .validator_registrations .read() @@ -438,8 +511,8 @@ mod tests { #[test] fn no_validators_supernode_default() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); assert_eq!( custody_context.custody_group_count_at_head(&spec), spec.number_of_custody_groups @@ -451,9 +524,23 @@ mod tests { } #[test] - fn no_validators_fullnode_default() { - let custody_context = CustodyContext::::new(false); + fn no_validators_semi_supernode_default() { let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::SemiSupernode, &spec); + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.number_of_custody_groups / 2 + ); + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(0), &spec), + spec.number_of_custody_groups / 2 + ); + } + + #[test] + fn no_validators_fullnode_default() { + let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); assert_eq!( custody_context.custody_group_count_at_head(&spec), spec.custody_requirement, @@ -467,8 +554,8 @@ mod tests { #[test] fn register_single_validator_should_update_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; 
let min_val_custody_requirement = spec.validator_custody_requirement; // One single node increases its balance over 3 epochs. @@ -491,8 +578,8 @@ mod tests { #[test] fn register_multiple_validators_should_update_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; let min_val_custody_requirement = spec.validator_custody_requirement; // Add 3 validators over 3 epochs. @@ -528,8 +615,8 @@ mod tests { #[test] fn register_validators_should_not_update_cgc_for_supernode() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); let bal_per_additional_group = spec.balance_per_additional_custody_group; // Add 3 validators over 3 epochs. @@ -566,8 +653,8 @@ mod tests { #[test] fn cgc_change_should_be_effective_to_sampling_after_delay() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = Slot::new(10); let current_epoch = current_slot.epoch(E::slots_per_epoch()); let default_sampling_size = @@ -597,8 +684,8 @@ mod tests { #[test] fn validator_dropped_after_no_registrations_within_expiry_should_not_reduce_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = Slot::new(10); let val_custody_units_1 = 10; let val_custody_units_2 = 5; @@ -639,8 +726,8 @@ mod tests { #[test] fn validator_dropped_after_no_registrations_within_expiry() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let current_slot = 
Slot::new(10); let val_custody_units_1 = 10; let val_custody_units_2 = 5; @@ -690,7 +777,7 @@ mod tests { #[test] fn should_init_ordered_data_columns_and_return_sampling_columns() { let spec = E::default_spec(); - let custody_context = CustodyContext::::new(false); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let sampling_size = custody_context.num_of_data_columns_to_sample(Epoch::new(0), &spec); // initialise ordered columns @@ -742,8 +829,8 @@ mod tests { #[test] fn custody_columns_for_epoch_no_validators_fullnode() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); custody_context @@ -758,8 +845,8 @@ mod tests { #[test] fn custody_columns_for_epoch_no_validators_supernode() { - let custody_context = CustodyContext::::new(true); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Supernode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); custody_context @@ -774,8 +861,8 @@ mod tests { #[test] fn custody_columns_for_epoch_with_validators_should_match_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); let val_custody_units = 10; @@ -800,8 +887,8 @@ mod tests { #[test] fn custody_columns_for_epoch_specific_epoch_uses_epoch_cgc() { - let custody_context = CustodyContext::::new(false); let spec = E::default_spec(); + let custody_context = CustodyContext::::new(NodeCustodyType::Fullnode, &spec); let all_custody_groups_ordered = (0..spec.number_of_custody_groups).collect::>(); let test_epoch = Epoch::new(5); @@ -817,4 +904,133 @@ mod tests { expected_cgc as 
usize ); } + + #[test] + fn restore_from_persisted_fullnode_no_validators() { + let spec = E::default_spec(); + let ssz_context = CustodyContextSsz { + validator_custody_at_head: 0, // no validators + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![], + }; + + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.custody_requirement, + "restored custody group count should match fullnode default" + ); + } + + #[test] + fn restore_fullnode_then_switch_to_supernode_has_no_effect() { + let spec = E::default_spec(); + let ssz_context = CustodyContextSsz { + validator_custody_at_head: 0, // no validators + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![], + }; + + // Attempt to restore as supernode (wants 128), but should use original persisted value + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Supernode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + spec.custody_requirement, + "should use original fullnode cgc, not supernode cgc" + ); + } + + #[test] + fn restore_supernode_then_switch_to_fullnode_uses_persisted() { + let spec = E::default_spec(); + let supernode_cgc = spec.number_of_custody_groups; // supernode cgc + + let ssz_context = CustodyContextSsz { + validator_custody_at_head: supernode_cgc, + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![(Epoch::new(0), supernode_cgc)], + }; + + // Attempt to restore as fullnode (wants 8), but should keep persisted value (128) + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + supernode_cgc, + "should use persisted supernode cgc, 
not fullnode cgc" + ); + } + + #[test] + fn restore_with_validator_custody_history_across_epochs() { + let spec = E::default_spec(); + let initial_cgc = 8u64; + let increased_cgc = 16u64; + let final_cgc = 32u64; + + let ssz_context = CustodyContextSsz { + validator_custody_at_head: final_cgc, + persisted_is_supernode: false, + epoch_validator_custody_requirements: vec![ + (Epoch::new(0), initial_cgc), + (Epoch::new(10), increased_cgc), + (Epoch::new(20), final_cgc), + ], + }; + + let custody_context = CustodyContext::::new_from_persisted_custody_context( + ssz_context, + NodeCustodyType::Fullnode, + &spec, + ); + + // Verify head uses latest value + assert_eq!( + custody_context.custody_group_count_at_head(&spec), + final_cgc + ); + + // Verify historical epoch lookups work correctly + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(5), &spec), + initial_cgc, + "epoch 5 should use initial cgc" + ); + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(15), &spec), + increased_cgc, + "epoch 15 should use increased cgc" + ); + assert_eq!( + custody_context.custody_group_count_at_epoch(Epoch::new(25), &spec), + final_cgc, + "epoch 25 should use final cgc" + ); + + // Verify sampling size calculation uses correct historical values + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(5), &spec), + spec.samples_per_slot, + "sampling at epoch 5 should use spec minimum since cgc is at minimum" + ); + assert_eq!( + custody_context.num_of_custody_groups_to_sample(Epoch::new(25), &spec), + final_cgc, + "sampling at epoch 25 should match final cgc" + ); + } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index c937c32c68..d6cc8d8947 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -868,6 +868,7 @@ impl MaybeAvailableBlock { mod test { use 
super::*; use crate::CustodyContext; + use crate::custody_context::NodeCustodyType; use crate::test_utils::{ EphemeralHarnessType, NumBlobs, generate_rand_block_and_data_columns, get_kzg, }; @@ -1201,7 +1202,7 @@ mod test { ); let kzg = get_kzg(&spec); let store = Arc::new(HotColdDB::open_ephemeral(<_>::default(), spec.clone()).unwrap()); - let custody_context = Arc::new(CustodyContext::new(false)); + let custody_context = Arc::new(CustodyContext::new(NodeCustodyType::Fullnode, &spec)); let complete_blob_backfill = false; DataAvailabilityChecker::new( complete_blob_backfill, diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 42f6dbd856..b842a1a3f9 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -827,6 +827,7 @@ mod test { blob_verification::GossipVerifiedBlob, block_verification::PayloadVerificationOutcome, block_verification_types::{AsBlock, BlockImportData}, + custody_context::NodeCustodyType, data_availability_checker::STATE_LRU_CAPACITY, test_utils::{BaseHarnessType, BeaconChainHarness, DiskHarnessType}, }; @@ -1021,7 +1022,7 @@ mod test { let spec = harness.spec.clone(); let test_store = harness.chain.store.clone(); let capacity_non_zero = new_non_zero_usize(capacity); - let custody_context = Arc::new(CustodyContext::new(false)); + let custody_context = Arc::new(CustodyContext::new(NodeCustodyType::Fullnode, &spec)); let cache = Arc::new( DataAvailabilityCheckerInner::::new( capacity_non_zero, diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index fd2162e7d3..4ac3e54742 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -17,6 +17,7 @@ pub mod block_verification_types; pub mod builder; pub mod canonical_head; pub mod chain_config; 
+pub mod custody_context; pub mod data_availability_checker; pub mod data_column_verification; mod early_attester_cache; @@ -55,7 +56,6 @@ pub mod summaries_dag; pub mod sync_committee_rewards; pub mod sync_committee_verification; pub mod test_utils; -pub mod validator_custody; pub mod validator_monitor; pub mod validator_pubkey_cache; @@ -84,6 +84,7 @@ pub use block_verification::{ pub use block_verification_types::AvailabilityPendingExecutedBlock; pub use block_verification_types::ExecutedBlock; pub use canonical_head::{CachedHead, CanonicalHead, CanonicalHeadRwLock}; +pub use custody_context::CustodyContext; pub use events::ServerSentEventHandler; pub use execution_layer::EngineState; pub use execution_payload::NotifyExecutionLayer; @@ -99,4 +100,3 @@ pub use state_processing::per_block_processing::errors::{ }; pub use store; pub use types; -pub use validator_custody::CustodyContext; diff --git a/beacon_node/beacon_chain/src/persisted_custody.rs b/beacon_node/beacon_chain/src/persisted_custody.rs index b685ea36b7..ba221c67b5 100644 --- a/beacon_node/beacon_chain/src/persisted_custody.rs +++ b/beacon_node/beacon_chain/src/persisted_custody.rs @@ -1,4 +1,4 @@ -use crate::validator_custody::CustodyContextSsz; +use crate::custody_context::CustodyContextSsz; use ssz::{Decode, Encode}; use std::sync::Arc; use store::{DBColumn, Error as StoreError, HotColdDB, ItemStore, StoreItem}; diff --git a/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs b/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs index 661d015942..38714ea060 100644 --- a/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs +++ b/beacon_node/beacon_chain/src/schema_change/migration_schema_v26.rs @@ -1,6 +1,6 @@ use crate::BeaconChainTypes; +use crate::custody_context::CustodyContextSsz; use crate::persisted_custody::{CUSTODY_DB_KEY, PersistedCustody}; -use crate::validator_custody::CustodyContextSsz; use ssz::{Decode, Encode}; use ssz_derive::{Decode, 
Encode}; use std::sync::Arc; diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 0b125efa32..38797d0264 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -1,5 +1,6 @@ use crate::blob_verification::GossipVerifiedBlob; use crate::block_verification_types::{AsBlock, RpcBlock}; +use crate::custody_context::NodeCustodyType; use crate::data_column_verification::CustodyDataColumn; use crate::kzg_utils::build_data_column_sidecars; use crate::observed_operations::ObservationOutcome; @@ -210,7 +211,7 @@ pub struct Builder { testing_slot_clock: Option, validator_monitor_config: Option, genesis_state_builder: Option>, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, runtime: TestRuntime, } @@ -356,7 +357,7 @@ where testing_slot_clock: None, validator_monitor_config: None, genesis_state_builder: None, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, runtime, } } @@ -442,8 +443,8 @@ where self } - pub fn import_all_data_columns(mut self, import_all_data_columns: bool) -> Self { - self.import_all_data_columns = import_all_data_columns; + pub fn node_custody_type(mut self, node_custody_type: NodeCustodyType) -> Self { + self.node_custody_type = node_custody_type; self } @@ -565,7 +566,7 @@ where .execution_layer(self.execution_layer) .shutdown_sender(shutdown_tx) .chain_config(chain_config) - .import_all_data_columns(self.import_all_data_columns) + .node_custody_type(self.node_custody_type) .event_handler(Some(ServerSentEventHandler::new_with_capacity(5))) .validator_monitor_config(validator_monitor_config) .rng(Box::new(StdRng::seed_from_u64(42))); diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 47f5be02cb..7dfef50ea1 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ 
b/beacon_node/beacon_chain/tests/block_verification.rs @@ -4,6 +4,7 @@ use beacon_chain::block_verification_types::{AsBlock, ExecutedBlock, RpcBlock}; use beacon_chain::data_column_verification::CustodyDataColumn; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, ExecutionPendingBlock, + custody_context::NodeCustodyType, test_utils::{ AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, }, @@ -45,7 +46,7 @@ async fn get_chain_segment() -> (Vec>, Vec (Vec>, Vec BeaconChainHarness> { let harness = BeaconChainHarness::builder(MainnetEthSpec) .default_spec() @@ -115,7 +116,7 @@ fn get_harness( ..ChainConfig::default() }) .keypairs(KEYPAIRS[0..validator_count].to_vec()) - .import_all_data_columns(supernode) + .node_custody_type(node_custody_type) .fresh_ephemeral_store() .mock_execution_layer() .build(); @@ -259,7 +260,7 @@ fn update_data_column_signed_header( #[tokio::test] async fn chain_segment_full_segment() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -297,7 +298,7 @@ async fn chain_segment_full_segment() { #[tokio::test] async fn chain_segment_varying_chunk_size() { for chunk_size in &[1, 2, 3, 5, 31, 32, 33, 42] { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -329,7 +330,7 @@ async fn chain_segment_varying_chunk_size() { #[tokio::test] async fn chain_segment_non_linear_parent_roots() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let 
(chain_segment, chain_segment_blobs) = get_chain_segment().await; harness @@ -386,7 +387,7 @@ async fn chain_segment_non_linear_parent_roots() { #[tokio::test] async fn chain_segment_non_linear_slots() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness .chain @@ -528,7 +529,7 @@ async fn assert_invalid_signature( async fn get_invalid_sigs_harness( chain_segment: &[BeaconSnapshot], ) -> BeaconChainHarness> { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); harness .chain .slot_clock @@ -986,7 +987,7 @@ fn unwrap_err(result: Result) -> U { #[tokio::test] async fn block_gossip_verification() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let block_index = CHAIN_SEGMENT_LENGTH - 2; @@ -1389,7 +1390,7 @@ async fn verify_block_for_gossip_slashing_detection() { #[tokio::test] async fn verify_block_for_gossip_doppelganger_detection() { - let harness = get_harness(VALIDATOR_COUNT, false); + let harness = get_harness(VALIDATOR_COUNT, NodeCustodyType::Fullnode); let state = harness.get_current_state(); let ((block, _), _) = harness.make_block(state.clone(), Slot::new(1)).await; diff --git a/beacon_node/beacon_chain/tests/column_verification.rs b/beacon_node/beacon_chain/tests/column_verification.rs index 5cd3811ea5..229ae1e199 100644 --- a/beacon_node/beacon_chain/tests/column_verification.rs +++ b/beacon_node/beacon_chain/tests/column_verification.rs @@ -1,5 +1,6 @@ #![cfg(not(debug_assertions))] +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::test_utils::{ AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, generate_data_column_sidecars_from_block, 
test_spec, @@ -24,7 +25,7 @@ static KEYPAIRS: LazyLock> = fn get_harness( validator_count: usize, spec: Arc, - supernode: bool, + node_custody_type: NodeCustodyType, ) -> BeaconChainHarness> { create_test_tracing_subscriber(); let harness = BeaconChainHarness::builder(MainnetEthSpec) @@ -34,7 +35,7 @@ fn get_harness( ..ChainConfig::default() }) .keypairs(KEYPAIRS[0..validator_count].to_vec()) - .import_all_data_columns(supernode) + .node_custody_type(node_custody_type) .fresh_ephemeral_store() .mock_execution_layer() .build(); @@ -54,8 +55,7 @@ async fn rpc_columns_with_invalid_header_signature() { return; } - let supernode = true; - let harness = get_harness(VALIDATOR_COUNT, spec, supernode); + let harness = get_harness(VALIDATOR_COUNT, spec, NodeCustodyType::Supernode); let num_blocks = E::slots_per_epoch() as usize; diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 69d16b3071..53e841692e 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -3,6 +3,7 @@ use beacon_chain::attestation_verification::Error as AttnError; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::builder::BeaconChainBuilder; +use beacon_chain::custody_context::CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS; use beacon_chain::data_availability_checker::AvailableBlock; use beacon_chain::historical_data_columns::HistoricalDataColumnError; use beacon_chain::schema_change::migrate_schema; @@ -11,13 +12,13 @@ use beacon_chain::test_utils::{ AttestationStrategy, BeaconChainHarness, BlockStrategy, DiskHarnessType, get_kzg, mock_execution_layer_from_parts, test_spec, }; -use beacon_chain::validator_custody::CUSTODY_CHANGE_DA_EFFECTIVE_DELAY_SECONDS; use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, BeaconSnapshot, BlockError, ChainConfig, NotifyExecutionLayer, ServerSentEventHandler, WhenSlotSkipped, beacon_proposer_cache::{ 
compute_proposer_duties_from_head, ensure_state_can_determine_proposers_for_epoch, }, + custody_context::NodeCustodyType, data_availability_checker::MaybeAvailableBlock, historical_blocks::HistoricalBlockError, migrate::MigratorConfig, @@ -98,7 +99,12 @@ fn get_harness( reconstruct_historic_states: true, ..ChainConfig::default() }; - get_harness_generic(store, validator_count, chain_config, false) + get_harness_generic( + store, + validator_count, + chain_config, + NodeCustodyType::Fullnode, + ) } fn get_harness_import_all_data_columns( @@ -110,14 +116,19 @@ fn get_harness_import_all_data_columns( reconstruct_historic_states: true, ..ChainConfig::default() }; - get_harness_generic(store, validator_count, chain_config, true) + get_harness_generic( + store, + validator_count, + chain_config, + NodeCustodyType::Supernode, + ) } fn get_harness_generic( store: Arc, BeaconNodeBackend>>, validator_count: usize, chain_config: ChainConfig, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, ) -> TestHarness { let harness = TestHarness::builder(MinimalEthSpec) .spec(store.get_chain_spec().clone()) @@ -125,7 +136,7 @@ fn get_harness_generic( .fresh_disk_store(store) .mock_execution_layer() .chain_config(chain_config) - .import_all_data_columns(import_all_data_columns) + .node_custody_type(node_custody_type) .build(); harness.advance_slot(); harness @@ -3420,7 +3431,12 @@ async fn process_blocks_and_attestations_for_unaligned_checkpoint() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let harness = get_harness_generic(store.clone(), LOW_VALIDATOR_COUNT, chain_config, false); + let harness = get_harness_generic( + store.clone(), + LOW_VALIDATOR_COUNT, + chain_config, + NodeCustodyType::Fullnode, + ); let all_validators = (0..LOW_VALIDATOR_COUNT).collect::>(); @@ -3839,14 +3855,13 @@ async fn schema_downgrade_to_min_version( reconstruct_historic_states, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = 
get_store_generic(&db_path, store_config.clone(), spec.clone()); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config.clone(), - import_all_data_columns, + NodeCustodyType::Fullnode, ); harness @@ -4862,14 +4877,13 @@ async fn ancestor_state_root_prior_to_split() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = get_store_generic(&db_path, store_config, spec); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config, - import_all_data_columns, + NodeCustodyType::Fullnode, ); // Produce blocks until we have passed through two full snapshot periods. This period length is @@ -4956,14 +4970,13 @@ async fn replay_from_split_state() { reconstruct_historic_states: false, ..ChainConfig::default() }; - let import_all_data_columns = false; let store = get_store_generic(&db_path, store_config.clone(), spec.clone()); let harness = get_harness_generic( store.clone(), LOW_VALIDATOR_COUNT, chain_config, - import_all_data_columns, + NodeCustodyType::Fullnode, ); // Produce blocks until we finalize epoch 3 which will not be stored as a snapshot. 
diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 02c042bf28..c3c827f0aa 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -202,7 +202,7 @@ where .beacon_graffiti(beacon_graffiti) .event_handler(event_handler) .execution_layer(execution_layer) - .import_all_data_columns(config.network.subscribe_all_data_column_subnets) + .node_custody_type(config.chain.node_custody_type) .validator_monitor_config(config.validator_monitor.clone()) .rng(Box::new( StdRng::try_from_rng(&mut OsRng) diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 7c2282a488..dc2fd4ae44 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -1,3 +1,4 @@ +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::test_utils::RelativeSyncCommittee; use beacon_chain::{ BeaconChain, ChainConfig, StateSkipConfig, WhenSlotSkipped, @@ -90,7 +91,7 @@ struct ApiTester { struct ApiTesterConfig { spec: ChainSpec, retain_historic_states: bool, - import_all_data_columns: bool, + node_custody_type: NodeCustodyType, } impl Default for ApiTesterConfig { @@ -100,7 +101,7 @@ impl Default for ApiTesterConfig { Self { spec, retain_historic_states: false, - import_all_data_columns: false, + node_custody_type: NodeCustodyType::Fullnode, } } } @@ -139,7 +140,7 @@ impl ApiTester { .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() - .import_all_data_columns(config.import_all_data_columns) + .node_custody_type(config.node_custody_type) .build(); harness @@ -7842,8 +7843,7 @@ async fn get_blobs_post_fulu_supernode() { let mut config = ApiTesterConfig { retain_historic_states: false, spec: E::default_spec(), - // For supernode, we import all data columns - import_all_data_columns: true, + node_custody_type: NodeCustodyType::Supernode, }; config.spec.altair_fork_epoch = Some(Epoch::new(0)); 
config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index a3aef8f802..a9794cb5c4 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -9,6 +9,7 @@ use crate::{ sync::{SyncMessage, manager::BlockProcessType}, }; use beacon_chain::block_verification_types::RpcBlock; +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::data_column_verification::validate_data_column_sidecar_for_gossip; use beacon_chain::kzg_utils::blobs_to_data_column_sidecars; use beacon_chain::observed_data_sidecars::DoNotObserve; @@ -94,20 +95,32 @@ impl TestRig { // This allows for testing voluntary exits without building out a massive chain. let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), false, spec).await + Self::new_parametric( + chain_length, + BeaconProcessorConfig::default(), + NodeCustodyType::Fullnode, + spec, + ) + .await } pub async fn new_supernode(chain_length: u64) -> Self { // This allows for testing voluntary exits without building out a massive chain. 
let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), true, spec).await + Self::new_parametric( + chain_length, + BeaconProcessorConfig::default(), + NodeCustodyType::Supernode, + spec, + ) + .await } pub async fn new_parametric( chain_length: u64, beacon_processor_config: BeaconProcessorConfig, - import_data_columns: bool, + node_custody_type: NodeCustodyType, spec: ChainSpec, ) -> Self { let spec = Arc::new(spec); @@ -116,7 +129,7 @@ impl TestRig { .deterministic_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() - .import_all_data_columns(import_data_columns) + .node_custody_type(node_custody_type) .chain_config(<_>::default()) .build(); @@ -1610,7 +1623,7 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { let mut rig = TestRig::new_parametric( SMALL_CHAIN, beacon_processor_config, - false, + NodeCustodyType::Fullnode, test_spec::(), ) .await; @@ -1692,7 +1705,13 @@ async fn test_blobs_by_range_spans_fulu_fork() { spec.fulu_fork_epoch = Some(Epoch::new(1)); spec.gloas_fork_epoch = Some(Epoch::new(2)); - let mut rig = TestRig::new_parametric(64, BeaconProcessorConfig::default(), false, spec).await; + let mut rig = TestRig::new_parametric( + 64, + BeaconProcessorConfig::default(), + NodeCustodyType::Fullnode, + spec, + ) + .await; let start_slot = 16; // This will span from epoch 0 (Electra) to epoch 1 (Fulu) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 28f355151d..e4c7c6ff1f 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -59,6 +59,18 @@ pub fn cli_app() -> Command { helps network resilience by serving all data columns to syncing peers.") .display_order(0) ) + .arg( + Arg::new("semi-supernode") + .long("semi-supernode") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .conflicts_with("supernode") + .help("Run in minimal reconstruction mode. 
This node will subscribe to and custody \ + half of the data columns (enough for reconstruction), enabling efficient \ + data availability with lower bandwidth and storage requirements compared to \ + a supernode, while still supporting full blob reconstruction.") + .display_order(0) + ) .arg( Arg::new("malicious-withhold-count") .long("malicious-withhold-count") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index acb392779f..3b0e80e0b7 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -4,6 +4,7 @@ use beacon_chain::chain_config::{ DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_RE_ORG_PARENT_THRESHOLD, DisallowedReOrgOffsets, INVALID_HOLESKY_BLOCK_ROOT, ReOrgThreshold, }; +use beacon_chain::custody_context::NodeCustodyType; use beacon_chain::graffiti_calculator::GraffitiOrigin; use clap::{ArgMatches, Id, parser::ValueSource}; use clap_utils::flags::DISABLE_MALLOC_TUNING_FLAG; @@ -108,6 +109,19 @@ pub fn get_config( set_network_config(&mut client_config.network, cli_args, &data_dir_ref)?; + // Parse custody mode from CLI flags + let is_supernode = parse_flag(cli_args, "supernode"); + let is_semi_supernode = parse_flag(cli_args, "semi-supernode"); + + client_config.chain.node_custody_type = if is_supernode { + client_config.network.subscribe_all_data_column_subnets = true; + NodeCustodyType::Supernode + } else if is_semi_supernode { + NodeCustodyType::SemiSupernode + } else { + NodeCustodyType::Fullnode + }; + /* * Staking flag * Note: the config values set here can be overwritten by other more specific cli params @@ -1136,10 +1150,6 @@ pub fn set_network_config( config.network_dir = data_dir.join(DEFAULT_NETWORK_DIR); }; - if parse_flag(cli_args, "supernode") { - config.subscribe_all_data_column_subnets = true; - } - if parse_flag(cli_args, "subscribe-all-subnets") { config.subscribe_all_subnets = true; } diff --git a/book/src/help_bn.md b/book/src/help_bn.md index 6680202a27..5f3c43a7e4 100644 --- 
a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -552,6 +552,12 @@ Flags: When present, Lighthouse will forget the payload statuses of any already-imported blocks. This can assist in the recovery from a consensus failure caused by the execution layer. + --semi-supernode + Run in minimal reconstruction mode. This node will subscribe to and + custody half of the data columns (enough for reconstruction), enabling + efficient data availability with lower bandwidth and storage + requirements compared to a supernode, while still supporting full blob + reconstruction. --shutdown-after-sync Shutdown beacon node as soon as sync is completed. Backfill sync will not be performed before shutdown. diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 5a057d7d7f..8342b02173 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -4,6 +4,7 @@ use beacon_node::beacon_chain::chain_config::{ DEFAULT_RE_ORG_MAX_EPOCHS_SINCE_FINALIZATION, DEFAULT_SYNC_TOLERANCE_EPOCHS, DisallowedReOrgOffsets, }; +use beacon_node::beacon_chain::custody_context::NodeCustodyType; use beacon_node::{ ClientConfig as Config, beacon_chain::graffiti_calculator::GraffitiOrigin, beacon_chain::store::config::DatabaseBackend as BeaconNodeBackend, @@ -782,20 +783,38 @@ fn network_subscribe_all_data_column_subnets_flag() { CommandLineTest::new() .flag("subscribe-all-data-column-subnets", None) .run_with_zero_port() - .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + assert_eq!(config.chain.node_custody_type, NodeCustodyType::Supernode) + }); } #[test] fn network_supernode_flag() { CommandLineTest::new() .flag("supernode", None) .run_with_zero_port() - .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + assert_eq!(config.chain.node_custody_type, NodeCustodyType::Supernode) + }); } #[test] -fn 
network_subscribe_all_data_column_subnets_default() { +fn network_semi_supernode_flag() { + CommandLineTest::new() + .flag("semi-supernode", None) + .run_with_zero_port() + .with_config(|config| { + assert_eq!( + config.chain.node_custody_type, + NodeCustodyType::SemiSupernode + ) + }); +} +#[test] +fn network_node_custody_type_default() { CommandLineTest::new() .run_with_zero_port() - .with_config(|config| assert!(!config.network.subscribe_all_data_column_subnets)); + .with_config(|config| { + assert_eq!(config.chain.node_custody_type, NodeCustodyType::Fullnode) + }); } #[test] fn blob_publication_batches() { diff --git a/testing/ef_tests/src/cases/fork_choice.rs b/testing/ef_tests/src/cases/fork_choice.rs index 1380e44acd..47b9902345 100644 --- a/testing/ef_tests/src/cases/fork_choice.rs +++ b/testing/ef_tests/src/cases/fork_choice.rs @@ -16,6 +16,7 @@ use beacon_chain::{ VerifiedAttestation, obtain_indexed_attestation_and_committees_per_slot, }, blob_verification::GossipVerifiedBlob, + custody_context::NodeCustodyType, test_utils::{BeaconChainHarness, EphemeralHarnessType}, }; use execution_layer::{PayloadStatusV1, json_structures::JsonPayloadStatusV1Status}; @@ -436,7 +437,7 @@ impl Tester { .genesis_state_ephemeral_store(case.anchor_state.clone()) .mock_execution_layer() .recalculate_fork_times_with_genesis(0) - .import_all_data_columns(true) + .node_custody_type(NodeCustodyType::Supernode) .mock_execution_layer_all_payloads_valid() .build();