From 66edda2690f4c20d8dd83f0eca6b790221aba296 Mon Sep 17 00:00:00 2001 From: Paul Etscheit Date: Mon, 1 Sep 2025 04:22:40 +0200 Subject: [PATCH 01/81] Impl ForkVersionDecode for beacon state (#7954) --- consensus/types/src/beacon_block_body.rs | 2 +- consensus/types/src/beacon_state.rs | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/consensus/types/src/beacon_block_body.rs b/consensus/types/src/beacon_block_body.rs index 73d02dbe61..7df9c3f3cb 100644 --- a/consensus/types/src/beacon_block_body.rs +++ b/consensus/types/src/beacon_block_body.rs @@ -170,7 +170,7 @@ impl<'a, E: EthSpec, Payload: AbstractExecPayload> BeaconBlockBodyRef<'a, E, } } - pub(crate) fn body_merkle_leaves(&self) -> Vec { + pub fn body_merkle_leaves(&self) -> Vec { let mut leaves = vec![]; match self { Self::Base(body) => { diff --git a/consensus/types/src/beacon_state.rs b/consensus/types/src/beacon_state.rs index 5f3dff56cc..d2efbfe909 100644 --- a/consensus/types/src/beacon_state.rs +++ b/consensus/types/src/beacon_state.rs @@ -2632,6 +2632,12 @@ impl BeaconState { } } +impl ForkVersionDecode for BeaconState { + fn from_ssz_bytes_by_fork(bytes: &[u8], fork_name: ForkName) -> Result { + Ok(map_fork_name!(fork_name, Self, <_>::from_ssz_bytes(bytes)?)) + } +} + impl BeaconState { /// The number of fields of the `BeaconState` rounded up to the nearest power of two. /// @@ -2760,7 +2766,7 @@ impl BeaconState { Ok(proof) } - fn generate_proof( + pub fn generate_proof( &self, field_index: usize, leaves: &[Hash256], @@ -2775,7 +2781,7 @@ impl BeaconState { Ok(proof) } - fn get_beacon_state_leaves(&self) -> Vec { + pub fn get_beacon_state_leaves(&self) -> Vec { let mut leaves = vec![]; #[allow(clippy::arithmetic_side_effects)] match self { From 477c534cd7e151b4e2fdee703043872c4c77ea96 Mon Sep 17 00:00:00 2001 From: Sam Wilson <57262657+SamWilsn@users.noreply.github.com> Date: Mon, 1 Sep 2025 02:03:55 -0400 Subject: [PATCH 02/81] Remove dependency on target_info. 
(#7964) Remove dependency on target_info, use standard library instead. --- Cargo.lock | 7 ------- common/lighthouse_version/Cargo.toml | 1 - common/lighthouse_version/src/lib.rs | 4 ++-- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1bd65e1721..e64ffca53e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5687,7 +5687,6 @@ version = "0.1.0" dependencies = [ "git-version", "regex", - "target_info", ] [[package]] @@ -9192,12 +9191,6 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "target_info" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c63f48baada5c52e65a29eef93ab4f8982681b67f9e8d29c7b05abcfec2b9ffe" - [[package]] name = "task_executor" version = "0.1.0" diff --git a/common/lighthouse_version/Cargo.toml b/common/lighthouse_version/Cargo.toml index cb4a43e407..b7e669ed94 100644 --- a/common/lighthouse_version/Cargo.toml +++ b/common/lighthouse_version/Cargo.toml @@ -7,7 +7,6 @@ edition = { workspace = true } [dependencies] git-version = "0.3.4" -target_info = "0.1.0" [dev-dependencies] regex = { workspace = true } diff --git a/common/lighthouse_version/src/lib.rs b/common/lighthouse_version/src/lib.rs index 238efd591a..c45dbac4d3 100644 --- a/common/lighthouse_version/src/lib.rs +++ b/common/lighthouse_version/src/lib.rs @@ -1,5 +1,5 @@ use git_version::git_version; -use target_info::Target; +use std::env::consts; /// Returns the current version of this build of Lighthouse. /// @@ -45,7 +45,7 @@ pub const COMMIT_PREFIX: &str = git_version!( /// /// `Lighthouse/v1.5.1-67da032+/x86_64-linux` pub fn version_with_platform() -> String { - format!("{}/{}-{}", VERSION, Target::arch(), Target::os()) + format!("{}/{}-{}", VERSION, consts::ARCH, consts::OS) } /// Returns semantic versioning information only. 
From c7492f1c27d23e60223ca4ef593ba88e0d8b70ba Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 1 Sep 2025 01:56:25 -0700 Subject: [PATCH 03/81] Update to `1.6.0 alpha.6` spec (#7967) Upgrade `rust_eth_kzg` library to `0.9` to support the new cell index sorting tests in `recover_cells_and_kzg_proofs` https://github.com/ethereum/consensus-specs/releases https://github.com/crate-crypto/rust-eth-kzg/compare/v0.8.1...v0.9.0 --- Cargo.lock | 46 ++++++++++---------- Cargo.toml | 2 +- testing/ef_tests/Makefile | 2 +- testing/ef_tests/check_all_files_accessed.py | 3 ++ 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e64ffca53e..e5c29b6bff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1165,9 +1165,9 @@ dependencies = [ [[package]] name = "blst" -version = "0.3.14" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47c79a94619fade3c0b887670333513a67ac28a6a7e653eb260bf0d4103db38d" +checksum = "4fd49896f12ac9b6dcd7a5998466b9b58263a695a3dd1ecc1aaca2e12a90b080" dependencies = [ "cc", "glob", @@ -2554,9 +2554,9 @@ dependencies = [ [[package]] name = "eip4844" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0067055675ea62c0287d520099d9a560f5ad4fd0c00956da99bbb2a68ad2bfc9" +checksum = "aa86cda6af15a9a5e4cf680850addaee8cd427be95be3ec9d022b9d7b98a66c0" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2579,9 +2579,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "ekzg-bls12-381" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef34382b1387ebc5acb0d509ab88401beade921af5982142778ae0c200f71edf" +checksum = "08f0e00a7689af7f4f17e85ae07f5a92b568a47297a165f685b828edfd82e02b" dependencies = [ "blst", "blstrs", @@ -2593,9 +2593,9 @@ dependencies = [ [[package]] name = "ekzg-erasure-codes" -version = 
"0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fa58fcb3f698451a3a1ceb5f4a13ea7a4decab9f0bad63ee1690671b12b901c" +checksum = "4bfc7ab684a7bb0c5ee37fd6a73da7425858cdd28f4a285c70361f001d6d0efc" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2603,15 +2603,15 @@ dependencies = [ [[package]] name = "ekzg-maybe-rayon" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce7a570aaa7eb80ea92637f7153a8cd4c20640a3043146b57590ab4ae8eb0e9" +checksum = "0e0a4876a612b9317be470768e134b671b8e645e412a82eb12fdd9b1958fa6f9" [[package]] name = "ekzg-multi-open" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a24896816c59dde1cf08b67480114edb9df1738b7f4f99ec51f7ce0e2dfaa0" +checksum = "2f7964754aa0921aaa89b1589100e4cae9b31f87f137eeb0af5403fdfca68bfc" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2621,9 +2621,9 @@ dependencies = [ [[package]] name = "ekzg-polynomial" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6520b5210781436d42ec6cb2e3a278573f1af10707b92502f5329ec967d30018" +checksum = "fed36d2ddf86661c9d18e9d5dfc47dce6c9b6e44db385e2da71952b10ba32df1" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2631,9 +2631,9 @@ dependencies = [ [[package]] name = "ekzg-serialization" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf1197575ca1babbd7861424e7c5339233fa8215cf8b1ee9188a2c354f34b6a" +checksum = "1c83402d591ac3534d1ae654feb8f56ee64cc2bacfe80bece7977c24ca5e72e2" dependencies = [ "ekzg-bls12-381", "hex", @@ -2641,9 +2641,9 @@ dependencies = [ [[package]] name = "ekzg-single-open" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4f6e471860c94135d9075562a991c4456c4148efdac2bfccc64e1bf3fd074beb" +checksum = "05e1dbb13023ccebbb24593e4753c87f77b7fb78254a20aef1a028e979145092" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2652,9 +2652,9 @@ dependencies = [ [[package]] name = "ekzg-trusted-setup" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1b016cc437c85ece6d54ecfe51b745516e520b388beb2b09a5196748bab21f3" +checksum = "ff1cb3e907b27fa51f35def95eeabe47e97765e2b6bac7e55967500937f94282" dependencies = [ "ekzg-bls12-381", "ekzg-serialization", @@ -7355,7 +7355,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.100", @@ -8008,9 +8008,9 @@ dependencies = [ [[package]] name = "rust_eth_kzg" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c552fbda8be95ddcbebc9ebcb198cb9fe97e538450bcb7476ce5d9e03c499ff" +checksum = "0dc46814bb8e72bff20fe117db43b7455112e6fafdae7466f8f24d451ad773c0" dependencies = [ "eip4844", "ekzg-bls12-381", diff --git a/Cargo.toml b/Cargo.toml index 8588be49c0..c08e7c59a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -223,7 +223,7 @@ reqwest = { version = "0.11", default-features = false, features = [ ring = "0.17" rpds = "0.11" rusqlite = { version = "0.28", features = ["bundled"] } -rust_eth_kzg = "0.8.0" +rust_eth_kzg = "0.9.0" safe_arith = { path = "consensus/safe_arith" } sensitive_url = { path = "common/sensitive_url" } serde = { version = "1", features = ["derive"] } diff --git a/testing/ef_tests/Makefile b/testing/ef_tests/Makefile index 0c6fd50dfd..da8640d681 100644 --- a/testing/ef_tests/Makefile +++ b/testing/ef_tests/Makefile @@ -1,6 +1,6 @@ # To download/extract nightly tests, run: # CONSENSUS_SPECS_TEST_VERSION=nightly make 
-CONSENSUS_SPECS_TEST_VERSION ?= v1.6.0-alpha.5 +CONSENSUS_SPECS_TEST_VERSION ?= v1.6.0-alpha.6 REPO_NAME := consensus-spec-tests OUTPUT_DIR := ./$(REPO_NAME) diff --git a/testing/ef_tests/check_all_files_accessed.py b/testing/ef_tests/check_all_files_accessed.py index 821287ce25..41e3c4bff7 100755 --- a/testing/ef_tests/check_all_files_accessed.py +++ b/testing/ef_tests/check_all_files_accessed.py @@ -59,6 +59,9 @@ excluded_paths = [ "tests/.*/.*/epoch_processing/.*/post_epoch.ssz_snappy", # Ignore gloas tests for now "tests/.*/gloas/.*", + # Ignore KZG tests that target internal kzg library functions + "tests/.*/compute_verify_cell_kzg_proof_batch_challenge/.*", + "tests/.*/compute_challenge/.*", ] From 9cc3c0553bb09efee95bde8724e50000ef7ff84a Mon Sep 17 00:00:00 2001 From: kevaundray Date: Mon, 1 Sep 2025 10:21:23 +0100 Subject: [PATCH 04/81] chore: small refactor of `epoch` method (#7902) Stylistic; mostly using early returns to avoid the nested logic Which issue # does this PR address? Please list or describe the changes introduced by this PR. --- .../overflow_lru_cache.rs | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 3c1b4e8b16..83d775f666 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -301,30 +301,28 @@ impl PendingComponents { } } - /// Returns the epoch of the block if it is cached, otherwise returns the epoch of the first blob. 
+ /// Returns the epoch of: + /// - The block if it is cached + /// - The first available blob + /// - The first data column + /// Otherwise, returns None pub fn epoch(&self) -> Option { - self.executed_block - .as_ref() - .map(|pending_block| pending_block.as_block().epoch()) - .or_else(|| { - for maybe_blob in self.verified_blobs.iter() { - if maybe_blob.is_some() { - return maybe_blob.as_ref().map(|kzg_verified_blob| { - kzg_verified_blob - .as_blob() - .slot() - .epoch(E::slots_per_epoch()) - }); - } - } + // Get epoch from cached executed block + if let Some(executed_block) = &self.executed_block { + return Some(executed_block.as_block().epoch()); + } - if let Some(kzg_verified_data_column) = self.verified_data_columns.first() { - let epoch = kzg_verified_data_column.as_data_column().epoch(); - return Some(epoch); - } + // Or, get epoch from first available blob + if let Some(blob) = self.verified_blobs.iter().flatten().next() { + return Some(blob.as_blob().slot().epoch(E::slots_per_epoch())); + } - None - }) + // Or, get epoch from first data column + if let Some(data_column) = self.verified_data_columns.first() { + return Some(data_column.as_data_column().epoch()); + } + + None } pub fn status_str( From 979ed2557cd8e4138d373560f3d515b280466cbc Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 1 Sep 2025 19:21:26 +1000 Subject: [PATCH 05/81] Remove `expect` usage in `kzg_utils` (#7957) Remove `expect` usage in `kzg_utils` to handle the case where EL sends us invalid proof size instead of crashing. 
--- beacon_node/beacon_chain/src/kzg_utils.rs | 38 ++++++++++++----------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index cde9050ed2..3063e78337 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -1,6 +1,6 @@ use kzg::{ Blob as KzgBlob, Bytes48, Cell as KzgCell, CellRef as KzgCellRef, CellsAndKzgProofs, - Error as KzgError, Kzg, + Error as KzgError, Kzg, KzgBlobRef, }; use rayon::prelude::*; use ssz_types::{FixedVector, VariableList}; @@ -28,9 +28,9 @@ fn ssz_blob_to_crypto_blob_boxed(blob: &Blob) -> Result(cell: &Cell) -> Result, KzgError> { let cell_bytes: &[u8] = cell.as_ref(); - Ok(cell_bytes + cell_bytes .try_into() - .expect("expected cell to have size {BYTES_PER_CELL}. This should be guaranteed by the `FixedVector type")) + .map_err(|e| KzgError::InconsistentArrayLength(format!("expected cell to have size BYTES_PER_CELL. This should be guaranteed by the `FixedVector` type: {e:?}"))) } /// Validate a single blob-commitment-proof triplet from a `BlobSidecar`. 
@@ -183,18 +183,19 @@ pub fn blobs_to_data_column_sidecars( let blob_cells_and_proofs_vec = zipped .into_par_iter() .map(|(blob, proofs)| { - let blob = blob - .as_ref() - .try_into() - .expect("blob should have a guaranteed size due to FixedVector"); + let blob = blob.as_ref().try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "blob should have a guaranteed size due to FixedVector: {e:?}" + )) + })?; - kzg.compute_cells(blob).map(|cells| { - ( - cells, - proofs - .try_into() - .expect("proof chunks should have exactly `number_of_columns` proofs"), - ) + kzg.compute_cells(blob).and_then(|cells| { + let proofs = proofs.try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "proof chunks should have exactly `number_of_columns` proofs: {e:?}" + )) + })?; + Ok((cells, proofs)) }) }) .collect::, KzgError>>()?; @@ -213,10 +214,11 @@ pub fn compute_cells(blobs: &[&Blob], kzg: &Kzg) -> Result = blob.as_ref().try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "blob should have a guaranteed size due to FixedVector: {e:?}", + )) + })?; kzg.compute_cells(blob) }) From eef02afc9326c5e7f1855a995debdbcc481f5c14 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 2 Sep 2025 17:18:23 +1000 Subject: [PATCH 06/81] Fix data availability checker race condition causing partial data columns to be served over RPC (#7961) Partially resolves #6439, an simpler alternative to #7931. Race condition occurs when RPC data columns arrives after a block has been imported and removed from the DA checker: 1. Block becomes available via gossip 2. RPC columns arrive and pass fork choice check (block hasn't been imported) 3. Block import completes (removing block from DA checker) 4. RPC data columns finish verification and get imported into DA checker This causes two issues: 1. **Partial data serving**: Already imported components get re-inserted, potentially causing LH to serve incomplete data 2. 
**State cache misses**: Leads to state reconstruction, holding the availability cache write lock longer and increasing race likelihood ### Proposed Changes 1. Never manually remove pending components from DA checker. Components are only removed via LRU eviction as finality advances. This makes sure we don't run into the issue described above. 2. Use `get` instead of `pop` when recovering the executed block, this prevents cache misses in race condition. This should reduce the likelihood of the race condition 3. Refactor DA checker to drop write lock as soon as components are added. This should also reduce the likelihood of the race condition **Trade-offs:** This solution eliminates a few nasty race conditions while allowing simplicity, with the cost of allowing block re-import (already existing). The increase in memory in DA checker can be partially offset by a reduction in block cache size if this really comes an issue (as we now serve recent blocks from DA checker). --- beacon_node/beacon_chain/src/beacon_chain.rs | 13 - .../src/data_availability_checker.rs | 22 +- .../overflow_lru_cache.rs | 264 ++++++++---------- .../state_lru_cache.rs | 5 +- 4 files changed, 130 insertions(+), 174 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 4358e4a872..b8a6529653 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -3815,19 +3815,6 @@ impl BeaconChain { .await?? }; - // Remove block components from da_checker AFTER completing block import. Then we can assert - // the following invariant: - // > A valid unfinalized block is either in fork-choice or da_checker. - // - // If we remove the block when it becomes available, there's some time window during - // `import_block` where the block is nowhere. Consumers of the da_checker can handle the - // extend time a block may exist in the da_checker. 
- // - // If `import_block` errors (only errors with internal errors), the pending components will - // be pruned on data_availability_checker maintenance as finality advances. - self.data_availability_checker - .remove_pending_components(block_root); - Ok(AvailabilityProcessingStatus::Imported(block_root)) } diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 2ebf765a4e..9225ed6b47 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -38,19 +38,18 @@ use crate::observed_data_sidecars::ObservationStrategy; pub use error::{Error as AvailabilityCheckError, ErrorCategory as AvailabilityCheckErrorCategory}; use types::non_zero_usize::new_non_zero_usize; -/// The LRU Cache stores `PendingComponents`, which can store up to `MAX_BLOBS_PER_BLOCK` blobs each. +/// The LRU Cache stores `PendingComponents`, which store block and its associated blob data: /// /// * Deneb blobs are 128 kb each and are stored in the form of `BlobSidecar`. /// * From Fulu (PeerDAS), blobs are erasure-coded and are 256 kb each, stored in the form of 128 `DataColumnSidecar`s. /// /// With `MAX_BLOBS_PER_BLOCK` = 48 (expected in the next year), the maximum size of data columns -/// in `PendingComponents` is ~12.29 MB. Setting this to 64 means the maximum size of the cache is -/// approximately 0.8 GB. +/// in `PendingComponents` is ~12.29 MB. Setting this to 32 means the maximum size of the cache is +/// approximately 0.4 GB. /// -/// Under normal conditions, the cache should only store the current pending block, but could -/// occasionally spike to 2-4 for various reasons e.g. components arriving late, but would very -/// rarely go above this, unless there are many concurrent forks. 
-pub const OVERFLOW_LRU_CAPACITY: NonZeroUsize = new_non_zero_usize(64); +/// `PendingComponents` are now never removed from the cache manually are only removed via LRU +/// eviction to prevent race conditions (#7961), so we expect this cache to be full all the time. +pub const OVERFLOW_LRU_CAPACITY: NonZeroUsize = new_non_zero_usize(32); pub const STATE_LRU_CAPACITY_NON_ZERO: NonZeroUsize = new_non_zero_usize(32); pub const STATE_LRU_CAPACITY: usize = STATE_LRU_CAPACITY_NON_ZERO.get(); @@ -346,11 +345,6 @@ impl DataAvailabilityChecker { .put_pending_executed_block(executed_block) } - pub fn remove_pending_components(&self, block_root: Hash256) { - self.availability_cache - .remove_pending_components(block_root) - } - /// Verifies kzg commitments for an RpcBlock, returns a `MaybeAvailableBlock` that may /// include the fully available block. /// @@ -589,8 +583,8 @@ impl DataAvailabilityChecker { // Check indices from cache again to make sure we don't publish components we've already received. 
let Some(existing_column_indices) = self.cached_data_column_indexes(block_root) else { - return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported( - "block already imported", + return Err(AvailabilityCheckError::Unexpected( + "block no longer exists in the data availability checker".to_string(), )); }; diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 83d775f666..eaea2f70da 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -11,7 +11,7 @@ use crate::data_availability_checker::{Availability, AvailabilityCheckError}; use crate::data_column_verification::KzgVerifiedCustodyDataColumn; use lighthouse_tracing::SPAN_PENDING_COMPONENTS; use lru::LruCache; -use parking_lot::RwLock; +use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::cmp::Ordering; use std::num::NonZeroUsize; use std::sync::Arc; @@ -89,8 +89,6 @@ impl PendingComponents { /// Inserts a block into the cache. pub fn insert_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { - let _guard = self.span.clone().entered(); - debug!("Block added to pending components"); *self.get_cached_block_mut() = Some(block) } @@ -98,9 +96,7 @@ impl PendingComponents { /// /// Existing blob at the index will be replaced. 
pub fn insert_blob_at_index(&mut self, blob_index: usize, blob: KzgVerifiedBlob) { - let _guard = self.span.clone().entered(); if let Some(b) = self.get_cached_blobs_mut().get_mut(blob_index) { - debug!(blob_index, "Blob added to pending components"); *b = Some(blob); } } @@ -140,13 +136,8 @@ impl PendingComponents { &mut self, kzg_verified_data_columns: I, ) -> Result<(), AvailabilityCheckError> { - let _guard = self.span.clone().entered(); for data_column in kzg_verified_data_columns { if self.get_cached_data_column(data_column.index()).is_none() { - debug!( - column_index = data_column.index(), - "Data column added to pending components" - ); self.verified_data_columns.push(data_column); } } @@ -169,9 +160,9 @@ impl PendingComponents { /// WARNING: This function can potentially take a lot of time if the state needs to be /// reconstructed from disk. Ensure you are not holding any write locks while calling this. pub fn make_available( - &mut self, + &self, spec: &Arc, - num_expected_columns: usize, + num_expected_columns_opt: Option, recover: R, ) -> Result>, AvailabilityCheckError> where @@ -188,7 +179,7 @@ impl PendingComponents { let num_expected_blobs = block.num_blobs_expected(); let blob_data = if num_expected_blobs == 0 { Some(AvailableBlockData::NoData) - } else if spec.is_peer_das_enabled_for_epoch(block.epoch()) { + } else if let Some(num_expected_columns) = num_expected_columns_opt { let num_received_columns = self.verified_data_columns.len(); match num_received_columns.cmp(&num_expected_columns) { Ordering::Greater => { @@ -325,21 +316,14 @@ impl PendingComponents { None } - pub fn status_str( - &self, - block_epoch: Epoch, - num_expected_columns: Option, - spec: &ChainSpec, - ) -> String { + pub fn status_str(&self, num_expected_columns_opt: Option) -> String { let block_count = if self.executed_block.is_some() { 1 } else { 0 }; - if spec.is_peer_das_enabled_for_epoch(block_epoch) { + if let Some(num_expected_columns) = num_expected_columns_opt { 
format!( "block {} data_columns {}/{}", block_count, self.verified_data_columns.len(), num_expected_columns - .map(|c| c.to_string()) - .unwrap_or("?".into()) ) } else { let num_expected_blobs = if let Some(block) = self.get_cached_block() { @@ -475,41 +459,21 @@ impl DataAvailabilityCheckerInner { *blob_opt = Some(blob); } } + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_blobs(fixed_blobs); + Ok(()) + })?; - let mut write_lock = self.critical.write(); + pending_components.span.in_scope(|| { + debug!( + component = "blobs", + status = pending_components.status_str(None), + "Component added to data availability checker" + ); + }); - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); - - // Merge in the blobs. - pending_components.merge_blobs(fixed_blobs); - - debug!( - component = "blobs", - ?block_root, - status = pending_components.status_str(epoch, None, &self.spec), - "Component added to data availability checker" - ); - - if let Some(available_block) = pending_components.make_available( - &self.spec, - self.custody_context - .num_of_data_columns_to_sample(epoch, &self.spec), - |block, span| self.state_cache.recover_pending_executed_block(block, span), - )? { - // We keep the pending components in the availability cache during block import (#5845). 
- write_lock.put(block_root, pending_components); - drop(write_lock); - Ok(Availability::Available(Box::new(available_block))) - } else { - write_lock.put(block_root, pending_components); - Ok(Availability::MissingComponents(block_root)) - } + self.check_availability_and_cache_components(block_root, pending_components, None) } #[allow(clippy::type_complexity)] @@ -532,44 +496,91 @@ impl DataAvailabilityCheckerInner { return Ok(Availability::MissingComponents(block_root)); }; - let mut write_lock = self.critical.write(); - - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); - - // Merge in the data columns. - pending_components.merge_data_columns(kzg_verified_data_columns)?; + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_data_columns(kzg_verified_data_columns) + })?; let num_expected_columns = self .custody_context .num_of_data_columns_to_sample(epoch, &self.spec); - debug!( - component = "data_columns", - ?block_root, - status = pending_components.status_str(epoch, Some(num_expected_columns), &self.spec), - "Component added to data availability checker" - ); - if let Some(available_block) = - pending_components.make_available(&self.spec, num_expected_columns, |block, span| { - self.state_cache.recover_pending_executed_block(block, span) - })? - { - // We keep the pending components in the availability cache during block import (#5845). 
- write_lock.put(block_root, pending_components); - drop(write_lock); + pending_components.span.in_scope(|| { + debug!( + component = "data_columns", + status = pending_components.status_str(Some(num_expected_columns)), + "Component added to data availability checker" + ); + }); + + self.check_availability_and_cache_components( + block_root, + pending_components, + Some(num_expected_columns), + ) + } + + fn check_availability_and_cache_components( + &self, + block_root: Hash256, + pending_components: MappedRwLockReadGuard<'_, PendingComponents>, + num_expected_columns_opt: Option, + ) -> Result, AvailabilityCheckError> { + if let Some(available_block) = pending_components.make_available( + &self.spec, + num_expected_columns_opt, + |block, span| self.state_cache.recover_pending_executed_block(block, span), + )? { + // Explicitly drop read lock before acquiring write lock + drop(pending_components); + if let Some(components) = self.critical.write().get_mut(&block_root) { + // Clean up span now that block is available + components.span = Span::none(); + } + + // We never remove the pending components manually to avoid race conditions. + // This ensures components remain available during and right after block import, + // preventing a race condition where a component was removed after the block was + // imported, but re-inserted immediately, causing partial pending components to be + // stored and served to peers. + // Components are only removed via LRU eviction as finality advances. Ok(Availability::Available(Box::new(available_block))) } else { - write_lock.put(block_root, pending_components); Ok(Availability::MissingComponents(block_root)) } } + /// Updates or inserts a new `PendingComponents` if it doesn't exist, and then apply the + /// `update_fn` while holding the write lock. + /// + /// Once the update is complete, the write lock is downgraded and a read guard with a + /// reference of the updated `PendingComponents` is returned. 
+ fn update_or_insert_pending_components( + &self, + block_root: Hash256, + epoch: Epoch, + update_fn: F, + ) -> Result>, AvailabilityCheckError> + where + F: FnOnce(&mut PendingComponents) -> Result<(), AvailabilityCheckError>, + { + let mut write_lock = self.critical.write(); + + { + let pending_components = write_lock.get_or_insert_mut(block_root, || { + PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) + }); + update_fn(pending_components)? + } + + RwLockReadGuard::try_map(RwLockWriteGuard::downgrade(write_lock), |cache| { + cache.peek(&block_root) + }) + .map_err(|_| { + AvailabilityCheckError::Unexpected("pending components should exist".to_string()) + }) + } + /// Check whether data column reconstruction should be attempted. /// /// Potentially trigger reconstruction if: @@ -623,7 +634,6 @@ impl DataAvailabilityCheckerInner { &self, executed_block: AvailabilityPendingExecutedBlock, ) -> Result, AvailabilityCheckError> { - let mut write_lock = self.critical.write(); let epoch = executed_block.as_block().epoch(); let block_root = executed_block.import_data.block_root; @@ -632,45 +642,32 @@ impl DataAvailabilityCheckerInner { .state_cache .register_pending_executed_block(executed_block); - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_block(diet_executed_block); + Ok(()) + })?; - // Merge in the block. 
- pending_components.merge_block(diet_executed_block); + let num_expected_columns_opt = if self.spec.is_peer_das_enabled_for_epoch(epoch) { + let num_of_column_samples = self + .custody_context + .num_of_data_columns_to_sample(epoch, &self.spec); + Some(num_of_column_samples) + } else { + None + }; - let num_expected_columns = self - .custody_context - .num_of_data_columns_to_sample(epoch, &self.spec); debug!( component = "block", - ?block_root, - status = pending_components.status_str(epoch, Some(num_expected_columns), &self.spec), + status = pending_components.status_str(num_expected_columns_opt), "Component added to data availability checker" ); - // Check if we have all components and entire set is consistent. - if let Some(available_block) = - pending_components.make_available(&self.spec, num_expected_columns, |block, span| { - self.state_cache.recover_pending_executed_block(block, span) - })? - { - // We keep the pending components in the availability cache during block import (#5845). 
- write_lock.put(block_root, pending_components); - drop(write_lock); - Ok(Availability::Available(Box::new(available_block))) - } else { - write_lock.put(block_root, pending_components); - Ok(Availability::MissingComponents(block_root)) - } - } - - pub fn remove_pending_components(&self, block_root: Hash256) { - self.critical.write().pop_entry(&block_root); + self.check_availability_and_cache_components( + block_root, + pending_components, + num_expected_columns_opt, + ) } /// maintain the cache @@ -958,13 +955,6 @@ mod test { 1, "cache should still have block as it hasn't been imported yet" ); - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert_eq!( - cache.critical.read().len(), - 0, - "cache should be empty now that block has been imported" - ); } else { assert!( matches!(availability, Availability::MissingComponents(_)), @@ -994,12 +984,6 @@ mod test { assert_eq!(cache.critical.read().len(), 1); } } - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert!( - cache.critical.read().is_empty(), - "cache should be empty now that all components available" - ); let (pending_block, blobs) = availability_pending_block(&harness).await; let blobs_expected = pending_block.num_blobs_expected(); @@ -1019,7 +1003,11 @@ mod test { matches!(availability, Availability::MissingComponents(_)), "should be pending block" ); - assert_eq!(cache.critical.read().len(), 1); + assert_eq!( + cache.critical.read().len(), + 2, + "cache should have two blocks now" + ); } let availability = cache .put_pending_executed_block(pending_block) @@ -1030,14 +1018,8 @@ mod test { availability ); assert!( - cache.critical.read().len() == 1, - "cache should still have available block until import" - ); - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert!( - cache.critical.read().is_empty(), - "cache should be empty now that all components available" + 
cache.critical.read().len() == 2, + "cache should still have available block" ); } @@ -1159,14 +1141,6 @@ mod test { states.last(), "recovered state should be the same as the original" ); - // the state should no longer be in the cache - assert!( - state_cache - .read() - .peek(&last_block.as_block().state_root()) - .is_none(), - "last block state should no longer be in cache" - ); } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs index e328bd9b9c..57c236efcf 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs @@ -113,8 +113,9 @@ impl StateLRUCache { diet_executed_block: DietAvailabilityPendingExecutedBlock, _span: &Span, ) -> Result, AvailabilityCheckError> { - let state = if let Some(state) = self.states.write().pop(&diet_executed_block.state_root) { - state + // Keep the state in the cache to prevent reconstruction in race conditions + let state = if let Some(state) = self.states.write().get(&diet_executed_block.state_root) { + state.clone() } else { self.reconstruct_state(&diet_executed_block)? 
}; From a9db8523a2d6f521ef15dbb8ea7078cb34b49fdc Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 2 Sep 2025 19:00:12 -0700 Subject: [PATCH 07/81] Update tracing (#7981) Update tracing subscriber for cargo audit failure https://rustsec.org/advisories/RUSTSEC-2025-0055 --- Cargo.lock | 50 ++++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e5c29b6bff..114b02827e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2152,7 +2152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 2.0.100", + "syn 1.0.109", ] [[package]] @@ -5864,11 +5864,11 @@ checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" [[package]] name = "matchers" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" dependencies = [ - "regex-automata 0.1.10", + "regex-automata", ] [[package]] @@ -6424,12 +6424,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.46.0" +version = "0.50.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" dependencies = [ - "overload", - "winapi", + "windows-sys 0.52.0", ] [[package]] @@ -6753,12 +6752,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.13.2" @@ -7321,7 +7314,7 @@ dependencies = [ "rand 0.8.5", 
"rand_chacha 0.3.1", "rand_xorshift 0.3.0", - "regex-syntax 0.8.5", + "regex-syntax", "rusty-fork", "tempfile", "unarray", @@ -7711,17 +7704,8 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax 0.6.29", + "regex-automata", + "regex-syntax", ] [[package]] @@ -7732,15 +7716,9 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.5", + "regex-syntax", ] -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" version = "0.8.5" @@ -9764,14 +9742,14 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.19" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" dependencies = [ "matchers", "nu-ansi-term", "once_cell", - "regex", + "regex-automata", "serde", "serde_json", "sharded-slab", From 7b5be8b1e74cf0ce71aeda71e017dd426920e955 Mon Sep 17 00:00:00 2001 From: Akihito Nakano Date: Wed, 3 Sep 2025 11:00:15 +0900 Subject: [PATCH 08/81] Remove ttfb_timeout and resp_timeout (#7925) `TTFB_TIMEOUT` was deprecated in https://github.com/ethereum/consensus-specs/pull/3767. Remove `ttfb_timeout` from `InboundUpgrade` and other related structs. 
(Update) Also removed `resp_timeout` and also removed the `NetworkParams` struct since its fields are no longer used. https://github.com/sigp/lighthouse/pull/7925#issuecomment-3226886352 --- Cargo.lock | 11 ------- beacon_node/lighthouse_network/Cargo.toml | 1 - .../lighthouse_network/src/rpc/handler.rs | 17 +++++------ beacon_node/lighthouse_network/src/rpc/mod.rs | 29 ++----------------- .../lighthouse_network/src/rpc/protocol.rs | 9 ++---- .../lighthouse_network/src/service/mod.rs | 10 ++----- 6 files changed, 13 insertions(+), 64 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 114b02827e..4020d9611f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5642,7 +5642,6 @@ dependencies = [ "tempfile", "tiny-keccak", "tokio", - "tokio-io-timeout", "tokio-util", "tracing", "tracing-subscriber", @@ -9432,16 +9431,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-macros" version = "2.5.0" diff --git a/beacon_node/lighthouse_network/Cargo.toml b/beacon_node/lighthouse_network/Cargo.toml index 0b2ca9e818..9963cc0bc4 100644 --- a/beacon_node/lighthouse_network/Cargo.toml +++ b/beacon_node/lighthouse_network/Cargo.toml @@ -45,7 +45,6 @@ superstruct = { workspace = true } task_executor = { workspace = true } tiny-keccak = "2" tokio = { workspace = true } -tokio-io-timeout = "1" tokio-util = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } diff --git a/beacon_node/lighthouse_network/src/rpc/handler.rs b/beacon_node/lighthouse_network/src/rpc/handler.rs index 972d45cdfe..720895bbe7 100644 --- a/beacon_node/lighthouse_network/src/rpc/handler.rs +++ b/beacon_node/lighthouse_network/src/rpc/handler.rs @@ -39,6 +39,9 @@ const SHUTDOWN_TIMEOUT_SECS: u64 = 15; 
/// Maximum number of simultaneous inbound substreams we keep for this peer. const MAX_INBOUND_SUBSTREAMS: usize = 32; +/// Timeout that will be used for inbound and outbound responses. +const RESP_TIMEOUT: Duration = Duration::from_secs(10); + /// Identifier of inbound and outbound substreams from the handler's perspective. #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub struct SubstreamId(usize); @@ -140,9 +143,6 @@ where /// Waker, to be sure the handler gets polled when needed. waker: Option, - - /// Timeout that will be used for inbound and outbound responses. - resp_timeout: Duration, } enum HandlerState { @@ -224,7 +224,6 @@ where pub fn new( listen_protocol: SubstreamProtocol, ()>, fork_context: Arc, - resp_timeout: Duration, peer_id: PeerId, connection_id: ConnectionId, ) -> Self { @@ -246,7 +245,6 @@ where outbound_io_error_retries: 0, fork_context, waker: None, - resp_timeout, } } @@ -542,8 +540,7 @@ where // If this substream has not ended, we reset the timer. // Each chunk is allowed RESPONSE_TIMEOUT to be sent. if let Some(ref delay_key) = info.delay_key { - self.inbound_substreams_delay - .reset(delay_key, self.resp_timeout); + self.inbound_substreams_delay.reset(delay_key, RESP_TIMEOUT); } // The stream may be currently idle. Attempt to process more @@ -712,7 +709,7 @@ where }; substream_entry.max_remaining_chunks = Some(max_remaining_chunks); self.outbound_substreams_delay - .reset(delay_key, self.resp_timeout); + .reset(delay_key, RESP_TIMEOUT); } } @@ -960,7 +957,7 @@ where // Store the stream and tag the output. let delay_key = self .inbound_substreams_delay - .insert(self.current_inbound_substream_id, self.resp_timeout); + .insert(self.current_inbound_substream_id, RESP_TIMEOUT); let awaiting_stream = InboundState::Idle(substream); self.inbound_substreams.insert( self.current_inbound_substream_id, @@ -1036,7 +1033,7 @@ where // new outbound request. Store the stream and tag the output. 
let delay_key = self .outbound_substreams_delay - .insert(self.current_outbound_substream_id, self.resp_timeout); + .insert(self.current_outbound_substream_id, RESP_TIMEOUT); let awaiting_stream = OutboundSubstreamState::RequestPendingResponse { substream: Box::new(substream), request, diff --git a/beacon_node/lighthouse_network/src/rpc/mod.rs b/beacon_node/lighthouse_network/src/rpc/mod.rs index 5e8e55891c..7c43018af8 100644 --- a/beacon_node/lighthouse_network/src/rpc/mod.rs +++ b/beacon_node/lighthouse_network/src/rpc/mod.rs @@ -16,7 +16,6 @@ use std::collections::HashMap; use std::marker::PhantomData; use std::sync::Arc; use std::task::{Context, Poll}; -use std::time::Duration; use tracing::{debug, trace}; use types::{EthSpec, ForkContext}; @@ -143,12 +142,6 @@ pub struct RPCMessage { type BehaviourAction = ToSwarm, RPCSend>; -pub struct NetworkParams { - pub max_payload_size: usize, - pub ttfb_timeout: Duration, - pub resp_timeout: Duration, -} - /// Implements the libp2p `NetworkBehaviour` trait and therefore manages network-level /// logic. pub struct RPC { @@ -162,8 +155,6 @@ pub struct RPC { events: Vec>, fork_context: Arc, enable_light_client_server: bool, - /// Networking constant values - network_params: NetworkParams, /// A sequential counter indicating when data gets modified. 
seq_number: u64, } @@ -174,7 +165,6 @@ impl RPC { enable_light_client_server: bool, inbound_rate_limiter_config: Option, outbound_rate_limiter_config: Option, - network_params: NetworkParams, seq_number: u64, ) -> Self { let response_limiter = inbound_rate_limiter_config.map(|config| { @@ -194,7 +184,6 @@ impl RPC { events: Vec::new(), fork_context, enable_light_client_server, - network_params, seq_number, } } @@ -331,18 +320,11 @@ where max_rpc_size: self.fork_context.spec.max_payload_size as usize, enable_light_client_server: self.enable_light_client_server, phantom: PhantomData, - ttfb_timeout: self.network_params.ttfb_timeout, }, (), ); - let handler = RPCHandler::new( - protocol, - self.fork_context.clone(), - self.network_params.resp_timeout, - peer_id, - connection_id, - ); + let handler = RPCHandler::new(protocol, self.fork_context.clone(), peer_id, connection_id); Ok(handler) } @@ -361,18 +343,11 @@ where max_rpc_size: self.fork_context.spec.max_payload_size as usize, enable_light_client_server: self.enable_light_client_server, phantom: PhantomData, - ttfb_timeout: self.network_params.ttfb_timeout, }, (), ); - let handler = RPCHandler::new( - protocol, - self.fork_context.clone(), - self.network_params.resp_timeout, - peer_id, - connection_id, - ); + let handler = RPCHandler::new(protocol, self.fork_context.clone(), peer_id, connection_id); Ok(handler) } diff --git a/beacon_node/lighthouse_network/src/rpc/protocol.rs b/beacon_node/lighthouse_network/src/rpc/protocol.rs index 6529ff5f92..228a74f08c 100644 --- a/beacon_node/lighthouse_network/src/rpc/protocol.rs +++ b/beacon_node/lighthouse_network/src/rpc/protocol.rs @@ -11,7 +11,6 @@ use std::marker::PhantomData; use std::sync::{Arc, LazyLock}; use std::time::Duration; use strum::{AsRefStr, Display, EnumString, IntoStaticStr}; -use tokio_io_timeout::TimeoutStream; use tokio_util::{ codec::Framed, compat::{Compat, FuturesAsyncReadCompatExt}, @@ -425,7 +424,6 @@ pub struct RPCProtocol { pub max_rpc_size: 
usize, pub enable_light_client_server: bool, pub phantom: PhantomData, - pub ttfb_timeout: Duration, } impl UpgradeInfo for RPCProtocol { @@ -652,7 +650,7 @@ pub fn rpc_data_column_limits( pub type InboundOutput = (RequestType, InboundFramed); pub type InboundFramed = - Framed>>>, SSZSnappyInboundCodec>; + Framed>>, SSZSnappyInboundCodec>; impl InboundUpgrade for RPCProtocol where @@ -676,10 +674,7 @@ where ), }; - let mut timed_socket = TimeoutStream::new(socket); - timed_socket.set_read_timeout(Some(self.ttfb_timeout)); - - let socket = Framed::new(Box::pin(timed_socket), codec); + let socket = Framed::new(Box::pin(socket), codec); // MetaData requests should be empty, return the stream match versioned_protocol { diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index eebc2f0200..9edb70555d 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -12,8 +12,8 @@ use crate::peer_manager::{ use crate::peer_manager::{MIN_OUTBOUND_ONLY_FACTOR, PEER_EXCESS_FACTOR, PRIORITY_PEER_EXCESS}; use crate::rpc::methods::MetadataRequest; use crate::rpc::{ - GoodbyeReason, HandlerErr, InboundRequestId, NetworkParams, Protocol, RPC, RPCError, - RPCMessage, RPCReceived, RequestType, ResponseTermination, RpcResponse, RpcSuccessResponse, + GoodbyeReason, HandlerErr, InboundRequestId, Protocol, RPC, RPCError, RPCMessage, RPCReceived, + RequestType, ResponseTermination, RpcResponse, RpcSuccessResponse, }; use crate::types::{ GossipEncoding, GossipKind, GossipTopic, SnappyTransform, Subnet, SubnetDiscovery, @@ -367,17 +367,11 @@ impl Network { (gossipsub, update_gossipsub_scores) }; - let network_params = NetworkParams { - max_payload_size: ctx.chain_spec.max_payload_size as usize, - ttfb_timeout: ctx.chain_spec.ttfb_timeout(), - resp_timeout: ctx.chain_spec.resp_timeout(), - }; let eth2_rpc = RPC::new( ctx.fork_context.clone(), 
config.enable_light_client_server, config.inbound_rate_limiter_config.clone(), config.outbound_rate_limiter_config.clone(), - network_params, seq_number, ); From a93cafee08a1905b92de6a5d8ad5ac65f365db37 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Wed, 3 Sep 2025 11:50:41 +0800 Subject: [PATCH 09/81] Implement `selections` Beacon API endpoints to support DVT middleware (#7016) * #6610 - [x] Add `beacon_committee_selections` endpoint - [x] Test beacon committee aggregator and confirmed working - [x] Add `sync_committee_selections` endpoint - [x] Test sync committee aggregator and confirmed working --- common/eth2/src/lib.rs | 44 ++ common/eth2/src/types.rs | 17 + consensus/types/src/selection_proof.rs | 4 +- consensus/types/src/sync_selection_proof.rs | 4 +- validator_client/src/lib.rs | 55 ++- .../validator_services/src/duties_service.rs | 376 +++++++++++------ .../validator_services/src/sync.rs | 394 ++++++++++++------ validator_client/validator_store/src/lib.rs | 1 + 8 files changed, 658 insertions(+), 237 deletions(-) diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index 9709b0631f..bbc38e31d6 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -55,11 +55,13 @@ pub const JSON_CONTENT_TYPE_HEADER: &str = "application/json"; const HTTP_ATTESTATION_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_ATTESTER_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_ATTESTATION_SUBSCRIPTIONS_TIMEOUT_QUOTIENT: u32 = 24; +const HTTP_ATTESTATION_AGGREGATOR_TIMEOUT_QUOTIENT: u32 = 24; // For DVT involving middleware only const HTTP_LIVENESS_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_PROPOSAL_TIMEOUT_QUOTIENT: u32 = 2; const HTTP_PROPOSER_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_SYNC_COMMITTEE_CONTRIBUTION_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_SYNC_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; +const HTTP_SYNC_AGGREGATOR_TIMEOUT_QUOTIENT: u32 = 24; // For DVT involving middleware only const HTTP_GET_BEACON_BLOCK_SSZ_TIMEOUT_QUOTIENT: u32 
= 4; const HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT: u32 = 4; const HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT: u32 = 4; @@ -150,11 +152,13 @@ pub struct Timeouts { pub attestation: Duration, pub attester_duties: Duration, pub attestation_subscriptions: Duration, + pub attestation_aggregators: Duration, pub liveness: Duration, pub proposal: Duration, pub proposer_duties: Duration, pub sync_committee_contribution: Duration, pub sync_duties: Duration, + pub sync_aggregators: Duration, pub get_beacon_blocks_ssz: Duration, pub get_debug_beacon_states: Duration, pub get_deposit_snapshot: Duration, @@ -168,11 +172,13 @@ impl Timeouts { attestation: timeout, attester_duties: timeout, attestation_subscriptions: timeout, + attestation_aggregators: timeout, liveness: timeout, proposal: timeout, proposer_duties: timeout, sync_committee_contribution: timeout, sync_duties: timeout, + sync_aggregators: timeout, get_beacon_blocks_ssz: timeout, get_debug_beacon_states: timeout, get_deposit_snapshot: timeout, @@ -187,12 +193,14 @@ impl Timeouts { attester_duties: base_timeout / HTTP_ATTESTER_DUTIES_TIMEOUT_QUOTIENT, attestation_subscriptions: base_timeout / HTTP_ATTESTATION_SUBSCRIPTIONS_TIMEOUT_QUOTIENT, + attestation_aggregators: base_timeout / HTTP_ATTESTATION_AGGREGATOR_TIMEOUT_QUOTIENT, liveness: base_timeout / HTTP_LIVENESS_TIMEOUT_QUOTIENT, proposal: base_timeout / HTTP_PROPOSAL_TIMEOUT_QUOTIENT, proposer_duties: base_timeout / HTTP_PROPOSER_DUTIES_TIMEOUT_QUOTIENT, sync_committee_contribution: base_timeout / HTTP_SYNC_COMMITTEE_CONTRIBUTION_TIMEOUT_QUOTIENT, sync_duties: base_timeout / HTTP_SYNC_DUTIES_TIMEOUT_QUOTIENT, + sync_aggregators: base_timeout / HTTP_SYNC_AGGREGATOR_TIMEOUT_QUOTIENT, get_beacon_blocks_ssz: base_timeout / HTTP_GET_BEACON_BLOCK_SSZ_TIMEOUT_QUOTIENT, get_debug_beacon_states: base_timeout / HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT, get_deposit_snapshot: base_timeout / HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT, @@ -2841,6 +2849,42 @@ impl BeaconNodeHttpClient { ) .await } + + 
/// `POST validator/beacon_committee_selections` + pub async fn post_validator_beacon_committee_selections( + &self, + selections: &[BeaconCommitteeSelection], + ) -> Result>, Error> { + let mut path = self.eth_path(V1)?; + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("validator") + .push("beacon_committee_selections"); + + self.post_with_timeout_and_response( + path, + &selections, + self.timeouts.attestation_aggregators, + ) + .await + } + + /// `POST validator/sync_committee_selections` + pub async fn post_validator_sync_committee_selections( + &self, + selections: &[SyncCommitteeSelection], + ) -> Result>, Error> { + let mut path = self.eth_path(V1)?; + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("validator") + .push("sync_committee_selections"); + + self.post_with_timeout_and_response(path, &selections, self.timeouts.sync_aggregators) + .await + } } /// Returns `Ok(response)` if the response is a `200 OK` response. 
Otherwise, creates an diff --git a/common/eth2/src/types.rs b/common/eth2/src/types.rs index 169551e35b..b72ab29380 100644 --- a/common/eth2/src/types.rs +++ b/common/eth2/src/types.rs @@ -967,6 +967,23 @@ pub struct PeerCount { pub disconnecting: u64, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct BeaconCommitteeSelection { + #[serde(with = "serde_utils::quoted_u64")] + pub validator_index: u64, + pub slot: Slot, + pub selection_proof: Signature, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SyncCommitteeSelection { + #[serde(with = "serde_utils::quoted_u64")] + pub validator_index: u64, + pub slot: Slot, + #[serde(with = "serde_utils::quoted_u64")] + pub subcommittee_index: u64, + pub selection_proof: Signature, +} // --------- Server Sent Event Types ----------- #[derive(PartialEq, Debug, Serialize, Deserialize, Clone)] diff --git a/consensus/types/src/selection_proof.rs b/consensus/types/src/selection_proof.rs index e471457c25..aa8c0c5658 100644 --- a/consensus/types/src/selection_proof.rs +++ b/consensus/types/src/selection_proof.rs @@ -3,11 +3,13 @@ use crate::{ }; use ethereum_hashing::hash; use safe_arith::{ArithError, SafeArith}; +use serde::{Deserialize, Serialize}; use ssz::Encode; use std::cmp; #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -#[derive(PartialEq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[serde(transparent)] pub struct SelectionProof(Signature); impl SelectionProof { diff --git a/consensus/types/src/sync_selection_proof.rs b/consensus/types/src/sync_selection_proof.rs index 6387212d94..b1e9e8186f 100644 --- a/consensus/types/src/sync_selection_proof.rs +++ b/consensus/types/src/sync_selection_proof.rs @@ -7,12 +7,14 @@ use crate::{ }; use ethereum_hashing::hash; use safe_arith::{ArithError, SafeArith}; +use serde::{Deserialize, Serialize}; use ssz::Encode; use ssz_types::typenum::Unsigned; use std::cmp; #[cfg_attr(feature = 
"arbitrary", derive(arbitrary::Arbitrary))] -#[derive(PartialEq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[serde(transparent)] pub struct SyncSelectionProof(Signature); impl SyncSelectionProof { diff --git a/validator_client/src/lib.rs b/validator_client/src/lib.rs index 5b396ccaf5..71bdde10b0 100644 --- a/validator_client/src/lib.rs +++ b/validator_client/src/lib.rs @@ -2,6 +2,7 @@ pub mod cli; pub mod config; use crate::cli::ValidatorClient; +use crate::duties_service::SelectionProofConfig; pub use config::Config; use initialized_validators::InitializedValidators; use metrics::set_gauge; @@ -55,6 +56,22 @@ const WAITING_FOR_GENESIS_POLL_TIME: Duration = Duration::from_secs(12); const DOPPELGANGER_SERVICE_NAME: &str = "doppelganger"; +/// Compute attestation selection proofs this many slots before they are required. +/// +/// At start-up selection proofs will be computed with less lookahead out of necessity. +const SELECTION_PROOF_SLOT_LOOKAHEAD: u64 = 8; + +/// The attestation selection proof lookahead for those running with the --distributed flag. +const SELECTION_PROOF_SLOT_LOOKAHEAD_DVT: u64 = 1; + +/// Fraction of a slot at which attestation selection proof signing should happen (2 means half way). +const SELECTION_PROOF_SCHEDULE_DENOM: u32 = 2; + +/// Number of epochs in advance to compute sync selection proofs when not in `distributed` mode. +pub const AGGREGATION_PRE_COMPUTE_EPOCHS: u64 = 2; +/// Number of slots in advance to compute sync selection proofs when in `distributed` mode. +pub const AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED: u64 = 1; + type ValidatorStore = LighthouseValidatorStore; #[derive(Clone)] @@ -407,6 +424,41 @@ impl ProductionValidatorClient { validator_store.prune_slashing_protection_db(slot.epoch(E::slots_per_epoch()), true); } + // Define a config to be pass to duties_service. 
+ // The defined config here defaults to using selections_endpoint and parallel_sign (i.e., distributed mode) + // Other DVT applications, e.g., Anchor can pass in different configs to suit different needs. + let attestation_selection_proof_config = if config.distributed { + SelectionProofConfig { + lookahead_slot: SELECTION_PROOF_SLOT_LOOKAHEAD_DVT, + computation_offset: slot_clock.slot_duration() / SELECTION_PROOF_SCHEDULE_DENOM, + selections_endpoint: true, + parallel_sign: true, + } + } else { + SelectionProofConfig { + lookahead_slot: SELECTION_PROOF_SLOT_LOOKAHEAD, + computation_offset: slot_clock.slot_duration() / SELECTION_PROOF_SCHEDULE_DENOM, + selections_endpoint: false, + parallel_sign: false, + } + }; + + let sync_selection_proof_config = if config.distributed { + SelectionProofConfig { + lookahead_slot: AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED, + computation_offset: Duration::default(), + selections_endpoint: true, + parallel_sign: true, + } + } else { + SelectionProofConfig { + lookahead_slot: E::slots_per_epoch() * AGGREGATION_PRE_COMPUTE_EPOCHS, + computation_offset: Duration::default(), + selections_endpoint: false, + parallel_sign: false, + } + }; + let duties_service = Arc::new( DutiesServiceBuilder::new() .slot_clock(slot_clock.clone()) @@ -415,7 +467,8 @@ impl ProductionValidatorClient { .spec(context.eth2_config.spec.clone()) .executor(context.executor.clone()) .enable_high_validator_count_metrics(config.enable_high_validator_count_metrics) - .distributed(config.distributed) + .attestation_selection_proof_config(attestation_selection_proof_config) + .sync_selection_proof_config(sync_selection_proof_config) .disable_attesting(config.disable_attesting) .build()?, ); diff --git a/validator_client/validator_services/src/duties_service.rs b/validator_client/validator_services/src/duties_service.rs index 009537bc43..7569d3946a 100644 --- a/validator_client/validator_services/src/duties_service.rs +++ 
b/validator_client/validator_services/src/duties_service.rs @@ -11,10 +11,14 @@ use crate::sync::SyncDutiesMap; use crate::sync::poll_sync_committee_duties; use beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; use eth2::types::{ - AttesterData, BeaconCommitteeSubscription, DutiesResponse, ProposerData, StateId, ValidatorId, + AttesterData, BeaconCommitteeSelection, BeaconCommitteeSubscription, DutiesResponse, + ProposerData, StateId, ValidatorId, }; -use futures::{StreamExt, stream}; -use parking_lot::RwLock; +use futures::{ + StreamExt, + stream::{self, FuturesUnordered}, +}; +use parking_lot::{RwLock, RwLockWriteGuard}; use safe_arith::{ArithError, SafeArith}; use slot_clock::SlotClock; use std::cmp::min; @@ -32,17 +36,6 @@ use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, Validato /// Only retain `HISTORICAL_DUTIES_EPOCHS` duties prior to the current epoch. const HISTORICAL_DUTIES_EPOCHS: u64 = 2; -/// Compute attestation selection proofs this many slots before they are required. -/// -/// At start-up selection proofs will be computed with less lookahead out of necessity. -const SELECTION_PROOF_SLOT_LOOKAHEAD: u64 = 8; - -/// The attestation selection proof lookahead for those running with the --distributed flag. -const SELECTION_PROOF_SLOT_LOOKAHEAD_DVT: u64 = 1; - -/// Fraction of a slot at which selection proof signing should happen (2 means half way). -const SELECTION_PROOF_SCHEDULE_DENOM: u32 = 2; - /// Minimum number of validators for which we auto-enable per-validator metrics. /// For validators greater than this value, we need to manually set the `enable-per-validator-metrics` /// flag in the cli to enable collection of per validator metrics. @@ -121,18 +114,97 @@ pub struct SubscriptionSlots { duty_slot: Slot, } +#[derive(Copy, Clone, Debug)] +pub struct SelectionProofConfig { + pub lookahead_slot: u64, + /// The seconds to compute the selection proof before a slot. 
+ pub computation_offset: Duration, + /// Whether to call the selections endpoint, true for DVT with middleware. + pub selections_endpoint: bool, + /// Whether to sign the selection proof in parallel, true in distributed mode. + pub parallel_sign: bool, +} + +/// The default config for selection proofs covers the non-DVT case. +impl Default for SelectionProofConfig { + fn default() -> Self { + Self { + lookahead_slot: 0, + computation_offset: Duration::default(), + selections_endpoint: false, + parallel_sign: false, + } + } +} + /// Create a selection proof for `duty`. /// /// Return `Ok(None)` if the attesting validator is not an aggregator. -async fn make_selection_proof( +async fn make_selection_proof( duty: &AttesterData, validator_store: &S, spec: &ChainSpec, + beacon_nodes: &Arc>, + config: &SelectionProofConfig, ) -> Result, Error> { - let selection_proof = validator_store - .produce_selection_proof(duty.pubkey, duty.slot) - .await - .map_err(Error::FailedToProduceSelectionProof)?; + let selection_proof = if config.selections_endpoint { + let beacon_committee_selection = BeaconCommitteeSelection { + validator_index: duty.validator_index, + slot: duty.slot, + // This is partial selection proof + selection_proof: validator_store + .produce_selection_proof(duty.pubkey, duty.slot) + .await + .map_err(Error::FailedToProduceSelectionProof)? 
+ .into(), + }; + // Call the endpoint /eth/v1/validator/beacon_committee_selections + // by sending the BeaconCommitteeSelection that contains partial selection proof + // The middleware should return BeaconCommitteeSelection that contains full selection proof + let middleware_response = beacon_nodes + .first_success(|beacon_node| { + let selection_data = beacon_committee_selection.clone(); + debug!( + "validator_index" = duty.validator_index, + "slot" = %duty.slot, + "partial selection proof" = ?beacon_committee_selection.selection_proof, + "Sending selection to middleware" + ); + async move { + beacon_node + .post_validator_beacon_committee_selections(&[selection_data]) + .await + } + }) + .await; + + let response_data = middleware_response + .map_err(|e| { + Error::FailedToProduceSelectionProof(ValidatorStoreError::Middleware(e.to_string())) + })? + .data + .pop() + .ok_or_else(|| { + Error::FailedToProduceSelectionProof(ValidatorStoreError::Middleware(format!( + "attestation selection proof - empty response for validator {}", + duty.validator_index + ))) + })?; + + debug!( + "validator_index" = response_data.validator_index, + "slot" = %response_data.slot, + // The selection proof from middleware response will be a full selection proof + "full selection proof" = ?response_data.selection_proof, + "Received selection from middleware" + ); + SelectionProof::from(response_data.selection_proof) + } else { + validator_store + .produce_selection_proof(duty.pubkey, duty.slot) + .await + .map_err(Error::FailedToProduceSelectionProof)? + }; selection_proof .is_aggregator(duty.committee_length as usize, spec) @@ -217,8 +289,10 @@ pub struct DutiesServiceBuilder { spec: Option>, //// Whether we permit large validator counts in the metrics. enable_high_validator_count_metrics: bool, - /// If this validator is running in distributed mode. 
- distributed: bool, + /// Create attestation selection proof config + attestation_selection_proof_config: SelectionProofConfig, + /// Create sync selection proof config + sync_selection_proof_config: SelectionProofConfig, disable_attesting: bool, } @@ -237,7 +311,8 @@ impl DutiesServiceBuilder { executor: None, spec: None, enable_high_validator_count_metrics: false, - distributed: false, + attestation_selection_proof_config: SelectionProofConfig::default(), + sync_selection_proof_config: SelectionProofConfig::default(), disable_attesting: false, } } @@ -275,8 +350,19 @@ impl DutiesServiceBuilder { self } - pub fn distributed(mut self, distributed: bool) -> Self { - self.distributed = distributed; + pub fn attestation_selection_proof_config( + mut self, + attestation_selection_proof_config: SelectionProofConfig, + ) -> Self { + self.attestation_selection_proof_config = attestation_selection_proof_config; + self + } + + pub fn sync_selection_proof_config( + mut self, + sync_selection_proof_config: SelectionProofConfig, + ) -> Self { + self.sync_selection_proof_config = sync_selection_proof_config; self } @@ -289,7 +375,7 @@ impl DutiesServiceBuilder { Ok(DutiesService { attesters: Default::default(), proposers: Default::default(), - sync_duties: SyncDutiesMap::new(self.distributed), + sync_duties: SyncDutiesMap::new(self.sync_selection_proof_config), validator_store: self .validator_store .ok_or("Cannot build DutiesService without validator_store")?, @@ -305,7 +391,7 @@ impl DutiesServiceBuilder { .ok_or("Cannot build DutiesService without executor")?, spec: self.spec.ok_or("Cannot build DutiesService without spec")?, enable_high_validator_count_metrics: self.enable_high_validator_count_metrics, - distributed: self.distributed, + selection_proof_config: self.attestation_selection_proof_config, disable_attesting: self.disable_attesting, }) } @@ -332,10 +418,10 @@ pub struct DutiesService { pub executor: TaskExecutor, /// The current chain spec. 
pub spec: Arc, - //// Whether we permit large validator counts in the metrics. + /// Whether we permit large validator counts in the metrics. pub enable_high_validator_count_metrics: bool, - /// If this validator is running in distributed mode. - pub distributed: bool, + /// Pass the config for distributed or non-distributed mode. + pub selection_proof_config: SelectionProofConfig, pub disable_attesting: bool, } @@ -1119,6 +1205,75 @@ async fn post_validator_duties_attester( + attesters: &mut RwLockWriteGuard, + result: Result<(AttesterData, Option), Error>, + dependent_root: Hash256, + current_slot: Slot, +) -> bool { + let (duty, selection_proof) = match result { + Ok(duty_and_proof) => duty_and_proof, + Err(Error::FailedToProduceSelectionProof(ValidatorStoreError::UnknownPubkey(pubkey))) => { + // A pubkey can be missing when a validator was recently removed via the API. + warn!( + info = "A validator may have recently been removed from this VC", + ?pubkey, + "Missing pubkey for duty and proof" + ); + // Do not abort the entire batch for a single failure. + // return true means continue processing duties. + return true; + } + Err(e) => { + error!( + error = ?e, + msg = "may impair attestation duties", + "Failed to produce duty and proof" + ); + return true; + } + }; + + let attester_map = attesters.entry(duty.pubkey).or_default(); + let epoch = duty.slot.epoch(S::E::slots_per_epoch()); + match attester_map.entry(epoch) { + hash_map::Entry::Occupied(mut entry) => { + // No need to update duties for which no proof was computed. + let Some(selection_proof) = selection_proof else { + return true; + }; + + let (existing_dependent_root, existing_duty) = entry.get_mut(); + + if *existing_dependent_root == dependent_root { + // Replace existing proof. + existing_duty.selection_proof = Some(selection_proof); + true + } else { + // Our selection proofs are no longer relevant due to a reorg, abandon this entire background process. 
+ debug!( + reason = "re-org", + "Stopping selection proof background task" + ); + false + } + } + + hash_map::Entry::Vacant(entry) => { + // This probably shouldn't happen, but we have enough info to fill in the entry so we may as well. + let subscription_slots = SubscriptionSlots::new(duty.slot, current_slot); + let duty_and_proof = DutyAndProof { + duty, + selection_proof, + subscription_slots, + }; + entry.insert((dependent_root, duty_and_proof)); + true + } + } +} + /// Compute the attestation selection proofs for the `duties` and add them to the `attesters` map. /// /// Duties are computed in batches each slot. If a re-org is detected then the process will @@ -1138,26 +1293,33 @@ async fn fill_in_selection_proofs(); @@ -1170,87 +1332,69 @@ async fn fill_in_selection_proofs>() - .await; + // In distributed case, we want to send all partial selection proofs to the middleware to determine aggregation duties, + // as the middleware will need to have a threshold of partial selection proofs to be able to return the full selection proof + // Thus, sign selection proofs in parallel in distributed case; Otherwise, sign them serially in non-distributed (normal) case + if duties_service.selection_proof_config.parallel_sign { + let mut duty_and_proof_results = relevant_duties + .into_values() + .flatten() + .map(|duty| async { + let opt_selection_proof = make_selection_proof( + &duty, + duties_service.validator_store.as_ref(), + &duties_service.spec, + &duties_service.beacon_nodes, + &duties_service.selection_proof_config, + ) + .await?; + Ok((duty, opt_selection_proof)) + }) + .collect::>(); - // Add to attesters store. 
- let mut attesters = duties_service.attesters.write(); - for result in duty_and_proof_results { - let (duty, selection_proof) = match result { - Ok(duty_and_proof) => duty_and_proof, - Err(Error::FailedToProduceSelectionProof( - ValidatorStoreError::UnknownPubkey(pubkey), - )) => { - // A pubkey can be missing when a validator was recently - // removed via the API. - warn!( - info = "a validator may have recently been removed from this VC", - ?pubkey, - "Missing pubkey for duty and proof" - ); - // Do not abort the entire batch for a single failure. - continue; - } - Err(e) => { - error!( - error = ?e, - msg = "may impair attestation duties", - "Failed to produce duty and proof" - ); - // Do not abort the entire batch for a single failure. - continue; - } - }; - - let attester_map = attesters.entry(duty.pubkey).or_default(); - let epoch = duty.slot.epoch(S::E::slots_per_epoch()); - match attester_map.entry(epoch) { - hash_map::Entry::Occupied(mut entry) => { - // No need to update duties for which no proof was computed. - let Some(selection_proof) = selection_proof else { - continue; - }; - - let (existing_dependent_root, existing_duty) = entry.get_mut(); - - if *existing_dependent_root == dependent_root { - // Replace existing proof. - existing_duty.selection_proof = Some(selection_proof); - } else { - // Our selection proofs are no longer relevant due to a reorg, abandon - // this entire background process. - debug!( - reason = "re-org", - "Stopping selection proof background task" - ); - return; - } - } - hash_map::Entry::Vacant(entry) => { - // This probably shouldn't happen, but we have enough info to fill in the - // entry so we may as well. 
- let subscription_slots = SubscriptionSlots::new(duty.slot, current_slot); - let duty_and_proof = DutyAndProof { - duty, - selection_proof, - subscription_slots, - }; - entry.insert((dependent_root, duty_and_proof)); + while let Some(result) = duty_and_proof_results.next().await { + let mut attesters = duties_service.attesters.write(); + // if process_duty_and_proof returns false, exit the loop + if !process_duty_and_proof::( + &mut attesters, + result, + dependent_root, + current_slot, + ) { + return; } } - } - drop(attesters); + } else { + // In normal (non-distributed case), sign selection proofs serially + let duty_and_proof_results = stream::iter(relevant_duties.into_values().flatten()) + .then(|duty| async { + let opt_selection_proof = make_selection_proof( + &duty, + duties_service.validator_store.as_ref(), + &duties_service.spec, + &duties_service.beacon_nodes, + &duties_service.selection_proof_config, + ) + .await?; + Ok((duty, opt_selection_proof)) + }) + .collect::>() + .await; + + // Add to attesters store. 
+ let mut attesters = duties_service.attesters.write(); + for result in duty_and_proof_results { + if !process_duty_and_proof::( + &mut attesters, + result, + dependent_root, + current_slot, + ) { + return; + } + } + drop(attesters); + }; let time_taken_ms = Duration::from_secs_f64(timer.map_or(0.0, |t| t.stop_and_record())).as_millis(); diff --git a/validator_client/validator_services/src/sync.rs b/validator_client/validator_services/src/sync.rs index 328308d514..77032ed15b 100644 --- a/validator_client/validator_services/src/sync.rs +++ b/validator_client/validator_services/src/sync.rs @@ -1,19 +1,16 @@ -use crate::duties_service::{DutiesService, Error}; +use crate::duties_service::{DutiesService, Error, SelectionProofConfig}; +use eth2::types::SyncCommitteeSelection; use futures::future::join_all; +use futures::stream::{FuturesUnordered, StreamExt}; use logging::crit; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use types::{ChainSpec, EthSpec, PublicKeyBytes, Slot, SyncDuty, SyncSelectionProof, SyncSubnetId}; use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, ValidatorStore}; -/// Number of epochs in advance to compute selection proofs when not in `distributed` mode. -pub const AGGREGATION_PRE_COMPUTE_EPOCHS: u64 = 2; -/// Number of slots in advance to compute selection proofs when in `distributed` mode. -pub const AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED: u64 = 1; - /// Top-level data-structure containing sync duty information. /// /// This data is structured as a series of nested `HashMap`s wrapped in `RwLock`s. Fine-grained @@ -30,7 +27,7 @@ pub struct SyncDutiesMap { /// Map from sync committee period to duties for members of that sync committee. 
committees: RwLock>, /// Whether we are in `distributed` mode and using reduced lookahead for aggregate pre-compute. - distributed: bool, + pub selection_proof_config: SelectionProofConfig, } /// Duties for a single sync committee period. @@ -79,10 +76,10 @@ pub struct SlotDuties { } impl SyncDutiesMap { - pub fn new(distributed: bool) -> Self { + pub fn new(selection_proof_config: SelectionProofConfig) -> Self { Self { committees: RwLock::new(HashMap::new()), - distributed, + selection_proof_config, } } @@ -99,15 +96,6 @@ impl SyncDutiesMap { }) } - /// Number of slots in advance to compute selection proofs - fn aggregation_pre_compute_slots(&self) -> u64 { - if self.distributed { - AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED - } else { - E::slots_per_epoch() * AGGREGATION_PRE_COMPUTE_EPOCHS - } - } - /// Prepare for pre-computation of selection proofs for `committee_period`. /// /// Return the slot up to which proofs should be pre-computed, as well as a vec of @@ -123,7 +111,7 @@ impl SyncDutiesMap { current_slot, first_slot_of_period::(committee_period, spec), ); - let pre_compute_lookahead_slots = self.aggregation_pre_compute_slots::(); + let pre_compute_lookahead_slots = self.selection_proof_config.lookahead_slot; let pre_compute_slot = std::cmp::min( current_slot + pre_compute_lookahead_slots, last_slot_of_period::(committee_period, spec), @@ -377,7 +365,7 @@ pub async fn poll_sync_committee_duties(); + let aggregate_pre_compute_lookahead_slots = sync_duties.selection_proof_config.lookahead_slot; if (current_slot + aggregate_pre_compute_lookahead_slots) .epoch(S::E::slots_per_epoch()) .sync_committee_period(spec)? 
@@ -498,6 +486,114 @@ pub async fn poll_sync_committee_duties_for_period( + duties_service: &Arc>, + duty: &SyncDuty, + proof_slot: Slot, + subnet_id: SyncSubnetId, +) -> Option { + let sync_selection_proof = duties_service + .validator_store + .produce_sync_selection_proof(&duty.pubkey, proof_slot, subnet_id) + .await; + + let selection_proof = match sync_selection_proof { + Ok(proof) => proof, + Err(ValidatorStoreError::UnknownPubkey(pubkey)) => { + // A pubkey can be missing when a validator was recently removed via the API + debug!( + ?pubkey, + "slot" = %proof_slot, + "Missing pubkey for sync selection proof"); + return None; + } + Err(e) => { + warn!( + "error" = ?e, + "pubkey" = ?duty.pubkey, + "slot" = %proof_slot, + "Unable to sign selection proof" + ); + return None; + } + }; + + // In DVT with middleware, when we want to call the selections endpoint + if duties_service + .sync_duties + .selection_proof_config + .selections_endpoint + { + debug!( + "validator_index" = duty.validator_index, + "slot" = %proof_slot, + "subcommittee_index" = *subnet_id, + // This is partial selection proof + "partial selection proof" = ?selection_proof, + "Sending sync selection to middleware" + ); + + let sync_committee_selection = SyncCommitteeSelection { + validator_index: duty.validator_index, + slot: proof_slot, + subcommittee_index: *subnet_id, + selection_proof: selection_proof.clone().into(), + }; + + // Call the endpoint /eth/v1/validator/sync_committee_selections + // by sending the SyncCommitteeSelection that contains partial sync selection proof + // The middleware should return SyncCommitteeSelection that contains full sync selection proof + let middleware_response = duties_service + .beacon_nodes + .first_success(|beacon_node| { + let selection_data = sync_committee_selection.clone(); + async move { + beacon_node + .post_validator_sync_committee_selections(&[selection_data]) + .await + } + }) + .await; + + match middleware_response { + Ok(mut response) => { + 
let Some(response_data) = response.data.pop() else { + error!( + validator_index = duty.validator_index, + slot = %proof_slot, + "Empty response from sync selection middleware", + ); + return None; + }; + debug!( + "validator_index" = response_data.validator_index, + "slot" = %response_data.slot, + "subcommittee_index" = response_data.subcommittee_index, + // The selection proof from middleware response will be a full selection proof + "full selection proof" = ?response_data.selection_proof, + "Received sync selection from middleware" + ); + + // Convert the response to a SyncSelectionProof + let full_selection_proof = SyncSelectionProof::from(response_data.selection_proof); + Some(full_selection_proof) + } + Err(e) => { + error!( + "error" = %e, + %proof_slot, + "Failed to get sync selection proofs from middleware" + ); + None + } + } + } else { + // In non-distributed mode, the selection_proof is already a full selection proof + Some(selection_proof) + } +} + pub async fn fill_in_aggregation_proofs( duties_service: Arc>, pre_compute_duties: &[(Slot, SyncDuty)], @@ -505,131 +601,193 @@ pub async fn fill_in_aggregation_proofs() { - Ok(subnet_ids) => subnet_ids, - Err(e) => { - crit!( - error = ?e, - "Arithmetic error computing subnet IDs" - ); - continue; - } - }; - - // Create futures to produce proofs. - let duties_service_ref = &duties_service; - let futures = subnet_ids.iter().map(|subnet_id| async move { - // Construct proof for prior slot. - let proof_slot = slot - 1; - - let proof = match duties_service_ref - .validator_store - .produce_sync_selection_proof(&duty.pubkey, proof_slot, *subnet_id) - .await - { - Ok(proof) => proof, - Err(ValidatorStoreError::UnknownPubkey(pubkey)) => { - // A pubkey can be missing when a validator was recently - // removed via the API. 
- debug!( - ?pubkey, - pubkey = ?duty.pubkey, - slot = %proof_slot, - "Missing pubkey for sync selection proof" - ); - return None; - } + for (_, duty) in pre_compute_duties { + let subnet_ids = match duty.subnet_ids::() { + Ok(subnet_ids) => subnet_ids, Err(e) => { - warn!( - error = ?e, - pubkey = ?duty.pubkey, - slot = %proof_slot, - "Unable to sign selection proof" + crit!( + "error" = ?e, + "Arithmetic error computing subnet IDs" ); - return None; + continue; } }; + // Construct proof for prior slot. + let proof_slot = slot - 1; + + // Calling the make_sync_selection_proof will return a full selection proof + for &subnet_id in &subnet_ids { + let duties_service = duties_service.clone(); + futures_unordered.push(async move { + let result = + make_sync_selection_proof(&duties_service, duty, proof_slot, subnet_id) + .await; + + result.map(|proof| (duty.validator_index, proof_slot, subnet_id, proof)) + }); + } + } + + while let Some(result) = futures_unordered.next().await { + let Some((validator_index, proof_slot, subnet_id, proof)) = result else { + continue; + }; + let sync_map = duties_service.sync_duties.committees.read(); + let Some(committee_duties) = sync_map.get(&sync_committee_period) else { + debug!("period" = sync_committee_period, "Missing sync duties"); + continue; + }; + + let validators = committee_duties.validators.read(); + + // Check if the validator is an aggregator match proof.is_aggregator::() { Ok(true) => { - debug!( - validator_index = duty.validator_index, - slot = %proof_slot, - %subnet_id, - "Validator is sync aggregator" - ); - Some(((proof_slot, *subnet_id), proof)) + if let Some(Some(duty)) = validators.get(&validator_index) { + debug!( + validator_index, + "slot" = %proof_slot, + "subcommittee_index" = *subnet_id, + // log full selection proof for debugging + "full selection proof" = ?proof, + "Validator is sync aggregator" + ); + + // Store the proof + duty.aggregation_duties + .proofs + .write() + .insert((proof_slot, subnet_id), 
proof); + } } - Ok(false) => None, + Ok(false) => {} // Not an aggregator Err(e) => { warn!( - pubkey = ?duty.pubkey, - slot = %proof_slot, - error = ?e, + validator_index, + %slot, + "error" = ?e, "Error determining is_aggregator" ); - None } } - }); + } + } else { + // For non-distributed mode + debug!( + period = sync_committee_period, + %current_slot, + %pre_compute_slot, + "Calculating sync selection proofs" + ); - // Execute all the futures in parallel, collecting any successful results. - let proofs = join_all(futures) - .await - .into_iter() - .flatten() - .collect::>(); + let mut validator_proofs = vec![]; + for (validator_start_slot, duty) in pre_compute_duties { + // Proofs are already known at this slot for this validator. + if slot < *validator_start_slot { + continue; + } - validator_proofs.push((duty.validator_index, proofs)); - } + let subnet_ids = match duty.subnet_ids::() { + Ok(subnet_ids) => subnet_ids, + Err(e) => { + crit!( + error = ?e, + "Arithmetic error computing subnet IDs" + ); + continue; + } + }; - // Add to global storage (we add regularly so the proofs can be used ASAP). - let sync_map = duties_service.sync_duties.committees.read(); - let Some(committee_duties) = sync_map.get(&sync_committee_period) else { - debug!(period = sync_committee_period, "Missing sync duties"); - continue; - }; - let validators = committee_duties.validators.read(); - let num_validators_updated = validator_proofs.len(); + // Create futures to produce proofs. + let duties_service_ref = &duties_service; + let futures = subnet_ids.iter().map(|subnet_id| async move { + // Construct proof for prior slot. 
+ let proof_slot = slot - 1; - for (validator_index, proofs) in validator_proofs { - if let Some(Some(duty)) = validators.get(&validator_index) { - duty.aggregation_duties.proofs.write().extend(proofs); - } else { + let proof = + make_sync_selection_proof(duties_service_ref, duty, proof_slot, *subnet_id) + .await; + + match proof { + Some(proof) => match proof.is_aggregator::() { + Ok(true) => { + debug!( + validator_index = duty.validator_index, + slot = %proof_slot, + %subnet_id, + "Validator is sync aggregator" + ); + Some(((proof_slot, *subnet_id), proof)) + } + Ok(false) => None, + Err(e) => { + warn!( + pubkey = ?duty.pubkey, + slot = %proof_slot, + error = ?e, + "Error determining is_aggregator" + ); + None + } + }, + + None => None, + } + }); + + // Execute all the futures in parallel, collecting any successful results. + let proofs = join_all(futures) + .await + .into_iter() + .flatten() + .collect::>(); + + validator_proofs.push((duty.validator_index, proofs)); + } + + // Add to global storage (we add regularly so the proofs can be used ASAP). 
+ let sync_map = duties_service.sync_duties.committees.read(); + let Some(committee_duties) = sync_map.get(&sync_committee_period) else { + debug!(period = sync_committee_period, "Missing sync duties"); + continue; + }; + let validators = committee_duties.validators.read(); + let num_validators_updated = validator_proofs.len(); + + for (validator_index, proofs) in validator_proofs { + if let Some(Some(duty)) = validators.get(&validator_index) { + duty.aggregation_duties.proofs.write().extend(proofs); + } else { + debug!( + validator_index, + period = sync_committee_period, + "Missing sync duty to update" + ); + } + } + + if num_validators_updated > 0 { debug!( - validator_index, - period = sync_committee_period, - "Missing sync duty to update" + %slot, + updated_validators = num_validators_updated, + "Finished computing sync selection proofs" ); } } - - if num_validators_updated > 0 { - debug!( - %slot, - updated_validators = num_validators_updated, - "Finished computing sync selection proofs" - ); - } } } diff --git a/validator_client/validator_store/src/lib.rs b/validator_client/validator_store/src/lib.rs index c3b551c249..6fd2e27064 100644 --- a/validator_client/validator_store/src/lib.rs +++ b/validator_client/validator_store/src/lib.rs @@ -21,6 +21,7 @@ pub enum Error { GreaterThanCurrentEpoch { epoch: Epoch, current_epoch: Epoch }, UnableToSignAttestation(AttestationError), SpecificError(T), + Middleware(String), } impl From for Error { From 10e72df3318c7140d7cb0605e2af1f19119ceab9 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 3 Sep 2025 18:05:09 +1000 Subject: [PATCH 10/81] Add `tls-roots` feature to `opentelemetry_otlp` to support exporting traces over https (#7987) --- Cargo.lock | 64 ++++++++++++++++++++++++++++++++++++------ Cargo.toml | 2 +- lighthouse/src/main.rs | 4 ++- 3 files changed, 60 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4020d9611f..96768211eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1809,6 +1809,16 
@@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -2152,7 +2162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 1.0.109", + "syn 2.0.100", ] [[package]] @@ -4642,7 +4652,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdf9d64cfcf380606e64f9a0bcf493616b65331199f984151a6fa11a7b3cde38" dependencies = [ "async-io", - "core-foundation", + "core-foundation 0.9.4", "fnv", "futures", "if-addrs", @@ -5097,9 +5107,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.171" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" @@ -6199,7 +6209,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "schannel", - "security-framework", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] @@ -8123,6 +8133,7 @@ version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ + "log", "once_cell", "ring", "rustls-pki-types", @@ -8131,6 +8142,18 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + 
"security-framework 3.3.0", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" @@ -8380,7 +8403,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.9.0", - "core-foundation", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" +dependencies = [ + "bitflags 2.9.0", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -9103,7 +9139,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags 1.3.2", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.5.0", ] @@ -9114,7 +9150,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ "bitflags 2.9.0", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.6.0", ] @@ -9473,6 +9509,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-rustls" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +dependencies = [ + "rustls 0.23.23", + "tokio", +] + [[package]] name = "tokio-stream" version = "0.1.17" @@ -9576,7 +9622,9 @@ dependencies = [ "percent-encoding", "pin-project", "prost", + "rustls-native-certs", "tokio", + "tokio-rustls 0.26.2", "tokio-stream", "tower 0.5.2", "tower-layer", diff --git a/Cargo.toml b/Cargo.toml index c08e7c59a0..ca5f7bc153 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -198,7 +198,7 @@ node_test_rig = { path = 
"testing/node_test_rig" } num_cpus = "1" once_cell = "1.17.1" opentelemetry = "0.30.0" -opentelemetry-otlp = { version = "0.30.0", features = ["grpc-tonic"] } +opentelemetry-otlp = { version = "0.30.0", features = ["grpc-tonic", "tls-roots"] } opentelemetry_sdk = "0.30.0" operation_pool = { path = "beacon_node/operation_pool" } parking_lot = "0.12" diff --git a/lighthouse/src/main.rs b/lighthouse/src/main.rs index 3b0f7c3376..8660074e91 100644 --- a/lighthouse/src/main.rs +++ b/lighthouse/src/main.rs @@ -20,7 +20,8 @@ use lighthouse_version::VERSION; use logging::{MetricsLayer, build_workspace_filter, crit}; use malloc_utils::configure_memory_allocator; use opentelemetry::trace::TracerProvider; -use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig; +use opentelemetry_otlp::{WithExportConfig, WithTonicConfig}; use std::backtrace::Backtrace; use std::io::IsTerminal; use std::path::PathBuf; @@ -712,6 +713,7 @@ fn run( let telemetry_layer = environment.runtime().block_on(async { let exporter = opentelemetry_otlp::SpanExporter::builder() .with_tonic() + .with_tls_config(ClientTlsConfig::new().with_native_roots()) .with_endpoint(telemetry_collector_url) .build() .map_err(|e| format!("Failed to create OTLP exporter: {:?}", e))?; From 76adedff2788f0f0d55bff96d6b0ffe1e96e8b39 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Thu, 4 Sep 2025 10:08:29 +1000 Subject: [PATCH 11/81] Simplify length methods on BeaconBlockBody (#7989) Just the low-hanging fruit from: - https://github.com/sigp/lighthouse/pull/7988 --- consensus/types/src/beacon_block_body.rs | 28 +++++++----------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/consensus/types/src/beacon_block_body.rs b/consensus/types/src/beacon_block_body.rs index 7df9c3f3cb..e636fbb534 100644 --- a/consensus/types/src/beacon_block_body.rs +++ b/consensus/types/src/beacon_block_body.rs @@ -318,29 +318,17 @@ impl<'a, E: EthSpec, Payload: 
AbstractExecPayload> BeaconBlockBodyRef<'a, E, } pub fn attestations_len(&self) -> usize { - match self { - Self::Base(body) => body.attestations.len(), - Self::Altair(body) => body.attestations.len(), - Self::Bellatrix(body) => body.attestations.len(), - Self::Capella(body) => body.attestations.len(), - Self::Deneb(body) => body.attestations.len(), - Self::Electra(body) => body.attestations.len(), - Self::Fulu(body) => body.attestations.len(), - Self::Gloas(body) => body.attestations.len(), - } + map_beacon_block_body_ref!(&'a _, self, |inner, cons| { + cons(inner); + inner.attestations.len() + }) } pub fn attester_slashings_len(&self) -> usize { - match self { - Self::Base(body) => body.attester_slashings.len(), - Self::Altair(body) => body.attester_slashings.len(), - Self::Bellatrix(body) => body.attester_slashings.len(), - Self::Capella(body) => body.attester_slashings.len(), - Self::Deneb(body) => body.attester_slashings.len(), - Self::Electra(body) => body.attester_slashings.len(), - Self::Fulu(body) => body.attester_slashings.len(), - Self::Gloas(body) => body.attester_slashings.len(), - } + map_beacon_block_body_ref!(&'a _, self, |inner, cons| { + cons(inner); + inner.attester_slashings.len() + }) } pub fn attestations(&self) -> Box> + 'a> { From c2a92f1a8ced6fa0ec357e72875b52a96b4fcbb1 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 4 Sep 2025 15:36:20 +1000 Subject: [PATCH 12/81] Maintain peers across all data column subnets (#7915) Closes: - #7865 - #7855 Changes extracted from earlier PR #7876 This PR fixes two main things with a few other improvements mentioned below: - Prevent Lighthouse from repeatedly sending `DataColumnByRoot` requests to an unsynced peer, causing lookup sync to get stuck - Allows Lighthouse to send discovery requests if there isn't enough **synced** peers in the required sampling subnets - this fixes the stuck sync scenario where there isn't enough usable peers in sampling subnet but no discovery is attempted. 
- Make peer discovery queries if custody subnet peer count drops below the minimum threshold - Update peer pruning logic to prioritise uniform distribution across all data column subnets and avoid pruning sampling peers if the count is below the target threshold (2) - Check sync status when making discovery requests, to make sure we don't ignore requests if there isn't enough synced peers in the required sampling subnets - Optimise some of the `PeerDB` functions checking custody peers - Only send lookup requests to peers that are synced or advanced --- .../lighthouse_network/src/discovery/mod.rs | 5 +- .../src/peer_manager/mod.rs | 1106 ++++++++++++----- .../src/peer_manager/peerdb.rs | 84 +- .../src/peer_manager/peerdb/peer_info.rs | 91 +- .../network/src/sync/backfill_sync/mod.rs | 7 +- .../network/src/sync/range_sync/chain.rs | 17 +- 6 files changed, 974 insertions(+), 336 deletions(-) diff --git a/beacon_node/lighthouse_network/src/discovery/mod.rs b/beacon_node/lighthouse_network/src/discovery/mod.rs index 2d47153809..a245e830b9 100644 --- a/beacon_node/lighthouse_network/src/discovery/mod.rs +++ b/beacon_node/lighthouse_network/src/discovery/mod.rs @@ -1223,7 +1223,7 @@ impl Discovery { #[cfg(test)] mod tests { use super::*; - use crate::rpc::methods::{MetaData, MetaDataV2}; + use crate::rpc::methods::{MetaData, MetaDataV3}; use libp2p::identity::secp256k1; use types::{BitVector, MinimalEthSpec, SubnetId}; @@ -1248,10 +1248,11 @@ mod tests { .unwrap(); let globals = NetworkGlobals::new( enr, - MetaData::V2(MetaDataV2 { + MetaData::V3(MetaDataV3 { seq_number: 0, attnets: Default::default(), syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }), vec![], false, diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 93515ed5f6..efb86a5feb 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ 
b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -17,7 +17,7 @@ use std::{ time::{Duration, Instant}, }; use tracing::{debug, error, trace, warn}; -use types::{DataColumnSubnetId, EthSpec, SyncSubnetId}; +use types::{DataColumnSubnetId, EthSpec, SubnetId, SyncSubnetId}; pub use libp2p::core::Multiaddr; pub use libp2p::identity::Keypair; @@ -26,9 +26,7 @@ pub mod peerdb; use crate::peer_manager::peerdb::client::ClientKind; use libp2p::multiaddr; -pub use peerdb::peer_info::{ - ConnectionDirection, PeerConnectionStatus, PeerConnectionStatus::*, PeerInfo, -}; +pub use peerdb::peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; use peerdb::score::{PeerAction, ReportSource}; pub use peerdb::sync_status::{SyncInfo, SyncStatus}; use std::collections::{HashMap, HashSet, hash_map::Entry}; @@ -38,6 +36,14 @@ use types::data_column_custody_group::{ CustodyIndex, compute_subnets_from_custody_group, get_custody_groups, }; +/// Unified peer subnet information structure for pruning logic. +struct PeerSubnetInfo { + info: PeerInfo, + attestation_subnets: HashSet, + sync_committees: HashSet, + custody_subnets: HashSet, +} + pub mod config; mod network_behaviour; @@ -52,6 +58,8 @@ pub const PEER_RECONNECTION_TIMEOUT: Duration = Duration::from_secs(600); /// lower our peer count below this number. Instead we favour a non-uniform distribution of subnet /// peers. pub const MIN_SYNC_COMMITTEE_PEERS: u64 = 2; +/// Avoid pruning sampling peers if subnet peer count is below this number. +pub const MIN_SAMPLING_COLUMN_SUBNET_PEERS: u64 = 2; /// A fraction of `PeerManager::target_peers` that we allow to connect to us in excess of /// `PeerManager::target_peers`. For clarity, if `PeerManager::target_peers` is 50 and /// PEER_EXCESS_FACTOR = 0.1 we allow 10% more nodes, i.e 55. 
@@ -161,7 +169,7 @@ impl PeerManager { } = cfg; // Set up the peer manager heartbeat interval - let heartbeat = tokio::time::interval(tokio::time::Duration::from_secs(HEARTBEAT_INTERVAL)); + let heartbeat = tokio::time::interval(Duration::from_secs(HEARTBEAT_INTERVAL)); // Compute subnets for all custody groups let subnets_by_custody_group = if network_globals.spec.is_peer_das_scheduled() { @@ -729,7 +737,16 @@ impl PeerManager { } } else { // we have no meta-data for this peer, update - debug!(%peer_id, new_seq_no = meta_data.seq_number(), "Obtained peer's metadata"); + let cgc = meta_data + .custody_group_count() + .map(|&count| count.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + debug!( + %peer_id, + new_seq_no = meta_data.seq_number(), + cgc, + "Obtained peer's metadata" + ); } let known_custody_group_count = peer_info @@ -949,6 +966,43 @@ impl PeerManager { } } + /// Run discovery query for additional custody peers if we fall below `MIN_SAMPLING_COLUMN_SUBNET_PEERS`. 
+ fn maintain_custody_peers(&mut self) { + let subnets_to_discover: Vec = self + .network_globals + .sampling_subnets() + .iter() + .filter_map(|custody_subnet| { + if self + .network_globals + .peers + .read() + .has_good_peers_in_custody_subnet( + custody_subnet, + MIN_SAMPLING_COLUMN_SUBNET_PEERS as usize, + ) + { + None + } else { + Some(SubnetDiscovery { + subnet: Subnet::DataColumn(*custody_subnet), + min_ttl: None, + }) + } + }) + .collect(); + + // request the subnet query from discovery + if !subnets_to_discover.is_empty() { + debug!( + subnets = ?subnets_to_discover.iter().map(|s| s.subnet).collect::>(), + "Making subnet queries for maintaining custody peers" + ); + self.events + .push(PeerManagerEvent::DiscoverSubnetPeers(subnets_to_discover)); + } + } + fn maintain_trusted_peers(&mut self) { let trusted_peers = self.trusted_peers.clone(); for trusted_peer in trusted_peers { @@ -991,9 +1045,204 @@ impl PeerManager { } } + /// Build unified peer subnet information from connected peers. + /// + /// This creates a unified structure containing all subnet information for each peer, + /// excluding trusted peers and peers already marked for pruning. 
+ fn build_peer_subnet_info( + &self, + peers_to_prune: &HashSet, + ) -> HashMap> { + let mut peer_subnet_info: HashMap> = HashMap::new(); + + for (peer_id, info) in self.network_globals.peers.read().connected_peers() { + // Ignore peers we trust or that we are already pruning + if info.is_trusted() || peers_to_prune.contains(peer_id) { + continue; + } + + let mut peer_info = PeerSubnetInfo { + info: info.clone(), + attestation_subnets: HashSet::new(), + sync_committees: HashSet::new(), + custody_subnets: HashSet::new(), + }; + + // Populate subnet information from long-lived subnets + for subnet in info.long_lived_subnets() { + match subnet { + Subnet::Attestation(subnet_id) => { + peer_info.attestation_subnets.insert(subnet_id); + } + Subnet::SyncCommittee(id) => { + peer_info.sync_committees.insert(id); + } + Subnet::DataColumn(id) => { + peer_info.custody_subnets.insert(id); + } + } + } + + peer_subnet_info.insert(*peer_id, peer_info); + } + + peer_subnet_info + } + + /// Build reverse lookup from custody subnets to peer lists. + fn build_custody_subnet_lookup( + peer_subnet_info: &HashMap>, + ) -> HashMap> { + let mut custody_subnet_to_peers: HashMap> = HashMap::new(); + + for (peer_id, peer_info) in peer_subnet_info { + for &custody_subnet in &peer_info.custody_subnets { + custody_subnet_to_peers + .entry(custody_subnet) + .or_default() + .push(*peer_id); + } + } + + custody_subnet_to_peers + } + + /// Determine if a peer should be protected from pruning based on various criteria. + /// + /// Protection criteria: + /// - Outbound peers: don't prune if it would drop below target outbound peer count + /// - Data column sampling: ≤ MIN_SAMPLING_COLUMN_SUBNET_PEERS (2) peers per subnet + /// - Sync committees: ≤ MIN_SYNC_COMMITTEE_PEERS (2) peers per committee + /// - Attestation subnets: protect peers on the scarcest attestation subnets + /// + /// Returns true if the peer should be protected (not pruned). 
+ fn should_protect_peer( + &self, + candidate_info: &PeerSubnetInfo, + sampling_subnets: &HashSet, + custody_subnet_to_peers: &HashMap>, + peer_subnet_info: &HashMap>, + connected_outbound_peer_count: usize, + outbound_peers_pruned: usize, + ) -> bool { + // Ensure we don't remove too many outbound peers + if candidate_info.info.is_outbound_only() + && self.target_outbound_peers() + >= connected_outbound_peer_count.saturating_sub(outbound_peers_pruned) + { + return true; + } + + // Check data column sampling subnets + // If the peer exists in a sampling subnet that is less than or equal to MIN_SAMPLING_COLUMN_SUBNET_PEERS, we keep it + let should_protect_sampling = candidate_info + .custody_subnets + .iter() + .filter(|subnet| sampling_subnets.contains(subnet)) + .any(|subnet| { + let count = custody_subnet_to_peers + .get(subnet) + .map(|peers| peers.len()) + .unwrap_or(0); + count <= MIN_SAMPLING_COLUMN_SUBNET_PEERS as usize + }); + + if should_protect_sampling { + return true; + } + + // Check sync committee protection + let should_protect_sync = candidate_info.sync_committees.iter().any(|sync_committee| { + let count = peer_subnet_info + .values() + .filter(|p| p.sync_committees.contains(sync_committee)) + .count(); + count <= MIN_SYNC_COMMITTEE_PEERS as usize + }); + + if should_protect_sync { + return true; + } + + // Check attestation subnet to avoid pruning from subnets with the lowest peer count + let attestation_subnet_counts: HashMap = peer_subnet_info + .values() + .flat_map(|p| &p.attestation_subnets) + .fold(HashMap::new(), |mut acc, &subnet| { + *acc.entry(subnet).or_insert(0) += 1; + acc + }); + + if let Some(&least_dense_size) = attestation_subnet_counts.values().min() { + let is_on_least_dense = candidate_info + .attestation_subnets + .iter() + .any(|subnet| attestation_subnet_counts.get(subnet) == Some(&least_dense_size)); + + if is_on_least_dense { + return true; + } + } + + false + } + + /// Find the best candidate for removal from the densest 
custody subnet. + /// + /// Returns the PeerId of the candidate to remove, or None if no suitable candidate found. + fn find_prune_candidate( + &self, + column_subnet: DataColumnSubnetId, + column_subnet_to_peers: &HashMap>, + peer_subnet_info: &HashMap>, + sampling_subnets: &HashSet, + connected_outbound_peer_count: usize, + outbound_peers_pruned: usize, + ) -> Option { + let peers_on_subnet_clone = column_subnet_to_peers.get(&column_subnet)?.clone(); + + // Create a sorted list of peers prioritized for removal + let mut sorted_peers = peers_on_subnet_clone; + sorted_peers.shuffle(&mut rand::rng()); + sorted_peers.sort_by_key(|peer_id| { + if let Some(peer_info) = peer_subnet_info.get(peer_id) { + ( + peer_info.info.custody_subnet_count(), + peer_info.info.is_synced_or_advanced(), + ) + } else { + (0, false) + } + }); + + // Try and find a candidate peer to remove from the subnet + for candidate_peer in &sorted_peers { + let Some(candidate_info) = peer_subnet_info.get(candidate_peer) else { + continue; + }; + + // Check if this peer should be protected + if self.should_protect_peer( + candidate_info, + sampling_subnets, + column_subnet_to_peers, + peer_subnet_info, + connected_outbound_peer_count, + outbound_peers_pruned, + ) { + continue; + } + + // Found a suitable candidate + return Some(*candidate_peer); + } + + None + } + /// Remove excess peers back down to our target values. /// This prioritises peers with a good score and uniform distribution of peers across - /// subnets. + /// data column subnets. /// /// The logic for the peer pruning is as follows: /// @@ -1023,9 +1272,12 @@ impl PeerManager { /// Prune peers in the following order: /// 1. Remove worst scoring peers /// 2. Remove peers that are not subscribed to a subnet (they have less value) - /// 3. Remove peers that we have many on any particular subnet - /// 4. Randomly remove peers if all the above are satisfied - /// + /// 3. 
Remove peers that we have many on any particular subnet, with some exceptions
+ /// - Don't remove peers needed for data column sampling (≤ MIN_SAMPLING_COLUMN_SUBNET_PEERS)
+ /// - Don't remove peers needed for sync committees (≤ MIN_SYNC_COMMITTEE_PEERS)
+ /// - Don't remove peers from the lowest density attestation subnets
+ /// 4. Randomly remove peers if all the above are satisfied until we reach `target_peers`, or
+ /// until we can't prune any more peers due to the above constraints.
 fn prune_excess_peers(&mut self) {
 // The current number of connected peers.
 let connected_peer_count = self.network_globals.connected_peers();
@@ -1035,7 +1287,7 @@
 }
 // Keep a list of peers we are pruning.
- let mut peers_to_prune = std::collections::HashSet::new();
+ let mut peers_to_prune = HashSet::new();
 let connected_outbound_peer_count = self.network_globals.connected_outbound_only_peers();
 // Keep track of the number of outbound peers we are pruning.
@@ -1087,146 +1339,57 @@
 prune_peers!(|info: &PeerInfo| { !info.has_long_lived_subnet() });
 }
- // 3. and 4. Remove peers that are too grouped on any given subnet. If all subnets are
+ // 3. and 4. Remove peers that are too grouped on any given data column subnet. If all subnets are
 // uniformly distributed, remove random peers.
 if peers_to_prune.len() < connected_peer_count.saturating_sub(self.target_peers) {
- // Of our connected peers, build a map from subnet_id -> Vec<(PeerId, PeerInfo)>
- let mut subnet_to_peer: HashMap)>> = HashMap::new();
- // These variables are used to track if a peer is in a long-lived sync-committee as we
- // may wish to retain this peer over others when pruning. 
- let mut sync_committee_peer_count: HashMap = HashMap::new(); - let mut peer_to_sync_committee: HashMap< - PeerId, - std::collections::HashSet, - > = HashMap::new(); + let sampling_subnets = self.network_globals.sampling_subnets(); + let mut peer_subnet_info = self.build_peer_subnet_info(&peers_to_prune); + let mut custody_subnet_to_peers = Self::build_custody_subnet_lookup(&peer_subnet_info); - for (peer_id, info) in self.network_globals.peers.read().connected_peers() { - // Ignore peers we trust or that we are already pruning - if info.is_trusted() || peers_to_prune.contains(peer_id) { - continue; - } - - // Count based on long-lived subnets not short-lived subnets - // NOTE: There are only 4 sync committees. These are likely to be denser than the - // subnets, so our priority here to make the subnet peer count uniform, ignoring - // the dense sync committees. - for subnet in info.long_lived_subnets() { - match subnet { - Subnet::Attestation(_) => { - subnet_to_peer - .entry(subnet) - .or_default() - .push((*peer_id, info.clone())); - } - Subnet::SyncCommittee(id) => { - *sync_committee_peer_count.entry(id).or_default() += 1; - peer_to_sync_committee - .entry(*peer_id) - .or_default() - .insert(id); - } - // TODO(das) to be implemented. We're not pruning data column peers yet - // because data column topics are subscribed as core topics until we - // implement recomputing data column subnets. - Subnet::DataColumn(_) => {} - } - } - } - - // Add to the peers to prune mapping + // Attempt to prune peers to `target_peers`, or until we run out of peers to prune. 
while peers_to_prune.len() < connected_peer_count.saturating_sub(self.target_peers) { - if let Some((_, peers_on_subnet)) = subnet_to_peer - .iter_mut() + let custody_subnet_with_most_peers = custody_subnet_to_peers + .iter() + .filter(|(_, peers)| !peers.is_empty()) .max_by_key(|(_, peers)| peers.len()) - { - // and the subnet still contains peers - if !peers_on_subnet.is_empty() { - // Order the peers by the number of subnets they are long-lived - // subscribed too, shuffle equal peers. - peers_on_subnet.shuffle(&mut rand::rng()); - peers_on_subnet.sort_by_key(|(_, info)| info.long_lived_subnet_count()); + .map(|(subnet_id, _)| *subnet_id); - // Try and find a candidate peer to remove from the subnet. - // We ignore peers that would put us below our target outbound peers - // and we currently ignore peers that would put us below our - // sync-committee threshold, if we can avoid it. - - let mut removed_peer_index = None; - for (index, (candidate_peer, info)) in peers_on_subnet.iter().enumerate() { - // Ensure we don't remove too many outbound peers - if info.is_outbound_only() - && self.target_outbound_peers() - >= connected_outbound_peer_count - .saturating_sub(outbound_peers_pruned) - { - // Restart the main loop with the outbound peer removed from - // the list. This will lower the peers per subnet count and - // potentially a new subnet may be chosen to remove peers. This - // can occur recursively until we have no peers left to choose - // from. 
- continue; - } - - // Check the sync committee - if let Some(subnets) = peer_to_sync_committee.get(candidate_peer) { - // The peer is subscribed to some long-lived sync-committees - // Of all the subnets this peer is subscribed too, the minimum - // peer count of all of them is min_subnet_count - if let Some(min_subnet_count) = subnets - .iter() - .filter_map(|v| sync_committee_peer_count.get(v).copied()) - .min() - { - // If the minimum count is our target or lower, we - // shouldn't remove this peer, because it drops us lower - // than our target - if min_subnet_count <= MIN_SYNC_COMMITTEE_PEERS { - // Do not drop this peer in this pruning interval - continue; - } - } - } - - if info.is_outbound_only() { - outbound_peers_pruned += 1; - } - // This peer is suitable to be pruned - removed_peer_index = Some(index); - break; + if let Some(densest_subnet) = custody_subnet_with_most_peers { + // If we have successfully found a candidate peer to prune, prune it, + // otherwise all peers on this subnet should not be removed due to our + // outbound limit or min_subnet_count. In this case, we remove all + // peers from the pruning logic and try another subnet. + if let Some(candidate_peer) = self.find_prune_candidate( + densest_subnet, + &custody_subnet_to_peers, + &peer_subnet_info, + &sampling_subnets, + connected_outbound_peer_count, + outbound_peers_pruned, + ) { + // Update outbound peer count if needed + if let Some(candidate_info) = peer_subnet_info.get(&candidate_peer) + && candidate_info.info.is_outbound_only() + { + outbound_peers_pruned += 1; } - // If we have successfully found a candidate peer to prune, prune it, - // otherwise all peers on this subnet should not be removed due to our - // outbound limit or min_subnet_count. In this case, we remove all - // peers from the pruning logic and try another subnet. 
- if let Some(index) = removed_peer_index { - let (candidate_peer, _) = peers_on_subnet.remove(index); - // Remove pruned peers from other subnet counts - for subnet_peers in subnet_to_peer.values_mut() { - subnet_peers.retain(|(peer_id, _)| peer_id != &candidate_peer); - } - // Remove pruned peers from all sync-committee counts - if let Some(known_sync_committes) = - peer_to_sync_committee.get(&candidate_peer) - { - for sync_committee in known_sync_committes { - if let Some(sync_committee_count) = - sync_committee_peer_count.get_mut(sync_committee) - { - *sync_committee_count = - sync_committee_count.saturating_sub(1); - } - } - } - peers_to_prune.insert(candidate_peer); - } else { - peers_on_subnet.clear(); + // Remove the candidate peer from the maps, so we don't account for them + // when finding the next prune candidate. + for subnet_peers in custody_subnet_to_peers.values_mut() { + subnet_peers.retain(|peer_id| peer_id != &candidate_peer); } - continue; + peer_subnet_info.remove(&candidate_peer); + + peers_to_prune.insert(candidate_peer); + } else if let Some(peers) = custody_subnet_to_peers.get_mut(&densest_subnet) { + // If we can't find a prune candidate in this subnet, remove peers in this subnet + peers.clear() } + } else { + // If there are no peers left to prune, exit. + break; } - // If there are no peers left to prune exit. - break; } } @@ -1271,6 +1434,9 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); + // Maintain minimum count for custody peers. + self.maintain_custody_peers(); + // Maintain minimum count for sync committee peers. 
self.maintain_sync_committee_peers(); @@ -1561,6 +1727,22 @@ mod tests { PeerManager::new(config, Arc::new(globals)).unwrap() } + fn empty_synced_status() -> SyncStatus { + SyncStatus::Synced { + info: empty_sync_info(), + } + } + + fn empty_sync_info() -> SyncInfo { + SyncInfo { + head_slot: Default::default(), + head_root: Default::default(), + finalized_epoch: Default::default(), + finalized_root: Default::default(), + earliest_available_slot: None, + } + } + #[tokio::test] async fn test_peer_manager_disconnects_correctly_during_heartbeat() { // Create 6 peers to connect to with a target of 3. @@ -1805,6 +1987,7 @@ mod tests { /// a priority over all else. async fn test_peer_manager_remove_non_subnet_peers_when_all_healthy() { let mut peer_manager = build_peer_manager(3).await; + let spec = peer_manager.network_globals.spec.clone(); // Create 5 peers to connect to. let peer0 = PeerId::random(); @@ -1828,10 +2011,11 @@ mod tests { // Have some of the peers be on a long-lived subnet let mut attnets = crate::types::EnrAttestationBitfield::::new(); attnets.set(1, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1839,7 +2023,7 @@ mod tests { .write() .peer_info_mut(&peer0) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + .set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1848,10 +2032,11 @@ mod tests { let mut attnets = crate::types::EnrAttestationBitfield::::new(); attnets.set(10, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1859,7 +2044,7 @@ mod tests { .write() .peer_info_mut(&peer2) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + 
.set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1868,10 +2053,11 @@ mod tests { let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); syncnets.set(3, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets: Default::default(), syncnets, + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1879,7 +2065,7 @@ mod tests { .write() .peer_info_mut(&peer4) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + .set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1893,7 +2079,7 @@ mod tests { assert_eq!(peer_manager.network_globals.connected_or_dialing_peers(), 3); // Check that we removed the peers that were not subscribed to any subnet - let mut peers_should_have_removed = std::collections::HashSet::new(); + let mut peers_should_have_removed = HashSet::new(); peers_should_have_removed.insert(peer1); peers_should_have_removed.insert(peer3); for (peer, _) in peer_manager @@ -1954,12 +2140,14 @@ mod tests { } #[tokio::test] - /// Test the pruning logic to remove grouped subnet peers - async fn test_peer_manager_prune_grouped_subnet_peers() { + /// Test the pruning logic to remove grouped data column subnet peers + async fn test_peer_manager_prune_grouped_data_column_subnet_peers() { let target = 9; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); - // Create 5 peers to connect to. + // Create 20 peers to connect to. 
let mut peers = Vec::new(); for x in 0..20 { // Make 20 peers and group peers as: @@ -1972,25 +2160,18 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); - attnets.set(subnet as usize, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets: Default::default(), - }; + { + let mut peers_db = peer_manager.network_globals.peers.write(); + let peer_info = peers_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(HashSet::from([DataColumnSubnetId::new(subnet)])); + peer_info.update_sync_status(empty_synced_status()); + } + peer_manager .network_globals .peers .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); - peer_manager - .network_globals - .peers - .write() - .add_subscription(&peer, Subnet::Attestation(subnet.into())); + .add_subscription(&peer, Subnet::DataColumn(subnet.into())); println!("{},{},{}", x, subnet, peer); peers.push(peer); } @@ -2062,7 +2243,7 @@ mod tests { /// most peers and have the least subscribed long-lived subnets. And peer 0 because it has no /// long-lived subnet. 
#[tokio::test] - async fn test_peer_manager_prune_subnet_peers_most_subscribed() { + async fn test_peer_manager_prune_data_column_subnet_peers_most_subscribed() { let target = 3; let mut peer_manager = build_peer_manager(target).await; @@ -2073,43 +2254,27 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); - - match x { - 0 => {} - 1 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - attnets.set(3, true).unwrap(); - } - 2 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - } - 3 => { - attnets.set(3, true).unwrap(); - } - 4 => { - attnets.set(1, true).unwrap(); - } - 5 => { - attnets.set(2, true).unwrap(); - } + let custody_subnets = match x { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(2), + DataColumnSubnetId::new(3), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]), + 3 => HashSet::from([DataColumnSubnetId::new(3)]), + 4 => HashSet::from([DataColumnSubnetId::new(1)]), + 5 => HashSet::from([DataColumnSubnetId::new(2)]), _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets: Default::default(), - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager .network_globals .peers @@ -2153,22 +2318,24 @@ mod tests { assert!(!connected_peers.contains(&peers[5])); } - /// Test the pruning logic to prioritise peers with the most subnets, but not at the expense of - /// removing 
our few sync-committee subnets. + /// Test the pruning logic to prioritise peers with the most data column subnets, but not at + /// the expense of removing our few sync-committee subnets. /// /// Create 6 peers. /// Peer0: None - /// Peer1 : Subnet 1,2,3, - /// Peer2 : Subnet 1,2, - /// Peer3 : Subnet 3 - /// Peer4 : Subnet 1,2, Sync-committee-1 - /// Peer5 : Subnet 1,2, Sync-committee-2 + /// Peer1 : Column subnet 1,2,3, + /// Peer2 : Column subnet 1,2, + /// Peer3 : Column subnet 3 + /// Peer4 : Column subnet 1,2, Sync-committee-1 + /// Peer5 : Column subnet 1,2, Sync-committee-2 /// /// Prune 3 peers: Should be Peer0, Peer1 and Peer2 because (4 and 5 are on a sync-committee) #[tokio::test] async fn test_peer_manager_prune_subnet_peers_sync_committee() { let target = 3; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); // Create 6 peers to connect to. 
let mut peers = Vec::new(); @@ -2177,48 +2344,40 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); - - match x { - 0 => {} - 1 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - attnets.set(3, true).unwrap(); - } - 2 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - } - 3 => { - attnets.set(3, true).unwrap(); - } + let custody_subnets = match x { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(2), + DataColumnSubnetId::new(3), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]), + 3 => HashSet::from([DataColumnSubnetId::new(3)]), 4 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]) } 5 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]) } _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_meta_data(MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, // unused in this test, as pruning logic uses `custody_subnets` + })); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets, - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager 
 .network_globals
 .peers
@@ -2262,10 +2421,111 @@
 mod tests {
 assert!(!connected_peers.contains(&peers[2]));
 }
+ /// Test that peers in custody subnets with a peer count below the `MIN_SAMPLING_COLUMN_SUBNET_PEERS` (2)
+ /// threshold are protected from pruning.
+ ///
+ /// Create 8 peers.
+ /// Peer0: None (can be pruned)
+ /// Peer1: Subnet 1,4,5
+ /// Peer2: Subnet 1,4
+ /// Peer3: Subnet 2
+ /// Peer4: Subnet 2
+ /// Peer5: Subnet 1 (can be pruned)
+ /// Peer6: Subnet 3
+ /// Peer7: Subnet 5 (can be pruned)
+ ///
+ /// Sampling subnets: 1, 2
+ ///
+ /// Prune 3 peers: Should be Peer0, Peer 5 and Peer 7 because
+ /// - Peer 0 because it has no long-lived subnet.
+ /// - Peer 5 is on the subnet with the most peers and has the least subscribed long-lived subnets.
+ /// - Peer 7 because it's on a non-sampling subnet and has the least subscribed long-lived subnets.
+ #[tokio::test]
+ async fn test_peer_manager_protect_sampling_subnet_peers_below_threshold() {
+ let target = 5;
+ let mut peer_manager = build_peer_manager(target).await;
+
+ *peer_manager.network_globals.sampling_subnets.write() =
+ HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]);
+
+ // Create 8 peers to connect to. 
+ let mut peers = Vec::new(); + for peer_idx in 0..8 { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + // Have some of the peers be on a long-lived subnet + let custody_subnets = match peer_idx { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(4), + DataColumnSubnetId::new(5), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(4)]), + 3 => HashSet::from([DataColumnSubnetId::new(2)]), + 4 => HashSet::from([DataColumnSubnetId::new(2)]), + 5 => HashSet::from([DataColumnSubnetId::new(1)]), + 6 => HashSet::from([DataColumnSubnetId::new(3)]), + 7 => HashSet::from([DataColumnSubnetId::new(5)]), + _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); + } + + let long_lived_subnets = peer_manager + .network_globals + .peers + .read() + .peer_info(&peer) + .unwrap() + .long_lived_subnets(); + for subnet in long_lived_subnets { + println!("Subnet: {:?}", subnet); + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, subnet); + } + println!("{},{}", peer_idx, peer); + peers.push(peer); + } + + // Perform the heartbeat. + peer_manager.heartbeat(); + + // Tests that when we are over the target peer limit, after disconnecting an unhealthy peer, + // the number of connected peers updates and we will not remove too many peers. 
+ assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + // Check that we removed peers 0, 5 and 7 + let connected_peers: std::collections::HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + println!("Connected peers: {:?}", connected_peers); + assert!(!connected_peers.contains(&peers[0])); + assert!(!connected_peers.contains(&peers[5])); + assert!(!connected_peers.contains(&peers[7])); + } + /// This test is for reproducing the issue: /// https://github.com/sigp/lighthouse/pull/3236#issue-1256432659 /// - /// Whether the issue happens depends on `subnet_to_peer` (HashMap), since HashMap doesn't + /// Whether the issue happens depends on `custody_subnet_to_peers` (HashMap), since HashMap doesn't /// guarantee a particular order of iteration. So we repeat the test case to try to reproduce /// the issue. #[tokio::test] @@ -2275,41 +2535,42 @@ mod tests { } } - /// Test the pruning logic to prioritize peers with the most subnets. This test specifies + /// Test the pruning logic to prioritize peers with the most column subnets. This test specifies /// the connection direction for the peers. /// Either Peer 4 or 5 is expected to be removed in this test case. /// /// Create 8 peers. 
- /// Peer0 (out) : Subnet 1, Sync-committee-1 - /// Peer1 (out) : Subnet 1, Sync-committee-1 - /// Peer2 (out) : Subnet 2, Sync-committee-2 - /// Peer3 (out) : Subnet 2, Sync-committee-2 - /// Peer4 (out) : Subnet 3 - /// Peer5 (out) : Subnet 3 - /// Peer6 (in) : Subnet 4 - /// Peer7 (in) : Subnet 5 + /// Peer0 (out) : Column subnet 1, Sync-committee-1 + /// Peer1 (out) : Column subnet 1, Sync-committee-1 + /// Peer2 (out) : Column subnet 2, Sync-committee-2 + /// Peer3 (out) : Column subnet 2, Sync-committee-2 + /// Peer4 (out) : Column subnet 3 + /// Peer5 (out) : Column subnet 3 + /// Peer6 (in) : Column subnet 4 + /// Peer7 (in) : Column subnet 5 async fn test_peer_manager_prune_based_on_subnet_count() { let target = 7; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); // Create 8 peers to connect to. let mut peers = Vec::new(); - for x in 0..8 { + for peer_idx in 0..8 { let peer = PeerId::random(); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); - match x { + let custody_subnets = match peer_idx { 0 => { peer_manager.inject_connect_outgoing( &peer, "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(1, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1)]) } 1 => { peer_manager.inject_connect_outgoing( @@ -2317,8 +2578,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(1, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1)]) } 2 => { peer_manager.inject_connect_outgoing( @@ -2326,8 +2587,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + 
HashSet::from([DataColumnSubnetId::new(2)]) } 3 => { peer_manager.inject_connect_outgoing( @@ -2335,8 +2596,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(2)]) } 4 => { peer_manager.inject_connect_outgoing( @@ -2344,7 +2605,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(3, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(3)]) } 5 => { peer_manager.inject_connect_outgoing( @@ -2352,7 +2613,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(3, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(3)]) } 6 => { peer_manager.inject_connect_ingoing( @@ -2360,7 +2621,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(4, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(4)]) } 7 => { peer_manager.inject_connect_ingoing( @@ -2368,23 +2629,26 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(5, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(5)]) } _ => unreachable!(), + }; + + let metadata = MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, // unused in this test, as pruning logic uses `custody_subnets` + }; + + { + let mut peer_db_write = peer_manager.network_globals.peers.write(); + let peer_info = peer_db_write.peer_info_mut(&peer).unwrap(); + peer_info.set_meta_data(MetaData::V3(metadata)); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets, - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager .network_globals .peers @@ -2392,7 +2656,7 @@ mod tests { .peer_info(&peer) .unwrap() .long_lived_subnets(); - println!("{},{}", x, peer); + 
println!("{},{}", peer_idx, peer); for subnet in long_lived_subnets { println!("Subnet: {:?}", subnet); peer_manager @@ -2428,17 +2692,286 @@ mod tests { assert!(connected_peers.contains(&peers[7])); } + /// Test that peers with the sparsest attestation subnets are protected from pruning. + /// + /// Create 7 peers: + /// - 4 on attnet 0 + /// - 1 on attnet 1 (least dense) + /// - 2 on attnet 2 + /// + /// Prune 3 peers: 2 peers from subnet 0 and 1 from either subnet 0 or 2, BUT never from attnet 1. + #[tokio::test] + async fn test_peer_manager_not_prune_sparsest_attestation_subnet() { + let target = 4; + let mut peer_manager = build_peer_manager(target).await; + let spec = peer_manager.network_globals.spec.clone(); + let mut peers = Vec::new(); + + let subnet_assignments = [0, 0, 0, 0, 1, 2, 2]; + + for &subnet in subnet_assignments.iter() { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let mut attnets = crate::types::EnrAttestationBitfield::::new(); + attnets.set(subnet, true).unwrap(); + + let metadata = MetaDataV3 { + seq_number: 0, + attnets, + syncnets: Default::default(), + custody_group_count: spec.custody_requirement, + }; + peer_manager + .network_globals + .peers + .write() + .peer_info_mut(&peer) + .unwrap() + .set_meta_data(MetaData::V3(metadata)); + + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, Subnet::Attestation((subnet as u64).into())); + + peers.push(peer); + } + + peer_manager.heartbeat(); + + // Check attestation subnet to avoid pruning from subnets with lowest peer count: + // Peer 4 (on least dense subnet 1) should be protected + // Should preferentially remove from subnet 0 (most dense) rather than subnet 1 (least dense) + let connected_peers: HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + // Peer 4 (on least dense attestation subnet 1) should be kept + 
assert!(connected_peers.contains(&peers[4])); + + // Attestation subnet uniformity should protect peers on least dense subnets + // Count peers on subnet 1 (least dense) + let subnet_1_count = peers + .iter() + .filter(|&peer| connected_peers.contains(peer)) + .filter(|&peer| { + peer_manager + .network_globals + .peers + .read() + .peer_info(peer) + .unwrap() + .long_lived_subnets() + .iter() + .any(|subnet| matches!(subnet, Subnet::Attestation(id) if id == &1u64.into())) + }) + .count(); + + assert!(subnet_1_count > 0, "Least dense subnet should be protected"); + } + + /// Test the pruning logic prioritizes synced and advanced peers over behind/unknown peers. + /// + /// Create 6 peers with different sync statuses: + /// Peer0: Behind + /// Peer1: Unknown + /// Peer2: Synced + /// Peer3: Advanced + /// Peer4: Synced + /// Peer5: Unknown + /// + /// Target: 3 peers. Should prune peers 0, 1, 5 (behind/unknown) and keep 2, 3, 4 (synced/advanced). + #[tokio::test] + async fn test_peer_manager_prune_should_prioritize_synced_advanced_peers() { + let target = 3; + let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. 
+ *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); + + let mut peers = Vec::new(); + let current_peer_count = 6; + for i in 0..current_peer_count { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let sync_status = match i { + 0 => SyncStatus::Behind { + info: empty_sync_info(), + }, + 1 | 5 => SyncStatus::Unknown, + 2 | 4 => SyncStatus::Synced { + info: empty_sync_info(), + }, + 3 => SyncStatus::Advanced { + info: empty_sync_info(), + }, + _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.update_sync_status(sync_status); + // make sure all the peers have some long live subnets that are not protected + peer_info.set_custody_subnets(HashSet::from([DataColumnSubnetId::new(2)])) + } + + let long_lived_subnets = peer_manager + .network_globals + .peers + .read() + .peer_info(&peer) + .unwrap() + .long_lived_subnets(); + for subnet in long_lived_subnets { + println!("Subnet: {:?}", subnet); + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, subnet); + } + + peers.push(peer); + } + + // Perform the heartbeat to trigger pruning + peer_manager.heartbeat(); + + // Should have exactly target number of peers + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + let connected_peers: std::collections::HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + // Count how many synced/advanced peers are kept vs behind/unknown peers + let synced_advanced_kept = [&peers[2], &peers[3], &peers[4]] + .iter() + .filter(|peer| connected_peers.contains(peer)) + .count(); + + let behind_unknown_kept = [&peers[0], &peers[1], &peers[5]] + .iter() + .filter(|peer| connected_peers.contains(peer)) + .count(); + + assert_eq!(synced_advanced_kept, 
target); + assert_eq!(behind_unknown_kept, 0); + } + + /// Test that `peer_subnet_info` is properly cleaned up during pruning iterations. + /// + /// Without proper cleanup, stale peer data affects protection logic for sync committees and we + /// may end up pruning more than expected. + #[tokio::test] + async fn test_peer_manager_prune_mixed_custody_subnet_protection() { + let target = 6; + let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); + + // Create 12 peers: + // - 4 on custody subnet 0 + // - 3 on subnet 1 + //- 2 on subnet 2 + // - 3 scattered. + // Every 4th peer (0,4,8) is on sync committee 0. + let mut peers = Vec::new(); + for i in 0..12 { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let custody_subnet = match i { + ..4 => 0, + 4..7 => 1, + 7..9 => 2, + _ => i - 6, + }; + let on_sync_committee = i % 4 == 0; + + { + let mut peers_db = peer_manager.network_globals.peers.write(); + let peer_info = peers_db.peer_info_mut(&peer).unwrap(); + peer_info + .set_custody_subnets(HashSet::from([DataColumnSubnetId::new(custody_subnet)])); + peer_info.update_sync_status(empty_synced_status()); + + if on_sync_committee { + let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); + syncnets.set(0, true).unwrap(); + peer_info.set_meta_data(MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, + })); + } + + for subnet in peer_info.long_lived_subnets() { + peers_db.add_subscription(&peer, subnet); + } + + peers.push(peer); + } + } + + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + 12 + ); + + peer_manager.heartbeat(); + + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + let connected_peers: 
HashSet = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + let sync_committee_peers = [&peers[0], &peers[4], &peers[8]]; + let remaining_sync_peers = connected_peers + .iter() + .filter(|peer| sync_committee_peers.contains(peer)) + .count(); + assert_eq!( + remaining_sync_peers, 2, + "Sync committee protection should preserve exactly MIN_SYNC_COMMITTEE_PEERS (2)" + ); + } + // Test properties PeerManager should have using randomly generated input. #[cfg(test)] mod property_based_tests { use crate::peer_manager::config::DEFAULT_TARGET_PEERS; use crate::peer_manager::tests::build_peer_manager_with_trusted_peers; - use crate::rpc::MetaData; + use crate::rpc::{MetaData, MetaDataV3}; use libp2p::PeerId; use quickcheck::{Arbitrary, Gen, TestResult}; use quickcheck_macros::quickcheck; + use std::collections::HashSet; use tokio::runtime::Runtime; - use types::Unsigned; + use types::{DataColumnSubnetId, Unsigned}; use types::{EthSpec, MainnetEthSpec as E}; #[derive(Clone, Debug)] @@ -2450,6 +2983,7 @@ mod tests { score: f64, trusted: bool, gossipsub_score: f64, + custody_subnets: HashSet, } impl Arbitrary for PeerCondition { @@ -2472,6 +3006,17 @@ mod tests { bitfield }; + let spec = E::default_spec(); + let custody_subnets = { + let total_subnet_count = spec.data_column_sidecar_subnet_count; + let custody_subnet_count = u64::arbitrary(g) % (total_subnet_count + 1); // 0 to 128 + (spec.custody_requirement..total_subnet_count) + .filter(|_| bool::arbitrary(g)) + .map(DataColumnSubnetId::new) + .take(custody_subnet_count as usize) + .collect() + }; + PeerCondition { peer_id: PeerId::random(), outgoing: bool::arbitrary(g), @@ -2480,6 +3025,7 @@ mod tests { score: f64::arbitrary(g), trusted: bool::arbitrary(g), gossipsub_score: f64::arbitrary(g), + custody_subnets, } } } @@ -2487,6 +3033,7 @@ mod tests { #[quickcheck] fn prune_excess_peers(peer_conditions: Vec) -> TestResult { let target_peer_count = 
DEFAULT_TARGET_PEERS; + let spec = E::default_spec(); if peer_conditions.len() < target_peer_count { return TestResult::discard(); } @@ -2533,17 +3080,22 @@ mod tests { syncnets.set(i, *value).unwrap(); } - let metadata = crate::rpc::MetaDataV2 { + let subnets_per_custody_group = + spec.data_column_sidecar_subnet_count / spec.number_of_custody_groups; + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets, + custody_group_count: condition.custody_subnets.len() as u64 + / subnets_per_custody_group, }; let mut peer_db = peer_manager.network_globals.peers.write(); let peer_info = peer_db.peer_info_mut(&condition.peer_id).unwrap(); - peer_info.set_meta_data(MetaData::V2(metadata)); + peer_info.set_meta_data(MetaData::V3(metadata)); peer_info.set_gossipsub_score(condition.gossipsub_score); peer_info.add_to_score(condition.score); + peer_info.set_custody_subnets(condition.custody_subnets.clone()); for subnet in peer_info.long_lived_subnets() { peer_db.add_subscription(&condition.peer_id, subnet); diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 974b41230e..083c3f00c2 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -300,6 +300,7 @@ impl PeerDB { .filter(move |(_, info)| { // We check both the metadata and gossipsub data as we only want to count long-lived subscribed peers info.is_connected() + && info.is_synced_or_advanced() && info.on_subnet_metadata(&subnet) && info.on_subnet_gossipsub(&subnet) && info.is_good_gossipsub_peer() @@ -318,40 +319,69 @@ impl PeerDB { .filter(move |(_, info)| { // The custody_subnets hashset can be populated via enr or metadata let is_custody_subnet_peer = info.is_assigned_to_custody_subnet(&subnet); - info.is_connected() && info.is_good_gossipsub_peer() && is_custody_subnet_peer + info.is_connected() + && info.is_good_gossipsub_peer() + && 
is_custody_subnet_peer + && info.is_synced_or_advanced() }) .map(|(peer_id, _)| peer_id) } - /// Returns an iterator of all peers that are supposed to be custodying - /// the given subnet id. - pub fn good_range_sync_custody_subnet_peers( + /// Checks if there is at least one good peer for each specified custody subnet for the given epoch. + /// A "good" peer is one that is both connected and synced (or advanced) for the specified epoch. + pub fn has_good_custody_range_sync_peer( &self, - subnet: DataColumnSubnetId, - ) -> impl Iterator { - self.peers - .iter() - .filter(move |(_, info)| { - // The custody_subnets hashset can be populated via enr or metadata - info.is_connected() && info.is_assigned_to_custody_subnet(&subnet) - }) - .map(|(peer_id, _)| peer_id) - } - - /// Returns `true` if the given peer is assigned to the given subnet. - /// else returns `false` - /// - /// Returns `false` if peer doesn't exist in peerdb. - pub fn is_good_range_sync_custody_subnet_peer( - &self, - subnet: DataColumnSubnetId, - peer: &PeerId, + subnets: &HashSet, + epoch: Epoch, ) -> bool { - if let Some(info) = self.peers.get(peer) { - info.is_connected() && info.is_assigned_to_custody_subnet(&subnet) - } else { - false + let mut remaining_subnets = subnets.clone(); + + let good_sync_peers_for_epoch = self.peers.values().filter(|&info| { + info.is_connected() + && match info.sync_status() { + SyncStatus::Synced { info } | SyncStatus::Advanced { info } => { + info.has_slot(epoch.end_slot(E::slots_per_epoch())) + } + SyncStatus::IrrelevantPeer + | SyncStatus::Behind { .. } + | SyncStatus::Unknown => false, + } + }); + + for info in good_sync_peers_for_epoch { + for subnet in info.custody_subnets_iter() { + if remaining_subnets.remove(subnet) && remaining_subnets.is_empty() { + return true; + } + } } + + false + } + + /// Checks if there are sufficient good peers for a single custody subnet. + /// A "good" peer is one that is both connected and synced (or advanced). 
+ pub fn has_good_peers_in_custody_subnet( + &self, + subnet: &DataColumnSubnetId, + target_peers: usize, + ) -> bool { + let mut peer_count = 0usize; + for info in self + .peers + .values() + .filter(|info| info.is_connected() && info.is_synced_or_advanced()) + { + if info.is_assigned_to_custody_subnet(subnet) { + peer_count += 1; + } + + if peer_count >= target_peers { + return true; + } + } + + false } /// Gives the ids of all known disconnected peers. diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index e643fca30f..c289cb9a69 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -174,19 +174,6 @@ impl PeerInfo { self.subnets.iter() } - /// Returns the number of long lived subnets a peer is subscribed to. - // NOTE: This currently excludes sync committee subnets - pub fn long_lived_subnet_count(&self) -> usize { - if let Some(meta_data) = self.meta_data.as_ref() { - return meta_data.attnets().num_set_bits(); - } else if let Some(enr) = self.enr.as_ref() - && let Ok(attnets) = enr.attestation_bitfield::() - { - return attnets.num_set_bits(); - } - 0 - } - /// Returns an iterator over the long-lived subnets if it has any. pub fn long_lived_subnets(&self) -> Vec { let mut long_lived_subnets = Vec::new(); @@ -222,6 +209,13 @@ impl PeerInfo { } } } + + long_lived_subnets.extend( + self.custody_subnets + .iter() + .map(|&id| Subnet::DataColumn(id)), + ); + long_lived_subnets } @@ -240,6 +234,11 @@ impl PeerInfo { self.custody_subnets.iter() } + /// Returns the number of custody subnets this peer is assigned to. + pub fn custody_subnet_count(&self) -> usize { + self.custody_subnets.len() + } + /// Returns true if the peer is connected to a long-lived subnet. 
pub fn has_long_lived_subnet(&self) -> bool { // Check the meta_data @@ -262,6 +261,17 @@ impl PeerInfo { { return true; } + + // Check if the peer has custody subnets populated and the peer is subscribed to any of + // its custody subnets + let subscribed_to_any_custody_subnets = self + .custody_subnets + .iter() + .any(|subnet_id| self.subnets.contains(&Subnet::DataColumn(*subnet_id))); + if subscribed_to_any_custody_subnets { + return true; + } + false } @@ -318,6 +328,14 @@ impl PeerInfo { ) } + /// Checks if the peer is synced or advanced. + pub fn is_synced_or_advanced(&self) -> bool { + matches!( + self.sync_status, + SyncStatus::Synced { .. } | SyncStatus::Advanced { .. } + ) + } + /// Checks if the status is connected. pub fn is_dialing(&self) -> bool { matches!(self.connection_status, PeerConnectionStatus::Dialing { .. }) @@ -645,3 +663,50 @@ impl From for PeerState { } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::Subnet; + use types::{DataColumnSubnetId, MainnetEthSpec}; + + type E = MainnetEthSpec; + + fn create_test_peer_info() -> PeerInfo { + PeerInfo::default() + } + + #[test] + fn test_has_long_lived_subnet_empty_custody_subnets() { + let peer_info = create_test_peer_info(); + // peer has no custody subnets or subscribed to any subnets hence return false + assert!(!peer_info.has_long_lived_subnet()); + } + + #[test] + fn test_has_long_lived_subnet_empty_subnets_with_custody_subnets() { + let mut peer_info = create_test_peer_info(); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(1)); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(2)); + // Peer has custody subnets but isn't subscribed to any hence return false + assert!(!peer_info.has_long_lived_subnet()); + } + + #[test] + fn test_has_long_lived_subnet_subscribed_to_custody_subnets() { + let mut peer_info = create_test_peer_info(); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(1)); + 
peer_info.custody_subnets.insert(DataColumnSubnetId::new(2)); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(3)); + + peer_info + .subnets + .insert(Subnet::DataColumn(DataColumnSubnetId::new(1))); + peer_info + .subnets + .insert(Subnet::DataColumn(DataColumnSubnetId::new(2))); + // Missing DataColumnSubnetId::new(3) - but peer is subscribed to some custody subnets + // Peer is subscribed to any custody subnets - return true + assert!(peer_info.has_long_lived_subnet()); + } +} diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 2f5eb3f689..f00503ec63 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -1120,13 +1120,12 @@ impl BackFillSync { .sampling_subnets() .iter() .all(|subnet_id| { - let peer_count = network + let min_peer_count = 1; + network .network_globals() .peers .read() - .good_range_sync_custody_subnet_peers(*subnet_id) - .count(); - peer_count > 0 + .has_good_peers_in_custody_subnet(subnet_id, min_peer_count) }) } else { true diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 96319f2efa..8907f7510f 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1132,21 +1132,12 @@ impl SyncingChain { ) -> bool { if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { // Require peers on all sampling column subnets before sending batches + let sampling_subnets = network.network_globals().sampling_subnets(); network .network_globals() - .sampling_subnets() - .iter() - .all(|subnet_id| { - let peer_db = network.network_globals().peers.read(); - let peer_count = self - .peers - .iter() - .filter(|peer| { - peer_db.is_good_range_sync_custody_subnet_peer(*subnet_id, peer) - }) - .count(); - peer_count > 0 - }) + .peers + .read() + 
.has_good_custody_range_sync_peer(&sampling_subnets, epoch)
 } else {
 true
 }

From 84ec209eba63cfb143e3bd29d6f29310558d0ad3 Mon Sep 17 00:00:00 2001
From: Pawan Dhananjay 
Date: Thu, 4 Sep 2025 00:39:16 -0700
Subject: [PATCH 13/81] Allow AwaitingDownload to be a valid in-between state
 (#7984)

N/A

Extracts (3) from https://github.com/sigp/lighthouse/pull/7946.

Prior to peerdas, a batch should never have been in `AwaitingDownload` state
because we immediately try to move from `AwaitingDownload` to `Downloading`
state by sending batches. This was always possible as long as we had peers in
the `SyncingChain` in the pre-peerdas world. However, this is no longer the
case as a batch can be stuck waiting in `AwaitingDownload` state if we have no
peers to request the columns from.

This PR makes `AwaitingDownload` an allowable in-between state. If a batch is
found to be in this state, then we attempt to send the batch instead of
erroring like before.

Note to reviewer:

We need to make sure that this doesn't lead to a bunch of batches stuck in
`AwaitingDownload` state if the chain can be progressed.

Backfill already retries all batches in AwaitingDownload state so we just need
to make `AwaitingDownload` a valid state during processing and validation.
This PR explicitly adds the same logic for forward sync to download batches
stuck in `AwaitingDownload`.

Apart from that, we also force download of the `processing_target` when sync
stops progressing. This is required in cases where `self.batches` has >
`BATCH_BUFFER_SIZE` batches that are waiting to get processed but the
`processing_batch` has repeatedly failed at download/processing stage. This
leads to sync getting stuck and never recovering. 
--- .../network/src/sync/backfill_sync/mod.rs | 10 ++- .../network/src/sync/range_sync/chain.rs | 75 +++++++++++++++---- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index f00503ec63..d5a4e9b73a 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -687,11 +687,12 @@ impl BackFillSync { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(ProcessResult::Successful), + BatchState::Failed | BatchState::Processing(_) => { // these are all inconsistent states: // - Failed -> non recoverable batch. Chain should have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. // - Processing -> `self.current_processing_batch` is None self.fail_sync(BackFillError::InvalidSyncState(String::from( "Invalid expected batch state", @@ -790,7 +791,8 @@ impl BackFillSync { } } BatchState::Downloading(..) => {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { + BatchState::AwaitingDownload => return, + BatchState::Failed | BatchState::Poisoned => { crit!("batch indicates inconsistent chain state while advancing chain") } BatchState::AwaitingProcessing(..) 
=> {} diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 8907f7510f..a8c85e44d2 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -350,7 +350,10 @@ impl SyncingChain { return Ok(KeepChain); } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Processing(_) | BatchState::AwaitingDownload | BatchState::Failed => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Processing(_) | BatchState::Failed => { // these are all inconsistent states: // - Processing -> `self.current_processing_batch` is None // - Failed -> non recoverable batch. For an optimistic batch, it should @@ -384,7 +387,10 @@ impl SyncingChain { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Failed | BatchState::Processing(_) => { // these are all inconsistent states: // - Failed -> non recoverable batch. Chain should have been removed // - AwaitingDownload -> A recoverable failed batch should have been @@ -582,8 +588,8 @@ impl SyncingChain { BatchProcessResult::NonFaultyFailure => { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; - // Simply re-download the batch. - self.send_batch(network, batch_id) + // Simply re-download all batches in `AwaitingDownload` state. 
+ self.attempt_send_awaiting_download_batches(network, "non-faulty-failure") } } } @@ -717,6 +723,7 @@ impl SyncingChain { previous_start = %old_start, new_start = %self.start_epoch, processing_target = %self.processing_target, + id=%self.id, "Chain advanced" ); } @@ -753,7 +760,6 @@ impl SyncingChain { } // this is our robust `processing_target`. All previous batches must be awaiting // validation - let mut redownload_queue = Vec::new(); for (id, batch) in self.batches.range_mut(..batch_id) { if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { @@ -763,18 +769,14 @@ impl SyncingChain { failing_batch: *id, }); } - redownload_queue.push(*id); } // no batch maxed out it process attempts, so now the chain's volatile progress must be // reset self.processing_target = self.start_epoch; - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. - self.send_batch(network, batch_id) + // finally, re-request the failed batch and all other batches in `AwaitingDownload` state. + self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch") } pub fn stop_syncing(&mut self) { @@ -810,6 +812,9 @@ impl SyncingChain { // advance the chain to the new validating epoch self.advance_chain(network, validating_epoch); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers earlier + self.attempt_send_awaiting_download_batches(network, "start_syncing")?; if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target && !self.attempted_optimistic_starts.contains(&optimistic_epoch) @@ -939,6 +944,41 @@ impl SyncingChain { } } + /// Attempts to send all batches that are in `AwaitingDownload` state. + /// + /// Batches might get stuck in `AwaitingDownload` post peerdas because of lack of peers + /// in required subnets. We need to progress them if peers are available at a later point. 
+ pub fn attempt_send_awaiting_download_batches( + &mut self, + network: &mut SyncNetworkContext, + src: &str, + ) -> ProcessingResult { + // Collect all batches in AwaitingDownload state and see if they can be sent + let awaiting_downloads: Vec<_> = self + .batches + .iter() + .filter(|(_, batch)| matches!(batch.state(), BatchState::AwaitingDownload)) + .map(|(batch_id, _)| batch_id) + .copied() + .collect(); + debug!( + ?awaiting_downloads, + src, "Attempting to send batches awaiting downlaod" + ); + + for batch_id in awaiting_downloads { + if self.good_peers_on_sampling_subnets(batch_id, network) { + self.send_batch(network, batch_id)?; + } else { + debug!( + src = "attempt_send_awaiting_download_batches", + "Waiting for peers to be available on sampling column subnets" + ); + } + } + Ok(KeepChain) + } + /// Requests the batch assigned to the given id from a given peer. pub fn send_batch( &mut self, @@ -1089,14 +1129,16 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } - // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); + debug!( + src = "request_batches_optimistic", + "Waiting for peers to be available on sampling column subnets" + ); return Ok(KeepChain); } @@ -1105,6 +1147,8 @@ impl SyncingChain { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; + } else { + self.attempt_send_awaiting_download_batches(network, "request_batches_optimistic")?; } return Ok(KeepChain); } @@ -1179,7 +1223,10 @@ impl SyncingChain { // block and data column requests are currently coupled. 
This can be removed once we find a // way to decouple the requests and do retries individually, see issue #6258. if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); + debug!( + src = "include_next_batch", + "Waiting for peers to be available on custody column subnets" + ); return None; } From 677de70025b7b00268e445566203462c22a07b20 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 5 Sep 2025 05:53:38 +1000 Subject: [PATCH 14/81] Fix incorrect prune test logic (#7999) I just noticed that one of the tests i added in #7915 is incorrect, after it was running flaky for a bit. This PR fixes the scenario and ensure the outcome will always be the same. --- .../lighthouse_network/src/peer_manager/mod.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index efb86a5feb..e7c6f69242 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -2885,11 +2885,10 @@ mod tests { *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); // Create 12 peers: - // - 4 on custody subnet 0 - // - 3 on subnet 1 - //- 2 on subnet 2 - // - 3 scattered. - // Every 4th peer (0,4,8) is on sync committee 0. + // * 4 on custody subnet 0, all on sync committee 0 subnet as well (should only prune up to 2 peers) + // * 3 on subnet 1 + // * 2 on subnet 2 + // * 3 scattered. 
let mut peers = Vec::new(); for i in 0..12 { let peer = PeerId::random(); @@ -2901,7 +2900,7 @@ mod tests { 7..9 => 2, _ => i - 6, }; - let on_sync_committee = i % 4 == 0; + let on_sync_committee = i < 4; { let mut peers_db = peer_manager.network_globals.peers.write(); @@ -2949,10 +2948,10 @@ mod tests { .cloned() .collect(); - let sync_committee_peers = [&peers[0], &peers[4], &peers[8]]; + // only 2 peers should be pruned from the 4 peers in subnet 0. let remaining_sync_peers = connected_peers .iter() - .filter(|peer| sync_committee_peers.contains(peer)) + .filter(|peer| peers[0..4].contains(peer)) .count(); assert_eq!( remaining_sync_peers, 2, From 9d2f55a39984958e53ccd7110c2cb76eade6b442 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 5 Sep 2025 06:17:52 +1000 Subject: [PATCH 15/81] Fix data column reconstruction error (#7998) Addresses #7991 --- beacon_node/beacon_chain/benches/benches.rs | 2 +- .../src/data_column_verification.rs | 2 +- beacon_node/beacon_chain/src/kzg_utils.rs | 34 ++++++++++++++++--- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/beacon_node/beacon_chain/benches/benches.rs b/beacon_node/beacon_chain/benches/benches.rs index 5c56594317..d090fc35f7 100644 --- a/beacon_node/beacon_chain/benches/benches.rs +++ b/beacon_node/beacon_chain/benches/benches.rs @@ -55,7 +55,7 @@ fn all_benches(c: &mut Criterion) { b.iter(|| { black_box(reconstruct_data_columns( &kzg, - &column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2], + column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2].to_vec(), spec.as_ref(), )) }) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index fb88db1300..bc7778cc63 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -376,7 +376,7 @@ impl KzgVerifiedCustodyDataColumn { ) -> Result>, KzgError> { let all_data_columns = 
reconstruct_data_columns( kzg, - &partial_set_of_columns + partial_set_of_columns .iter() .map(|d| d.clone_arc()) .collect::>(), diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index 3063e78337..2147ed5966 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -365,14 +365,18 @@ pub fn reconstruct_blobs( /// Reconstruct all data columns from a subset of data column sidecars (requires at least 50%). pub fn reconstruct_data_columns( kzg: &Kzg, - data_columns: &[Arc>], + mut data_columns: Vec>>, spec: &ChainSpec, ) -> Result, KzgError> { + // Sort data columns by index to ensure ascending order for KZG operations + data_columns.sort_unstable_by_key(|dc| dc.index); + let first_data_column = data_columns .first() .ok_or(KzgError::InconsistentArrayLength( "data_columns should have at least one element".to_string(), ))?; + let num_of_blobs = first_data_column.kzg_commitments.len(); let blob_cells_and_proofs_vec = @@ -381,7 +385,7 @@ pub fn reconstruct_data_columns( .map(|row_index| { let mut cells: Vec = vec![]; let mut cell_ids: Vec = vec![]; - for data_column in data_columns { + for data_column in &data_columns { let cell = data_column.column.get(row_index).ok_or( KzgError::InconsistentArrayLength(format!( "Missing data column at row index {row_index}" @@ -433,6 +437,7 @@ mod test { test_build_data_columns_empty(&kzg, &spec); test_build_data_columns(&kzg, &spec); test_reconstruct_data_columns(&kzg, &spec); + test_reconstruct_data_columns_unordered(&kzg, &spec); test_reconstruct_blobs_from_data_columns(&kzg, &spec); test_validate_data_columns(&kzg, &spec); } @@ -505,7 +510,7 @@ mod test { #[track_caller] fn test_reconstruct_data_columns(kzg: &Kzg, spec: &ChainSpec) { - let num_of_blobs = 6; + let num_of_blobs = 2; let (signed_block, blobs, proofs) = create_test_fulu_block_and_blobs::(num_of_blobs, spec); let blob_refs = blobs.iter().collect::>(); @@ -516,7 +521,7 @@ mod 
test { // Now reconstruct let reconstructed_columns = reconstruct_data_columns( kzg, - &column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2], + column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2].to_vec(), spec, ) .unwrap(); @@ -526,6 +531,27 @@ mod test { } } + #[track_caller] + fn test_reconstruct_data_columns_unordered(kzg: &Kzg, spec: &ChainSpec) { + let num_of_blobs = 2; + let (signed_block, blobs, proofs) = + create_test_fulu_block_and_blobs::(num_of_blobs, spec); + let blob_refs = blobs.iter().collect::>(); + let column_sidecars = + blobs_to_data_column_sidecars(&blob_refs, proofs.to_vec(), &signed_block, kzg, spec) + .unwrap(); + + // Test reconstruction with columns in reverse order (non-ascending) + let mut subset_columns: Vec<_> = + column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2].to_vec(); + subset_columns.reverse(); // This would fail without proper sorting in reconstruct_data_columns + let reconstructed_columns = reconstruct_data_columns(kzg, subset_columns, spec).unwrap(); + + for i in 0..E::number_of_columns() { + assert_eq!(reconstructed_columns.get(i), column_sidecars.get(i), "{i}"); + } + } + #[track_caller] fn test_reconstruct_blobs_from_data_columns(kzg: &Kzg, spec: &ChainSpec) { let num_of_blobs = 6; From fd10b632740e441354c4547a12727c29dd288239 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 5 Sep 2025 07:54:30 +1000 Subject: [PATCH 16/81] Add co-author to mergify commits (#7993) * Add co-author to mergify commits. * Remove unnecessary pull request rules from mergify config. 
* Revert automation removals --- .github/mergify.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/mergify.yml b/.github/mergify.yml index 4ab73bcf07..0b917b2546 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -105,6 +105,10 @@ queue_rules: {{ body | get_section("## Proposed Changes", "") }} + + {% for commit in commits | unique(attribute='email_author') %} + Co-Authored-By: {{ commit.author }} <{{ commit.email_author }}> + {% endfor %} queue_conditions: - "#approved-reviews-by >= 1" - "check-success=license/cla" From 8ec2640e04db4ebfb65eeea2ed6afb85085493fc Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 4 Sep 2025 20:23:34 -0700 Subject: [PATCH 17/81] Don't penalize peers if locally constructed light client data is stale (#7996) #7994 We seem to be penalizing peers in situations where locally constructed light client data is stale. This PR ignores incoming light client data if our locally constructed light client data isn't up to date. Co-Authored-By: Eitan Seri-Levi --- .../src/light_client_finality_update_verification.rs | 6 ++++++ .../src/light_client_optimistic_update_verification.rs | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs b/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs index 0d5a5425d5..fe62b8ef90 100644 --- a/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs +++ b/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs @@ -116,7 +116,13 @@ impl VerifiedLightClientFinalityUpdate { // Verify that the gossiped finality update is the same as the locally constructed one. 
if latest_finality_update != rcv_finality_update { let signature_slot = latest_finality_update.signature_slot(); + if signature_slot != rcv_finality_update.signature_slot() { + // The locally constructed finality update is not up to date, probably + // because the node has fallen behind and needs to sync. + if rcv_finality_update.signature_slot() > signature_slot { + return Err(Error::Ignore); + } return Err(Error::MismatchedSignatureSlot { local: signature_slot, observed: rcv_finality_update.signature_slot(), diff --git a/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs b/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs index 4da6913443..b59390ea0c 100644 --- a/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs +++ b/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs @@ -118,6 +118,11 @@ impl VerifiedLightClientOptimisticUpdate { if latest_optimistic_update != rcv_optimistic_update { let signature_slot = latest_optimistic_update.signature_slot(); if signature_slot != rcv_optimistic_update.signature_slot() { + // The locally constructed optimistic update is not up to date, probably + // because the node has fallen behind and needs to sync. + if rcv_optimistic_update.signature_slot() > signature_slot { + return Err(Error::Ignore); + } return Err(Error::MismatchedSignatureSlot { local: signature_slot, observed: rcv_optimistic_update.signature_slot(), From ee734d145665e435f8ea0abfbead37ff61b38549 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 9 Sep 2025 16:18:05 +1000 Subject: [PATCH 18/81] Fix stuck data column lookups by improving peer selection and retry logic (#8005) Fixes the issue described in #7980 where Lighthouse repeatedly sends `DataColumnsByRoot` requests to the same peers that return empty responses, causing sync to get stuck. The root cause was we don't count empty responses as failures, leading to excessive retries to unresponsive peers. 
- Track per peer attempts to limit retry attempts per peer (`MAX_CUSTODY_PEER_ATTEMPTS = 3`) - Replaced random peer selection with hashing within each lookup to prevent splitting lookup into too many small requests and improve request batching efficiency. - Added `single_block_lookup` root span to track all lookups created and added more debug logs: image Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- beacon_node/lighthouse_tracing/src/lib.rs | 4 +- .../network/src/sync/block_lookups/mod.rs | 1 + .../sync/block_lookups/single_block_lookup.rs | 12 ++ .../network/src/sync/network_context.rs | 10 +- .../src/sync/network_context/custody.rs | 153 +++++++++++------- consensus/types/src/data_column_subnet_id.rs | 4 +- 6 files changed, 117 insertions(+), 67 deletions(-) diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index ffbad1364c..d31df4e3dd 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -17,6 +17,8 @@ pub const SPAN_PROCESS_GOSSIP_BLOCK: &str = "process_gossip_block"; /// Sync methods root spans pub const SPAN_SYNCING_CHAIN: &str = "syncing_chain"; pub const SPAN_OUTGOING_RANGE_REQUEST: &str = "outgoing_range_request"; +pub const SPAN_SINGLE_BLOCK_LOOKUP: &str = "single_block_lookup"; +pub const SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST: &str = "outgoing_block_by_root_request"; pub const SPAN_OUTGOING_CUSTODY_REQUEST: &str = "outgoing_custody_request"; pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block"; pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; @@ -46,7 +48,7 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ SPAN_PROCESS_GOSSIP_BLOB, SPAN_PROCESS_GOSSIP_BLOCK, SPAN_OUTGOING_RANGE_REQUEST, - SPAN_OUTGOING_CUSTODY_REQUEST, + SPAN_SINGLE_BLOCK_LOOKUP, SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs 
b/beacon_node/network/src/sync/block_lookups/mod.rs index e9f24697ac..b60c21972f 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -384,6 +384,7 @@ impl BlockLookups { // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve), // signal here to hold processing downloaded data. let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent); + let _guard = lookup.span.clone().entered(); // Add block components to the new request if let Some(block_component) = block_component { diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 30947cf1f0..36509d2563 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -7,6 +7,7 @@ use crate::sync::network_context::{ use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; use derivative::Derivative; use lighthouse_network::service::api_types::Id; +use lighthouse_tracing::SPAN_SINGLE_BLOCK_LOOKUP; use parking_lot::RwLock; use std::collections::HashSet; use std::fmt::Debug; @@ -14,6 +15,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use store::Hash256; use strum::IntoStaticStr; +use tracing::{Span, debug_span}; use types::blob_sidecar::FixedBlobSidecarList; use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot}; @@ -70,6 +72,7 @@ pub struct SingleBlockLookup { block_root: Hash256, awaiting_parent: Option, created: Instant, + pub(crate) span: Span, } #[derive(Debug)] @@ -89,6 +92,12 @@ impl SingleBlockLookup { id: Id, awaiting_parent: Option, ) -> Self { + let lookup_span = debug_span!( + SPAN_SINGLE_BLOCK_LOOKUP, + block_root = %requested_block_root, + id = id, + ); + Self { id, block_request_state: BlockRequestState::new(requested_block_root), @@ -97,6 +106,7 @@ impl 
SingleBlockLookup { block_root: requested_block_root, awaiting_parent, created: Instant::now(), + span: lookup_span, } } @@ -192,6 +202,7 @@ impl SingleBlockLookup { &mut self, cx: &mut SyncNetworkContext, ) -> Result { + let _guard = self.span.clone().entered(); // TODO: Check what's necessary to download, specially for blobs self.continue_request::>(cx, 0)?; @@ -257,6 +268,7 @@ impl SingleBlockLookup { // that can make progress so it must be dropped. Consider the lookup completed. // This case can happen if we receive the components from gossip during a retry. if self.all_components_processed() { + self.span = Span::none(); Ok(LookupResult::Completed) } else { Ok(LookupResult::Pending) diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 07462a01fe..17a4295700 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -29,7 +29,7 @@ use lighthouse_network::service::api_types::{ DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; -use lighthouse_tracing::SPAN_OUTGOING_RANGE_REQUEST; +use lighthouse_tracing::{SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST, SPAN_OUTGOING_RANGE_REQUEST}; use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ @@ -886,6 +886,11 @@ impl SyncNetworkContext { "Sync RPC request sent" ); + let request_span = debug_span!( + parent: Span::current(), + SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST, + %block_root, + ); self.blocks_by_root_requests.insert( id, peer_id, @@ -893,8 +898,7 @@ impl SyncNetworkContext { // block and the peer must have it. 
true, BlocksByRootRequestItems::new(request), - // Not implemented - Span::none(), + request_span, ); Ok(LookupRequestResult::RequestSent(id.req_id)) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index d973e83cea..71e002cc42 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -7,19 +7,17 @@ use fnv::FnvHashMap; use lighthouse_network::PeerId; use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST; -use lru_cache::LRUTimeCache; use parking_lot::RwLock; -use rand::Rng; use std::collections::HashSet; +use std::hash::{BuildHasher, RandomState}; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; -use tracing::{Span, debug, debug_span, field, warn}; +use tracing::{Span, debug, debug_span, warn}; use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex}; use types::{DataColumnSidecarList, EthSpec}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; -const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); pub struct ActiveCustodyRequest { @@ -30,9 +28,7 @@ pub struct ActiveCustodyRequest { /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, - /// Peers that have recently failed to successfully respond to a columns by root request. - /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - failed_peers: LRUTimeCache, + peer_attempts: HashMap, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, /// Span for tracing the lifetime of this request. 
@@ -71,7 +67,11 @@ impl ActiveCustodyRequest { column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { - let span = debug_span!(parent: None, SPAN_OUTGOING_CUSTODY_REQUEST, %block_root); + let span = debug_span!( + parent: Span::current(), + SPAN_OUTGOING_CUSTODY_REQUEST, + %block_root, + ); Self { block_root, custody_id, @@ -81,7 +81,7 @@ impl ActiveCustodyRequest { .map(|index| (*index, ColumnRequest::new())), ), active_batch_columns_requests: <_>::default(), - failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), + peer_attempts: HashMap::new(), lookup_peers, span, _phantom: PhantomData, @@ -170,13 +170,6 @@ impl ActiveCustodyRequest { ?missing_column_indexes, "Custody column peer claims to not have some data" ); - - batch_request.span.record( - "missing_column_indexes", - field::debug(missing_column_indexes), - ); - - self.failed_peers.insert(peer_id); } } Err(err) => { @@ -195,13 +188,6 @@ impl ActiveCustodyRequest { .ok_or(Error::BadState("unknown column_index".to_owned()))? 
.on_download_error_and_mark_failure(req_id)?; } - - batch_request.span.record( - "missing_column_indexes", - field::debug(&batch_request.indices), - ); - - self.failed_peers.insert(peer_id); } }; @@ -238,52 +224,29 @@ impl ActiveCustodyRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); + // Create deterministic hasher per request to ensure consistent peer ordering within + // this request (avoiding fragmentation) while varying selection across different requests + let random_state = RandomState::new(); - // Need to: - // - track how many active requests a peer has for load balancing - // - which peers have failures to attempt others - // - which peer returned what to have PeerGroup attributability - - for (column_index, request) in self.column_requests.iter_mut() { + for (column_index, request) in self.column_requests.iter() { if let Some(wait_duration) = request.is_awaiting_download() { + // Note: an empty response is considered a successful response, so we may end up + // retrying many more times than `MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS`. if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { return Err(Error::TooManyFailures); } - // TODO(das): When is a fork and only a subset of your peers know about a block, we should - // only query the peers on that fork. Should this case be handled? How to handle it? - let custodial_peers = cx.get_custodial_peers(*column_index); + let peer_to_request = self.select_column_peer( + cx, + &active_request_count_by_peer, + &lookup_peers, + *column_index, + &random_state, + ); - // We draw from the total set of peers, but prioritize those peers who we have - // received an attestation / status / block message claiming to have imported the - // lookup. 
The frequency of those messages is low, so drawing only from lookup_peers - // could cause many lookups to take much longer or fail as they don't have enough - // custody peers on a given column - let mut priorized_peers = custodial_peers - .iter() - .map(|peer| { - ( - // Prioritize peers that claim to know have imported this block - if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently - self.failed_peers.contains(peer), - // Prefer peers with fewer requests to load balance across peers. - // We batch requests to the same peer, so count existence in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::rng().random::(), - *peer, - ) - }) - .collect::>(); - priorized_peers.sort_unstable(); - - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some(peer_id) = peer_to_request { columns_to_request_by_peer - .entry(*peer_id) + .entry(peer_id) .or_default() .push(*column_index); } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { @@ -298,6 +261,23 @@ impl ActiveCustodyRequest { } } + let peer_requests = columns_to_request_by_peer.len(); + if peer_requests > 0 { + let columns_requested_count = columns_to_request_by_peer + .values() + .map(|v| v.len()) + .sum::(); + debug!( + lookup_peers = lookup_peers.len(), + "Requesting {} columns from {} peers", columns_requested_count, peer_requests, + ); + } else { + debug!( + lookup_peers = lookup_peers.len(), + "No column peers found for look up", + ); + } + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx .data_column_lookup_request( @@ -317,8 +297,14 @@ impl ActiveCustodyRequest { match request_result { LookupRequestResult::RequestSent(req_id) => { + 
*self.peer_attempts.entry(peer_id).or_insert(0) += 1; + let client = cx.network_globals().client(&peer_id).kind; - let batch_columns_req_span = debug_span!("batch_columns_req", %peer_id, %client, missing_column_indexes = tracing::field::Empty); + let batch_columns_req_span = debug_span!( + "batch_columns_req", + %peer_id, + %client, + ); let _guard = batch_columns_req_span.clone().entered(); for column_index in &indices { let column_request = self @@ -345,11 +331,54 @@ impl ActiveCustodyRequest { Ok(None) } + + fn select_column_peer( + &self, + cx: &mut SyncNetworkContext, + active_request_count_by_peer: &HashMap, + lookup_peers: &HashSet, + column_index: ColumnIndex, + random_state: &RandomState, + ) -> Option { + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation or a block from (`lookup_peers`), as the `lookup_peers` may take + // time to build up and we are likely to not find any column peers initially. + let custodial_peers = cx.get_custodial_peers(column_index); + let mut prioritized_peers = custodial_peers + .iter() + .filter(|peer| { + // Exclude peers that we have already made too many attempts to. + self.peer_attempts.get(peer).copied().unwrap_or(0) <= MAX_CUSTODY_PEER_ATTEMPTS + }) + .map(|peer| { + ( + // Prioritize peers that claim to know have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // De-prioritize peers that we have already attempted to download from + self.peer_attempts.get(peer).copied().unwrap_or(0), + // Prefer peers with fewer requests to load balance across peers. + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // The hash ensures consistent peer ordering within this request + // to avoid fragmentation while varying selection across different requests. 
+ random_state.hash_one(peer), + *peer, + ) + }) + .collect::>(); + prioritized_peers.sort_unstable(); + + prioritized_peers + .first() + .map(|(_, _, _, _, peer_id)| *peer_id) + } } /// TODO(das): this attempt count is nested into the existing lookup request count. const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; +/// Max number of attempts to request custody columns from a single peer. +const MAX_CUSTODY_PEER_ATTEMPTS: usize = 3; + struct ColumnRequest { status: Status, download_failures: usize, diff --git a/consensus/types/src/data_column_subnet_id.rs b/consensus/types/src/data_column_subnet_id.rs index 125a77fc1e..4061cb4fdb 100644 --- a/consensus/types/src/data_column_subnet_id.rs +++ b/consensus/types/src/data_column_subnet_id.rs @@ -1,13 +1,15 @@ //! Identifies each data column subnet by an integer identifier. use crate::ChainSpec; use crate::data_column_sidecar::ColumnIndex; +use derivative::Derivative; use safe_arith::{ArithError, SafeArith}; use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; use std::ops::{Deref, DerefMut}; #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Copy, Derivative, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derivative(Debug = "transparent")] #[serde(transparent)] pub struct DataColumnSubnetId(#[serde(with = "serde_utils::quoted_u64")] u64); From 2b22903fbaa1a18eac9231fe0d172c78ba142181 Mon Sep 17 00:00:00 2001 From: Odinson Date: Tue, 9 Sep 2025 13:39:03 +0530 Subject: [PATCH 19/81] fix: extra fields in logs (#8009) Potentially fixes #7995 changed `span_data` to a `HashMap` and added a new check to remove span fields whose base names are already present on the event. 
Co-Authored-By: PoulavBhowmick03 Co-Authored-By: Michael Sproul --- common/logging/src/tracing_logging_layer.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/logging/src/tracing_logging_layer.rs b/common/logging/src/tracing_logging_layer.rs index 27841cb7d8..923ac1758f 100644 --- a/common/logging/src/tracing_logging_layer.rs +++ b/common/logging/src/tracing_logging_layer.rs @@ -80,12 +80,11 @@ where event.record(&mut visitor); let mut span_data = Vec::new(); - if let Some(scope) = ctx.event_scope(event) { - for span in scope.from_root() { - if let Some(data) = span.extensions().get::() { - span_data.extend(data.fields.clone()); - } - } + if let Some(mut scope) = ctx.event_scope(event) + && let Some(span) = scope.next() + && let Some(data) = span.extensions().get::() + { + span_data.extend(data.fields.clone()); } // Remove ascii control codes from message. From 8a4f6cf0d5b6b261b2c3439ce7c05383a53d30c5 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 10 Sep 2025 13:30:51 +1000 Subject: [PATCH 20/81] Instrument tracing on block production code path (#8017) Partially #7814. Instrument block production code path. 
New root spans: * `produce_block_v3` * `produce_block_v2` Example traces: image Co-Authored-By: Jimmy Chen --- beacon_node/beacon_chain/src/beacon_chain.rs | 112 +++++++++++------- .../beacon_chain/src/execution_payload.rs | 8 +- beacon_node/execution_layer/src/lib.rs | 7 +- beacon_node/http_api/src/produce_block.rs | 12 ++ beacon_node/lighthouse_tracing/src/lib.rs | 9 +- consensus/fork_choice/src/fork_choice.rs | 1 + .../src/per_slot_processing.rs | 2 +- 7 files changed, 101 insertions(+), 50 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index b8a6529653..6e11b66610 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -1437,6 +1437,7 @@ impl BeaconChain { /// /// Returns `None` when the state is not found in the database or there is an error skipping /// to a future state. + #[instrument(level = "debug", skip_all)] pub fn state_at_slot( &self, slot: Slot, @@ -4466,6 +4467,7 @@ impl BeaconChain { } /// If configured, wait for the fork choice run at the start of the slot to complete. + #[instrument(level = "debug", skip_all)] fn wait_for_fork_choice_before_block_production( self: &Arc, slot: Slot, @@ -4528,10 +4530,15 @@ impl BeaconChain { // // Load the parent state from disk. let chain = self.clone(); + let span = Span::current(); let (state, state_root_opt) = self .task_executor .spawn_blocking_handle( - move || chain.load_state_for_block_production(slot), + move || { + let _guard = + debug_span!(parent: span, "load_state_for_block_production").entered(); + chain.load_state_for_block_production(slot) + }, "load_state_for_block_production", ) .ok_or(BlockProductionError::ShuttingDown)? @@ -4618,6 +4625,7 @@ impl BeaconChain { /// Fetch the beacon state to use for producing a block if a 1-slot proposer re-org is viable. /// /// This function will return `None` if proposer re-orgs are disabled. 
+ #[instrument(skip_all, level = "debug")] fn get_state_for_re_org( &self, slot: Slot, @@ -5072,6 +5080,7 @@ impl BeaconChain { /// equal to the root of `state`. Providing this value will serve as an optimization to avoid /// performing a tree hash in some scenarios. #[allow(clippy::too_many_arguments)] + #[instrument(level = "debug", skip_all)] pub async fn produce_block_on_state( self: &Arc, state: BeaconState, @@ -5091,10 +5100,13 @@ impl BeaconChain { .graffiti_calculator .get_graffiti(validator_graffiti) .await; + let span = Span::current(); let mut partial_beacon_block = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "produce_partial_beacon_block").entered(); chain.produce_partial_beacon_block( state, state_root_opt, @@ -5130,10 +5142,14 @@ impl BeaconChain { match block_contents_type { BlockProposalContentsType::Full(block_contents) => { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "complete_partial_beacon_block") + .entered(); chain.complete_partial_beacon_block( partial_beacon_block, Some(block_contents), @@ -5150,10 +5166,14 @@ impl BeaconChain { } BlockProposalContentsType::Blinded(block_contents) => { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "complete_partial_beacon_block") + .entered(); chain.complete_partial_beacon_block( partial_beacon_block, Some(block_contents), @@ -5171,10 +5191,13 @@ impl BeaconChain { } } else { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "complete_partial_beacon_block").entered(); chain.complete_partial_beacon_block( partial_beacon_block, None, @@ -5276,51 +5299,54 @@ impl 
BeaconChain { // Iterate through the naive aggregation pool and ensure all the attestations from there // are included in the operation pool. - let unagg_import_timer = - metrics::start_timer(&metrics::BLOCK_PRODUCTION_UNAGGREGATED_TIMES); - for attestation in self.naive_aggregation_pool.read().iter() { - let import = |attestation: &Attestation| { - let attesting_indices = - get_attesting_indices_from_state(&state, attestation.to_ref())?; - self.op_pool - .insert_attestation(attestation.clone(), attesting_indices) - }; - if let Err(e) = import(attestation) { - // Don't stop block production if there's an error, just create a log. - error!( - reason = ?e, - "Attestation did not transfer to op pool" - ); + { + let _guard = debug_span!("import_naive_aggregation_pool").entered(); + let _unagg_import_timer = + metrics::start_timer(&metrics::BLOCK_PRODUCTION_UNAGGREGATED_TIMES); + for attestation in self.naive_aggregation_pool.read().iter() { + let import = |attestation: &Attestation| { + let attesting_indices = + get_attesting_indices_from_state(&state, attestation.to_ref())?; + self.op_pool + .insert_attestation(attestation.clone(), attesting_indices) + }; + if let Err(e) = import(attestation) { + // Don't stop block production if there's an error, just create a log. + error!( + reason = ?e, + "Attestation did not transfer to op pool" + ); + } } - } - drop(unagg_import_timer); - - let attestation_packing_timer = - metrics::start_timer(&metrics::BLOCK_PRODUCTION_ATTESTATION_TIMES); - - // Epoch cache and total balance cache are required for op pool packing. 
- state.build_total_active_balance_cache(&self.spec)?; - initialize_epoch_cache(&mut state, &self.spec)?; - - let mut prev_filter_cache = HashMap::new(); - let prev_attestation_filter = |att: &CompactAttestationRef| { - self.filter_op_pool_attestation(&mut prev_filter_cache, att, &state) - }; - let mut curr_filter_cache = HashMap::new(); - let curr_attestation_filter = |att: &CompactAttestationRef| { - self.filter_op_pool_attestation(&mut curr_filter_cache, att, &state) }; - let mut attestations = self - .op_pool - .get_attestations( - &state, - prev_attestation_filter, - curr_attestation_filter, - &self.spec, - ) - .map_err(BlockProductionError::OpPoolError)?; - drop(attestation_packing_timer); + let mut attestations = { + let _guard = debug_span!("pack_attestations").entered(); + let _attestation_packing_timer = + metrics::start_timer(&metrics::BLOCK_PRODUCTION_ATTESTATION_TIMES); + + // Epoch cache and total balance cache are required for op pool packing. + state.build_total_active_balance_cache(&self.spec)?; + initialize_epoch_cache(&mut state, &self.spec)?; + + let mut prev_filter_cache = HashMap::new(); + let prev_attestation_filter = |att: &CompactAttestationRef| { + self.filter_op_pool_attestation(&mut prev_filter_cache, att, &state) + }; + let mut curr_filter_cache = HashMap::new(); + let curr_attestation_filter = |att: &CompactAttestationRef| { + self.filter_op_pool_attestation(&mut curr_filter_cache, att, &state) + }; + + self.op_pool + .get_attestations( + &state, + prev_attestation_filter, + curr_attestation_filter, + &self.spec, + ) + .map_err(BlockProductionError::OpPoolError)? + }; // If paranoid mode is enabled re-check the signatures of every included message. 
// This will be a lot slower but guards against bugs in block production and can be diff --git a/beacon_node/beacon_chain/src/execution_payload.rs b/beacon_node/beacon_chain/src/execution_payload.rs index 697fee351e..f0cab06ca3 100644 --- a/beacon_node/beacon_chain/src/execution_payload.rs +++ b/beacon_node/beacon_chain/src/execution_payload.rs @@ -24,7 +24,7 @@ use state_processing::per_block_processing::{ }; use std::sync::Arc; use tokio::task::JoinHandle; -use tracing::{debug, warn}; +use tracing::{Instrument, debug, debug_span, warn}; use tree_hash::TreeHash; use types::payload::BlockProductionVersion; use types::*; @@ -403,8 +403,9 @@ pub fn get_execution_payload( block_production_version, ) .await - }, - "get_execution_payload", + } + .instrument(debug_span!("prepare_execution_payload")), + "prepare_execution_payload", ) .ok_or(BlockProductionError::ShuttingDown)?; @@ -503,6 +504,7 @@ where }, "prepare_execution_payload_forkchoice_update_params", ) + .instrument(debug_span!("forkchoice_update_params")) .await .map_err(|e| BlockProductionError::BeaconChain(Box::new(e)))?; diff --git a/beacon_node/execution_layer/src/lib.rs b/beacon_node/execution_layer/src/lib.rs index b53c4cde4e..5b48b81aa6 100644 --- a/beacon_node/execution_layer/src/lib.rs +++ b/beacon_node/execution_layer/src/lib.rs @@ -43,7 +43,7 @@ use tokio::{ time::sleep, }; use tokio_stream::wrappers::WatchStream; -use tracing::{debug, error, info, warn}; +use tracing::{Instrument, debug, debug_span, error, info, instrument, warn}; use tree_hash::TreeHash; use types::beacon_block_body::KzgCommitments; use types::builder_bid::BuilderBid; @@ -851,6 +851,7 @@ impl ExecutionLayer { } /// Returns the fee-recipient address that should be used to build a block + #[instrument(level = "debug", skip_all)] pub async fn get_suggested_fee_recipient(&self, proposer_index: u64) -> Address { if let Some(preparation_data_entry) = self.proposer_preparation_data().await.get(&proposer_index) @@ -875,6 +876,7 @@ impl 
ExecutionLayer { } } + #[instrument(level = "debug", skip_all)] pub async fn get_proposer_gas_limit(&self, proposer_index: u64) -> Option { self.proposer_preparation_data() .await @@ -891,6 +893,7 @@ impl ExecutionLayer { /// /// The result will be returned from the first node that returns successfully. No more nodes /// will be contacted. + #[instrument(level = "debug", skip_all)] pub async fn get_payload( &self, payload_parameters: PayloadParameters<'_>, @@ -996,6 +999,7 @@ impl ExecutionLayer { timed_future(metrics::GET_BLINDED_PAYLOAD_BUILDER, async { builder .get_builder_header::(slot, parent_hash, pubkey) + .instrument(debug_span!("get_builder_header")) .await }), timed_future(metrics::GET_BLINDED_PAYLOAD_LOCAL, async { @@ -1237,6 +1241,7 @@ impl ExecutionLayer { .await } + #[instrument(level = "debug", skip_all)] async fn get_full_payload_with( &self, payload_parameters: PayloadParameters<'_>, diff --git a/beacon_node/http_api/src/produce_block.rs b/beacon_node/http_api/src/produce_block.rs index 932fb00179..367e09969b 100644 --- a/beacon_node/http_api/src/produce_block.rs +++ b/beacon_node/http_api/src/produce_block.rs @@ -10,8 +10,10 @@ use beacon_chain::{ BeaconBlockResponseWrapper, BeaconChain, BeaconChainTypes, ProduceBlockVerification, }; use eth2::types::{self as api_types, ProduceBlockV3Metadata, SkipRandaoVerification}; +use lighthouse_tracing::{SPAN_PRODUCE_BLOCK_V2, SPAN_PRODUCE_BLOCK_V3}; use ssz::Encode; use std::sync::Arc; +use tracing::instrument; use types::{payload::BlockProductionVersion, *}; use warp::{ Reply, @@ -40,6 +42,11 @@ pub fn get_randao_verification( Ok(randao_verification) } +#[instrument( + name = SPAN_PRODUCE_BLOCK_V3, + skip_all, + fields(%slot) +)] pub async fn produce_block_v3( accept_header: Option, chain: Arc>, @@ -155,6 +162,11 @@ pub async fn produce_blinded_block_v2( build_response_v2(chain, block_response_type, accept_header) } +#[instrument( + name = SPAN_PRODUCE_BLOCK_V2, + skip_all, + fields(%slot) +)] pub async fn 
produce_block_v2( accept_header: Option, chain: Arc>, diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index d31df4e3dd..1787399761 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -3,7 +3,9 @@ //! TODO: These span identifiers will be used to implement selective tracing export (to be implemented), //! where only the listed root spans and their descendants will be exported to the tracing backend. -/// Root span name for publish_block +/// Root span names for block production and publishing +pub const SPAN_PRODUCE_BLOCK_V2: &str = "produce_block_v2"; +pub const SPAN_PRODUCE_BLOCK_V3: &str = "produce_block_v3"; pub const SPAN_PUBLISH_BLOCK: &str = "publish_block"; /// Data Availability checker span identifiers @@ -42,11 +44,14 @@ pub const SPAN_HANDLE_LIGHT_CLIENT_FINALITY_UPDATE: &str = "handle_light_client_ /// Only these spans and their descendants will be processed to reduce noise from /// uninstrumented code paths. New root spans must be added to this list to be traced. pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ - SPAN_SYNCING_CHAIN, + SPAN_PRODUCE_BLOCK_V2, + SPAN_PRODUCE_BLOCK_V3, + SPAN_PUBLISH_BLOCK, SPAN_PENDING_COMPONENTS, SPAN_PROCESS_GOSSIP_DATA_COLUMN, SPAN_PROCESS_GOSSIP_BLOB, SPAN_PROCESS_GOSSIP_BLOCK, + SPAN_SYNCING_CHAIN, SPAN_OUTGOING_RANGE_REQUEST, SPAN_SINGLE_BLOCK_LOOKUP, SPAN_PROCESS_RPC_BLOCK, diff --git a/consensus/fork_choice/src/fork_choice.rs b/consensus/fork_choice/src/fork_choice.rs index 19f294d439..fe1f5fba9e 100644 --- a/consensus/fork_choice/src/fork_choice.rs +++ b/consensus/fork_choice/src/fork_choice.rs @@ -523,6 +523,7 @@ where /// /// You *must* call `get_head` for the proposal slot prior to calling this function and pass /// in the result of `get_head` as `canonical_head`. 
+ #[instrument(level = "debug", skip_all)] pub fn get_proposer_head( &self, current_slot: Slot, diff --git a/consensus/state_processing/src/per_slot_processing.rs b/consensus/state_processing/src/per_slot_processing.rs index 04b1e8148f..8695054e1e 100644 --- a/consensus/state_processing/src/per_slot_processing.rs +++ b/consensus/state_processing/src/per_slot_processing.rs @@ -26,7 +26,7 @@ impl From for Error { /// If the root of the supplied `state` is known, then it can be passed as `state_root`. If /// `state_root` is `None`, the root of `state` will be computed using a cached tree hash. /// Providing the `state_root` makes this function several orders of magnitude faster. -#[instrument(skip_all)] +#[instrument(level = "debug", skip_all)] pub fn per_slot_processing( state: &mut BeaconState, state_root: Option, From 811eccdf3434e2491b89703affdc88141110a439 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 10 Sep 2025 14:59:22 +1000 Subject: [PATCH 21/81] Reduce noise in `Debug` impl of `RuntimeVariableList` (#8007) The default debug output of these types contains a lot of unnecessary noise making it hard to read. This PR removes the type and extra fields from debug output to make logs easier to read. `len` could be potentially useful in some cases, but this gives us flexibility to only log it separately if we need it. Related PR in `ssz_types`: - https://github.com/sigp/ssz_types/pull/57 Co-Authored-By: Jimmy Chen --- consensus/types/src/runtime_fixed_vector.rs | 11 ++++++++++- consensus/types/src/runtime_var_list.rs | 10 +++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/consensus/types/src/runtime_fixed_vector.rs b/consensus/types/src/runtime_fixed_vector.rs index 2b08b7bf70..f562322a3d 100644 --- a/consensus/types/src/runtime_fixed_vector.rs +++ b/consensus/types/src/runtime_fixed_vector.rs @@ -2,12 +2,21 @@ //! //! The length of the list cannot be changed once it is set. 
-#[derive(Clone, Debug)] +use std::fmt; +use std::fmt::Debug; + +#[derive(Clone)] pub struct RuntimeFixedVector { vec: Vec, len: usize, } +impl Debug for RuntimeFixedVector { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} (len={})", self.vec, self.len) + } +} + impl RuntimeFixedVector { pub fn new(vec: Vec) -> Self { let len = vec.len(); diff --git a/consensus/types/src/runtime_var_list.rs b/consensus/types/src/runtime_var_list.rs index dcb98538b7..d57c65b1b7 100644 --- a/consensus/types/src/runtime_var_list.rs +++ b/consensus/types/src/runtime_var_list.rs @@ -4,6 +4,8 @@ use serde::de::Error as DeError; use serde::{Deserialize, Deserializer, Serialize}; use ssz::Decode; use ssz_types::Error; +use std::fmt; +use std::fmt::Debug; use std::ops::{Deref, Index, IndexMut}; use std::slice::SliceIndex; use tree_hash::{Hash256, MerkleHasher, PackedEncoding, TreeHash, TreeHashType}; @@ -42,7 +44,7 @@ use tree_hash::{Hash256, MerkleHasher, PackedEncoding, TreeHash, TreeHashType}; /// assert!(long.push(6).is_err()); /// /// ``` -#[derive(Debug, Clone, Serialize, Deserialize, Derivative)] +#[derive(Clone, Serialize, Deserialize, Derivative)] #[derivative(PartialEq, Eq, Hash(bound = "T: std::hash::Hash"))] #[serde(transparent)] pub struct RuntimeVariableList { @@ -51,6 +53,12 @@ pub struct RuntimeVariableList { max_len: usize, } +impl Debug for RuntimeVariableList { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} (max_len={})", self.vec, self.max_len) + } +} + impl RuntimeVariableList { /// Returns `Ok` if the given `vec` equals the fixed length of `Self`. Otherwise returns /// `Err(OutOfBounds { .. })`. 
From 38205192caaf538d3ba6b1cb800775f5742615de Mon Sep 17 00:00:00 2001 From: hopinheimer <48147533+hopinheimer@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:16:48 +0530 Subject: [PATCH 22/81] Fix http api tests ci (#7943) Co-Authored-By: Jimmy Chen Co-Authored-By: Michael Sproul Co-Authored-By: Michael Sproul Co-Authored-By: hopinheimer --- Makefile | 6 +- .../beacon_chain/src/blob_verification.rs | 4 +- beacon_node/beacon_chain/src/test_utils.rs | 69 ++- beacon_node/builder_client/src/lib.rs | 6 +- beacon_node/execution_layer/src/lib.rs | 9 +- .../src/test_utils/mock_builder.rs | 237 ++++++---- beacon_node/http_api/src/lib.rs | 13 +- beacon_node/http_api/src/test_utils.rs | 58 ++- .../tests/broadcast_validation_tests.rs | 416 ++++++++++++------ beacon_node/http_api/tests/fork_tests.rs | 1 + .../http_api/tests/interactive_tests.rs | 3 + beacon_node/http_api/tests/tests.rs | 116 +++-- .../gossip_methods.rs | 2 +- common/eth2/src/lib.rs | 101 +++-- .../validator_services/src/block_service.rs | 5 +- 15 files changed, 709 insertions(+), 337 deletions(-) diff --git a/Makefile b/Makefile index 66d0b68268..475d3aac8a 100644 --- a/Makefile +++ b/Makefile @@ -193,11 +193,11 @@ test-beacon-chain: $(patsubst %,test-beacon-chain-%,$(FORKS)) test-beacon-chain-%: env FORK_NAME=$* cargo nextest run --release --features "fork_from_env,slasher/lmdb,$(TEST_FEATURES)" -p beacon_chain -# Run the tests in the `beacon_chain` crate for all known forks. -test-http-api: $(patsubst %,test-beacon-chain-%,$(RECENT_FORKS)) +# Run the tests in the `http_api` crate for recent forks. +test-http-api: $(patsubst %,test-http-api-%,$(RECENT_FORKS)) test-http-api-%: - env FORK_NAME=$* cargo nextest run --release --features "fork_from_env,slasher/lmdb,$(TEST_FEATURES)" -p http_api + env FORK_NAME=$* cargo nextest run --release --features "beacon_chain/fork_from_env" -p http_api # Run the tests in the `operation_pool` crate for all known forks. 
diff --git a/beacon_node/beacon_chain/src/blob_verification.rs b/beacon_node/beacon_chain/src/blob_verification.rs index 2ba20d5a82..53676c0b24 100644 --- a/beacon_node/beacon_chain/src/blob_verification.rs +++ b/beacon_node/beacon_chain/src/blob_verification.rs @@ -96,7 +96,7 @@ pub enum GossipBlobError { /// ## Peer scoring /// /// We cannot process the blob without validating its parent, the peer isn't necessarily faulty. - BlobParentUnknown { parent_root: Hash256 }, + ParentUnknown { parent_root: Hash256 }, /// Invalid kzg commitment inclusion proof /// ## Peer scoring @@ -474,7 +474,7 @@ pub fn validate_blob_sidecar_for_gossip impl futures::Future + 'static { + strict_registrations: bool, + apply_operations: bool, + broadcast_to_bn: bool, + ) -> impl futures::Future + use { let mock_el = self .mock_execution_layer .as_ref() @@ -727,6 +730,9 @@ where let (mock_builder, (addr, mock_builder_server)) = MockBuilder::new_for_testing( mock_el_url, beacon_url, + strict_registrations, + apply_operations, + broadcast_to_bn, self.spec.clone(), self.runtime.task_executor.clone(), ); @@ -903,8 +909,65 @@ where state: BeaconState, slot: Slot, ) -> (SignedBlindedBeaconBlock, BeaconState) { - let (unblinded, new_state) = self.make_block(state, slot).await; - ((*unblinded.0).clone().into(), new_state) + self.make_blinded_block_with_modifier(state, slot, |_| {}) + .await + } + + pub async fn make_blinded_block_with_modifier( + &self, + mut state: BeaconState, + slot: Slot, + block_modifier: impl FnOnce(&mut BlindedBeaconBlock), + ) -> (SignedBlindedBeaconBlock, BeaconState) { + assert_ne!(slot, 0, "can't produce a block at slot 0"); + assert!(slot >= state.slot()); + + complete_state_advance(&mut state, None, slot, &self.spec) + .expect("should be able to advance state to slot"); + + state.build_caches(&self.spec).expect("should build caches"); + + let proposer_index = state.get_beacon_proposer_index(slot, &self.spec).unwrap(); + + // If we produce two blocks for the same slot, 
they hash up to the same value and + // BeaconChain errors out with `DuplicateFullyImported`. Vary the graffiti so that we produce + // different blocks each time. + let graffiti = Graffiti::from(self.rng.lock().random::<[u8; 32]>()); + + let randao_reveal = self.sign_randao_reveal(&state, proposer_index, slot); + + // Always use the builder, so that we produce a "real" blinded payload. + let builder_boost_factor = Some(u64::MAX); + + let BeaconBlockResponseWrapper::Blinded(block_response) = self + .chain + .produce_block_on_state( + state, + None, + slot, + randao_reveal, + Some(graffiti), + ProduceBlockVerification::VerifyRandao, + builder_boost_factor, + BlockProductionVersion::V3, + ) + .await + .unwrap() + else { + panic!("Should always be a blinded payload response"); + }; + + let mut block = block_response.block; + block_modifier(&mut block); + + let signed_block = block.sign( + &self.validator_keypairs[proposer_index].sk, + &block_response.state.fork(), + block_response.state.genesis_validators_root(), + &self.spec, + ); + + (signed_block, block_response.state) } /// Returns a newly created block, signed by the proposer for the given slot. 
diff --git a/beacon_node/builder_client/src/lib.rs b/beacon_node/builder_client/src/lib.rs index 0c3fdca907..2c83e34755 100644 --- a/beacon_node/builder_client/src/lib.rs +++ b/beacon_node/builder_client/src/lib.rs @@ -8,7 +8,7 @@ use eth2::types::{ use eth2::types::{FullPayloadContents, SignedBlindedBeaconBlock}; use eth2::{ CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, JSON_CONTENT_TYPE_HEADER, - SSZ_CONTENT_TYPE_HEADER, StatusCode, ok_or_error, + SSZ_CONTENT_TYPE_HEADER, StatusCode, ok_or_error, success_or_error, }; use reqwest::header::{ACCEPT, HeaderMap, HeaderValue}; use reqwest::{IntoUrl, Response}; @@ -249,7 +249,7 @@ impl BuilderHttpClient { .send() .await .map_err(Error::from)?; - ok_or_error(response).await + success_or_error(response).await } async fn post_with_raw_response( @@ -270,7 +270,7 @@ impl BuilderHttpClient { .send() .await .map_err(Error::from)?; - ok_or_error(response).await + success_or_error(response).await } /// `POST /eth/v1/builder/validators` diff --git a/beacon_node/execution_layer/src/lib.rs b/beacon_node/execution_layer/src/lib.rs index 5b48b81aa6..401646f367 100644 --- a/beacon_node/execution_layer/src/lib.rs +++ b/beacon_node/execution_layer/src/lib.rs @@ -2032,7 +2032,9 @@ impl ExecutionLayer { relay_response_ms = duration.as_millis(), ?block_root, "Successfully submitted blinded block to the builder" - ) + ); + + Ok(()) } Err(e) => { metrics::inc_counter_vec( @@ -2045,11 +2047,10 @@ impl ExecutionLayer { relay_response_ms = duration.as_millis(), ?block_root, "Failed to submit blinded block to the builder" - ) + ); + Err(e) } } - - Ok(()) } else { Err(Error::NoPayloadBuilder) } diff --git a/beacon_node/execution_layer/src/test_utils/mock_builder.rs b/beacon_node/execution_layer/src/test_utils/mock_builder.rs index 516662b1d6..6b63881d85 100644 --- a/beacon_node/execution_layer/src/test_utils/mock_builder.rs +++ b/beacon_node/execution_layer/src/test_utils/mock_builder.rs @@ -3,8 +3,8 @@ use crate::{Config, ExecutionLayer, 
PayloadAttributes, PayloadParameters}; use bytes::Bytes; use eth2::types::PublishBlockRequest; use eth2::types::{ - BlobsBundle, BlockId, BroadcastValidation, EventKind, EventTopic, FullPayloadContents, - ProposerData, StateId, ValidatorId, + BlobsBundle, BlockId, BroadcastValidation, EndpointVersion, EventKind, EventTopic, + FullPayloadContents, ProposerData, StateId, ValidatorId, }; use eth2::{ BeaconNodeHttpClient, CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER, @@ -332,6 +332,10 @@ pub struct MockBuilder { payload_id_cache: Arc>>, /// If set to `true`, sets the bid returned by `get_header` to Uint256::MAX max_bid: bool, + /// Broadcast the full block with payload to the attached beacon node (simulating the relay). + /// + /// Turning this off is useful for testing. + broadcast_to_bn: bool, /// A cache that stores the proposers index for a given epoch proposers_cache: Arc>>>, } @@ -340,6 +344,9 @@ impl MockBuilder { pub fn new_for_testing( mock_el_url: SensitiveUrl, beacon_url: SensitiveUrl, + validate_pubkey: bool, + apply_operations: bool, + broadcast_to_bn: bool, spec: Arc, executor: TaskExecutor, ) -> (Self, (SocketAddr, impl Future)) { @@ -357,12 +364,15 @@ impl MockBuilder { let el = ExecutionLayer::from_config(config, executor.clone()).unwrap(); + let max_bid = false; + let builder = MockBuilder::new( el, BeaconNodeHttpClient::new(beacon_url, Timeouts::set_all(Duration::from_secs(1))), - true, - true, - false, + validate_pubkey, + apply_operations, + broadcast_to_bn, + max_bid, spec, None, ); @@ -378,6 +388,7 @@ impl MockBuilder { beacon_client: BeaconNodeHttpClient, validate_pubkey: bool, apply_operations: bool, + broadcast_to_bn: bool, max_bid: bool, spec: Arc, sk: Option<&[u8]>, @@ -407,6 +418,7 @@ impl MockBuilder { proposers_cache: Arc::new(RwLock::new(HashMap::new())), apply_operations, max_bid, + broadcast_to_bn, genesis_time: None, } } @@ -486,14 +498,20 @@ impl MockBuilder { 
block.message.body.execution_payload.tree_hash_root() } }; + let block_hash = block + .message() + .body() + .execution_payload() + .unwrap() + .block_hash(); info!( - block_hash = %root, + execution_payload_root = %root, + ?block_hash, "Submitting blinded beacon block to builder" ); - let payload = self - .el - .get_payload_by_root(&root) - .ok_or_else(|| "missing payload for tx root".to_string())?; + let payload = self.el.get_payload_by_root(&root).ok_or_else(|| { + format!("missing payload for root: {root:?}, block_hash: {block_hash:?}",) + })?; let (payload, blobs) = payload.deconstruct(); let full_block = block @@ -502,16 +520,28 @@ impl MockBuilder { debug!( txs_count = payload.transactions().len(), blob_count = blobs.as_ref().map(|b| b.commitments.len()), - "Got full payload, sending to local beacon node for propagation" + "Got full payload" ); - let publish_block_request = PublishBlockRequest::new( - Arc::new(full_block), - blobs.clone().map(|b| (b.proofs, b.blobs)), - ); - self.beacon_client - .post_beacon_blocks_v2(&publish_block_request, Some(BroadcastValidation::Gossip)) - .await - .map_err(|e| format!("Failed to post blinded block {:?}", e))?; + if self.broadcast_to_bn { + debug!( + block_hash = ?payload.block_hash(), + "Broadcasting builder block to BN" + ); + let publish_block_request = PublishBlockRequest::new( + Arc::new(full_block), + blobs.clone().map(|b| (b.proofs, b.blobs)), + ); + self.beacon_client + .post_beacon_blocks_v2( + &publish_block_request, + Some(BroadcastValidation::ConsensusAndEquivocation), + ) + .await + .map_err(|e| { + // XXX: this should really be a 400 but warp makes that annoyingly difficult + format!("Failed to post blinded block {e:?}") + })?; + } Ok(FullPayloadContents::new(payload, blobs)) } @@ -542,16 +572,29 @@ impl MockBuilder { info!("Got payload params"); let fork = self.fork_name_at_slot(slot); + let payload_response_type = self .el - .get_full_payload_caching(PayloadParameters { - parent_hash: 
payload_parameters.parent_hash, - parent_gas_limit: payload_parameters.parent_gas_limit, - proposer_gas_limit: payload_parameters.proposer_gas_limit, - payload_attributes: &payload_parameters.payload_attributes, - forkchoice_update_params: &payload_parameters.forkchoice_update_params, - current_fork: payload_parameters.current_fork, - }) + .get_full_payload_with( + PayloadParameters { + parent_hash: payload_parameters.parent_hash, + parent_gas_limit: payload_parameters.parent_gas_limit, + proposer_gas_limit: payload_parameters.proposer_gas_limit, + payload_attributes: &payload_parameters.payload_attributes, + forkchoice_update_params: &payload_parameters.forkchoice_update_params, + current_fork: payload_parameters.current_fork, + }, + // If apply_operations is set, do NOT cache the payload at this point, we are about + // to mutate it and it would be incorrect to cache the unmutated payload. + // + // This is a flaw in apply_operations generally, if you want the mock builder to + // actually return payloads then this option should be turned off. 
+ if self.apply_operations { + |_, _| None + } else { + ExecutionLayer::cache_payload + }, + ) .await .map_err(|e| format!("couldn't get payload {:?}", e))?; @@ -958,11 +1001,21 @@ pub fn serve( let inner_ctx = builder.clone(); let ctx_filter = warp::any().map(move || inner_ctx.clone()); - let prefix = warp::path("eth") + let prefix_v1 = warp::path("eth") .and(warp::path("v1")) .and(warp::path("builder")); - let validators = prefix + let prefix_either = warp::path("eth") + .and( + warp::path::param::().or_else(|_| async move { + Err(warp::reject::custom(Custom( + "Invalid EndpointVersion".to_string(), + ))) + }), + ) + .and(warp::path("builder")); + + let validators = prefix_v1 .and(warp::path("validators")) .and(warp::body::json()) .and(warp::path::end()) @@ -974,61 +1027,89 @@ pub fn serve( .register_validators(registrations) .await .map_err(|e| warp::reject::custom(Custom(e)))?; - Ok::<_, Rejection>(warp::reply()) - }, - ) - .boxed(); - - let blinded_block_ssz = prefix - .and(warp::path("blinded_blocks")) - .and(warp::body::bytes()) - .and(warp::header::header::(CONSENSUS_VERSION_HEADER)) - .and(warp::path::end()) - .and(ctx_filter.clone()) - .and_then( - |block_bytes: Bytes, fork_name: ForkName, builder: MockBuilder| async move { - let block = - SignedBlindedBeaconBlock::::from_ssz_bytes_by_fork(&block_bytes, fork_name) - .map_err(|e| warp::reject::custom(Custom(format!("{:?}", e))))?; - let payload = builder - .submit_blinded_block(block) - .await - .map_err(|e| warp::reject::custom(Custom(e)))?; - - Ok::<_, warp::reject::Rejection>( - warp::http::Response::builder() - .status(200) - .body(payload.as_ssz_bytes()) - .map(add_ssz_content_type_header) - .map(|res| add_consensus_version_header(res, fork_name)) - .unwrap(), - ) + Ok::<_, Rejection>(warp::reply().into_response()) }, ); - let blinded_block = - prefix + let blinded_block_ssz = + prefix_either .and(warp::path("blinded_blocks")) - .and(warp::body::json()) + .and(warp::body::bytes()) 
.and(warp::header::header::(CONSENSUS_VERSION_HEADER)) .and(warp::path::end()) .and(ctx_filter.clone()) .and_then( - |block: SignedBlindedBeaconBlock, + |endpoint_version, + block_bytes: Bytes, fork_name: ForkName, builder: MockBuilder| async move { + if endpoint_version != EndpointVersion(1) + && endpoint_version != EndpointVersion(2) + { + return Err(warp::reject::custom(Custom(format!( + "Unsupported version: {endpoint_version}" + )))); + } + let block = SignedBlindedBeaconBlock::::from_ssz_bytes_by_fork( + &block_bytes, + fork_name, + ) + .map_err(|e| warp::reject::custom(Custom(format!("{:?}", e))))?; let payload = builder .submit_blinded_block(block) .await .map_err(|e| warp::reject::custom(Custom(e)))?; - let resp: ForkVersionedResponse<_> = ForkVersionedResponse { - version: fork_name, - metadata: Default::default(), - data: payload, - }; - let json_payload = serde_json::to_string(&resp) - .map_err(|_| reject("coudn't serialize response"))?; + if endpoint_version == EndpointVersion(1) { + Ok::<_, warp::reject::Rejection>( + warp::http::Response::builder() + .status(200) + .body(payload.as_ssz_bytes()) + .map(add_ssz_content_type_header) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap(), + ) + } else { + Ok(warp::http::Response::builder() + .status(202) + .body(&[] as &'static [u8]) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap()) + } + }, + ); + + let blinded_block = prefix_either + .and(warp::path("blinded_blocks")) + .and(warp::body::json()) + .and(warp::header::header::(CONSENSUS_VERSION_HEADER)) + .and(warp::path::end()) + .and(ctx_filter.clone()) + .and_then( + |endpoint_version, + block: SignedBlindedBeaconBlock, + fork_name: ForkName, + builder: MockBuilder| async move { + if endpoint_version != EndpointVersion(1) && endpoint_version != EndpointVersion(2) + { + return Err(warp::reject::custom(Custom(format!( + "Unsupported version: {endpoint_version}" + )))); + } + let payload = builder + 
.submit_blinded_block(block) + .await + .map_err(|e| warp::reject::custom(Custom(e)))?; + let resp: ForkVersionedResponse<_> = ForkVersionedResponse { + version: fork_name, + metadata: Default::default(), + data: payload, + }; + + let json_payload = serde_json::to_string(&resp) + .map_err(|_| reject("coudn't serialize response"))?; + + if endpoint_version == EndpointVersion(1) { Ok::<_, warp::reject::Rejection>( warp::http::Response::builder() .status(200) @@ -1036,16 +1117,24 @@ pub fn serve( serde_json::to_string(&json_payload) .map_err(|_| reject("invalid JSON"))?, ) + .map(|res| add_consensus_version_header(res, fork_name)) .unwrap(), ) - }, - ); + } else { + Ok(warp::http::Response::builder() + .status(202) + .body("".to_string()) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap()) + } + }, + ); - let status = prefix + let status = prefix_v1 .and(warp::path("status")) - .then(|| async { warp::reply() }); + .then(|| async { warp::reply().into_response() }); - let header = prefix + let header = prefix_v1 .and(warp::path("header")) .and(warp::path::param::().or_else(|_| async { Err(reject("Invalid slot")) })) .and( diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 515c262b19..bfe0bd4d38 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -1640,16 +1640,27 @@ pub fn serve( .and(warp::query::()) .and(warp::path::end()) .and(warp_utils::json::json()) + .and(consensus_version_header_filter) .and(task_spawner_filter.clone()) .and(chain_filter.clone()) .and(network_tx_filter.clone()) .then( move |validation_level: api_types::BroadcastValidationQuery, - blinded_block: Arc>, + blinded_block_json: serde_json::Value, + consensus_version: ForkName, task_spawner: TaskSpawner, chain: Arc>, network_tx: UnboundedSender>| { task_spawner.spawn_async_with_rejection(Priority::P0, async move { + let blinded_block = + SignedBlindedBeaconBlock::::context_deserialize( + &blinded_block_json, + 
consensus_version, + ) + .map(Arc::new) + .map_err(|e| { + warp_utils::reject::custom_bad_request(format!("invalid JSON: {e:?}")) + })?; publish_blocks::publish_blinded_block( blinded_block, chain, diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index 90f2fd2d95..28eed26276 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -60,8 +60,15 @@ type Mutator = BoxedMutator, MemoryStore>; impl InteractiveTester { pub async fn new(spec: Option, validator_count: usize) -> Self { - Self::new_with_initializer_and_mutator(spec, validator_count, None, None, Config::default()) - .await + Self::new_with_initializer_and_mutator( + spec, + validator_count, + None, + None, + Config::default(), + true, + ) + .await } pub async fn new_with_initializer_and_mutator( @@ -70,6 +77,7 @@ impl InteractiveTester { initializer: Option>, mutator: Option>, config: Config, + use_mock_builder: bool, ) -> Self { let mut harness_builder = BeaconChainHarness::builder(E::default()) .spec_or_default(spec.map(Arc::new)) @@ -91,7 +99,7 @@ impl InteractiveTester { harness_builder = harness_builder.initial_mutator(mutator); } - let harness = harness_builder.build(); + let mut harness = harness_builder.build(); let ApiServer { ctx, @@ -103,6 +111,40 @@ impl InteractiveTester { tokio::spawn(server); + // Late-initialize the mock builder now that the mock execution node and beacon API ports + // have been allocated. + let beacon_api_ip = listening_socket.ip(); + let beacon_api_port = listening_socket.port(); + let beacon_url = + SensitiveUrl::parse(format!("http://{beacon_api_ip}:{beacon_api_port}").as_str()) + .unwrap(); + + // We disable apply_operations because it breaks the mock builder's ability to return + // payloads. + let apply_operations = false; + + // We disable strict registration checks too, because it makes HTTP tests less fiddly to + // write.
+ let strict_registrations = false; + + // Broadcast to the BN only if Fulu is scheduled. In the broadcast validation tests we want + // to infer things from the builder return code, and pre-Fulu it's simpler to let the BN + // handle broadcast and return detailed codes. Post-Fulu the builder doesn't return the + // block at all, so we *need* the builder to do the broadcast and return a 400 if the block + // is invalid. + let broadcast_to_bn = ctx.chain.as_ref().unwrap().spec.is_fulu_scheduled(); + + if use_mock_builder { + let mock_builder_server = harness.set_mock_builder( + beacon_url.clone(), + strict_registrations, + apply_operations, + broadcast_to_bn, + ); + + tokio::spawn(mock_builder_server); + } + // Override the default timeout to 2s to timeouts on CI, as CI seems to require longer // to process. The 1s timeouts for other tasks have been working for a long time, so we'll // keep it as it is, as it may help identify a performance regression. @@ -110,15 +152,7 @@ impl InteractiveTester { default: Duration::from_secs(2), ..Timeouts::set_all(Duration::from_secs(1)) }; - let client = BeaconNodeHttpClient::new( - SensitiveUrl::parse(&format!( - "http://{}:{}", - listening_socket.ip(), - listening_socket.port() - )) - .unwrap(), - timeouts, - ); + let client = BeaconNodeHttpClient::new(beacon_url.clone(), timeouts); Self { ctx, diff --git a/beacon_node/http_api/tests/broadcast_validation_tests.rs b/beacon_node/http_api/tests/broadcast_validation_tests.rs index c125ae035b..d9ddbf9892 100644 --- a/beacon_node/http_api/tests/broadcast_validation_tests.rs +++ b/beacon_node/http_api/tests/broadcast_validation_tests.rs @@ -1,9 +1,9 @@ use beacon_chain::test_utils::test_spec; use beacon_chain::{ - GossipVerifiedBlock, IntoGossipVerifiedBlock, + GossipVerifiedBlock, IntoGossipVerifiedBlock, WhenSlotSkipped, test_utils::{AttestationStrategy, BlockStrategy}, }; -use eth2::reqwest::StatusCode; +use eth2::reqwest::{Response, StatusCode}; use 
eth2::types::{BroadcastValidation, PublishBlockRequest}; use http_api::test_utils::InteractiveTester; use http_api::{Config, ProvenancedBlock, publish_blinded_block, publish_block, reconstruct_block}; @@ -74,7 +74,7 @@ pub async fn gossip_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -85,7 +85,13 @@ pub async fn gossip_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + let pre_finalized_block_root = Hash256::zero(); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); } /// This test checks that a block that is valid from a gossip perspective is accepted when using `broadcast_validation=gossip`. @@ -123,15 +129,11 @@ pub async fn gossip_partial_pass() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; - assert!(response.is_err()); - - let error_response = response.unwrap_err(); - - assert_eq!(error_response.status(), Some(StatusCode::ACCEPTED)); + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); } // This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=gossip`. 
@@ -164,7 +166,7 @@ pub async fn gossip_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), blobs), @@ -215,7 +217,7 @@ pub async fn gossip_full_pass_ssz() { let (block_contents_tuple, _) = tester.harness.make_block(state_a, slot_b).await; let block_contents = block_contents_tuple.into(); - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_contents, validation_level) .await; @@ -264,7 +266,7 @@ pub async fn consensus_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -274,7 +276,13 @@ pub async fn consensus_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + let pre_finalized_block_root = Hash256::zero(); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); } /// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus`. 
@@ -304,13 +312,17 @@ pub async fn consensus_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::ZERO; let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -320,7 +332,14 @@ pub async fn consensus_gossip() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, \ + local: {correct_state_root:?} }}", + Hash256::ZERO + ), + ); } /// This test checks that a block that is valid from both a gossip and consensus perspective, but nonetheless equivocates, is accepted when using `broadcast_validation=consensus`. 
@@ -424,7 +443,7 @@ pub async fn consensus_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), blobs), @@ -478,7 +497,7 @@ pub async fn equivocation_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -488,7 +507,13 @@ pub async fn equivocation_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + let pre_finalized_block_root = Hash256::zero(); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); } /// This test checks that a block that is valid from both a gossip and consensus perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. 
@@ -554,7 +579,7 @@ pub async fn equivocation_consensus_early_equivocation() { ); /* submit `block_b` which should induce equivocation */ - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block_b.clone(), blobs_b), @@ -597,14 +622,18 @@ pub async fn equivocation_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -614,7 +643,13 @@ pub async fn equivocation_gossip() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, local: {correct_state_root} }}", + Hash256::zero() + ), + ); } /// This test checks that a block that is valid from both a gossip and consensus perspective but @@ -725,7 +760,7 @@ pub async fn equivocation_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( 
&PublishBlockRequest::new(block.clone(), blobs), @@ -770,28 +805,43 @@ pub async fn blinded_gossip_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); *b.parent_root_mut() = Hash256::zero(); }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); - + let pre_finalized_block_root = Hash256::zero(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } -/// This test checks that a block that is valid from a gossip perspective is accepted when using `broadcast_validation=gossip`. +/// Process a blinded block that is invalid, but valid on gossip. 
+/// +/// Due to the checks conducted by the "relay" (mock-builder) when `broadcast_to_bn` is set (post +/// Fulu), we can't always assert that we get a 202 status for this block -- post Fulu the relay +/// detects it as invalid and the BN returns an error. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] pub async fn blinded_gossip_partial_pass() { /* this test targets gossip-level validation */ @@ -819,22 +869,27 @@ pub async fn blinded_gossip_partial_pass() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero() }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; - assert!(response.is_err()); - - let error_response = response.unwrap_err(); - - assert_eq!(error_response.status(), Some(StatusCode::ACCEPTED)); + if tester.harness.spec.is_fulu_scheduled() { + let error_response = response.unwrap_err(); + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); + } } // This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=gossip`. 
@@ -866,12 +921,13 @@ pub async fn blinded_gossip_full_pass() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::OK); assert!( tester .harness @@ -910,12 +966,13 @@ pub async fn blinded_gossip_full_pass_ssz() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2_ssz(&blinded_block, validation_level) .await; assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::OK); assert!( tester .harness @@ -933,7 +990,7 @@ pub async fn blinded_consensus_invalid() { // Validator count needs to be at least 32 or proposer boost gets set to 0 when computing // `validator_count // 32`. let validator_count = 64; - let num_initial: u64 = 31; + let num_initial: u64 = 256; let tester = InteractiveTester::::new(None, validator_count).await; // Create some chain depth. 
@@ -952,25 +1009,48 @@ pub async fn blinded_consensus_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let finalized_slot = chain_state_before + .finalized_checkpoint() + .epoch + .start_slot(E::slots_per_epoch()); + assert_ne!(finalized_slot, 0); + let pre_finalized_block_root = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .chain + .block_root_at_slot(finalized_slot - 1, WhenSlotSkipped::Prev) + .unwrap() + .unwrap(); + + let (blinded_block, _) = tester + .harness + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); - *b.parent_root_mut() = Hash256::zero(); + *b.parent_root_mut() = pre_finalized_block_root; }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } /// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus`. 
@@ -1000,23 +1080,44 @@ pub async fn blinded_consensus_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); + let state_a = tester.harness.get_current_state(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_blinded_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; + assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, \ + local: {correct_state_root} }}", + Hash256::ZERO + ), + ); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=consensus`. 
@@ -1049,7 +1150,7 @@ pub async fn blinded_consensus_full_pass() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; @@ -1073,7 +1174,7 @@ pub async fn blinded_equivocation_invalid() { // Validator count needs to be at least 32 or proposer boost gets set to 0 when computing // `validator_count // 32`. let validator_count = 64; - let num_initial: u64 = 31; + let num_initial: u64 = 256; let tester = InteractiveTester::::new(None, validator_count).await; // Create some chain depth. @@ -1092,25 +1193,47 @@ pub async fn blinded_equivocation_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let finalized_slot = chain_state_before + .finalized_checkpoint() + .epoch + .start_slot(E::slots_per_epoch()); + assert_ne!(finalized_slot, 0); + let pre_finalized_block_root = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .chain + .block_root_at_slot(finalized_slot - 1, WhenSlotSkipped::Prev) + .unwrap() + .unwrap(); + + let (blinded_block, _) = tester + .harness + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); - *b.parent_root_mut() = Hash256::zero(); + *b.parent_root_mut() = pre_finalized_block_root; }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: 
NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. @@ -1160,13 +1283,11 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { assert_ne!(block_a.state_root(), block_b.state_root()); /* submit `block_a` as valid */ - assert!( - tester - .client - .post_beacon_blinded_blocks_v2(&block_a, validation_level) - .await - .is_ok() - ); + tester + .client + .post_beacon_blinded_blocks_v2(&block_a, validation_level) + .await + .unwrap(); assert!( tester .harness @@ -1175,7 +1296,7 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { ); /* submit `block_b` which should induce equivocation */ - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&block_b, validation_level) .await; @@ -1183,8 +1304,15 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { let error_response: eth2::Error = response.err().unwrap(); - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Slashable".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error(error_response, "BAD_REQUEST: Slashable".to_string()); + } } 
/// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. @@ -1215,24 +1343,41 @@ pub async fn blinded_equivocation_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); let state_a = tester.harness.get_current_state(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_blinded_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; - assert!(response.is_err()); + assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, local: {correct_state_root} }}", + Hash256::zero() + ), + ); + } } /// This test checks that a block that is valid from both a 
gossip and @@ -1287,54 +1432,58 @@ pub async fn blinded_equivocation_consensus_late_equivocation() { ); assert_ne!(block_a.state_root(), block_b.state_root()); - let unblinded_block_a = reconstruct_block( - tester.harness.chain.clone(), - block_a.canonical_root(), - Arc::new(block_a), - ) - .await - .expect("failed to reconstruct block") - .expect("block expected"); + // From fulu builders never send back a full payload, hence further checks in this test + // are not possible + if !tester.harness.spec.is_fulu_scheduled() { + let unblinded_block_a = reconstruct_block( + tester.harness.chain.clone(), + block_a.canonical_root(), + Arc::new(block_a), + ) + .await + .expect("failed to reconstruct block") + .expect("block expected"); - let unblinded_block_b = reconstruct_block( - tester.harness.chain.clone(), - block_b.canonical_root(), - block_b.clone(), - ) - .await - .expect("failed to reconstruct block") - .expect("block expected"); + let unblinded_block_b = reconstruct_block( + tester.harness.chain.clone(), + block_b.canonical_root(), + block_b.clone(), + ) + .await + .expect("failed to reconstruct block") + .expect("block expected"); - let inner_block_a = match unblinded_block_a { - ProvenancedBlock::Local(a, _, _) => a, - ProvenancedBlock::Builder(a, _, _) => a, - }; - let inner_block_b = match unblinded_block_b { - ProvenancedBlock::Local(b, _, _) => b, - ProvenancedBlock::Builder(b, _, _) => b, - }; + let inner_block_a = match unblinded_block_a { + ProvenancedBlock::Local(a, _, _) => a, + ProvenancedBlock::Builder(a, _, _) => a, + }; + let inner_block_b = match unblinded_block_b { + ProvenancedBlock::Local(b, _, _) => b, + ProvenancedBlock::Builder(b, _, _) => b, + }; - let gossip_block_b = GossipVerifiedBlock::new(inner_block_b, &tester.harness.chain); - assert!(gossip_block_b.is_ok()); - let gossip_block_a = GossipVerifiedBlock::new(inner_block_a, &tester.harness.chain); - assert!(gossip_block_a.is_err()); + let gossip_block_b = 
GossipVerifiedBlock::new(inner_block_b, &tester.harness.chain); + assert!(gossip_block_b.is_ok()); + let gossip_block_a = GossipVerifiedBlock::new(inner_block_a, &tester.harness.chain); + assert!(gossip_block_a.is_err()); - let channel = tokio::sync::mpsc::unbounded_channel(); + let channel = tokio::sync::mpsc::unbounded_channel(); - let publication_result = publish_blinded_block( - block_b, - tester.harness.chain, - &channel.0, - validation_level, - StatusCode::ACCEPTED, - ) - .await; + let publication_result = publish_blinded_block( + block_b, + tester.harness.chain, + &channel.0, + validation_level, + StatusCode::ACCEPTED, + ) + .await; - assert!(publication_result.is_err()); + assert!(publication_result.is_err()); - let publication_error: Rejection = publication_result.unwrap_err(); + let publication_error: Rejection = publication_result.unwrap_err(); - assert!(publication_error.find::().is_some()); + assert!(publication_error.find::().is_some()); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective (and does not equivocate) is accepted when using `broadcast_validation=consensus_and_equivocation`. @@ -1368,7 +1517,7 @@ pub async fn blinded_equivocation_full_pass() { let state_a = tester.harness.get_current_state(); let (block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&block, validation_level) .await; @@ -1434,7 +1583,7 @@ pub async fn block_seen_on_gossip_without_blobs_or_columns() { ); // Post the block *and* blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some(blobs)), @@ -1522,7 +1671,7 @@ pub async fn block_seen_on_gossip_with_some_blobs_or_columns() { ); // Post the block *and* all blobs to the HTTP API. 
- let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some(blobs)), @@ -1597,7 +1746,7 @@ pub async fn blobs_or_columns_seen_on_gossip_without_block() { ); // Post the block *and* all blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some((kzg_proofs, blobs))), @@ -1672,7 +1821,7 @@ async fn blobs_or_columns_seen_on_gossip_without_block_and_no_http_blobs_or_colu ); // Post just the block to the HTTP API (blob lists are empty). - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new( @@ -1750,7 +1899,7 @@ async fn slashable_blobs_or_columns_seen_on_gossip_cause_failure() { ); // Post block A *and* all its blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block_a.clone(), Some((kzg_proofs_a, blobs_a))), @@ -1788,6 +1937,7 @@ pub async fn duplicate_block_status_code() { duplicate_block_status_code, ..Config::default() }, + true, ) .await; @@ -1812,7 +1962,7 @@ pub async fn duplicate_block_status_code() { // Post the block blobs to the HTTP API once. let block_request = PublishBlockRequest::new(block.clone(), Some((kzg_proofs, blobs))); - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_request, validation_level) .await; @@ -1827,7 +1977,7 @@ pub async fn duplicate_block_status_code() { ); // Post again. 
- let duplicate_response: Result<(), eth2::Error> = tester + let duplicate_response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_request, validation_level) .await; diff --git a/beacon_node/http_api/tests/fork_tests.rs b/beacon_node/http_api/tests/fork_tests.rs index 880e206777..62a3461276 100644 --- a/beacon_node/http_api/tests/fork_tests.rs +++ b/beacon_node/http_api/tests/fork_tests.rs @@ -425,6 +425,7 @@ async fn bls_to_execution_changes_update_all_around_capella_fork() { })), None, Default::default(), + true, ) .await; let harness = &tester.harness; diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index 1e55bfb7b3..1398d8c72f 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -73,6 +73,7 @@ async fn state_by_root_pruned_from_fork_choice() { })), None, Default::default(), + false, ) .await; @@ -429,6 +430,7 @@ pub async fn proposer_boost_re_org_test( ) })), Default::default(), + false, ) .await; let harness = &tester.harness; @@ -666,6 +668,7 @@ pub async fn proposer_boost_re_org_test( // Check the fork choice updates that were sent. 
let forkchoice_updates = forkchoice_updates.lock(); + let block_a_exec_hash = block_a .0 .message() diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 92abbd84c7..91f8666381 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -1,14 +1,16 @@ use beacon_chain::test_utils::RelativeSyncCommittee; use beacon_chain::{ BeaconChain, ChainConfig, StateSkipConfig, WhenSlotSkipped, - test_utils::{AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType}, + test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, + }, }; use eth2::{ BeaconNodeHttpClient, Error, Error::ServerMessage, StatusCode, Timeouts, mixin::{RequestAccept, ResponseForkName, ResponseOptional}, - reqwest::RequestBuilder, + reqwest::{RequestBuilder, Response}, types::{ BlockId as CoreBlockId, ForkChoiceNode, ProduceBlockV3Response, StateId as CoreStateId, *, }, @@ -113,15 +115,11 @@ impl ApiTester { Self::new_from_config(ApiTesterConfig::default()).await } - pub async fn new_with_hard_forks(altair: bool, bellatrix: bool) -> Self { - let mut config = ApiTesterConfig::default(); - // Set whether the chain has undergone each hard fork. - if altair { - config.spec.altair_fork_epoch = Some(Epoch::new(0)); - } - if bellatrix { - config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); - } + pub async fn new_with_hard_forks() -> Self { + let config = ApiTesterConfig { + spec: test_spec::(), + ..Default::default() + }; Self::new_from_config(config).await } @@ -291,7 +289,19 @@ impl ApiTester { let beacon_api_port = listening_socket.port(); let beacon_url = SensitiveUrl::parse(format!("http://127.0.0.1:{beacon_api_port}").as_str()).unwrap(); - let mock_builder_server = harness.set_mock_builder(beacon_url.clone()); + + // Be strict with validator registrations, but don't bother applying operations, that flag + // is only used by mock-builder tests. 
+ let strict_registrations = true; + let apply_operations = true; + let broadcast_to_bn = true; + + let mock_builder_server = harness.set_mock_builder( + beacon_url.clone(), + strict_registrations, + apply_operations, + broadcast_to_bn, + ); // Start the mock builder service prior to building the chain out. harness @@ -334,6 +344,7 @@ impl ApiTester { .deterministic_keypairs(VALIDATOR_COUNT) .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() + .mock_execution_layer() .build(), ); @@ -419,7 +430,7 @@ impl ApiTester { } pub async fn new_mev_tester() -> Self { - let tester = Self::new_with_hard_forks(true, true) + let tester = Self::new_with_hard_forks() .await .test_post_validator_register_validator() .await; @@ -1539,7 +1550,10 @@ impl ApiTester { pub async fn test_post_beacon_blocks_valid(mut self) -> Self { let next_block = self.next_block.clone(); - self.client.post_beacon_blocks(&next_block).await.unwrap(); + self.client + .post_beacon_blocks_v2(&next_block, None) + .await + .unwrap(); assert!( self.network_rx.network_recv.recv().await.is_some(), @@ -1553,7 +1567,7 @@ impl ApiTester { let next_block = &self.next_block; self.client - .post_beacon_blocks_ssz(next_block) + .post_beacon_blocks_v2_ssz(next_block, None) .await .unwrap(); @@ -1578,12 +1592,14 @@ impl ApiTester { .await .0; - assert!( - self.client - .post_beacon_blocks(&PublishBlockRequest::from(block)) - .await - .is_err() - ); + let response: Result = self + .client + .post_beacon_blocks_v2(&PublishBlockRequest::from(block), None) + .await; + + assert!(response.is_ok()); + + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); assert!( self.network_rx.network_recv.recv().await.is_some(), @@ -1606,13 +1622,13 @@ impl ApiTester { .await .0; - assert!( - self.client - .post_beacon_blocks_ssz(&PublishBlockRequest::from(block)) - .await - .is_err() - ); + let response: Result = self + .client + .post_beacon_blocks_v2(&PublishBlockRequest::from(block), None) + .await; + 
assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); assert!( self.network_rx.network_recv.recv().await.is_some(), "gossip valid blocks should be sent to network" @@ -1634,7 +1650,7 @@ impl ApiTester { assert!( self.client - .post_beacon_blocks(&block_contents) + .post_beacon_blocks_v2(&block_contents, None) .await .is_ok() ); @@ -1644,45 +1660,25 @@ impl ApiTester { // Test all the POST methods in sequence, they should all behave the same. let responses = vec![ - self.client - .post_beacon_blocks(&block_contents) - .await - .unwrap_err(), self.client .post_beacon_blocks_v2(&block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blocks_ssz(&block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blocks_v2_ssz(&block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blinded_blocks(&blinded_block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blinded_blocks_v2(&blinded_block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blinded_blocks_ssz(&blinded_block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blinded_blocks_v2_ssz(&blinded_block_contents, None) .await - .unwrap_err(), + .unwrap(), ]; for (i, response) in responses.into_iter().enumerate() { - assert_eq!( - response.status().unwrap(), - StatusCode::ACCEPTED, - "response {i}" - ); + assert_eq!(response.status(), StatusCode::ACCEPTED, "response {i}"); } self @@ -3405,7 +3401,7 @@ impl ApiTester { PublishBlockRequest::try_from(Arc::new(signed_block.clone())).unwrap(); self.client - .post_beacon_blocks(&signed_block_contents) + .post_beacon_blocks_v2(&signed_block_contents, None) .await .unwrap(); @@ -3470,7 +3466,7 @@ impl ApiTester { block_contents.sign(&sk, &fork, genesis_validators_root, &self.chain.spec); self.client - .post_beacon_blocks_ssz(&signed_block_contents) + .post_beacon_blocks_v2_ssz(&signed_block_contents, None) .await 
.unwrap(); @@ -3588,7 +3584,7 @@ impl ApiTester { block_contents.sign(&sk, &fork, genesis_validators_root, &self.chain.spec); self.client - .post_beacon_blocks_ssz(&signed_block_contents) + .post_beacon_blocks_v2_ssz(&signed_block_contents, None) .await .unwrap(); @@ -6394,7 +6390,7 @@ impl ApiTester { }); self.client - .post_beacon_blocks(&self.next_block) + .post_beacon_blocks_v2(&self.next_block, None) .await .unwrap(); @@ -6439,7 +6435,7 @@ impl ApiTester { self.harness.advance_slot(); self.client - .post_beacon_blocks(&self.reorg_block) + .post_beacon_blocks_v2(&self.reorg_block, None) .await .unwrap(); @@ -6661,7 +6657,7 @@ impl ApiTester { }); self.client - .post_beacon_blocks(&self.next_block) + .post_beacon_blocks_v2(&self.next_block, None) .await .unwrap(); @@ -7829,7 +7825,7 @@ async fn lighthouse_endpoints() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn optimistic_responses() { - ApiTester::new_with_hard_forks(true, true) + ApiTester::new_with_hard_forks() .await .test_check_optimistic_responses() .await; diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index a53e76402e..cb6d63fe91 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -840,7 +840,7 @@ impl NetworkBeaconProcessor { } Err(err) => { match err { - GossipBlobError::BlobParentUnknown { parent_root } => { + GossipBlobError::ParentUnknown { parent_root } => { debug!( action = "requesting parent", block_root = %root, diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index bbc38e31d6..3368569d59 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -498,7 +498,7 @@ impl BeaconNodeHttpClient { .post(url) .timeout(timeout.unwrap_or(self.timeouts.default)); let response = builder.json(body).send().await?; - ok_or_error(response).await + 
success_or_error(response).await } /// Generic POST function supporting arbitrary responses and timeouts. @@ -518,7 +518,7 @@ impl BeaconNodeHttpClient { .json(body) .send() .await?; - ok_or_error(response).await + success_or_error(response).await } /// Generic POST function that includes octet-stream content type header. @@ -535,7 +535,7 @@ impl BeaconNodeHttpClient { HeaderValue::from_static("application/octet-stream"), ); let response = builder.headers(headers).json(body).send().await?; - ok_or_error(response).await + success_or_error(response).await } /// Generic POST function supporting arbitrary responses and timeouts. @@ -560,7 +560,7 @@ impl BeaconNodeHttpClient { HeaderValue::from_static("application/octet-stream"), ); let response = builder.headers(headers).body(body).send().await?; - ok_or_error(response).await + success_or_error(response).await } /// `GET beacon/genesis` @@ -1257,16 +1257,17 @@ impl BeaconNodeHttpClient { &self, block_contents: &PublishBlockRequest, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version( - self.post_beacon_blocks_v2_path(validation_level)?, - block_contents, - Some(self.timeouts.proposal), - block_contents.signed_block().message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version( + self.post_beacon_blocks_v2_path(validation_level)?, + block_contents, + Some(self.timeouts.proposal), + block_contents.signed_block().message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blocks` @@ -1274,16 +1275,17 @@ impl BeaconNodeHttpClient { &self, block_contents: &PublishBlockRequest, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version_and_ssz_body( - self.post_beacon_blocks_v2_path(validation_level)?, - block_contents.as_ssz_bytes(), - Some(self.timeouts.proposal), - block_contents.signed_block().message().body().fork_name(), - ) - .await?; + ) -> 
Result { + let response = self + .post_generic_with_consensus_version_and_ssz_body( + self.post_beacon_blocks_v2_path(validation_level)?, + block_contents.as_ssz_bytes(), + Some(self.timeouts.proposal), + block_contents.signed_block().message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blinded_blocks` @@ -1291,16 +1293,17 @@ impl BeaconNodeHttpClient { &self, signed_block: &SignedBlindedBeaconBlock, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version( - self.post_beacon_blinded_blocks_v2_path(validation_level)?, - signed_block, - Some(self.timeouts.proposal), - signed_block.message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version( + self.post_beacon_blinded_blocks_v2_path(validation_level)?, + signed_block, + Some(self.timeouts.proposal), + signed_block.message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blinded_blocks` @@ -1308,16 +1311,17 @@ impl BeaconNodeHttpClient { &self, signed_block: &SignedBlindedBeaconBlock, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version_and_ssz_body( - self.post_beacon_blinded_blocks_v2_path(validation_level)?, - signed_block.as_ssz_bytes(), - Some(self.timeouts.proposal), - signed_block.message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version_and_ssz_body( + self.post_beacon_blinded_blocks_v2_path(validation_level)?, + signed_block.as_ssz_bytes(), + Some(self.timeouts.proposal), + signed_block.message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// Path for `v2/beacon/blocks` @@ -2903,3 +2907,20 @@ pub async fn ok_or_error(response: Response) -> Result { Err(Error::StatusCode(status)) } } + +/// Returns `Ok(response)` if the response is a success (2xx) response. Otherwise, creates an +/// appropriate error message. 
+pub async fn success_or_error(response: Response) -> Result { + let status = response.status(); + + if status.is_success() { + Ok(response) + } else if let Ok(message) = response.json().await { + match message { + ResponseError::Message(message) => Err(Error::ServerMessage(message)), + ResponseError::Indexed(indexed) => Err(Error::ServerIndexedMessage(indexed)), + } + } else { + Err(Error::StatusCode(status)) + } +} diff --git a/validator_client/validator_services/src/block_service.rs b/validator_client/validator_services/src/block_service.rs index 834df67e8a..c111b1f22e 100644 --- a/validator_client/validator_services/src/block_service.rs +++ b/validator_client/validator_services/src/block_service.rs @@ -497,6 +497,7 @@ impl BlockService { beacon_node .post_beacon_blocks_v2_ssz(signed_block, None) .await + .map(|_| ()) .or_else(|e| { handle_block_post_error(e, signed_block.signed_block().message().slot()) })? @@ -506,10 +507,12 @@ impl BlockService { &validator_metrics::BLOCK_SERVICE_TIMES, &[validator_metrics::BLINDED_BEACON_BLOCK_HTTP_POST], ); + beacon_node .post_beacon_blinded_blocks_v2_ssz(signed_block, None) .await - .or_else(|e| handle_block_post_error(e, signed_block.message().slot()))? + .map(|_| ()) + .or_else(|e| handle_block_post_error(e, signed_block.message().slot()))?; } } Ok::<_, BlockError>(()) From caa1df6fc381c07b42c8cce09dd00b1309801286 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Wed, 10 Sep 2025 05:29:56 -0700 Subject: [PATCH 23/81] Skip column gossip verification logic during block production (#7973) #7950 Skip column gossip verification logic during block production as its redundant and potentially computationally expensive. 
Co-Authored-By: Eitan Seri- Levi Co-Authored-By: Eitan Seri-Levi Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- .../src/data_column_verification.rs | 54 +++++++++++++--- beacon_node/http_api/src/publish_blocks.rs | 63 +++++-------------- .../tests/broadcast_validation_tests.rs | 53 ++++++++++------ 3 files changed, 93 insertions(+), 77 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index bc7778cc63..608e003a22 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -215,6 +215,40 @@ impl GossipVerifiedDataColumn ) } + /// Create a `GossipVerifiedDataColumn` from `DataColumnSidecar` for block production ONLY. + /// When publishing a block constructed locally, the EL will have already verified the cell proofs. + /// When publishing a block constructed externally, there will be no columns here. + pub fn new_for_block_publishing( + column_sidecar: Arc>, + chain: &BeaconChain, + ) -> Result { + verify_data_column_sidecar(&column_sidecar)?; + + // Check if the data column is already in the DA checker cache. This happens when data columns + // are made available through the `engine_getBlobs` method. If it exists in the cache, we know + // it has already passed the gossip checks, even though this particular instance hasn't been + // seen / published on the gossip network yet (passed the `verify_is_unknown_sidecar` check above). + // In this case, we should accept it for gossip propagation. + verify_is_unknown_sidecar(chain, &column_sidecar)?; + + if chain + .data_availability_checker + .is_data_column_cached(&column_sidecar.block_root(), &column_sidecar) + { + // Observe this data column so we don't process it again. 
+ if O::observe() { + observe_gossip_data_column(&column_sidecar, chain)?; + } + return Err(GossipDataColumnError::PriorKnownUnpublished); + } + + Ok(Self { + block_root: column_sidecar.block_root(), + data_column: KzgVerifiedDataColumn::from_execution_verified(column_sidecar), + _phantom: Default::default(), + }) + } + /// Create a `GossipVerifiedDataColumn` from `DataColumnSidecar` for testing ONLY. pub fn __new_for_testing(column_sidecar: Arc>) -> Self { Self { @@ -447,12 +481,12 @@ pub fn validate_data_column_sidecar_for_gossip( Ok(()) } -// Verify that this is the first column sidecar received for the tuple: -// (block_header.slot, block_header.proposer_index, column_sidecar.index) -fn verify_is_first_sidecar( +/// Verify that `column_sidecar` is not yet known, i.e. this is the first time `column_sidecar` has been received for the tuple: +/// `(block_header.slot, block_header.proposer_index, column_sidecar.index)` +fn verify_is_unknown_sidecar( chain: &BeaconChain, - data_column: &DataColumnSidecar, + column_sidecar: &DataColumnSidecar, ) -> Result<(), GossipDataColumnError> { if chain .observed_column_sidecars .read() - .proposer_is_known(data_column) + .proposer_is_known(column_sidecar) .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? 
{ return Err(GossipDataColumnError::PriorKnown { - proposer: data_column.block_proposer_index(), - slot: data_column.slot(), - index: data_column.index, + proposer: column_sidecar.block_proposer_index(), + slot: column_sidecar.slot(), + index: column_sidecar.index, }); } Ok(()) diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs index f797e3f300..b6411167d9 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -3,7 +3,7 @@ use std::future::Future; use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob}; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; -use beacon_chain::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn}; +use beacon_chain::data_column_verification::GossipVerifiedDataColumn; use beacon_chain::validator_monitor::{get_block_delay_ms, timestamp_now}; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainError, BeaconChainTypes, BlockError, @@ -216,7 +216,7 @@ pub async fn publish_block>( } } - if gossip_verified_columns.iter().map(Option::is_some).count() > 0 { + if !gossip_verified_columns.is_empty() { if let Some(data_column_publishing_delay) = data_column_publishing_delay_for_testing { // Subtract block publishing delay if it is also used. // Note: if `data_column_publishing_delay` is less than `block_publishing_delay`, it @@ -240,7 +240,6 @@ pub async fn publish_block>( let sampling_columns_indices = chain.sampling_columns_for_epoch(epoch); let sampling_columns = gossip_verified_columns .into_iter() - .flatten() .filter(|data_column| sampling_columns_indices.contains(&data_column.index())) .collect::>(); @@ -348,7 +347,7 @@ pub async fn publish_block>( type BuildDataSidecarTaskResult = Result< ( Vec>>, - Vec>>, + Vec>, ), Rejection, >; @@ -382,7 +381,7 @@ fn spawn_build_data_sidecar_task( } else { // Post PeerDAS: construct data columns. 
let gossip_verified_data_columns = - build_gossip_verified_data_columns(&chain, &block, blobs, kzg_proofs)?; + build_data_columns(&chain, &block, blobs, kzg_proofs)?; Ok((vec![], gossip_verified_data_columns)) } }, @@ -397,12 +396,16 @@ fn spawn_build_data_sidecar_task( }) } -fn build_gossip_verified_data_columns( +/// Build data columns as wrapped `GossipVerifiedDataColumn`s. +/// There is no need to actually perform gossip verification on columns that a block producer +/// is publishing. In the locally constructed case, cell proof verification happens in the EL. +/// In the externally constructed case, there wont be any columns here. +fn build_data_columns( chain: &BeaconChain, block: &SignedBeaconBlock>, blobs: BlobsList, kzg_cell_proofs: KzgProofs, -) -> Result>>, Rejection> { +) -> Result>, Rejection> { let slot = block.slot(); let data_column_sidecars = build_blob_data_column_sidecars(chain, block, blobs, kzg_cell_proofs).map_err(|e| { @@ -414,49 +417,12 @@ fn build_gossip_verified_data_columns( warp_utils::reject::custom_bad_request(format!("{e:?}")) })?; - let slot = block.slot(); let gossip_verified_data_columns = data_column_sidecars .into_iter() - .map(|data_column_sidecar| { - let column_index = data_column_sidecar.index; - let subnet = DataColumnSubnetId::from_column_index(column_index, &chain.spec); - let gossip_verified_column = - GossipVerifiedDataColumn::new(data_column_sidecar, subnet, chain); - - match gossip_verified_column { - Ok(blob) => Ok(Some(blob)), - Err(GossipDataColumnError::PriorKnown { proposer, .. }) => { - // Log the error but do not abort publication, we may need to publish the block - // or some of the other data columns if the block & data columns are only - // partially published by the other publisher. 
- debug!( - column_index, - %slot, - proposer, - "Data column for publication already known" - ); - Ok(None) - } - Err(GossipDataColumnError::PriorKnownUnpublished) => { - debug!( - column_index, - %slot, - "Data column for publication already known via the EL" - ); - Ok(None) - } - Err(e) => { - error!( - column_index, - %slot, - error = ?e, - "Data column for publication is gossip-invalid" - ); - Err(warp_utils::reject::custom_bad_request(format!("{e:?}"))) - } - } + .filter_map(|data_column_sidecar| { + GossipVerifiedDataColumn::new_for_block_publishing(data_column_sidecar, chain).ok() }) - .collect::, Rejection>>()?; + .collect::>(); Ok(gossip_verified_data_columns) } @@ -533,13 +499,12 @@ fn publish_blob_sidecars( fn publish_column_sidecars( sender_clone: &UnboundedSender>, - data_column_sidecars: &[Option>], + data_column_sidecars: &[GossipVerifiedDataColumn], chain: &BeaconChain, ) -> Result<(), BlockError> { let malicious_withhold_count = chain.config.malicious_withhold_count; let mut data_column_sidecars = data_column_sidecars .iter() - .flatten() .map(|d| d.clone_data_column()) .collect::>(); if malicious_withhold_count > 0 { diff --git a/beacon_node/http_api/tests/broadcast_validation_tests.rs b/beacon_node/http_api/tests/broadcast_validation_tests.rs index d9ddbf9892..7f02c2c0fd 100644 --- a/beacon_node/http_api/tests/broadcast_validation_tests.rs +++ b/beacon_node/http_api/tests/broadcast_validation_tests.rs @@ -85,13 +85,18 @@ pub async fn gossip_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the - // block. 
let pre_finalized_block_root = Hash256::zero(); - assert_server_message_error( - error_response, - format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), - ); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is valid from a gossip perspective is accepted when using `broadcast_validation=gossip`. @@ -276,13 +281,19 @@ pub async fn consensus_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the - // block. + let pre_finalized_block_root = Hash256::zero(); - assert_server_message_error( - error_response, - format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), - ); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus`. 
@@ -507,13 +518,19 @@ pub async fn equivocation_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the - // block. + let pre_finalized_block_root = Hash256::zero(); - assert_server_message_error( - error_response, - format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), - ); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is valid from both a gossip and consensus perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. From ee1b6bc81bb4b7dafb95a95eabcfcaf6b76cea04 Mon Sep 17 00:00:00 2001 From: Daniel Knopik <107140945+dknopik@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:59:24 +0200 Subject: [PATCH 24/81] Create `network_utils` crate (#7761) Anchor currently depends on `lighthouse_network` for a few types and utilities that live within. As we use our own libp2p behaviours, we actually do not use the core logic in that crate. This makes us transitively depend on a bunch of unneeded crates (even a whole separate libp2p if the versions mismatch!) Move things we require into it's own lightweight crate. 
Co-Authored-By: Daniel Knopik --- Cargo.lock | 38 +++++++++------ Cargo.toml | 4 +- beacon_node/Cargo.toml | 2 +- beacon_node/http_api/Cargo.toml | 1 + beacon_node/http_api/src/lib.rs | 3 +- beacon_node/http_api/tests/tests.rs | 3 +- beacon_node/http_metrics/Cargo.toml | 1 + beacon_node/http_metrics/src/metrics.rs | 2 +- beacon_node/lighthouse_network/Cargo.toml | 3 +- beacon_node/lighthouse_network/src/config.rs | 2 +- .../lighthouse_network/src/discovery/enr.rs | 4 +- .../lighthouse_network/src/discovery/mod.rs | 16 +++++-- beacon_node/lighthouse_network/src/lib.rs | 5 +- beacon_node/lighthouse_network/src/metrics.rs | 44 ------------------ .../src/peer_manager/mod.rs | 16 +++---- .../src/peer_manager/network_behaviour.rs | 5 +- .../src/peer_manager/peerdb.rs | 7 ++- .../lighthouse_network/src/service/mod.rs | 2 +- .../lighthouse_network/src/types/globals.rs | 5 +- .../lighthouse_network/tests/common.rs | 2 +- beacon_node/src/config.rs | 34 +++++++------- boot_node/Cargo.toml | 1 + boot_node/src/config.rs | 3 +- boot_node/src/server.rs | 3 +- common/network_utils/Cargo.toml | 17 +++++++ common/network_utils/src/discovery_metrics.rs | 46 +++++++++++++++++++ .../network_utils/src}/enr_ext.rs | 7 +-- common/network_utils/src/lib.rs | 4 ++ .../network_utils}/src/listen_addr.rs | 16 +++---- .../src/unused_port.rs} | 0 common/system_health/Cargo.toml | 2 + common/system_health/src/lib.rs | 37 ++++++--------- common/unused_port/Cargo.toml | 9 ---- lcli/Cargo.toml | 1 + lcli/src/generate_bootnode_enr.rs | 3 +- lighthouse/Cargo.toml | 2 +- lighthouse/tests/beacon_node.rs | 4 +- lighthouse/tests/boot_node.rs | 5 +- .../execution_engine_integration/Cargo.toml | 2 +- .../src/execution_engine.rs | 2 +- .../execution_engine_integration/src/geth.rs | 2 +- .../src/nethermind.rs | 2 +- 42 files changed, 198 insertions(+), 169 deletions(-) create mode 100644 common/network_utils/Cargo.toml create mode 100644 common/network_utils/src/discovery_metrics.rs rename 
{beacon_node/lighthouse_network/src/discovery => common/network_utils/src}/enr_ext.rs (98%) create mode 100644 common/network_utils/src/lib.rs rename {beacon_node/lighthouse_network => common/network_utils}/src/listen_addr.rs (86%) rename common/{unused_port/src/lib.rs => network_utils/src/unused_port.rs} (100%) delete mode 100644 common/unused_port/Cargo.toml diff --git a/Cargo.lock b/Cargo.lock index 96768211eb..88b5b7b57d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -936,6 +936,7 @@ dependencies = [ "hyper 1.6.0", "lighthouse_network", "monitoring_api", + "network_utils", "node_test_rig", "sensitive_url", "serde_json", @@ -945,7 +946,6 @@ dependencies = [ "task_executor", "tracing", "types", - "unused_port", ] [[package]] @@ -1205,6 +1205,7 @@ dependencies = [ "lighthouse_network", "log", "logging", + "network_utils", "serde", "tokio", "tracing", @@ -3314,6 +3315,7 @@ dependencies = [ "futures", "hex", "logging", + "network_utils", "reqwest 0.11.27", "sensitive_url", "serde_json", @@ -3321,7 +3323,6 @@ dependencies = [ "tempfile", "tokio", "types", - "unused_port", ] [[package]] @@ -4300,6 +4301,7 @@ dependencies = [ "lru", "metrics", "network", + "network_utils", "operation_pool", "parking_lot 0.12.3", "proto_array", @@ -4334,6 +4336,7 @@ dependencies = [ "logging", "malloc_utils", "metrics", + "network_utils", "reqwest 0.11.27", "serde", "slot_clock", @@ -5068,6 +5071,7 @@ dependencies = [ "lighthouse_version", "log", "malloc_utils", + "network_utils", "rayon", "serde", "serde_json", @@ -5583,6 +5587,7 @@ dependencies = [ "logging", "malloc_utils", "metrics", + "network_utils", "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", @@ -5599,7 +5604,6 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "types", - "unused_port", "validator_client", "validator_dir", "validator_manager", @@ -5635,6 +5639,7 @@ dependencies = [ "lru", "lru_cache", "metrics", + "network_utils", "parking_lot 0.12.3", "prometheus-client", "quickcheck", @@ -5650,14 
+5655,12 @@ dependencies = [ "superstruct", "task_executor", "tempfile", - "tiny-keccak", "tokio", "tokio-util", "tracing", "tracing-subscriber", "types", "unsigned-varint 0.8.0", - "unused_port", ] [[package]] @@ -6355,6 +6358,21 @@ dependencies = [ "types", ] +[[package]] +name = "network_utils" +version = "0.1.0" +dependencies = [ + "discv5", + "hex", + "libp2p-identity", + "lru_cache", + "metrics", + "multiaddr", + "parking_lot 0.12.3", + "serde", + "tiny-keccak", +] + [[package]] name = "nix" version = "0.24.3" @@ -9179,6 +9197,8 @@ name = "system_health" version = "0.1.0" dependencies = [ "lighthouse_network", + "metrics", + "network_utils", "parking_lot 0.12.3", "serde", "sysinfo", @@ -10022,14 +10042,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "unused_port" -version = "0.1.0" -dependencies = [ - "lru_cache", - "parking_lot 0.12.3", -] - [[package]] name = "url" version = "2.5.4" diff --git a/Cargo.toml b/Cargo.toml index ca5f7bc153..0b930b605d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "common/malloc_utils", "common/metrics", "common/monitoring_api", + "common/network_utils", "common/oneshot_broadcast", "common/pretty_reqwest_error", "common/sensitive_url", @@ -45,7 +46,6 @@ members = [ "common/target_check", "common/task_executor", "common/test_random_derive", - "common/unused_port", "common/validator_dir", "common/warp_utils", "common/workspace_members", @@ -194,6 +194,7 @@ mockall_double = "0.3" mockito = "1.5.0" monitoring_api = { path = "common/monitoring_api" } network = { path = "beacon_node/network" } +network_utils = { path = "common/network_utils" } node_test_rig = { path = "testing/node_test_rig" } num_cpus = "1" once_cell = "1.17.1" @@ -265,7 +266,6 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tree_hash = "0.10.0" tree_hash_derive = "0.10.0" 
types = { path = "consensus/types" } -unused_port = { path = "common/unused_port" } url = "2" uuid = { version = "0.8", features = ["serde", "v4"] } validator_client = { path = "validator_client" } diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index 456376e79b..dd7416af54 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -33,6 +33,7 @@ http_api = { workspace = true } hyper = { workspace = true } lighthouse_network = { workspace = true } monitoring_api = { workspace = true } +network_utils = { workspace = true } sensitive_url = { workspace = true } serde_json = { workspace = true } slasher = { workspace = true } @@ -41,7 +42,6 @@ strum = { workspace = true } task_executor = { workspace = true } tracing = { workspace = true } types = { workspace = true } -unused_port = { workspace = true } [dev-dependencies] node_test_rig = { path = "../testing/node_test_rig" } diff --git a/beacon_node/http_api/Cargo.toml b/beacon_node/http_api/Cargo.toml index 2061df3762..7dd0d0223f 100644 --- a/beacon_node/http_api/Cargo.toml +++ b/beacon_node/http_api/Cargo.toml @@ -26,6 +26,7 @@ logging = { workspace = true } lru = { workspace = true } metrics = { workspace = true } network = { workspace = true } +network_utils = { workspace = true } operation_pool = { workspace = true } parking_lot = { workspace = true } proto_array = { workspace = true } diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index bfe0bd4d38..5c6a9df739 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -54,10 +54,11 @@ use eth2::types::{ use eth2::{CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER}; use health_metrics::observe::Observe; use lighthouse_network::rpc::methods::MetaData; -use lighthouse_network::{Enr, EnrExt, NetworkGlobals, PeerId, PubsubMessage, types::SyncState}; +use lighthouse_network::{Enr, NetworkGlobals, PeerId, PubsubMessage, types::SyncState}; use 
lighthouse_version::version_with_platform; use logging::{SSELoggingComponents, crit}; use network::{NetworkMessage, NetworkSenders, ValidatorSubscriptionMessage}; +use network_utils::enr_ext::EnrExt; use operation_pool::ReceivedPreCapella; use parking_lot::RwLock; pub use publish_blocks::{ diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 91f8666381..2072fb9932 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -26,8 +26,9 @@ use http_api::{ BlockId, StateId, test_utils::{ApiServer, create_api_server}, }; -use lighthouse_network::{Enr, EnrExt, PeerId, types::SyncState}; +use lighthouse_network::{Enr, PeerId, types::SyncState}; use network::NetworkReceivers; +use network_utils::enr_ext::EnrExt; use operation_pool::attestation_storage::CheckpointKey; use proto_array::ExecutionStatus; use sensitive_url::SensitiveUrl; diff --git a/beacon_node/http_metrics/Cargo.toml b/beacon_node/http_metrics/Cargo.toml index e12053ac43..b74c04a4cb 100644 --- a/beacon_node/http_metrics/Cargo.toml +++ b/beacon_node/http_metrics/Cargo.toml @@ -13,6 +13,7 @@ lighthouse_version = { workspace = true } logging = { workspace = true } malloc_utils = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } serde = { workspace = true } slot_clock = { workspace = true } store = { workspace = true } diff --git a/beacon_node/http_metrics/src/metrics.rs b/beacon_node/http_metrics/src/metrics.rs index dbb0707a90..c19fa8fd3b 100644 --- a/beacon_node/http_metrics/src/metrics.rs +++ b/beacon_node/http_metrics/src/metrics.rs @@ -37,7 +37,7 @@ pub fn gather_prometheus_metrics( store::scrape_for_metrics(db_path, freezer_db_path); } - lighthouse_network::scrape_discovery_metrics(); + network_utils::discovery_metrics::scrape_discovery_metrics(); health_metrics::metrics::scrape_health_metrics(); diff --git a/beacon_node/lighthouse_network/Cargo.toml b/beacon_node/lighthouse_network/Cargo.toml 
index 9963cc0bc4..7e69f6770b 100644 --- a/beacon_node/lighthouse_network/Cargo.toml +++ b/beacon_node/lighthouse_network/Cargo.toml @@ -31,6 +31,7 @@ logging = { workspace = true } lru = { workspace = true } lru_cache = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } parking_lot = { workspace = true } prometheus-client = "0.23.0" rand = { workspace = true } @@ -43,14 +44,12 @@ ssz_types = { workspace = true } strum = { workspace = true } superstruct = { workspace = true } task_executor = { workspace = true } -tiny-keccak = "2" tokio = { workspace = true } tokio-util = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } types = { workspace = true } unsigned-varint = { version = "0.8", features = ["codec"] } -unused_port = { workspace = true } [dependencies.libp2p] version = "0.56" diff --git a/beacon_node/lighthouse_network/src/config.rs b/beacon_node/lighthouse_network/src/config.rs index 23d545798f..89c6c58d4f 100644 --- a/beacon_node/lighthouse_network/src/config.rs +++ b/beacon_node/lighthouse_network/src/config.rs @@ -1,4 +1,3 @@ -use crate::listen_addr::{ListenAddr, ListenAddress}; use crate::peer_manager::config::DEFAULT_TARGET_PEERS; use crate::rpc::config::{InboundRateLimiterConfig, OutboundRateLimiterConfig}; use crate::types::GossipKind; @@ -8,6 +7,7 @@ use directory::{ }; use libp2p::Multiaddr; use local_ip_address::local_ipv6; +use network_utils::listen_addr::{ListenAddr, ListenAddress}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::net::{Ipv4Addr, Ipv6Addr}; diff --git a/beacon_node/lighthouse_network/src/discovery/enr.rs b/beacon_node/lighthouse_network/src/discovery/enr.rs index bb3a32daf2..bb9ff299c5 100644 --- a/beacon_node/lighthouse_network/src/discovery/enr.rs +++ b/beacon_node/lighthouse_network/src/discovery/enr.rs @@ -3,13 +3,13 @@ pub use discv5::enr::CombinedKey; use super::ENR_FILENAME; -use super::enr_ext::CombinedKeyExt; -use 
super::enr_ext::{EnrExt, QUIC_ENR_KEY, QUIC6_ENR_KEY}; use crate::NetworkConfig; use crate::types::{Enr, EnrAttestationBitfield, EnrSyncCommitteeBitfield}; use alloy_rlp::bytes::Bytes; use libp2p::identity::Keypair; use lighthouse_version::{client_name, version}; +use network_utils::enr_ext::CombinedKeyExt; +use network_utils::enr_ext::{EnrExt, QUIC_ENR_KEY, QUIC6_ENR_KEY}; use ssz::{Decode, Encode}; use ssz_types::BitVector; use std::fs::File; diff --git a/beacon_node/lighthouse_network/src/discovery/mod.rs b/beacon_node/lighthouse_network/src/discovery/mod.rs index a245e830b9..49de62546d 100644 --- a/beacon_node/lighthouse_network/src/discovery/mod.rs +++ b/beacon_node/lighthouse_network/src/discovery/mod.rs @@ -4,7 +4,6 @@ //! queries and manages access to the discovery routing table. pub(crate) mod enr; -pub mod enr_ext; // Allow external use of the lighthouse ENR builder use crate::service::TARGET_SUBNET_PEERS; @@ -12,8 +11,8 @@ use crate::{ClearDialError, metrics}; use crate::{Enr, NetworkConfig, NetworkGlobals, Subnet, SubnetDiscovery}; use discv5::{Discv5, enr::NodeId}; pub use enr::{CombinedKey, Eth2Enr, build_enr, load_enr_from_disk, use_or_load_enr}; -pub use enr_ext::{CombinedKeyExt, EnrExt, peer_id_to_node_id}; pub use libp2p::identity::{Keypair, PublicKey}; +use network_utils::enr_ext::{CombinedKeyExt, EnrExt, peer_id_to_node_id}; use alloy_rlp::bytes::Bytes; use enr::{ATTESTATION_BITFIELD_ENR_KEY, ETH2_ENR_KEY, SYNC_COMMITTEE_BITFIELD_ENR_KEY}; @@ -33,6 +32,7 @@ pub use libp2p::{ }; use logging::crit; use lru::LruCache; +use network_utils::discovery_metrics; use ssz::Encode; use std::num::NonZeroUsize; use std::{ @@ -687,7 +687,10 @@ impl Discovery { min_ttl, retries, }); - metrics::set_gauge(&metrics::DISCOVERY_QUEUE, self.queued_queries.len() as i64); + metrics::set_gauge( + &discovery_metrics::DISCOVERY_QUEUE, + self.queued_queries.len() as i64, + ); } } @@ -722,7 +725,10 @@ impl Discovery { } } // Update the queue metric - 
metrics::set_gauge(&metrics::DISCOVERY_QUEUE, self.queued_queries.len() as i64); + metrics::set_gauge( + &discovery_metrics::DISCOVERY_QUEUE, + self.queued_queries.len() as i64, + ); processed } @@ -1233,7 +1239,7 @@ mod tests { let spec = Arc::new(ChainSpec::default()); let keypair = secp256k1::Keypair::generate(); let mut config = NetworkConfig::default(); - config.set_listening_addr(crate::ListenAddress::unused_v4_ports()); + config.set_listening_addr(network_utils::listen_addr::ListenAddress::unused_v4_ports()); let config = Arc::new(config); let enr_key: CombinedKey = CombinedKey::from_secp256k1(&keypair); let next_fork_digest = [0; 4]; diff --git a/beacon_node/lighthouse_network/src/lib.rs b/beacon_node/lighthouse_network/src/lib.rs index 5c4a458650..b6be9b5222 100644 --- a/beacon_node/lighthouse_network/src/lib.rs +++ b/beacon_node/lighthouse_network/src/lib.rs @@ -6,14 +6,12 @@ mod config; pub mod service; pub mod discovery; -pub mod listen_addr; pub mod metrics; pub mod peer_manager; pub mod rpc; pub mod types; use libp2p::swarm::DialError; -pub use listen_addr::*; use serde::{Deserialize, Deserializer, Serialize, Serializer, de}; use std::str::FromStr; @@ -107,13 +105,12 @@ pub use crate::types::{ pub use prometheus_client; pub use config::Config as NetworkConfig; -pub use discovery::{CombinedKeyExt, EnrExt, Eth2Enr}; +pub use discovery::Eth2Enr; pub use discv5; pub use gossipsub::{IdentTopic, MessageAcceptance, MessageId, Topic, TopicHash}; pub use libp2p; pub use libp2p::{Multiaddr, multiaddr}; pub use libp2p::{PeerId, Swarm, core::ConnectedPoint}; -pub use metrics::scrape_discovery_metrics; pub use peer_manager::{ ConnectionDirection, PeerConnectionStatus, PeerInfo, PeerManager, SyncInfo, SyncStatus, peerdb::PeerDB, diff --git a/beacon_node/lighthouse_network/src/metrics.rs b/beacon_node/lighthouse_network/src/metrics.rs index da986f2884..623d43a727 100644 --- a/beacon_node/lighthouse_network/src/metrics.rs +++ 
b/beacon_node/lighthouse_network/src/metrics.rs @@ -1,14 +1,6 @@ pub use metrics::*; use std::sync::LazyLock; -pub static NAT_OPEN: LazyLock> = LazyLock::new(|| { - try_create_int_gauge_vec( - "nat_open", - "An estimate indicating if the local node is reachable from external nodes", - &["protocol"], - ) -}); - pub static ADDRESS_UPDATE_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_counter( "libp2p_address_update_total", @@ -53,31 +45,6 @@ pub static PEER_DISCONNECT_EVENT_COUNT: LazyLock> = LazyLock: "Count of libp2p peer disconnect events", ) }); -pub static DISCOVERY_BYTES: LazyLock> = LazyLock::new(|| { - try_create_int_gauge_vec( - "discovery_bytes", - "The number of bytes sent and received in discovery", - &["direction"], - ) -}); -pub static DISCOVERY_QUEUE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "discovery_queue_size", - "The number of discovery queries awaiting execution", - ) -}); -pub static DISCOVERY_REQS: LazyLock> = LazyLock::new(|| { - try_create_float_gauge( - "discovery_requests", - "The number of unsolicited discovery requests per second", - ) -}); -pub static DISCOVERY_SESSIONS: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "discovery_sessions", - "The number of active discovery sessions with peers", - ) -}); pub static DISCOVERY_NO_USEFUL_ENRS: LazyLock> = LazyLock::new(|| { try_create_int_counter( "discovery_no_useful_enrs_found", @@ -219,14 +186,3 @@ pub static RESPONSE_IDLING: LazyLock> = LazyLock::new(|| { "The time our response remained idle in the response limiter", ) }); - -pub fn scrape_discovery_metrics() { - let metrics = - discv5::metrics::Metrics::from(discv5::Discv5::::raw_metrics()); - set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second); - set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64); - set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64); - set_gauge_vec(&DISCOVERY_BYTES, &["outbound"], metrics.bytes_sent as i64); - 
set_gauge_vec(&NAT_OPEN, &["discv5_ipv4"], metrics.ipv4_contactable as i64); - set_gauge_vec(&NAT_OPEN, &["discv5_ipv6"], metrics.ipv6_contactable as i64); -} diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index e7c6f69242..592fccdc74 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -1,7 +1,5 @@ //! Implementation of Lighthouse's peer management system. -use crate::discovery::enr_ext::EnrExt; -use crate::discovery::peer_id_to_node_id; use crate::rpc::{GoodbyeReason, MetaData, Protocol, RPCError, RpcErrorResponse}; use crate::service::TARGET_SUBNET_PEERS; use crate::{Gossipsub, NetworkGlobals, PeerId, Subnet, SubnetDiscovery, metrics}; @@ -26,6 +24,8 @@ pub mod peerdb; use crate::peer_manager::peerdb::client::ClientKind; use libp2p::multiaddr; +use network_utils::discovery_metrics; +use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; pub use peerdb::peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; use peerdb::score::{PeerAction, ReportSource}; pub use peerdb::sync_status::{SyncInfo, SyncStatus}; @@ -1111,7 +1111,7 @@ impl PeerManager { /// /// Protection criteria: /// - Outbound peers: don't prune if it would drop below target outbound peer count - /// - Data column sampling: ≤ MIN_SAMPLING_COLUMN_SUBNET_PEERS (2) peers per subnet + /// - Data column sampling: ≤ MIN_SAMPLING_COLUMN_SUBNET_PEERS (2) peers per subnet /// - Sync committees: ≤ MIN_SYNC_COMMITTEE_PEERS (2) peers per committee /// - Attestation subnets: protect peers on the scarcest attestation subnets /// @@ -1586,16 +1586,16 @@ impl PeerManager { // Set ipv4 nat_open metric flag if threshold of peercount is met, unset if below threshold if inbound_ipv4_peers_connected >= LIBP2P_NAT_OPEN_THRESHOLD { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv4"], 1); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, 
&["libp2p_ipv4"], 1); } else { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv4"], 0); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv4"], 0); } // Set ipv6 nat_open metric flag if threshold of peercount is met, unset if below threshold if inbound_ipv6_peers_connected >= LIBP2P_NAT_OPEN_THRESHOLD { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv6"], 1); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"], 1); } else { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv6"], 0); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"], 0); } // PEERS_CONNECTED @@ -2780,7 +2780,7 @@ mod tests { /// /// Create 6 peers with different sync statuses: /// Peer0: Behind - /// Peer1: Unknown + /// Peer1: Unknown /// Peer2: Synced /// Peer3: Advanced /// Peer4: Synced diff --git a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs index 43d9b90d8d..729dbd193b 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs @@ -12,11 +12,12 @@ use libp2p::swarm::behaviour::{ConnectionClosed, ConnectionEstablished, DialFail use libp2p::swarm::dial_opts::{DialOpts, PeerCondition}; use libp2p::swarm::dummy::ConnectionHandler; use libp2p::swarm::{ConnectionDenied, ConnectionId, NetworkBehaviour, ToSwarm}; -pub use metrics::{NAT_OPEN, set_gauge_vec}; +use metrics::set_gauge_vec; +use network_utils::discovery_metrics::NAT_OPEN; +use network_utils::enr_ext::EnrExt; use tracing::{debug, error, trace}; use types::EthSpec; -use crate::discovery::enr_ext::EnrExt; use crate::types::SyncState; use crate::{ClearDialError, metrics}; diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 083c3f00c2..0ccad8d042 100644 --- 
a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -1,10 +1,9 @@ +use crate::discovery::CombinedKey; use crate::discovery::enr::PEERDAS_CUSTODY_GROUP_COUNT_ENR_KEY; -use crate::discovery::{CombinedKey, peer_id_to_node_id}; -use crate::{ - Enr, EnrExt, Gossipsub, PeerId, SyncInfo, metrics, multiaddr::Multiaddr, types::Subnet, -}; +use crate::{Enr, Gossipsub, PeerId, SyncInfo, metrics, multiaddr::Multiaddr, types::Subnet}; use itertools::Itertools; use logging::crit; +use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; use peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; use score::{PeerAction, ReportSource, Score, ScoreState}; use std::net::IpAddr; diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index 9edb70555d..ea2c53a07f 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -1,5 +1,4 @@ use self::gossip_cache::GossipCache; -use crate::EnrExt; use crate::Eth2Enr; use crate::config::{GossipsubConfigParams, NetworkLoad, gossipsub_config}; use crate::discovery::{ @@ -33,6 +32,7 @@ use libp2p::swarm::{NetworkBehaviour, Swarm, SwarmEvent}; use libp2p::upnp::tokio::Behaviour as Upnp; use libp2p::{PeerId, SwarmBuilder, identify}; use logging::crit; +use network_utils::enr_ext::EnrExt; use std::num::{NonZeroU8, NonZeroUsize}; use std::path::PathBuf; use std::pin::Pin; diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index bcb4758386..b8c34f8392 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -3,7 +3,8 @@ use super::TopicConfig; use crate::peer_manager::peerdb::PeerDB; use crate::rpc::{MetaData, MetaDataV3}; use crate::types::{BackFillState, SyncState}; -use crate::{Client, Enr, EnrExt, 
GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use crate::{Client, Enr, GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use network_utils::enr_ext::EnrExt; use parking_lot::RwLock; use std::collections::HashSet; use std::sync::Arc; @@ -250,7 +251,7 @@ impl NetworkGlobals { config: Arc, spec: Arc, ) -> NetworkGlobals { - use crate::CombinedKeyExt; + use network_utils::enr_ext::CombinedKeyExt; let keypair = libp2p::identity::secp256k1::Keypair::generate(); let enr_key: discv5::enr::CombinedKey = discv5::enr::CombinedKey::from_secp256k1(&keypair); let enr = discv5::enr::Enr::builder().build(&enr_key).unwrap(); diff --git a/beacon_node/lighthouse_network/tests/common.rs b/beacon_node/lighthouse_network/tests/common.rs index 6b111cfdc1..8a3047692f 100644 --- a/beacon_node/lighthouse_network/tests/common.rs +++ b/beacon_node/lighthouse_network/tests/common.rs @@ -1,9 +1,9 @@ #![cfg(test)] use lighthouse_network::Enr; -use lighthouse_network::EnrExt; use lighthouse_network::Multiaddr; use lighthouse_network::service::Network as LibP2PService; use lighthouse_network::{NetworkConfig, NetworkEvent}; +use network_utils::enr_ext::EnrExt; use std::sync::Arc; use std::sync::Weak; use tokio::runtime::Runtime; diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 7e4b77e9aa..1b5f25b317 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -13,8 +13,8 @@ use directory::{DEFAULT_BEACON_NODE_DIR, DEFAULT_NETWORK_DIR, DEFAULT_ROOT_DIR}; use environment::RuntimeContext; use execution_layer::DEFAULT_JWT_FILE; use http_api::TlsConfig; -use lighthouse_network::ListenAddress; use lighthouse_network::{Enr, Multiaddr, NetworkConfig, PeerIdSerialized, multiaddr::Protocol}; +use network_utils::listen_addr::ListenAddress; use sensitive_url::SensitiveUrl; use std::collections::HashSet; use std::fmt::Debug; @@ -1011,7 +1011,7 @@ pub fn parse_listening_addresses(cli_args: &ArgMatches) -> Result Result Result Result Result Result> = LazyLock::new(|| { + 
try_create_int_gauge_vec( + "nat_open", + "An estimate indicating if the local node is reachable from external nodes", + &["protocol"], + ) +}); +pub static DISCOVERY_BYTES: LazyLock> = LazyLock::new(|| { + try_create_int_gauge_vec( + "discovery_bytes", + "The number of bytes sent and received in discovery", + &["direction"], + ) +}); +pub static DISCOVERY_QUEUE: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "discovery_queue_size", + "The number of discovery queries awaiting execution", + ) +}); +pub static DISCOVERY_REQS: LazyLock> = LazyLock::new(|| { + try_create_float_gauge( + "discovery_requests", + "The number of unsolicited discovery requests per second", + ) +}); +pub static DISCOVERY_SESSIONS: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "discovery_sessions", + "The number of active discovery sessions with peers", + ) +}); + +pub fn scrape_discovery_metrics() { + let metrics = + discv5::metrics::Metrics::from(discv5::Discv5::::raw_metrics()); + set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second); + set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64); + set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64); + set_gauge_vec(&DISCOVERY_BYTES, &["outbound"], metrics.bytes_sent as i64); + set_gauge_vec(&NAT_OPEN, &["discv5_ipv4"], metrics.ipv4_contactable as i64); + set_gauge_vec(&NAT_OPEN, &["discv5_ipv6"], metrics.ipv6_contactable as i64); +} diff --git a/beacon_node/lighthouse_network/src/discovery/enr_ext.rs b/common/network_utils/src/enr_ext.rs similarity index 98% rename from beacon_node/lighthouse_network/src/discovery/enr_ext.rs rename to common/network_utils/src/enr_ext.rs index 1d065ebf4a..627dd15559 100644 --- a/beacon_node/lighthouse_network/src/discovery/enr_ext.rs +++ b/common/network_utils/src/enr_ext.rs @@ -1,11 +1,12 @@ //! ENR extension trait to support libp2p integration. 
-use crate::{Enr, Multiaddr, PeerId}; use discv5::enr::{CombinedKey, CombinedPublicKey}; -use libp2p::core::multiaddr::Protocol; -use libp2p::identity::{KeyType, Keypair, PublicKey, ed25519, secp256k1}; +use libp2p_identity::{KeyType, Keypair, PublicKey, ed25519, secp256k1}; +use multiaddr::{Multiaddr, PeerId, Protocol}; use tiny_keccak::{Hasher, Keccak}; +type Enr = discv5::enr::Enr; + pub const QUIC_ENR_KEY: &str = "quic"; pub const QUIC6_ENR_KEY: &str = "quic6"; diff --git a/common/network_utils/src/lib.rs b/common/network_utils/src/lib.rs new file mode 100644 index 0000000000..c3d6ee1e0c --- /dev/null +++ b/common/network_utils/src/lib.rs @@ -0,0 +1,4 @@ +pub mod discovery_metrics; +pub mod enr_ext; +pub mod listen_addr; +pub mod unused_port; diff --git a/beacon_node/lighthouse_network/src/listen_addr.rs b/common/network_utils/src/listen_addr.rs similarity index 86% rename from beacon_node/lighthouse_network/src/listen_addr.rs rename to common/network_utils/src/listen_addr.rs index 85232c0b35..bdd94b3414 100644 --- a/beacon_node/lighthouse_network/src/listen_addr.rs +++ b/common/network_utils/src/listen_addr.rs @@ -1,6 +1,6 @@ use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; -use libp2p::{Multiaddr, multiaddr::Protocol}; +use multiaddr::{Multiaddr, Protocol}; use serde::{Deserialize, Serialize}; /// A listening address composed by an Ip, an UDP port and a TCP port. 
@@ -84,23 +84,21 @@ impl ListenAddress { .chain(v6_tcp_multiaddr) } - #[cfg(test)] pub fn unused_v4_ports() -> Self { ListenAddress::V4(ListenAddr { addr: Ipv4Addr::UNSPECIFIED, - disc_port: unused_port::unused_udp4_port().unwrap(), - quic_port: unused_port::unused_udp4_port().unwrap(), - tcp_port: unused_port::unused_tcp4_port().unwrap(), + disc_port: crate::unused_port::unused_udp4_port().unwrap(), + quic_port: crate::unused_port::unused_udp4_port().unwrap(), + tcp_port: crate::unused_port::unused_tcp4_port().unwrap(), }) } - #[cfg(test)] pub fn unused_v6_ports() -> Self { ListenAddress::V6(ListenAddr { addr: Ipv6Addr::UNSPECIFIED, - disc_port: unused_port::unused_udp6_port().unwrap(), - quic_port: unused_port::unused_udp6_port().unwrap(), - tcp_port: unused_port::unused_tcp6_port().unwrap(), + disc_port: crate::unused_port::unused_udp6_port().unwrap(), + quic_port: crate::unused_port::unused_udp6_port().unwrap(), + tcp_port: crate::unused_port::unused_tcp6_port().unwrap(), }) } } diff --git a/common/unused_port/src/lib.rs b/common/network_utils/src/unused_port.rs similarity index 100% rename from common/unused_port/src/lib.rs rename to common/network_utils/src/unused_port.rs diff --git a/common/system_health/Cargo.toml b/common/system_health/Cargo.toml index 034683f72e..2cafc42d6e 100644 --- a/common/system_health/Cargo.toml +++ b/common/system_health/Cargo.toml @@ -5,6 +5,8 @@ edition = { workspace = true } [dependencies] lighthouse_network = { workspace = true } +metrics = { workspace = true } +network_utils = { workspace = true } parking_lot = { workspace = true } serde = { workspace = true } sysinfo = { workspace = true } diff --git a/common/system_health/src/lib.rs b/common/system_health/src/lib.rs index 31b222c540..b61bdec486 100644 --- a/common/system_health/src/lib.rs +++ b/common/system_health/src/lib.rs @@ -1,4 +1,5 @@ use lighthouse_network::{NetworkGlobals, types::SyncState}; +use network_utils::discovery_metrics; use parking_lot::RwLock; use 
serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; @@ -219,33 +220,21 @@ impl NatState { /// Observes if NAT traversal is possible. pub fn observe_nat() -> NatState { - let discv5_ipv4 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["discv5_ipv4"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let discv5_ipv4 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["discv5_ipv4"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let discv5_ipv6 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["discv5_ipv6"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let discv5_ipv6 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["discv5_ipv6"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let libp2p_ipv4 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["libp2p_ipv4"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let libp2p_ipv4 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["libp2p_ipv4"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let libp2p_ipv6 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["libp2p_ipv6"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let libp2p_ipv6 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); NatState { discv5_ipv4, diff --git a/common/unused_port/Cargo.toml b/common/unused_port/Cargo.toml deleted file mode 100644 index 2d771cd600..0000000000 --- a/common/unused_port/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "unused_port" -version = "0.1.0" -edition = { workspace = true } -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -lru_cache = { workspace = true } -parking_lot = { workspace = true } diff --git a/lcli/Cargo.toml b/lcli/Cargo.toml index 
b962fa3b81..2eed9da4c0 100644 --- a/lcli/Cargo.toml +++ b/lcli/Cargo.toml @@ -30,6 +30,7 @@ hex = { workspace = true } lighthouse_network = { workspace = true } lighthouse_version = { workspace = true } log = { workspace = true } +network_utils = { workspace = true } rayon = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/lcli/src/generate_bootnode_enr.rs b/lcli/src/generate_bootnode_enr.rs index 6fe13d17c3..ddd36e7e7a 100644 --- a/lcli/src/generate_bootnode_enr.rs +++ b/lcli/src/generate_bootnode_enr.rs @@ -1,9 +1,10 @@ use clap::ArgMatches; use lighthouse_network::{ NETWORK_KEY_FILENAME, NetworkConfig, - discovery::{CombinedKey, CombinedKeyExt, ENR_FILENAME, build_enr}, + discovery::{CombinedKey, ENR_FILENAME, build_enr}, libp2p::identity::secp256k1, }; +use network_utils::enr_ext::CombinedKeyExt; use std::io::Write; use std::path::PathBuf; use std::{fs, net::Ipv4Addr}; diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index 849d30bcf2..bf8241f8a2 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -57,6 +57,7 @@ lighthouse_tracing = { workspace = true } lighthouse_version = { workspace = true } logging = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } opentelemetry = { workspace = true } opentelemetry-otlp = { workspace = true } opentelemetry_sdk = { workspace = true } @@ -70,7 +71,6 @@ tracing = { workspace = true } tracing-opentelemetry = { workspace = true } tracing-subscriber = { workspace = true } types = { workspace = true } -unused_port = { workspace = true } validator_client = { workspace = true } validator_manager = { path = "../validator_manager" } diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 38fd54d29d..1fd3cc1b79 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -10,6 +10,9 @@ use beacon_node::{ }; use beacon_processor::BeaconProcessorConfig; use 
lighthouse_network::PeerId; +use network_utils::unused_port::{ + unused_tcp4_port, unused_tcp6_port, unused_udp4_port, unused_udp6_port, +}; use std::fs::File; use std::io::{Read, Write}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -22,7 +25,6 @@ use std::time::Duration; use tempfile::TempDir; use types::non_zero_usize::new_non_zero_usize; use types::{Address, Checkpoint, Epoch, Hash256, MainnetEthSpec}; -use unused_port::{unused_tcp4_port, unused_tcp6_port, unused_udp4_port, unused_udp6_port}; const DEFAULT_EXECUTION_ENDPOINT: &str = "http://localhost:8551/"; const DEFAULT_EXECUTION_JWT_SECRET_KEY: &str = diff --git a/lighthouse/tests/boot_node.rs b/lighthouse/tests/boot_node.rs index bd1cd7574e..38111ca0ef 100644 --- a/lighthouse/tests/boot_node.rs +++ b/lighthouse/tests/boot_node.rs @@ -3,8 +3,8 @@ use boot_node::config::BootNodeConfigSerialization; use crate::exec::{CommandLineTestExec, CompletedTest}; use clap::ArgMatches; use clap_utils::get_eth2_network_config; -use lighthouse_network::Enr; -use lighthouse_network::discovery::ENR_FILENAME; +use lighthouse_network::{Enr, discovery::ENR_FILENAME}; +use network_utils::unused_port::unused_udp4_port; use std::fs::File; use std::io::Write; use std::net::Ipv4Addr; @@ -12,7 +12,6 @@ use std::path::{Path, PathBuf}; use std::process::Command; use std::str::FromStr; use tempfile::TempDir; -use unused_port::unused_udp4_port; const IP_ADDRESS: &str = "192.168.2.108"; diff --git a/testing/execution_engine_integration/Cargo.toml b/testing/execution_engine_integration/Cargo.toml index 07d8d98f1d..eef13cfc73 100644 --- a/testing/execution_engine_integration/Cargo.toml +++ b/testing/execution_engine_integration/Cargo.toml @@ -18,6 +18,7 @@ fork_choice = { workspace = true } futures = { workspace = true } hex = { workspace = true } logging = { workspace = true } +network_utils = { workspace = true } reqwest = { workspace = true } sensitive_url = { workspace = true } serde_json = { workspace = true } @@ -25,4 +26,3 @@ 
task_executor = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } types = { workspace = true } -unused_port = { workspace = true } diff --git a/testing/execution_engine_integration/src/execution_engine.rs b/testing/execution_engine_integration/src/execution_engine.rs index 61a50b0405..ed4ee4682f 100644 --- a/testing/execution_engine_integration/src/execution_engine.rs +++ b/testing/execution_engine_integration/src/execution_engine.rs @@ -1,10 +1,10 @@ use ethers_providers::{Http, Provider}; use execution_layer::DEFAULT_JWT_FILE; +use network_utils::unused_port::unused_tcp4_port; use sensitive_url::SensitiveUrl; use std::path::PathBuf; use std::process::Child; use tempfile::TempDir; -use unused_port::unused_tcp4_port; pub const KEYSTORE_PASSWORD: &str = "testpwd"; pub const ACCOUNT1: &str = "7b8C3a386C0eea54693fFB0DA17373ffC9228139"; diff --git a/testing/execution_engine_integration/src/geth.rs b/testing/execution_engine_integration/src/geth.rs index 91d6c7fd57..4b62e68e94 100644 --- a/testing/execution_engine_integration/src/geth.rs +++ b/testing/execution_engine_integration/src/geth.rs @@ -1,11 +1,11 @@ use crate::build_utils; use crate::execution_engine::GenericExecutionEngine; use crate::genesis_json::geth_genesis_json; +use network_utils::unused_port::unused_tcp4_port; use std::path::{Path, PathBuf}; use std::process::{Child, Command, Output}; use std::{env, fs}; use tempfile::TempDir; -use unused_port::unused_tcp4_port; const GETH_BRANCH: &str = "master"; const GETH_REPO_URL: &str = "https://github.com/ethereum/go-ethereum"; diff --git a/testing/execution_engine_integration/src/nethermind.rs b/testing/execution_engine_integration/src/nethermind.rs index c3b8651789..6a336161bd 100644 --- a/testing/execution_engine_integration/src/nethermind.rs +++ b/testing/execution_engine_integration/src/nethermind.rs @@ -1,12 +1,12 @@ use crate::build_utils; use crate::execution_engine::GenericExecutionEngine; use 
crate::genesis_json::nethermind_genesis_json; +use network_utils::unused_port::unused_tcp4_port; use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Child, Command, Output}; use tempfile::TempDir; -use unused_port::unused_tcp4_port; /// We've pinned the Nethermind version since our method of using the `master` branch to /// find the latest tag isn't working. It appears Nethermind don't always tag on `master`. From f71d69755d3961474c38e7025064e848f379dbe1 Mon Sep 17 00:00:00 2001 From: kevaundray Date: Wed, 10 Sep 2025 14:48:11 +0100 Subject: [PATCH 25/81] chore: add comment to PendingComponents (#7979) Adds doc comment Co-Authored-By: Kevaundray Wedderburn Co-Authored-By: Jimmy Chen --- .../src/data_availability_checker/overflow_lru_cache.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index eaea2f70da..7f083139ee 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -26,6 +26,15 @@ use types::{ /// /// The blobs are all gossip and kzg verified. /// The block has completed all verifications except the availability check. +/// +/// There are currently three distinct hardfork eras that one should take note of: +/// - Pre-Deneb: No availability requirements (Block is immediately available) +/// - Post-Deneb, Pre-PeerDAS: Blobs are needed, but columns are not for the availability check +/// - Post-PeerDAS: Columns are needed, but blobs are not for the availability check +/// +/// Note: from this, one can immediately see that `verified_blobs` and `verified_data_columns` +/// are mutually exclusive. i.e. If we are verifying columns to determine a block's availability +/// we are ignoring the `verified_blobs` field. 
pub struct PendingComponents { pub block_root: Hash256, pub verified_blobs: RuntimeFixedVector>>, From 2ecbb7f90bcf67c635a68eccdaec5e5eadf72b08 Mon Sep 17 00:00:00 2001 From: Daniel Ramirez-Chiquillo Date: Wed, 10 Sep 2025 08:52:34 -0500 Subject: [PATCH 26/81] Remove cargo test targets, use nextest exclusively (#7874) Fixes #7835 - Remove cargo test-based Make targets (`test-release`, `test-debug`, `run-ef-tests`) - Update aliases (`test`, `test-full`, `test-ef`) to use existing nextest equivalents - Update contributing documentation to use nextest examples - Fix example commands that previously referenced non-existing packages (`ssz`/`eth2_ssz`) Co-Authored-By: Daniel Ramirez-Chiquillo --- .github/workflows/test-suite.yml | 8 +-- CLAUDE.md | 51 ++++++++++++--- Makefile | 28 ++------- book/src/contributing_setup.md | 105 +++++++++---------------------- wordlist.txt | 1 + 5 files changed, 79 insertions(+), 114 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index faa2745f55..59a045c7d3 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -102,7 +102,7 @@ jobs: with: version: nightly-ca67d15f4abd46394b324c50e21e66f306a1162d - name: Run tests in release - run: make nextest-release + run: make test-release - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -134,7 +134,7 @@ jobs: - name: Set LIBCLANG_PATH run: echo "LIBCLANG_PATH=$((gcm clang).source -replace "clang.exe")" >> $env:GITHUB_ENV - name: Run tests in release - run: make nextest-release + run: make test-release - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -269,7 +269,7 @@ jobs: with: version: nightly-ca67d15f4abd46394b324c50e21e66f306a1162d - name: Run tests in debug - run: make nextest-debug + run: make test-debug - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -306,7 +306,7 @@ jobs: cache-target: release 
bins: cargo-nextest - name: Run consensus-spec-tests with blst and fake_crypto - run: make nextest-ef + run: make test-ef - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true diff --git a/CLAUDE.md b/CLAUDE.md index 53a4433747..3e9ab169f3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,25 +7,28 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co **Important**: Always branch from `unstable` and target `unstable` when creating pull requests. ### Building and Installation + - `make install` - Build and install the main Lighthouse binary in release mode - `make install-lcli` - Build and install the `lcli` utility binary - `cargo build --release` - Standard Rust release build - `cargo build --bin lighthouse --features "gnosis,slasher-lmdb"` - Build with specific features ### Testing + - `make test` - Run the full test suite in release mode (excludes EF tests, beacon_chain, slasher, network, http_api) -- `make nextest-release` - Run tests using nextest (faster parallel test runner) +- `make test-release` - Run tests using nextest (faster parallel test runner) - `make test-beacon-chain` - Run beacon chain tests for all supported forks - `make test-slasher` - Run slasher tests with all database backend combinations - `make test-ef` - Download and run Ethereum Foundation test vectors - `make test-full` - Complete test suite including linting, EF tests, and execution engine tests -- `cargo test -p ` - Run tests for a specific package -- `cargo test -p ` - Run individual test (preferred during development iteration) +- `cargo nextest run -p ` - Run tests for a specific package +- `cargo nextest run -p ` - Run individual test (preferred during development iteration) - `FORK_NAME=electra cargo nextest run -p beacon_chain` - Run tests for specific fork **Note**: Full test suite takes ~20 minutes. When iterating, prefer running individual tests. 
-### Linting and Code Quality +### Linting and Code Quality + - `make lint` - Run Clippy linter with project-specific rules - `make lint-full` - Run comprehensive linting including tests (recommended for thorough checking) - `make cargo-fmt` - Check code formatting with rustfmt @@ -33,8 +36,9 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - `make audit` - Run security audit on dependencies ### Cross-compilation + - `make build-x86_64` - Cross-compile for x86_64 Linux -- `make build-aarch64` - Cross-compile for ARM64 Linux +- `make build-aarch64` - Cross-compile for ARM64 Linux - `make build-riscv64` - Cross-compile for RISC-V 64-bit Linux ## Architecture Overview @@ -44,13 +48,15 @@ Lighthouse is a modular Ethereum consensus client with two main components: ### Core Components **Beacon Node** (`beacon_node/`) + - Main consensus client that syncs with the Ethereum network - Contains the beacon chain state transition logic (`beacon_node/beacon_chain/`) - Handles networking, storage, and P2P communication - Provides HTTP API for validator clients and external tools - Entry point: `beacon_node/src/lib.rs` -**Validator Client** (`validator_client/`) +**Validator Client** (`validator_client/`) + - Manages validator keystores and performs validator duties - Connects to beacon nodes via HTTP API - Handles block proposals, attestations, and sync committee duties @@ -60,31 +66,37 @@ Lighthouse is a modular Ethereum consensus client with two main components: ### Key Subsystems **Consensus Types** (`consensus/types/`) + - Core Ethereum consensus data structures (BeaconState, BeaconBlock, etc.) 
- Ethereum specification implementations for different networks (mainnet, gnosis) - SSZ encoding/decoding and state transition primitives **Storage** (`beacon_node/store/`) + - Hot/cold database architecture for efficient beacon chain storage - Supports multiple backends (LevelDB, RocksDB, REDB) - Handles state pruning and historical data management **Networking** (`beacon_node/lighthouse_network/`, `beacon_node/network/`) + - Libp2p-based P2P networking stack - Gossipsub for message propagation - Discovery v5 for peer discovery - Request/response protocols for sync **Fork Choice** (`consensus/fork_choice/`, `consensus/proto_array/`) + - Implements Ethereum's fork choice algorithm (proto-array) - Manages chain reorganizations and finality **Execution Layer Integration** (`beacon_node/execution_layer/`) + - Interfaces with execution clients - Retrieves payloads from local execution layer or external block builders - Handles payload validation and builder integration **Slasher** (`slasher/`) + - Optional slashing detection service - Supports LMDB, MDBX, and REDB database backends - Can be enabled with `--slasher` flag @@ -120,6 +132,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: ## Common Review Standards ### CI/Testing Requirements + - All checks must pass before merge - Test coverage expected for significant changes - Flaky tests are actively addressed and fixed @@ -127,12 +140,14 @@ Lighthouse is a modular Ethereum consensus client with two main components: - `beacon_chain` and `http_api` tests support fork-specific testing using `FORK_NAME` env var when `beacon_chain/fork_from_env` feature is enabled ### Code Quality Standards + - Clippy warnings must be fixed promptly (multiple PRs show this pattern) - Code formatting with `cargo fmt` enforced - Must run `cargo sort` when adding dependencies - dependency order is enforced on CI - Performance considerations for hot paths ### Documentation and Context + - PRs require clear 
descriptions of what and why - Breaking changes need migration documentation - API changes require documentation updates @@ -140,6 +155,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Comments appreciated for complex logic ### Security and Safety + - Careful review of consensus-critical code paths - Error handling patterns must be comprehensive - Input validation for external data @@ -147,6 +163,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: ## Development Patterns and Best Practices ### Panics and Error Handling + - **Panics should be avoided at all costs** - Always prefer returning a `Result` or `Option` over causing a panic (e.g., prefer `array.get(1)?` over `array[1]`) - Avoid `expect` or `unwrap` at runtime - only acceptable during startup when validating CLI flags or configurations @@ -154,18 +171,22 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Use proper error handling with `Result` types and graceful error propagation ### Rayon Usage + - Avoid using the rayon global thread pool as it results in CPU oversubscription when the beacon processor has fully allocated all CPUs to workers - Use scoped rayon pools started by beacon processor for computational intensive tasks ### Locks + - Take great care to avoid deadlocks when working with fork choice locks - seek detailed review ([reference](beacon_node/beacon_chain/src/canonical_head.rs:9)) - Keep lock scopes as narrow as possible to avoid blocking fast-responding functions like the networking stack ### Async Patterns + - Avoid blocking computations in async tasks - Spawn a blocking task instead for CPU-intensive work ### Tracing + - Design spans carefully and avoid overuse of spans just to add context data to events - Avoid using spans on simple getter methods as it can result in performance overhead - Be cautious of span explosion with recursive functions @@ -173,14 +194,17 @@ Lighthouse is a modular Ethereum 
consensus client with two main components: - Avoid using `span.enter()` or `span.entered()` in async tasks ### Database + - Maintain schema continuity on `unstable` branch - Database migrations must be backward compatible ### Consensus Crate + - Use safe math methods like `saturating_xxx` or `checked_xxx` - Critical that this crate behaves deterministically and MUST not have undefined behavior ### Testing Patterns + - **Use appropriate test types for the right scenarios**: - **Unit tests** for single component edge cases and isolated logic - **Integration tests** using [`BeaconChainHarness`](beacon_node/beacon_chain/src/test_utils.rs:668) for end-to-end workflows @@ -204,6 +228,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: - See [`scripts/local_testnet/README.md`](scripts/local_testnet/README.md) for setup instructions ### TODOs and Comments + - All `TODO` statements must be accompanied by a GitHub issue link - Prefer line (`//`) comments to block comments (`/* ... */`) - Use doc comments (`///`) before attributes for public items @@ -211,7 +236,9 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Provide examples in doc comments for public APIs when helpful ## Logging Guidelines + Use appropriate log levels for different scenarios: + - **`crit`**: Critical issues with major impact to Lighthouse functionality - Lighthouse may not function correctly without resolving. Needs immediate attention. - **`error`**: Error cases that may have moderate impact to Lighthouse functionality. Expect to receive reports from users for this level. - **`warn`**: Unexpected code paths that don't have major impact - fully recoverable. Expect user reports if excessive warning logs occur. 
@@ -221,6 +248,7 @@ Use appropriate log levels for different scenarios: ## Code Examples ### Safe Math in Consensus Crate + ```rust // ❌ Avoid - could panic let result = a + b; @@ -234,6 +262,7 @@ let result = a.safe_add(b)?; ``` ### Panics and Error Handling + ```rust // ❌ Avoid - could panic at runtime let value = some_result.unwrap(); @@ -253,6 +282,7 @@ let item = array.get(1).expect("Array always has at least 2 elements due to vali ``` ### TODO Format + ```rust pub fn my_function(&mut self, _something: &[u8]) -> Result { // TODO: Implement proper validation here @@ -261,6 +291,7 @@ pub fn my_function(&mut self, _something: &[u8]) -> Result { ``` ### Async Task Spawning for Blocking Work + ```rust // ❌ Avoid - blocking in async context async fn some_handler() { @@ -276,6 +307,7 @@ async fn some_handler() { ``` ### Tracing Span Usage + ```rust // ❌ Avoid - span on simple getter #[instrument] @@ -291,9 +323,10 @@ async fn process_block(&self, block: Block) -> Result<(), Error> { ``` ## Build and Development Notes -- Full builds and tests take 5+ minutes - use large timeouts (300s+) for any `cargo build`, `cargo test`, or `make` commands + +- Full builds and tests take 5+ minutes - use large timeouts (300s+) for any `cargo build`, `cargo nextest`, or `make` commands - Use `cargo check` for faster iteration during development and always run after code changes +- Prefer targeted package tests (`cargo nextest run -p `) and individual tests over full test suite when debugging specific issues - Use `cargo fmt --all && make lint-fix` to format code and fix linting issues once a task is complete -- Prefer targeted package tests (`cargo test -p `) and individual tests over full test suite when debugging specific issues - Always understand the broader codebase patterns before making changes -- Minimum Supported Rust Version (MSRV) is documented in `lighthouse/Cargo.toml` - ensure Rust version meets or exceeds this requirement \ No newline at end of file +- Minimum 
Supported Rust Version (MSRV) is documented in `lighthouse/Cargo.toml` - ensure Rust version meets or exceeds this requirement diff --git a/Makefile b/Makefile index 475d3aac8a..79fe7ea496 100644 --- a/Makefile +++ b/Makefile @@ -139,29 +139,18 @@ build-release-tarballs: $(call tarball_release_binary,$(BUILD_PATH_RISCV64),$(RISCV64_TAG),"") + # Runs the full workspace tests in **release**, without downloading any additional # test vectors. test-release: - cargo test --workspace --release --features "$(TEST_FEATURES)" \ - --exclude ef_tests --exclude beacon_chain --exclude slasher --exclude network \ - --exclude http_api - -# Runs the full workspace tests in **release**, without downloading any additional -# test vectors, using nextest. -nextest-release: cargo nextest run --workspace --release --features "$(TEST_FEATURES)" \ --exclude ef_tests --exclude beacon_chain --exclude slasher --exclude network \ --exclude http_api + # Runs the full workspace tests in **debug**, without downloading any additional test # vectors. test-debug: - cargo test --workspace --features "$(TEST_FEATURES)" \ - --exclude ef_tests --exclude beacon_chain --exclude network --exclude http_api - -# Runs the full workspace tests in **debug**, without downloading any additional test -# vectors, using nextest. -nextest-debug: cargo nextest run --workspace --features "$(TEST_FEATURES)" \ --exclude ef_tests --exclude beacon_chain --exclude network --exclude http_api @@ -173,15 +162,9 @@ cargo-fmt: check-benches: cargo check --workspace --benches --features "$(TEST_FEATURES)" -# Runs only the ef-test vectors. 
-run-ef-tests: - rm -rf $(EF_TESTS)/.accessed_file_log.txt - cargo test --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES)" - cargo test --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES),fake_crypto" - ./$(EF_TESTS)/check_all_files_accessed.py $(EF_TESTS)/.accessed_file_log.txt $(EF_TESTS)/consensus-spec-tests -# Runs EF test vectors with nextest -nextest-run-ef-tests: +# Runs EF test vectors +run-ef-tests: rm -rf $(EF_TESTS)/.accessed_file_log.txt cargo nextest run --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES)" cargo nextest run --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES),fake_crypto" @@ -233,9 +216,6 @@ test-ef: make-ef-tests run-ef-tests # Downloads and runs the nightly EF test vectors. test-ef-nightly: make-ef-tests-nightly run-ef-tests -# Downloads and runs the EF test vectors with nextest. -nextest-ef: make-ef-tests nextest-run-ef-tests - # Runs tests checking interop between Lighthouse and execution clients. test-exec-engine: make -C $(EXECUTION_ENGINE_INTEGRATION) test diff --git a/book/src/contributing_setup.md b/book/src/contributing_setup.md index 7143c8f0fb..b817faad87 100644 --- a/book/src/contributing_setup.md +++ b/book/src/contributing_setup.md @@ -26,7 +26,7 @@ you can run them locally and avoid CI failures: - `$ make cargo-fmt`: (fast) runs a Rust code formatting check. - `$ make lint`: (fast) runs a Rust code linter. -- `$ make test`: (medium) runs unit tests across the whole project. +- `$ make test`: (medium) runs unit tests across the whole project using nextest. - `$ make test-ef`: (medium) runs the Ethereum Foundation test vectors. - `$ make test-full`: (slow) runs the full test suite (including all previous commands). This is approximately everything @@ -36,88 +36,39 @@ _The lighthouse test suite is quite extensive, running the whole suite may take ## Testing -As with most other Rust projects, Lighthouse uses `cargo test` for unit and -integration tests. 
For example, to test the `ssz` crate run: +Lighthouse uses `cargo nextest` for unit and integration tests. Nextest provides better parallelization and is used by CI. For example, to test the `safe_arith` crate run: ```bash -$ cd consensus/ssz -$ cargo test - Finished test [unoptimized + debuginfo] target(s) in 7.69s - Running unittests (target/debug/deps/ssz-61fc26760142b3c4) - -running 27 tests -test decode::impls::tests::awkward_fixed_length_portion ... ok -test decode::impls::tests::invalid_h256 ... ok - -test encode::tests::test_encode_length ... ok -test encode::impls::tests::vec_of_vec_of_u8 ... ok -test encode::tests::test_encode_length_above_max_debug_panics - should panic ... ok - -test result: ok. 27 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Running tests/tests.rs (target/debug/deps/tests-f8fb1f9ccb197bf4) - -running 20 tests -test round_trip::bool ... ok -test round_trip::first_offset_skips_byte ... ok -test round_trip::fixed_len_excess_bytes ... ok - -test round_trip::vec_u16 ... ok -test round_trip::vec_of_vec_u16 ... ok - -test result: ok. 20 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Doc-tests ssz - -running 3 tests -test src/decode.rs - decode::SszDecoder (line 258) ... ok -test src/encode.rs - encode::SszEncoder (line 57) ... ok -test src/lib.rs - (line 10) ... ok - -test result: ok. 
3 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.15s$ cargo test -p eth2_ssz +$ cd consensus/safe_arith +$ cargo nextest run + Finished test [unoptimized + debuginfo] target(s) in 0.43s + ------------ + Nextest run ID: 01234567-89ab-cdef-0123-456789abcdef + Starting 8 tests across 1 binary + PASS [ 0.001s] safe_arith tests::test_safe_add_u64 + PASS [ 0.001s] safe_arith tests::test_safe_mul_u64 + + ------------ + Summary [ 0.012s] 8 tests run: 8 passed, 0 skipped ``` -Alternatively, since `lighthouse` is a cargo workspace you can use `-p eth2_ssz` where -`eth2_ssz` is the package name as defined `/consensus/ssz/Cargo.toml` +Alternatively, since `lighthouse` is a cargo workspace you can use `-p safe_arith` where +`safe_arith` is the package name as defined in `/consensus/safe_arith/Cargo.toml`: ```bash -$ head -2 consensus/ssz/Cargo.toml +$ head -2 consensus/safe_arith/Cargo.toml [package] -name = "eth2_ssz" -$ cargo test -p eth2_ssz - Finished test [unoptimized + debuginfo] target(s) in 7.69s - Running unittests (target/debug/deps/ssz-61fc26760142b3c4) - -running 27 tests -test decode::impls::tests::awkward_fixed_length_portion ... ok -test decode::impls::tests::invalid_h256 ... ok - -test encode::tests::test_encode_length ... ok -test encode::impls::tests::vec_of_vec_of_u8 ... ok -test encode::tests::test_encode_length_above_max_debug_panics - should panic ... ok - -test result: ok. 27 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Running tests/tests.rs (target/debug/deps/tests-f8fb1f9ccb197bf4) - -running 20 tests -test round_trip::bool ... ok -test round_trip::first_offset_skips_byte ... ok -test round_trip::fixed_len_excess_bytes ... ok - -test round_trip::vec_u16 ... ok -test round_trip::vec_of_vec_u16 ... ok - -test result: ok. 20 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Doc-tests ssz - -running 3 tests -test src/decode.rs - decode::SszDecoder (line 258) ... 
ok -test src/encode.rs - encode::SszEncoder (line 57) ... ok -test src/lib.rs - (line 10) ... ok - -test result: ok. 3 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.15s$ cargo test -p eth2_ssz +name = "safe_arith" +$ cargo nextest run -p safe_arith + Finished test [unoptimized + debuginfo] target(s) in 0.43s + ------------ + Nextest run ID: 01234567-89ab-cdef-0123-456789abcdef + Starting 8 tests across 1 binary + PASS [ 0.001s] safe_arith tests::test_safe_add_u64 + PASS [ 0.001s] safe_arith tests::test_safe_mul_u64 + + ------------ + Summary [ 0.012s] 8 tests run: 8 passed, 0 skipped ``` ### test_logger @@ -129,7 +80,7 @@ testing the logs are displayed. This can be very helpful while debugging tests. Example: ``` -$ cargo test -p beacon_chain validator_pubkey_cache::test::basic_operation --features 'logging/test_logger' +$ cargo nextest run -p beacon_chain -E 'test(validator_pubkey_cache::test::basic_operation)' --features 'logging/test_logger' Finished test [unoptimized + debuginfo] target(s) in 0.20s Running unittests (target/debug/deps/beacon_chain-975363824f1143bc) diff --git a/wordlist.txt b/wordlist.txt index 0391af78cb..57674cf974 100644 --- a/wordlist.txt +++ b/wordlist.txt @@ -187,6 +187,7 @@ namespace natively nd ness +nextest nginx nitty oom From 02d519e95706afb520329fd9bf8a5b85a61de150 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 11 Sep 2025 07:02:27 +1000 Subject: [PATCH 27/81] Fixed orphaned `verify_cell_proof_chunk` span. (#8026) Fixed orphaned kzg verify cell proof chunk spans. See screenshot: image The parent span needs to be passed explicitly to the chunk verification span as parent, as rayon runs the function in a separate thread. 
Co-Authored-By: Jimmy Chen Co-Authored-By: Eitan Seri-Levi --- crypto/kzg/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/kzg/src/lib.rs b/crypto/kzg/src/lib.rs index 1b8d46100f..0fe95b7723 100644 --- a/crypto/kzg/src/lib.rs +++ b/crypto/kzg/src/lib.rs @@ -23,7 +23,7 @@ pub use rust_eth_kzg::{ constants::{BYTES_PER_CELL, CELLS_PER_EXT_BLOB}, Cell, CellIndex as CellID, CellRef, TrustedSetup as PeerDASTrustedSetup, }; -use tracing::instrument; +use tracing::{instrument, Span}; /// Disables the fixed-base multi-scalar multiplication optimization for computing /// cell KZG proofs, because `rust-eth-kzg` already handles the precomputation. @@ -269,6 +269,7 @@ impl Kzg { .push((cell, *proof, *commitment)); } + let span = Span::current(); column_groups .into_par_iter() .map(|(column_index, column_data)| { @@ -286,6 +287,7 @@ impl Kzg { // This is safe from span explosion as we have at most 128 chunks, // i.e. the number of column indices. let _span = tracing::debug_span!( + parent: span.clone(), "verify_cell_proof_chunk", cells = cells.len(), column_index, From a080bb5ceede2f5fdfd8b955841a44ef7855b1e5 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Thu, 11 Sep 2025 10:47:39 +1000 Subject: [PATCH 28/81] Increase HTTP timeouts on CI (#8031) Since we re-enabled HTTP API tests on CI (https://github.com/sigp/lighthouse/pull/7943) there have been a few spurious failures: - https://github.com/sigp/lighthouse/actions/runs/17608432465/job/50024519938?pr=7783 That error is awkward, but running locally with a short timeout confirms it to be a timeout. Change the request timeout to 5s everywhere. We had kept it shorter to try to detect performance regressions, but I think this is better suited to being done with metrics & traces. On CI we really just want things to pass reliably without flakiness, so I think a longer timeout to handle slower test code (like mock-builder) and overworked CI boxes makes sense. 
Co-Authored-By: Michael Sproul --- beacon_node/http_api/src/test_utils.rs | 9 ++++----- beacon_node/http_api/tests/broadcast_validation_tests.rs | 3 ++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index 28eed26276..fe9e0dff70 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -145,12 +145,11 @@ impl InteractiveTester { tokio::spawn(mock_builder_server); } - // Override the default timeout to 2s to timeouts on CI, as CI seems to require longer - // to process. The 1s timeouts for other tasks have been working for a long time, so we'll - // keep it as it is, as it may help identify a performance regression. + // Use 5s timeouts on CI, as there are several sources of artificial slowness, including + // mock-builder. let timeouts = Timeouts { - default: Duration::from_secs(2), - ..Timeouts::set_all(Duration::from_secs(1)) + default: Duration::from_secs(5), + ..Timeouts::set_all(Duration::from_secs(5)) }; let client = BeaconNodeHttpClient::new(beacon_url.clone(), timeouts); diff --git a/beacon_node/http_api/tests/broadcast_validation_tests.rs b/beacon_node/http_api/tests/broadcast_validation_tests.rs index 7f02c2c0fd..9427f6fdf3 100644 --- a/beacon_node/http_api/tests/broadcast_validation_tests.rs +++ b/beacon_node/http_api/tests/broadcast_validation_tests.rs @@ -1383,7 +1383,8 @@ pub async fn blinded_equivocation_gossip() { // XXX: this should be a 400 but is a 500 due to the mock-builder being janky assert_eq!( error_response.status(), - Some(StatusCode::INTERNAL_SERVER_ERROR) + Some(StatusCode::INTERNAL_SERVER_ERROR), + "{error_response:?}" ); } else { assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); From 58156815f12dd4ed8d4b0c0bb28c96ef78956d97 Mon Sep 17 00:00:00 2001 From: Daniel Knopik <107140945+dknopik@users.noreply.github.com> Date: Thu, 11 Sep 2025 08:11:58 +0200 Subject: [PATCH 29/81] Expose
functions to do preliminary slashing checks (#7783) Co-Authored-By: Daniel Knopik Co-Authored-By: Michael Sproul --- .gitignore | 1 - Makefile | 1 + clippy.toml | 7 ++ .../src/slashing_database.rs | 77 +++++++++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 clippy.toml diff --git a/.gitignore b/.gitignore index e63e218a3b..efd7916b05 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ perf.data* *.tar.gz /bin genesis.ssz -/clippy.toml /.cargo # IntelliJ diff --git a/Makefile b/Makefile index 79fe7ea496..b9f93942f6 100644 --- a/Makefile +++ b/Makefile @@ -249,6 +249,7 @@ lint: -D clippy::fn_to_numeric_cast_any \ -D clippy::manual_let_else \ -D clippy::large_stack_frames \ + -D clippy::disallowed_methods \ -D warnings \ -A clippy::derive_partial_eq_without_eq \ -A clippy::upper-case-acronyms \ diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000000..dabcbe8bf5 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,7 @@ +# Disallow preliminary slashing checks, +disallowed-methods = [ + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_block_proposal", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_block_proposal" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_block_signing_root", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_block_signing_root" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_attestation", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_attestation" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_attestation_signing_root", reason = "not safe for slashing checks", replacement = 
"slashing_protection::slashing_database::SlashingDatabase::check_and_insert_attestation_signing_root" }, +] diff --git a/validator_client/slashing_protection/src/slashing_database.rs b/validator_client/slashing_protection/src/slashing_database.rs index 9cecdaa8a5..7d8947a584 100644 --- a/validator_client/slashing_protection/src/slashing_database.rs +++ b/validator_client/slashing_protection/src/slashing_database.rs @@ -599,6 +599,40 @@ impl SlashingDatabase { Ok(safe) } + /// Check whether a block would be safe to sign if we were to sign it now. + /// + /// The database is not modified, and therefore multiple threads reading the database might get + /// the same result. Therefore: + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF A BLOCK IS SAFE TO SIGN! + pub fn preliminary_check_block_proposal( + &self, + validator_pubkey: &PublicKeyBytes, + block_header: &BeaconBlockHeader, + domain: Hash256, + ) -> Result { + #[allow(clippy::disallowed_methods)] + self.preliminary_check_block_signing_root( + validator_pubkey, + block_header.slot, + block_header.signing_root(domain).into(), + ) + } + + /// As for `preliminary_check_block_proposal` but without requiring the whole `BeaconBlockHeader`. + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF A BLOCK IS SAFE TO SIGN! + pub fn preliminary_check_block_signing_root( + &self, + validator_pubkey: &PublicKeyBytes, + slot: Slot, + signing_root: SigningRoot, + ) -> Result { + let mut conn = self.conn_pool.get()?; + let txn = conn.transaction_with_behavior(TransactionBehavior::Exclusive)?; + self.check_block_proposal(&txn, validator_pubkey, slot, signing_root) + } + /// Check an attestation for slash safety, and if it is safe, record it in the database. /// /// The checking and inserting happen atomically and exclusively. We enforce exclusivity @@ -670,6 +704,49 @@ impl SlashingDatabase { Ok(safe) } + /// Check whether an attestation would be safe to sign if we were to sign it now. 
+ /// + /// The database is not modified, and therefore multiple threads reading the database might get + /// the same result. Therefore: + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF AN ATTESTATION IS SAFE TO SIGN! + pub fn preliminary_check_attestation( + &self, + validator_pubkey: &PublicKeyBytes, + attestation: &AttestationData, + domain: Hash256, + ) -> Result { + let attestation_signing_root = attestation.signing_root(domain).into(); + #[allow(clippy::disallowed_methods)] + self.preliminary_check_attestation_signing_root( + validator_pubkey, + attestation.source.epoch, + attestation.target.epoch, + attestation_signing_root, + ) + } + + /// As for `preliminary_check_attestation` but without requiring the whole `AttestationData`. + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF AN ATTESTATION IS SAFE TO SIGN! + pub fn preliminary_check_attestation_signing_root( + &self, + validator_pubkey: &PublicKeyBytes, + att_source_epoch: Epoch, + att_target_epoch: Epoch, + att_signing_root: SigningRoot, + ) -> Result { + let mut conn = self.conn_pool.get()?; + let txn = conn.transaction_with_behavior(TransactionBehavior::Exclusive)?; + self.check_attestation( + &txn, + validator_pubkey, + att_source_epoch, + att_target_epoch, + att_signing_root, + ) + } + /// Import slashing protection from another client in the interchange format. /// /// This function will atomically import the entire interchange, failing if *any* From 87ae301d0942c0bc632e41325765e46fc0fefa7b Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Fri, 12 Sep 2025 12:48:49 +1000 Subject: [PATCH 30/81] Remove unused logging metrics (#7997) @chong-he noticed that the INFO/WARN/ERRO log counts on our dashboards had stopped working. Since switching to `tracing` we are now tracking total events _per crate_, and the global counters are unused. 
Per-crate metrics are here: https://github.com/sigp/lighthouse/blob/cfb1f7331064b758c6786e4e1dc15507af5ff5d1/common/logging/src/tracing_metrics_layer.rs#L61-L63 Delete the unused global counters from the source. We can sum across the per-crate metric in our dashboards to restore the previous functionality. Co-Authored-By: Michael Sproul --- common/logging/src/lib.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/common/logging/src/lib.rs b/common/logging/src/lib.rs index 6722381dba..8ef3436b06 100644 --- a/common/logging/src/lib.rs +++ b/common/logging/src/lib.rs @@ -1,5 +1,3 @@ -use metrics::{IntCounter, Result as MetricsResult, try_create_int_counter}; -use std::sync::LazyLock; use std::time::{Duration, Instant}; use tracing_subscriber::EnvFilter; @@ -23,15 +21,6 @@ pub use utils::build_workspace_filter; /// The minimum interval between log messages indicating that a queue is full. const LOG_DEBOUNCE_INTERVAL: Duration = Duration::from_secs(30); -pub static INFOS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("info_total", "Count of infos logged")); -pub static WARNS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("warn_total", "Count of warns logged")); -pub static ERRORS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("error_total", "Count of errors logged")); -pub static CRITS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("crit_total", "Count of crits logged")); - /// Provides de-bounce functionality for logging. #[derive(Default)] pub struct TimeLatch(Option); From fb77ce9e192e6e478e56ce5423ffe0b43b3c4519 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 12 Sep 2025 15:11:30 +1000 Subject: [PATCH 31/81] Add missing event in `PendingComponent` span and clean up sync logs (#8033) I was looking into some long `PendingComponents` span and noticed the block event wasn't added to the span, so it wasn't possible to see when the block was added from the trace view, this PR fixes this. 
image Additionally I've noticed a lot of noises and confusion in sync logs due to the initial`peer_id` being included as part of the syncing chain span, causing all logs under the span to have that `peer_id`, which may not be accurate for some sync logs, I've removed `peer_id` from the `SyncingChain` span, and also cleaned up a bunch of spans to use `%` (display) for slots and epochs to make logs easier to read. Co-Authored-By: Jimmy Chen --- beacon_node/beacon_chain/src/block_verification.rs | 2 +- .../data_availability_checker/overflow_lru_cache.rs | 12 +++++++----- .../src/network_beacon_processor/gossip_methods.rs | 2 +- beacon_node/network/src/sync/range_sync/batch.rs | 5 ++++- beacon_node/network/src/sync/range_sync/chain.rs | 10 ++++++++-- beacon_node/store/src/hot_cold_store.rs | 4 ++-- beacon_node/store/src/state_cache.rs | 2 +- 7 files changed, 24 insertions(+), 13 deletions(-) diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 1d6e050f7e..1d10fae0a4 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -2061,7 +2061,7 @@ impl BlockBlobError for GossipDataColumnError { /// and `Cow::Borrowed(state)` will be returned. Otherwise, the state will be cloned, cheaply /// advanced and then returned as a `Cow::Owned`. The end result is that the given `state` is never /// mutated to be invalid (in fact, it is never changed beyond a simple committee cache build). 
-#[instrument(skip(state, spec), level = "debug")] +#[instrument(skip_all, fields(?state_root_opt, %block_slot), level = "debug")] pub fn cheap_state_advance_to_obtain_committees<'a, E: EthSpec, Err: BlockBlobError>( state: &'a mut BeaconState, state_root_opt: Option, diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 7f083139ee..9de63f6126 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -666,11 +666,13 @@ impl DataAvailabilityCheckerInner { None }; - debug!( - component = "block", - status = pending_components.status_str(num_expected_columns_opt), - "Component added to data availability checker" - ); + pending_components.span.in_scope(|| { + debug!( + component = "block", + status = pending_components.status_str(num_expected_columns_opt), + "Component added to data availability checker" + ); + }); self.check_availability_and_cache_components( block_root, diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index cb6d63fe91..cbe441b419 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -610,7 +610,7 @@ impl NetworkBeaconProcessor { parent = None, level = "debug", skip_all, - fields(slot = ?column_sidecar.slot(), block_root = ?column_sidecar.block_root(), index = column_sidecar.index), + fields(slot = %column_sidecar.slot(), block_root = ?column_sidecar.block_root(), index = column_sidecar.index), )] pub async fn process_gossip_data_column_sidecar( self: &Arc, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 1f51613996..31e6594139 100644 --- 
a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,4 +1,5 @@ use beacon_chain::block_verification_types::RpcBlock; +use derivative::Derivative; use lighthouse_network::PeerId; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; @@ -96,7 +97,8 @@ pub enum BatchProcessingResult { NonFaultyFailure, } -#[derive(Debug)] +#[derive(Derivative)] +#[derivative(Debug)] /// A segment of a chain. pub struct BatchInfo { /// Start slot of the batch. @@ -114,6 +116,7 @@ pub struct BatchInfo { /// Whether this batch contains all blocks or all blocks and blobs. batch_type: ByRangeRequestType, /// Pin the generic + #[derivative(Debug = "ignore")] marker: std::marker::PhantomData, } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index a8c85e44d2..3b816c0922 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -131,8 +131,14 @@ impl SyncingChain { name = SPAN_SYNCING_CHAIN, parent = None, level="debug", - skip(id), - fields(chain_id = %id) + skip_all, + fields( + chain_id = %id, + start_epoch = %start_epoch, + target_head_slot = %target_head_slot, + target_head_root = %target_head_root, + chain_type = ?chain_type, + ) )] pub fn new( id: Id, diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 7b390b39f3..7156c75f11 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -1040,7 +1040,7 @@ impl, Cold: ItemStore> HotColdDB /// - `result_state_root == state.canonical_root()` /// - `state.slot() <= max_slot` /// - `state.get_latest_block_root(result_state_root) == block_root` - #[instrument(skip(self, max_slot), level = "debug")] + #[instrument(skip_all, fields(?block_root, %max_slot, ?state_root), level = "debug")] pub fn get_advanced_hot_state( 
&self, block_root: Hash256, @@ -1112,7 +1112,7 @@ impl, Cold: ItemStore> HotColdDB /// If this function returns `Some(state)` then that `state` will always have /// `latest_block_header` matching `block_root` but may not be advanced all the way through to /// `max_slot`. - #[instrument(skip(self), level = "debug")] + #[instrument(skip_all, fields(?block_root, %max_slot), level = "debug")] pub fn get_advanced_hot_state_from_cache( &self, block_root: Hash256, diff --git a/beacon_node/store/src/state_cache.rs b/beacon_node/store/src/state_cache.rs index 05930c7b71..4b0d1ee016 100644 --- a/beacon_node/store/src/state_cache.rs +++ b/beacon_node/store/src/state_cache.rs @@ -299,7 +299,7 @@ impl StateCache { None } - #[instrument(skip(self), level = "debug")] + #[instrument(skip_all, fields(?block_root, %slot), level = "debug")] pub fn get_by_block_root( &mut self, block_root: Hash256, From aef8291f94fbb0e0cb3ddfd2be5184c5096a5f3c Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Thu, 11 Sep 2025 23:05:42 -0700 Subject: [PATCH 32/81] Add max delay to reconstruction (#7976) #7697 If we're three seconds into the current slot just trigger reconstruction. 
I don't know what the correct reconstruction deadline number is, but it should probably be at least half a second before the attestation deadline Co-Authored-By: Eitan Seri- Levi Co-Authored-By: Eitan Seri-Levi --- .../src/scheduler/work_reprocessing_queue.rs | 20 +- .../gossip_methods.rs | 1 + .../src/network_beacon_processor/tests.rs | 238 +++++++++++++++++- 3 files changed, 253 insertions(+), 6 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 032f14ce3d..9565e57589 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -82,6 +82,9 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; +/// Trigger reconstruction if we are this many seconds into the current slot +pub const RECONSTRUCTION_DEADLINE: Duration = Duration::from_millis(3000); + /// Messages that the scheduler can receive. #[derive(AsRefStr)] pub enum ReprocessQueueMessage { @@ -172,6 +175,7 @@ pub struct QueuedBackfillBatch(pub AsyncFn); pub struct QueuedColumnReconstruction { pub block_root: Hash256, + pub slot: Slot, pub process_fn: AsyncFn, } @@ -749,16 +753,26 @@ impl ReprocessQueue { } } InboundEvent::Msg(DelayColumnReconstruction(request)) => { + let mut reconstruction_delay = QUEUED_RECONSTRUCTION_DELAY; + if let Some(seconds_from_current_slot) = + self.slot_clock.seconds_from_current_slot_start() + && let Some(current_slot) = self.slot_clock.now() + && seconds_from_current_slot >= RECONSTRUCTION_DEADLINE + && current_slot == request.slot + { + // If we are at least `RECONSTRUCTION_DEADLINE` seconds into the current slot, + // and the reconstruction request is for the current slot, process reconstruction immediately. 
+ reconstruction_delay = Duration::from_secs(0); + } match self.queued_column_reconstructions.entry(request.block_root) { Entry::Occupied(key) => { - // Push back the reattempted reconstruction self.column_reconstructions_delay_queue - .reset(key.get(), QUEUED_RECONSTRUCTION_DELAY) + .reset(key.get(), reconstruction_delay); } Entry::Vacant(vacant) => { let delay_key = self .column_reconstructions_delay_queue - .insert(request, QUEUED_RECONSTRUCTION_DELAY); + .insert(request, reconstruction_delay); vacant.insert(delay_key); } } diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index cbe441b419..1f1a3427e7 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1064,6 +1064,7 @@ impl NetworkBeaconProcessor { work: Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction( QueuedColumnReconstruction { block_root, + slot: *slot, process_fn: Box::pin(async move { cloned_self .attempt_data_column_reconstruction(block_root, true) diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 2027a525e6..2935c2d213 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -94,12 +94,20 @@ impl TestRig { // This allows for testing voluntary exits without building out a massive chain. let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), spec).await + Self::new_parametric(chain_length, BeaconProcessorConfig::default(), false, spec).await + } + + pub async fn new_supernode(chain_length: u64) -> Self { + // This allows for testing voluntary exits without building out a massive chain. 
+ let mut spec = test_spec::(); + spec.shard_committee_period = 2; + Self::new_parametric(chain_length, BeaconProcessorConfig::default(), true, spec).await } pub async fn new_parametric( chain_length: u64, beacon_processor_config: BeaconProcessorConfig, + import_data_columns: bool, spec: ChainSpec, ) -> Self { let spec = Arc::new(spec); @@ -108,6 +116,7 @@ impl TestRig { .deterministic_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() + .import_all_data_columns(import_data_columns) .chain_config(<_>::default()) .build(); @@ -601,6 +610,40 @@ impl TestRig { .await } + pub async fn assert_event_journal_completes_with_timeout( + &mut self, + expected: &[WorkType], + timeout: Duration, + ) { + self.assert_event_journal_with_timeout( + &expected + .iter() + .map(Into::<&'static str>::into) + .chain(std::iter::once(WORKER_FREED)) + .chain(std::iter::once(NOTHING_TO_DO)) + .collect::>(), + timeout, + ) + .await + } + + pub async fn assert_event_journal_does_not_complete_with_timeout( + &mut self, + expected: &[WorkType], + timeout: Duration, + ) { + self.assert_not_in_event_journal_with_timeout( + &expected + .iter() + .map(Into::<&'static str>::into) + .chain(std::iter::once(WORKER_FREED)) + .chain(std::iter::once(NOTHING_TO_DO)) + .collect::>(), + timeout, + ) + .await + } + pub async fn assert_event_journal_completes(&mut self, expected: &[WorkType]) { self.assert_event_journal( &expected @@ -651,6 +694,37 @@ impl TestRig { assert_eq!(events, expected); } + /// Assert that the `BeaconProcessor` event journal is not as `expected`. + pub async fn assert_not_in_event_journal_with_timeout( + &mut self, + expected: &[&str], + timeout: Duration, + ) { + let mut events = Vec::with_capacity(expected.len()); + + let drain_future = async { + while let Some(event) = self.work_journal_rx.recv().await { + events.push(event); + + // Break as soon as we collect the desired number of events. 
+ if events.len() >= expected.len() { + break; + } + } + }; + + // Panic if we don't time out. + tokio::select! { + _ = tokio::time::sleep(timeout) => {}, + _ = drain_future => panic!( + "Got events before timeout. Expected no events but got {:?}", + events + ), + } + + assert_ne!(events, expected); + } + /// Listen for network messages and collect them for a specified duration or until reaching a count. /// /// Returns None if no messages were received, or Some(Vec) containing the received messages. @@ -743,6 +817,159 @@ fn junk_message_id() -> MessageId { MessageId::new(&[]) } +// Test that column reconstruction is delayed for columns that arrive +// at the beginning of the slot. +#[tokio::test] +async fn data_column_reconstruction_at_slot_start() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + if num_data_columns > 0 { + // Reconstruction is delayed by 100ms, we should not be able to complete + // reconstruction up to this point + rig.assert_event_journal_does_not_complete_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(100), + ) + .await; + + // We've waited at least 150ms, reconstruction can now be triggered + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(200), + ) + .await; + } +} + +// Test that column reconstruction happens 
immediately for columns that arrive at the +// reconstruction deadline. +#[tokio::test] +async fn data_column_reconstruction_at_deadline() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + // We push the slot clock to 3 seconds into the slot, this is the deadline to trigger reconstruction. + rig.chain + .slot_clock + .set_current_time(slot_start + Duration::from_secs(3)); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + // Since we're at the reconstruction deadline, reconstruction should be triggered immediately + if num_data_columns > 0 { + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(50), + ) + .await; + } +} + +// Test the column reconstruction is delayed for columns that arrive for a previous slot. +#[tokio::test] +async fn data_column_reconstruction_at_next_slot() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + // We push the slot clock to the next slot. 
+ rig.chain + .slot_clock + .set_current_time(slot_start + Duration::from_secs(12)); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + if num_data_columns > 0 { + // Since we are in the next slot reconstruction for the previous slot should be delayed again + rig.assert_event_journal_does_not_complete_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(100), + ) + .await; + + // We've waited at least 150ms, reconstruction can now be triggered + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(200), + ) + .await; + } +} + /// Blocks that arrive early should be queued for later processing. #[tokio::test] async fn import_gossip_block_acceptably_early() { @@ -1359,8 +1586,13 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { enable_backfill_rate_limiting: false, ..Default::default() }; - let mut rig = - TestRig::new_parametric(SMALL_CHAIN, beacon_processor_config, test_spec::()).await; + let mut rig = TestRig::new_parametric( + SMALL_CHAIN, + beacon_processor_config, + false, + test_spec::(), + ) + .await; for _ in 0..3 { rig.enqueue_backfill_batch(); From b8178515cd1b844d9af3bbab55455753b9949242 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 15 Sep 2025 09:41:12 +1000 Subject: [PATCH 33/81] Update engine methods in notifier (#8038) Fulu uses `getPayloadV5`, this PR updates the notifier logging prior to the fork. 
Co-Authored-By: Jimmy Chen --- beacon_node/client/src/notifier.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index 1e58c210da..c83cdad7e0 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -9,8 +9,8 @@ use execution_layer::{ EngineCapabilities, http::{ ENGINE_FORKCHOICE_UPDATED_V2, ENGINE_FORKCHOICE_UPDATED_V3, ENGINE_GET_PAYLOAD_V2, - ENGINE_GET_PAYLOAD_V3, ENGINE_GET_PAYLOAD_V4, ENGINE_NEW_PAYLOAD_V2, ENGINE_NEW_PAYLOAD_V3, - ENGINE_NEW_PAYLOAD_V4, + ENGINE_GET_PAYLOAD_V3, ENGINE_GET_PAYLOAD_V4, ENGINE_GET_PAYLOAD_V5, ENGINE_NEW_PAYLOAD_V2, + ENGINE_NEW_PAYLOAD_V3, ENGINE_NEW_PAYLOAD_V4, }, }; use lighthouse_network::{NetworkGlobals, types::SyncState}; @@ -524,18 +524,16 @@ fn methods_required_for_fork( } } ForkName::Fulu => { - // TODO(fulu) switch to v5 when the EL is ready - if !capabilities.get_payload_v4 { - missing_methods.push(ENGINE_GET_PAYLOAD_V4); + if !capabilities.get_payload_v5 { + missing_methods.push(ENGINE_GET_PAYLOAD_V5); } if !capabilities.new_payload_v4 { missing_methods.push(ENGINE_NEW_PAYLOAD_V4); } } ForkName::Gloas => { - // TODO(gloas) switch to v5/v6 when the EL is ready - if !capabilities.get_payload_v4 { - missing_methods.push(ENGINE_GET_PAYLOAD_V4); + if !capabilities.get_payload_v5 { + missing_methods.push(ENGINE_GET_PAYLOAD_V5); } if !capabilities.new_payload_v4 { missing_methods.push(ENGINE_NEW_PAYLOAD_V4); From f04d5ecddd976646d1a07add33ce74eff1bf2a3c Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Tue, 16 Sep 2025 14:10:42 +1000 Subject: [PATCH 34/81] Another check to prevent duplicate block imports (#8050) Attempt to address performance issues caused by importing the same block multiple times. - Check fork choice "after" obtaining the fork choice write lock in `BeaconChain::import_block`. 
We actually use an upgradable read lock, but this is semantically equivalent (the upgradable read has the advantage of not excluding regular reads). The hope is that this change has several benefits: 1. By preventing duplicate block imports we save time repeating work inside `import_block` that is unnecessary, e.g. writing the state to disk. Although the store itself now takes some measures to avoid re-writing diffs, it is even better if we avoid a disk write entirely. 2. By returning `DuplicateFullyImported`, we reduce some duplicated work downstream. E.g. if multiple threads importing columns trigger `import_block`, now only _one_ of them will get a notification of the block import completing successfully, and only this one will run `recompute_head`. This should help avoid a situation where multiple beacon processor workers are consumed by threads blocking on the `recompute_head_lock`. However, a similar block-fest is still possible with the upgradable fork choice lock (a large number of threads can be blocked waiting for the first thread to complete block import). Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 9 ++++++++- beacon_node/beacon_chain/src/canonical_head.rs | 14 +++++++++++++- beacon_node/beacon_chain/src/metrics.rs | 8 ++++++++ .../beacon_chain/tests/block_verification.rs | 13 +++++++++---- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 6e11b66610..eeafefdff8 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -3889,9 +3889,16 @@ impl BeaconChain { .map_err(BeaconChainError::from)?; } + // Take an upgradable read lock on fork choice so we can check if this block has already + // been imported. We don't want to repeat work importing a block that is already imported. 
+ let fork_choice_reader = self.canonical_head.fork_choice_upgradable_read_lock(); + if fork_choice_reader.contains_block(&block_root) { + return Err(BlockError::DuplicateFullyImported(block_root)); + } + // Take an exclusive write-lock on fork choice. It's very important to prevent deadlocks by // avoiding taking other locks whilst holding this lock. - let mut fork_choice = self.canonical_head.fork_choice_write_lock(); + let mut fork_choice = parking_lot::RwLockUpgradableReadGuard::upgrade(fork_choice_reader); // Do not import a block that doesn't descend from the finalized root. let signed_block = diff --git a/beacon_node/beacon_chain/src/canonical_head.rs b/beacon_node/beacon_chain/src/canonical_head.rs index 56d1975972..3dbe8bf5c4 100644 --- a/beacon_node/beacon_chain/src/canonical_head.rs +++ b/beacon_node/beacon_chain/src/canonical_head.rs @@ -48,7 +48,7 @@ use fork_choice::{ }; use itertools::process_results; use logging::crit; -use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockUpgradableReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; use state_processing::AllCaches; use std::sync::Arc; @@ -79,6 +79,10 @@ impl CanonicalHeadRwLock { self.0.read() } + fn upgradable_read(&self) -> RwLockUpgradableReadGuard<'_, T> { + self.0.upgradable_read() + } + fn write(&self) -> RwLockWriteGuard<'_, T> { self.0.write() } @@ -389,6 +393,14 @@ impl CanonicalHead { self.fork_choice.read() } + /// Access an upgradable read-lock for fork choice. + pub fn fork_choice_upgradable_read_lock( + &self, + ) -> RwLockUpgradableReadGuard<'_, BeaconForkChoice> { + let _timer = metrics::start_timer(&metrics::FORK_CHOICE_UPGRADABLE_READ_LOCK_AQUIRE_TIMES); + self.fork_choice.upgradable_read() + } + /// Access a write-lock for fork choice. 
pub fn fork_choice_write_lock(&self) -> RwLockWriteGuard<'_, BeaconForkChoice> { let _timer = metrics::start_timer(&metrics::FORK_CHOICE_WRITE_LOCK_AQUIRE_TIMES); diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs index 1b57bad104..3da3cf163a 100644 --- a/beacon_node/beacon_chain/src/metrics.rs +++ b/beacon_node/beacon_chain/src/metrics.rs @@ -578,6 +578,14 @@ pub static FORK_CHOICE_READ_LOCK_AQUIRE_TIMES: LazyLock> = Laz exponential_buckets(1e-4, 4.0, 7), ) }); +pub static FORK_CHOICE_UPGRADABLE_READ_LOCK_AQUIRE_TIMES: LazyLock> = + LazyLock::new(|| { + try_create_histogram_with_buckets( + "beacon_fork_choice_upgradable_read_lock_aquire_seconds", + "Time taken to aquire the fork-choice upgradable read lock", + exponential_buckets(1e-4, 4.0, 7), + ) + }); pub static FORK_CHOICE_WRITE_LOCK_AQUIRE_TIMES: LazyLock> = LazyLock::new(|| { try_create_histogram_with_buckets( "beacon_fork_choice_write_lock_aquire_seconds", diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 58ca4a032e..b27295751e 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -1730,6 +1730,8 @@ async fn add_altair_block_to_base_chain() { )); } +// This is a regression test for this bug: +// https://github.com/sigp/lighthouse/issues/4332#issuecomment-1565092279 #[tokio::test] async fn import_duplicate_block_unrealized_justification() { let spec = MainnetEthSpec::default_spec(); @@ -1791,7 +1793,7 @@ async fn import_duplicate_block_unrealized_justification() { .await .unwrap(); - // Unrealized justification should NOT have updated. + // The store's global unrealized justification should update immediately and match the block. 
let unrealized_justification = { let fc = chain.canonical_head.fork_choice_read_lock(); assert_eq!(fc.justified_checkpoint().epoch, 0); @@ -1808,9 +1810,12 @@ async fn import_duplicate_block_unrealized_justification() { }; // Import the second verified block, simulating a block processed via RPC. - import_execution_pending_block(chain.clone(), verified_block2) - .await - .unwrap(); + assert_eq!( + import_execution_pending_block(chain.clone(), verified_block2) + .await + .unwrap_err(), + format!("DuplicateFullyImported({block_root})") + ); // Unrealized justification should still be updated. let fc3 = chain.canonical_head.fork_choice_read_lock(); From 4409500f63007f98bc901924cee536cfad42f677 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 15 Sep 2025 22:18:25 -0700 Subject: [PATCH 35/81] Remove column reconstruction when processing rpc requests (#8051) Co-Authored-By: Eitan Seri- Levi --- .../src/network_beacon_processor/gossip_methods.rs | 2 +- .../network/src/network_beacon_processor/mod.rs | 5 +---- .../src/network_beacon_processor/sync_methods.rs | 13 +------------ 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 1f1a3427e7..bc44db40e9 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1067,7 +1067,7 @@ impl NetworkBeaconProcessor { slot: *slot, process_fn: Box::pin(async move { cloned_self - .attempt_data_column_reconstruction(block_root, true) + .attempt_data_column_reconstruction(block_root) .await; }), }, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 73349cd431..030f77be37 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ 
-837,7 +837,6 @@ impl NetworkBeaconProcessor { async fn attempt_data_column_reconstruction( self: &Arc, block_root: Hash256, - publish_columns: bool, ) -> Option { // Only supernodes attempt reconstruction if !self @@ -852,9 +851,7 @@ impl NetworkBeaconProcessor { let result = self.chain.reconstruct_data_columns(block_root).await; match result { Ok(Some((availability_processing_status, data_columns_to_publish))) => { - if publish_columns { - self.publish_data_columns_gradually(data_columns_to_publish, block_root); - } + self.publish_data_columns_gradually(data_columns_to_publish, block_root); match &availability_processing_status { AvailabilityProcessingStatus::Imported(hash) => { debug!( diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index f24495cc54..edeed7e98c 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -383,7 +383,7 @@ impl NetworkBeaconProcessor { "RPC custody data columns received" ); - let mut result = self + let result = self .chain .process_rpc_custody_columns(custody_columns) .await; @@ -404,17 +404,6 @@ impl NetworkBeaconProcessor { block_hash = %block_root, "Missing components over rpc" ); - // Attempt reconstruction here before notifying sync, to avoid sending out more requests - // that we may no longer need. - // We don't publish columns reconstructed from rpc columns to the gossip network, - // as these are likely historic columns. 
- let publish_columns = false; - if let Some(availability) = self - .attempt_data_column_reconstruction(block_root, publish_columns) - .await - { - result = Ok(availability) - } } }, Err(BlockError::DuplicateFullyImported(_)) => { From aba362709990d7ec7f4a880bcd1e60114d375450 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 15 Sep 2025 22:18:28 -0700 Subject: [PATCH 36/81] Reduce reconstruction queue capacity (#8053) Co-Authored-By: Eitan Seri- Levi --- beacon_node/beacon_processor/src/lib.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index ab9ab045f4..84723fb6a0 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -181,7 +181,7 @@ impl BeaconProcessorQueueLengths { // We don't request more than `PARENT_DEPTH_TOLERANCE` (32) lookups, so we can limit // this queue size. With 48 max blobs per block, each column sidecar list could be up to 12MB. rpc_custody_column_queue: 64, - column_reconstruction_queue: 64, + column_reconstruction_queue: 1, chain_segment_queue: 64, backfill_chain_segment: 64, gossip_block_queue: 1024, @@ -867,7 +867,7 @@ impl BeaconProcessor { let mut rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); let mut rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); let mut column_reconstruction_queue = - FifoQueue::new(queue_lengths.column_reconstruction_queue); + LifoQueue::new(queue_lengths.column_reconstruction_queue); let mut chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); let mut backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); let mut gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); @@ -1354,9 +1354,7 @@ impl BeaconProcessor { Work::RpcCustodyColumn { .. 
} => { rpc_custody_column_queue.push(work, work_id) } - Work::ColumnReconstruction(_) => { - column_reconstruction_queue.push(work, work_id) - } + Work::ColumnReconstruction(_) => column_reconstruction_queue.push(work), Work::ChainSegment { .. } => chain_segment_queue.push(work, work_id), Work::ChainSegmentBackfill { .. } => { backfill_chain_segment.push(work, work_id) From 242bdfcf1229254ac792039d8ae13b703bd1ab6b Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 15 Sep 2025 22:18:31 -0700 Subject: [PATCH 37/81] Add instrumentation to `recompute_head_at_slot` (#8049) Co-Authored-By: Eitan Seri- Levi --- beacon_node/beacon_chain/src/canonical_head.rs | 18 +++++++++++++++++- beacon_node/lighthouse_tracing/src/lib.rs | 3 +++ beacon_node/store/src/hot_cold_store.rs | 1 + consensus/state_processing/src/all_caches.rs | 2 ++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/src/canonical_head.rs b/beacon_node/beacon_chain/src/canonical_head.rs index 3dbe8bf5c4..78005bf799 100644 --- a/beacon_node/beacon_chain/src/canonical_head.rs +++ b/beacon_node/beacon_chain/src/canonical_head.rs @@ -47,6 +47,7 @@ use fork_choice::{ ResetPayloadStatuses, }; use itertools::process_results; +use lighthouse_tracing::SPAN_RECOMPUTE_HEAD; use logging::crit; use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockUpgradableReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; @@ -57,6 +58,7 @@ use store::{ Error as StoreError, KeyValueStore, KeyValueStoreOp, StoreConfig, iter::StateRootsIterator, }; use task_executor::{JoinHandle, ShutdownReason}; +use tracing::info_span; use tracing::{debug, error, info, instrument, warn}; use types::*; @@ -383,6 +385,7 @@ impl CanonicalHead { /// /// This function is **not safe** to be public. See the module-level documentation for more /// information about protecting from deadlocks. 
+ #[instrument(skip_all)] fn cached_head_write_lock(&self) -> RwLockWriteGuard<'_, CachedHead> { self.cached_head.write() } @@ -402,6 +405,7 @@ impl CanonicalHead { } /// Access a write-lock for fork choice. + #[instrument(skip_all)] pub fn fork_choice_write_lock(&self) -> RwLockWriteGuard<'_, BeaconForkChoice> { let _timer = metrics::start_timer(&metrics::FORK_CHOICE_WRITE_LOCK_AQUIRE_TIMES); self.fork_choice.write() @@ -509,13 +513,21 @@ impl BeaconChain { /// situation can be rectified. We avoid returning an error here so that calling functions /// can't abort block import because an error is returned here. pub async fn recompute_head_at_slot(self: &Arc, current_slot: Slot) { + let span = info_span!( + SPAN_RECOMPUTE_HEAD, + slot = %current_slot + ); + metrics::inc_counter(&metrics::FORK_CHOICE_REQUESTS); let _timer = metrics::start_timer(&metrics::FORK_CHOICE_TIMES); let chain = self.clone(); match self .spawn_blocking_handle( - move || chain.recompute_head_at_slot_internal(current_slot), + move || { + let _guard = span.enter(); + chain.recompute_head_at_slot_internal(current_slot) + }, "recompute_head_internal", ) .await @@ -773,6 +785,7 @@ impl BeaconChain { } /// Perform updates to caches and other components after the canonical head has been changed. + #[instrument(skip_all)] fn after_new_head( self: &Arc, old_cached_head: &CachedHead, @@ -911,6 +924,7 @@ impl BeaconChain { /// /// This function will take a write-lock on `canonical_head.fork_choice`, therefore it would be /// unwise to hold any lock on fork choice while calling this function. + #[instrument(skip_all)] fn after_finalization( self: &Arc, new_cached_head: &CachedHead, @@ -1046,6 +1060,7 @@ impl BeaconChain { /// /// This function is called whilst holding a write-lock on the `canonical_head`. To ensure dead-lock /// safety, **do not take any other locks inside this function**. 
+#[instrument(skip_all)] fn check_finalized_payload_validity( chain: &BeaconChain, finalized_proto_block: &ProtoBlock, @@ -1129,6 +1144,7 @@ fn perform_debug_logging( } } +#[instrument(skip_all)] fn spawn_execution_layer_updates( chain: Arc>, forkchoice_update_params: ForkchoiceUpdateParameters, diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index 1787399761..60fda12cc2 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -27,6 +27,9 @@ pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns"; pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment"; +/// Fork choice root spans +pub const SPAN_RECOMPUTE_HEAD: &str = "recompute_head_at_slot"; + /// RPC methods root spans pub const SPAN_HANDLE_BLOCKS_BY_RANGE_REQUEST: &str = "handle_blocks_by_range_request"; pub const SPAN_HANDLE_BLOBS_BY_RANGE_REQUEST: &str = "handle_blobs_by_range_request"; diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 7156c75f11..52e52fe7ce 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -656,6 +656,7 @@ impl, Cold: ItemStore> HotColdDB } /// Fetch a full block with execution payload from the store. 
+ #[instrument(skip_all)] pub fn get_full_block( &self, block_root: &Hash256, diff --git a/consensus/state_processing/src/all_caches.rs b/consensus/state_processing/src/all_caches.rs index e49eb395c4..d6c4fd3f88 100644 --- a/consensus/state_processing/src/all_caches.rs +++ b/consensus/state_processing/src/all_caches.rs @@ -1,5 +1,6 @@ use crate::common::update_progressive_balances_cache::initialize_progressive_balances_cache; use crate::epoch_cache::initialize_epoch_cache; +use tracing::instrument; use types::{ BeaconState, ChainSpec, EpochCacheError, EthSpec, FixedBytesExtended, Hash256, RelativeEpoch, }; @@ -23,6 +24,7 @@ pub trait AllCaches { } impl AllCaches for BeaconState { + #[instrument(skip_all)] fn build_all_caches(&mut self, spec: &ChainSpec) -> Result<(), EpochCacheError> { self.build_caches(spec)?; initialize_epoch_cache(self, spec)?; From 3de646c8b32b6da7d2ace48aab9ceb2e52bbe8a5 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 16 Sep 2025 18:17:43 +1000 Subject: [PATCH 38/81] Enable reconstruction for nodes custodying more than 50% of columns and instrument tracing (#8052) Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- beacon_node/beacon_chain/src/beacon_chain.rs | 6 +- .../src/data_availability_checker.rs | 1 + .../overflow_lru_cache.rs | 22 +++++-- .../beacon_chain/src/validator_custody.rs | 10 ++- .../gossip_methods.rs | 66 ++++++++++--------- .../src/network_beacon_processor/mod.rs | 39 +++-------- .../src/network_beacon_processor/tests.rs | 4 -- 7 files changed, 76 insertions(+), 72 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index eeafefdff8..084a68bfea 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -3299,10 +3299,14 @@ impl BeaconChain { let data_availability_checker = self.data_availability_checker.clone(); + let current_span = Span::current(); let result = self .task_executor 
.spawn_blocking_handle( - move || data_availability_checker.reconstruct_data_columns(&block_root), + move || { + let _guard = current_span.enter(); + data_availability_checker.reconstruct_data_columns(&block_root) + }, "reconstruct_data_columns", ) .ok_or(BeaconChainError::RuntimeShutdown)? diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 9225ed6b47..307dc0e227 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -547,6 +547,7 @@ impl DataAvailabilityChecker { } } + #[instrument(skip_all, level = "debug")] pub fn reconstruct_data_columns( &self, block_root: &Hash256, diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 9de63f6126..6afb680ddb 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -592,9 +592,9 @@ impl DataAvailabilityCheckerInner { /// Check whether data column reconstruction should be attempted. /// - /// Potentially trigger reconstruction if: - /// - Our custody requirement is all columns (supernode), and we haven't got all columns - /// - We have >= 50% of columns, but not all columns + /// Potentially trigger reconstruction if all the following satisfy: + /// - Our custody requirement is more than 50% of total columns, + /// - We haven't received all required columns /// - Reconstruction hasn't been started for the block /// /// If reconstruction is required, returns `PendingComponents` which contains the @@ -609,15 +609,25 @@ impl DataAvailabilityCheckerInner { return ReconstructColumnsDecision::No("block already imported"); }; - // If we're sampling all columns, it means we must be custodying all columns. 
+ let Some(epoch) = pending_components + .verified_data_columns + .first() + .map(|c| c.as_data_column().epoch()) + else { + return ReconstructColumnsDecision::No("not enough columns"); + }; + let total_column_count = T::EthSpec::number_of_columns(); + let sampling_column_count = self + .custody_context + .num_of_data_columns_to_sample(epoch, &self.spec); let received_column_count = pending_components.verified_data_columns.len(); if pending_components.reconstruction_started { return ReconstructColumnsDecision::No("already started"); } - if received_column_count >= total_column_count { - return ReconstructColumnsDecision::No("all columns received"); + if received_column_count >= sampling_column_count { + return ReconstructColumnsDecision::No("all sampling columns received"); } if received_column_count < total_column_count / 2 { return ReconstructColumnsDecision::No("not enough columns"); diff --git a/beacon_node/beacon_chain/src/validator_custody.rs b/beacon_node/beacon_chain/src/validator_custody.rs index 1c89624f9d..3ab76828c9 100644 --- a/beacon_node/beacon_chain/src/validator_custody.rs +++ b/beacon_node/beacon_chain/src/validator_custody.rs @@ -130,7 +130,7 @@ pub struct CustodyContext { /// and enr values. validator_custody_count: AtomicU64, /// Is the node run as a supernode based on current cli parameters. - pub current_is_supernode: bool, + current_is_supernode: bool, /// The persisted value for `is_supernode` based on the previous run of this node. /// /// Note: We require this value because if a user restarts the node with a higher cli custody @@ -307,6 +307,14 @@ impl CustodyContext { .expect("should compute node sampling size from valid chain spec") } + /// Returns whether the node should attempt reconstruction at a given epoch. 
+ pub fn should_attempt_reconstruction(&self, epoch: Epoch, spec: &ChainSpec) -> bool { + let min_columns_for_reconstruction = E::number_of_columns() / 2; + // performing reconstruction is not necessary if sampling column count is exactly 50%, + // because the node doesn't need the remaining columns. + self.num_of_data_columns_to_sample(epoch, spec) > min_columns_for_reconstruction + } + /// Returns the ordered list of column indices that should be sampled for data availability checking at the given epoch. /// /// # Parameters diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index bc44db40e9..b3d717142f 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -34,7 +34,6 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use store::hot_cold_store::HotColdDBError; -use tokio::sync::mpsc::error::TrySendError; use tracing::{Instrument, Span, debug, error, info, instrument, trace, warn}; use types::{ Attestation, AttestationData, AttestationRef, AttesterSlashing, BlobSidecar, DataColumnSidecar, @@ -1054,36 +1053,43 @@ impl NetworkBeaconProcessor { "Processed data column, waiting for other components" ); - // Instead of triggering reconstruction immediately, schedule it to be run. If - // another column arrives it either completes availability or pushes - // reconstruction back a bit. 
- let cloned_self = Arc::clone(self); - let block_root = *block_root; - let send_result = self.beacon_processor_send.try_send(WorkEvent { - drop_during_sync: false, - work: Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction( - QueuedColumnReconstruction { - block_root, - slot: *slot, - process_fn: Box::pin(async move { - cloned_self - .attempt_data_column_reconstruction(block_root) - .await; - }), - }, - )), - }); - if let Err(TrySendError::Full(WorkEvent { - work: - Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction( - reconstruction, - )), - .. - })) = send_result + if self + .chain + .data_availability_checker + .custody_context() + .should_attempt_reconstruction( + slot.epoch(T::EthSpec::slots_per_epoch()), + &self.chain.spec, + ) { - warn!("Unable to send reconstruction to reprocessing"); - // Execute it immediately instead. - reconstruction.process_fn.await; + // Instead of triggering reconstruction immediately, schedule it to be run. If + // another column arrives, it either completes availability or pushes + // reconstruction back a bit. 
+ let cloned_self = Arc::clone(self); + let block_root = *block_root; + + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess( + ReprocessQueueMessage::DelayColumnReconstruction( + QueuedColumnReconstruction { + block_root, + slot: *slot, + process_fn: Box::pin(async move { + cloned_self + .attempt_data_column_reconstruction(block_root) + .await; + }), + }, + ), + ), + }) + .is_err() + { + warn!("Unable to send reconstruction to reprocessing"); + } } } }, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 030f77be37..691c06f268 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -28,7 +28,7 @@ use std::sync::Arc; use std::time::Duration; use task_executor::TaskExecutor; use tokio::sync::mpsc::{self, error::TrySendError}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use types::*; pub use sync_methods::ChainSegmentProcessId; @@ -825,30 +825,12 @@ impl NetworkBeaconProcessor { } } - /// Attempt to reconstruct all data columns if the following conditions satisfies: - /// - Our custody requirement is all columns - /// - We have >= 50% of columns, but not all columns - /// - /// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed, - /// otherwise returns `None`. - /// - /// The `publish_columns` parameter controls whether reconstructed columns should be published - /// to the gossip network. 
- async fn attempt_data_column_reconstruction( - self: &Arc, - block_root: Hash256, - ) -> Option { - // Only supernodes attempt reconstruction - if !self - .chain - .data_availability_checker - .custody_context() - .current_is_supernode - { - return None; - } - + /// Attempts to reconstruct all data columns if the conditions checked in + /// [`DataAvailabilityCheckerInner::check_and_set_reconstruction_started`] are satisfied. + #[instrument(level = "debug", skip_all, fields(?block_root))] + async fn attempt_data_column_reconstruction(self: &Arc, block_root: Hash256) { let result = self.chain.reconstruct_data_columns(block_root).await; + match result { Ok(Some((availability_processing_status, data_columns_to_publish))) => { self.publish_data_columns_gradually(data_columns_to_publish, block_root); @@ -864,21 +846,18 @@ impl NetworkBeaconProcessor { AvailabilityProcessingStatus::MissingComponents(_, _) => { debug!( result = "imported all custody columns", - block_hash = %block_root, + %block_root, "Block components still missing block after reconstruction" ); } } - - Some(availability_processing_status) } Ok(None) => { // reason is tracked via the `KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL` metric trace!( - block_hash = %block_root, + %block_root, "Reconstruction not required for block" ); - None } Err(e) => { error!( @@ -886,7 +865,6 @@ impl NetworkBeaconProcessor { error = ?e, "Error during data column reconstruction" ); - None } } } @@ -975,6 +953,7 @@ impl NetworkBeaconProcessor { /// by some nodes on the network as soon as possible. Our hope is that some columns arrive from /// other nodes in the meantime, obviating the need for us to publish them. If no other /// publisher exists for a column, it will eventually get published here. 
+ #[instrument(level="debug", skip_all, fields(?block_root, data_column_count=data_columns_to_publish.len()))] fn publish_data_columns_gradually( self: &Arc, mut data_columns_to_publish: DataColumnSidecarList, diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 2935c2d213..d3a93d4863 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -1009,10 +1009,6 @@ async fn import_gossip_block_acceptably_early() { rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) .await; } - if num_data_columns > 0 { - rig.assert_event_journal_completes(&[WorkType::ColumnReconstruction]) - .await; - } // Note: this section of the code is a bit race-y. We're assuming that we can set the slot clock // and check the head in the time between the block arrived early and when its due for From 191570e4a162202df72713c177db1386464420dd Mon Sep 17 00:00:00 2001 From: jking-aus <72330194+jking-aus@users.noreply.github.com> Date: Wed, 17 Sep 2025 04:27:37 +1000 Subject: [PATCH 39/81] chore: Bump discv5 and remove generic DefaultProtocolId in metrics (#8056) Bump discv5 version Co-Authored-By: Josh King --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- common/network_utils/src/discovery_metrics.rs | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 88b5b7b57d..ba6a4587b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2163,7 +2163,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 2.0.100", + "syn 1.0.109", ] [[package]] @@ -2395,9 +2395,9 @@ dependencies = [ [[package]] name = "discv5" -version = "0.9.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c4b4e7798d2ff74e29cee344dc490af947ae657d6ab5273dde35d58ce06a4d71" +checksum = "a20b702c8491b3325866a4935d0b5101e49144d74540384243b6293794aad6fa" dependencies = [ "aes 0.8.4", "aes-gcm", @@ -5122,7 +5122,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 0b930b605d..99543dbfb4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -134,7 +134,7 @@ deposit_contract = { path = "common/deposit_contract" } derivative = "2" directory = { path = "common/directory" } dirs = "3" -discv5 = { version = "0.9", features = ["libp2p"] } +discv5 = { version = "0.10", features = ["libp2p"] } doppelganger_service = { path = "validator_client/doppelganger_service" } either = "1.9" environment = { path = "lighthouse/environment" } diff --git a/common/network_utils/src/discovery_metrics.rs b/common/network_utils/src/discovery_metrics.rs index d105dee57a..26a9e8a45f 100644 --- a/common/network_utils/src/discovery_metrics.rs +++ b/common/network_utils/src/discovery_metrics.rs @@ -35,8 +35,7 @@ pub static DISCOVERY_SESSIONS: LazyLock> = LazyLock::new(|| { }); pub fn scrape_discovery_metrics() { - let metrics = - discv5::metrics::Metrics::from(discv5::Discv5::::raw_metrics()); + let metrics = discv5::metrics::Metrics::from(discv5::Discv5::raw_metrics()); set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second); set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64); set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64); From b7d78a91e03d4b3975806e2460bded01825f5a92 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Wed, 17 Sep 2025 03:02:29 +0200 Subject: [PATCH 40/81] Don't penalize peers for extending ignored chains (#8042) Lookup sync has a cache of block roots 
"failed_chains". If a peer triggers a lookup for a block or descendant of a root in failed_chains the lookup is dropped and the peer penalized. However blocks are inserted into failed_chains for a single reason: - If a chain is longer than 32 blocks the lookup is dropped to prevent OOM risks. However the peer is not at fault, since discovering an unknown chain longer than 32 blocks is not malicious. We just drop the lookup to sync the blocks from range forward sync. This discrepancy is probably an oversight when changing old code. Before we used to add blocks that failed too many times to process to that cache. However, we don't do that anymore. Adding a block that fails too many times to process is an optimization to save resources in rare cases where peers keep sending us invalid blocks. In case that happens, today we keep trying to process the block, downscoring the peers and eventually disconnecting them. _IF_ we found that optimization to be necessary we should merge this PR (_Stricter match of BlockError in lookup sync_) first. IMO we are fine without the failed_chains cache and the ignored_chains cache will be obsolete with [tree sync](https://github.com/sigp/lighthouse/issues/7678) as the OOM risk of long lookup chains does not exist anymore. 
Closes https://github.com/sigp/lighthouse/issues/7577 Rename `failed_chains` for `ignored_chains` and don't penalize peers that trigger lookups for those blocks Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> --- .../network/src/sync/block_lookups/mod.rs | 35 +++++------ beacon_node/network/src/sync/manager.rs | 8 +-- beacon_node/network/src/sync/tests/lookups.rs | 62 +++++++++---------- 3 files changed, 50 insertions(+), 55 deletions(-) diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index b60c21972f..f8ffd298ca 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -59,7 +59,7 @@ mod single_block_lookup; /// reaches the maximum depth it will force trigger range sync. pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE; -const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; +const IGNORED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4; /// Maximum time we allow a lookup to exist before assuming it is stuck and will never make @@ -110,8 +110,10 @@ enum Action { } pub struct BlockLookups { - /// A cache of failed chain lookups to prevent duplicate searches. - failed_chains: LRUTimeCache, + /// A cache of block roots that must be ignored for some time to prevent useless searches. For + /// example if a chain is too long, its lookup chain is dropped, and range sync is expected to + /// eventually sync those blocks + ignored_chains: LRUTimeCache, // TODO: Why not index lookups by block_root? 
single_block_lookups: FnvHashMap>, @@ -128,21 +130,21 @@ pub(crate) type BlockLookupSummary = (Id, Hash256, Option, Vec) impl BlockLookups { pub fn new() -> Self { Self { - failed_chains: LRUTimeCache::new(Duration::from_secs( - FAILED_CHAINS_CACHE_EXPIRY_SECONDS, + ignored_chains: LRUTimeCache::new(Duration::from_secs( + IGNORED_CHAINS_CACHE_EXPIRY_SECONDS, )), single_block_lookups: Default::default(), } } #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.failed_chains.insert(block_root); + pub(crate) fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.ignored_chains.insert(block_root); } #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.failed_chains.keys().cloned().collect() + pub(crate) fn get_ignored_chains(&mut self) -> Vec { + self.ignored_chains.keys().cloned().collect() } #[cfg(test)] @@ -184,7 +186,7 @@ impl BlockLookups { self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); // Only create the child lookup if the parent exists if parent_lookup_exists { - // `search_parent_of_child` ensures that parent root is not a failed chain + // `search_parent_of_child` ensures that the parent lookup exists so we can safely wait for it self.new_current_lookup( block_root, Some(block_component), @@ -244,8 +246,8 @@ impl BlockLookups { debug!(block_root = ?block_root_to_search, "Parent lookup chain too long"); // Searching for this parent would extend a parent chain over the max - // Insert the tip only to failed chains - self.failed_chains.insert(parent_chain.tip); + // Insert the tip only to chains to ignore + self.ignored_chains.insert(parent_chain.tip); // Note: Drop only the chain that's too long until it merges with another chain // that's not too long. 
Consider this attack: there's a chain of valid unknown @@ -330,12 +332,9 @@ impl BlockLookups { peers: &[PeerId], cx: &mut SyncNetworkContext, ) -> bool { - // If this block or it's parent is part of a known failed chain, ignore it. - if self.failed_chains.contains(&block_root) { - debug!(?block_root, "Block is from a past failed chain. Dropping"); - for peer_id in peers { - cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain"); - } + // If this block or it's parent is part of a known ignored chain, ignore it. + if self.ignored_chains.contains(&block_root) { + debug!(?block_root, "Dropping lookup for block marked ignored"); return false; } diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 448e784ab6..d7ba028054 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -328,13 +328,13 @@ impl SyncManager { } #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.block_lookups.get_failed_chains() + pub(crate) fn get_ignored_chains(&mut self) -> Vec { + self.block_lookups.get_ignored_chains() } #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.block_lookups.insert_failed_chain(block_root); + pub(crate) fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.block_lookups.insert_ignored_chain(block_root); } #[cfg(test)] diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index b5bc10851d..2edcd12f01 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -285,21 +285,21 @@ impl TestRig { ); } - fn insert_failed_chain(&mut self, block_root: Hash256) { - self.sync_manager.insert_failed_chain(block_root); + fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.sync_manager.insert_ignored_chain(block_root); } - fn assert_not_failed_chain(&mut self, chain_hash: Hash256) { - 
let failed_chains = self.sync_manager.get_failed_chains(); - if failed_chains.contains(&chain_hash) { - panic!("failed chains contain {chain_hash:?}: {failed_chains:?}"); + fn assert_not_ignored_chain(&mut self, chain_hash: Hash256) { + let chains = self.sync_manager.get_ignored_chains(); + if chains.contains(&chain_hash) { + panic!("ignored chains contain {chain_hash:?}: {chains:?}"); } } - fn assert_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); - if !failed_chains.contains(&chain_hash) { - panic!("expected failed chains to contain {chain_hash:?}: {failed_chains:?}"); + fn assert_ignored_chain(&mut self, chain_hash: Hash256) { + let chains = self.sync_manager.get_ignored_chains(); + if !chains.contains(&chain_hash) { + panic!("expected ignored chains to contain {chain_hash:?}: {chains:?}"); } } @@ -1021,11 +1021,6 @@ impl TestRig { self.log(&format!("Found expected penalty {penalty_msg}")); } - pub fn expect_single_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { - self.expect_penalty(peer_id, expect_penalty_msg); - self.expect_no_penalty_for(peer_id); - } - pub fn block_with_parent_and_blobs( &mut self, parent_root: Hash256, @@ -1461,7 +1456,7 @@ fn test_parent_lookup_too_many_download_attempts_no_blacklist() { // Trigger the request rig.trigger_unknown_parent_block(peer_id, block.into()); for i in 1..=PARENT_FAIL_TOLERANCE { - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); let id = rig.expect_block_parent_request(parent_root); if i % 2 != 0 { // The request fails. It should be tried again. 
@@ -1474,8 +1469,8 @@ fn test_parent_lookup_too_many_download_attempts_no_blacklist() { } } - rig.assert_not_failed_chain(block_root); - rig.assert_not_failed_chain(parent.canonical_root()); + rig.assert_not_ignored_chain(block_root); + rig.assert_not_ignored_chain(parent.canonical_root()); rig.expect_no_active_lookups_empty_network(); } @@ -1500,7 +1495,7 @@ fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { for _ in 0..PROCESSING_FAILURES { let id = rig.expect_block_parent_request(parent_root); // Blobs are only requested in the previous first iteration as this test only retries blocks - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); // send the right parent but fail processing rig.parent_lookup_block_response(id, peer_id, Some(parent.clone().into())); rig.parent_block_processed(block_root, BlockError::BlockSlotLimitReached.into()); @@ -1508,7 +1503,7 @@ fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { rig.expect_penalty(peer_id, "lookup_block_processing_failure"); } - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); rig.expect_no_active_lookups_empty_network(); } @@ -1551,12 +1546,14 @@ fn test_parent_lookup_too_deep_grow_ancestor() { ); // Should not penalize peer, but network is not clear because of the blocks_by_range requests rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(chain_hash); + rig.assert_ignored_chain(chain_hash); } // Regression test for https://github.com/sigp/lighthouse/pull/7118 +// 8042 UPDATE: block was previously added to the failed_chains cache, now it's inserted into the +// ignored chains cache. The regression test still applies as the chaild lookup is not created #[test] -fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { +fn test_child_lookup_not_created_for_ignored_chain_parent_after_processing() { // GIVEN: A parent chain longer than PARENT_DEPTH_TOLERANCE. 
let mut rig = TestRig::test_setup(); let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE + 1); @@ -1586,8 +1583,8 @@ fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { } // At this point, the chain should have been deemed too deep and pruned. - // The tip root should have been inserted into failed chains. - rig.assert_failed_chain(tip_root); + // The tip root should have been inserted into ignored chains. + rig.assert_ignored_chain(tip_root); rig.expect_no_penalty_for(peer_id); // WHEN: Trigger the extending block that points to the tip. @@ -1604,10 +1601,10 @@ fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { }), ); - // THEN: The extending block should not create a lookup because the tip was inserted into failed chains. + // THEN: The extending block should not create a lookup because the tip was inserted into + // ignored chains. rig.expect_no_active_lookups(); - // AND: The peer should be penalized for extending a failed chain. 
- rig.expect_single_penalty(peer_id, "failed_chain"); + rig.expect_no_penalty_for(peer_id); rig.expect_empty_network(); } @@ -1646,7 +1643,7 @@ fn test_parent_lookup_too_deep_grow_tip() { ); // Should not penalize peer, but network is not clear because of the blocks_by_range requests rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(tip.canonical_root()); + rig.assert_ignored_chain(tip.canonical_root()); } #[test] @@ -1699,15 +1696,14 @@ fn test_lookup_add_peers_to_parent() { } #[test] -fn test_skip_creating_failed_parent_lookup() { +fn test_skip_creating_ignored_parent_lookup() { let mut rig = TestRig::test_setup(); let (_, block, parent_root, _) = rig.rand_block_and_parent(); let peer_id = rig.new_connected_peer(); - rig.insert_failed_chain(parent_root); + rig.insert_ignored_chain(parent_root); rig.trigger_unknown_parent_block(peer_id, block.into()); - // Expect single penalty for peer, despite dropping two lookups - rig.expect_single_penalty(peer_id, "failed_chain"); - // Both current and parent lookup should be rejected + rig.expect_no_penalty_for(peer_id); + // Both current and parent lookup should not be created rig.expect_no_active_lookups(); } From 5928407ce45b539082874ca1f9c5e3e0704f5d85 Mon Sep 17 00:00:00 2001 From: Toki <105550481+gitToki@users.noreply.github.com> Date: Wed, 17 Sep 2025 06:51:43 +0200 Subject: [PATCH 41/81] fix(rate_limiter): add missing prune calls for light client protocols (#8058) Co-Authored-By: Jimmy Chen Co-Authored-By: gitToki --- .github/mergify.yml | 4 ++ .../src/rpc/rate_limiter.rs | 45 ++++++++++++++----- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 4ab73bcf07..0b917b2546 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -105,6 +105,10 @@ queue_rules: {{ body | get_section("## Proposed Changes", "") }} + + {% for commit in commits | unique(attribute='email_author') %} + Co-Authored-By: {{ commit.author }} <{{ commit.email_author }}> + 
{% endfor %} queue_conditions: - "#approved-reviews-by >= 1" - "check-success=license/cla" diff --git a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs index 65cd1c2e61..8b364f506c 100644 --- a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs @@ -382,16 +382,41 @@ impl RPCRateLimiter { pub fn prune(&mut self) { let time_since_start = self.init_time.elapsed(); - self.ping_rl.prune(time_since_start); - self.status_rl.prune(time_since_start); - self.metadata_rl.prune(time_since_start); - self.goodbye_rl.prune(time_since_start); - self.bbrange_rl.prune(time_since_start); - self.bbroots_rl.prune(time_since_start); - self.blbrange_rl.prune(time_since_start); - self.blbroot_rl.prune(time_since_start); - self.dcbrange_rl.prune(time_since_start); - self.dcbroot_rl.prune(time_since_start); + + let Self { + prune_interval: _, + init_time: _, + goodbye_rl, + ping_rl, + metadata_rl, + status_rl, + bbrange_rl, + bbroots_rl, + blbrange_rl, + blbroot_rl, + dcbroot_rl, + dcbrange_rl, + lc_bootstrap_rl, + lc_optimistic_update_rl, + lc_finality_update_rl, + lc_updates_by_range_rl, + fork_context: _, + } = self; + + goodbye_rl.prune(time_since_start); + ping_rl.prune(time_since_start); + metadata_rl.prune(time_since_start); + status_rl.prune(time_since_start); + bbrange_rl.prune(time_since_start); + bbroots_rl.prune(time_since_start); + blbrange_rl.prune(time_since_start); + blbroot_rl.prune(time_since_start); + dcbrange_rl.prune(time_since_start); + dcbroot_rl.prune(time_since_start); + lc_bootstrap_rl.prune(time_since_start); + lc_optimistic_update_rl.prune(time_since_start); + lc_finality_update_rl.prune(time_since_start); + lc_updates_by_range_rl.prune(time_since_start); } } From 3cb7e59be2ebcf66836dabae2c771b455822f654 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 18 Sep 2025 11:17:31 +1000 Subject: [PATCH 42/81] Update issue template (#7938) * 
Update issue template * Delete old issue template --- .../default-issue-template.md} | 9 +++++++++ 1 file changed, 9 insertions(+) rename .github/{ISSUE_TEMPLATE.md => ISSUE_TEMPLATE/default-issue-template.md} (79%) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/default-issue-template.md similarity index 79% rename from .github/ISSUE_TEMPLATE.md rename to .github/ISSUE_TEMPLATE/default-issue-template.md index d73b9ff6f0..784add20f3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE/default-issue-template.md @@ -1,3 +1,12 @@ +--- +name: Default issue template +about: Use this template for all issues +title: '' +labels: '' +assignees: '' + +--- + ## Description Please provide a brief description of the issue. From 521be2b7576e94a0ca01107cc08d0b3a35a96dee Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Wed, 17 Sep 2025 18:33:42 -0700 Subject: [PATCH 43/81] Prevent silently dropping cell proof chunks (#8023) Co-Authored-By: Eitan Seri- Levi --- beacon_node/beacon_chain/src/kzg_utils.rs | 7 +++++++ beacon_node/http_api/src/publish_blocks.rs | 2 +- consensus/types/src/data_column_sidecar.rs | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index 2147ed5966..ad669e1729 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -174,6 +174,13 @@ pub fn blobs_to_data_column_sidecars( let kzg_commitments_inclusion_proof = block.message().body().kzg_commitments_merkle_proof()?; let signed_block_header = block.signed_block_header(); + if cell_proofs.len() != blobs.len() * E::number_of_columns() { + return Err(DataColumnSidecarError::InvalidCellProofLength { + expected: blobs.len() * E::number_of_columns(), + actual: cell_proofs.len(), + }); + } + let proof_chunks = cell_proofs .chunks_exact(E::number_of_columns()) .collect::>(); diff --git a/beacon_node/http_api/src/publish_blocks.rs 
b/beacon_node/http_api/src/publish_blocks.rs index b6411167d9..05a4a4b7a4 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -412,7 +412,7 @@ fn build_data_columns( error!( error = ?e, %slot, - "Invalid data column - not publishing block" + "Invalid data column - not publishing data columns" ); warp_utils::reject::custom_bad_request(format!("{e:?}")) })?; diff --git a/consensus/types/src/data_column_sidecar.rs b/consensus/types/src/data_column_sidecar.rs index 57f7a88e19..2272b1695c 100644 --- a/consensus/types/src/data_column_sidecar.rs +++ b/consensus/types/src/data_column_sidecar.rs @@ -143,6 +143,7 @@ pub enum DataColumnSidecarError { PreDeneb, SszError(SszError), BuildSidecarFailed(String), + InvalidCellProofLength { expected: usize, actual: usize }, } impl From for DataColumnSidecarError { From 684632df731a69d6e42531bc1c323557a7b45d7e Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Thu, 18 Sep 2025 15:16:59 +1000 Subject: [PATCH 44/81] Fix reprocess queue memory leak (#8065) Fix a memory leak in the reprocess queue. If the vec of attestation IDs for a block is never evicted from the reprocess queue by a `BlockImported` event, then it stays in the map forever consuming memory. The fix is to remove the entry when its last attestation times out. We do similarly for light client updates. In practice this will only occur if there is a race between adding an attestation to the queue and processing the `BlockImported` event, or if there are attestations for block roots that we never import (e.g. random block roots, block roots of invalid blocks). 
Co-Authored-By: Michael Sproul --- .../src/scheduler/work_reprocessing_queue.rs | 139 ++++++++++++++++-- 1 file changed, 130 insertions(+), 9 deletions(-) diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 9565e57589..3e755f0830 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -37,7 +37,9 @@ const TASK_NAME: &str = "beacon_processor_reprocess_queue"; const GOSSIP_BLOCKS: &str = "gossip_blocks"; const RPC_BLOCKS: &str = "rpc_blocks"; const ATTESTATIONS: &str = "attestations"; +const ATTESTATIONS_PER_ROOT: &str = "attestations_per_root"; const LIGHT_CLIENT_UPDATES: &str = "lc_updates"; +const LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT: &str = "lc_updates_per_parent_root"; /// Queue blocks for re-processing with an `ADDITIONAL_QUEUED_BLOCK_DELAY` after the slot starts. /// This is to account for any slight drift in the system clock. @@ -829,10 +831,19 @@ impl ReprocessQueue { ); } - if let Some(queued_atts) = self.awaiting_attestations_per_root.get_mut(&root) - && let Some(index) = queued_atts.iter().position(|&id| id == queued_id) + if let Entry::Occupied(mut queued_atts) = + self.awaiting_attestations_per_root.entry(root) + && let Some(index) = + queued_atts.get().iter().position(|&id| id == queued_id) { - queued_atts.swap_remove(index); + let queued_atts_mut = queued_atts.get_mut(); + queued_atts_mut.swap_remove(index); + + // If the vec is empty after this attestation's removal, we need to delete + // the entry to prevent bloating the hashmap indefinitely. 
+ if queued_atts_mut.is_empty() { + queued_atts.remove_entry(); + } } } } @@ -853,13 +864,19 @@ impl ReprocessQueue { error!("Failed to send scheduled light client optimistic update"); } - if let Some(queued_lc_updates) = self - .awaiting_lc_updates_per_parent_root - .get_mut(&parent_root) - && let Some(index) = - queued_lc_updates.iter().position(|&id| id == queued_id) + if let Entry::Occupied(mut queued_lc_updates) = + self.awaiting_lc_updates_per_parent_root.entry(parent_root) + && let Some(index) = queued_lc_updates + .get() + .iter() + .position(|&id| id == queued_id) { - queued_lc_updates.swap_remove(index); + let queued_lc_updates_mut = queued_lc_updates.get_mut(); + queued_lc_updates_mut.swap_remove(index); + + if queued_lc_updates_mut.is_empty() { + queued_lc_updates.remove_entry(); + } } } } @@ -929,11 +946,21 @@ impl ReprocessQueue { &[ATTESTATIONS], self.attestations_delay_queue.len() as i64, ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[ATTESTATIONS_PER_ROOT], + self.awaiting_attestations_per_root.len() as i64, + ); metrics::set_gauge_vec( &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, &[LIGHT_CLIENT_UPDATES], self.lc_updates_delay_queue.len() as i64, ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT], + self.awaiting_lc_updates_per_parent_root.len() as i64, + ); } fn recompute_next_backfill_batch_event(&mut self) { @@ -979,6 +1006,7 @@ impl ReprocessQueue { #[cfg(test)] mod tests { use super::*; + use crate::BeaconProcessorConfig; use logging::create_test_tracing_subscriber; use slot_clock::{ManualSlotClock, TestingSlotClock}; use std::ops::Add; @@ -1101,4 +1129,97 @@ mod tests { Duration::from_secs(slot_duration), ) } + + fn test_queue() -> ReprocessQueue { + create_test_tracing_subscriber(); + + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, _) = mpsc::channel::(config.max_scheduled_work_queue_len); + let 
(_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(12)); + + ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock) + } + + // This is a regression test for a memory leak in `awaiting_attestations_per_root`. + // See: https://github.com/sigp/lighthouse/pull/8065 + #[tokio::test] + async fn prune_awaiting_attestations_per_root() { + create_test_tracing_subscriber(); + + let mut queue = test_queue(); + + // Pause time so it only advances manually + tokio::time::pause(); + + let beacon_block_root = Hash256::repeat_byte(0xaf); + + // Insert an attestation. + let att = ReprocessQueueMessage::UnknownBlockUnaggregate(QueuedUnaggregate { + beacon_block_root, + process_fn: Box::new(|| {}), + }); + + // Process the event to enter it into the delay queue. + queue.handle_message(InboundEvent::Msg(att)); + + // Check that it is queued. + assert_eq!(queue.awaiting_attestations_per_root.len(), 1); + assert!( + queue + .awaiting_attestations_per_root + .contains_key(&beacon_block_root) + ); + + // Advance time to expire the attestation. + advance_time(&queue.slot_clock, 2 * QUEUED_ATTESTATION_DELAY).await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyAttestation(_))); + queue.handle_message(ready_msg); + + // The entry for the block root should be gone. + assert!(queue.awaiting_attestations_per_root.is_empty()); + } + + // This is a regression test for a memory leak in `awaiting_lc_updates_per_parent_root`. + // See: https://github.com/sigp/lighthouse/pull/8065 + #[tokio::test] + async fn prune_awaiting_lc_updates_per_parent_root() { + create_test_tracing_subscriber(); + + let mut queue = test_queue(); + + // Pause time so it only advances manually + tokio::time::pause(); + + let parent_root = Hash256::repeat_byte(0xaf); + + // Insert a light client update. 
+ let msg = + ReprocessQueueMessage::UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate { + parent_root, + process_fn: Box::new(|| {}), + }); + + // Process the event to enter it into the delay queue. + queue.handle_message(InboundEvent::Msg(msg)); + + // Check that it is queued. + assert_eq!(queue.awaiting_lc_updates_per_parent_root.len(), 1); + assert!( + queue + .awaiting_lc_updates_per_parent_root + .contains_key(&parent_root) + ); + + // Advance time to expire the update. + advance_time(&queue.slot_clock, 2 * QUEUED_LIGHT_CLIENT_UPDATE_DELAY).await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyLightClientUpdate(_))); + queue.handle_message(ready_msg); + + // The entry for the parent root should be gone. + assert!(queue.awaiting_lc_updates_per_parent_root.is_empty()); + } } From 3543a20192bb67190855200d8e2203c1e6a03b3c Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Thu, 18 Sep 2025 15:17:03 +1000 Subject: [PATCH 45/81] Add experimental complete-blob-backfill flag (#7751) A different (and complementary) approach for: - https://github.com/sigp/lighthouse/issues/5391 This PR adds a flag to set the DA boundary to the Deneb fork. The effect of this change is that Lighthouse will try to backfill _all_ blobs. Most peers do not have this data, but I'm thinking that combined with `trusted-peers` this could be quite effective. 
Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/builder.rs | 2 ++ beacon_node/beacon_chain/src/chain_config.rs | 3 ++ .../src/data_availability_checker.rs | 27 +++++++++++++--- beacon_node/src/cli.rs | 10 ++++++ beacon_node/src/config.rs | 8 +++++ lighthouse/tests/beacon_node.rs | 31 +++++++++++++++++++ 6 files changed, 76 insertions(+), 5 deletions(-) diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 5e7aa7d4f8..35432632cc 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -899,6 +899,7 @@ where let genesis_time = head_snapshot.beacon_state.genesis_time(); let canonical_head = CanonicalHead::new(fork_choice, Arc::new(head_snapshot)); let shuffling_cache_size = self.chain_config.shuffling_cache_size; + let complete_blob_backfill = self.chain_config.complete_blob_backfill; // Calculate the weak subjectivity point in which to backfill blocks to. let genesis_backfill_slot = if self.chain_config.genesis_backfill { @@ -1013,6 +1014,7 @@ where genesis_backfill_slot, data_availability_checker: Arc::new( DataAvailabilityChecker::new( + complete_blob_backfill, slot_clock, self.kzg.clone(), store, diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index d6be96afe9..a7defa9fa2 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -86,6 +86,8 @@ pub struct ChainConfig { /// If using a weak-subjectivity sync, whether we should download blocks all the way back to /// genesis. pub genesis_backfill: bool, + /// EXPERIMENTAL: backfill blobs and data columns beyond the data availability window. + pub complete_blob_backfill: bool, /// Whether to send payload attributes every slot, regardless of connected proposers. /// /// This is useful for block builders and testing. 
@@ -144,6 +146,7 @@ impl Default for ChainConfig { optimistic_finalized_sync: true, shuffling_cache_size: crate::shuffling_cache::DEFAULT_CACHE_SIZE, genesis_backfill: false, + complete_blob_backfill: false, always_prepare_payload: false, epochs_per_migration: crate::migrate::DEFAULT_EPOCHS_PER_MIGRATION, enable_light_client_server: true, diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 307dc0e227..88cd8f3aab 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -78,6 +78,7 @@ pub const STATE_LRU_CAPACITY: usize = STATE_LRU_CAPACITY_NON_ZERO.get(); /// proposer. Having a capacity > 1 is an optimization to prevent sync lookup from having re-fetch /// data during moments of unstable network conditions. pub struct DataAvailabilityChecker { + complete_blob_backfill: bool, availability_cache: Arc>, slot_clock: T::SlotClock, kzg: Arc, @@ -116,6 +117,7 @@ impl Debug for Availability { impl DataAvailabilityChecker { pub fn new( + complete_blob_backfill: bool, slot_clock: T::SlotClock, kzg: Arc, store: BeaconStore, @@ -129,6 +131,7 @@ impl DataAvailabilityChecker { spec.clone(), )?; Ok(Self { + complete_blob_backfill, availability_cache: Arc::new(inner), slot_clock, kzg, @@ -518,9 +521,15 @@ impl DataAvailabilityChecker { /// The epoch at which we require a data availability check in block processing. /// `None` if the `Deneb` fork is disabled. 
pub fn data_availability_boundary(&self) -> Option { - let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch()); - self.spec - .min_epoch_data_availability_boundary(current_epoch) + let fork_epoch = self.spec.deneb_fork_epoch?; + + if self.complete_blob_backfill { + Some(fork_epoch) + } else { + let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch()); + self.spec + .min_epoch_data_availability_boundary(current_epoch) + } } /// Returns true if the given epoch lies within the da boundary and false otherwise. @@ -1076,7 +1085,15 @@ mod test { let kzg = get_kzg(&spec); let store = Arc::new(HotColdDB::open_ephemeral(<_>::default(), spec.clone()).unwrap()); let custody_context = Arc::new(CustodyContext::new(false)); - DataAvailabilityChecker::new(slot_clock, kzg, store, custody_context, spec) - .expect("should initialise data availability checker") + let complete_blob_backfill = false; + DataAvailabilityChecker::new( + complete_blob_backfill, + slot_clock, + kzg, + store, + custody_context, + spec, + ) + .expect("should initialise data availability checker") } } diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 386eb721a0..9a981c6581 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -401,6 +401,16 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) + .arg( + Arg::new("complete-blob-backfill") + .long("complete-blob-backfill") + .help("Download all blobs back to the Deneb fork epoch. 
This will likely result in \ + the node banning most of its peers.") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .display_order(0) + .hide(true) + ) .arg( Arg::new("enable-private-discovery") .long("enable-private-discovery") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 1b5f25b317..3681556d11 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -825,6 +825,14 @@ pub fn get_config( client_config.chain.genesis_backfill = true; } + client_config.chain.complete_blob_backfill = cli_args.get_flag("complete-blob-backfill"); + + // Ensure `prune_blobs` is false whenever complete-blob-backfill is set. This overrides any + // setting of `--prune-blobs true` applied earlier in flag parsing. + if client_config.chain.complete_blob_backfill { + client_config.store.prune_blobs = false; + } + // Backfill sync rate-limiting client_config.beacon_processor.enable_backfill_rate_limiting = !cli_args.get_flag("disable-backfill-rate-limiting"); diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 1fd3cc1b79..0660073bbc 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -392,6 +392,37 @@ fn genesis_backfill_with_historic_flag() { .with_config(|config| assert!(config.chain.genesis_backfill)); } +#[test] +fn complete_blob_backfill_default() { + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| assert!(!config.chain.complete_blob_backfill)); +} + +#[test] +fn complete_blob_backfill_flag() { + CommandLineTest::new() + .flag("complete-blob-backfill", None) + .run_with_zero_port() + .with_config(|config| { + assert!(config.chain.complete_blob_backfill); + assert!(!config.store.prune_blobs); + }); +} + +// Even if `--prune-blobs true` is provided, `--complete-blob-backfill` should override it to false. 
+#[test] +fn complete_blob_backfill_and_prune_blobs_true() { + CommandLineTest::new() + .flag("complete-blob-backfill", None) + .flag("prune-blobs", Some("true")) + .run_with_zero_port() + .with_config(|config| { + assert!(config.chain.complete_blob_backfill); + assert!(!config.store.prune_blobs); + }); +} + // Tests for Eth1 flags. // DEPRECATED but should not crash #[test] From 92f60b8fd2a9b62a7999da2fc91043e3c87fd4b8 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 18 Sep 2025 16:13:27 +1000 Subject: [PATCH 46/81] Add release helper script to list PRs and breaking changes (#7737) Output for 7.1.0 release: ``` # Commit SHA PR Number Has backwards-incompat Label PR Title --- ------------ ----------- ------------------------------ -------------------------------------------- 1 d5a03c9d86bf 6872 False Add more range sync tests (#6872) 2 ec2fe3812edc - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0-beta.0' into unstable 3 3992d6ba74c9 6862 False Fix misc PeerDAS todos (#6862) 4 d60388134d07 6928 False Add PeerDAS metrics to track subnets without peers (#6928) 5 431dd7c39828 6917 False Remove un-used batch sync error condition (#6917) 6 0055af56b685 6932 False Unsubscribe blob topics at Fulu fork (#6932) 7 6ab6eae40c0e - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0-beta.0' into unstable 8 193061ff7376 6634 False Use RpcSend on RPC::self_limiter::ready_requests (#6634) 9 e5e43ecd8129 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0' into unstable 10 b4be5141823f 7012 False Add spamoor_blob in network_params.yaml (#7012) 11 01df433dfd02 7021 False update codeowners, to be more specific (#7021) 12 60964fc7b530 6829 False Expose blst internals (#6829) 13 3fab6a2c0ba7 6866 False Block availability data enum (#6866) 14 6e11bddd4bd0 6947 False feat: adds CLI flags to delay publishing for edge case testing on PeerDAS devnets (#6947) 15 454c7d05c40b 7017 False Remove LC server config from HTTP API (#7017) 
16 54b4150a6220 7030 False Add test flag to override `SYNC_TOLERANCE_EPOCHS` for range sync testing (#7030) 17 cf4104abe5e2 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0' into unstable 18 8a772520a50a 7034 False Cache validator registration only after successful publish (#7034) 19 1235d4480225 7048 False Remove `watch` (#7048) 20 3bc5f1f2a58b 7081 False Validator Registration ssz support (#7081) 21 b4e79edf2a09 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0' into unstable 22 8d1abce26ed5 6915 False Bump SSZ version for larger bitfield `SmallVec` (#6915) 23 1916a2ac5ad3 7020 False chore: update to rust-eth-kzg to 0.5.4 (#7020) 24 1a08e6f0a090 7109 False Remove duplicate sync_tolerance_epochs config (#7109) 25 f23f984f8575 7057 False switch to upstream gossipsub (#7057) 26 d60c24ef1cc0 6339 True Integrate tracing (#6339) 27 a6bdc474db01 6991 False Log range sync download errors (#6991) 28 574b204bdb39 6680 False decouple `eth2` from `store` and `lighthouse_network` (#6680) 29 c095a0a58feb 7130 False update gossipsub to the latest upstream revision (#7130) 30 5cda1641ea2f 7137 False Log `file appender` initialization errors properly (#7137) 31 d96123b02882 7149 False Remove unnecessary `filter_layer` in logger builder (#7149) 32 a1b1d7ae589f 7150 False Remove `discv5` logs from logfile output (#7150) 33 ca237652f1da 6998 False Track request IDs in RangeBlockComponentsRequest (#6998) 34 d323699fde01 7183 False Add missing `osaka-time` lcli param (#7183) 35 cbf1c04a1486 - - [NO PR MATCH]: resolve merge conflicts between untstable and release-v7.0.0 36 2f37bf4de5e3 - - [NO PR MATCH]: Fix more merge conflicts between unstable and release-v7.0.0 37 3f6c11db0eb6 6995 False Some updates to Lighthouse book (#6995) 38 9dce729cb6a0 7182 False Ensure sqlite and rusqlite are optional in `consensus/types` (#7182) 39 6f31d4434308 7033 False Remove CGC from data_availability checker (#7033) 40 ca8eaea11677 7169 True Remove `crit` as 
an option from the CLI entirely (#7169) 41 bde0f1ef0b29 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0' into unstable 42 fb7ec0d151d4 7112 False Change `genesis-state-url-timeout` (#7112) 43 4839ed620fa9 7168 False Tracing cleanup (#7168) 44 578db67755cb - - [NO PR MATCH]: Merge remote-tracking branch 'origin/release-v7.0.0' into backmerge-apr-2 45 80626e58d224 7244 False Attempt to fix flaky network tests (#7244) 46 d6cd049a453b 7238 False RPC RequestId Cleanup (#7238) 47 0e6da0fcafe2 - - [NO PR MATCH]: Merge branch 'release-v7.0.0' into v7-backmerge 48 57abffcd997f 7240 False Disable log color when running in non-interactive mode (#7240) 49 6a75f24ab13e 7188 False Fix the `getBlobs` metric and ensure it is recorded promptly to prevent miscounts (#7188) 50 7cc64cab8352 6990 False Add missing error log and remove redundant id field from lookup logs (#6990) 51 591fb7df141d - - [NO PR MATCH]: Merge branch 'release-v7.0.0' into backmerge-for-openssl 52 e77fb01a063c 7265 False Remove CLI conflict for secrets-dir and datadir (#7265) 53 b5d40e3db06d 7256 False Align logs (#7256) 54 70850fe58d56 6744 True Drop head tracker for summaries DAG (#6744) 55 47a85cd1186d 7269 False Bump version to v7.1.0-beta.0 (not a release) (#7269) 56 e924264e17b8 7258 False Fullnodes to publish data columns from EL `getBlobs` (#7258) 57 759b0612b37f 7117 False Offloading KZG Proof Computation from the beacon node (#7117) 58 d96b73152e0e 7192 False Fix for #6296: Deterministic RNG in peer DAS publish block tests (#7192) 59 39eb8145f89e - - [NO PR MATCH]: Merge branch 'release-v7.0.0' into unstable 60 70f8ab9a6fc2 7309 False Add riscv64 build support (#7309) 61 be68dd24d05f 7281 False Fix wrong custody column count for lookup blocks (#7281) 62 08882c64cae5 6996 False Fix execution engine integration tests with latest geth version (#6996) 63 476f3a593c20 7161 False Add `MAX_BLOBS_PER_BLOCK_FULU` config (#7161) 64 c32569ab83bb 7225 False Restore HTTP API logging and add 
more metrics (#7225) 65 410af7c5f5dc 7279 False feat: update mainnet bootnodes (#7279) 66 80fe133d2c4c 7280 False Update Lighthouse Book for Electra features (#7280) 67 9f4b0cdc2855 7343 False Fix Kurtosis doppelganger CI (#7343) 68 e61e92b926d5 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/stable' into unstable 69 5527125f5e13 7340 False Fix GitHub releases page looks bad in GitHub dark theme (#7340) 70 c13e069c9c63 7324 False Revise logging when `queue is full` (#7324) 71 1dd37048b9d1 7346 False Enable cross-compiling for riscv64 architecture (#7346) 72 402a81cdd78e 7350 False Fix Kurtosis testnet (#7350) 73 1324d3d3c4c2 5923 False Delayed RPC Send Using Tokens (#5923) 74 6fad18644bbe 6747 False feat: presign for validator account (#6747) 75 2e2b0d2176e0 7351 False Revise consolidation info in Lighthouse book (#7351) 76 63a10eaaea62 6956 True Changing `boot_enr.yaml` to expect `bootstap_nodes.yaml` for pectra devnet (#6956) 77 34a6c3a93029 6897 True vc: increase default gas limit (#6897) 78 94ccd7608ea8 6653 False Add documentation for VC API `/lighthouse/beacon/health` (#6653) 79 9779b4ba2c04 7326 False Optimize `validate_data_columns` (#7326) 80 93ec9df13760 7304 False Compute proposer shuffling only once in gossip verification (#7304) 81 2aa5d5c25e22 7359 False Make sure to log SyncingChain ID (#7359) 82 c8224c8d5e19 7387 False docs: fix broken link to voluntary exit guide (#7387) 83 43c38a6fa0cc 7378 False Change slog to tracing in comments (#7378) 84 beb0ce68bdf6 6922 False Make range sync peer loadbalancing PeerDAS-friendly (#6922) 85 3d92e3663b74 6705 False Modularize validator store (#6705) 86 058dae064184 7405 False Add requires --http when using vc subcommands --http-port (#7405) 87 0f13029c7d51 7409 False Don't publish data columns reconstructed from RPC columns to the gossip network (#7409) 88 8dc3d23af083 7400 False Add a default timeout to all `BeaconNodeHttpClient` requests (#7400) 89 e90fcbe6577c 7416 False Add ARM binary for macOS in 
release (#7416) 90 4b9c16fc7175 7199 False Add Electra forks to basic sim tests (#7199) 91 a497ec601cae 6975 False Retry custody requests after peer metadata updates (#6975) 92 e0c1f27e1303 7394 False simulator: Persist beacon logs (#7394) 93 92391cdac665 7284 False update gossipsub to the latest upstream revision (#7284) 94 593390162f47 7399 False `peerdas-devnet-7`: update `DataColumnSidecarsByRoot` request to use `DataColumnsByRootIdentifier` (#7399) 95 5b25a48af34b 7404 False Siren installation improvement (#7404) 96 e051c7ca89c8 7396 False Siren Pectra Feature Updates (#7396) 97 0a917989b218 7370 False impl test random for some types (#7370) 98 807848bc7ac4 7443 False Next sync committee branch bug (#7443) 99 851ee2bcedfc 7454 False Extract get_domain for VoluntaryExit (#7454) 100 c2c7fb87a862 7460 False Make DAG construction more permissive (#7460) 101 b1138c28fb94 7451 False Add additional mergify rules to automate triaging (#7451) 102 cc6ae9d3f09c 7463 False Fix mergify infinite loop. (#7463) 103 1853d836b7e4 7458 False Added E::slots_per_epoch() to deneb time calculation (#7458) 104 c4182e362b8f 7433 False simulator: Write dependency logs to separate files (#7433) 105 e0ee148d6aca 7470 False Prevent mergify from updating labels while CI is still running. (#7470) 106 e21198c08baa 7472 False One more attempt to fix mergify condition. 
(#7472) 107 268809a53069 7471 False Rust clippy 1.87 lint fixes (#7471) 108 b051a5d6cc7b 7469 False Delete `at-most` in `lighthouse vm create` (#7469) 109 1d27855db7be 7369 False impl from hash256 for `ExecutionBlockHash` (#7369) 110 23ad833747b6 7417 False Change default EngineState to online (#7417) 111 fcfcbf9a11b3 7481 False Update mdlint to disable descriptive-link-text (#7481) 112 7684d1f866ab 7372 False ContextDeserialize and Beacon API Improvements (#7372) 113 5393d33af823 7411 False Silence `Uninitialized` warn log on start-up (#7411) 114 1e6cdeb88a6a 6799 False feat: Add docker reproducible builds (#6799) 115 50dbfdf61243 7455 False Some updates to Lighthouse book (#7455) 116 af87135e3020 7484 False Move MD059 rule to configuration file (#7484) 117 805c2dc831e6 5047 False Correct reward denominator in op pool (#5047) 118 7e2df6b602a1 7474 False Empty list `[]` to return all validators balances (#7474) 119 f06d1d034615 7495 False Fix blob download from checkpointz servers (#7495) 120 0688932de28d 7497 False Pass blobs into `ValidatorStore::sign_block` (#7497) 121 e29b607257d8 7427 False Move notifier and latency service to `validator_services` (#7427) 122 7759cb8f91c0 7494 False Update mergify rule to not evaluate PRs that are not ready for review - to reduce noise and avoid updating stale PRs. 
(#7494) 123 2e96e9769b99 7507 False Use slice.is_sorted now that it's stable (#7507) 124 a8035d7395ea 7506 False Enable stdout logging in rpc_tests (#7506) 125 817f14c3491a 7500 False Send execution_requests in fulu (#7500) 126 537fc5bde860 7459 False Revive network-test logs files in CI (#7459) 127 cf0f95985540 7180 False Improve log readability during rpc_tests (#7180) 128 ce8d0814ad71 7246 False Ensure logfile permissions are maintained after rotation (#7246) 129 6af8c187e0b7 7052 False Publish EL Info in Metrics (#7052) 130 a2797d4bbde9 7512 False Fix formatting errors from cargo-sort (#7512) 131 f01dc556d157 7505 False Update `engine_getBlobsV2` response type and add `getBlobsV2` tests (#7505) 132 e6ef644db4e8 7493 False Verify `getBlobsV2` response and avoid reprocessing imported data columns (#7493) 133 7c89b970afe2 7382 False Handle attestation validation errors (#7382) 134 8dde5bdb4413 - - [NO PR MATCH]: Update mergify rules so that I can add `waiting-on-author` on a PR that's passing CI. Remove noisy comments. 
135 8989ef8fb11e 7025 False Enable arithmetic lint in rate-limiter (#7025) 136 b7fc03437bba - - [NO PR MATCH]: Fix condition 137 9e9c51be6fef - - [NO PR MATCH]: Remove redundant `and` 138 999b04517e35 - - [NO PR MATCH]: Merge pull request #7525 from jimmygchen/mergify-again 139 0ddf9a99d64a 7332 False Remove support for database migrations prior to schema version v22 (#7332) 140 5cda6a6f9e4b 7522 False Mitigate flakiness in test_delayed_rpc_response (#7522) 141 4d21846aba6b 7533 False Prevent `AvailabilityCheckError` when there's no new custody columns to import (#7533) 142 39744df93f0b 7393 False simulator: Fix `Failed to initialize dependency logging` (#7393) 143 38a5f338fad7 7529 False Add `console-subscriber` feature for debugging (#7529) 144 886ceb7e25e0 6882 False Run Assertoor tests in CI (#6882) 145 94a1446ac955 7541 False Fix unexpected blob error and duplicate import in fetch blobs (#7541) 146 ae30480926b6 7521 False Implement EIP-7892 BPO hardforks (#7521) 147 f67068e1ec53 7518 False Update `staking-deposit-cli` to `ethstaker-deposit-cli` (#7518) 148 cd83d8d95ddd 7544 False Add a name to the Tokio task (#7544) 149 357a8ccbb996 7549 False Checkpoint sync without the blobs from Fulu (#7549) 150 2d9fc34d4326 7540 False Fulu EF tests v1.6.0-alpha.0 (#7540) 151 dcee76c0dc88 7548 False Update key generation in validator manager (#7548) 152 9a4972053eb5 7530 False Add e2e sync tests to CI (#7530) 153 d457ceeaafae 7118 False Don't create child lookup if parent is faulty (#7118) 154 2f807e21bede 7538 False Add support for nightly tests (#7538) 155 e098f667380c 7570 False Update kurtosis config and EL images (#7570) 156 b2e8b67e3446 7566 False Reduce number of basic sim test nodes from 7 to 4 (#7566) 157 170cd0f5875d 7579 False Store the libp2p/discv5 logs when stopping local-testnet (#7579) 158 b08d49c4cb34 7559 False Changes for `fusaka-devnet-1` (#7559) 159 8c6abc0b69b7 7574 False Optimise parallelism in compute cells operations by zipping first (#7574) 160 
7416d06dce8e 7561 False Add genesis sync test to CI (#7561) 161 076a1c3faead 7587 False Data column sidecar event (#7587) 162 5f208bb85829 7578 True Implement basic validator custody framework (no backfill) (#7578) 163 9803d69d8045 7590 False Implement status v2 version (#7590) 164 5472cb85008b 7582 False Batch verify KZG proofs for getBlobsV2 (#7582) 165 a65f78222d69 7594 False Drop stale registrations without reducing CGC (#7594) 166 ccd99c138c27 7588 False Wait before column reconstruction (#7588) 167 dc5f5af3eb53 7595 False Fix flaky test_rpc_block_reprocessing (#7595) 168 4fc0665ccdd6 7592 False Add more context to Late Block Re-orgs (#7592) 169 6135f417a2f4 7591 False Add data columns sidecars debug beacon API (#7591) 170 3d2d65bf8d24 7593 False Advertise `--advertise-false-custody-group-count` for testing PeerDAS (#7593) 171 6786b9d12a6d 7444 True Single attestation "Full" implementation (#7444) 172 dd985341581f 6750 True Hierarchical state diffs in hot DB (#6750) 173 f67084a571d1 7437 False Remove reprocess channel (#7437) 174 d50924677a34 7620 False Remove instrumenting log level (#7620) 175 11bcccb353c0 7133 True Remove all prod eth1 related code (#7133) 176 e34a9a0c65d5 6551 False Allow the `--beacon-nodes` list to be updated at runtime (#6551) 177 3fefda68e5c1 7611 False Send byrange responses in the correct requested range (#7611) 178 cef04ee2ee48 7462 False Implement `validator_identities` Beacon API endpoint (#7462) 179 fd643c310c4e 7632 False Un-ignore EF test for v1.6.0-alpha.1 (#7632) 180 56b2d4b5253b 7636 False Remove instrumenting log level (#7636) 181 8e3c5d152413 7644 False Rust 1.89 compiler lint fix (#7644) 182 a0a6b9300f11 7551 False Do not compute sync selection proofs for the sync duty at the current slot (#7551) 183 9b1f3ed9d1a4 7652 False Add gossip check (#7652) 184 83cad25d9880 7657 False Fix Rust 1.88 clippy errors & execution engine tests (#7657) 185 522e00f48df7 7656 False Fix incorrect `waker` update condition (#7656) 186 
6ea5f14b3988 7597 False feat: better error message for light_client/bootstrap endpoint (#7597) 187 2d759f78be6c 6576 False Fix beacon_chain metrics descriptions (#6576) 188 6be646ca1153 7666 True Bump DB schema to v25 (#7666) 189 e45ba846aef5 7673 False Increase http client default timeout to 2s in `http-api` tests. (#7673) 190 25ea8a83b77b 7667 False Add Michael as codeowner for store crate (#7667) 191 c1f94d9b7bf8 7669 False Test database schema stability (#7669) 192 257d2707182c 6612 False Add voluntary exit via validator manager (#6612) 193 e305cb1b921f 7661 True Custody persist fix (#7661) 194 41742ce2bde9 7683 False Update `SAMPLES_PER_SLOT` to be number of custody groups instead of data columns (#7683) 195 69c9c7038af7 7681 False Use prepare_beacon_proposer endpoint for validator custody registration (#7681) 196 fcc602a7872a 7646 False Update fulu network configs and add `MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS` (#7646) 197 a459a9af98c9 7689 False Fix and test checkpoint sync from genesis (#7689) 198 b35854b71f04 7692 False Record v2 beacon blocks http api metrics separately (#7692) 199 c7bb3b00e409 7693 False Fix lookups of the block at `oldest_block_slot` (#7693) 200 0f895f3066a3 7695 False Bump default gas limit (#7695) 201 56485cc9865a 7707 False Remove unneeded spans that caused debug logs to appear when level is set to `info` (#7707) 202 bd8a2a8ffbaa 7023 False Gossip recently computed light client data (#7023) 203 7b2f138ca7e7 - - [NO PR MATCH]: Merge remote-tracking branch 'origin/stable' into release-v7.1.0 204 8e55684b066f 7723 False Reintroduce `--logfile` with deprecation warning (#7723) 205 8b5ccacac9c0 7663 False Error from RPC `send_response` when request doesn't exist on the active inbound requests (#7663) 206 cfb1f7331064 7609 False Release v7.1.0 (#7609) ``` Co-Authored-By: Jimmy Chen --- scripts/print_release_diffs.py | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 
scripts/print_release_diffs.py diff --git a/scripts/print_release_diffs.py b/scripts/print_release_diffs.py new file mode 100644 index 0000000000..d910b1be5b --- /dev/null +++ b/scripts/print_release_diffs.py @@ -0,0 +1,72 @@ +""" +Summarise pull requests between two Lighthouse releases. + +Usage: + export GITHUB_TOKEN=your_token + python -m pip install requests==2.32.4 + python print_release_diffs.py --base v7.0.1 --head release-v7.1.0 + +Shows commit SHA, PR number, 'backwards-incompat' label status, and PR title. +""" + +import requests +import re +import argparse +import os + +GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") +if not GITHUB_TOKEN: + raise SystemExit("Error: Please set the GITHUB_TOKEN environment variable.") + +parser = argparse.ArgumentParser(description="Summarise PRs between two Lighthouse versions.") +parser.add_argument("--base", required=True, help="Base tag or branch (older release)") +parser.add_argument("--head", required=True, help="Head tag or branch (newer release)") +args = parser.parse_args() + +BASE = args.base +HEAD = args.head +OWNER = 'sigp' +REPO = 'lighthouse' + +HEADERS = { + 'Authorization': f'token {GITHUB_TOKEN}', + 'Accept': 'application/vnd.github+json' +} + +def get_commits_between(base, head): + url = f'https://api.github.com/repos/{OWNER}/{REPO}/compare/{base}...{head}' + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return response.json()['commits'] + +def has_backwards_incompat_label(pr_number): + url = f'https://api.github.com/repos/{OWNER}/{REPO}/issues/{pr_number}' + response = requests.get(url, headers=HEADERS) + if response.status_code != 200: + raise Exception(f"Failed to fetch PR #{pr_number}") + labels = response.json().get('labels', []) + return any(label['name'] == 'backwards-incompat' for label in labels) + +def main(): + commits = get_commits_between(BASE, HEAD) + print(" # Commit SHA PR Number Has backwards-incompat Label PR Title") + print("--- ------------ ----------- 
------------------------------ --------------------------------------------") + + for i, commit in enumerate(commits, 1): + sha = commit['sha'][:12] + message = commit['commit']['message'] + pr_match = re.search(r"\(#(\d+)\)", message) + + if not pr_match: + print(f"{i:<3} {sha} {'-':<11} {'-':<30} [NO PR MATCH]: {message.splitlines()[0]}") + continue + + pr_number = int(pr_match.group(1)) + try: + has_label = has_backwards_incompat_label(pr_number) + print(f"{i:<3} {sha} {pr_number:<11} {str(has_label):<30} {message.splitlines()[0]}") + except Exception as e: + print(f"{i:<3} {sha} {pr_number:<11} {'ERROR':<30} [ERROR FETCHING PR]: {e}") + +if __name__ == '__main__': + main() From 51321daabb5f0a401bff41d7f9b5d2f4e9646a75 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Thu, 18 Sep 2025 17:10:18 +1000 Subject: [PATCH 47/81] Make the block cache optional (#8066) Address contention on the store's `block_cache` by allowing it to be disabled when `--block-cache-size 0` is provided, and also making this the default. 
Co-Authored-By: Michael Sproul --- beacon_node/src/cli.rs | 2 +- beacon_node/store/src/config.rs | 4 +- beacon_node/store/src/hot_cold_store.rs | 214 ++++++++++++++---------- book/src/help_bn.md | 2 +- lighthouse/tests/beacon_node.rs | 15 +- 5 files changed, 143 insertions(+), 94 deletions(-) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 9a981c6581..238907adce 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -779,7 +779,7 @@ pub fn cli_app() -> Command { .long("block-cache-size") .value_name("SIZE") .help("Specifies how many blocks the database should cache in memory") - .default_value("5") + .default_value("0") .action(ArgAction::Set) .display_order(0) ) diff --git a/beacon_node/store/src/config.rs b/beacon_node/store/src/config.rs index ad81fa6076..c0f15f2417 100644 --- a/beacon_node/store/src/config.rs +++ b/beacon_node/store/src/config.rs @@ -19,7 +19,7 @@ pub const DEFAULT_BACKEND: DatabaseBackend = DatabaseBackend::LevelDb; pub const PREV_DEFAULT_SLOTS_PER_RESTORE_POINT: u64 = 2048; pub const DEFAULT_SLOTS_PER_RESTORE_POINT: u64 = 8192; pub const DEFAULT_EPOCHS_PER_STATE_DIFF: u64 = 8; -pub const DEFAULT_BLOCK_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(64); +pub const DEFAULT_BLOCK_CACHE_SIZE: usize = 0; pub const DEFAULT_STATE_CACHE_SIZE: NonZeroUsize = new_non_zero_usize(128); pub const DEFAULT_STATE_CACHE_HEADROOM: NonZeroUsize = new_non_zero_usize(1); pub const DEFAULT_COMPRESSION_LEVEL: i32 = 1; @@ -34,7 +34,7 @@ pub const DEFAULT_BLOB_PUNE_MARGIN_EPOCHS: u64 = 0; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct StoreConfig { /// Maximum number of blocks to store in the in-memory block cache. - pub block_cache_size: NonZeroUsize, + pub block_cache_size: usize, /// Maximum number of states to store in the in-memory state cache. pub state_cache_size: NonZeroUsize, /// Minimum number of states to cull from the state cache upon fullness. 
diff --git a/beacon_node/store/src/hot_cold_store.rs b/beacon_node/store/src/hot_cold_store.rs index 52e52fe7ce..0d8a65e064 100644 --- a/beacon_node/store/src/hot_cold_store.rs +++ b/beacon_node/store/src/hot_cold_store.rs @@ -70,7 +70,7 @@ pub struct HotColdDB, Cold: ItemStore> { /// The hot database also contains all blocks. pub hot_db: Hot, /// LRU cache of deserialized blocks and blobs. Updated whenever a block or blob is loaded. - block_cache: Mutex>, + block_cache: Option>>, /// Cache of beacon states. /// /// LOCK ORDERING: this lock must always be locked *after* the `split` if both are required. @@ -229,7 +229,9 @@ impl HotColdDB, MemoryStore> { cold_db: MemoryStore::open(), blobs_db: MemoryStore::open(), hot_db: MemoryStore::open(), - block_cache: Mutex::new(BlockCache::new(config.block_cache_size)), + block_cache: NonZeroUsize::new(config.block_cache_size) + .map(BlockCache::new) + .map(Mutex::new), state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, @@ -281,7 +283,9 @@ impl HotColdDB, BeaconNodeBackend> { blobs_db: BeaconNodeBackend::open(&config, blobs_db_path)?, cold_db: BeaconNodeBackend::open(&config, cold_path)?, hot_db, - block_cache: Mutex::new(BlockCache::new(config.block_cache_size)), + block_cache: NonZeroUsize::new(config.block_cache_size) + .map(BlockCache::new) + .map(Mutex::new), state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, @@ -488,14 +492,17 @@ impl, Cold: ItemStore> HotColdDB pub fn register_metrics(&self) { let hsc_metrics = self.historic_state_cache.lock().metrics(); - metrics::set_gauge( - &metrics::STORE_BEACON_BLOCK_CACHE_SIZE, - self.block_cache.lock().block_cache.len() as i64, - ); - metrics::set_gauge( - &metrics::STORE_BEACON_BLOB_CACHE_SIZE, - self.block_cache.lock().blob_cache.len() as i64, - ); + if let Some(block_cache) = &self.block_cache { + let cache = block_cache.lock(); + metrics::set_gauge( + 
&metrics::STORE_BEACON_BLOCK_CACHE_SIZE, + cache.block_cache.len() as i64, + ); + metrics::set_gauge( + &metrics::STORE_BEACON_BLOB_CACHE_SIZE, + cache.blob_cache.len() as i64, + ); + } let state_cache = self.state_cache.lock(); metrics::set_gauge( &metrics::STORE_BEACON_STATE_CACHE_SIZE, @@ -553,7 +560,9 @@ impl, Cold: ItemStore> HotColdDB let block = self.block_as_kv_store_ops(block_root, block, &mut ops)?; self.hot_db.do_atomically(ops)?; // Update cache. - self.block_cache.lock().put_block(*block_root, block); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().put_block(*block_root, block)); Ok(()) } @@ -605,7 +614,9 @@ impl, Cold: ItemStore> HotColdDB metrics::inc_counter(&metrics::BEACON_BLOCK_GET_COUNT); // Check the cache. - if let Some(block) = self.block_cache.lock().get_block(block_root) { + if let Some(cache) = &self.block_cache + && let Some(block) = cache.lock().get_block(block_root) + { metrics::inc_counter(&metrics::BEACON_BLOCK_CACHE_HIT_COUNT); return Ok(Some(DatabaseBlock::Full(block.clone()))); } @@ -630,8 +641,8 @@ impl, Cold: ItemStore> HotColdDB // Add to cache. self.block_cache - .lock() - .put_block(*block_root, full_block.clone()); + .as_ref() + .inspect(|cache| cache.lock().put_block(*block_root, full_block.clone())); DatabaseBlock::Full(full_block) } else if !self.config.prune_payloads { @@ -902,7 +913,9 @@ impl, Cold: ItemStore> HotColdDB /// Delete a block from the store and the block cache. 
pub fn delete_block(&self, block_root: &Hash256) -> Result<(), Error> { - self.block_cache.lock().delete(block_root); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().delete(block_root)); self.hot_db .key_delete(DBColumn::BeaconBlock, block_root.as_slice())?; self.hot_db @@ -917,7 +930,9 @@ impl, Cold: ItemStore> HotColdDB block_root.as_slice(), &blobs.as_ssz_bytes(), )?; - self.block_cache.lock().put_blobs(*block_root, blobs); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().put_blobs(*block_root, blobs)); Ok(()) } @@ -945,9 +960,11 @@ impl, Cold: ItemStore> HotColdDB self.blobs_db .put(&DATA_COLUMN_CUSTODY_INFO_KEY, &data_column_custody_info)?; - self.block_cache - .lock() - .put_data_column_custody_info(Some(data_column_custody_info)); + self.block_cache.as_ref().inspect(|cache| { + cache + .lock() + .put_data_column_custody_info(Some(data_column_custody_info)) + }); Ok(()) } @@ -964,8 +981,8 @@ impl, Cold: ItemStore> HotColdDB &data_column.as_ssz_bytes(), )?; self.block_cache - .lock() - .put_data_column(*block_root, data_column); + .as_ref() + .inspect(|cache| cache.lock().put_data_column(*block_root, data_column)); } Ok(()) } @@ -1399,7 +1416,7 @@ impl, Cold: ItemStore> HotColdDB // Update database whilst holding a lock on cache, to ensure that the cache updates // atomically with the database. - let mut guard = self.block_cache.lock(); + let guard = self.block_cache.as_ref().map(|cache| cache.lock()); let blob_cache_ops = blobs_ops.clone(); // Try to execute blobs store ops. @@ -1446,56 +1463,67 @@ impl, Cold: ItemStore> HotColdDB return Err(e); } - for op in hot_db_cache_ops { + // Delete from the state cache. 
+ for op in &hot_db_cache_ops { match op { - StoreOp::PutBlock(block_root, block) => { - guard.put_block(block_root, (*block).clone()); - } - - StoreOp::PutBlobs(_, _) => (), - - StoreOp::PutDataColumns(_, _) => (), - - StoreOp::PutState(_, _) => (), - - StoreOp::PutStateSummary(_, _) => (), - StoreOp::DeleteBlock(block_root) => { - guard.delete_block(&block_root); - self.state_cache.lock().delete_block_states(&block_root); + self.state_cache.lock().delete_block_states(block_root); } - StoreOp::DeleteState(state_root, _) => { - self.state_cache.lock().delete_state(&state_root) + self.state_cache.lock().delete_state(state_root) } - - StoreOp::DeleteBlobs(_) => (), - - StoreOp::DeleteDataColumns(_, _) => (), - - StoreOp::DeleteExecutionPayload(_) => (), - - StoreOp::DeleteSyncCommitteeBranch(_) => (), - - StoreOp::KeyValueOp(_) => (), - } - } - - for op in blob_cache_ops { - match op { - StoreOp::PutBlobs(block_root, blobs) => { - guard.put_blobs(block_root, blobs); - } - - StoreOp::DeleteBlobs(block_root) => { - guard.delete_blobs(&block_root); - } - _ => (), } } - drop(guard); + // If the block cache is enabled, also delete from the block cache. 
+ if let Some(mut guard) = guard { + for op in hot_db_cache_ops { + match op { + StoreOp::PutBlock(block_root, block) => { + guard.put_block(block_root, (*block).clone()); + } + + StoreOp::PutBlobs(_, _) => (), + + StoreOp::PutDataColumns(_, _) => (), + + StoreOp::PutState(_, _) => (), + + StoreOp::PutStateSummary(_, _) => (), + + StoreOp::DeleteBlock(block_root) => { + guard.delete_block(&block_root); + } + + StoreOp::DeleteState(_, _) => (), + + StoreOp::DeleteBlobs(_) => (), + + StoreOp::DeleteDataColumns(_, _) => (), + + StoreOp::DeleteExecutionPayload(_) => (), + + StoreOp::DeleteSyncCommitteeBranch(_) => (), + + StoreOp::KeyValueOp(_) => (), + } + } + + for op in blob_cache_ops { + match op { + StoreOp::PutBlobs(block_root, blobs) => { + guard.put_blobs(block_root, blobs); + } + + StoreOp::DeleteBlobs(block_root) => { + guard.delete_blobs(&block_root); + } + + _ => (), + } + } + } Ok(()) } @@ -2425,21 +2453,23 @@ impl, Cold: ItemStore> HotColdDB /// If custody info doesn't exist in the cache, /// try to fetch from the DB and prime the cache. 
pub fn get_data_column_custody_info(&self) -> Result, Error> { - let Some(data_column_custody_info) = self.block_cache.lock().get_data_column_custody_info() - else { - let data_column_custody_info = self - .blobs_db - .get::(&DATA_COLUMN_CUSTODY_INFO_KEY)?; + if let Some(cache) = &self.block_cache + && let Some(data_column_custody_info) = cache.lock().get_data_column_custody_info() + { + return Ok(Some(data_column_custody_info)); + } + let data_column_custody_info = self + .blobs_db + .get::(&DATA_COLUMN_CUSTODY_INFO_KEY)?; - // Update the cache - self.block_cache + // Update the cache + self.block_cache.as_ref().inspect(|cache| { + cache .lock() - .put_data_column_custody_info(data_column_custody_info.clone()); + .put_data_column_custody_info(data_column_custody_info.clone()) + }); - return Ok(data_column_custody_info); - }; - - Ok(Some(data_column_custody_info)) + Ok(data_column_custody_info) } /// Fetch all columns for a given block from the store. @@ -2460,9 +2490,13 @@ impl, Cold: ItemStore> HotColdDB /// Fetch blobs for a given block from the store. pub fn get_blobs(&self, block_root: &Hash256) -> Result, Error> { // Check the cache. - if let Some(blobs) = self.block_cache.lock().get_blobs(block_root) { + if let Some(blobs) = self + .block_cache + .as_ref() + .and_then(|cache| cache.lock().get_blobs(block_root).cloned()) + { metrics::inc_counter(&metrics::BEACON_BLOBS_CACHE_HIT_COUNT); - return Ok(blobs.clone().into()); + return Ok(blobs.into()); } match self @@ -2481,8 +2515,8 @@ impl, Cold: ItemStore> HotColdDB { let blobs = BlobSidecarList::new(blobs, max_blobs_per_block as usize)?; self.block_cache - .lock() - .put_blobs(*block_root, blobs.clone()); + .as_ref() + .inspect(|cache| cache.lock().put_blobs(*block_root, blobs.clone())); Ok(BlobSidecarListFromRoot::Blobs(blobs)) } else { @@ -2515,8 +2549,8 @@ impl, Cold: ItemStore> HotColdDB // Check the cache. 
if let Some(data_column) = self .block_cache - .lock() - .get_data_column(block_root, column_index) + .as_ref() + .and_then(|cache| cache.lock().get_data_column(block_root, column_index)) { metrics::inc_counter(&metrics::BEACON_DATA_COLUMNS_CACHE_HIT_COUNT); return Ok(Some(data_column)); @@ -2528,9 +2562,11 @@ impl, Cold: ItemStore> HotColdDB )? { Some(ref data_column_bytes) => { let data_column = Arc::new(DataColumnSidecar::from_ssz_bytes(data_column_bytes)?); - self.block_cache - .lock() - .put_data_column(*block_root, data_column.clone()); + self.block_cache.as_ref().inspect(|cache| { + cache + .lock() + .put_data_column(*block_root, data_column.clone()) + }); Ok(Some(data_column)) } None => Ok(None), @@ -3264,11 +3300,11 @@ impl, Cold: ItemStore> HotColdDB } // Remove deleted blobs from the cache. - let mut block_cache = self.block_cache.lock(); - for block_root in removed_block_roots { - block_cache.delete_blobs(&block_root); + if let Some(mut block_cache) = self.block_cache.as_ref().map(|cache| cache.lock()) { + for block_root in removed_block_roots { + block_cache.delete_blobs(&block_root); + } } - drop(block_cache); let new_blob_info = BlobInfo { oldest_blob_slot: Some(end_slot + 1), diff --git a/book/src/help_bn.md b/book/src/help_bn.md index ea02b39bee..eba6814863 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -22,7 +22,7 @@ Options: Data directory for the blobs database. --block-cache-size Specifies how many blocks the database should cache in memory - [default: 5] + [default: 0] --boot-nodes One or more comma-delimited base64-encoded ENR's to bootstrap the p2p network. Multiaddr is also supported. 
diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 0660073bbc..629c2e1e9a 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -1839,12 +1839,25 @@ fn slots_per_restore_point_flag() { .run_with_zero_port(); } +#[test] +fn block_cache_size_default() { + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| assert_eq!(config.store.block_cache_size, 0)); +} #[test] fn block_cache_size_flag() { CommandLineTest::new() .flag("block-cache-size", Some("4")) .run_with_zero_port() - .with_config(|config| assert_eq!(config.store.block_cache_size, new_non_zero_usize(4))); + .with_config(|config| assert_eq!(config.store.block_cache_size, 4)); +} +#[test] +fn block_cache_size_zero() { + CommandLineTest::new() + .flag("block-cache-size", Some("0")) + .run_with_zero_port() + .with_config(|config| assert_eq!(config.store.block_cache_size, 0)); } #[test] fn state_cache_size_default() { From 4111bcb39bb8edaacf3086c621bbc6a895c5433e Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 18 Sep 2025 17:10:23 +1000 Subject: [PATCH 48/81] Use scoped rayon pool for backfill chain segment processing (#7924) Part of #7866 - Continuation of #7921 In the above PR, we enabled rayon for batch KZG verification in chain segment processing. However, using the global rayon thread pool for backfill is likely to create resource contention with higher-priority beacon processor work. This PR introduces a dedicated low-priority rayon thread pool `LOW_PRIORITY_RAYON_POOL` and uses it for processing backfill chain segments. This prevents backfill KZG verification from using the global rayon thread pool and competing with high-priority beacon processor tasks for CPU resources. However, this PR by itself doesn't prevent CPU oversubscription because other tasks could still fill up the global rayon thread pool, and having an extra thread pool could make things worse. 
To address this we need the beacon processor to coordinate total CPU allocation across all tasks, which is covered in: - #7789 Co-Authored-By: Jimmy Chen Co-Authored-By: Eitan Seri- Levi Co-Authored-By: Eitan Seri-Levi --- Cargo.lock | 1 + beacon_node/beacon_processor/Cargo.toml | 1 + beacon_node/beacon_processor/src/lib.rs | 34 +++- .../beacon_processor/src/rayon_manager.rs | 27 +++ .../src/scheduler/work_reprocessing_queue.rs | 4 +- beacon_node/client/src/builder.rs | 2 + beacon_node/http_api/src/test_utils.rs | 2 + beacon_node/lighthouse_tracing/src/lib.rs | 2 + .../src/network_beacon_processor/mod.rs | 38 ++-- .../network_beacon_processor/sync_methods.rs | 189 +++++++++++------- .../src/network_beacon_processor/tests.rs | 36 +++- 11 files changed, 228 insertions(+), 108 deletions(-) create mode 100644 beacon_node/beacon_processor/src/rayon_manager.rs diff --git a/Cargo.lock b/Cargo.lock index ba6a4587b6..0e55918243 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -980,6 +980,7 @@ dependencies = [ "metrics", "num_cpus", "parking_lot 0.12.3", + "rayon", "serde", "slot_clock", "strum", diff --git a/beacon_node/beacon_processor/Cargo.toml b/beacon_node/beacon_processor/Cargo.toml index afd4660c9a..262badf7f9 100644 --- a/beacon_node/beacon_processor/Cargo.toml +++ b/beacon_node/beacon_processor/Cargo.toml @@ -12,6 +12,7 @@ logging = { workspace = true } metrics = { workspace = true } num_cpus = { workspace = true } parking_lot = { workspace = true } +rayon = { workspace = true } serde = { workspace = true } slot_clock = { workspace = true } strum = { workspace = true } diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 84723fb6a0..64aeb4ceaf 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -38,6 +38,7 @@ //! checks the queues to see if there are more parcels of work that can be spawned in a new worker //! task. 
+use crate::rayon_manager::RayonManager; use crate::work_reprocessing_queue::{ QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, ReprocessQueueMessage, }; @@ -47,6 +48,7 @@ use lighthouse_network::{MessageId, NetworkGlobals, PeerId}; use logging::TimeLatch; use logging::crit; use parking_lot::Mutex; +use rayon::ThreadPool; pub use scheduler::work_reprocessing_queue; use serde::{Deserialize, Serialize}; use slot_clock::SlotClock; @@ -74,6 +76,7 @@ use work_reprocessing_queue::{ }; mod metrics; +pub mod rayon_manager; pub mod scheduler; /// The maximum size of the channel for work events to the `BeaconProcessor`. @@ -603,7 +606,7 @@ pub enum Work { process_fn: BlockingFn, }, ChainSegment(AsyncFn), - ChainSegmentBackfill(AsyncFn), + ChainSegmentBackfill(BlockingFn), Status(BlockingFn), BlocksByRangeRequest(AsyncFn), BlocksByRootsRequest(AsyncFn), @@ -807,6 +810,7 @@ pub struct BeaconProcessor { pub network_globals: Arc>, pub executor: TaskExecutor, pub current_workers: usize, + pub rayon_manager: RayonManager, pub config: BeaconProcessorConfig, } @@ -1603,7 +1607,17 @@ impl BeaconProcessor { Work::BlocksByRangeRequest(work) | Work::BlocksByRootsRequest(work) => { task_spawner.spawn_async(work) } - Work::ChainSegmentBackfill(process_fn) => task_spawner.spawn_async(process_fn), + Work::ChainSegmentBackfill(process_fn) => { + if self.config.enable_backfill_rate_limiting { + task_spawner.spawn_blocking_with_rayon( + self.rayon_manager.low_priority_threadpool.clone(), + process_fn, + ) + } else { + // use the global rayon thread pool if backfill rate limiting is disabled. 
+ task_spawner.spawn_blocking(process_fn) + } + } Work::ApiRequestP0(process_fn) | Work::ApiRequestP1(process_fn) => match process_fn { BlockingOrAsync::Blocking(process_fn) => task_spawner.spawn_blocking(process_fn), BlockingOrAsync::Async(process_fn) => task_spawner.spawn_async(process_fn), @@ -1665,6 +1679,22 @@ impl TaskSpawner { WORKER_TASK_NAME, ) } + + /// Spawns a blocking task on a rayon thread pool, dropping the `SendOnDrop` after task completion. + fn spawn_blocking_with_rayon(self, thread_pool: Arc, task: F) + where + F: FnOnce() + Send + 'static, + { + self.executor.spawn_blocking( + move || { + thread_pool.install(|| { + task(); + }); + drop(self.send_idle_on_drop) + }, + WORKER_TASK_NAME, + ) + } } /// This struct will send a message on `self.tx` when it is dropped. An error will be logged diff --git a/beacon_node/beacon_processor/src/rayon_manager.rs b/beacon_node/beacon_processor/src/rayon_manager.rs new file mode 100644 index 0000000000..99fe32d5cc --- /dev/null +++ b/beacon_node/beacon_processor/src/rayon_manager.rs @@ -0,0 +1,27 @@ +use rayon::{ThreadPool, ThreadPoolBuilder}; +use std::sync::Arc; + +const DEFAULT_LOW_PRIORITY_DIVISOR: usize = 4; +const MINIMUM_LOW_PRIORITY_THREAD_COUNT: usize = 1; + +pub struct RayonManager { + /// Smaller rayon thread pool for lower-priority, compute-intensive tasks. + /// By default ~25% of CPUs or a minimum of 1 thread. 
+ pub low_priority_threadpool: Arc, +} + +impl Default for RayonManager { + fn default() -> Self { + let low_prio_threads = + (num_cpus::get() / DEFAULT_LOW_PRIORITY_DIVISOR).max(MINIMUM_LOW_PRIORITY_THREAD_COUNT); + let low_priority_threadpool = Arc::new( + ThreadPoolBuilder::new() + .num_threads(low_prio_threads) + .build() + .expect("failed to build low-priority rayon pool"), + ); + Self { + low_priority_threadpool, + } + } +} diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 3e755f0830..8c33cf5869 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -173,7 +173,7 @@ pub struct IgnoredRpcBlock { } /// A backfill batch work that has been queued for processing later. -pub struct QueuedBackfillBatch(pub AsyncFn); +pub struct QueuedBackfillBatch(pub BlockingFn); pub struct QueuedColumnReconstruction { pub block_root: Hash256, @@ -1084,7 +1084,7 @@ mod tests { // Now queue a backfill sync batch. 
work_reprocessing_tx .try_send(ReprocessQueueMessage::BackfillSync(QueuedBackfillBatch( - Box::pin(async {}), + Box::new(|| {}), ))) .unwrap(); tokio::task::yield_now().await; diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index d984d5fedc..87cdcc45ef 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -17,6 +17,7 @@ use beacon_chain::{ store::{HotColdDB, ItemStore, StoreConfig}, }; use beacon_chain::{Kzg, LightClientProducerEvent}; +use beacon_processor::rayon_manager::RayonManager; use beacon_processor::{BeaconProcessor, BeaconProcessorChannels}; use beacon_processor::{BeaconProcessorConfig, BeaconProcessorQueueLengths}; use environment::RuntimeContext; @@ -680,6 +681,7 @@ where executor: beacon_processor_context.executor.clone(), current_workers: 0, config: beacon_processor_config, + rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_channels.beacon_processor_rx, diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index fe9e0dff70..7be8960e69 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -5,6 +5,7 @@ use beacon_chain::{ }; use beacon_processor::{ BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig, BeaconProcessorQueueLengths, + rayon_manager::RayonManager, }; use directory::DEFAULT_ROOT_DIR; use eth2::{BeaconNodeHttpClient, Timeouts}; @@ -247,6 +248,7 @@ pub async fn create_api_server_with_config( executor: test_runtime.task_executor.clone(), current_workers: 0, config: beacon_processor_config, + rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_rx, diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index 60fda12cc2..18a9874252 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -26,6 +26,7 @@ pub const SPAN_PROCESS_RPC_BLOCK: &str = 
"process_rpc_block"; pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns"; pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment"; +pub const SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL: &str = "process_chain_segment_backfill"; /// Fork choice root spans pub const SPAN_RECOMPUTE_HEAD: &str = "recompute_head_at_slot"; @@ -61,6 +62,7 @@ pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, SPAN_PROCESS_CHAIN_SEGMENT, + SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_HANDLE_BLOCKS_BY_RANGE_REQUEST, SPAN_HANDLE_BLOBS_BY_RANGE_REQUEST, SPAN_HANDLE_DATA_COLUMNS_BY_RANGE_REQUEST, diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 691c06f268..85ccde1d59 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -6,9 +6,7 @@ use beacon_chain::data_column_verification::{GossipDataColumnError, observe_goss use beacon_chain::fetch_blobs::{ EngineGetBlobsOutput, FetchEngineBlobError, fetch_and_process_engine_blobs, }; -use beacon_chain::{ - AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, NotifyExecutionLayer, -}; +use beacon_chain::{AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError}; use beacon_processor::{ BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work, WorkEvent as BeaconWorkEvent, @@ -500,33 +498,23 @@ impl NetworkBeaconProcessor { process_id: ChainSegmentProcessId, blocks: Vec>, ) -> Result<(), Error> { - let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. 
}); debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process"); - let processor = self.clone(); - let process_fn = async move { - let notify_execution_layer = if processor - .network_globals - .sync_state - .read() - .is_syncing_finalized() - { - NotifyExecutionLayer::No - } else { - NotifyExecutionLayer::Yes - }; - processor - .process_chain_segment(process_id, blocks, notify_execution_layer) - .await; - }; - let process_fn = Box::pin(process_fn); // Back-sync batches are dispatched with a different `Work` variant so // they can be rate-limited. - let work = if is_backfill { - Work::ChainSegmentBackfill(process_fn) - } else { - Work::ChainSegment(process_fn) + let work = match process_id { + ChainSegmentProcessId::RangeBatchId(_, _) => { + let process_fn = async move { + processor.process_chain_segment(process_id, blocks).await; + }; + Work::ChainSegment(Box::pin(process_fn)) + } + ChainSegmentProcessId::BackSyncBatchId(_) => { + let process_fn = + move || processor.process_chain_segment_backfill(process_id, blocks); + Work::ChainSegmentBackfill(Box::new(process_fn)) + } }; self.try_send(BeaconWorkEvent { diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index edeed7e98c..b61a6e25c5 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -19,9 +19,10 @@ use beacon_processor::{ use beacon_processor::{Work, WorkEvent}; use lighthouse_network::PeerAction; use lighthouse_tracing::{ - SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK, - SPAN_PROCESS_RPC_CUSTODY_COLUMNS, + SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, + SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, }; +use logging::crit; use std::sync::Arc; use std::time::Duration; use store::KzgCommitment; @@ -434,27 +435,42 @@ impl 
NetworkBeaconProcessor { parent = None, level = "debug", skip_all, - fields(sync_type = ?sync_type, downloaded_blocks = downloaded_blocks.len()) + fields(process_id = ?process_id, downloaded_blocks = downloaded_blocks.len()) )] pub async fn process_chain_segment( &self, - sync_type: ChainSegmentProcessId, + process_id: ChainSegmentProcessId, downloaded_blocks: Vec>, - notify_execution_layer: NotifyExecutionLayer, ) { - let result = match sync_type { - // this a request from the range sync - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { - let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); - let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); - let sent_blocks = downloaded_blocks.len(); + let ChainSegmentProcessId::RangeBatchId(chain_id, epoch) = process_id else { + // This is a request from range sync, this should _never_ happen + crit!( + error = "process_chain_segment called on a variant other than RangeBatchId", + "Please notify the devs" + ); + return; + }; - match self - .process_blocks(downloaded_blocks.iter(), notify_execution_layer) - .await - { - (imported_blocks, Ok(_)) => { - debug!( + let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); + let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); + let sent_blocks = downloaded_blocks.len(); + let notify_execution_layer = if self + .network_globals + .sync_state + .read() + .is_syncing_finalized() + { + NotifyExecutionLayer::No + } else { + NotifyExecutionLayer::Yes + }; + + let result = match self + .process_blocks(downloaded_blocks.iter(), notify_execution_layer) + .await + { + (imported_blocks, Ok(_)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, chain = chain_id, @@ -462,13 +478,13 @@ impl NetworkBeaconProcessor { processed_blocks = sent_blocks, service= "sync", "Batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } - } - (imported_blocks, Err(e)) => { - debug!( + 
BatchProcessResult::Success { + sent_blocks, + imported_blocks, + } + } + (imported_blocks, Err(e)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, chain = chain_id, @@ -477,33 +493,61 @@ impl NetworkBeaconProcessor { error = %e.message, service = "sync", "Batch processing failed"); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks, - penalty, - }, - None => BatchProcessResult::NonFaultyFailure, - } - } + match e.peer_action { + Some(penalty) => BatchProcessResult::FaultyFailure { + imported_blocks, + penalty, + }, + None => BatchProcessResult::NonFaultyFailure, } } - // this a request from the Backfill sync - ChainSegmentProcessId::BackSyncBatchId(epoch) => { - let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); - let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); - let sent_blocks = downloaded_blocks.len(); - let n_blobs = downloaded_blocks - .iter() - .map(|wrapped| wrapped.n_blobs()) - .sum::(); - let n_data_columns = downloaded_blocks - .iter() - .map(|wrapped| wrapped.n_data_columns()) - .sum::(); + }; - match self.process_backfill_blocks(downloaded_blocks) { - (imported_blocks, Ok(_)) => { - debug!( + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: process_id, + result, + }); + } + + /// Attempt to import the chain segment (`blocks`) to the beacon chain, informing the sync + /// thread if more blocks are needed to process it. 
+ #[instrument( + name = SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, + parent = None, + level = "debug", + skip_all, + fields(downloaded_blocks = downloaded_blocks.len()) + )] + pub fn process_chain_segment_backfill( + &self, + process_id: ChainSegmentProcessId, + downloaded_blocks: Vec>, + ) { + let ChainSegmentProcessId::BackSyncBatchId(epoch) = process_id else { + // this a request from RangeSync, this should _never_ happen + crit!( + error = + "process_chain_segment_backfill called on a variant other than BackSyncBatchId", + "Please notify the devs" + ); + return; + }; + + let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); + let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); + let sent_blocks = downloaded_blocks.len(); + let n_blobs = downloaded_blocks + .iter() + .map(|wrapped| wrapped.n_blobs()) + .sum::(); + let n_data_columns = downloaded_blocks + .iter() + .map(|wrapped| wrapped.n_data_columns()) + .sum::(); + + let result = match self.process_backfill_blocks(downloaded_blocks) { + (imported_blocks, Ok(_)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, keep_execution_payload = !self.chain.store.get_config().prune_payloads, @@ -513,34 +557,35 @@ impl NetworkBeaconProcessor { processed_data_columns = n_data_columns, service= "sync", "Backfill batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } - } - (_, Err(e)) => { - debug!( - batch_epoch = %epoch, - first_block_slot = start_slot, - last_block_slot = end_slot, - processed_blobs = n_blobs, - error = %e.message, - service = "sync", - "Backfill batch processing failed" - ); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks: 0, - penalty, - }, - None => BatchProcessResult::NonFaultyFailure, - } - } + BatchProcessResult::Success { + sent_blocks, + imported_blocks, + } + } + (_, Err(e)) => { + debug!( + batch_epoch = %epoch, + first_block_slot = start_slot, + last_block_slot = 
end_slot, + processed_blobs = n_blobs, + error = %e.message, + service = "sync", + "Backfill batch processing failed" + ); + match e.peer_action { + Some(penalty) => BatchProcessResult::FaultyFailure { + imported_blocks: 0, + penalty, + }, + None => BatchProcessResult::NonFaultyFailure, } } }; - self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result }); + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: process_id, + result, + }); } /// Helper function to process blocks batches which only consumes the chain and blocks to process. diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index d3a93d4863..99410bc5e5 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -17,6 +17,7 @@ use beacon_chain::test_utils::{ test_spec, }; use beacon_chain::{BeaconChain, WhenSlotSkipped}; +use beacon_processor::rayon_manager::RayonManager; use beacon_processor::{work_reprocessing_queue::*, *}; use gossipsub::MessageAcceptance; use itertools::Itertools; @@ -266,6 +267,7 @@ impl TestRig { executor, current_workers: 0, config: beacon_processor_config, + rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_rx, @@ -458,10 +460,10 @@ impl TestRig { .unwrap(); } - pub fn enqueue_backfill_batch(&self) { + pub fn enqueue_backfill_batch(&self, epoch: Epoch) { self.network_beacon_processor .send_chain_segment( - ChainSegmentProcessId::BackSyncBatchId(Epoch::default()), + ChainSegmentProcessId::BackSyncBatchId(epoch), Vec::default(), ) .unwrap(); @@ -606,7 +608,7 @@ impl TestRig { } pub async fn assert_event_journal(&mut self, expected: &[&str]) { - self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT) + self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT, false, false) .await } @@ -623,6 +625,8 @@ impl TestRig { .chain(std::iter::once(NOTHING_TO_DO)) 
.collect::>(), timeout, + false, + false, ) .await } @@ -666,11 +670,21 @@ impl TestRig { &mut self, expected: &[&str], timeout: Duration, + ignore_worker_freed: bool, + ignore_nothing_to_do: bool, ) { let mut events = Vec::with_capacity(expected.len()); let drain_future = async { while let Some(event) = self.work_journal_rx.recv().await { + if event == WORKER_FREED && ignore_worker_freed { + continue; + } + + if event == NOTHING_TO_DO && ignore_nothing_to_do { + continue; + } + events.push(event); // Break as soon as we collect the desired number of events. @@ -1384,6 +1398,8 @@ async fn requeue_unknown_block_gossip_attestation_without_import() { NOTHING_TO_DO, ], Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY, + false, + false, ) .await; @@ -1424,6 +1440,8 @@ async fn requeue_unknown_block_gossip_aggregated_attestation_without_import() { NOTHING_TO_DO, ], Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY, + false, + false, ) .await; @@ -1558,8 +1576,8 @@ async fn test_backfill_sync_processing() { // (not straight forward to manipulate `TestingSlotClock` due to cloning of `SlotClock` in code) // and makes the test very slow, hence timing calculation is unit tested separately in // `work_reprocessing_queue`. - for _ in 0..1 { - rig.enqueue_backfill_batch(); + for i in 0..1 { + rig.enqueue_backfill_batch(Epoch::new(i)); // ensure queued batch is not processed until later rig.assert_no_events_for(Duration::from_millis(100)).await; // A new batch should be processed within a slot. 
@@ -1570,6 +1588,8 @@ async fn test_backfill_sync_processing() { NOTHING_TO_DO, ], rig.chain.slot_clock.slot_duration(), + false, + false, ) .await; } @@ -1590,8 +1610,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { ) .await; - for _ in 0..3 { - rig.enqueue_backfill_batch(); + for i in 0..3 { + rig.enqueue_backfill_batch(Epoch::new(i)); } // ensure all batches are processed @@ -1602,6 +1622,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { WorkType::ChainSegmentBackfill.into(), ], Duration::from_millis(100), + true, + true, ) .await; } From 78d330e4b7e2b76ab503cd88f4a365a6d7a0bcf0 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 19 Sep 2025 17:01:13 +1000 Subject: [PATCH 49/81] Consolidate `reqresp_pre_import_cache` into `data_availability_checker` (#8045) This PR consolidates the `reqresp_pre_import_cache` into the `data_availability_checker` for the following reasons: - the `reqresp_pre_import_cache` suffers from the same TOCTOU bug we had with `data_availability_checker` earlier, and leads to unbounded memory leak, which we have observed over the last 6 months on some nodes. - the `reqresp_pre_import_cache` is no longer necessary, because we now hold blocks in the `data_availability_checker` for longer since (#7961), and recent blocks can be served from the DA checker. This PR also maintains the following functionalities - Serving pre-executed blocks over RPC, and they're now served from the `data_availability_checker` instead. - Using the cache for de-duplicating lookup requests. 
Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- beacon_node/beacon_chain/src/beacon_chain.rs | 144 ++++-------- beacon_node/beacon_chain/src/builder.rs | 1 - .../src/data_availability_checker.rs | 38 +++- .../overflow_lru_cache.rs | 208 +++++++++++++----- .../state_lru_cache.rs | 10 - beacon_node/beacon_chain/src/metrics.rs | 12 - .../gossip_methods.rs | 5 +- .../network_beacon_processor/sync_methods.rs | 5 +- beacon_node/network/src/sync/tests/lookups.rs | 24 +- 9 files changed, 239 insertions(+), 208 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 084a68bfea..ef3c2f52e0 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -340,10 +340,6 @@ pub enum BlockProcessStatus { ExecutionValidated(Arc>), } -pub struct BeaconChainMetrics { - pub reqresp_pre_import_cache_len: usize, -} - pub type LightClientProducerEvent = (Hash256, Slot, SyncAggregate); pub type BeaconForkChoice = ForkChoice< @@ -363,9 +359,6 @@ pub type BeaconStore = Arc< >, >; -/// Cache gossip verified blocks to serve over ReqResp before they are imported -type ReqRespPreImportCache = HashMap>>; - /// Represents the "Beacon Chain" component of Ethereum 2.0. Allows import of blocks and block /// operations and chooses a canonical head. pub struct BeaconChain { @@ -462,8 +455,6 @@ pub struct BeaconChain { pub(crate) attester_cache: Arc, /// A cache used when producing attestations whilst the head block is still being imported. pub early_attester_cache: EarlyAttesterCache, - /// Cache gossip verified blocks to serve over ReqResp before they are imported - pub reqresp_pre_import_cache: Arc>>, /// A cache used to keep track of various block timings. pub block_times_cache: Arc>, /// A cache used to track pre-finalization block roots for quick rejection. @@ -1289,18 +1280,8 @@ impl BeaconChain { /// chain. 
Used by sync to learn the status of a block and prevent repeated downloads / /// processing attempts. pub fn get_block_process_status(&self, block_root: &Hash256) -> BlockProcessStatus { - if let Some(block) = self - .data_availability_checker - .get_execution_valid_block(block_root) - { - return BlockProcessStatus::ExecutionValidated(block); - } - - if let Some(block) = self.reqresp_pre_import_cache.read().get(block_root) { - // A block is on the `reqresp_pre_import_cache` but NOT in the - // `data_availability_checker` only if it is actively processing. We can expect a future - // event with the result of processing - return BlockProcessStatus::NotValidated(block.clone()); + if let Some(cached_block) = self.data_availability_checker.get_cached_block(block_root) { + return cached_block; } BlockProcessStatus::Unknown @@ -3054,8 +3035,7 @@ impl BeaconChain { self.emit_sse_blob_sidecar_events(&block_root, std::iter::once(blob.as_blob())); - let r = self.check_gossip_blob_availability_and_import(blob).await; - self.remove_notified(&block_root, r) + self.check_gossip_blob_availability_and_import(blob).await } /// Cache the data columns in the processing cache, process it, then evict it from the cache if it was @@ -3092,15 +3072,13 @@ impl BeaconChain { data_columns.iter().map(|column| column.as_data_column()), ); - let r = self - .check_gossip_data_columns_availability_and_import( - slot, - block_root, - data_columns, - publish_fn, - ) - .await; - self.remove_notified(&block_root, r) + self.check_gossip_data_columns_availability_and_import( + slot, + block_root, + data_columns, + publish_fn, + ) + .await } /// Cache the blobs in the processing cache, process it, then evict it from the cache if it was @@ -3139,10 +3117,8 @@ impl BeaconChain { self.emit_sse_blob_sidecar_events(&block_root, blobs.iter().flatten().map(Arc::as_ref)); - let r = self - .check_rpc_blob_availability_and_import(slot, block_root, blobs) - .await; - self.remove_notified(&block_root, r) + 
self.check_rpc_blob_availability_and_import(slot, block_root, blobs) + .await } /// Process blobs retrieved from the EL and returns the `AvailabilityProcessingStatus`. @@ -3174,10 +3150,8 @@ impl BeaconChain { } } - let r = self - .check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output) - .await; - self.remove_notified(&block_root, r) + self.check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output) + .await } fn emit_sse_blob_sidecar_events<'a, I>(self: &Arc, block_root: &Hash256, blobs_iter: I) @@ -3270,10 +3244,8 @@ impl BeaconChain { custody_columns.iter().map(|column| column.as_ref()), ); - let r = self - .check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns) - .await; - self.remove_notified(&block_root, r) + self.check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns) + .await } pub async fn reconstruct_data_columns( @@ -3320,10 +3292,8 @@ impl BeaconChain { return Ok(None); }; - let r = self - .process_availability(slot, availability, || Ok(())) - .await; - self.remove_notified(&block_root, r) + self.process_availability(slot, availability, || Ok(())) + .await .map(|availability_processing_status| { Some((availability_processing_status, data_columns_to_publish)) }) @@ -3340,46 +3310,6 @@ impl BeaconChain { } } - /// Remove any block components from the *processing cache* if we no longer require them. If the - /// block was imported full or erred, we no longer require them. - fn remove_notified( - &self, - block_root: &Hash256, - r: Result, - ) -> Result { - let has_missing_components = - matches!(r, Ok(AvailabilityProcessingStatus::MissingComponents(_, _))); - if !has_missing_components { - self.reqresp_pre_import_cache.write().remove(block_root); - } - r - } - - /// Wraps `process_block` in logic to cache the block's commitments in the processing cache - /// and evict if the block was imported or errored. 
- pub async fn process_block_with_early_caching>( - self: &Arc, - block_root: Hash256, - unverified_block: B, - block_source: BlockImportSource, - notify_execution_layer: NotifyExecutionLayer, - ) -> Result { - self.reqresp_pre_import_cache - .write() - .insert(block_root, unverified_block.block_cloned()); - - let r = self - .process_block( - block_root, - unverified_block, - notify_execution_layer, - block_source, - || Ok(()), - ) - .await; - self.remove_notified(&block_root, r) - } - /// Check for known and configured invalid block roots before processing. pub fn check_invalid_block_roots(&self, block_root: Hash256) -> Result<(), BlockError> { if self.config.invalid_block_roots.contains(&block_root) { @@ -3411,12 +3341,6 @@ impl BeaconChain { block_source: BlockImportSource, publish_fn: impl FnOnce() -> Result<(), BlockError>, ) -> Result { - // Start the Prometheus timer. - let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); - - // Increment the Prometheus counter for block processing requests. - metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS); - let block_slot = unverified_block.block().slot(); // Set observed time if not already set. Usually this should be set by gossip or RPC, @@ -3431,6 +3355,15 @@ impl BeaconChain { ); } + self.data_availability_checker + .put_pre_execution_block(block_root, unverified_block.block_cloned())?; + + // Start the Prometheus timer. + let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); + + // Increment the Prometheus counter for block processing requests. + metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS); + // A small closure to group the verification and import errors. 
let chain = self.clone(); let import_block = async move { @@ -3448,7 +3381,18 @@ impl BeaconChain { .set_time_consensus_verified(block_root, block_slot, timestamp) } - let executed_block = chain.into_executed_block(execution_pending).await?; + let executed_block = chain + .into_executed_block(execution_pending) + .await + .inspect_err(|_| { + // If the block fails execution for whatever reason (e.g. engine offline), + // and we keep it in the cache, then the node will NOT perform lookup and + // reprocess this block until the block is evicted from DA checker, causing the + // chain to get stuck temporarily if the block is canonical. Therefore we remove + // it from the cache if execution fails. + self.data_availability_checker + .remove_block_on_execution_error(&block_root); + })?; // Record the *additional* time it took to wait for execution layer verification. if let Some(timestamp) = self.slot_clock.now_duration() { @@ -3574,9 +3518,7 @@ impl BeaconChain { block: AvailabilityPendingExecutedBlock, ) -> Result { let slot = block.block.slot(); - let availability = self - .data_availability_checker - .put_pending_executed_block(block)?; + let availability = self.data_availability_checker.put_executed_block(block)?; self.process_availability(slot, availability, || Ok(())) .await } @@ -7156,12 +7098,6 @@ impl BeaconChain { ) } - pub fn metrics(&self) -> BeaconChainMetrics { - BeaconChainMetrics { - reqresp_pre_import_cache_len: self.reqresp_pre_import_cache.read().len(), - } - } - pub(crate) fn get_blobs_or_columns_store_op( &self, block_root: Hash256, diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 35432632cc..5564c7916f 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -998,7 +998,6 @@ where validator_pubkey_cache: RwLock::new(validator_pubkey_cache), attester_cache: <_>::default(), early_attester_cache: <_>::default(), - reqresp_pre_import_cache: <_>::default(), 
light_client_server_cache: LightClientServerCache::new(), light_client_server_tx: self.light_client_server_tx, shutdown_sender: self diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 88cd8f3aab..a0ad1c2112 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -7,7 +7,9 @@ use crate::block_verification_types::{ use crate::data_availability_checker::overflow_lru_cache::{ DataAvailabilityCheckerInner, ReconstructColumnsDecision, }; -use crate::{BeaconChain, BeaconChainTypes, BeaconStore, CustodyContext, metrics}; +use crate::{ + BeaconChain, BeaconChainTypes, BeaconStore, BlockProcessStatus, CustodyContext, metrics, +}; use kzg::Kzg; use slot_clock::SlotClock; use std::fmt; @@ -27,6 +29,7 @@ mod error; mod overflow_lru_cache; mod state_lru_cache; +use crate::data_availability_checker::error::Error; use crate::data_column_verification::{ CustodyDataColumn, GossipVerifiedDataColumn, KzgVerifiedCustodyDataColumn, KzgVerifiedDataColumn, verify_kzg_for_data_column_list, @@ -144,14 +147,12 @@ impl DataAvailabilityChecker { &self.custody_context } - /// Checks if the block root is currenlty in the availability cache awaiting import because + /// Checks if the block root is currently in the availability cache awaiting import because /// of missing components. - pub fn get_execution_valid_block( - &self, - block_root: &Hash256, - ) -> Option>> { - self.availability_cache - .get_execution_valid_block(block_root) + /// + /// Returns the cache block wrapped in a `BlockProcessStatus` enum if it exists. + pub fn get_cached_block(&self, block_root: &Hash256) -> Option> { + self.availability_cache.get_cached_block(block_root) } /// Return the set of cached blob indexes for `block_root`. 
Returns None if there is no block @@ -340,12 +341,29 @@ impl DataAvailabilityChecker { /// Check if we have all the blobs for a block. Returns `Availability` which has information /// about whether all components have been received or more are required. - pub fn put_pending_executed_block( + pub fn put_executed_block( &self, executed_block: AvailabilityPendingExecutedBlock, ) -> Result, AvailabilityCheckError> { + self.availability_cache.put_executed_block(executed_block) + } + + /// Inserts a pre-execution block into the cache. + /// This does NOT override an existing executed block. + pub fn put_pre_execution_block( + &self, + block_root: Hash256, + block: Arc>, + ) -> Result<(), Error> { self.availability_cache - .put_pending_executed_block(executed_block) + .put_pre_execution_block(block_root, block) + } + + /// Removes a pre-execution block from the cache. + /// This does NOT remove an existing executed block. + pub fn remove_block_on_execution_error(&self, block_root: &Hash256) { + self.availability_cache + .remove_pre_execution_block(block_root); } /// Verifies kzg commitments for an RpcBlock, returns a `MaybeAvailableBlock` that may diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 6afb680ddb..bb44009662 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -1,6 +1,5 @@ use super::AvailableBlockData; use super::state_lru_cache::{DietAvailabilityPendingExecutedBlock, StateLRUCache}; -use crate::BeaconChainTypes; use crate::CustodyContext; use crate::beacon_chain::BeaconStore; use crate::blob_verification::KzgVerifiedBlob; @@ -9,6 +8,7 @@ use crate::block_verification_types::{ }; use crate::data_availability_checker::{Availability, AvailabilityCheckError}; use 
crate::data_column_verification::KzgVerifiedCustodyDataColumn; +use crate::{BeaconChainTypes, BlockProcessStatus}; use lighthouse_tracing::SPAN_PENDING_COMPONENTS; use lru::LruCache; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; @@ -16,12 +16,46 @@ use std::cmp::Ordering; use std::num::NonZeroUsize; use std::sync::Arc; use tracing::{Span, debug, debug_span}; +use types::beacon_block_body::KzgCommitments; use types::blob_sidecar::BlobIdentifier; use types::{ BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeFixedVector, RuntimeVariableList, SignedBeaconBlock, }; +#[derive(Clone)] +pub enum CachedBlock { + PreExecution(Arc>), + Executed(Box>), +} + +impl CachedBlock { + pub fn get_commitments(&self) -> KzgCommitments { + let block = self.as_block(); + block + .message() + .body() + .blob_kzg_commitments() + .cloned() + .unwrap_or_default() + } + + fn as_block(&self) -> &SignedBeaconBlock { + match self { + CachedBlock::PreExecution(b) => b, + CachedBlock::Executed(b) => b.as_block(), + } + } + + pub fn num_blobs_expected(&self) -> usize { + self.as_block() + .message() + .body() + .blob_kzg_commitments() + .map_or(0, |commitments| commitments.len()) + } +} + /// This represents the components of a partially available block /// /// The blobs are all gossip and kzg verified. @@ -39,22 +73,25 @@ pub struct PendingComponents { pub block_root: Hash256, pub verified_blobs: RuntimeFixedVector>>, pub verified_data_columns: Vec>, - pub executed_block: Option>, + pub block: Option>, pub reconstruction_started: bool, span: Span, } impl PendingComponents { - /// Returns an immutable reference to the cached block. - pub fn get_cached_block(&self) -> &Option> { - &self.executed_block - } - /// Returns an immutable reference to the fixed vector of cached blobs. 
pub fn get_cached_blobs(&self) -> &RuntimeFixedVector>> { &self.verified_blobs } + #[cfg(test)] + fn get_diet_block(&self) -> Option<&DietAvailabilityPendingExecutedBlock> { + self.block.as_ref().and_then(|block| match block { + CachedBlock::Executed(block) => Some(block.as_ref()), + _ => None, + }) + } + /// Returns an immutable reference to the cached data column. pub fn get_cached_data_column( &self, @@ -66,11 +103,6 @@ impl PendingComponents { .map(|d| d.clone_arc()) } - /// Returns a mutable reference to the cached block. - pub fn get_cached_block_mut(&mut self) -> &mut Option> { - &mut self.executed_block - } - /// Returns a mutable reference to the fixed vector of cached blobs. pub fn get_cached_blobs_mut(&mut self) -> &mut RuntimeFixedVector>> { &mut self.verified_blobs @@ -96,9 +128,17 @@ impl PendingComponents { .collect() } - /// Inserts a block into the cache. - pub fn insert_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { - *self.get_cached_block_mut() = Some(block) + /// Inserts an executed block into the cache. + pub fn insert_executed_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { + self.block = Some(CachedBlock::Executed(Box::new(block))) + } + + /// Inserts a pre-execution block into the cache. + /// This does NOT override an existing executed block. + pub fn insert_pre_execution_block(&mut self, block: Arc>) { + if self.block.is_none() { + self.block = Some(CachedBlock::PreExecution(block)) + } } /// Inserts a blob at a specific index in the cache. @@ -128,7 +168,7 @@ impl PendingComponents { /// 1. The blob entry at the index is empty and no block exists, or /// 2. The block exists and its commitment matches the blob's commitment. 
pub fn merge_single_blob(&mut self, index: usize, blob: KzgVerifiedBlob) { - if let Some(cached_block) = self.get_cached_block() { + if let Some(cached_block) = &self.block { let block_commitment_opt = cached_block.get_commitments().get(index).copied(); if let Some(block_commitment) = block_commitment_opt && block_commitment == *blob.get_commitment() @@ -158,7 +198,7 @@ impl PendingComponents { /// /// Blobs that don't match the new block's commitments are evicted. pub fn merge_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { - self.insert_block(block); + self.insert_executed_block(block); let reinsert = self.get_cached_blobs_mut().take(); self.merge_blobs(reinsert); } @@ -180,7 +220,7 @@ impl PendingComponents { &Span, ) -> Result, AvailabilityCheckError>, { - let Some(block) = &self.executed_block else { + let Some(CachedBlock::Executed(block)) = &self.block else { // Block not available yet return Ok(None); }; @@ -267,7 +307,7 @@ impl PendingComponents { block, import_data, payload_verification_outcome, - } = recover(block.clone(), &self.span)?; + } = recover(*block.clone(), &self.span)?; let available_block = AvailableBlock { block_root: self.block_root, @@ -295,7 +335,7 @@ impl PendingComponents { block_root, verified_blobs: RuntimeFixedVector::new(vec![None; max_len]), verified_data_columns: vec![], - executed_block: None, + block: None, reconstruction_started: false, span, } @@ -307,9 +347,9 @@ impl PendingComponents { /// - The first data column /// Otherwise, returns None pub fn epoch(&self) -> Option { - // Get epoch from cached executed block - if let Some(executed_block) = &self.executed_block { - return Some(executed_block.as_block().epoch()); + // Get epoch from cached block + if let Some(block) = &self.block { + return Some(block.as_block().epoch()); } // Or, get epoch from first available blob @@ -326,7 +366,7 @@ impl PendingComponents { } pub fn status_str(&self, num_expected_columns_opt: Option) -> String { - let block_count = if 
self.executed_block.is_some() { 1 } else { 0 }; + let block_count = if self.block.is_some() { 1 } else { 0 }; if let Some(num_expected_columns) = num_expected_columns_opt { format!( "block {} data_columns {}/{}", block_count, self.verified_data_columns.len(), num_expected_columns ) } else { - let num_expected_blobs = if let Some(block) = self.get_cached_block() { + let num_expected_blobs = if let Some(block) = &self.block { &block.num_blobs_expected().to_string() } else { "?" }; @@ -387,18 +427,17 @@ impl DataAvailabilityCheckerInner { } /// Returns true if the block root is known, without altering the LRU ordering - pub fn get_execution_valid_block( - &self, - block_root: &Hash256, - ) -> Option>> { + pub fn get_cached_block(&self, block_root: &Hash256) -> Option> { self.critical .read() .peek(block_root) .and_then(|pending_components| { - pending_components - .executed_block - .as_ref() - .map(|block| block.block_cloned()) + pending_components.block.as_ref().map(|block| match block { + CachedBlock::PreExecution(b) => BlockProcessStatus::NotValidated(b.clone()), + CachedBlock::Executed(b) => { + BlockProcessStatus::ExecutionValidated(b.block_cloned()) + } + }) }) } @@ -647,9 +686,46 @@ impl DataAvailabilityCheckerInner { } } + /// Inserts a pre-execution block into the cache. + /// - This does NOT trigger the availability check as the block still needs to be executed. + /// - This does NOT override an existing cached block to avoid overwriting an executed block. 
+ pub fn put_pre_execution_block( + &self, + block_root: Hash256, + block: Arc>, + ) -> Result<(), AvailabilityCheckError> { + let epoch = block.epoch(); + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.insert_pre_execution_block(block); + Ok(()) + })?; + + let num_expected_columns_opt = self.get_num_expected_columns(epoch); + + pending_components.span.in_scope(|| { + debug!( + component = "pre execution block", + status = pending_components.status_str(num_expected_columns_opt), + "Component added to data availability checker" + ); + }); + + Ok(()) + } + + /// Removes a pre-execution block from the cache. + /// This does NOT remove an existing executed block. + pub fn remove_pre_execution_block(&self, block_root: &Hash256) { + // The read lock is immediately dropped so we can safely remove the block from the cache. + if let Some(BlockProcessStatus::NotValidated(_)) = self.get_cached_block(block_root) { + self.critical.write().pop(block_root); + } + } + /// Check if we have all the blobs for a block. If we do, return the Availability variant that /// triggers import of the block. 
- pub fn put_pending_executed_block( + pub fn put_executed_block( &self, executed_block: AvailabilityPendingExecutedBlock, ) -> Result, AvailabilityCheckError> { @@ -667,14 +743,7 @@ impl DataAvailabilityCheckerInner { Ok(()) })?; - let num_expected_columns_opt = if self.spec.is_peer_das_enabled_for_epoch(epoch) { - let num_of_column_samples = self - .custody_context - .num_of_data_columns_to_sample(epoch, &self.spec); - Some(num_of_column_samples) - } else { - None - }; + let num_expected_columns_opt = self.get_num_expected_columns(epoch); pending_components.span.in_scope(|| { debug!( @@ -691,6 +760,17 @@ impl DataAvailabilityCheckerInner { ) } + fn get_num_expected_columns(&self, epoch: Epoch) -> Option { + if self.spec.is_peer_das_enabled_for_epoch(epoch) { + let num_of_column_samples = self + .custody_context + .num_of_data_columns_to_sample(epoch, &self.spec); + Some(num_of_column_samples) + } else { + None + } + } + /// maintain the cache pub fn do_maintenance(&self, cutoff_epoch: Epoch) -> Result<(), AvailabilityCheckError> { // clean up any lingering states in the state cache @@ -964,7 +1044,7 @@ mod test { ); assert!(cache.critical.read().is_empty(), "cache should be empty"); let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); if blobs_expected == 0 { assert!( @@ -1031,7 +1111,7 @@ mod test { ); } let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); assert!( matches!(availability, Availability::Available(_)), @@ -1093,7 +1173,7 @@ mod test { // put the block in the cache let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); // grab the diet block from the cache for later testing @@ -1101,12 +1181,7 @@ mod test { .critical .read() .peek(&block_root) - .map(|pending_components| { - pending_components - 
.executed_block - .clone() - .expect("should exist") - }) + .and_then(|pending_components| pending_components.get_diet_block().cloned()) .expect("should exist"); pushed_diet_blocks.push_back(diet_block); @@ -1267,7 +1342,7 @@ mod pending_components_tests { } pub fn assert_cache_consistent(cache: PendingComponents, max_len: usize) { - if let Some(cached_block) = cache.get_cached_block() { + if let Some(cached_block) = &cache.block { let cached_block_commitments = cached_block.get_commitments(); for index in 0..max_len { let block_commitment = cached_block_commitments.get(index).copied(); @@ -1373,4 +1448,33 @@ mod pending_components_tests { assert_cache_consistent(cache, max_len); } + + #[test] + fn should_not_insert_pre_execution_block_if_executed_block_exists() { + let (pre_execution_block, blobs, random_blobs, max_len) = pre_setup(); + let (executed_block, _blobs, _random_blobs) = + setup_pending_components(pre_execution_block.clone(), blobs, random_blobs); + + let block_root = pre_execution_block.canonical_root(); + let mut pending_component = >::empty(block_root, max_len); + + let pre_execution_block = Arc::new(pre_execution_block); + pending_component.insert_pre_execution_block(pre_execution_block.clone()); + assert!( + matches!(pending_component.block, Some(CachedBlock::PreExecution(_))), + "pre execution block inserted" + ); + + pending_component.insert_executed_block(executed_block); + assert!( + matches!(pending_component.block, Some(CachedBlock::Executed(_))), + "executed block inserted" + ); + + pending_component.insert_pre_execution_block(pre_execution_block); + assert!( + matches!(pending_component.block, Some(CachedBlock::Executed(_))), + "executed block should remain" + ); + } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs index 57c236efcf..24f9237e3c 100644 --- 
a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs @@ -10,7 +10,6 @@ use state_processing::BlockReplayer; use std::sync::Arc; use store::OnDiskConsensusContext; use tracing::{Span, debug_span, instrument}; -use types::beacon_block_body::KzgCommitments; use types::{BeaconState, BlindedPayload, ChainSpec, Epoch, EthSpec, Hash256, SignedBeaconBlock}; /// This mirrors everything in the `AvailabilityPendingExecutedBlock`, except @@ -43,15 +42,6 @@ impl DietAvailabilityPendingExecutedBlock { .map_or(0, |commitments| commitments.len()) } - pub fn get_commitments(&self) -> KzgCommitments { - self.as_block() - .message() - .body() - .blob_kzg_commitments() - .cloned() - .unwrap_or_default() - } - /// Returns the epoch corresponding to `self.slot()`. pub fn epoch(&self) -> Epoch { self.block.slot().epoch(E::slots_per_epoch()) diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs index 3da3cf163a..0d34ffdcd1 100644 --- a/beacon_node/beacon_chain/src/metrics.rs +++ b/beacon_node/beacon_chain/src/metrics.rs @@ -458,12 +458,6 @@ pub static BEACON_EARLY_ATTESTER_CACHE_HITS: LazyLock> = Lazy ) }); -pub static BEACON_REQRESP_PRE_IMPORT_CACHE_SIZE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "beacon_reqresp_pre_import_cache_size", - "Current count of items of the reqresp pre import cache", - ) -}); pub static BEACON_REQRESP_PRE_IMPORT_CACHE_HITS: LazyLock> = LazyLock::new(|| { try_create_int_counter( @@ -1965,7 +1959,6 @@ pub fn scrape_for_metrics(beacon_chain: &BeaconChain) { } let attestation_stats = beacon_chain.op_pool.attestation_stats(); - let chain_metrics = beacon_chain.metrics(); // Kept duplicated for backwards compatibility set_gauge_by_usize( @@ -1973,11 +1966,6 @@ pub fn scrape_for_metrics(beacon_chain: &BeaconChain) { beacon_chain.store.state_cache_len(), ); - set_gauge_by_usize( - 
&BEACON_REQRESP_PRE_IMPORT_CACHE_SIZE, - chain_metrics.reqresp_pre_import_cache_len, - ); - let da_checker_metrics = beacon_chain.data_availability_checker.metrics(); set_gauge_by_usize( &DATA_AVAILABILITY_OVERFLOW_MEMORY_BLOCK_CACHE_SIZE, diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index b3d717142f..5fc94c2958 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1500,11 +1500,12 @@ impl NetworkBeaconProcessor { let result = self .chain - .process_block_with_early_caching( + .process_block( block_root, verified_block, - BlockImportSource::Gossip, NotifyExecutionLayer::Yes, + BlockImportSource::Gossip, + || Ok(()), ) .await; register_process_result_metrics(&result, metrics::BlockSource::Gossip, "block"); diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index b61a6e25c5..f139724702 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -168,11 +168,12 @@ impl NetworkBeaconProcessor { let signed_beacon_block = block.block_cloned(); let result = self .chain - .process_block_with_early_caching( + .process_block( block_root, block, - BlockImportSource::Lookup, NotifyExecutionLayer::Yes, + BlockImportSource::Lookup, + || Ok(()), ) .await; register_process_result_metrics(&result, metrics::BlockSource::Rpc, "block"); diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 2edcd12f01..27968a0635 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -1079,7 +1079,7 @@ impl TestRig { .harness .chain .data_availability_checker - 
.put_pending_executed_block(executed_block) + .put_executed_block(executed_block) .unwrap() { Availability::Available(_) => panic!("block removed from da_checker, available"), @@ -1109,20 +1109,19 @@ impl TestRig { }; } - fn insert_block_to_processing_cache(&mut self, block: Arc>) { + fn insert_block_to_availability_cache(&mut self, block: Arc>) { self.harness .chain - .reqresp_pre_import_cache - .write() - .insert(block.canonical_root(), block); + .data_availability_checker + .put_pre_execution_block(block.canonical_root(), block) + .unwrap(); } fn simulate_block_gossip_processing_becomes_invalid(&mut self, block_root: Hash256) { self.harness .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); + .data_availability_checker + .remove_block_on_execution_error(&block_root); self.send_sync_message(SyncMessage::GossipBlockProcessResult { block_root, @@ -1135,11 +1134,6 @@ impl TestRig { block: Arc>, ) { let block_root = block.canonical_root(); - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); self.insert_block_to_da_checker(block); @@ -1841,7 +1835,7 @@ fn block_in_processing_cache_becomes_invalid() { let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); let block_root = block.canonical_root(); let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); + r.insert_block_to_availability_cache(block.clone().into()); r.trigger_unknown_block_from_attestation(block_root, peer_id); // Should trigger blob request let id = r.expect_blob_lookup_request(block_root); @@ -1867,7 +1861,7 @@ fn block_in_processing_cache_becomes_valid_imported() { let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); let block_root = block.canonical_root(); let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); + r.insert_block_to_availability_cache(block.clone().into()); r.trigger_unknown_block_from_attestation(block_root, peer_id); // Should 
trigger blob request let id = r.expect_blob_lookup_request(block_root); From 4efe47b3c3ccf5bfe88cd76b6abe1ce7b080e0d0 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 19 Sep 2025 17:01:16 +1000 Subject: [PATCH 50/81] Rename `--subscribe-all-data-column-subnets` to `--supernode` and make it visible in help (#8083) Rename `--subscribe-all-data-column-subnets` to `--supernode` as it's now been officially accepted in the spec. Also make it visible in help in preparation for the fusaka release. https://github.com/ethereum/consensus-specs/blob/dev/specs/fulu/p2p-interface.md#supernodes Co-Authored-By: Jimmy Chen --- beacon_node/src/cli.rs | 15 ++++++++------- beacon_node/src/config.rs | 2 +- book/src/help_bn.md | 7 +++++++ lighthouse/tests/beacon_node.rs | 13 +++++++++++++ 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 238907adce..569d1e4ad8 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -47,16 +47,17 @@ pub fn cli_app() -> Command { * Network parameters. */ .arg( - Arg::new("subscribe-all-data-column-subnets") - .long("subscribe-all-data-column-subnets") + Arg::new("supernode") + .long("supernode") + .alias("subscribe-all-data-column-subnets") .action(ArgAction::SetTrue) .help_heading(FLAG_HEADER) - .help("Subscribe to all data column subnets and participate in data custody for \ - all columns. This will also advertise the beacon node as being long-lived \ - subscribed to all data column subnets. \ - NOTE: this is an experimental flag and may change any time without notice!") + .help("Run as a voluntary supernode. This node will subscribe to all data column \ + subnets, custody all data columns, and perform reconstruction and cross-seeding. 
\ + This requires significantly more bandwidth, storage, and computation requirements but \ + the node will have direct access to all blobs via the beacon API and it \ + helps network resilience by serving all data columns to syncing peers.") .display_order(0) - .hide(true) ) .arg( // TODO(das): remove this before PeerDAS release diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 3681556d11..230350fade 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -1162,7 +1162,7 @@ pub fn set_network_config( config.network_dir = data_dir.join(DEFAULT_NETWORK_DIR); }; - if parse_flag(cli_args, "subscribe-all-data-column-subnets") { + if parse_flag(cli_args, "supernode") { config.subscribe_all_data_column_subnets = true; } diff --git a/book/src/help_bn.md b/book/src/help_bn.md index eba6814863..d5396321f2 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -571,6 +571,13 @@ Flags: Subscribe to all subnets regardless of validator count. This will also advertise the beacon node as being long-lived subscribed to all subnets. + --supernode + Run as a voluntary supernode. This node will subscribe to all data + column subnets, custody all data columns, and perform reconstruction + and cross-seeding. This requires significantly more bandwidth, + storage, and computation requirements but the node will have direct + access to all blobs via the beacon API and it helps network resilience + by serving all data columns to syncing peers. --validator-monitor-auto Enables the automatic detection and monitoring of validators connected to the HTTP API and using the subnet subscription endpoint. 
This diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 629c2e1e9a..8f6d040b62 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -833,6 +833,19 @@ fn network_subscribe_all_data_column_subnets_flag() { .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); } #[test] +fn network_supernode_flag() { + CommandLineTest::new() + .flag("supernode", None) + .run_with_zero_port() + .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); +} +#[test] +fn network_subscribe_all_data_column_subnets_default() { + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| assert!(!config.network.subscribe_all_data_column_subnets)); +} +#[test] fn blob_publication_batches() { CommandLineTest::new() .flag("blob-publication-batches", Some("3")) From 366fb0ee0dc3d87eeb6995847f05ecab8e48d11f Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 19 Sep 2025 22:58:46 +1000 Subject: [PATCH 51/81] Always upload sim test logs (#8082) This CI job failed https://github.com/sigp/lighthouse/actions/runs/17815533375/job/50647915897 But we lost the logs because they aren't uploaded when the job fails. This PR changes the step to always upload logs, even in the case of failure. 
Co-Authored-By: Jimmy Chen --- .github/workflows/test-suite.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 59a045c7d3..0201bf9ae3 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -225,6 +225,7 @@ jobs: TEST_FEATURES: portable CI_LOGGER_DIR: ${{ runner.temp }}/network_test_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: network_test_logs @@ -328,6 +329,7 @@ jobs: - name: Run a basic beacon chain sim that starts from Deneb run: cargo run --release --bin simulator basic-sim --disable-stdout-logging --log-dir ${{ runner.temp }}/basic_simulator_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: basic_simulator_logs @@ -349,6 +351,7 @@ jobs: - name: Run a beacon chain sim which tests VC fallback behaviour run: cargo run --release --bin simulator fallback-sim --disable-stdout-logging --log-dir ${{ runner.temp }}/fallback_simulator_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: fallback_simulator_logs From 1dbc4f861b3f678516f6b3ba9cb448e3550b1b31 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Mon, 22 Sep 2025 15:03:47 +1000 Subject: [PATCH 52/81] Refine HTTP status logs (#8098) Ensure that we don't log a warning for HTTP 202s, which are expected on the blinded block endpoints after Fulu. 
Co-Authored-By: Michael Sproul --- beacon_node/http_api/src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 5c6a9df739..1b18ed50a3 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -294,10 +294,7 @@ pub fn tracing_logging() -> warp::filters::log::Log Date: Mon, 22 Sep 2025 21:37:33 -0700 Subject: [PATCH 53/81] Reduce `TARGET_BACKFILL_SLOTS` in checkpoint sync test (#8102) Co-Authored-By: Eitan Seri- Levi --- scripts/tests/checkpoint-sync-config-devnet.yaml | 4 ++++ scripts/tests/checkpoint-sync.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/tests/checkpoint-sync-config-devnet.yaml b/scripts/tests/checkpoint-sync-config-devnet.yaml index f1b96dc9e5..2392011ed3 100644 --- a/scripts/tests/checkpoint-sync-config-devnet.yaml +++ b/scripts/tests/checkpoint-sync-config-devnet.yaml @@ -4,11 +4,15 @@ participants: cl_image: lighthouse:local el_type: geth el_image: ethpandaops/geth:master + cl_extra_params: + - --disable-backfill-rate-limiting supernode: true - cl_type: lighthouse cl_image: lighthouse:local el_type: geth el_image: ethpandaops/geth:master + cl_extra_params: + - --disable-backfill-rate-limiting supernode: false checkpoint_sync_enabled: true diff --git a/scripts/tests/checkpoint-sync.sh b/scripts/tests/checkpoint-sync.sh index a170d1e94d..df03da042e 100755 --- a/scripts/tests/checkpoint-sync.sh +++ b/scripts/tests/checkpoint-sync.sh @@ -15,7 +15,7 @@ CONFIG=${2:-$SCRIPT_DIR/checkpoint-sync-config-sepolia.yaml} # Interval for polling the /lighthouse/syncing endpoint for sync status POLL_INTERVAL_SECS=5 # Target number of slots to backfill to complete this test. -TARGET_BACKFILL_SLOTS=1024 +TARGET_BACKFILL_SLOTS=256 # Timeout for this test, if the node(s) fail to backfill `TARGET_BACKFILL_SLOTS` slots, fail the test. 
TIMEOUT_MINS=10 TIMEOUT_SECS=$((TIMEOUT_MINS * 60)) From d80c0ff5b57c043f60ee3cdc48730077fc484d75 Mon Sep 17 00:00:00 2001 From: Antonio Viggiano Date: Tue, 23 Sep 2025 22:20:10 -0300 Subject: [PATCH 54/81] Use HTTPS for xdelta3 in Cargo.toml (#8094) No issue Use HTTPS for dependency Co-Authored-By: Antonio Viggiano --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e55918243..c100fa5ae2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11165,7 +11165,7 @@ dependencies = [ [[package]] name = "xdelta3" version = "0.1.5" -source = "git+http://github.com/sigp/xdelta3-rs?rev=4db64086bb02e9febb584ba93b9d16bb2ae3825a#4db64086bb02e9febb584ba93b9d16bb2ae3825a" +source = "git+https://github.com/sigp/xdelta3-rs?rev=4db64086bb02e9febb584ba93b9d16bb2ae3825a#4db64086bb02e9febb584ba93b9d16bb2ae3825a" dependencies = [ "bindgen", "cc", diff --git a/Cargo.toml b/Cargo.toml index 99543dbfb4..66378a16c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -279,7 +279,7 @@ validator_test_rig = { path = "testing/validator_test_rig" } warp = { version = "0.3.7", default-features = false, features = ["tls"] } warp_utils = { path = "common/warp_utils" } workspace_members = { path = "common/workspace_members" } -xdelta3 = { git = "http://github.com/sigp/xdelta3-rs", rev = "4db64086bb02e9febb584ba93b9d16bb2ae3825a" } +xdelta3 = { git = "https://github.com/sigp/xdelta3-rs", rev = "4db64086bb02e9febb584ba93b9d16bb2ae3825a" } zeroize = { version = "1", features = ["zeroize_derive", "serde"] } zip = "0.6" zstd = "0.13" From af274029e8c61fe01048105ba1f192cc762effeb Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 23 Sep 2025 23:37:34 -0700 Subject: [PATCH 55/81] Run reconstruction inside a scoped rayon pool (#8075) Co-Authored-By: Jimmy Chen Co-Authored-By: Eitan Seri- Levi Co-Authored-By: Eitan Seri-Levi --- Cargo.lock | 3 +- beacon_node/beacon_chain/src/beacon_chain.rs | 16 ++--- 
beacon_node/beacon_processor/Cargo.toml | 1 - beacon_node/beacon_processor/src/lib.rs | 20 ++----- .../beacon_processor/src/rayon_manager.rs | 27 --------- beacon_node/client/src/builder.rs | 2 - beacon_node/http_api/src/test_utils.rs | 2 - .../src/network_beacon_processor/tests.rs | 2 - common/task_executor/Cargo.toml | 2 + common/task_executor/src/lib.rs | 50 +++++++++++++++- .../task_executor/src/rayon_pool_provider.rs | 58 +++++++++++++++++++ 11 files changed, 123 insertions(+), 60 deletions(-) delete mode 100644 beacon_node/beacon_processor/src/rayon_manager.rs create mode 100644 common/task_executor/src/rayon_pool_provider.rs diff --git a/Cargo.lock b/Cargo.lock index c100fa5ae2..ee65108097 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -980,7 +980,6 @@ dependencies = [ "metrics", "num_cpus", "parking_lot 0.12.3", - "rayon", "serde", "slot_clock", "strum", @@ -9232,6 +9231,8 @@ dependencies = [ "async-channel 1.9.0", "futures", "metrics", + "num_cpus", + "rayon", "tokio", "tracing", ] diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index ef3c2f52e0..4f0c6aada0 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -124,7 +124,7 @@ use store::{ BlobSidecarListFromRoot, DBColumn, DatabaseBlock, Error as DBError, HotColdDB, HotStateSummary, KeyValueStore, KeyValueStoreOp, StoreItem, StoreOp, }; -use task_executor::{ShutdownReason, TaskExecutor}; +use task_executor::{RayonPoolType, ShutdownReason, TaskExecutor}; use tokio_stream::Stream; use tracing::{Span, debug, debug_span, error, info, info_span, instrument, trace, warn}; use tree_hash::TreeHash; @@ -3274,16 +3274,12 @@ impl BeaconChain { let current_span = Span::current(); let result = self .task_executor - .spawn_blocking_handle( - move || { - let _guard = current_span.enter(); - data_availability_checker.reconstruct_data_columns(&block_root) - }, - "reconstruct_data_columns", - ) - 
.ok_or(BeaconChainError::RuntimeShutdown)? + .spawn_blocking_with_rayon_async(RayonPoolType::HighPriority, move || { + let _guard = current_span.enter(); + data_availability_checker.reconstruct_data_columns(&block_root) + }) .await - .map_err(BeaconChainError::TokioJoin)??; + .map_err(|_| BeaconChainError::RuntimeShutdown)??; match result { DataColumnReconstructionResult::Success((availability, data_columns_to_publish)) => { diff --git a/beacon_node/beacon_processor/Cargo.toml b/beacon_node/beacon_processor/Cargo.toml index 262badf7f9..afd4660c9a 100644 --- a/beacon_node/beacon_processor/Cargo.toml +++ b/beacon_node/beacon_processor/Cargo.toml @@ -12,7 +12,6 @@ logging = { workspace = true } metrics = { workspace = true } num_cpus = { workspace = true } parking_lot = { workspace = true } -rayon = { workspace = true } serde = { workspace = true } slot_clock = { workspace = true } strum = { workspace = true } diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index 64aeb4ceaf..28ed0cca91 100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -38,7 +38,6 @@ //! checks the queues to see if there are more parcels of work that can be spawned in a new worker //! task. 
-use crate::rayon_manager::RayonManager; use crate::work_reprocessing_queue::{ QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, ReprocessQueueMessage, }; @@ -48,7 +47,6 @@ use lighthouse_network::{MessageId, NetworkGlobals, PeerId}; use logging::TimeLatch; use logging::crit; use parking_lot::Mutex; -use rayon::ThreadPool; pub use scheduler::work_reprocessing_queue; use serde::{Deserialize, Serialize}; use slot_clock::SlotClock; @@ -61,7 +59,7 @@ use std::sync::Arc; use std::task::Context; use std::time::{Duration, Instant}; use strum::IntoStaticStr; -use task_executor::TaskExecutor; +use task_executor::{RayonPoolType, TaskExecutor}; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TrySendError; use tracing::{debug, error, trace, warn}; @@ -76,7 +74,6 @@ use work_reprocessing_queue::{ }; mod metrics; -pub mod rayon_manager; pub mod scheduler; /// The maximum size of the channel for work events to the `BeaconProcessor`. @@ -810,7 +807,6 @@ pub struct BeaconProcessor { pub network_globals: Arc>, pub executor: TaskExecutor, pub current_workers: usize, - pub rayon_manager: RayonManager, pub config: BeaconProcessorConfig, } @@ -1609,10 +1605,7 @@ impl BeaconProcessor { } Work::ChainSegmentBackfill(process_fn) => { if self.config.enable_backfill_rate_limiting { - task_spawner.spawn_blocking_with_rayon( - self.rayon_manager.low_priority_threadpool.clone(), - process_fn, - ) + task_spawner.spawn_blocking_with_rayon(RayonPoolType::LowPriority, process_fn) } else { // use the global rayon thread pool if backfill rate limiting is disabled. task_spawner.spawn_blocking(process_fn) @@ -1681,17 +1674,16 @@ impl TaskSpawner { } /// Spawns a blocking task on a rayon thread pool, dropping the `SendOnDrop` after task completion. 
- fn spawn_blocking_with_rayon(self, thread_pool: Arc, task: F) + fn spawn_blocking_with_rayon(self, rayon_pool_type: RayonPoolType, task: F) where F: FnOnce() + Send + 'static, { - self.executor.spawn_blocking( + self.executor.spawn_blocking_with_rayon( move || { - thread_pool.install(|| { - task(); - }); + task(); drop(self.send_idle_on_drop) }, + rayon_pool_type, WORKER_TASK_NAME, ) } diff --git a/beacon_node/beacon_processor/src/rayon_manager.rs b/beacon_node/beacon_processor/src/rayon_manager.rs deleted file mode 100644 index 99fe32d5cc..0000000000 --- a/beacon_node/beacon_processor/src/rayon_manager.rs +++ /dev/null @@ -1,27 +0,0 @@ -use rayon::{ThreadPool, ThreadPoolBuilder}; -use std::sync::Arc; - -const DEFAULT_LOW_PRIORITY_DIVISOR: usize = 4; -const MINIMUM_LOW_PRIORITY_THREAD_COUNT: usize = 1; - -pub struct RayonManager { - /// Smaller rayon thread pool for lower-priority, compute-intensive tasks. - /// By default ~25% of CPUs or a minimum of 1 thread. - pub low_priority_threadpool: Arc, -} - -impl Default for RayonManager { - fn default() -> Self { - let low_prio_threads = - (num_cpus::get() / DEFAULT_LOW_PRIORITY_DIVISOR).max(MINIMUM_LOW_PRIORITY_THREAD_COUNT); - let low_priority_threadpool = Arc::new( - ThreadPoolBuilder::new() - .num_threads(low_prio_threads) - .build() - .expect("failed to build low-priority rayon pool"), - ); - Self { - low_priority_threadpool, - } - } -} diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index 87cdcc45ef..d984d5fedc 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -17,7 +17,6 @@ use beacon_chain::{ store::{HotColdDB, ItemStore, StoreConfig}, }; use beacon_chain::{Kzg, LightClientProducerEvent}; -use beacon_processor::rayon_manager::RayonManager; use beacon_processor::{BeaconProcessor, BeaconProcessorChannels}; use beacon_processor::{BeaconProcessorConfig, BeaconProcessorQueueLengths}; use environment::RuntimeContext; @@ -681,7 +680,6 @@ where 
executor: beacon_processor_context.executor.clone(), current_workers: 0, config: beacon_processor_config, - rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_channels.beacon_processor_rx, diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index 7be8960e69..fe9e0dff70 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -5,7 +5,6 @@ use beacon_chain::{ }; use beacon_processor::{ BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig, BeaconProcessorQueueLengths, - rayon_manager::RayonManager, }; use directory::DEFAULT_ROOT_DIR; use eth2::{BeaconNodeHttpClient, Timeouts}; @@ -248,7 +247,6 @@ pub async fn create_api_server_with_config( executor: test_runtime.task_executor.clone(), current_workers: 0, config: beacon_processor_config, - rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_rx, diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 99410bc5e5..4137c974bf 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -17,7 +17,6 @@ use beacon_chain::test_utils::{ test_spec, }; use beacon_chain::{BeaconChain, WhenSlotSkipped}; -use beacon_processor::rayon_manager::RayonManager; use beacon_processor::{work_reprocessing_queue::*, *}; use gossipsub::MessageAcceptance; use itertools::Itertools; @@ -267,7 +266,6 @@ impl TestRig { executor, current_workers: 0, config: beacon_processor_config, - rayon_manager: RayonManager::default(), } .spawn_manager( beacon_processor_rx, diff --git a/common/task_executor/Cargo.toml b/common/task_executor/Cargo.toml index d4faf1e4b8..92a4fc4b59 100644 --- a/common/task_executor/Cargo.toml +++ b/common/task_executor/Cargo.toml @@ -8,6 +8,8 @@ edition = { workspace = true } async-channel = { workspace = true } futures = { workspace = 
true } metrics = { workspace = true } +num_cpus = { workspace = true } +rayon = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } tracing = { workspace = true } diff --git a/common/task_executor/src/lib.rs b/common/task_executor/src/lib.rs index 5f0c822b03..0b8e9f8eba 100644 --- a/common/task_executor/src/lib.rs +++ b/common/task_executor/src/lib.rs @@ -1,12 +1,15 @@ mod metrics; +mod rayon_pool_provider; pub mod test_utils; use futures::channel::mpsc::Sender; use futures::prelude::*; -use std::sync::Weak; +use std::sync::{Arc, Weak}; use tokio::runtime::{Handle, Runtime}; use tracing::debug; +use crate::rayon_pool_provider::RayonPoolProvider; +pub use crate::rayon_pool_provider::RayonPoolType; pub use tokio::task::JoinHandle; /// Provides a reason when Lighthouse is shut down. @@ -84,6 +87,8 @@ pub struct TaskExecutor { // FIXME(sproul): delete? #[allow(dead_code)] service_name: String, + + rayon_pool_provider: Arc, } impl TaskExecutor { @@ -105,6 +110,7 @@ impl TaskExecutor { exit, signal_tx, service_name, + rayon_pool_provider: Arc::new(RayonPoolProvider::default()), } } @@ -115,6 +121,7 @@ impl TaskExecutor { exit: self.exit.clone(), signal_tx: self.signal_tx.clone(), service_name, + rayon_pool_provider: self.rayon_pool_provider.clone(), } } @@ -226,6 +233,47 @@ impl TaskExecutor { } } + /// Spawns a blocking task on a dedicated tokio thread pool and installs a rayon context within it. + pub fn spawn_blocking_with_rayon( + self, + task: F, + rayon_pool_type: RayonPoolType, + name: &'static str, + ) where + F: FnOnce() + Send + 'static, + { + let thread_pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type); + self.spawn_blocking( + move || { + thread_pool.install(|| { + task(); + }); + }, + name, + ) + } + + /// Spawns a blocking computation on a rayon thread pool and awaits the result. 
+ pub async fn spawn_blocking_with_rayon_async( + &self, + rayon_pool_type: RayonPoolType, + task: F, + ) -> Result + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + let thread_pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type); + let (tx, rx) = tokio::sync::oneshot::channel(); + + thread_pool.spawn(move || { + let result = task(); + let _ = tx.send(result); + }); + + rx.await + } + /// Spawn a future on the tokio runtime wrapped in an `async-channel::Receiver` returning an optional /// join handle to the future. /// The task is cancelled when the corresponding async-channel is dropped. diff --git a/common/task_executor/src/rayon_pool_provider.rs b/common/task_executor/src/rayon_pool_provider.rs new file mode 100644 index 0000000000..8e12f7eaa4 --- /dev/null +++ b/common/task_executor/src/rayon_pool_provider.rs @@ -0,0 +1,58 @@ +use rayon::{ThreadPool, ThreadPoolBuilder}; +use std::sync::Arc; + +const DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE: usize = 25; +const DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE: usize = 80; +const MINIMUM_THREAD_COUNT: usize = 1; + +pub enum RayonPoolType { + HighPriority, + LowPriority, +} + +pub struct RayonPoolProvider { + /// Smaller rayon thread pool for lower-priority, compute-intensive tasks. + /// By default ~25% of CPUs or a minimum of 1 thread. + low_priority_thread_pool: Arc, + /// Larger rayon thread pool for high-priority, compute-intensive tasks. + /// By default ~80% of CPUs or a minimum of 1 thread. Critical/highest + /// priority tasks should use the global pool instead. 
+ high_priority_thread_pool: Arc, +} + +impl Default for RayonPoolProvider { + fn default() -> Self { + let low_prio_threads = + (num_cpus::get() * DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE / 100).max(MINIMUM_THREAD_COUNT); + let low_priority_thread_pool = Arc::new( + ThreadPoolBuilder::new() + .num_threads(low_prio_threads) + .build() + .expect("failed to build low-priority rayon pool"), + ); + + let high_prio_threads = (num_cpus::get() * DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE / 100) + .max(MINIMUM_THREAD_COUNT); + let high_priority_thread_pool = Arc::new( + ThreadPoolBuilder::new() + .num_threads(high_prio_threads) + .build() + .expect("failed to build high-priority rayon pool"), + ); + Self { + low_priority_thread_pool, + high_priority_thread_pool, + } + } +} + +impl RayonPoolProvider { + /// Get a scoped thread pool by priority level. + /// For critical/highest priority tasks, use the global pool instead. + pub fn get_thread_pool(&self, rayon_pool_type: RayonPoolType) -> Arc { + match rayon_pool_type { + RayonPoolType::HighPriority => self.high_priority_thread_pool.clone(), + RayonPoolType::LowPriority => self.low_priority_thread_pool.clone(), + } + } +} From 79b33214ea8e6838b426bd19d1c410e98182970e Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 25 Sep 2025 12:52:07 +1000 Subject: [PATCH 56/81] Only send data column subnet discovery requests after peerdas is scheduled (#8109) #8105 (to be confirmed) I noticed a large number of failed discovery requests after deploying latest `unstable` to some of our testnet and mainnet nodes. This is because of a recent PeerDAS change to attempt to maintain sufficient peers across data column subnets - this shouldn't be enabled on networks without peerdas scheduled, otherwise it will keep retrying discovery on these subnets and never succeed. Also removed some unused files. 
Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- .../src/peer_manager/mod.rs | 69 +- .../src/subnet_service/attestation_subnets.rs | 681 ------------------ .../src/subnet_service/sync_subnets.rs | 345 --------- 3 files changed, 67 insertions(+), 1028 deletions(-) delete mode 100644 beacon_node/network/src/subnet_service/attestation_subnets.rs delete mode 100644 beacon_node/network/src/subnet_service/sync_subnets.rs diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 592fccdc74..ad16bb0421 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -23,6 +23,7 @@ pub use libp2p::identity::Keypair; pub mod peerdb; use crate::peer_manager::peerdb::client::ClientKind; +use crate::types::GossipKind; use libp2p::multiaddr; use network_utils::discovery_metrics; use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; @@ -1434,8 +1435,16 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); - // Maintain minimum count for custody peers. - self.maintain_custody_peers(); + // Maintain minimum count for custody peers if we are subscribed to any data column topics (i.e. PeerDAS activated) + let peerdas_enabled = self + .network_globals + .gossipsub_subscriptions + .read() + .iter() + .any(|topic| matches!(topic.kind(), &GossipKind::DataColumnSidecar(_))); + if peerdas_enabled { + self.maintain_custody_peers(); + } // Maintain minimum count for sync committee peers. 
self.maintain_sync_committee_peers(); @@ -3140,4 +3149,60 @@ mod tests { }) } } + + #[tokio::test] + async fn test_custody_peer_logic_only_runs_when_peerdas_enabled() { + use crate::types::{GossipEncoding, GossipTopic}; + + let mut peer_manager = build_peer_manager(5).await; + + // Set up sampling subnets so maintain_custody_peers would have work to do + *peer_manager.network_globals.sampling_subnets.write() = std::collections::HashSet::from([ + DataColumnSubnetId::new(0), + DataColumnSubnetId::new(1), + ]); + + // Test 1: No data column subscriptions - custody peer logic should NOT run + peer_manager.heartbeat(); + + // Should be no new DiscoverSubnetPeers events since PeerDAS is not enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + discovery_events.is_empty(), + "Should not generate discovery events when PeerDAS is disabled, but found: {:?}", + discovery_events + ); + + // Test 2: Add data column subscription - custody peer logic should run + let data_column_topic = GossipTopic::new( + GossipKind::DataColumnSidecar(DataColumnSubnetId::new(0)), + GossipEncoding::SSZSnappy, + [0, 0, 0, 0], // fork_digest + ); + peer_manager + .network_globals + .gossipsub_subscriptions + .write() + .insert(data_column_topic); + + // Clear any existing events to isolate the test + peer_manager.events.clear(); + + peer_manager.heartbeat(); + + // Should now have DiscoverSubnetPeers events since PeerDAS is enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + !discovery_events.is_empty(), + "Should generate discovery events when PeerDAS is enabled, but found no discovery events" + ); + } } diff --git a/beacon_node/network/src/subnet_service/attestation_subnets.rs b/beacon_node/network/src/subnet_service/attestation_subnets.rs deleted 
file mode 100644 index 0da27c6a21..0000000000 --- a/beacon_node/network/src/subnet_service/attestation_subnets.rs +++ /dev/null @@ -1,681 +0,0 @@ -//! This service keeps track of which shard subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to shard subnets, requests peer discoveries and -//! determines whether attestations should be aggregated and/or passed to the beacon node. - -use super::SubnetServiceMessage; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::{HashMapDelay, HashSetDelay}; -use futures::prelude::*; -use lighthouse_network::{discv5::enr::NodeId, NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use tracing::{debug, error, info, trace, warn}; -use types::{Attestation, EthSpec, Slot, SubnetId, ValidatorSubscription}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -pub(crate) const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; -/// The fraction of a slot that we subscribe to a subnet before the required slot. -/// -/// Currently a whole slot ahead. -const ADVANCE_SUBSCRIBE_SLOT_FRACTION: u32 = 1; - -/// The number of slots after an aggregator duty where we remove the entry from -/// `aggregate_validators_on_subnet` delay map. -const UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY: u32 = 2; - -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub(crate) enum SubscriptionKind { - /// Long lived subscriptions. - /// - /// These have a longer duration and are advertised in our ENR. - LongLived, - /// Short lived subscriptions. 
- /// - /// Subscribing to these subnets has a short duration and we don't advertise it in our ENR. - ShortLived, -} - -/// A particular subnet at a given slot. -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy)] -pub struct ExactSubnet { - /// The `SubnetId` associated with this subnet. - pub subnet_id: SubnetId, - /// The `Slot` associated with this subnet. - pub slot: Slot, -} - -pub struct AttestationService { - /// Queued events to return to the driving service. - events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// Subnets we are currently subscribed to as short lived subscriptions. - /// - /// Once they expire, we unsubscribe from these. - /// We subscribe to subnets when we are an aggregator for an exact subnet. - short_lived_subscriptions: HashMapDelay, - - /// Subnets we are currently subscribed to as long lived subscriptions. - /// - /// We advertise these in our ENR. When these expire, the subnet is removed from our ENR. - /// These are required of all beacon nodes. The exact number is determined by the chain - /// specification. - long_lived_subscriptions: HashSet, - - /// Short lived subscriptions that need to be executed in the future. - scheduled_short_lived_subscriptions: HashSetDelay, - - /// A collection timeouts to track the existence of aggregate validator subscriptions at an - /// `ExactSubnet`. - aggregate_validators_on_subnet: Option>, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Our Discv5 node_id. - node_id: NodeId, - - /// Future used to manage subscribing and unsubscribing from long lived subnets. - next_long_lived_subscription_event: Pin>, - - /// Whether this node is a block proposer-only node. 
- proposer_only: bool, -} - -impl AttestationService { - /* Public functions */ - - /// Establish the service based on the passed configuration. - pub fn new(beacon_chain: Arc>, node_id: NodeId, config: &NetworkConfig) -> Self { - let slot_duration = beacon_chain.slot_clock.slot_duration(); - - if config.subscribe_all_subnets { - info!("Subscribing to all subnets"); - } else { - info!( - subnets_per_node = beacon_chain.spec.subnets_per_node, - subscription_duration_in_epochs = beacon_chain.spec.epochs_per_subnet_subscription, - "Deterministic long lived subnets enabled" - ); - } - - let track_validators = !config.import_all_attestations; - let aggregate_validators_on_subnet = - track_validators.then(|| HashSetDelay::new(slot_duration)); - let mut service = AttestationService { - events: VecDeque::with_capacity(10), - beacon_chain, - short_lived_subscriptions: HashMapDelay::new(slot_duration), - long_lived_subscriptions: HashSet::default(), - scheduled_short_lived_subscriptions: HashSetDelay::default(), - aggregate_validators_on_subnet, - waker: None, - discovery_disabled: config.disable_discovery, - subscribe_all_subnets: config.subscribe_all_subnets, - node_id, - next_long_lived_subscription_event: { - // Set a dummy sleep. Calculating the current subnet subscriptions will update this - // value with a smarter timing - Box::pin(tokio::time::sleep(Duration::from_secs(1))) - }, - proposer_only: config.proposer_only, - }; - - // If we are not subscribed to all subnets, handle the deterministic set of subnets - if !config.subscribe_all_subnets { - service.recompute_long_lived_subnets(); - } - - service - } - - /// Return count of all currently subscribed subnets (long-lived **and** short-lived). 
- #[cfg(test)] - pub fn subscription_count(&self) -> usize { - if self.subscribe_all_subnets { - self.beacon_chain.spec.attestation_subnet_count as usize - } else { - let count = self - .short_lived_subscriptions - .keys() - .chain(self.long_lived_subscriptions.iter()) - .collect::>() - .len(); - count - } - } - - /// Returns whether we are subscribed to a subnet for testing purposes. - #[cfg(test)] - pub(crate) fn is_subscribed( - &self, - subnet_id: &SubnetId, - subscription_kind: SubscriptionKind, - ) -> bool { - match subscription_kind { - SubscriptionKind::LongLived => self.long_lived_subscriptions.contains(subnet_id), - SubscriptionKind::ShortLived => self.short_lived_subscriptions.contains_key(subnet_id), - } - } - - #[cfg(test)] - pub(crate) fn long_lived_subscriptions(&self) -> &HashSet { - &self.long_lived_subscriptions - } - - /// Processes a list of validator subscriptions. - /// - /// This will: - /// - Register new validators as being known. - /// - Search for peers for required subnets. - /// - Request subscriptions for subnets on specific slots when required. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: impl Iterator, - ) -> Result<(), String> { - // If the node is in a proposer-only state, we ignore all subnet subscriptions. - if self.proposer_only { - return Ok(()); - } - - // Maps each subnet_id subscription to it's highest slot - let mut subnets_to_discover: HashMap = HashMap::new(); - - // Registers the validator with the attestation service. 
- for subscription in subscriptions { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_REQUESTS); - - trace!(?subscription, "Validator subscription"); - - // Compute the subnet that is associated with this subscription - let subnet_id = match SubnetId::compute_subnet::( - subscription.slot, - subscription.attestation_committee_index, - subscription.committee_count_at_slot, - &self.beacon_chain.spec, - ) { - Ok(subnet_id) => subnet_id, - Err(e) => { - warn!( - error = ?e, - "Failed to compute subnet id for validator subscription" - ); - continue; - } - }; - // Ensure each subnet_id inserted into the map has the highest slot as it's value. - // Higher slot corresponds to higher min_ttl in the `SubnetDiscovery` entry. - if let Some(slot) = subnets_to_discover.get(&subnet_id) { - if subscription.slot > *slot { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - } else if !self.discovery_disabled { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - - let exact_subnet = ExactSubnet { - subnet_id, - slot: subscription.slot, - }; - - // Determine if the validator is an aggregator. If so, we subscribe to the subnet and - // if successful add the validator to a mapping of known aggregators for that exact - // subnet. - - if subscription.is_aggregator { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_AGGREGATOR_REQUESTS); - if let Err(e) = self.subscribe_to_short_lived_subnet(exact_subnet) { - warn!(error = e, "Subscription to subnet error"); - } else { - trace!(?exact_subnet, "Subscribed to subnet for aggregator duties"); - } - } - } - - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. 
- if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request( - subnets_to_discover - .into_iter() - .map(|(subnet_id, slot)| ExactSubnet { subnet_id, slot }), - ) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - Ok(()) - } - - fn recompute_long_lived_subnets(&mut self) { - // Ensure the next computation is scheduled even if assigning subnets fails. - let next_subscription_event = self - .recompute_long_lived_subnets_inner() - .unwrap_or_else(|_| self.beacon_chain.slot_clock.slot_duration()); - - debug!("Recomputing deterministic long lived subnets"); - self.next_long_lived_subscription_event = - Box::pin(tokio::time::sleep(next_subscription_event)); - - if let Some(waker) = self.waker.as_ref() { - waker.wake_by_ref(); - } - } - - /// Gets the long lived subnets the node should be subscribed to during the current epoch and - /// the remaining duration for which they remain valid. - fn recompute_long_lived_subnets_inner(&mut self) -> Result { - let current_epoch = self.beacon_chain.epoch().map_err(|e| { - if !self - .beacon_chain - .slot_clock - .is_prior_to_genesis() - .unwrap_or(false) - { - error!(err = ?e,"Failed to get the current epoch from clock") - } - })?; - - let (subnets, next_subscription_epoch) = SubnetId::compute_subnets_for_epoch::( - self.node_id.raw(), - current_epoch, - &self.beacon_chain.spec, - ) - .map_err(|e| error!(err = e, "Could not compute subnets for current epoch"))?; - - let next_subscription_slot = - next_subscription_epoch.start_slot(T::EthSpec::slots_per_epoch()); - let next_subscription_event = self - .beacon_chain - .slot_clock - .duration_to_slot(next_subscription_slot) - .ok_or_else(|| { - error!("Failed to compute duration to next to long lived subscription event") - })?; - - self.update_long_lived_subnets(subnets.collect()); - - Ok(next_subscription_event) - } - - /// Updates the long lived subnets. 
- /// - /// New subnets are registered as subscribed, removed subnets as unsubscribed and the Enr - /// updated accordingly. - fn update_long_lived_subnets(&mut self, mut subnets: HashSet) { - info!(subnets = ?subnets.iter().collect::>(),"Subscribing to long-lived subnets"); - for subnet in &subnets { - // Add the events for those subnets that are new as long lived subscriptions. - if !self.long_lived_subscriptions.contains(subnet) { - // Check if this subnet is new and send the subscription event if needed. - if !self.short_lived_subscriptions.contains_key(subnet) { - debug!( - ?subnet, - subscription_kind = ?SubscriptionKind::LongLived, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - *subnet, - ))); - } - self.queue_event(SubnetServiceMessage::EnrAdd(Subnet::Attestation(*subnet))); - if !self.discovery_disabled { - self.queue_event(SubnetServiceMessage::DiscoverPeers(vec![SubnetDiscovery { - subnet: Subnet::Attestation(*subnet), - min_ttl: None, - }])) - } - } - } - - // Update the long_lived_subnets set and check for subnets that are being removed - std::mem::swap(&mut self.long_lived_subscriptions, &mut subnets); - for subnet in subnets { - if !self.long_lived_subscriptions.contains(&subnet) { - self.handle_removed_subnet(subnet, SubscriptionKind::LongLived); - } - } - } - - /// Checks if we have subscribed aggregate validators for the subnet. If not, checks the gossip - /// verification, re-propagates and returns false. 
- pub fn should_process_attestation( - &self, - subnet: SubnetId, - attestation: &Attestation, - ) -> bool { - // Proposer-only mode does not need to process attestations - if self.proposer_only { - return false; - } - self.aggregate_validators_on_subnet - .as_ref() - .map(|tracked_vals| { - tracked_vals.contains_key(&ExactSubnet { - subnet_id: subnet, - slot: attestation.data().slot, - }) - }) - .unwrap_or(true) - } - - /* Internal private functions */ - - /// Adds an event to the event queue and notifies that this service is ready to be polled - /// again. - fn queue_event(&mut self, ev: SubnetServiceMessage) { - self.events.push_back(ev); - if let Some(waker) = &self.waker { - waker.wake_by_ref() - } - } - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. - fn discover_peers_request( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - // Check if there is enough time to perform a discovery lookup. - if exact_subnet.slot - >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) - { - // Send out an event to start looking for peers. - // Require the peer for an additional slot to ensure we keep the peer for the - // duration of the subscription. - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(exact_subnet.slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::Attestation(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. 
- warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.queue_event(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - // Subscribes to the subnet if it should be done immediately, or schedules it if required. - fn subscribe_to_short_lived_subnet( - &mut self, - ExactSubnet { subnet_id, slot }: ExactSubnet, - ) -> Result<(), &'static str> { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // The short time we schedule the subscription before it's actually required. This - // ensures we are subscribed on time, and allows consecutive subscriptions to the same - // subnet to overlap, reducing subnet churn. - let advance_subscription_duration = slot_duration / ADVANCE_SUBSCRIBE_SLOT_FRACTION; - // The time to the required slot. - let time_to_subscription_slot = self - .beacon_chain - .slot_clock - .duration_to_slot(slot) - .unwrap_or_default(); // If this is a past slot we will just get a 0 duration. - - // Calculate how long before we need to subscribe to the subnet. - let time_to_subscription_start = - time_to_subscription_slot.saturating_sub(advance_subscription_duration); - - // The time after a duty slot where we no longer need it in the `aggregate_validators_on_subnet` - // delay map. - let time_to_unsubscribe = - time_to_subscription_slot + UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY * slot_duration; - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - tracked_vals.insert_at(ExactSubnet { subnet_id, slot }, time_to_unsubscribe); - } - - // If the subscription should be done in the future, schedule it. Otherwise subscribe - // immediately. - if time_to_subscription_start.is_zero() { - // This is a current or past slot, we subscribe immediately. - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1)?; - } else { - // This is a future slot, schedule subscribing. 
- trace!(subnet = ?subnet_id, ?time_to_subscription_start,"Scheduling subnet subscription"); - self.scheduled_short_lived_subscriptions - .insert_at(ExactSubnet { subnet_id, slot }, time_to_subscription_start); - } - - Ok(()) - } - - /* A collection of functions that handle the various timeouts */ - - /// Registers a subnet as subscribed. - /// - /// Checks that the time in which the subscription would end is not in the past. If we are - /// already subscribed, extends the timeout if necessary. If this is a new subscription, we send - /// out the appropriate events. - /// - /// On determinist long lived subnets, this is only used for short lived subscriptions. - fn subscribe_to_short_lived_subnet_immediately( - &mut self, - subnet_id: SubnetId, - end_slot: Slot, - ) -> Result<(), &'static str> { - if self.subscribe_all_subnets { - // Case not handled by this service. - return Ok(()); - } - - let time_to_subscription_end = self - .beacon_chain - .slot_clock - .duration_to_slot(end_slot) - .unwrap_or_default(); - - // First check this is worth doing. - if time_to_subscription_end.is_zero() { - return Err("Time when subscription would end has already passed."); - } - - let subscription_kind = SubscriptionKind::ShortLived; - - // We need to check and add a subscription for the right kind, regardless of the presence - // of the subnet as a subscription of the other kind. This is mainly since long lived - // subscriptions can be removed at any time when a validator goes offline. - - let (subscriptions, already_subscribed_as_other_kind) = ( - &mut self.short_lived_subscriptions, - self.long_lived_subscriptions.contains(&subnet_id), - ); - - match subscriptions.get(&subnet_id) { - Some(current_end_slot) => { - // We are already subscribed. Check if we need to extend the subscription. 
- if &end_slot > current_end_slot { - trace!( - subnet = ?subnet_id, - prev_end_slot = %current_end_slot, - new_end_slot = %end_slot, - ?subscription_kind, - "Extending subscription to subnet" - ); - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - } - } - None => { - // This is a new subscription. Add with the corresponding timeout and send the - // notification. - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - - // Inform of the subscription. - if !already_subscribed_as_other_kind { - debug!( - subnet = ?subnet_id, - %end_slot, - ?subscription_kind, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - subnet_id, - ))); - } - } - } - - Ok(()) - } - - // Unsubscribes from a subnet that was removed if it does not continue to exist as a - // subscription of the other kind. For long lived subscriptions, it also removes the - // advertisement from our ENR. - fn handle_removed_subnet(&mut self, subnet_id: SubnetId, subscription_kind: SubscriptionKind) { - let exists_in_other_subscriptions = match subscription_kind { - SubscriptionKind::LongLived => self.short_lived_subscriptions.contains_key(&subnet_id), - SubscriptionKind::ShortLived => self.long_lived_subscriptions.contains(&subnet_id), - }; - - if !exists_in_other_subscriptions { - // Subscription no longer exists as short lived or long lived. - debug!( - subnet = ?subnet_id, - ?subscription_kind, - "Unsubscribing from subnet" - ); - self.queue_event(SubnetServiceMessage::Unsubscribe(Subnet::Attestation( - subnet_id, - ))); - } - - if subscription_kind == SubscriptionKind::LongLived { - // Remove from our ENR even if we remain subscribed in other way. 
- self.queue_event(SubnetServiceMessage::EnrRemove(Subnet::Attestation( - subnet_id, - ))); - } - } -} - -impl Stream for AttestationService { - type Item = SubnetServiceMessage; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // Update the waker if needed. - if let Some(waker) = &self.waker { - if !waker.will_wake(cx.waker()) { - self.waker = Some(cx.waker().clone()); - } - } else { - self.waker = Some(cx.waker().clone()); - } - - // Send out any generated events. - if let Some(event) = self.events.pop_front() { - return Poll::Ready(Some(event)); - } - - // If we aren't subscribed to all subnets, handle the deterministic long-lived subnets - if !self.subscribe_all_subnets { - match self.next_long_lived_subscription_event.as_mut().poll(cx) { - Poll::Ready(_) => { - self.recompute_long_lived_subnets(); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Pending => {} - } - } - - // Process scheduled subscriptions that might be ready, since those can extend a soon to - // expire subscription. - match self.scheduled_short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(ExactSubnet { subnet_id, slot }))) => { - if let Err(e) = - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1) - { - debug!(subnet = ?subnet_id, err = e,"Failed to subscribe to short lived subnet"); - } - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!( - error = e, - "Failed to check for scheduled subnet subscriptions" - ); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Finally process any expired subscriptions. 
- match self.short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok((subnet_id, _end_slot)))) => { - self.handle_removed_subnet(subnet_id, SubscriptionKind::ShortLived); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!(error = e, "Failed to check for subnet unsubscription times"); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Poll to remove entries on expiration, no need to act on expiration events. - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - if let Poll::Ready(Some(Err(e))) = tracked_vals.poll_next_unpin(cx) { - error!( - error = e, - "Failed to check for aggregate validator on subnet expirations" - ); - } - } - - Poll::Pending - } -} diff --git a/beacon_node/network/src/subnet_service/sync_subnets.rs b/beacon_node/network/src/subnet_service/sync_subnets.rs deleted file mode 100644 index 6b3834e195..0000000000 --- a/beacon_node/network/src/subnet_service/sync_subnets.rs +++ /dev/null @@ -1,345 +0,0 @@ -//! This service keeps track of which sync committee subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to sync committee subnets and requests peer discoveries. - -use std::collections::{hash_map::Entry, HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use futures::prelude::*; -use tracing::{debug, error, trace, warn}; - -use super::SubnetServiceMessage; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::HashSetDelay; -use lighthouse_network::{NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use types::{Epoch, EthSpec, SyncCommitteeSubscription, SyncSubnetId}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. 
If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; - -/// A particular subnet at a given slot. -#[derive(PartialEq, Eq, Hash, Clone, Debug)] -pub struct ExactSubnet { - /// The `SyncSubnetId` associated with this subnet. - pub subnet_id: SyncSubnetId, - /// The epoch until which we need to stay subscribed to the subnet. - pub until_epoch: Epoch, -} -pub struct SyncCommitteeService { - /// Queued events to return to the driving service. - events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// The collection of all currently subscribed subnets. - subscriptions: HashMap, - - /// A collection of timeouts for when to unsubscribe from a subnet. - unsubscriptions: HashSetDelay, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Whether this node is a block proposer-only node. 
- proposer_only: bool, -} - -impl SyncCommitteeService { - /* Public functions */ - - pub fn new(beacon_chain: Arc>, config: &NetworkConfig) -> Self { - let spec = &beacon_chain.spec; - let epoch_duration_secs = - beacon_chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch(); - let default_timeout = - epoch_duration_secs.saturating_mul(spec.epochs_per_sync_committee_period.as_u64()); - - SyncCommitteeService { - events: VecDeque::with_capacity(10), - beacon_chain, - subscriptions: HashMap::new(), - unsubscriptions: HashSetDelay::new(Duration::from_secs(default_timeout)), - waker: None, - subscribe_all_subnets: config.subscribe_all_subnets, - discovery_disabled: config.disable_discovery, - proposer_only: config.proposer_only, - } - } - - /// Return count of all currently subscribed subnets. - #[cfg(test)] - pub fn subscription_count(&self) -> usize { - use types::consts::altair::SYNC_COMMITTEE_SUBNET_COUNT; - if self.subscribe_all_subnets { - SYNC_COMMITTEE_SUBNET_COUNT as usize - } else { - self.subscriptions.len() - } - } - - /// Processes a list of sync committee subscriptions. - /// - /// This will: - /// - Search for peers for required subnets. - /// - Request subscriptions required subnets. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: Vec, - ) -> Result<(), String> { - // A proposer-only node does not subscribe to any sync-committees - if self.proposer_only { - return Ok(()); - } - - let mut subnets_to_discover = Vec::new(); - for subscription in subscriptions { - metrics::inc_counter(&metrics::SYNC_COMMITTEE_SUBSCRIPTION_REQUESTS); - //NOTE: We assume all subscriptions have been verified before reaching this service - - // Registers the validator with the subnet service. - // This will subscribe to long-lived random subnets if required. 
- trace!(?subscription, "Sync committee subscription"); - - let subnet_ids = match SyncSubnetId::compute_subnets_for_sync_committee::( - &subscription.sync_committee_indices, - ) { - Ok(subnet_ids) => subnet_ids, - Err(e) => { - warn!( - error = ?e, - validator_index = subscription.validator_index, - "Failed to compute subnet id for sync committee subscription" - ); - continue; - } - }; - - for subnet_id in subnet_ids { - let exact_subnet = ExactSubnet { - subnet_id, - until_epoch: subscription.until_epoch, - }; - subnets_to_discover.push(exact_subnet.clone()); - if let Err(e) = self.subscribe_to_subnet(exact_subnet.clone()) { - warn!( - error = e, - validator_index = subscription.validator_index, - "Subscription to sync subnet error" - ); - } else { - trace!( - ?exact_subnet, - validator_index = subscription.validator_index, - "Subscribed to subnet for sync committee duties" - ); - } - } - } - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. - if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request(subnets_to_discover.iter()) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - // pre-emptively wake the thread to check for new events - if let Some(waker) = &self.waker { - waker.wake_by_ref(); - } - Ok(()) - } - - /* Internal private functions */ - - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. 
- fn discover_peers_request<'a>( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // check if there is enough time to perform a discovery lookup - if until_slot >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) { - // if the slot is more than epoch away, add an event to start looking for peers - // add one slot to ensure we keep the peer for the subscription slot - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(until_slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::SyncCommittee(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. - warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.events - .push_back(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - /// Adds a subscription event and an associated unsubscription event if required. - fn subscribe_to_subnet(&mut self, exact_subnet: ExactSubnet) -> Result<(), &'static str> { - // Return if we have subscribed to all subnets - if self.subscribe_all_subnets { - return Ok(()); - } - - // Return if we already have a subscription for exact_subnet - if self.subscriptions.get(&exact_subnet.subnet_id) == Some(&exact_subnet.until_epoch) { - return Ok(()); - } - - // Return if we already have subscription set to expire later than the current request. 
- if let Some(until_epoch) = self.subscriptions.get(&exact_subnet.subnet_id) { - if *until_epoch >= exact_subnet.until_epoch { - return Ok(()); - } - } - - // initialise timing variables - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // Calculate the duration to the unsubscription event. - let expected_end_subscription_duration = if current_slot >= until_slot { - warn!( - %current_slot, - ?exact_subnet, - "Sync committee subscription is past expiration" - ); - return Ok(()); - } else { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // the duration until we no longer need this subscription. We assume a single slot is - // sufficient. - self.beacon_chain - .slot_clock - .duration_to_slot(until_slot) - .ok_or("Unable to determine duration to unsubscription slot")? - + slot_duration - }; - - if let Entry::Vacant(e) = self.subscriptions.entry(exact_subnet.subnet_id) { - // We are not currently subscribed and have no waiting subscription, create one - debug!(subnet = *exact_subnet.subnet_id, until_epoch = ?exact_subnet.until_epoch, "Subscribing to subnet"); - e.insert(exact_subnet.until_epoch); - self.events - .push_back(SubnetServiceMessage::Subscribe(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add the subnet to the ENR bitfield - self.events - .push_back(SubnetServiceMessage::EnrAdd(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add an unsubscription event to remove ourselves from the subnet once completed - self.unsubscriptions - .insert_at(exact_subnet.subnet_id, expected_end_subscription_duration); - } else { - // We are already subscribed, extend the unsubscription duration - self.unsubscriptions - .update_timeout(&exact_subnet.subnet_id, expected_end_subscription_duration); - } - - Ok(()) - } - - /// A queued 
unsubscription is ready. - fn handle_unsubscriptions(&mut self, subnet_id: SyncSubnetId) { - debug!(subnet = *subnet_id, "Unsubscribing from subnet"); - - self.subscriptions.remove(&subnet_id); - self.events - .push_back(SubnetServiceMessage::Unsubscribe(Subnet::SyncCommittee( - subnet_id, - ))); - - self.events - .push_back(SubnetServiceMessage::EnrRemove(Subnet::SyncCommittee( - subnet_id, - ))); - } -} - -impl Stream for SyncCommitteeService { - type Item = SubnetServiceMessage; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // update the waker if needed - if let Some(waker) = &self.waker { - if !waker.will_wake(cx.waker()) { - self.waker = Some(cx.waker().clone()); - } - } else { - self.waker = Some(cx.waker().clone()); - } - - // process any un-subscription events - match self.unsubscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(exact_subnet))) => self.handle_unsubscriptions(exact_subnet), - Poll::Ready(Some(Err(e))) => { - error!(error = e, "Failed to check for subnet unsubscription times"); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // process any generated events - if let Some(event) = self.events.pop_front() { - return Poll::Ready(Some(event)); - } - - Poll::Pending - } -} From ffa7b2b2b9e3b4e70678e2c749b8bc45234febd7 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 25 Sep 2025 05:52:27 +0200 Subject: [PATCH 57/81] Only mark block lookups as pending if block is importing from gossip (#8112) - PR https://github.com/sigp/lighthouse/pull/8045 introduced a regression of how lookup sync interacts with the da_checker. Now in unstable block import from the HTTP API also insert the block in the da_checker while the block is being execution verified. If lookup sync finds the block in the da_checker in `NotValidated` state it expects a `GossipBlockProcessResult` message sometime later. That message is only sent after block import in gossip. 
I confirmed in our node's logs for 4/4 cases of stuck lookups are caused by this sequence of events: - Receive block through API, insert into da_checker in fn process_block in put_pre_execution_block - Create lookup and leave in AwaitingDownload(block in processing cache) state - Block from HTTP API finishes importing - Lookup is left stuck Closes https://github.com/sigp/lighthouse/issues/8104 - https://github.com/sigp/lighthouse/pull/8110 was my initial solution attempt but we can't send the `GossipBlockProcessResult` event from the `http_api` crate without adding new channels, which seems messy. For a given node it's rare that a lookup is created at the same time that a block is being published. This PR solves https://github.com/sigp/lighthouse/issues/8104 by allowing lookup sync to import the block twice in that case. Co-Authored-By: dapplion <35266934+dapplion@users.noreply.github.com> --- .../beacon_chain/src/beacon_block_streamer.rs | 2 +- beacon_node/beacon_chain/src/beacon_chain.rs | 9 +++-- .../src/data_availability_checker.rs | 7 ++-- .../overflow_lru_cache.rs | 37 +++++++++++++------ .../sync/block_lookups/single_block_lookup.rs | 2 +- .../network/src/sync/network_context.rs | 32 +++++++++++----- beacon_node/network/src/sync/tests/lookups.rs | 6 +-- consensus/types/src/beacon_block.rs | 1 + 8 files changed, 63 insertions(+), 33 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_block_streamer.rs b/beacon_node/beacon_chain/src/beacon_block_streamer.rs index d4ce38927b..c816a0b29f 100644 --- a/beacon_node/beacon_chain/src/beacon_block_streamer.rs +++ b/beacon_node/beacon_chain/src/beacon_block_streamer.rs @@ -404,7 +404,7 @@ impl BeaconBlockStreamer { if self.check_caches == CheckCaches::Yes { match self.beacon_chain.get_block_process_status(&root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => { 
metrics::inc_counter(&metrics::BEACON_REQRESP_PRE_IMPORT_CACHE_HITS); Some(block) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 4f0c6aada0..08e0d1c674 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -334,7 +334,7 @@ pub enum BlockProcessStatus { /// Block is not in any pre-import cache. Block may be in the data-base or in the fork-choice. Unknown, /// Block is currently processing but not yet validated. - NotValidated(Arc>), + NotValidated(Arc>, BlockImportSource), /// Block is fully valid, but not yet imported. It's cached in the da_checker while awaiting /// missing block components. ExecutionValidated(Arc>), @@ -3351,8 +3351,11 @@ impl BeaconChain { ); } - self.data_availability_checker - .put_pre_execution_block(block_root, unverified_block.block_cloned())?; + self.data_availability_checker.put_pre_execution_block( + block_root, + unverified_block.block_cloned(), + block_source, + )?; // Start the Prometheus timer. 
let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index a0ad1c2112..43b7d8f7ea 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -21,8 +21,8 @@ use task_executor::TaskExecutor; use tracing::{debug, error, instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ - BlobSidecarList, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Hash256, - SignedBeaconBlock, Slot, + BlobSidecarList, BlockImportSource, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, + EthSpec, Hash256, SignedBeaconBlock, Slot, }; mod error; @@ -354,9 +354,10 @@ impl DataAvailabilityChecker { &self, block_root: Hash256, block: Arc>, + source: BlockImportSource, ) -> Result<(), Error> { self.availability_cache - .put_pre_execution_block(block_root, block) + .put_pre_execution_block(block_root, block, source) } /// Removes a pre-execution block from the cache. 
diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index bb44009662..42f6dbd856 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -19,13 +19,14 @@ use tracing::{Span, debug, debug_span}; use types::beacon_block_body::KzgCommitments; use types::blob_sidecar::BlobIdentifier; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - Hash256, RuntimeFixedVector, RuntimeVariableList, SignedBeaconBlock, + BlobSidecar, BlockImportSource, ChainSpec, ColumnIndex, DataColumnSidecar, + DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeFixedVector, RuntimeVariableList, + SignedBeaconBlock, }; #[derive(Clone)] pub enum CachedBlock { - PreExecution(Arc>), + PreExecution(Arc>, BlockImportSource), Executed(Box>), } @@ -42,7 +43,7 @@ impl CachedBlock { fn as_block(&self) -> &SignedBeaconBlock { match self { - CachedBlock::PreExecution(b) => b, + CachedBlock::PreExecution(b, _) => b, CachedBlock::Executed(b) => b.as_block(), } } @@ -135,9 +136,13 @@ impl PendingComponents { /// Inserts a pre-execution block into the cache. /// This does NOT override an existing executed block. 
- pub fn insert_pre_execution_block(&mut self, block: Arc>) { + pub fn insert_pre_execution_block( + &mut self, + block: Arc>, + source: BlockImportSource, + ) { if self.block.is_none() { - self.block = Some(CachedBlock::PreExecution(block)) + self.block = Some(CachedBlock::PreExecution(block, source)) } } @@ -433,7 +438,9 @@ impl DataAvailabilityCheckerInner { .peek(block_root) .and_then(|pending_components| { pending_components.block.as_ref().map(|block| match block { - CachedBlock::PreExecution(b) => BlockProcessStatus::NotValidated(b.clone()), + CachedBlock::PreExecution(b, source) => { + BlockProcessStatus::NotValidated(b.clone(), *source) + } CachedBlock::Executed(b) => { BlockProcessStatus::ExecutionValidated(b.block_cloned()) } @@ -693,11 +700,12 @@ impl DataAvailabilityCheckerInner { &self, block_root: Hash256, block: Arc>, + source: BlockImportSource, ) -> Result<(), AvailabilityCheckError> { let epoch = block.epoch(); let pending_components = self.update_or_insert_pending_components(block_root, epoch, |pending_components| { - pending_components.insert_pre_execution_block(block); + pending_components.insert_pre_execution_block(block, source); Ok(()) })?; @@ -718,7 +726,7 @@ impl DataAvailabilityCheckerInner { /// This does NOT remove an existing executed block. pub fn remove_pre_execution_block(&self, block_root: &Hash256) { // The read lock is immediately dropped so we can safely remove the block from the cache. 
- if let Some(BlockProcessStatus::NotValidated(_)) = self.get_cached_block(block_root) { + if let Some(BlockProcessStatus::NotValidated(_, _)) = self.get_cached_block(block_root) { self.critical.write().pop(block_root); } } @@ -1459,9 +1467,13 @@ mod pending_components_tests { let mut pending_component = >::empty(block_root, max_len); let pre_execution_block = Arc::new(pre_execution_block); - pending_component.insert_pre_execution_block(pre_execution_block.clone()); + pending_component + .insert_pre_execution_block(pre_execution_block.clone(), BlockImportSource::Gossip); assert!( - matches!(pending_component.block, Some(CachedBlock::PreExecution(_))), + matches!( + pending_component.block, + Some(CachedBlock::PreExecution(_, _)) + ), "pre execution block inserted" ); @@ -1471,7 +1483,8 @@ mod pending_components_tests { "executed block inserted" ); - pending_component.insert_pre_execution_block(pre_execution_block); + pending_component + .insert_pre_execution_block(pre_execution_block, BlockImportSource::Gossip); assert!( matches!(pending_component.block, Some(CachedBlock::Executed(_))), "executed block should remain" diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 36509d2563..8fb3248a87 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -219,7 +219,7 @@ impl SingleBlockLookup { // can assert that this is the correct value of `blob_kzg_commitments_count`. 
match cx.chain.get_block_process_status(&self.block_root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), } }) { diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 17a4295700..ac2991c147 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -49,8 +49,8 @@ use tokio::sync::mpsc; use tracing::{Span, debug, debug_span, error, warn}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, Slot, }; pub mod custody; @@ -835,14 +835,26 @@ impl SyncNetworkContext { match self.chain.get_block_process_status(&block_root) { // Unknown block, continue request to download BlockProcessStatus::Unknown => {} - // Block is known are currently processing, expect a future event with the result of - // processing. - BlockProcessStatus::NotValidated { .. } => { - // Lookup sync event safety: If the block is currently in the processing cache, we - // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will - // make progress on this lookup - return Ok(LookupRequestResult::Pending("block in processing cache")); - } + // Block is known and currently processing. Imports from gossip and HTTP API insert the + // block in the da_cache. However, HTTP API is unable to notify sync when it completes + // block import. Returning `Pending` here will result in stuck lookups if the block is + // importing from sync. 
+ BlockProcessStatus::NotValidated(_, source) => match source { + BlockImportSource::Gossip => { + // Lookup sync event safety: If the block is currently in the processing cache, we + // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will + // make progress on this lookup + return Ok(LookupRequestResult::Pending("block in processing cache")); + } + BlockImportSource::Lookup + | BlockImportSource::RangeSync + | BlockImportSource::HttpApi => { + // Lookup, RangeSync or HttpApi block import don't emit the GossipBlockProcessResult + // event. If a lookup happens to be created during block import from one of + // those sources just import the block twice. Otherwise the lookup will get + // stuck. Double imports are fine, they just waste resources. + } + }, // Block is fully validated. If it's not yet imported it's waiting for missing block // components. Consider this request completed and do nothing. BlockProcessStatus::ExecutionValidated { .. } => { diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 27968a0635..fc64186175 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -41,8 +41,8 @@ use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; use tracing::info; use types::{ - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, BlockImportSource, DataColumnSidecar, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, }; @@ -1113,7 +1113,7 @@ impl TestRig { self.harness .chain .data_availability_checker - .put_pre_execution_block(block.canonical_root(), block) + .put_pre_execution_block(block.canonical_root(), block, BlockImportSource::Gossip) 
.unwrap(); } diff --git a/consensus/types/src/beacon_block.rs b/consensus/types/src/beacon_block.rs index f4e4e36966..61c32dd4ac 100644 --- a/consensus/types/src/beacon_block.rs +++ b/consensus/types/src/beacon_block.rs @@ -843,6 +843,7 @@ impl<'de, E: EthSpec, Payload: AbstractExecPayload> ContextDeserialize<'de, F } } +#[derive(Clone, Copy)] pub enum BlockImportSource { Gossip, Lookup, From 20c6ce455300e26815540acc112a5a1f6094f61c Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Fri, 26 Sep 2025 02:12:47 -0700 Subject: [PATCH 58/81] Fulu testnet configs (#8117) Holesky - #8096 Hoodi - #8097 Sepolia - #8099 Testnet configs for Holesky, Hoodi and Sepolia Holesky - https://github.com/eth-clients/holesky/pull/132 Hoodi - https://github.com/eth-clients/hoodi/pull/21 Sepolia - https://github.com/eth-clients/sepolia/pull/111 Co-Authored-By: Eitan Seri- Levi --- .../holesky/config.yaml | 33 +++++++++++++++++- .../hoodi/config.yaml | 34 ++++++++++++++++++- .../sepolia/config.yaml | 34 ++++++++++++++++++- 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/common/eth2_network_config/built_in_network_configs/holesky/config.yaml b/common/eth2_network_config/built_in_network_configs/holesky/config.yaml index ab5f0f3bde..b1e9faea1d 100644 --- a/common/eth2_network_config/built_in_network_configs/holesky/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/holesky/config.yaml @@ -38,7 +38,7 @@ ELECTRA_FORK_VERSION: 0x06017000 ELECTRA_FORK_EPOCH: 115968 # Fulu FULU_FORK_VERSION: 0x07017000 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 165120 # Gloas GLOAS_FORK_VERSION: 0x08017000 GLOAS_FORK_EPOCH: 18446744073709551615 @@ -47,6 +47,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 1200 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 14 # 2**8 (= 256) epochs ~27 hours @@ -55,6 +57,18 @@ 
MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS +SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle # --------------------------------------------------------------- @@ -141,13 +155,30 @@ MAX_BLOBS_PER_BLOCK_ELECTRA: 9 MAX_REQUEST_BLOB_SIDECARS_ELECTRA: 1152 # Fulu +# 2**7 (= 128) groups NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 166400 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 167936 + MAX_BLOBS_PER_BLOCK: 21 + # Gloas \ No newline at end of file diff --git a/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml b/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml index 01322974c8..256957e119 100644 --- a/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml @@ -42,7 +42,7 @@ ELECTRA_FORK_EPOCH: 2048 # Fulu FULU_FORK_VERSION: 0x70000910 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 50688 # Gloas GLOAS_FORK_VERSION: 0x80000910 
@@ -53,6 +53,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 12000 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 12 # 2**8 (= 256) epochs ~27 hours @@ -61,6 +63,18 @@ MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS +SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle # --------------------------------------------------------------- @@ -154,15 +168,33 @@ WHISK_EPOCHS_PER_SHUFFLING_PHASE: 256 WHISK_PROPOSER_SELECTION_GAP: 2 # Fulu +# 2**7 (= 128) groups NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 + +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 52480 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 54016 + MAX_BLOBS_PER_BLOCK: 21 + # Gloas # EIP7732 diff --git a/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml b/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml index 9802e409fb..b1a01933d7 100644 --- 
a/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml @@ -42,7 +42,7 @@ ELECTRA_FORK_EPOCH: 222464 # Fulu FULU_FORK_VERSION: 0x90000075 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 272640 # Gloas GLOAS_FORK_VERSION: 0x90000076 @@ -52,6 +52,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 12000 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 14 # 2**8 (= 256) epochs ~27 hours @@ -60,6 +62,18 @@ MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS +SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle @@ -147,13 +161,31 @@ MAX_BLOBS_PER_BLOCK_ELECTRA: 9 MAX_REQUEST_BLOB_SIDECARS_ELECTRA: 1152 # Fulu +# 2**7 (= 128) groups NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 + +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 274176 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 275712 
+ MAX_BLOBS_PER_BLOCK: 21 + # Gloas \ No newline at end of file From c754234b2c94d90ed658788b5fb69ee405ed6cb7 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Sat, 27 Sep 2025 00:44:50 +1000 Subject: [PATCH 59/81] Fix bugs in proposer calculation post-Fulu (#8101) As identified by a researcher during the Fusaka security competition, we were computing the proposer index incorrectly in some places by computing without lookahead. - [x] Add "low level" checks to computation functions in `consensus/types` to ensure they error cleanly - [x] Re-work the determination of proposer shuffling decision roots, which are now fork aware. - [x] Re-work and simplify the beacon proposer cache to be fork-aware. - [x] Optimise `with_proposer_cache` to use `OnceCell`. - [x] All tests passing. - [x] Resolve all remaining `FIXME(sproul)`s. - [x] Unit tests for `ProtoBlock::proposer_shuffling_root_for_child_block`. - [x] End-to-end regression test. - [x] Test on pre-Fulu network. - [x] Test on post-Fulu network. 
Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 193 ++++++++----- .../beacon_chain/src/beacon_proposer_cache.rs | 101 ++++--- .../beacon_chain/src/blob_verification.rs | 79 ++---- .../beacon_chain/src/block_verification.rs | 80 ++---- .../beacon_chain/src/canonical_head.rs | 2 +- .../src/data_column_verification.rs | 89 ++---- beacon_node/beacon_chain/src/errors.rs | 17 ++ .../beacon_chain/src/validator_monitor.rs | 8 +- beacon_node/beacon_chain/tests/store_tests.rs | 265 ++++++++++++++++++ .../beacon_chain/tests/validator_monitor.rs | 29 +- beacon_node/http_api/src/proposer_duties.rs | 57 ++-- .../src/proto_array_fork_choice.rs | 44 +++ consensus/state_processing/src/all_caches.rs | 7 +- consensus/state_processing/src/epoch_cache.rs | 4 +- .../state_processing/src/upgrade/fulu.rs | 4 +- consensus/types/src/beacon_state.rs | 103 +++++-- consensus/types/src/chain_spec.rs | 22 ++ consensus/types/src/epoch_cache.rs | 10 +- testing/ef_tests/src/cases/fork.rs | 2 +- 19 files changed, 765 insertions(+), 351 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 08e0d1c674..afbf3278fe 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -5,8 +5,9 @@ use crate::attestation_verification::{ }; use crate::attester_cache::{AttesterCache, AttesterCacheKey}; use crate::beacon_block_streamer::{BeaconBlockStreamer, CheckCaches}; -use crate::beacon_proposer_cache::BeaconProposerCache; -use crate::beacon_proposer_cache::compute_proposer_duties_from_head; +use crate::beacon_proposer_cache::{ + BeaconProposerCache, EpochBlockProposers, ensure_state_can_determine_proposers_for_epoch, +}; use crate::blob_verification::{GossipBlobError, GossipVerifiedBlob}; use crate::block_times_cache::BlockTimesCache; use crate::block_verification::POS_PANDA_BANNER; @@ -4698,65 +4699,54 @@ impl BeaconChain { // Compute the proposer index. 
let head_epoch = cached_head.head_slot().epoch(T::EthSpec::slots_per_epoch()); - let shuffling_decision_root = if head_epoch == proposal_epoch { - cached_head - .snapshot - .beacon_state - .proposer_shuffling_decision_root(proposer_head)? - } else { - proposer_head - }; - let cached_proposer = self - .beacon_proposer_cache - .lock() - .get_slot::(shuffling_decision_root, proposal_slot); - let proposer_index = if let Some(proposer) = cached_proposer { - proposer.index as u64 - } else { - if head_epoch + self.config.sync_tolerance_epochs < proposal_epoch { - warn!( - msg = "this is a non-critical issue that can happen on unhealthy nodes or \ - networks.", - %proposal_epoch, - %head_epoch, - "Skipping proposer preparation" - ); + let shuffling_decision_root = cached_head + .snapshot + .beacon_state + .proposer_shuffling_decision_root_at_epoch(proposal_epoch, proposer_head, &self.spec)?; - // Don't skip the head forward more than two epochs. This avoids burdening an - // unhealthy node. - // - // Although this node might miss out on preparing for a proposal, they should still - // be able to propose. This will prioritise beacon chain health over efficient - // packing of execution blocks. - return Ok(None); + let Some(proposer_index) = self.with_proposer_cache( + shuffling_decision_root, + proposal_epoch, + |proposers| proposers.get_slot::(proposal_slot).map(|p| p.index as u64), + || { + if head_epoch + self.config.sync_tolerance_epochs < proposal_epoch { + warn!( + msg = "this is a non-critical issue that can happen on unhealthy nodes or \ + networks", + %proposal_epoch, + %head_epoch, + "Skipping proposer preparation" + ); + + // Don't skip the head forward too many epochs. This avoids burdening an + // unhealthy node. + // + // Although this node might miss out on preparing for a proposal, they should + // still be able to propose. This will prioritise beacon chain health over + // efficient packing of execution blocks. 
+ Err(Error::SkipProposerPreparation) + } else { + let head = self.canonical_head.cached_head(); + Ok(( + head.head_state_root(), + head.snapshot.beacon_state.clone(), + )) + } + }, + ).map_or_else(|e| { + match e { + Error::ProposerCacheIncorrectState { .. } => { + warn!("Head changed during proposer preparation"); + Ok(None) + } + Error::SkipProposerPreparation => { + // Warning logged for this above. + Ok(None) + } + e => Err(e) } - - let (proposers, decision_root, _, fork) = - compute_proposer_duties_from_head(proposal_epoch, self)?; - - let proposer_offset = (proposal_slot % T::EthSpec::slots_per_epoch()).as_usize(); - let proposer = *proposers - .get(proposer_offset) - .ok_or(BeaconChainError::NoProposerForSlot(proposal_slot))?; - - self.beacon_proposer_cache.lock().insert( - proposal_epoch, - decision_root, - proposers, - fork, - )?; - - // It's possible that the head changes whilst computing these duties. If so, abandon - // this routine since the change of head would have also spawned another instance of - // this routine. - // - // Exit now, after updating the cache. - if decision_root != shuffling_decision_root { - warn!("Head changed during proposer preparation"); - return Ok(None); - } - - proposer as u64 + }, |value| Ok(Some(value)))? else { + return Ok(None); }; // Get the `prev_randao` and parent block number. @@ -4916,14 +4906,19 @@ impl BeaconChain { // Only attempt a re-org if we have a proposer registered for the re-org slot. let proposing_at_re_org_slot = { - // The proposer shuffling has the same decision root as the next epoch attestation - // shuffling. We know our re-org block is not on the epoch boundary, so it has the - // same proposer shuffling as the head (but not necessarily the parent which may lie - // in the previous epoch). 
- let shuffling_decision_root = info - .head_node - .next_epoch_shuffling_id - .shuffling_decision_block; + // We know our re-org block is not on the epoch boundary, so it has the same proposer + // shuffling as the head (but not necessarily the parent which may lie in the previous + // epoch). + let shuffling_decision_root = if self + .spec + .fork_name_at_slot::(re_org_block_slot) + .fulu_enabled() + { + info.head_node.current_epoch_shuffling_id + } else { + info.head_node.next_epoch_shuffling_id + } + .shuffling_decision_block; let proposer_index = self .beacon_proposer_cache .lock() @@ -6558,6 +6553,70 @@ impl BeaconChain { } } + pub fn with_proposer_cache + From>( + &self, + shuffling_decision_block: Hash256, + proposal_epoch: Epoch, + accessor: impl Fn(&EpochBlockProposers) -> Result, + state_provider: impl FnOnce() -> Result<(Hash256, BeaconState), E>, + ) -> Result { + let cache_entry = self + .beacon_proposer_cache + .lock() + .get_or_insert_key(proposal_epoch, shuffling_decision_block); + + // If the cache entry is not initialised, run the code to initialise it inside a OnceCell. + // This prevents duplication of work across multiple threads. + // + // If it is already initialised, then `get_or_try_init` will return immediately without + // executing the initialisation code at all. + let epoch_block_proposers = cache_entry.get_or_try_init(|| { + debug!( + ?shuffling_decision_block, + %proposal_epoch, + "Proposer shuffling cache miss" + ); + + // Fetch the state on-demand if the required epoch was missing from the cache. + // If the caller wants to not compute the state they must return an error here and then + // catch it at the call site. + let (state_root, mut state) = state_provider()?; + + // Ensure the state can compute proposer duties for `epoch`. + ensure_state_can_determine_proposers_for_epoch( + &mut state, + state_root, + proposal_epoch, + &self.spec, + )?; + + // Sanity check the state. 
+ let latest_block_root = state.get_latest_block_root(state_root); + let state_decision_block_root = state.proposer_shuffling_decision_root_at_epoch( + proposal_epoch, + latest_block_root, + &self.spec, + )?; + if state_decision_block_root != shuffling_decision_block { + return Err(Error::ProposerCacheIncorrectState { + state_decision_block_root, + requested_decision_block_root: shuffling_decision_block, + } + .into()); + } + + let proposers = state.get_beacon_proposer_indices(proposal_epoch, &self.spec)?; + Ok::<_, E>(EpochBlockProposers::new( + proposal_epoch, + state.fork(), + proposers, + )) + })?; + + // Run the accessor function on the computed epoch proposers. + accessor(epoch_block_proposers).map_err(Into::into) + } + /// Runs the `map_fn` with the committee cache for `shuffling_epoch` from the chain with head /// `head_block_root`. The `map_fn` will be supplied two values: /// diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index 12970214c6..47c44542c0 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -12,9 +12,9 @@ use crate::{BeaconChain, BeaconChainError, BeaconChainTypes}; use fork_choice::ExecutionStatus; use lru::LruCache; use once_cell::sync::OnceCell; +use safe_arith::SafeArith; use smallvec::SmallVec; use state_processing::state_advance::partial_state_advance; -use std::cmp::Ordering; use std::num::NonZeroUsize; use std::sync::Arc; use types::non_zero_usize::new_non_zero_usize; @@ -51,6 +51,34 @@ pub struct EpochBlockProposers { pub(crate) proposers: SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>, } +impl EpochBlockProposers { + pub fn new(epoch: Epoch, fork: Fork, proposers: Vec) -> Self { + Self { + epoch, + fork, + proposers: proposers.into(), + } + } + + pub fn get_slot(&self, slot: Slot) -> Result { + let epoch = slot.epoch(E::slots_per_epoch()); + if epoch == self.epoch { + self.proposers + 
.get(slot.as_usize() % E::SlotsPerEpoch::to_usize()) + .map(|&index| Proposer { + index, + fork: self.fork, + }) + .ok_or(BeaconChainError::ProposerCacheOutOfBounds { slot, epoch }) + } else { + Err(BeaconChainError::ProposerCacheWrongEpoch { + request_epoch: epoch, + cache_epoch: self.epoch, + }) + } + } +} + /// A cache to store the proposers for some epoch. /// /// See the module-level documentation for more information. @@ -76,23 +104,8 @@ impl BeaconProposerCache { ) -> Option { let epoch = slot.epoch(E::slots_per_epoch()); let key = (epoch, shuffling_decision_block); - let cache_opt = self.cache.get(&key).and_then(|cell| cell.get()); - if let Some(cache) = cache_opt { - // This `if` statement is likely unnecessary, but it feels like good practice. - if epoch == cache.epoch { - cache - .proposers - .get(slot.as_usize() % E::SlotsPerEpoch::to_usize()) - .map(|&index| Proposer { - index, - fork: cache.fork, - }) - } else { - None - } - } else { - None - } + let cache = self.cache.get(&key)?.get()?; + cache.get_slot::(slot).ok() } /// As per `Self::get_slot`, but returns all proposers in all slots for the given `epoch`. @@ -142,11 +155,7 @@ impl BeaconProposerCache { ) -> Result<(), BeaconStateError> { let key = (epoch, shuffling_decision_block); if !self.cache.contains(&key) { - let epoch_proposers = EpochBlockProposers { - epoch, - fork, - proposers: proposers.into(), - }; + let epoch_proposers = EpochBlockProposers::new(epoch, fork, proposers); self.cache .put(key, Arc::new(OnceCell::with_value(epoch_proposers))); } @@ -178,7 +187,12 @@ pub fn compute_proposer_duties_from_head( .ok_or(BeaconChainError::HeadMissingFromForkChoice(head_block_root))?; // Advance the state into the requested epoch. 
- ensure_state_is_in_epoch(&mut state, head_state_root, request_epoch, &chain.spec)?; + ensure_state_can_determine_proposers_for_epoch( + &mut state, + head_state_root, + request_epoch, + &chain.spec, + )?; let indices = state .get_beacon_proposer_indices(request_epoch, &chain.spec) @@ -186,13 +200,13 @@ pub fn compute_proposer_duties_from_head( let dependent_root = state // The only block which decides its own shuffling is the genesis block. - .proposer_shuffling_decision_root(chain.genesis_block_root) + .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) .map_err(BeaconChainError::from)?; Ok((indices, dependent_root, execution_status, state.fork())) } -/// If required, advance `state` to `target_epoch`. +/// If required, advance `state` to the epoch required to determine proposer indices in `target_epoch`. /// /// ## Details /// @@ -200,22 +214,33 @@ pub fn compute_proposer_duties_from_head( /// - No-op if `state.current_epoch() == target_epoch`. /// - It must be the case that `state.canonical_root() == state_root`, but this function will not /// check that. -pub fn ensure_state_is_in_epoch( +pub fn ensure_state_can_determine_proposers_for_epoch( state: &mut BeaconState, state_root: Hash256, target_epoch: Epoch, spec: &ChainSpec, ) -> Result<(), BeaconChainError> { - match state.current_epoch().cmp(&target_epoch) { - // Protects against an inconsistent slot clock. - Ordering::Greater => Err(BeaconStateError::SlotOutOfBounds.into()), - // The state needs to be advanced. - Ordering::Less => { - let target_slot = target_epoch.start_slot(E::slots_per_epoch()); - partial_state_advance(state, Some(state_root), target_slot, spec) - .map_err(BeaconChainError::from) - } - // The state is suitable, nothing to do. - Ordering::Equal => Ok(()), + // The decision slot is the end of an epoch, so we add 1 to reach the first slot of the epoch + // at which the shuffling is determined. 
+ let minimum_slot = spec + .proposer_shuffling_decision_slot::(target_epoch) + .safe_add(1)?; + let minimum_epoch = minimum_slot.epoch(E::slots_per_epoch()); + + // Before and after Fulu, the oldest epoch reachable from a state at epoch N is epoch N itself, + // i.e. we can never "look back". + let maximum_epoch = target_epoch; + + if state.current_epoch() > maximum_epoch { + Err(BeaconStateError::SlotOutOfBounds.into()) + } else if state.current_epoch() >= minimum_epoch { + // Fulu allows us to access shufflings in multiple epochs (thanks to lookahead). + // Pre-Fulu we expect `minimum_epoch == maximum_epoch`, and this branch covers that case. + Ok(()) + } else { + // State's current epoch is less than the minimum epoch. + // Advance the state up to the minimum epoch. + partial_state_advance(state, Some(state_root), minimum_slot, spec) + .map_err(BeaconChainError::from) } } diff --git a/beacon_node/beacon_chain/src/blob_verification.rs b/beacon_node/beacon_chain/src/blob_verification.rs index 53676c0b24..53f2eff0ca 100644 --- a/beacon_node/beacon_chain/src/blob_verification.rs +++ b/beacon_node/beacon_chain/src/blob_verification.rs @@ -5,8 +5,7 @@ use std::sync::Arc; use crate::beacon_chain::{BeaconChain, BeaconChainTypes}; use crate::block_verification::{ - BlockSlashInfo, cheap_state_advance_to_obtain_committees, get_validator_pubkey_cache, - process_block_slash_info, + BlockSlashInfo, get_validator_pubkey_cache, process_block_slash_info, }; use crate::kzg_utils::{validate_blob, validate_blobs}; use crate::observed_data_sidecars::{ObservationStrategy, Observe}; @@ -494,59 +493,31 @@ pub fn validate_blob_sidecar_for_gossip(proposer_shuffling_root, blob_slot); - - let (proposer_index, fork) = if let Some(proposer) = proposer_opt { - (proposer.index, proposer.fork) - } else { - debug!( - %block_root, - %blob_index, - "Proposer shuffling cache miss for blob verification" - ); - let (parent_state_root, mut parent_state) = chain - .store - 
.get_advanced_hot_state(block_parent_root, blob_slot, parent_block.state_root) - .map_err(|e| GossipBlobError::BeaconChainError(Box::new(e.into())))? - .ok_or_else(|| { - BeaconChainError::DBInconsistent(format!( - "Missing state for parent block {block_parent_root:?}", - )) - })?; - - let state = cheap_state_advance_to_obtain_committees::<_, GossipBlobError>( - &mut parent_state, - Some(parent_state_root), - blob_slot, - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - let proposer_index = *proposers - .get(blob_slot.as_usize() % T::EthSpec::slots_per_epoch() as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(blob_slot))?; - - // Prime the proposer shuffling cache with the newly-learned value. - chain.beacon_proposer_cache.lock().insert( - blob_epoch, - proposer_shuffling_root, - proposers, - state.fork(), - )?; - (proposer_index, state.fork()) - }; + let proposer = chain.with_proposer_cache( + proposer_shuffling_root, + blob_epoch, + |proposers| proposers.get_slot::(blob_slot), + || { + debug!( + %block_root, + index = %blob_index, + "Proposer shuffling cache miss for blob verification" + ); + chain + .store + .get_advanced_hot_state(block_parent_root, blob_slot, parent_block.state_root) + .map_err(|e| GossipBlobError::BeaconChainError(Box::new(e.into())))? + .ok_or_else(|| { + GossipBlobError::BeaconChainError(Box::new(BeaconChainError::DBInconsistent( + format!("Missing state for parent block {block_parent_root:?}",), + ))) + }) + }, + )?; + let proposer_index = proposer.index; + let fork = proposer.fork; // Signature verify the signed block header. 
let signature_is_valid = { diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 1d10fae0a4..d0ed8258e5 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -948,61 +948,35 @@ impl GossipVerifiedBlock { } let proposer_shuffling_decision_block = - if parent_block.slot.epoch(T::EthSpec::slots_per_epoch()) == block_epoch { - parent_block - .next_epoch_shuffling_id - .shuffling_decision_block - } else { - parent_block.root - }; + parent_block.proposer_shuffling_root_for_child_block(block_epoch, &chain.spec); // We assign to a variable instead of using `if let Some` directly to ensure we drop the // write lock before trying to acquire it again in the `else` clause. - let proposer_opt = chain - .beacon_proposer_cache - .lock() - .get_slot::(proposer_shuffling_decision_block, block.slot()); - let (expected_proposer, fork, parent, block) = if let Some(proposer) = proposer_opt { - // The proposer index was cached and we can return it without needing to load the - // parent. - (proposer.index, proposer.fork, None, block) - } else { - // The proposer index was *not* cached and we must load the parent in order to determine - // the proposer index. - let (mut parent, block) = load_parent(block, chain)?; - - debug!( - parent_root = ?parent.beacon_block_root, - parent_slot = %parent.beacon_block.slot(), - ?block_root, - block_slot = %block.slot(), - "Proposer shuffling cache miss" - ); - - // The state produced is only valid for determining proposer/attester shuffling indices. 
- let state = cheap_state_advance_to_obtain_committees::<_, BlockError>( - &mut parent.pre_state, - parent.beacon_state_root, - block.slot(), - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - let proposer_index = *proposers - .get(block.slot().as_usize() % T::EthSpec::slots_per_epoch() as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(block.slot()))?; - - // Prime the proposer shuffling cache with the newly-learned value. - chain.beacon_proposer_cache.lock().insert( - block_epoch, - proposer_shuffling_decision_block, - proposers, - state.fork(), - )?; - - (proposer_index, state.fork(), Some(parent), block) - }; + let block_slot = block.slot(); + let mut opt_parent = None; + let proposer = chain.with_proposer_cache::<_, BlockError>( + proposer_shuffling_decision_block, + block_epoch, + |proposers| proposers.get_slot::(block_slot), + || { + // The proposer index was *not* cached and we must load the parent in order to + // determine the proposer index. + let (mut parent, _) = load_parent(block.clone(), chain)?; + let parent_state_root = if let Some(state_root) = parent.beacon_state_root { + state_root + } else { + // This is potentially a little inefficient, although we are likely to need + // the state's hash eventually (if the block is valid), and we are also likely + // to already have the hash cached (if fetched from the state cache). + parent.pre_state.canonical_root()? 
+ }; + let parent_state = parent.pre_state.clone(); + opt_parent = Some(parent); + Ok((parent_state_root, parent_state)) + }, + )?; + let expected_proposer = proposer.index; + let fork = proposer.fork; let signature_is_valid = { let pubkey_cache = get_validator_pubkey_cache(chain)?; @@ -1077,7 +1051,7 @@ impl GossipVerifiedBlock { Ok(Self { block, block_root, - parent, + parent: opt_parent, consensus_context, }) } diff --git a/beacon_node/beacon_chain/src/canonical_head.rs b/beacon_node/beacon_chain/src/canonical_head.rs index 78005bf799..cfc7a9637b 100644 --- a/beacon_node/beacon_chain/src/canonical_head.rs +++ b/beacon_node/beacon_chain/src/canonical_head.rs @@ -829,7 +829,7 @@ impl BeaconChain { let head_slot = new_snapshot.beacon_state.slot(); let dependent_root = new_snapshot .beacon_state - .proposer_shuffling_decision_root(self.genesis_block_root); + .attester_shuffling_decision_root(self.genesis_block_root, RelativeEpoch::Next); let prev_dependent_root = new_snapshot .beacon_state .attester_shuffling_decision_root(self.genesis_block_root, RelativeEpoch::Current); diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 608e003a22..600b107c1d 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -1,7 +1,5 @@ -use crate::beacon_proposer_cache::EpochBlockProposers; use crate::block_verification::{ - BlockSlashInfo, cheap_state_advance_to_obtain_committees, get_validator_pubkey_cache, - process_block_slash_info, + BlockSlashInfo, get_validator_pubkey_cache, process_block_slash_info, }; use crate::kzg_utils::{reconstruct_data_columns, validate_data_columns}; use crate::observed_data_sidecars::{ObservationStrategy, Observe}; @@ -641,65 +639,34 @@ fn verify_proposer_and_signature( let block_root = data_column.block_root(); let block_parent_root = data_column.block_parent_root(); - let 
proposer_shuffling_root = if parent_block.slot.epoch(slots_per_epoch) == column_epoch { - parent_block - .next_epoch_shuffling_id - .shuffling_decision_block - } else { - parent_block.root - }; + let proposer_shuffling_root = + parent_block.proposer_shuffling_root_for_child_block(column_epoch, &chain.spec); - // We lock the cache briefly to get or insert a OnceCell, then drop the lock - // before doing proposer shuffling calculation via `OnceCell::get_or_try_init`. This avoids - // holding the lock during the computation, while still ensuring the result is cached and - // initialised only once. - // - // This approach exposes the cache internals (`OnceCell` & `EpochBlockProposers`) - // as a trade-off for avoiding lock contention. - let epoch_proposers_cell = chain - .beacon_proposer_cache - .lock() - .get_or_insert_key(column_epoch, proposer_shuffling_root); - - let epoch_proposers = epoch_proposers_cell.get_or_try_init(move || { - debug!( - %block_root, - index = %column_index, - "Proposer shuffling cache miss for column verification" - ); - let (parent_state_root, mut parent_state) = chain - .store - .get_advanced_hot_state(block_parent_root, column_slot, parent_block.state_root) - .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? - .ok_or_else(|| { - BeaconChainError::DBInconsistent(format!( - "Missing state for parent block {block_parent_root:?}", - )) - })?; - - let state = cheap_state_advance_to_obtain_committees::<_, GossipDataColumnError>( - &mut parent_state, - Some(parent_state_root), - column_slot, - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - // Prime the proposer shuffling cache with the newly-learned value. 
- Ok::<_, GossipDataColumnError>(EpochBlockProposers { - epoch: column_epoch, - fork: state.fork(), - proposers: proposers.into(), - }) - })?; - - let proposer_index = *epoch_proposers - .proposers - .get(column_slot.as_usize() % slots_per_epoch as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(column_slot))?; - - let fork = epoch_proposers.fork; + let proposer = chain.with_proposer_cache( + proposer_shuffling_root, + column_epoch, + |proposers| proposers.get_slot::(column_slot), + || { + debug!( + %block_root, + index = %column_index, + "Proposer shuffling cache miss for column verification" + ); + chain + .store + .get_advanced_hot_state(block_parent_root, column_slot, parent_block.state_root) + .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? + .ok_or_else(|| { + GossipDataColumnError::BeaconChainError(Box::new( + BeaconChainError::DBInconsistent(format!( + "Missing state for parent block {block_parent_root:?}", + )), + )) + }) + }, + )?; + let proposer_index = proposer.index; + let fork = proposer.fork; // Signature verify the signed block header. 
let signature_is_valid = { diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index a1a0ec74f6..7b04a36fae 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -230,6 +230,23 @@ pub enum BeaconChainError { columns_found: usize, }, FailedToReconstructBlobs(String), + ProposerCacheIncorrectState { + state_decision_block_root: Hash256, + requested_decision_block_root: Hash256, + }, + ProposerCacheAccessorFailure { + decision_block_root: Hash256, + proposal_epoch: Epoch, + }, + ProposerCacheOutOfBounds { + slot: Slot, + epoch: Epoch, + }, + ProposerCacheWrongEpoch { + request_epoch: Epoch, + cache_epoch: Epoch, + }, + SkipProposerPreparation, } easy_from_to!(SlotProcessingError, BeaconChainError); diff --git a/beacon_node/beacon_chain/src/validator_monitor.rs b/beacon_node/beacon_chain/src/validator_monitor.rs index 23f1a7d430..00c30e5ab1 100644 --- a/beacon_node/beacon_chain/src/validator_monitor.rs +++ b/beacon_node/beacon_chain/src/validator_monitor.rs @@ -497,7 +497,7 @@ impl ValidatorMonitor { }); // Add missed non-finalized blocks for the monitored validators - self.add_validators_missed_blocks(state); + self.add_validators_missed_blocks(state, spec); self.process_unaggregated_attestations(state, spec); // Update metrics for individual validators. 
@@ -588,7 +588,7 @@ impl ValidatorMonitor { } /// Add missed non-finalized blocks for the monitored validators - fn add_validators_missed_blocks(&mut self, state: &BeaconState) { + fn add_validators_missed_blocks(&mut self, state: &BeaconState, spec: &ChainSpec) { // Define range variables let current_slot = state.slot(); let current_epoch = current_slot.epoch(E::slots_per_epoch()); @@ -616,8 +616,8 @@ impl ValidatorMonitor { if block_root == prev_block_root { let slot_epoch = slot.epoch(E::slots_per_epoch()); - if let Ok(shuffling_decision_block) = - state.proposer_shuffling_decision_root_at_epoch(slot_epoch, *block_root) + if let Ok(shuffling_decision_block) = state + .proposer_shuffling_decision_root_at_epoch(slot_epoch, *block_root, spec) { // Update the cache if it has not yet been initialised, or if it is // initialised for a prior epoch. This is an optimisation to avoid bouncing diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index fbb592b510..efa16978e0 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -1191,6 +1191,271 @@ fn check_shuffling_compatible( } } +/// These tests check the consistency of: +/// +/// - ProtoBlock::proposer_shuffling_root_for_child_block, and +/// - BeaconState::proposer_shuffling_decision_root{_at_epoch} +async fn proposer_shuffling_root_consistency_test(parent_slot: u64, child_slot: u64) { + let child_slot = Slot::new(child_slot); + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .default_spec() + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + // Build chain out to parent block. 
+ let initial_slots: Vec = (1..=parent_slot).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, parent_root, _) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + + // Add the child block. + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, child_root, child_block_state) = harness + .add_attested_blocks_at_slots(state, state_root, &[child_slot], &all_validators) + .await; + + let child_block_epoch = child_slot.epoch(E::slots_per_epoch()); + + // Load parent block from fork choice. + let fc_parent = harness + .chain + .canonical_head + .fork_choice_read_lock() + .get_block(&parent_root.into()) + .unwrap(); + + // The proposer shuffling decision root computed using fork choice should equal the root + // computed from the child state. + let decision_root = fc_parent.proposer_shuffling_root_for_child_block(child_block_epoch, spec); + + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root(child_root.into(), spec) + .unwrap() + ); + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root_at_epoch(child_block_epoch, child_root.into(), spec) + .unwrap() + ); + + // The passed block root argument should be irrelevant for all blocks except the genesis block. 
+ assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root(Hash256::ZERO, spec) + .unwrap() + ); + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root_at_epoch(child_block_epoch, Hash256::ZERO, spec) + .unwrap() + ); +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_same_epoch() { + proposer_shuffling_root_consistency_test(32, 39).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_next_epoch() { + proposer_shuffling_root_consistency_test(32, 47).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_two_epochs() { + proposer_shuffling_root_consistency_test(32, 55).await; +} + +#[tokio::test] +async fn proposer_shuffling_changing_with_lookahead() { + let initial_blocks = E::slots_per_epoch() * 4 - 1; + + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + // Start with some blocks, finishing with one slot before a new epoch. + harness.advance_slot(); + harness + .extend_chain( + initial_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let pre_deposit_state = harness.get_current_state(); + assert_eq!(pre_deposit_state.slot(), initial_blocks); + let topup_block_slot = Slot::new(initial_blocks + 1); + let validator_to_topup_index = 1; + let validator_to_topup = pre_deposit_state + .get_validator(validator_to_topup_index) + .unwrap() + .clone(); + + // Craft a block with a deposit request and consolidation. 
+ // XXX: This is a really nasty way to do this, but we need better test facilities in + // MockExecutionLayer to address this. + let deposit_request: DepositRequest = DepositRequest { + index: pre_deposit_state.eth1_deposit_index(), + pubkey: validator_to_topup.pubkey, + withdrawal_credentials: validator_to_topup.withdrawal_credentials, + amount: 63_000_000_000, + signature: SignatureBytes::empty(), + }; + + let consolidation_request: ConsolidationRequest = ConsolidationRequest { + source_address: validator_to_topup + .get_execution_withdrawal_address(spec) + .unwrap(), + source_pubkey: validator_to_topup.pubkey, + target_pubkey: validator_to_topup.pubkey, + }; + + let execution_requests = ExecutionRequests:: { + deposits: VariableList::new(vec![deposit_request]).unwrap(), + withdrawals: vec![].into(), + consolidations: VariableList::new(vec![consolidation_request]).unwrap(), + }; + + let mut block = Box::pin(harness.make_block_with_modifier( + pre_deposit_state.clone(), + topup_block_slot, + |block| *block.body_mut().execution_requests_mut().unwrap() = execution_requests, + )) + .await + .0; + + let Err(BlockError::StateRootMismatch { + local: true_state_root, + .. + }) = harness + .process_block(topup_block_slot, block.0.canonical_root(), block.clone()) + .await + else { + panic!("state root should not match due to pending deposits changes/etc"); + }; + let mut new_block = block.0.message_fulu().unwrap().clone(); + new_block.state_root = true_state_root; + block.0 = Arc::new(harness.sign_beacon_block(new_block.into(), &pre_deposit_state)); + + harness + .process_block(topup_block_slot, block.0.canonical_root(), block.clone()) + .await + .unwrap(); + + // Advance two epochs to finalize the deposit and process it. + // Start with just a single epoch advance so we can grab the state one epoch prior to where + // we end up. 
+ harness.advance_slot(); + harness + .extend_chain( + E::slots_per_epoch() as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Grab the epoch start state. This is the state from which the proposers at the next epoch were + // computed. + let prev_epoch_state = harness.get_current_state(); + assert_eq!(prev_epoch_state.slot() % E::slots_per_epoch(), 0); + + // The deposit should be pending. + let pending_deposits = prev_epoch_state.pending_deposits().unwrap(); + assert_eq!(pending_deposits.len(), 1, "{pending_deposits:?}"); + + // Advance the 2nd epoch to finalize the deposit and process it. + harness.advance_slot(); + harness + .extend_chain( + E::slots_per_epoch() as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let current_epoch_state = harness.get_current_state(); + assert_eq!(current_epoch_state.slot() % E::slots_per_epoch(), 0); + + // Deposit is processed! + let pending_deposits = current_epoch_state.pending_deposits().unwrap(); + assert_eq!(pending_deposits.len(), 0, "{pending_deposits:?}"); + + let validator = current_epoch_state + .get_validator(validator_to_topup_index) + .unwrap(); + assert!(validator.has_compounding_withdrawal_credential(spec)); + assert_eq!(validator.effective_balance, 95_000_000_000); + + // The shuffling for the current epoch from `prev_epoch_state` should match the shuffling + // for the current epoch from `current_epoch_state` because we should be correctly using the + // stored lookahead. + let current_epoch = current_epoch_state.current_epoch(); + let proposer_shuffling = prev_epoch_state + .get_beacon_proposer_indices(current_epoch, spec) + .unwrap(); + + assert_eq!( + proposer_shuffling, + current_epoch_state + .get_beacon_proposer_indices(current_epoch, spec) + .unwrap() + ); + + // If we bypass the safety checks in `get_proposer_indices`, we should see that the shuffling + // differs due to the effective balance change. 
+    let unsafe_get_proposer_indices = |state: &BeaconState<E>, epoch| -> Vec<usize> {
+        let indices = state.get_active_validator_indices(epoch, spec).unwrap();
+        let preimage = state.get_seed(epoch, Domain::BeaconProposer, spec).unwrap();
+        epoch
+            .slot_iter(E::slots_per_epoch())
+            .map(|slot| {
+                let mut preimage = preimage.to_vec();
+                preimage.append(&mut int_to_bytes::int_to_bytes8(slot.as_u64()));
+                let seed = ethereum_hashing::hash(&preimage);
+                state.compute_proposer_index(&indices, &seed, spec).unwrap()
+            })
+            .collect()
+    };
+
+    // The unsafe function is correct when used with lookahead.
+    assert_eq!(
+        unsafe_get_proposer_indices(&prev_epoch_state, current_epoch),
+        proposer_shuffling
+    );
+
+    // Computing the shuffling for current epoch without lookahead is WRONG.
+    assert_ne!(
+        unsafe_get_proposer_indices(&current_epoch_state, current_epoch),
+        proposer_shuffling,
+    );
+}
+
 // Ensure blocks from abandoned forks are pruned from the Hot DB
 #[tokio::test]
 async fn prunes_abandoned_fork_between_two_finalized_checkpoints() {
diff --git a/beacon_node/beacon_chain/tests/validator_monitor.rs b/beacon_node/beacon_chain/tests/validator_monitor.rs
index 4e2554d3d8..95732abeb5 100644
--- a/beacon_node/beacon_chain/tests/validator_monitor.rs
+++ b/beacon_node/beacon_chain/tests/validator_monitor.rs
@@ -3,7 +3,7 @@ use beacon_chain::test_utils::{
 };
 use beacon_chain::validator_monitor::{MISSED_BLOCK_LAG_SLOTS, ValidatorMonitorConfig};
 use std::sync::LazyLock;
-use types::{Epoch, EthSpec, Keypair, MainnetEthSpec, PublicKeyBytes, Slot};
+use types::{Epoch, EthSpec, Hash256, Keypair, MainnetEthSpec, PublicKeyBytes, Slot};
 
 // Should ideally be divisible by 3.
pub const VALIDATOR_COUNT: usize = 48; @@ -74,7 +74,7 @@ async fn missed_blocks_across_epochs() { .get_hot_state(state_roots_by_slot[&start_slot]) .unwrap(); let decision_root = state - .proposer_shuffling_decision_root(genesis_block_root) + .proposer_shuffling_decision_root(genesis_block_root, &harness.chain.spec) .unwrap(); proposer_shuffling_cache .insert( @@ -152,7 +152,7 @@ async fn missed_blocks_basic() { .unwrap(); let mut missed_block_proposer = validator_indexes[slot_in_epoch.as_usize()]; let mut proposer_shuffling_decision_root = _state - .proposer_shuffling_decision_root(duplicate_block_root) + .proposer_shuffling_decision_root(duplicate_block_root, &harness1.chain.spec) .unwrap(); let beacon_proposer_cache = harness1 @@ -235,17 +235,20 @@ async fn missed_blocks_basic() { // Let's fill the cache with the proposers for the current epoch // and push the duplicate_block_root to the block_roots vector assert_eq!( - beacon_proposer_cache.lock().insert( - epoch, - duplicate_block_root, - validator_indexes.clone(), - _state2.fork() - ), + _state2.set_block_root(prev_slot, duplicate_block_root), Ok(()) ); + let decision_block_root = _state2 + .proposer_shuffling_decision_root_at_epoch(epoch, Hash256::ZERO, &harness2.chain.spec) + .unwrap(); assert_eq!( - _state2.set_block_root(prev_slot, duplicate_block_root), + beacon_proposer_cache.lock().insert( + epoch, + decision_block_root, + validator_indexes.clone(), + _state2.fork() + ), Ok(()) ); @@ -326,7 +329,11 @@ async fn missed_blocks_basic() { .unwrap(); missed_block_proposer = validator_indexes[slot_in_epoch.as_usize()]; proposer_shuffling_decision_root = _state3 - .proposer_shuffling_decision_root_at_epoch(epoch, duplicate_block_root) + .proposer_shuffling_decision_root_at_epoch( + epoch, + duplicate_block_root, + &harness1.chain.spec, + ) .unwrap(); let beacon_proposer_cache = harness3 diff --git a/beacon_node/http_api/src/proposer_duties.rs b/beacon_node/http_api/src/proposer_duties.rs index 
3705c399bd..ceac60cbad 100644 --- a/beacon_node/http_api/src/proposer_duties.rs +++ b/beacon_node/http_api/src/proposer_duties.rs @@ -3,12 +3,13 @@ use crate::state_id::StateId; use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, - beacon_proposer_cache::{compute_proposer_duties_from_head, ensure_state_is_in_epoch}, + beacon_proposer_cache::{ + compute_proposer_duties_from_head, ensure_state_can_determine_proposers_for_epoch, + }, }; use eth2::types::{self as api_types}; use safe_arith::SafeArith; use slot_clock::SlotClock; -use std::cmp::Ordering; use tracing::debug; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -105,36 +106,29 @@ fn try_proposer_duties_from_cache( let head_decision_root = head .snapshot .beacon_state - .proposer_shuffling_decision_root(head_block_root) + .proposer_shuffling_decision_root(head_block_root, &chain.spec) .map_err(warp_utils::reject::beacon_state_error)?; let execution_optimistic = chain .is_optimistic_or_invalid_head_block(head_block) .map_err(warp_utils::reject::unhandled_error)?; - let dependent_root = match head_epoch.cmp(&request_epoch) { - // head_epoch == request_epoch - Ordering::Equal => head_decision_root, - // head_epoch < request_epoch - Ordering::Less => head_block_root, - // head_epoch > request_epoch - Ordering::Greater => { - return Err(warp_utils::reject::custom_server_error(format!( - "head epoch {} is later than request epoch {}", - head_epoch, request_epoch - ))); - } - }; + // This code path can't handle requests for past epochs. 
+ if head_epoch > request_epoch { + return Err(warp_utils::reject::custom_server_error(format!( + "head epoch {head_epoch} is later than request epoch {request_epoch}", + ))); + } chain .beacon_proposer_cache .lock() - .get_epoch::(dependent_root, request_epoch) + .get_epoch::(head_decision_root, request_epoch) .cloned() .map(|indices| { convert_to_api_response( chain, request_epoch, - dependent_root, + head_decision_root, execution_optimistic, indices.to_vec(), ) @@ -204,18 +198,19 @@ fn compute_historic_proposer_duties( } }; - let (state, execution_optimistic) = - if let Some((state_root, mut state, execution_optimistic)) = state_opt { - // If we've loaded the head state it might be from a previous epoch, ensure it's in a - // suitable epoch. - ensure_state_is_in_epoch(&mut state, state_root, epoch, &chain.spec) - .map_err(warp_utils::reject::unhandled_error)?; - (state, execution_optimistic) - } else { - let (state, execution_optimistic, _finalized) = - StateId::from_slot(epoch.start_slot(T::EthSpec::slots_per_epoch())).state(chain)?; - (state, execution_optimistic) - }; + let (state, execution_optimistic) = if let Some((state_root, mut state, execution_optimistic)) = + state_opt + { + // If we've loaded the head state it might be from a previous epoch, ensure it's in a + // suitable epoch. + ensure_state_can_determine_proposers_for_epoch(&mut state, state_root, epoch, &chain.spec) + .map_err(warp_utils::reject::unhandled_error)?; + (state, execution_optimistic) + } else { + let (state, execution_optimistic, _finalized) = + StateId::from_slot(epoch.start_slot(T::EthSpec::slots_per_epoch())).state(chain)?; + (state, execution_optimistic) + }; // Ensure the state lookup was correct. if state.current_epoch() != epoch { @@ -234,7 +229,7 @@ fn compute_historic_proposer_duties( // We can supply the genesis block root as the block root since we know that the only block that // decides its own root is the genesis block. 
let dependent_root = state - .proposer_shuffling_decision_root(chain.genesis_block_root) + .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) .map_err(BeaconChainError::from) .map_err(warp_utils::reject::unhandled_error)?; diff --git a/consensus/proto_array/src/proto_array_fork_choice.rs b/consensus/proto_array/src/proto_array_fork_choice.rs index 4b31dc60bd..8c7b58c4d4 100644 --- a/consensus/proto_array/src/proto_array_fork_choice.rs +++ b/consensus/proto_array/src/proto_array_fork_choice.rs @@ -160,6 +160,50 @@ pub struct Block { pub unrealized_finalized_checkpoint: Option, } +impl Block { + /// Compute the proposer shuffling decision root of a child block in `child_block_epoch`. + /// + /// This function assumes that `child_block_epoch >= self.epoch`. It is the responsibility of + /// the caller to check this condition, or else incorrect results will be produced. + pub fn proposer_shuffling_root_for_child_block( + &self, + child_block_epoch: Epoch, + spec: &ChainSpec, + ) -> Hash256 { + let block_epoch = self.current_epoch_shuffling_id.shuffling_epoch; + + if !spec.fork_name_at_epoch(child_block_epoch).fulu_enabled() { + // Prior to Fulu the proposer shuffling decision root for the current epoch is the same + // as the attestation shuffling for the *next* epoch, i.e. it is determined at the start + // of the current epoch. + if block_epoch == child_block_epoch { + self.next_epoch_shuffling_id.shuffling_decision_block + } else { + // Otherwise, the child block epoch is greater, so its decision root is its parent + // root itself (this block's root). + self.root + } + } else { + // After Fulu the proposer shuffling is determined with lookahead, so if the block + // lies in the same epoch as its parent, its decision root is the same as the + // parent's current epoch attester shuffling + // + // i.e. the block from the end of epoch N - 2. 
+ if child_block_epoch == block_epoch { + self.current_epoch_shuffling_id.shuffling_decision_block + } else if child_block_epoch == block_epoch + 1 { + // If the block is the next epoch, then it instead shares its decision root with + // the parent's *next epoch* attester shuffling. + self.next_epoch_shuffling_id.shuffling_decision_block + } else { + // The child block lies in the future beyond the lookahead, at the point where this + // block (its parent) will be the decision block. + self.root + } + } + } +} + /// A Vec-wrapper which will grow to match any request. /// /// E.g., a `get` or `insert` to an out-of-bounds element will cause the Vec to grow (using diff --git a/consensus/state_processing/src/all_caches.rs b/consensus/state_processing/src/all_caches.rs index d6c4fd3f88..0381bb820f 100644 --- a/consensus/state_processing/src/all_caches.rs +++ b/consensus/state_processing/src/all_caches.rs @@ -1,9 +1,7 @@ use crate::common::update_progressive_balances_cache::initialize_progressive_balances_cache; use crate::epoch_cache::initialize_epoch_cache; use tracing::instrument; -use types::{ - BeaconState, ChainSpec, EpochCacheError, EthSpec, FixedBytesExtended, Hash256, RelativeEpoch, -}; +use types::{BeaconState, ChainSpec, EpochCacheError, EthSpec, Hash256, RelativeEpoch}; /// Mixin trait for the beacon state that provides operations on *all* caches. 
/// @@ -34,8 +32,7 @@ impl AllCaches for BeaconState { fn all_caches_built(&self) -> bool { let current_epoch = self.current_epoch(); - let Ok(epoch_cache_decision_block_root) = - self.proposer_shuffling_decision_root(Hash256::zero()) + let Ok(epoch_cache_decision_block_root) = self.epoch_cache_decision_root(Hash256::ZERO) else { return false; }; diff --git a/consensus/state_processing/src/epoch_cache.rs b/consensus/state_processing/src/epoch_cache.rs index 6654c6a7ef..86db037446 100644 --- a/consensus/state_processing/src/epoch_cache.rs +++ b/consensus/state_processing/src/epoch_cache.rs @@ -123,7 +123,7 @@ pub fn is_epoch_cache_initialized( let current_epoch = state.current_epoch(); let epoch_cache: &EpochCache = state.epoch_cache(); let decision_block_root = state - .proposer_shuffling_decision_root(Hash256::zero()) + .epoch_cache_decision_root(Hash256::zero()) .map_err(EpochCacheError::BeaconState)?; Ok(epoch_cache @@ -146,7 +146,7 @@ pub fn initialize_epoch_cache( let current_epoch = state.current_epoch(); let next_epoch = state.next_epoch().map_err(EpochCacheError::BeaconState)?; let decision_block_root = state - .proposer_shuffling_decision_root(Hash256::zero()) + .epoch_cache_decision_root(Hash256::zero()) .map_err(EpochCacheError::BeaconState)?; state.build_total_active_balance_cache(spec)?; diff --git a/consensus/state_processing/src/upgrade/fulu.rs b/consensus/state_processing/src/upgrade/fulu.rs index 6b038ad73a..c2aced7047 100644 --- a/consensus/state_processing/src/upgrade/fulu.rs +++ b/consensus/state_processing/src/upgrade/fulu.rs @@ -33,9 +33,7 @@ fn initialize_proposer_lookahead( ); } - Vector::new(lookahead).map_err(|e| { - Error::PleaseNotifyTheDevs(format!("Failed to initialize proposer lookahead: {:?}", e)) - }) + Vector::new(lookahead).map_err(|e| e.into()) } pub fn upgrade_state_to_fulu( diff --git a/consensus/types/src/beacon_state.rs b/consensus/types/src/beacon_state.rs index d2efbfe909..0a3d768c59 100644 --- 
a/consensus/types/src/beacon_state.rs +++ b/consensus/types/src/beacon_state.rs @@ -173,7 +173,21 @@ pub enum Error { AggregatorNotInCommittee { aggregator_index: u64, }, - PleaseNotifyTheDevs(String), + ComputeProposerIndicesPastEpoch { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ComputeProposerIndicesInsufficientLookahead { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ComputeProposerIndicesExcessiveLookahead { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ProposerLookaheadOutOfBounds { + i: usize, + }, } /// Control whether an epoch-indexed field can be indexed at the next epoch or not. @@ -886,8 +900,9 @@ impl BeaconState { &self, epoch: Epoch, block_root: Hash256, + spec: &ChainSpec, ) -> Result { - let decision_slot = self.proposer_shuffling_decision_slot(epoch); + let decision_slot = spec.proposer_shuffling_decision_slot::(epoch); if self.slot() <= decision_slot { Ok(block_root) } else { @@ -902,19 +917,18 @@ impl BeaconState { /// /// The `block_root` covers the one-off scenario where the genesis block decides its own /// shuffling. It should be set to the latest block applied to `self` or the genesis block root. - pub fn proposer_shuffling_decision_root(&self, block_root: Hash256) -> Result { - let decision_slot = self.proposer_shuffling_decision_slot(self.current_epoch()); - if self.slot() == decision_slot { - Ok(block_root) - } else { - self.get_block_root(decision_slot).copied() - } + pub fn proposer_shuffling_decision_root( + &self, + block_root: Hash256, + spec: &ChainSpec, + ) -> Result { + self.proposer_shuffling_decision_root_at_epoch(self.current_epoch(), block_root, spec) } - /// Returns the slot at which the proposer shuffling was decided. The block root at this slot - /// can be used to key the proposer shuffling for the given epoch. 
- fn proposer_shuffling_decision_slot(&self, epoch: Epoch) -> Slot { - epoch.start_slot(E::slots_per_epoch()).saturating_sub(1_u64) + pub fn epoch_cache_decision_root(&self, block_root: Hash256) -> Result { + // Epoch cache decision root for the current epoch (N) is the block root at the end of epoch + // N - 1. This is the same as the root that determines the next epoch attester shuffling. + self.attester_shuffling_decision_root(block_root, RelativeEpoch::Next) } /// Returns the block root which decided the attester shuffling for the given `relative_epoch`. @@ -998,6 +1012,45 @@ impl BeaconState { indices: &[usize], spec: &ChainSpec, ) -> Result, Error> { + // Regardless of fork, we never support computing proposer indices for past epochs. + let current_epoch = self.current_epoch(); + if epoch < current_epoch { + return Err(Error::ComputeProposerIndicesPastEpoch { + current_epoch, + request_epoch: epoch, + }); + } + + if spec.fork_name_at_epoch(epoch).fulu_enabled() { + // Post-Fulu we must never compute proposer indices using insufficient lookahead. This + // would be very dangerous as it would lead to conflicts between the *true* proposer as + // defined by `self.proposer_lookahead` and the output of this function. + // With MIN_SEED_LOOKAHEAD=1 (common config), this is equivalent to checking that the + // requested epoch is not the current epoch. + // + // We do not run this check if this function is called from `upgrade_to_fulu`, + // which runs *after* the slot is incremented, and needs to compute the proposer + // shuffling for the epoch that was just transitioned into. + if self.fork_name_unchecked().fulu_enabled() + && epoch < current_epoch.safe_add(spec.min_seed_lookahead)? + { + return Err(Error::ComputeProposerIndicesInsufficientLookahead { + current_epoch, + request_epoch: epoch, + }); + } + } else { + // Pre-Fulu the situation is reversed, we *should not* compute proposer indices using + // too much lookahead. 
To do so would make us vulnerable to changes in the proposer + // indices caused by effective balance changes. + if epoch >= current_epoch.safe_add(spec.min_seed_lookahead)? { + return Err(Error::ComputeProposerIndicesExcessiveLookahead { + current_epoch, + request_epoch: epoch, + }); + } + } + epoch .slot_iter(E::slots_per_epoch()) .map(|slot| { @@ -1146,10 +1199,7 @@ impl BeaconState { let index = slot.as_usize().safe_rem(E::slots_per_epoch() as usize)?; proposer_lookahead .get(index) - .ok_or(Error::PleaseNotifyTheDevs(format!( - "Proposer lookahead out of bounds: {} for slot: {}", - index, slot - ))) + .ok_or(Error::ProposerLookaheadOutOfBounds { i: index }) .map(|index| *index as usize) } else { // Pre-Fulu @@ -1168,6 +1218,25 @@ impl BeaconState { epoch: Epoch, spec: &ChainSpec, ) -> Result, Error> { + // This isn't in the spec, but we remove the footgun that is requesting the current epoch + // for a Fulu state. + if let Ok(proposer_lookahead) = self.proposer_lookahead() + && epoch >= self.current_epoch() + && epoch <= self.next_epoch()? + { + let slots_per_epoch = E::slots_per_epoch() as usize; + let start_offset = if epoch == self.current_epoch() { + 0 + } else { + slots_per_epoch + }; + return Ok(proposer_lookahead + .iter_from(start_offset)? + .take(slots_per_epoch) + .map(|x| *x as usize) + .collect()); + } + // Not using the cached validator indices since they are shuffled. let indices = self.get_active_validator_indices(epoch, spec)?; diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index a1005d904a..6670fff629 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -865,6 +865,28 @@ impl ChainSpec { ) } + /// Returns the slot at which the proposer shuffling was decided. + /// + /// The block root at this slot can be used to key the proposer shuffling for the given epoch. 
+ pub fn proposer_shuffling_decision_slot(&self, epoch: Epoch) -> Slot { + if self.fork_name_at_epoch(epoch).fulu_enabled() { + // Post-Fulu the proposer shuffling decision slot for epoch N is the slot at the end + // of epoch N - 2 (note: min_seed_lookahead=1 in all current configs). + epoch + .saturating_sub(self.min_seed_lookahead) + .start_slot(E::slots_per_epoch()) + .saturating_sub(1_u64) + } else { + // Pre-Fulu the proposer shuffling decision slot for epoch N is the slot at the end of + // epoch N - 1 (note: +1 -1 for min_seed_lookahead=1 in all current configs). + epoch + .saturating_add(Epoch::new(1)) + .saturating_sub(self.min_seed_lookahead) + .start_slot(E::slots_per_epoch()) + .saturating_sub(1_u64) + } + } + /// Returns a `ChainSpec` compatible with the Ethereum Foundation specification. pub fn mainnet() -> Self { Self { diff --git a/consensus/types/src/epoch_cache.rs b/consensus/types/src/epoch_cache.rs index ef91c20d75..9956cb400a 100644 --- a/consensus/types/src/epoch_cache.rs +++ b/consensus/types/src/epoch_cache.rs @@ -5,9 +5,13 @@ use std::sync::Arc; /// Cache of values which are uniquely determined at the start of an epoch. /// /// The values are fixed with respect to the last block of the _prior_ epoch, which we refer -/// to as the "decision block". This cache is very similar to the `BeaconProposerCache` in that -/// beacon proposers are determined at exactly the same time as the values in this cache, so -/// the keys for the two caches are identical. +/// to as the "decision block". +/// +/// Prior to Fulu this cache was similar to the `BeaconProposerCache` in that beacon proposers were +/// determined at exactly the same time as the values in this cache, so the keys for the two caches +/// were identical. +/// +/// Post-Fulu, we use a different key (the proposers have more lookahead). 
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[derive(Debug, PartialEq, Eq, Clone, Default)] pub struct EpochCache { diff --git a/testing/ef_tests/src/cases/fork.rs b/testing/ef_tests/src/cases/fork.rs index 78d802c228..54efb9f9ce 100644 --- a/testing/ef_tests/src/cases/fork.rs +++ b/testing/ef_tests/src/cases/fork.rs @@ -60,7 +60,7 @@ impl Case for ForkTest { fn result(&self, _case_index: usize, fork_name: ForkName) -> Result<(), Error> { let mut result_state = self.pre.clone(); let mut expected = Some(self.post.clone()); - let spec = &E::default_spec(); + let spec = &fork_name.make_genesis_spec(E::default_spec()); let mut result = match fork_name { ForkName::Base => panic!("phase0 not supported"), From edcfee636cd7c32ee63e981ec0487a2798ec6518 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Sat, 27 Sep 2025 21:03:25 -0700 Subject: [PATCH 60/81] Fix bug in fork calculation at fork boundaries (#8121) N/A In #8101 , when we modified the logic to get the proposer index post fulu, we seem to have missed advancing the state at the fork boundaries to get the right `Fork` for signature verification. This led to lighthouse failing all gossip verification right after transitioning to fulu that was observed on the holesky shadow fork ``` Sep 26 14:24:00.088 DEBUG Rejected gossip block error: "InvalidSignature(ProposerSignature)", graffiti: "grandine-geth-super-1", slot: 640 Sep 26 14:24:00.099 WARN Could not verify block for gossip. Rejecting the block error: InvalidSignature(ProposerSignature) ``` I'm not completely sure this is the correct fix, but this fixes the issue with `InvalidProposerSignature` on the holesky shadow fork. 
Thanks to @eserilev for helping debug this Co-Authored-By: Pawan Dhananjay --- beacon_node/beacon_chain/src/beacon_proposer_cache.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index 47c44542c0..a64b4981cc 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -234,8 +234,14 @@ pub fn ensure_state_can_determine_proposers_for_epoch( if state.current_epoch() > maximum_epoch { Err(BeaconStateError::SlotOutOfBounds.into()) } else if state.current_epoch() >= minimum_epoch { - // Fulu allows us to access shufflings in multiple epochs (thanks to lookahead). - // Pre-Fulu we expect `minimum_epoch == maximum_epoch`, and this branch covers that case. + if target_epoch > state.current_epoch() { + let target_slot = target_epoch.start_slot(E::slots_per_epoch()); + + // Advance the state into the same epoch as the block. Use the "partial" method since state + // roots are not important for proposer/attester shuffling. + partial_state_advance(state, Some(state_root), target_slot, spec) + .map_err(BeaconChainError::from)?; + } Ok(()) } else { // State's current epoch is less than the minimum epoch. From 38fdaf791ce7a41590dbf5a4e6694eb1c4621721 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Mon, 29 Sep 2025 11:13:33 +1000 Subject: [PATCH 61/81] Fix proposer shuffling decision slot at boundary (#8128) Follow-up to the bug fixed in: - https://github.com/sigp/lighthouse/pull/8121 This fixes the root cause of that bug, which was introduced by me in: - https://github.com/sigp/lighthouse/pull/8101 Lion identified the issue here: - https://github.com/sigp/lighthouse/pull/8101#discussion_r2382710356 In the methods that compute the proposer shuffling decision root, ensure we don't use lookahead for the Fulu fork epoch itself. 
This is accomplished by checking if Fulu is enabled at `epoch - 1`, i.e. if `epoch > fulu_fork_epoch`. I haven't updated the methods that _compute_ shufflings to use these new corrected bounds (e.g. `BeaconState::compute_proposer_indices`), although we could make this change in future. The `get_beacon_proposer_indices` method already gracefully handles the Fulu boundary case by using the `proposer_lookahead` field (if initialised). Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/tests/store_tests.rs | 57 +++++++++++++++++-- .../src/proto_array_fork_choice.rs | 8 ++- consensus/types/src/chain_spec.rs | 36 +++++++++++- 3 files changed, 93 insertions(+), 8 deletions(-) diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index efa16978e0..cd4032f55d 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -1195,14 +1195,18 @@ fn check_shuffling_compatible( /// /// - ProtoBlock::proposer_shuffling_root_for_child_block, and /// - BeaconState::proposer_shuffling_decision_root{_at_epoch} -async fn proposer_shuffling_root_consistency_test(parent_slot: u64, child_slot: u64) { +async fn proposer_shuffling_root_consistency_test( + spec: ChainSpec, + parent_slot: u64, + child_slot: u64, +) { let child_slot = Slot::new(child_slot); let db_path = tempdir().unwrap(); - let store = get_store(&db_path); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); let validators_keypairs = types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); let harness = TestHarness::builder(MinimalEthSpec) - .default_spec() + .spec(spec.into()) .keypairs(validators_keypairs) .fresh_disk_store(store) .mock_execution_layer() @@ -1268,17 +1272,58 @@ async fn proposer_shuffling_root_consistency_test(parent_slot: u64, child_slot: #[tokio::test] async fn proposer_shuffling_root_consistency_same_epoch() { - 
proposer_shuffling_root_consistency_test(32, 39).await; + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 39).await; } #[tokio::test] async fn proposer_shuffling_root_consistency_next_epoch() { - proposer_shuffling_root_consistency_test(32, 47).await; + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 47).await; } #[tokio::test] async fn proposer_shuffling_root_consistency_two_epochs() { - proposer_shuffling_root_consistency_test(32, 55).await; + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 55).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_at_fork_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + spec.fulu_fork_epoch = Some(Epoch::new(4)); + + // Parent block in epoch prior to Fulu fork epoch, child block in Fulu fork epoch. + proposer_shuffling_root_consistency_test( + spec.clone(), + 3 * E::slots_per_epoch(), + 4 * E::slots_per_epoch(), + ) + .await; + + // Parent block and child block in Fulu fork epoch. + proposer_shuffling_root_consistency_test( + spec.clone(), + 4 * E::slots_per_epoch(), + 4 * E::slots_per_epoch() + 1, + ) + .await; + + // Parent block in Fulu fork epoch and child block in epoch after. + proposer_shuffling_root_consistency_test( + spec.clone(), + 4 * E::slots_per_epoch(), + 5 * E::slots_per_epoch(), + ) + .await; + + // Parent block in epoch prior and child block in epoch after. 
+ proposer_shuffling_root_consistency_test( + spec, + 3 * E::slots_per_epoch(), + 5 * E::slots_per_epoch(), + ) + .await; } #[tokio::test] diff --git a/consensus/proto_array/src/proto_array_fork_choice.rs b/consensus/proto_array/src/proto_array_fork_choice.rs index 8c7b58c4d4..dea853d245 100644 --- a/consensus/proto_array/src/proto_array_fork_choice.rs +++ b/consensus/proto_array/src/proto_array_fork_choice.rs @@ -172,7 +172,13 @@ impl Block { ) -> Hash256 { let block_epoch = self.current_epoch_shuffling_id.shuffling_epoch; - if !spec.fork_name_at_epoch(child_block_epoch).fulu_enabled() { + // For child blocks in the Fulu fork epoch itself, we want to use the old logic. There is no + // lookahead in the first Fulu epoch. So we check whether Fulu is enabled at + // `child_block_epoch - 1`, i.e. whether `child_block_epoch > fulu_fork_epoch`. + if !spec + .fork_name_at_epoch(child_block_epoch.saturating_sub(1_u64)) + .fulu_enabled() + { // Prior to Fulu the proposer shuffling decision root for the current epoch is the same // as the attestation shuffling for the *next* epoch, i.e. it is determined at the start // of the current epoch. diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index 6670fff629..7916e9fcdb 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -869,7 +869,13 @@ impl ChainSpec { /// /// The block root at this slot can be used to key the proposer shuffling for the given epoch. pub fn proposer_shuffling_decision_slot(&self, epoch: Epoch) -> Slot { - if self.fork_name_at_epoch(epoch).fulu_enabled() { + // At the Fulu fork epoch itself, the shuffling is computed "the old way" with no lookahead. + // Therefore for `epoch == fulu_fork_epoch` we must take the `else` branch. Checking if Fulu + // is enabled at `epoch - 1` accomplishes this neatly. 
+ if self + .fork_name_at_epoch(epoch.saturating_sub(1_u64)) + .fulu_enabled() + { // Post-Fulu the proposer shuffling decision slot for epoch N is the slot at the end // of epoch N - 2 (note: min_seed_lookahead=1 in all current configs). epoch @@ -2999,4 +3005,32 @@ mod yaml_tests { spec.min_epoch_data_availability_boundary(current_epoch) ); } + + #[test] + fn proposer_shuffling_decision_root_around_epoch_boundary() { + type E = MainnetEthSpec; + let fulu_fork_epoch = 5; + let spec = { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + spec.fulu_fork_epoch = Some(Epoch::new(fulu_fork_epoch)); + Arc::new(spec) + }; + + // For epochs prior to AND including the Fulu fork epoch, the decision slot is the end + // of the previous epoch (i.e. only 1 slot lookahead). + for epoch in (0..=fulu_fork_epoch).map(Epoch::new) { + assert_eq!( + spec.proposer_shuffling_decision_slot::(epoch), + epoch.start_slot(E::slots_per_epoch()) - 1 + ); + } + + // For epochs after Fulu, the decision slot is the end of the epoch two epochs prior. + for epoch in ((fulu_fork_epoch + 1)..(fulu_fork_epoch + 10)).map(Epoch::new) { + assert_eq!( + spec.proposer_shuffling_decision_slot::(epoch), + (epoch - 1).start_slot(E::slots_per_epoch()) - 1 + ); + } + } } From e5b4983d6baf85770fe4539a565d8a2dd462bc53 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 29 Sep 2025 12:17:30 +1000 Subject: [PATCH 62/81] Release v8.0.0 rc.0 (#8127) Testnet release for the upcoming Fusaka fork. 
Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- Cargo.lock | 8 ++++---- beacon_node/Cargo.toml | 2 +- boot_node/Cargo.toml | 2 +- common/lighthouse_version/src/lib.rs | 6 +++--- lcli/Cargo.toml | 2 +- lighthouse/Cargo.toml | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ee65108097..352ff77975 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -918,7 +918,7 @@ dependencies = [ [[package]] name = "beacon_node" -version = "7.1.0" +version = "8.0.0-rc.0" dependencies = [ "account_utils", "beacon_chain", @@ -1193,7 +1193,7 @@ dependencies = [ [[package]] name = "boot_node" -version = "7.1.0" +version = "8.0.0-rc.0" dependencies = [ "beacon_node", "bytes", @@ -5051,7 +5051,7 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lcli" -version = "7.1.0" +version = "8.0.0-rc.0" dependencies = [ "account_utils", "beacon_chain", @@ -5561,7 +5561,7 @@ dependencies = [ [[package]] name = "lighthouse" -version = "7.1.0" +version = "8.0.0-rc.0" dependencies = [ "account_manager", "account_utils", diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index dd7416af54..bb904a7619 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "beacon_node" -version = "7.1.0" +version = "8.0.0-rc.0" authors = [ "Paul Hauner ", "Age Manning "] edition = { workspace = true } diff --git a/common/lighthouse_version/src/lib.rs b/common/lighthouse_version/src/lib.rs index c45dbac4d3..574fdfea35 100644 --- a/common/lighthouse_version/src/lib.rs +++ b/common/lighthouse_version/src/lib.rs @@ -17,8 +17,8 @@ pub const VERSION: &str = git_version!( // NOTE: using --match instead of --exclude for compatibility with old Git "--match=thiswillnevermatchlol" ], - prefix = "Lighthouse/v7.1.0-", - fallback = "Lighthouse/v7.1.0" + prefix = "Lighthouse/v8.0.0-rc.0-", + fallback = "Lighthouse/v8.0.0-rc.0" ); /// Returns the first eight characters of the latest 
commit hash for this build. @@ -54,7 +54,7 @@ pub fn version_with_platform() -> String { /// /// `1.5.1` pub fn version() -> &'static str { - "7.1.0" + "8.0.0-rc.0" } /// Returns the name of the current client running. diff --git a/lcli/Cargo.toml b/lcli/Cargo.toml index 2eed9da4c0..8f020e0387 100644 --- a/lcli/Cargo.toml +++ b/lcli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "lcli" description = "Lighthouse CLI (modeled after zcli)" -version = "7.1.0" +version = "8.0.0-rc.0" authors = ["Paul Hauner "] edition = { workspace = true } diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index bf8241f8a2..4139286b53 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lighthouse" -version = "7.1.0" +version = "8.0.0-rc.0" authors = ["Sigma Prime "] edition = { workspace = true } autotests = false From 9c6d33110b910572c460b89005b6f47afe81ae84 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Tue, 30 Sep 2025 15:10:42 +1000 Subject: [PATCH 63/81] Update book for DB schema v28 (#8132) Co-Authored-By: Michael Sproul --- book/src/advanced_database_migrations.md | 2 ++ wordlist.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/book/src/advanced_database_migrations.md b/book/src/advanced_database_migrations.md index e29397619c..3552a90b0e 100644 --- a/book/src/advanced_database_migrations.md +++ b/book/src/advanced_database_migrations.md @@ -17,6 +17,7 @@ validator client or the slasher**. | Lighthouse version | Release date | Schema version | Downgrade available? | |--------------------|--------------|----------------|----------------------| +| v8.0.0-rc.0 | Sep 2025 | v28 | yes before Fulu | | v7.1.0 | Jul 2025 | v26 | yes | | v7.0.0 | Apr 2025 | v22 | no | | v6.0.0 | Nov 2024 | v22 | no | @@ -207,6 +208,7 @@ Here are the steps to prune historic states: | Lighthouse version | Release date | Schema version | Downgrade available? 
| |--------------------|--------------|----------------|-------------------------------------| +| v8.0.0-rc.0 | Sep 2025 | v28 | yes before Fulu | | v7.1.0 | Jul 2025 | v26 | yes | | v7.0.0 | Apr 2025 | v22 | no | | v6.0.0 | Nov 2024 | v22 | no | diff --git a/wordlist.txt b/wordlist.txt index 57674cf974..58c4cf6db1 100644 --- a/wordlist.txt +++ b/wordlist.txt @@ -39,6 +39,7 @@ EthStaker Exercism Extractable FFG +Fulu Geth GiB Gitcoin From af5cbfbd4483a6f95f3b257748922e1d60e9951d Mon Sep 17 00:00:00 2001 From: Mac L Date: Tue, 30 Sep 2025 17:42:27 +1000 Subject: [PATCH 64/81] Bump superstruct to `0.10.0` (#8133) Bump `superstruct` to the latest release `0.10.0`. This version uses a later version of `darling` which is helpful for https://github.com/sigp/lighthouse/pull/8125 Co-Authored-By: Mac L --- Cargo.lock | 14 +++++++------- Cargo.toml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 352ff77975..94d0033d4b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2163,7 +2163,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 1.0.109", + "syn 2.0.100", ] [[package]] @@ -5122,7 +5122,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.5", + "windows-targets 0.52.6", ] [[package]] @@ -9065,16 +9065,16 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "superstruct" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0f31f730ad9e579364950e10d6172b4a9bd04b447edf5988b066a860cc340e" +checksum = "3b986e4a629907f20a2c2a639a75bc22a8b5d99b444e0d83c395f4cb309022bf" dependencies = [ - "darling 0.13.4", - "itertools 0.10.5", + "darling 0.20.10", 
+ "itertools 0.13.0", "proc-macro2", "quote", "smallvec", - "syn 1.0.109", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 66378a16c4..e471c4e238 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -242,7 +242,7 @@ ssz_types = "0.11.0" state_processing = { path = "consensus/state_processing" } store = { path = "beacon_node/store" } strum = { version = "0.24", features = ["derive"] } -superstruct = "0.8" +superstruct = "0.10" swap_or_not_shuffle = { path = "consensus/swap_or_not_shuffle" } syn = "1" sysinfo = "0.26" From 26575c594c77942a2014182b8c9a5c6832b7daa0 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Wed, 1 Oct 2025 19:29:15 +1000 Subject: [PATCH 65/81] Improve spec compliance for `/eth/v1/config/spec` API (#8144) - [x] Remove the unnecessary `_MILLIS` suffix from `MAXIMUM_GOSSIP_CLOCK_DISPARITY` - [x] Add missing Deneb preset `KZG_COMMITMENT_INCLUSION_PROOF_DEPTH`, not to be confused with `KZG_COMMITMENTS_INCLUSION_PROOF_DEPTH` (plural) from Fulu... Co-Authored-By: Michael Sproul --- consensus/types/src/chain_spec.rs | 20 ++++++++++---------- consensus/types/src/preset.rs | 3 +++ 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index 7916e9fcdb..50a2f268e0 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -227,7 +227,7 @@ pub struct ChainSpec { pub ttfb_timeout: u64, pub resp_timeout: u64, pub attestation_propagation_slot_range: u64, - pub maximum_gossip_clock_disparity_millis: u64, + pub maximum_gossip_clock_disparity: u64, pub message_domain_invalid_snappy: [u8; 4], pub message_domain_valid_snappy: [u8; 4], pub subnets_per_node: u8, @@ -670,7 +670,7 @@ impl ChainSpec { } pub fn maximum_gossip_clock_disparity(&self) -> Duration { - Duration::from_millis(self.maximum_gossip_clock_disparity_millis) + Duration::from_millis(self.maximum_gossip_clock_disparity) } pub fn ttfb_timeout(&self) -> Duration { @@ -1112,7 
+1112,7 @@ impl ChainSpec { attestation_propagation_slot_range: default_attestation_propagation_slot_range(), attestation_subnet_count: 64, subnets_per_node: 2, - maximum_gossip_clock_disparity_millis: default_maximum_gossip_clock_disparity_millis(), + maximum_gossip_clock_disparity: default_maximum_gossip_clock_disparity(), target_aggregators_per_committee: 16, max_payload_size: default_max_payload_size(), min_epochs_for_block_requests: default_min_epochs_for_block_requests(), @@ -1458,7 +1458,7 @@ impl ChainSpec { attestation_propagation_slot_range: default_attestation_propagation_slot_range(), attestation_subnet_count: 64, subnets_per_node: 4, // Make this larger than usual to avoid network damage - maximum_gossip_clock_disparity_millis: default_maximum_gossip_clock_disparity_millis(), + maximum_gossip_clock_disparity: default_maximum_gossip_clock_disparity(), target_aggregators_per_committee: 16, max_payload_size: default_max_payload_size(), min_epochs_for_block_requests: 33024, @@ -1779,9 +1779,9 @@ pub struct Config { #[serde(default = "default_attestation_propagation_slot_range")] #[serde(with = "serde_utils::quoted_u64")] attestation_propagation_slot_range: u64, - #[serde(default = "default_maximum_gossip_clock_disparity_millis")] + #[serde(default = "default_maximum_gossip_clock_disparity")] #[serde(with = "serde_utils::quoted_u64")] - maximum_gossip_clock_disparity_millis: u64, + maximum_gossip_clock_disparity: u64, #[serde(default = "default_message_domain_invalid_snappy")] #[serde(with = "serde_utils::bytes_4_hex")] message_domain_invalid_snappy: [u8; 4], @@ -1995,7 +1995,7 @@ const fn default_attestation_propagation_slot_range() -> u64 { 32 } -const fn default_maximum_gossip_clock_disparity_millis() -> u64 { +const fn default_maximum_gossip_clock_disparity() -> u64 { 500 } @@ -2214,7 +2214,7 @@ impl Config { ttfb_timeout: spec.ttfb_timeout, resp_timeout: spec.resp_timeout, attestation_propagation_slot_range: spec.attestation_propagation_slot_range, - 
maximum_gossip_clock_disparity_millis: spec.maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity: spec.maximum_gossip_clock_disparity, message_domain_invalid_snappy: spec.message_domain_invalid_snappy, message_domain_valid_snappy: spec.message_domain_valid_snappy, max_request_blocks_deneb: spec.max_request_blocks_deneb, @@ -2302,7 +2302,7 @@ impl Config { message_domain_valid_snappy, max_request_blocks, attestation_propagation_slot_range, - maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity, max_request_blocks_deneb, max_request_blob_sidecars, max_request_data_column_sidecars, @@ -2378,7 +2378,7 @@ impl Config { attestation_subnet_prefix_bits, max_request_blocks, attestation_propagation_slot_range, - maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity, max_request_blocks_deneb, max_request_blob_sidecars, max_request_data_column_sidecars, diff --git a/consensus/types/src/preset.rs b/consensus/types/src/preset.rs index c31183192f..ab54c0345f 100644 --- a/consensus/types/src/preset.rs +++ b/consensus/types/src/preset.rs @@ -208,6 +208,8 @@ pub struct DenebPreset { #[serde(with = "serde_utils::quoted_u64")] pub max_blob_commitments_per_block: u64, #[serde(with = "serde_utils::quoted_u64")] + pub kzg_commitment_inclusion_proof_depth: u64, + #[serde(with = "serde_utils::quoted_u64")] pub field_elements_per_blob: u64, } @@ -215,6 +217,7 @@ impl DenebPreset { pub fn from_chain_spec(_spec: &ChainSpec) -> Self { Self { max_blob_commitments_per_block: E::max_blob_commitments_per_block() as u64, + kzg_commitment_inclusion_proof_depth: E::KzgCommitmentInclusionProofDepth::to_u64(), field_elements_per_blob: E::field_elements_per_blob() as u64, } } From ff8b514b3f012537de8b99cd526d15bdb4610698 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 7 Oct 2025 06:26:37 +1100 Subject: [PATCH 66/81] Remove unnecessary warning logs and update logging levels (#8145) @michaelsproul noticed this warning on a devnet-3 node ``` Oct 
01 16:37:29.896 WARN Error when importing rpc custody columns error: ParentUnknown { parent_root: 0xe4cc85a2137b76eb083d7076255094a90f10caaec0afc8fd36807db742f6ff13 }, block_hash: 0x43ce63b2344990f5f4d8911b8f14e3d3b6b006edc35bbc833360e667df0edef7 ``` We're also seeing similar `WARN` logs for blobs on our live nodes. It's normal to get parent unknown in lookups and it's handled here https://github.com/sigp/lighthouse/blob/a134d43446f776fe2a84f420854afbff76ca93d8/beacon_node/network/src/sync/block_lookups/mod.rs#L611-L619 These shouldn't be a `WARN`, and we also log the same error in block lookups at `DEBUG` level here: https://github.com/sigp/lighthouse/blob/a134d43446f776fe2a84f420854afbff76ca93d8/beacon_node/network/src/sync/block_lookups/mod.rs#L643-L648 So I've removed these extra WARN logs. I've also lowered the level of an `ERROR` log when unable to serve data column root requests - it's unexpected, but is unlikely to impact the node's performance, so I think we can downgrade this. Co-Authored-By: Jimmy Chen --- .../network_beacon_processor/rpc_methods.rs | 6 +++--- .../network_beacon_processor/sync_methods.rs | 19 ++++--------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 9ddba86b81..58e02ffe00 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -437,12 +437,12 @@ impl NetworkBeaconProcessor { } } Err(e) => { - // TODO(das): lower log level when feature is stabilized - error!( + // The node is expected to be able to serve these columns, but it fails to retrieve them.
+ warn!( block_root = ?data_column_ids_by_root.block_root, %peer_id, error = ?e, - "Error getting data column" + "Error getting data column for by root request " ); return Err((RpcErrorResponse::ServerError, "Error getting data column")); } diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index f139724702..1d99540c29 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -333,14 +333,8 @@ impl NetworkBeaconProcessor { "Blobs have already been imported" ); } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - %slot, - "Error when importing rpc blobs" - ); - } + // Errors are handled and logged in `block_lookups` + Err(_) => {} } // Sync handles these results @@ -414,13 +408,8 @@ impl NetworkBeaconProcessor { "Custody columns have already been imported" ); } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - "Error when importing rpc custody columns" - ); - } + // Errors are handled and logged in `block_lookups` + Err(_) => {} } self.send_sync_message(SyncMessage::BlockComponentProcessed { From 4eb89604f8b560876cadb410fdf9b7af08457f48 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 7 Oct 2025 07:32:35 -0700 Subject: [PATCH 67/81] Fulu ASCII art (#8151) Co-Authored-By: Eitan Seri- Levi --- beacon_node/network/src/service.rs | 1 + consensus/types/src/fork_name.rs | 40 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/beacon_node/network/src/service.rs b/beacon_node/network/src/service.rs index c97206ea87..4bd649ba82 100644 --- a/beacon_node/network/src/service.rs +++ b/beacon_node/network/src/service.rs @@ -840,6 +840,7 @@ impl NetworkService { new_fork = ?new_fork_name, "Transitioned to new fork" ); + new_fork_name.fork_ascii(); } fork_context.update_current_fork(*new_fork_name, new_fork_digest, current_epoch); 
diff --git a/consensus/types/src/fork_name.rs b/consensus/types/src/fork_name.rs index f12b14ff6e..363d9e77a2 100644 --- a/consensus/types/src/fork_name.rs +++ b/consensus/types/src/fork_name.rs @@ -201,6 +201,46 @@ impl ForkName { pub fn gloas_enabled(self) -> bool { self >= ForkName::Gloas } + + pub fn fork_ascii(self) { + if self == ForkName::Fulu { + println!( + r#" + ╔═══════════════════════════════════════╗ + ║ ║ + ║ TO FULU, MOAR BLOBS TO ETHEREUM ║ + ║ ║ + ║ III DECEMBER MMXXV ║ + ║ ║ + ╚═══════════════════════════════════════╝ + + ============================================================================= + |||| |||| + |---------------------------------------------------------------------------| + |___-----___-----___-----___-----___-----___-----___-----___-----___-----___| + / _ \===/ _ \ / _ \===/ _ \ / _ \===/ _ \ / _ \===/ _ \ + ( (.\ oOo /.) ) ( (.\ oOo /.) ) ( (.\ oOo /.) ) ( (.\ oOo /.) ) + \__/=====\__/ \__/=====\__/ \__/=====\__/ \__/=====\__/ + ||||||| ||||||| ||||||| ||||||| + ||||||| ||||||| \\/), ||||||| ||||||| + ||||||| ||||||| ,'.' /, ||||||| ||||||| + ||||||| ||||||| (_)- / /, ||||||| ||||||| + ||||||| ||||||| /\_/ |__..--, * ||||||| ||||||| + ||||||| ||||||| (\___/\ \ \ / ).' ||||||| ||||||| + ||||||| ||||||| \____/ / (_ // ||||||| ||||||| + ||||||| ||||||| \\_ ,'--'\_( ||||||| ||||||| + (oOoOo) (oOoOo) )_)_/ )_/ )_) (oOoOo) (oOoOo) + J%%%%%L J%%%%%L (_(_.'(_.'(_.' J%%%%%L J%%%%%L + ZZZZZZZZZ ZZZZZZZZZ ZZZZZZZZZ ZZZZZZZZZ + =========================================================================== + |_________________________________________________________________________| + |___________________________________________________________________________| + |_____________________________________________________________________________| + |_______________________________________________________________________________| + "# + ); + } + } } /// Map a fork name into a fork-versioned superstruct type like `BeaconBlock`. 
From a4ad3e492f420f484ae36871f8bc9217a0518232 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Tue, 7 Oct 2025 07:32:41 -0700 Subject: [PATCH 68/81] Fallback to getPayload v1 if v2 fails (#8163) N/A Post fulu, we should be calling the v2 api on the relays that doesn't return the blobs/data columns. However, we decided to start hitting the v2 api as soon as fulu is scheduled to avoid unexpected surprises at the fork. In the ACDT call, it seems like most clients are calling v2 only after the fulu fork. This PR aims to be the best of both worlds where we fallback to hitting v1 api if v2 fails. This way, we know beforehand if relays don't support it and can potentially alert them. Co-Authored-By: Pawan Dhananjay --- beacon_node/execution_layer/src/lib.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/beacon_node/execution_layer/src/lib.rs b/beacon_node/execution_layer/src/lib.rs index 401646f367..a5fa0f3415 100644 --- a/beacon_node/execution_layer/src/lib.rs +++ b/beacon_node/execution_layer/src/lib.rs @@ -1914,9 +1914,19 @@ impl ExecutionLayer { ) -> Result, Error> { debug!(?block_root, "Sending block to builder"); if spec.is_fulu_scheduled() { - self.post_builder_blinded_blocks_v2(block_root, block) + let resp = self + .post_builder_blinded_blocks_v2(block_root, block) .await - .map(|()| SubmitBlindedBlockResponse::V2) + .map(|()| SubmitBlindedBlockResponse::V2); + // Fallback to v1 if v2 fails because the relay doesn't support it. + // Note: we should remove the fallback post fulu when all relays have support for v2. 
+ if resp.is_err() { + self.post_builder_blinded_blocks_v1(block_root, block) + .await + .map(|full_payload| SubmitBlindedBlockResponse::V1(Box::new(full_payload))) + } else { + resp + } } else { self.post_builder_blinded_blocks_v1(block_root, block) .await From b5c2a9668edb6be72a39d136d333449934b75ac7 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Wed, 8 Oct 2025 11:05:41 +1100 Subject: [PATCH 69/81] Quote `BeaconState::proposer_lookahead` in JSON repr (#8167) Use quoted integers for `state.proposer_lookahead` when serializing JSON. This is standard for all integer fields, but was missed for the newly added proposer lookahead. I noticed this issue while inspecting the head state on a local devnet. I'm glad we found this before someone reported it :P Co-Authored-By: Michael Sproul --- consensus/types/src/beacon_state.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/consensus/types/src/beacon_state.rs b/consensus/types/src/beacon_state.rs index 0a3d768c59..1bd4927fe8 100644 --- a/consensus/types/src/beacon_state.rs +++ b/consensus/types/src/beacon_state.rs @@ -592,6 +592,7 @@ where #[compare_fields(as_iter)] #[test_random(default)] #[superstruct(only(Fulu, Gloas))] + #[serde(with = "ssz_types::serde_utils::quoted_u64_fixed_vec")] pub proposer_lookahead: Vector, // Gloas From 2a433bc4066b949d8c61661d467bb645cc4b6b1e Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 8 Oct 2025 12:52:41 +1100 Subject: [PATCH 70/81] Remove deprecated CLI flags and references for v8.0.0 (#8142) Closes #8131 - [x] Remove deprecated flags from beacon_node/src/cli.rs: - [x] eth1-purge-cache - [x] eth1-blocks-per-log-query - [x] eth1-cache-follow-distance - [x] disable-deposit-contract-sync - [x] light-client-server - [x] Remove deprecated flags from lighthouse/src/main.rs: - [x] logfile - [x] terminal-total-difficulty-override - [x] terminal-block-hash-override - [x] terminal-block-hash-epoch-override - [x] safe-slots-to-import-optimistically - [x] Remove references to 
deprecated flags in config.rs files - [x] Remove warning messages for deprecated flags in main.rs - [x] Update/remove related tests in beacon_node.rs Co-Authored-By: Jimmy Chen Co-Authored-By: Jimmy Chen --- beacon_node/src/cli.rs | 51 -------------------- beacon_node/src/config.rs | 29 ----------- book/src/help_bn.md | 2 - lighthouse/src/main.rs | 73 +--------------------------- lighthouse/tests/beacon_node.rs | 85 --------------------------------- 5 files changed, 1 insertion(+), 239 deletions(-) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 569d1e4ad8..2e3b3fde4b 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -699,38 +699,6 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) - - /* - * Eth1 Integration - */ - .arg( - Arg::new("eth1-purge-cache") - .long("eth1-purge-cache") - .value_name("PURGE-CACHE") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - .help_heading(FLAG_HEADER) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("eth1-blocks-per-log-query") - .long("eth1-blocks-per-log-query") - .value_name("BLOCKS") - .help("DEPRECATED") - .action(ArgAction::Set) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("eth1-cache-follow-distance") - .long("eth1-cache-follow-distance") - .value_name("BLOCKS") - .help("DEPRECATED") - .action(ArgAction::Set) - .display_order(0) - .hide(true) - ) .arg( Arg::new("slots-per-restore-point") .long("slots-per-restore-point") @@ -1498,16 +1466,6 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) - .arg( - Arg::new("disable-deposit-contract-sync") - .long("disable-deposit-contract-sync") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - .help_heading(FLAG_HEADER) - .conflicts_with("staking") - .display_order(0) - .hide(true) - ) .arg( Arg::new("disable-optimistic-finalized-sync") .long("disable-optimistic-finalized-sync") @@ -1518,15 +1476,6 @@ pub fn cli_app() -> Command { Lighthouse and only passed to the EL 
if initial verification fails.") .display_order(0) ) - .arg( - Arg::new("light-client-server") - .long("light-client-server") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - - .help_heading(FLAG_HEADER) - .display_order(0) - ) .arg( Arg::new("disable-light-client-server") .long("disable-light-client-server") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 230350fade..c2599ec0cd 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -170,13 +170,6 @@ pub fn get_config( parse_required(cli_args, "http-duplicate-block-status")?; } - if cli_args.get_flag("light-client-server") { - warn!( - "The --light-client-server flag is deprecated. The light client server is enabled \ - by default" - ); - } - if cli_args.get_flag("disable-light-client-server") { client_config.chain.enable_light_client_server = false; } @@ -262,24 +255,6 @@ pub fn get_config( client_config.http_metrics.allocator_metrics_enabled = false; } - /* - * Deprecated Eth1 flags (can be removed in the next minor release after v7.1.0) - */ - if cli_args - .get_one::("eth1-blocks-per-log-query") - .is_some() - { - warn!("The eth1-blocks-per-log-query flag is deprecated"); - } - - if cli_args.get_flag("eth1-purge-cache") { - warn!("The eth1-purge-cache flag is deprecated"); - } - - if clap_utils::parse_optional::(cli_args, "eth1-cache-follow-distance")?.is_some() { - warn!("The eth1-cache-follow-distance flag is deprecated"); - } - // `--execution-endpoint` is required now. let endpoints: String = clap_utils::parse_required(cli_args, "execution-endpoint")?; let mut el_config = execution_layer::Config::default(); @@ -773,10 +748,6 @@ pub fn get_config( } } - if cli_args.get_flag("disable-deposit-contract-sync") { - warn!("The disable-deposit-contract-sync flag is deprecated"); - } - client_config.chain.prepare_payload_lookahead = clap_utils::parse_optional(cli_args, "prepare-payload-lookahead")? 
.map(Duration::from_millis) diff --git a/book/src/help_bn.md b/book/src/help_bn.md index d5396321f2..6680202a27 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -513,8 +513,6 @@ Flags: subscriptions. This will only import attestations from already-subscribed subnets, use with --subscribe-all-subnets to ensure all attestations are received for import. - --light-client-server - DEPRECATED --log-color [] Enables/Disables colors for logs in terminal. Set it to false to disable colors. [default: true] [possible values: true, false] diff --git a/lighthouse/src/main.rs b/lighthouse/src/main.rs index 8660074e91..c93016a0f5 100644 --- a/lighthouse/src/main.rs +++ b/lighthouse/src/main.rs @@ -28,7 +28,7 @@ use std::path::PathBuf; use std::process::exit; use std::sync::LazyLock; use task_executor::ShutdownReason; -use tracing::{Level, info, warn}; +use tracing::{Level, info}; use tracing_subscriber::{Layer, filter::EnvFilter, layer::SubscriberExt, util::SubscriberInitExt}; use types::{EthSpec, EthSpecId}; use validator_client::ProductionValidatorClient; @@ -126,16 +126,6 @@ fn main() { .global(true) .display_order(0), ) - .arg( - Arg::new("logfile") - .long("logfile") - .value_name("PATH") - .help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .hide(true) - .display_order(0) - ) .arg( Arg::new("logfile-dir") .long("logfile-dir") @@ -385,48 +375,6 @@ fn main() { .global(true) .display_order(0) ) - .arg( - Arg::new("terminal-total-difficulty-override") - .long("terminal-total-difficulty-override") - .value_name("INTEGER") - .help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("terminal-block-hash-override") - .long("terminal-block-hash-override") - .value_name("TERMINAL_BLOCK_HASH") - .help("DEPRECATED") - .requires("terminal-block-hash-epoch-override") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("terminal-block-hash-epoch-override") - 
.long("terminal-block-hash-epoch-override") - .value_name("EPOCH") - .help("DEPRECATED") - .requires("terminal-block-hash-override") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("safe-slots-to-import-optimistically") - .long("safe-slots-to-import-optimistically") - .value_name("INTEGER") - .help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) .arg( Arg::new("genesis-state-url") .long("genesis-state-url") @@ -780,11 +728,6 @@ fn run( // Allow Prometheus access to the version and commit of the Lighthouse build. metrics::expose_lighthouse_version(); - // DEPRECATED: can be removed in v7.2.0/v8.0.0. - if clap_utils::parse_optional::(matches, "logfile")?.is_some() { - warn!("The --logfile flag is deprecated and replaced by --logfile-dir"); - } - #[cfg(all(feature = "modern", target_arch = "x86_64"))] if !std::is_x86_feature_detected!("adx") { tracing::warn!( @@ -793,20 +736,6 @@ fn run( ); } - // Warn for DEPRECATED global flags. This code should be removed when we finish deleting these - // flags. - let deprecated_flags = [ - "terminal-total-difficulty-override", - "terminal-block-hash-override", - "terminal-block-hash-epoch-override", - "safe-slots-to-import-optimistically", - ]; - for flag in deprecated_flags { - if matches.get_one::(flag).is_some() { - warn!("The {} flag is deprecated and does nothing", flag); - } - } - // Note: the current code technically allows for starting a beacon node _and_ a validator // client at the same time. // diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 8f6d040b62..5a057d7d7f 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -423,29 +423,6 @@ fn complete_blob_backfill_and_prune_blobs_true() { }); } -// Tests for Eth1 flags. 
-// DEPRECATED but should not crash -#[test] -fn eth1_blocks_per_log_query_flag() { - CommandLineTest::new() - .flag("eth1-blocks-per-log-query", Some("500")) - .run_with_zero_port(); -} -// DEPRECATED but should not crash -#[test] -fn eth1_purge_cache_flag() { - CommandLineTest::new() - .flag("eth1-purge-cache", None) - .run_with_zero_port(); -} -// DEPRECATED but should not crash -#[test] -fn eth1_cache_follow_distance_manual() { - CommandLineTest::new() - .flag("eth1-cache-follow-distance", Some("128")) - .run_with_zero_port(); -} - // Tests for Bellatrix flags. fn run_bellatrix_execution_endpoints_flag_test(flag: &str) { use sensitive_url::SensitiveUrl; @@ -781,31 +758,6 @@ fn jwt_optional_flags() { fn jwt_optional_alias_flags() { run_jwt_optional_flags_test("jwt-secrets", "jwt-id", "jwt-version"); } -// DEPRECATED. This flag is deprecated but should not cause a crash. -#[test] -fn terminal_total_difficulty_override_flag() { - CommandLineTest::new() - .flag("terminal-total-difficulty-override", Some("1337424242")) - .run_with_zero_port(); -} -// DEPRECATED. This flag is deprecated but should not cause a crash. -#[test] -fn terminal_block_hash_and_activation_epoch_override_flags() { - CommandLineTest::new() - .flag("terminal-block-hash-epoch-override", Some("1337")) - .flag( - "terminal-block-hash-override", - Some("0x4242424242424242424242424242424242424242424242424242424242424242"), - ) - .run_with_zero_port(); -} -// DEPRECATED. This flag is deprecated but should not cause a crash. -#[test] -fn safe_slots_to_import_optimistically_flag() { - CommandLineTest::new() - .flag("safe-slots-to-import-optimistically", Some("421337")) - .run_with_zero_port(); -} // Tests for Network flags. #[test] @@ -2523,42 +2475,6 @@ fn logfile_format_flag() { ) }); } -// DEPRECATED but should not crash. -#[test] -fn deprecated_logfile() { - CommandLineTest::new() - .flag("logfile", Some("test.txt")) - .run_with_zero_port(); -} - -// DEPRECATED but should not crash. 
-#[test] -fn sync_eth1_chain_disable_deposit_contract_sync_flag() { - let dir = TempDir::new().expect("Unable to create temporary directory"); - CommandLineTest::new_with_no_execution_endpoint() - .flag("disable-deposit-contract-sync", None) - .flag("execution-endpoints", Some("http://localhost:8551/")) - .flag( - "execution-jwt", - dir.path().join("jwt-file").as_os_str().to_str(), - ) - .run_with_zero_port(); -} - -#[test] -#[should_panic] -fn disable_deposit_contract_sync_conflicts_with_staking() { - let dir = TempDir::new().expect("Unable to create temporary directory"); - CommandLineTest::new_with_no_execution_endpoint() - .flag("disable-deposit-contract-sync", None) - .flag("staking", None) - .flag("execution-endpoints", Some("http://localhost:8551/")) - .flag( - "execution-jwt", - dir.path().join("jwt-file").as_os_str().to_str(), - ) - .run_with_zero_port(); -} #[test] fn light_client_server_default() { @@ -2573,7 +2489,6 @@ fn light_client_server_default() { #[test] fn light_client_server_enabled() { CommandLineTest::new() - .flag("light-client-server", None) .run_with_zero_port() .with_config(|config| { assert!(config.network.enable_light_client_server); From 13dfa9200f822c41ccd81b95a3f052df54c888e9 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Wed, 8 Oct 2025 17:09:12 +1100 Subject: [PATCH 71/81] Block proposal optimisations (#8156) Closes: - https://github.com/sigp/lighthouse/issues/4412 This should reduce Lighthouse's block proposal times on Holesky and prevent us getting reorged. - [x] Allow the head state to be advanced further than 1 slot. This lets us avoid epoch processing on hot paths including block production, by having new epoch boundaries pre-computed and available in the state cache. - [x] Use the finalized state to prune the op pool. We were previously using the head state and trying to infer slashing/exit relevance based on `exit_epoch`. However some exit epochs are far in the future, despite occurring recently. 
Co-Authored-By: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 4 ++ .../beacon_chain/src/canonical_head.rs | 30 ++++++---- .../beacon_chain/src/state_advance_timer.rs | 36 +----------- beacon_node/operation_pool/src/lib.rs | 55 +++++++++---------- 4 files changed, 53 insertions(+), 72 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index afbf3278fe..f085684442 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -5233,16 +5233,20 @@ impl BeaconChain { None }; + let slashings_and_exits_span = debug_span!("get_slashings_and_exits").entered(); let (mut proposer_slashings, mut attester_slashings, mut voluntary_exits) = self.op_pool.get_slashings_and_exits(&state, &self.spec); + drop(slashings_and_exits_span); let eth1_data = state.eth1_data().clone(); let deposits = vec![]; + let bls_changes_span = debug_span!("get_bls_to_execution_changes").entered(); let bls_to_execution_changes = self .op_pool .get_bls_to_execution_changes(&state, &self.spec); + drop(bls_changes_span); // Iterate through the naive aggregation pool and ensure all the attestations from there // are included in the operation pool. diff --git a/beacon_node/beacon_chain/src/canonical_head.rs b/beacon_node/beacon_chain/src/canonical_head.rs index cfc7a9637b..7dd4c88c51 100644 --- a/beacon_node/beacon_chain/src/canonical_head.rs +++ b/beacon_node/beacon_chain/src/canonical_head.rs @@ -937,13 +937,6 @@ impl BeaconChain { .execution_status .is_optimistic_or_invalid(); - self.op_pool.prune_all( - &new_snapshot.beacon_block, - &new_snapshot.beacon_state, - self.epoch()?, - &self.spec, - ); - self.observed_block_producers.write().prune( new_view .finalized_checkpoint @@ -982,9 +975,9 @@ impl BeaconChain { })); } - // The store migration task requires the *state at the slot of the finalized epoch*, - // rather than the state of the latest finalized block. 
These two values will only - // differ when the first slot of the finalized epoch is a skip slot. + // The store migration task and op pool pruning require the *state at the first slot of the + // finalized epoch*, rather than the state of the latest finalized block. These two values + // will only differ when the first slot of the finalized epoch is a skip slot. // // Use the `StateRootsIterator` directly rather than `BeaconChain::state_root_at_slot` // to ensure we use the same state that we just set as the head. @@ -1006,6 +999,23 @@ impl BeaconChain { )? .ok_or(Error::MissingFinalizedStateRoot(new_finalized_slot))?; + let update_cache = true; + let new_finalized_state = self + .store + .get_hot_state(&new_finalized_state_root, update_cache)? + .ok_or(Error::MissingBeaconState(new_finalized_state_root))?; + + self.op_pool.prune_all( + &new_snapshot.beacon_block, + &new_snapshot.beacon_state, + &new_finalized_state, + self.epoch()?, + &self.spec, + ); + + // We just pass the state root to the finalization thread. It should be able to reload the + // state from the state_cache near instantly anyway. We could experiment with sending the + // state over a channel in future, but it's probably no quicker. self.store_migrator.process_finalization( new_finalized_state_root.into(), new_view.finalized_checkpoint, diff --git a/beacon_node/beacon_chain/src/state_advance_timer.rs b/beacon_node/beacon_chain/src/state_advance_timer.rs index 27c2c7c0a1..87348cb01b 100644 --- a/beacon_node/beacon_chain/src/state_advance_timer.rs +++ b/beacon_node/beacon_chain/src/state_advance_timer.rs @@ -33,7 +33,7 @@ use types::{AttestationShufflingId, BeaconStateError, EthSpec, Hash256, Relative /// /// This avoids doing unnecessary work whilst the node is syncing or has perhaps been put to sleep /// for some period of time. -const MAX_ADVANCE_DISTANCE: u64 = 4; +const MAX_ADVANCE_DISTANCE: u64 = 256; /// Similarly for fork choice: avoid the fork choice lookahead during sync. 
/// @@ -49,17 +49,7 @@ enum Error { HeadMissingFromSnapshotCache(#[allow(dead_code)] Hash256), BeaconState(#[allow(dead_code)] BeaconStateError), Store(#[allow(dead_code)] store::Error), - MaxDistanceExceeded { - current_slot: Slot, - head_slot: Slot, - }, - StateAlreadyAdvanced { - block_root: Hash256, - }, - BadStateSlot { - _state_slot: Slot, - _block_slot: Slot, - }, + MaxDistanceExceeded { current_slot: Slot, head_slot: Slot }, } impl From for Error { @@ -180,9 +170,6 @@ async fn state_advance_timer( error = ?e, "Failed to advance head state" ), - Err(Error::StateAlreadyAdvanced { block_root }) => { - debug!(?block_root, "State already advanced on slot") - } Err(Error::MaxDistanceExceeded { current_slot, head_slot, @@ -295,25 +282,6 @@ fn advance_head(beacon_chain: &Arc>) -> Resu .get_advanced_hot_state(head_block_root, current_slot, head_block_state_root)? .ok_or(Error::HeadMissingFromSnapshotCache(head_block_root))?; - // Protect against advancing a state more than a single slot. - // - // Advancing more than one slot without storing the intermediate state would corrupt the - // database. Future works might store intermediate states inside this function. - match state.slot().cmp(&state.latest_block_header().slot) { - std::cmp::Ordering::Equal => (), - std::cmp::Ordering::Greater => { - return Err(Error::StateAlreadyAdvanced { - block_root: head_block_root, - }); - } - std::cmp::Ordering::Less => { - return Err(Error::BadStateSlot { - _block_slot: state.latest_block_header().slot, - _state_slot: state.slot(), - }); - } - } - let initial_slot = state.slot(); let initial_epoch = state.current_epoch(); diff --git a/beacon_node/operation_pool/src/lib.rs b/beacon_node/operation_pool/src/lib.rs index dd01f568fa..24e2cfbbb5 100644 --- a/beacon_node/operation_pool/src/lib.rs +++ b/beacon_node/operation_pool/src/lib.rs @@ -457,32 +457,35 @@ impl OperationPool { .collect() } - /// Prune proposer slashings for validators which are exited in the finalized epoch. 
- pub fn prune_proposer_slashings(&self, head_state: &BeaconState) { + /// Prune proposer slashings for validators which are already slashed or exited in the finalized + /// epoch. + pub fn prune_proposer_slashings(&self, finalized_state: &BeaconState) { prune_validator_hash_map( &mut self.proposer_slashings.write(), - |_, validator| validator.exit_epoch <= head_state.finalized_checkpoint().epoch, - head_state, + |_, validator| { + validator.slashed || validator.exit_epoch <= finalized_state.current_epoch() + }, + finalized_state, ); } /// Prune attester slashings for all slashed or withdrawn validators, or attestations on another /// fork. - pub fn prune_attester_slashings(&self, head_state: &BeaconState) { + pub fn prune_attester_slashings(&self, finalized_state: &BeaconState) { self.attester_slashings.write().retain(|slashing| { // Check that the attestation's signature is still valid wrt the fork version. - let signature_ok = slashing.signature_is_still_valid(&head_state.fork()); + // We might be a bit slower to detect signature staleness by using the finalized state + // here, but we filter when proposing anyway, so in the worst case we just keep some + // stuff around until we finalize. + let signature_ok = slashing.signature_is_still_valid(&finalized_state.fork()); // Slashings that don't slash any validators can also be dropped. let slashing_ok = get_slashable_indices_modular( - head_state, + finalized_state, slashing.as_inner().to_ref(), |_, validator| { - // Declare that a validator is still slashable if they have not exited prior - // to the finalized epoch. - // - // We cannot check the `slashed` field since the `head` is not finalized and - // a fork could un-slash someone. - validator.exit_epoch > head_state.finalized_checkpoint().epoch + // Declare that a validator is still slashable if they have not been slashed in + // the finalized state, and have not exited at the finalized epoch. 
+ !validator.slashed && validator.exit_epoch > finalized_state.current_epoch() }, ) .is_ok_and(|indices| !indices.is_empty()); @@ -531,17 +534,12 @@ impl OperationPool { ) } - /// Prune if validator has already exited at or before the finalized checkpoint of the head. - pub fn prune_voluntary_exits(&self, head_state: &BeaconState) { + /// Prune if validator has already exited in the finalized state. + pub fn prune_voluntary_exits(&self, finalized_state: &BeaconState, spec: &ChainSpec) { prune_validator_hash_map( &mut self.voluntary_exits.write(), - // This condition is slightly too loose, since there will be some finalized exits that - // are missed here. - // - // We choose simplicity over the gain of pruning more exits since they are small and - // should not be seen frequently. - |_, validator| validator.exit_epoch <= head_state.finalized_checkpoint().epoch, - head_state, + |_, validator| validator.exit_epoch != spec.far_future_epoch, + finalized_state, ); } @@ -642,14 +640,15 @@ impl OperationPool { &self, head_block: &SignedBeaconBlock, head_state: &BeaconState, + finalized_state: &BeaconState, current_epoch: Epoch, spec: &ChainSpec, ) { self.prune_attestations(current_epoch); self.prune_sync_contributions(head_state.slot()); - self.prune_proposer_slashings(head_state); - self.prune_attester_slashings(head_state); - self.prune_voluntary_exits(head_state); + self.prune_proposer_slashings(finalized_state); + self.prune_attester_slashings(finalized_state); + self.prune_voluntary_exits(finalized_state, spec); self.prune_bls_to_execution_changes(head_block, head_state, spec); } @@ -758,14 +757,14 @@ where fn prune_validator_hash_map( map: &mut HashMap>, prune_if: F, - head_state: &BeaconState, + state: &BeaconState, ) where F: Fn(u64, &Validator) -> bool, T: VerifyOperation, { map.retain(|&validator_index, op| { - op.signature_is_still_valid(&head_state.fork()) - && head_state + op.signature_is_still_valid(&state.fork()) + && state .validators() 
.get(validator_index as usize) .is_none_or(|validator| !prune_if(validator_index, validator)) From 8e382ceed9ae17a22a8f4e0a1b518194d2783592 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Wed, 8 Oct 2025 18:47:05 -0700 Subject: [PATCH 72/81] Bump kzg library versions (#8174) N/A Update c-kzg and rust-eth-kzg to their latest versions. Also removes the patch version hardcoding in Cargo.toml. Co-Authored-By: Pawan Dhananjay --- Cargo.lock | 53 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 4 ++-- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94d0033d4b..481d204865 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1165,9 +1165,9 @@ dependencies = [ [[package]] name = "blst" -version = "0.3.15" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fd49896f12ac9b6dcd7a5998466b9b58263a695a3dd1ecc1aaca2e12a90b080" +checksum = "dcdb4c7013139a150f9fc55d123186dbfaba0d912817466282c73ac49e71fb45" dependencies = [ "cc", "glob", @@ -1296,11 +1296,10 @@ dependencies = [ [[package]] name = "c-kzg" -version = "2.1.0" +version = "2.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7e3c397401eb76228c89561cf22f85f41c95aa799ee9d860de3ea1cbc728fc" +checksum = "e00bf4b112b07b505472dbefd19e37e53307e2bfed5a79e0cc161d58ccd0e687" dependencies = [ - "arbitrary", "blst", "cc", "glob", @@ -2163,7 +2162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 2.0.100", + "syn 1.0.109", ] [[package]] @@ -2565,9 +2564,9 @@ dependencies = [ [[package]] name = "eip4844" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa86cda6af15a9a5e4cf680850addaee8cd427be95be3ec9d022b9d7b98a66c0" +checksum = "82ab45fc63db6bbe5c3eb7c79303b2aff7ee529c991b2111c46879d1ea38407e" 
dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2590,9 +2589,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "ekzg-bls12-381" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f0e00a7689af7f4f17e85ae07f5a92b568a47297a165f685b828edfd82e02b" +checksum = "05c599a59deba6188afd9f783507e4d89efc997f0fa340a758f0d0992b322416" dependencies = [ "blst", "blstrs", @@ -2604,9 +2603,9 @@ dependencies = [ [[package]] name = "ekzg-erasure-codes" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bfc7ab684a7bb0c5ee37fd6a73da7425858cdd28f4a285c70361f001d6d0efc" +checksum = "8474a41a30ddd2b651798b1aa9ce92011207c3667186fe9044184683250109e7" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2614,15 +2613,15 @@ dependencies = [ [[package]] name = "ekzg-maybe-rayon" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e0a4876a612b9317be470768e134b671b8e645e412a82eb12fdd9b1958fa6f9" +checksum = "9cf94d1385185c1f7caef4973be49702c7d9ffdeaf832d126dbb9ed6efe09d40" [[package]] name = "ekzg-multi-open" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f7964754aa0921aaa89b1589100e4cae9b31f87f137eeb0af5403fdfca68bfc" +checksum = "e6d37456a32cf79bdbddd6685a2adec73210e2d60332370bc0e9a502b6d93beb" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2632,9 +2631,9 @@ dependencies = [ [[package]] name = "ekzg-polynomial" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed36d2ddf86661c9d18e9d5dfc47dce6c9b6e44db385e2da71952b10ba32df1" +checksum = "704751bac85af4754bb8a14457ef24d820738062d0b6f3763534d0980b1a1e81" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2642,9 +2641,9 @@ 
dependencies = [ [[package]] name = "ekzg-serialization" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c83402d591ac3534d1ae654feb8f56ee64cc2bacfe80bece7977c24ca5e72e2" +checksum = "3cb983d9f75b2804c00246def8d52c01cf05f70c22593b8d314fbcf0cf89042b" dependencies = [ "ekzg-bls12-381", "hex", @@ -2652,9 +2651,9 @@ dependencies = [ [[package]] name = "ekzg-single-open" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05e1dbb13023ccebbb24593e4753c87f77b7fb78254a20aef1a028e979145092" +checksum = "799d5806d51e1453fa0f528d6acf4127e2a89e98312c826151ebc24ee3448ec3" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2663,9 +2662,9 @@ dependencies = [ [[package]] name = "ekzg-trusted-setup" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1cb3e907b27fa51f35def95eeabe47e97765e2b6bac7e55967500937f94282" +checksum = "85314d56718dc2c6dd77c3b3630f1839defcb6f47d9c20195608a0f7976095ab" dependencies = [ "ekzg-bls12-381", "ekzg-serialization", @@ -7375,7 +7374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.100", @@ -8013,9 +8012,9 @@ dependencies = [ [[package]] name = "rust_eth_kzg" -version = "0.9.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dc46814bb8e72bff20fe117db43b7455112e6fafdae7466f8f24d451ad773c0" +checksum = "1522b7a740cd7f5bc52ea49863618511c8de138dcdf3f8a80b15b3f764942a5b" dependencies = [ "eip4844", "ekzg-bls12-381", diff --git a/Cargo.toml b/Cargo.toml index e471c4e238..a5f01a498d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -117,7 +117,7 @@ byteorder = "1" bytes = "1" # Turn off c-kzg's default features 
which include `blst/portable`. We can turn on blst's portable # feature ourselves when desired. -c-kzg = { version = "2.1.0", default-features = false } +c-kzg = { version = "2.1", default-features = false } cargo_metadata = "0.19" clap = { version = "4.5.4", features = ["derive", "cargo", "wrap_help"] } clap_utils = { path = "common/clap_utils" } @@ -224,7 +224,7 @@ reqwest = { version = "0.11", default-features = false, features = [ ring = "0.17" rpds = "0.11" rusqlite = { version = "0.28", features = ["bundled"] } -rust_eth_kzg = "0.9.0" +rust_eth_kzg = "0.9" safe_arith = { path = "consensus/safe_arith" } sensitive_url = { path = "common/sensitive_url" } serde = { version = "1", features = ["derive"] } From 3110ca325b6314003f52c0ee71f33877d7be371a Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:01:30 +0800 Subject: [PATCH 73/81] Implement `/eth/v1/beacon/blobs` endpoint (#8103) * #8085 Co-Authored-By: Tan Chee Keong Co-Authored-By: chonghe <44791194+chong-he@users.noreply.github.com> --- beacon_node/beacon_chain/src/kzg_utils.rs | 32 +++-- beacon_node/client/src/builder.rs | 2 +- beacon_node/http_api/src/block_id.rs | 71 ++++++++++- beacon_node/http_api/src/lib.rs | 50 +++++++- beacon_node/http_api/tests/tests.rs | 147 +++++++++++++++++++--- common/eth2/src/lib.rs | 42 ++++++- common/eth2/src/types.rs | 15 +++ consensus/types/src/beacon_response.rs | 24 +++- lcli/src/http_sync.rs | 2 +- testing/simulator/src/checks.rs | 2 +- 10 files changed, 346 insertions(+), 41 deletions(-) diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index ad669e1729..382775ab50 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -299,6 +299,8 @@ pub(crate) fn build_data_column_sidecars( /// /// If `blob_indices_opt` is `None`, this function attempts to reconstruct all blobs associated /// with the block. 
+/// This function does NOT use rayon as this is primarily used by a non critical path in HTTP API +/// and it will be slow if the node needs to reconstruct the blobs pub fn reconstruct_blobs( kzg: &Kzg, data_columns: &[Arc>], @@ -320,7 +322,7 @@ pub fn reconstruct_blobs( }; let blob_sidecars = blob_indices - .into_par_iter() + .into_iter() .map(|row_index| { let mut cells: Vec = vec![]; let mut cell_ids: Vec = vec![]; @@ -337,16 +339,26 @@ pub fn reconstruct_blobs( cell_ids.push(data_column.index); } - let (cells, _kzg_proofs) = kzg - .recover_cells_and_compute_kzg_proofs(&cell_ids, &cells) - .map_err(|e| format!("Failed to recover cells and compute KZG proofs: {e:?}"))?; + let num_cells_original_blob = E::number_of_columns() / 2; + let blob_bytes = if data_columns.len() < E::number_of_columns() { + let (recovered_cells, _kzg_proofs) = kzg + .recover_cells_and_compute_kzg_proofs(&cell_ids, &cells) + .map_err(|e| { + format!("Failed to recover cells and compute KZG proofs: {e:?}") + })?; - let num_cells_original_blob = cells.len() / 2; - let blob_bytes = cells - .into_iter() - .take(num_cells_original_blob) - .flat_map(|cell| cell.into_iter()) - .collect(); + recovered_cells + .into_iter() + .take(num_cells_original_blob) + .flat_map(|cell| cell.into_iter()) + .collect() + } else { + cells + .into_iter() + .take(num_cells_original_blob) + .flat_map(|cell| (*cell).into_iter()) + .collect() + }; let blob = Blob::::new(blob_bytes).map_err(|e| format!("{e:?}"))?; let kzg_proof = KzgProof::empty(); diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index d984d5fedc..02c042bf28 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -412,7 +412,7 @@ where let blobs = if block.message().body().has_blobs() { debug!("Downloading finalized blobs"); if let Some(response) = remote - .get_blobs::(BlockId::Root(block_root), None, &spec) + .get_blob_sidecars::(BlockId::Root(block_root), None, &spec) .await 
.map_err(|e| format!("Error fetching finalized blobs from remote: {e:?}"))? { diff --git a/beacon_node/http_api/src/block_id.rs b/beacon_node/http_api/src/block_id.rs index e527e466f6..778067c32b 100644 --- a/beacon_node/http_api/src/block_id.rs +++ b/beacon_node/http_api/src/block_id.rs @@ -2,15 +2,16 @@ use crate::version::inconsistent_fork_rejection; use crate::{ExecutionOptimistic, state_id::checkpoint_slot_and_execution_optimistic}; use beacon_chain::kzg_utils::reconstruct_blobs; use beacon_chain::{BeaconChain, BeaconChainError, BeaconChainTypes, WhenSlotSkipped}; -use eth2::types::BlobIndicesQuery; use eth2::types::BlockId as CoreBlockId; use eth2::types::DataColumnIndicesQuery; +use eth2::types::{BlobIndicesQuery, BlobWrapper, BlobsVersionedHashesQuery}; use std::fmt; use std::str::FromStr; use std::sync::Arc; use types::{ BlobSidecarList, DataColumnSidecarList, EthSpec, FixedBytesExtended, ForkName, Hash256, - SignedBeaconBlock, SignedBlindedBeaconBlock, Slot, + SignedBeaconBlock, SignedBlindedBeaconBlock, Slot, UnversionedResponse, + beacon_response::ExecutionOptimisticFinalizedMetadata, }; use warp::Rejection; @@ -352,6 +353,68 @@ impl BlockId { Ok((block, blob_sidecar_list, execution_optimistic, finalized)) } + #[allow(clippy::type_complexity)] + pub fn get_blobs_by_versioned_hashes( + &self, + query: BlobsVersionedHashesQuery, + chain: &BeaconChain, + ) -> Result< + UnversionedResponse>, ExecutionOptimisticFinalizedMetadata>, + warp::Rejection, + > { + let (root, execution_optimistic, finalized) = self.root(chain)?; + let block = BlockId::blinded_block_by_root(&root, chain)?.ok_or_else(|| { + warp_utils::reject::custom_not_found(format!("beacon block with root {}", root)) + })?; + + // Error if the block is pre-Deneb and lacks blobs. 
+ let blob_kzg_commitments = block.message().body().blob_kzg_commitments().map_err(|_| { + warp_utils::reject::custom_bad_request( + "block is pre-Deneb and has no blobs".to_string(), + ) + })?; + + let blob_indices_opt = query.versioned_hashes.map(|versioned_hashes| { + versioned_hashes + .iter() + .flat_map(|versioned_hash| { + blob_kzg_commitments.iter().position(|commitment| { + let computed_hash = commitment.calculate_versioned_hash(); + computed_hash == *versioned_hash + }) + }) + .map(|index| index as u64) + .collect::>() + }); + + let max_blobs_per_block = chain.spec.max_blobs_per_block(block.epoch()) as usize; + let blob_sidecar_list = if !blob_kzg_commitments.is_empty() { + if chain.spec.is_peer_das_enabled_for_epoch(block.epoch()) { + Self::get_blobs_from_data_columns(chain, root, blob_indices_opt, &block)? + } else { + Self::get_blobs(chain, root, blob_indices_opt, max_blobs_per_block)? + } + } else { + BlobSidecarList::new(vec![], max_blobs_per_block) + .map_err(|e| warp_utils::reject::custom_server_error(format!("{:?}", e)))? 
+ }; + + let blobs = blob_sidecar_list + .into_iter() + .map(|sidecar| BlobWrapper:: { + blob: sidecar.blob.clone(), + }) + .collect(); + + Ok(UnversionedResponse { + metadata: ExecutionOptimisticFinalizedMetadata { + execution_optimistic: Some(execution_optimistic), + finalized: Some(finalized), + }, + data: blobs, + }) + } + fn get_blobs( chain: &BeaconChain, root: Hash256, @@ -369,9 +432,9 @@ impl BlockId { let blob_sidecar_list_filtered = match indices { Some(vec) => { - let list: Vec<_> = blob_sidecar_list + let list: Vec<_> = vec .into_iter() - .filter(|blob_sidecar| vec.contains(&blob_sidecar.index)) + .flat_map(|index| blob_sidecar_list.get(index as usize).cloned()) .collect(); BlobSidecarList::new(list, max_blobs_per_block) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 1b18ed50a3..7f6c97a0f8 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -214,6 +214,7 @@ pub fn prometheus_metrics() -> warp::filters::log::Log( */ // GET beacon/blob_sidecars/{block_id} - let get_blobs = eth_v1 + let get_blob_sidecars = eth_v1 .and(warp::path("beacon")) .and(warp::path("blob_sidecars")) .and(block_id_or_err) @@ -1947,6 +1948,52 @@ pub fn serve( }, ); + // GET beacon/blobs/{block_id} + let get_blobs = eth_v1 + .and(warp::path("beacon")) + .and(warp::path("blobs")) + .and(block_id_or_err) + .and(warp::path::end()) + .and(multi_key_query::()) + .and(task_spawner_filter.clone()) + .and(chain_filter.clone()) + .and(warp::header::optional::("accept")) + .then( + |block_id: BlockId, + version_hashes_res: Result, + task_spawner: TaskSpawner, + chain: Arc>, + accept_header: Option| { + task_spawner.blocking_response_task(Priority::P1, move || { + let versioned_hashes = version_hashes_res?; + let response = + block_id.get_blobs_by_versioned_hashes(versioned_hashes, &chain)?; + + match accept_header { + Some(api_types::Accept::Ssz) => Response::builder() + .status(200) + 
.body(response.data.as_ssz_bytes().into()) + .map(|res: Response| add_ssz_content_type_header(res)) + .map_err(|e| { + warp_utils::reject::custom_server_error(format!( + "failed to create response: {}", + e + )) + }), + _ => { + let res = execution_optimistic_finalized_beacon_response( + ResponseIncludesVersion::No, + response.metadata.execution_optimistic.unwrap_or(false), + response.metadata.finalized.unwrap_or(false), + response.data, + )?; + Ok(warp::reply::json(&res).into_response()) + } + } + }) + }, + ); + /* * beacon/pool */ @@ -4794,6 +4841,7 @@ pub fn serve( .uor(get_beacon_block_attestations) .uor(get_beacon_blinded_block) .uor(get_beacon_block_root) + .uor(get_blob_sidecars) .uor(get_blobs) .uor(get_beacon_pool_attestations) .uor(get_beacon_pool_attester_slashings) diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 2072fb9932..9c18a7c1e8 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -90,6 +90,7 @@ struct ApiTester { struct ApiTesterConfig { spec: ChainSpec, retain_historic_states: bool, + import_all_data_columns: bool, } impl Default for ApiTesterConfig { @@ -99,6 +100,7 @@ impl Default for ApiTesterConfig { Self { spec, retain_historic_states: false, + import_all_data_columns: false, } } } @@ -137,6 +139,7 @@ impl ApiTester { .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() + .import_all_data_columns(config.import_all_data_columns) .build(); harness @@ -441,10 +444,7 @@ impl ApiTester { } pub async fn new_mev_tester_default_payload_value() -> Self { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); let tester = Self::new_from_config(config) @@ -1858,7 +1858,7 @@ impl ApiTester { }; let result = match self 
.client - .get_blobs::( + .get_blob_sidecars::( CoreBlockId::Root(block_root), blob_indices.as_deref(), &self.chain.spec, @@ -1879,6 +1879,77 @@ impl ApiTester { self } + pub async fn test_get_blobs(self, versioned_hashes: bool) -> Self { + let block_id = BlockId(CoreBlockId::Finalized); + let (block_root, _, _) = block_id.root(&self.chain).unwrap(); + let (block, _, _) = block_id.full_block(&self.chain).await.unwrap(); + let num_blobs = block.num_expected_blobs(); + + let versioned_hashes: Option> = if versioned_hashes { + Some( + block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .iter() + .map(|commitment| commitment.calculate_versioned_hash()) + .collect(), + ) + } else { + None + }; + + let result = match self + .client + .get_blobs::(CoreBlockId::Root(block_root), versioned_hashes.as_deref()) + .await + { + Ok(response) => response.unwrap().into_data(), + Err(e) => panic!("query failed incorrectly: {e:?}"), + }; + + assert_eq!( + result.len(), + versioned_hashes.map_or(num_blobs, |versioned_hashes| versioned_hashes.len()) + ); + + self + } + + pub async fn test_get_blobs_post_fulu_full_node(self, versioned_hashes: bool) -> Self { + let block_id = BlockId(CoreBlockId::Finalized); + let (block_root, _, _) = block_id.root(&self.chain).unwrap(); + let (block, _, _) = block_id.full_block(&self.chain).await.unwrap(); + + let versioned_hashes: Option> = if versioned_hashes { + Some( + block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .iter() + .map(|commitment| commitment.calculate_versioned_hash()) + .collect(), + ) + } else { + None + }; + + match self + .client + .get_blobs::(CoreBlockId::Root(block_root), versioned_hashes.as_deref()) + .await + { + Ok(result) => panic!("Full node are unable to return blobs post-Fulu: {result:?}"), + // Post-Fulu, full nodes don't store blobs and return error 500 + Err(e) => assert_eq!(e.status().unwrap(), 500), + }; + + self + } + /// Test fetching of blob sidecars that are not available in 
the database due to pruning. /// /// If `zero_blobs` is false, test a block with >0 blobs, which should be unavailable. @@ -1918,7 +1989,7 @@ impl ApiTester { match self .client - .get_blobs::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) + .get_blob_sidecars::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) .await { Ok(result) => { @@ -1956,7 +2027,7 @@ impl ApiTester { match self .client - .get_blobs::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) + .get_blob_sidecars::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) .await { Ok(result) => panic!("queries for pre-Deneb slots should fail. got: {result:?}"), @@ -7704,10 +7775,7 @@ async fn builder_payload_chosen_by_profit_v3() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn builder_works_post_capella() { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7724,10 +7792,7 @@ async fn builder_works_post_capella() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn builder_works_post_deneb() { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7745,10 +7810,7 @@ async fn builder_works_post_deneb() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn get_blob_sidecars() { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = 
Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7761,6 +7823,53 @@ async fn get_blob_sidecars() { .test_get_blob_sidecars(false) .await .test_get_blob_sidecars(true) + .await + .test_get_blobs(false) + .await + .test_get_blobs(true) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_blobs_post_fulu_supernode() { + let mut config = ApiTesterConfig { + retain_historic_states: false, + spec: E::default_spec(), + // For supernode, we import all data columns + import_all_data_columns: true, + }; + config.spec.altair_fork_epoch = Some(Epoch::new(0)); + config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); + config.spec.capella_fork_epoch = Some(Epoch::new(0)); + config.spec.deneb_fork_epoch = Some(Epoch::new(0)); + config.spec.electra_fork_epoch = Some(Epoch::new(0)); + config.spec.fulu_fork_epoch = Some(Epoch::new(0)); + + ApiTester::new_from_config(config) + .await + // We can call the same get_blobs function in this test + // because the function will call get_blobs_by_versioned_hashes which handles peerDAS post-Fulu + .test_get_blobs(false) + .await + .test_get_blobs(true) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_blobs_post_fulu_full_node() { + let mut config = ApiTesterConfig::default(); + config.spec.altair_fork_epoch = Some(Epoch::new(0)); + config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); + config.spec.capella_fork_epoch = Some(Epoch::new(0)); + config.spec.deneb_fork_epoch = Some(Epoch::new(0)); + config.spec.electra_fork_epoch = Some(Epoch::new(0)); + config.spec.fulu_fork_epoch = Some(Epoch::new(0)); + + ApiTester::new_from_config(config) + .await + .test_get_blobs_post_fulu_full_node(false) + .await + .test_get_blobs_post_fulu_full_node(true) .await; } diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index 3368569d59..0423794d0d 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -1336,7 +1336,7 @@ impl 
BeaconNodeHttpClient { } /// Path for `v1/beacon/blob_sidecars/{block_id}` - pub fn get_blobs_path(&self, block_id: BlockId) -> Result { + pub fn get_blob_sidecars_path(&self, block_id: BlockId) -> Result { let mut path = self.eth_path(V1)?; path.path_segments_mut() .map_err(|()| Error::InvalidUrl(self.server.clone()))? @@ -1346,6 +1346,17 @@ impl BeaconNodeHttpClient { Ok(path) } + /// Path for `v1/beacon/blobs/{blob_id}` + pub fn get_blobs_path(&self, block_id: BlockId) -> Result { + let mut path = self.eth_path(V1)?; + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("beacon") + .push("blobs") + .push(&block_id.to_string()); + Ok(path) + } + /// Path for `v1/beacon/blinded_blocks/{block_id}` pub fn get_beacon_blinded_blocks_path(&self, block_id: BlockId) -> Result { let mut path = self.eth_path(V1)?; @@ -1374,13 +1385,13 @@ impl BeaconNodeHttpClient { /// `GET v1/beacon/blob_sidecars/{block_id}` /// /// Returns `Ok(None)` on a 404 error. - pub async fn get_blobs( + pub async fn get_blob_sidecars( &self, block_id: BlockId, indices: Option<&[u64]>, spec: &ChainSpec, ) -> Result>>, Error> { - let mut path = self.get_blobs_path(block_id)?; + let mut path = self.get_blob_sidecars_path(block_id)?; if let Some(indices) = indices { let indices_string = indices .iter() @@ -1400,6 +1411,31 @@ impl BeaconNodeHttpClient { .map(|opt| opt.map(BeaconResponse::ForkVersioned)) } + /// `GET v1/beacon/blobs/{block_id}` + /// + /// Returns `Ok(None)` on a 404 error. 
+ pub async fn get_blobs( + &self, + block_id: BlockId, + versioned_hashes: Option<&[Hash256]>, + ) -> Result>>>, Error> + { + let mut path = self.get_blobs_path(block_id)?; + if let Some(hashes) = versioned_hashes { + let hashes_string = hashes + .iter() + .map(|hash| hash.to_string()) + .collect::>() + .join(","); + path.query_pairs_mut() + .append_pair("versioned_hashes", &hashes_string); + } + + self.get_opt(path) + .await + .map(|opt| opt.map(BeaconResponse::Unversioned)) + } + /// `GET v1/beacon/blinded_blocks/{block_id}` /// /// Returns `Ok(None)` on a 404 error. diff --git a/common/eth2/src/types.rs b/common/eth2/src/types.rs index b72ab29380..8f553b57d9 100644 --- a/common/eth2/src/types.rs +++ b/common/eth2/src/types.rs @@ -716,6 +716,13 @@ pub struct BlobIndicesQuery { pub indices: Option>, } +#[derive(Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct BlobsVersionedHashesQuery { + #[serde(default, deserialize_with = "option_query_vec")] + pub versioned_hashes: Option>, +} + #[derive(Clone, Deserialize)] #[serde(deny_unknown_fields)] pub struct DataColumnIndicesQuery { @@ -2317,6 +2324,14 @@ pub struct StandardAttestationRewards { pub total_rewards: Vec, } +#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)] +#[serde(bound = "E: EthSpec")] +#[serde(transparent)] +pub struct BlobWrapper { + #[serde(with = "ssz_types::serde_utils::hex_fixed_vec")] + pub blob: Blob, +} + #[cfg(test)] mod test { use std::fmt::Debug; diff --git a/consensus/types/src/beacon_response.rs b/consensus/types/src/beacon_response.rs index 2e45854364..fc59fc9432 100644 --- a/consensus/types/src/beacon_response.rs +++ b/consensus/types/src/beacon_response.rs @@ -25,6 +25,7 @@ pub struct ForkVersionedResponse { /// `Deserialize`. 
#[derive(Debug, PartialEq, Clone, Serialize)] pub struct UnversionedResponse { + #[serde(flatten)] pub metadata: M, pub data: T, } @@ -195,9 +196,10 @@ impl From> for BeaconResponse { #[cfg(test)] mod fork_version_response_tests { + use crate::beacon_response::ExecutionOptimisticFinalizedMetadata; use crate::{ ExecutionPayload, ExecutionPayloadBellatrix, ForkName, ForkVersionedResponse, - MainnetEthSpec, + MainnetEthSpec, UnversionedResponse, }; use serde_json::json; @@ -236,4 +238,24 @@ mod fork_version_response_tests { assert!(result.is_err()); } + + // The following test should only pass by having the attribute #[serde(flatten)] on the metadata + #[test] + fn unversioned_response_serialize_dezerialize_round_trip_test() { + // Create an UnversionedResponse with some data + let data = UnversionedResponse { + metadata: ExecutionOptimisticFinalizedMetadata { + execution_optimistic: Some(false), + finalized: Some(false), + }, + data: "some_test_data".to_string(), + }; + + let serialized = serde_json::to_string(&data); + + let deserialized = + serde_json::from_str(&serialized.unwrap()).expect("Failed to deserialize"); + + assert_eq!(data, deserialized); + } } diff --git a/lcli/src/http_sync.rs b/lcli/src/http_sync.rs index 2e36eadf23..6f7dcdb595 100644 --- a/lcli/src/http_sync.rs +++ b/lcli/src/http_sync.rs @@ -124,7 +124,7 @@ async fn get_block_from_source( .unwrap() .unwrap(); let blobs_from_source = source - .get_blobs::(block_id, None, spec) + .get_blob_sidecars::(block_id, None, spec) .await .unwrap() .unwrap() diff --git a/testing/simulator/src/checks.rs b/testing/simulator/src/checks.rs index 1368c495cd..1240785121 100644 --- a/testing/simulator/src/checks.rs +++ b/testing/simulator/src/checks.rs @@ -424,7 +424,7 @@ pub async fn verify_full_blob_production_up_to( // the `verify_full_block_production_up_to` function. 
if block.is_some() { remote_node - .get_blobs::(BlockId::Slot(Slot::new(slot)), None, &E::default_spec()) + .get_blobs::(BlockId::Slot(Slot::new(slot)), None) .await .map_err(|e| format!("Failed to get blobs at slot {slot:?}: {e:?}"))? .ok_or_else(|| format!("No blobs available at slot {slot:?}"))?; From 538b70495ccc2cbdcf38b7d73ea1989ba94f1784 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 9 Oct 2025 18:32:43 +1100 Subject: [PATCH 74/81] Reject data columns that does not descend from finalize root instead of ignoring it (#8179) This issue was identified during the fusaka audit competition. The [`verify_parent_block_and_finalized_descendant`](https://github.com/sigp/lighthouse/blob/62d9302e0f9dd9f94d0325411a3029b36ad90685/beacon_node/beacon_chain/src/data_column_verification.rs#L606-L627) in data column gossip verification currently load the parent first before checking if the column descends from the finalized root. However, the `fork_choice.get_block(&block_parent_root)` function also make the same check internally: https://github.com/sigp/lighthouse/blob/8a4f6cf0d5b6b261b2c3439ce7c05383a53d30c5/consensus/fork_choice/src/fork_choice.rs#L1242-L1249 Therefore, if the column does not descend from the finalized root, we return an `UnknownParent` error, before hitting the `is_finalized_checkpoint_or_descendant` check just below. Which means we `IGNORE` the gossip message instead `REJECT`, and the gossip peer is not _immediately_ penalised. This deviates from the spec. However, worth noting that lighthouse will currently attempt to request the parent from this peer, and if the peer is not able to serve the parent, it gets penalised with a `LowToleranceError`, and will get banned after ~5 occurences. https://github.com/sigp/lighthouse/blob/ffa7b2b2b9e3b4e70678e2c749b8bc45234febd7/beacon_node/network/src/sync/network_context.rs#L1530-L1532 This PR will penalise the bad peer immediately instead of performing block lookups before penalising it. 
Co-Authored-By: Jimmy Chen --- .../beacon_chain/src/data_column_verification.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 600b107c1d..fad7771f01 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -608,22 +608,21 @@ fn verify_parent_block_and_finalized_descendant( chain: &BeaconChain, ) -> Result { let fork_choice = chain.canonical_head.fork_choice_read_lock(); + let block_parent_root = data_column.block_parent_root(); + + // Do not process a column that does not descend from the finalized root. + if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { + return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); + } // We have already verified that the column is past finalization, so we can // just check fork choice for the block's parent. - let block_parent_root = data_column.block_parent_root(); let Some(parent_block) = fork_choice.get_block(&block_parent_root) else { return Err(GossipDataColumnError::ParentUnknown { parent_root: block_parent_root, }); }; - // Do not process a column that does not descend from the finalized root. - // We just loaded the parent_block, so we can be sure that it exists in fork choice. - if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { - return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); - } - Ok(parent_block) } From 0c9fdea28db07eb2395d168c1b8369d785856adc Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Fri, 10 Oct 2025 00:53:51 +1100 Subject: [PATCH 75/81] Update `ForkName::latest_stable` to Fulu for tests (#8181) Update `ForkName::latest_stable` to Fulu, reflecting our plan to stabilise Fulu in the immediate future! This will lead to some more tests running with Fulu rather than Electra. 
Co-Authored-By: Michael Sproul --- consensus/types/src/fork_name.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/types/src/fork_name.rs b/consensus/types/src/fork_name.rs index 363d9e77a2..338e2b1e75 100644 --- a/consensus/types/src/fork_name.rs +++ b/consensus/types/src/fork_name.rs @@ -51,7 +51,7 @@ impl ForkName { /// This fork serves as the baseline for many tests, and the goal /// is to ensure features are passing on this fork. pub fn latest_stable() -> ForkName { - ForkName::Electra + ForkName::Fulu } /// Set the activation slots in the given `ChainSpec` so that the fork named by `self` From 178df7a7d60dd1492da66a0b5f123ef5bb37d1cf Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 13 Oct 2025 12:12:46 +1100 Subject: [PATCH 76/81] Fix duplicate fields being logged when the field exists in both the span and the event (#8183) Closes #7995. Fix duplicate fields being logged when the field exists in both the span and the event. Prefer event fields when this happens. 
``` Sep 15 08:13:46.339 WARN State cache missed state_root: 0xc34826ff7794de63a553832b7aff13572d1c716b9e03d5ef7b29649adf98abe2, block_root: 0xf16d3f5b4cc6ec876b7faeccd9f2d4102dc56ed32e828754b62601637910ec1f, state_root: 0xc34826ff7794de63a553832b7aff13572d1c716b9e03d5ef7b29649adf98abe2, block_root: 0xf16d3f5b4cc6ec876b7faeccd9f2d4102dc56ed32e828754b62601637910ec1f ``` becomes ``` Sep 15 08:13:46.339 WARN State cache missed state_root: 0xc34826ff7794de63a553832b7aff13572d1c716b9e03d5ef7b29649adf98abe2, block_root: 0xf16d3f5b4cc6ec876b7faeccd9f2d4102dc56ed32e828754b62601637910ec1f ``` Co-Authored-By: Jimmy Chen --- common/logging/src/tracing_logging_layer.rs | 114 +++++++++++--------- 1 file changed, 63 insertions(+), 51 deletions(-) diff --git a/common/logging/src/tracing_logging_layer.rs b/common/logging/src/tracing_logging_layer.rs index 923ac1758f..e631d272b7 100644 --- a/common/logging/src/tracing_logging_layer.rs +++ b/common/logging/src/tracing_logging_layer.rs @@ -1,4 +1,5 @@ use crate::utils::is_ascii_control; +use std::collections::HashSet; use chrono::prelude::*; use serde_json::{Map, Value}; @@ -261,6 +262,12 @@ fn build_log_json( let module_field = format!("{}:{}", module_path, line_number); log_map.insert("module".to_string(), Value::String(module_field)); + // Avoid adding duplicate fields; prefer event fields when duplicates exist. 
+ for (key, val) in span_fields { + let parsed_span_val = parse_field(val); + log_map.insert(key.clone(), parsed_span_val); + } + for (key, val) in visitor.fields.clone().into_iter() { let cleaned_value = if val.starts_with('\"') && val.ends_with('\"') && val.len() >= 2 { &val[1..val.len() - 1] @@ -272,11 +279,6 @@ fn build_log_json( log_map.insert(key, parsed_val); } - for (key, val) in span_fields { - let parsed_span_val = parse_field(val); - log_map.insert(key.clone(), parsed_span_val); - } - let json_obj = Value::Object(log_map); let output = format!("{}\n", json_obj); @@ -299,23 +301,6 @@ fn build_log_text( let bold_start = "\x1b[1m"; let bold_end = "\x1b[0m"; - let mut formatted_spans = String::new(); - for (i, (field_name, field_value)) in span_fields.iter().rev().enumerate() { - if use_color { - formatted_spans.push_str(&format!( - "{}{}{}: {}", - bold_start, field_name, bold_end, field_value - )); - } else { - formatted_spans.push_str(&format!("{}: {}", field_name, field_value)); - } - - // Check if this is not the last span. - if i != span_fields.len() - 1 { - formatted_spans.push_str(", "); - } - } - let pad = if plain_level_str.len() < ALIGNED_LEVEL_WIDTH { " " } else { @@ -351,24 +336,26 @@ fn build_log_text( message_content.clone() }; - let mut formatted_fields = String::new(); - for (i, (field_name, field_value)) in visitor.fields.iter().enumerate() { - if i > 0 { - formatted_fields.push_str(", "); - } - if use_color { - formatted_fields.push_str(&format!( - "{}{}{}: {}", - bold_start, field_name, bold_end, field_value - )); - } else { - formatted_fields.push_str(&format!("{}: {}", field_name, field_value)); - } - // Check if this is the last field and that we are also adding spans. - if i == visitor.fields.len() - 1 && !span_fields.is_empty() { - formatted_fields.push(','); - } - } + // Avoid adding duplicate fields; prefer event fields when duplicates exist. 
+ let mut added_field_names = HashSet::new(); + let formatted_fields = visitor + .fields + .iter() + .chain(span_fields.iter()) + .filter_map(|(field_name, field_value)| { + if added_field_names.insert(field_name) { + let formatted_field = if use_color { + format!("{}{}{}: {}", bold_start, field_name, bold_end, field_value) + } else { + format!("{}: {}", field_name, field_value) + }; + Some(formatted_field) + } else { + None + } + }) + .collect::>() + .join(", "); let full_message = if !formatted_fields.is_empty() { format!("{} {}", padded_message, formatted_fields) @@ -378,14 +365,11 @@ fn build_log_text( let message = if !location.is_empty() { format!( - "{} {} {} {} {}\n", - timestamp, level_str, location, full_message, formatted_spans + "{} {} {} {}\n", + timestamp, level_str, location, full_message ) } else { - format!( - "{} {} {} {}\n", - timestamp, level_str, full_message, formatted_spans - ) + format!("{} {} {}\n", timestamp, level_str, full_message) }; if let Err(e) = writer.write_all(message.as_bytes()) { @@ -436,7 +420,7 @@ mod tests { fn test_build_log_text_single_log_field() { let log_fields = vec![("field_name".to_string(), "field_value".to_string())]; let span_fields = vec![]; - let expected = "Jan 1 08:00:00.000 INFO test message field_name: field_value \n"; + let expected = "Jan 1 08:00:00.000 INFO test message field_name: field_value\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -447,7 +431,7 @@ mod tests { ("field_name2".to_string(), "field_value2".to_string()), ]; let span_fields = vec![]; - let expected = "Jan 1 08:00:00.000 INFO test message field_name1: field_value1, field_name2: field_value2 \n"; + let expected = "Jan 1 08:00:00.000 INFO test message field_name1: field_value1, field_name2: field_value2\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -469,7 +453,7 @@ mod tests { "span_field_name".to_string(), "span_field_value".to_string(), )]; - let expected = "Jan 1 08:00:00.000 INFO test message 
span_field_name: span_field_value\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name: span_field_value\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -486,7 +470,7 @@ mod tests { "span_field_value2".to_string(), ), ]; - let expected = "Jan 1 08:00:00.000 INFO test message span_field_name2: span_field_value2, span_field_name1: span_field_value1\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1: span_field_value1, span_field_name2: span_field_value2\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -503,7 +487,35 @@ mod tests { "span_field_value1-2".to_string(), ), ]; - let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1-2: span_field_value1-2, span_field_name1-1: span_field_value1-1\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1-1: span_field_value1-1, span_field_name1-2: span_field_value1-2\n"; + test_build_log_text(log_fields, span_fields, expected); + } + + #[test] + fn test_build_log_text_no_duplicate_log_span_fields() { + let log_fields = vec![ + ("field_name_1".to_string(), "field_value_1".to_string()), + ("field_name_2".to_string(), "field_value_2".to_string()), + ]; + let span_fields = vec![ + ("field_name_1".to_string(), "field_value_1".to_string()), + ("field_name_3".to_string(), "field_value_3".to_string()), + ]; + let expected = "Jan 1 08:00:00.000 INFO test message field_name_1: field_value_1, field_name_2: field_value_2, field_name_3: field_value_3\n"; + test_build_log_text(log_fields, span_fields, expected); + } + + #[test] + fn test_build_log_text_duplicate_fields_prefer_log_fields() { + let log_fields = vec![ + ("field_name_1".to_string(), "field_value_1_log".to_string()), + ("field_name_2".to_string(), "field_value_2".to_string()), + ]; + let span_fields = vec![ + ("field_name_1".to_string(), "field_value_1_span".to_string()), + ("field_name_3".to_string(), "field_value_3".to_string()), + ]; + let expected = "Jan 1 
08:00:00.000 INFO test message field_name_1: field_value_1_log, field_name_2: field_value_2, field_name_3: field_value_3\n"; test_build_log_text(log_fields, span_fields, expected); } From 2c328e32a6cd139c0c6ee44b99a5e0ab8e7ebe59 Mon Sep 17 00:00:00 2001 From: Pawan Dhananjay Date: Mon, 13 Oct 2025 02:32:13 -0700 Subject: [PATCH 77/81] Persist only custody columns in db (#8188) * Only persist custody columns * Get claude to write tests * lint * Address review comments and fix tests. * Use supernode only when building chain segments * Clean up * Rewrite tests. * Fix tests * Clippy --------- Co-authored-by: Jimmy Chen Co-authored-by: Michael Sproul --- beacon_node/beacon_chain/src/beacon_chain.rs | 13 ++- .../beacon_chain/src/historical_blocks.rs | 2 +- .../beacon_chain/tests/block_verification.rs | 25 ++++-- beacon_node/beacon_chain/tests/store_tests.rs | 90 +++++++++++++++++++ 4 files changed, 118 insertions(+), 12 deletions(-) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index f085684442..85ccb96f69 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -3957,7 +3957,7 @@ impl BeaconChain { // See https://github.com/sigp/lighthouse/issues/2028 let (_, signed_block, block_data) = signed_block.deconstruct(); - match self.get_blobs_or_columns_store_op(block_root, block_data) { + match self.get_blobs_or_columns_store_op(block_root, signed_block.slot(), block_data) { Ok(Some(blobs_or_columns_store_op)) => { ops.push(blobs_or_columns_store_op); } @@ -7163,6 +7163,7 @@ impl BeaconChain { pub(crate) fn get_blobs_or_columns_store_op( &self, block_root: Hash256, + block_slot: Slot, block_data: AvailableBlockData, ) -> Result>, String> { match block_data { @@ -7175,7 +7176,15 @@ impl BeaconChain { ); Ok(Some(StoreOp::PutBlobs(block_root, blobs))) } - AvailableBlockData::DataColumns(data_columns) => { + AvailableBlockData::DataColumns(mut data_columns) => { + 
let columns_to_custody = self.custody_columns_for_epoch(Some( + block_slot.epoch(T::EthSpec::slots_per_epoch()), + )); + // Supernodes need to persist all sampled custody columns + if columns_to_custody.len() != self.spec.number_of_custody_groups as usize { + data_columns + .retain(|data_column| columns_to_custody.contains(&data_column.index)); + } debug!( %block_root, count = data_columns.len(), diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 8b9fb5e354..15e0a55cf5 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -140,7 +140,7 @@ impl BeaconChain { // Store the blobs or data columns too if let Some(op) = self - .get_blobs_or_columns_store_op(block_root, block_data) + .get_blobs_or_columns_store_op(block_root, block.slot(), block_data) .map_err(|e| { HistoricalBlockError::StoreError(StoreError::DBError { message: format!("get_blobs_or_columns_store_op error {e:?}"), diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index b27295751e..47f5be02cb 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -42,7 +42,10 @@ enum DataSidecars { } async fn get_chain_segment() -> (Vec>, Vec>>) { - let harness = get_harness(VALIDATOR_COUNT); + // The assumption that you can re-import a block based on what you have in your DB + // is no longer true, as fullnodes stores less than what they sample. + // We use a supernode here to build a chain segment. 
+ let harness = get_harness(VALIDATOR_COUNT, true); harness .extend_chain( @@ -101,7 +104,10 @@ async fn get_chain_segment() -> (Vec>, Vec BeaconChainHarness> { +fn get_harness( + validator_count: usize, + supernode: bool, +) -> BeaconChainHarness> { let harness = BeaconChainHarness::builder(MainnetEthSpec) .default_spec() .chain_config(ChainConfig { @@ -109,6 +115,7 @@ fn get_harness(validator_count: usize) -> BeaconChainHarness( #[tokio::test] async fn chain_segment_full_segment() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -290,7 +297,7 @@ async fn chain_segment_full_segment() { #[tokio::test] async fn chain_segment_varying_chunk_size() { for chunk_size in &[1, 2, 3, 5, 31, 32, 33, 42] { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -322,7 +329,7 @@ async fn chain_segment_varying_chunk_size() { #[tokio::test] async fn chain_segment_non_linear_parent_roots() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness @@ -379,7 +386,7 @@ async fn chain_segment_non_linear_parent_roots() { #[tokio::test] async fn chain_segment_non_linear_slots() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness .chain @@ -521,7 +528,7 @@ async fn assert_invalid_signature( async fn get_invalid_sigs_harness( chain_segment: &[BeaconSnapshot], ) -> BeaconChainHarness> { - let harness = 
get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); harness .chain .slot_clock @@ -979,7 +986,7 @@ fn unwrap_err(result: Result) -> U { #[tokio::test] async fn block_gossip_verification() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let block_index = CHAIN_SEGMENT_LENGTH - 2; @@ -1382,7 +1389,7 @@ async fn verify_block_for_gossip_slashing_detection() { #[tokio::test] async fn verify_block_for_gossip_doppelganger_detection() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let state = harness.get_current_state(); let ((block, _), _) = harness.make_block(state.clone(), Slot::new(1)).await; diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index cd4032f55d..449b5dd043 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -2735,6 +2735,14 @@ async fn weak_subjectivity_sync_test( .rng(Box::new(StdRng::seed_from_u64(42))) .build() .expect("should build"); + beacon_chain + .data_availability_checker + .custody_context() + .init_ordered_data_columns_from_custody_groups( + (0..spec.number_of_custody_groups).collect(), + &spec, + ) + .unwrap(); let beacon_chain = Arc::new(beacon_chain); let wss_block_root = wss_block.canonical_root(); @@ -4137,6 +4145,88 @@ async fn replay_from_split_state() { assert_eq!(state.slot(), split.slot); } +/// Test that regular nodes filter and store only custody columns when processing blocks with data columns. 
+#[tokio::test] +async fn test_custody_column_filtering_regular_node() { + // Skip test if PeerDAS is not scheduled + if !test_spec::().is_peer_das_scheduled() { + return; + } + + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // Generate a block with data columns + harness.execution_block_generator().set_min_blob_count(1); + let current_slot = harness.get_current_slot(); + let block_root = harness + .extend_chain( + 1, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Get custody columns for this epoch - regular nodes only store a subset + let expected_custody_columns: HashSet<_> = harness + .chain + .custody_columns_for_epoch(Some(current_slot.epoch(E::slots_per_epoch()))) + .iter() + .copied() + .collect(); + + // Check what actually got stored in the database + let stored_column_indices: HashSet<_> = store + .get_data_column_keys(block_root) + .expect("should get stored column keys") + .into_iter() + .collect(); + + assert_eq!( + stored_column_indices, expected_custody_columns, + "Regular node should only store custody columns" + ); +} + +/// Test that supernodes store all data columns when processing blocks with data columns. 
+#[tokio::test] +async fn test_custody_column_filtering_supernode() { + // Skip test if PeerDAS is not scheduled + if !test_spec::().is_peer_das_scheduled() { + return; + } + + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + // Generate a block with data columns + harness.execution_block_generator().set_min_blob_count(1); + let block_root = harness + .extend_chain( + 1, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Supernodes are expected to store all data columns + let expected_custody_columns: HashSet<_> = (0..E::number_of_columns() as u64).collect(); + + // Check what actually got stored in the database + let stored_column_indices: HashSet<_> = store + .get_data_column_keys(block_root) + .expect("should get stored column keys") + .into_iter() + .collect(); + + assert_eq!( + stored_column_indices, expected_custody_columns, + "Supernode should store all custody columns" + ); +} + /// Checks that two chains are the same, for the purpose of these tests. /// /// Several fields that are hard/impossible to check are ignored (e.g., the store). 
From 1fb94ce432f08b110e531e1ce2e44b13b1392e5f Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Mon, 13 Oct 2025 20:32:43 +1100 Subject: [PATCH 78/81] Release v8.0.0-rc.1 (#8185) --- Cargo.lock | 8 ++++---- beacon_node/Cargo.toml | 2 +- boot_node/Cargo.toml | 2 +- common/lighthouse_version/src/lib.rs | 6 +++--- lcli/Cargo.toml | 2 +- lighthouse/Cargo.toml | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 481d204865..13a4412796 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -918,7 +918,7 @@ dependencies = [ [[package]] name = "beacon_node" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" dependencies = [ "account_utils", "beacon_chain", @@ -1193,7 +1193,7 @@ dependencies = [ [[package]] name = "boot_node" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" dependencies = [ "beacon_node", "bytes", @@ -5050,7 +5050,7 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lcli" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" dependencies = [ "account_utils", "beacon_chain", @@ -5560,7 +5560,7 @@ dependencies = [ [[package]] name = "lighthouse" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" dependencies = [ "account_manager", "account_utils", diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index bb904a7619..8e2c598fd4 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "beacon_node" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" authors = [ "Paul Hauner ", "Age Manning "] edition = { workspace = true } diff --git a/common/lighthouse_version/src/lib.rs b/common/lighthouse_version/src/lib.rs index 574fdfea35..a3f0ca404f 100644 --- a/common/lighthouse_version/src/lib.rs +++ b/common/lighthouse_version/src/lib.rs @@ -17,8 +17,8 @@ pub const VERSION: &str = git_version!( // NOTE: using --match instead of --exclude for compatibility with old Git "--match=thiswillnevermatchlol" ], - prefix = "Lighthouse/v8.0.0-rc.0-", - fallback = 
"Lighthouse/v8.0.0-rc.0" + prefix = "Lighthouse/v8.0.0-rc.1-", + fallback = "Lighthouse/v8.0.0-rc.1" ); /// Returns the first eight characters of the latest commit hash for this build. @@ -54,7 +54,7 @@ pub fn version_with_platform() -> String { /// /// `1.5.1` pub fn version() -> &'static str { - "8.0.0-rc.0" + "8.0.0-rc.1" } /// Returns the name of the current client running. diff --git a/lcli/Cargo.toml b/lcli/Cargo.toml index 8f020e0387..e5ed7a8926 100644 --- a/lcli/Cargo.toml +++ b/lcli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "lcli" description = "Lighthouse CLI (modeled after zcli)" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" authors = ["Paul Hauner "] edition = { workspace = true } diff --git a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index 4139286b53..ef680c9b96 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lighthouse" -version = "8.0.0-rc.0" +version = "8.0.0-rc.1" authors = ["Sigma Prime "] edition = { workspace = true } autotests = false From 60df5f4ab609362711f4f518eb8f03df447bfedb Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Mon, 13 Oct 2025 20:18:50 -0700 Subject: [PATCH 79/81] Downgrade light client error logs (#8196) Temporary stop gap for #7002 Downgrade light client errors to debug We eventually should fix our light client objects so they can consist of data across forks. 
Co-Authored-By: Eitan Seri- Levi --- beacon_node/client/src/compute_light_client_updates.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beacon_node/client/src/compute_light_client_updates.rs b/beacon_node/client/src/compute_light_client_updates.rs index 44c3475bfe..0ef35588df 100644 --- a/beacon_node/client/src/compute_light_client_updates.rs +++ b/beacon_node/client/src/compute_light_client_updates.rs @@ -3,7 +3,7 @@ use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage; use beacon_processor::{BeaconProcessorSend, Work, WorkEvent}; use futures::StreamExt; use futures::channel::mpsc::Receiver; -use tracing::error; +use tracing::{debug, error}; // Each `LightClientProducerEvent` is ~200 bytes. With the light_client server producing only recent // updates it is okay to drop some events in case of overloading. In normal network conditions @@ -27,7 +27,7 @@ pub async fn compute_light_client_updates( chain .recompute_and_cache_light_client_updates(event) .unwrap_or_else(|e| { - error!("error computing light_client updates {:?}", e); + debug!("error computing light_client updates {:?}", e); }); let msg = ReprocessQueueMessage::NewLightClientOptimisticUpdate { parent_root }; From 5886a48d969207c67d36f5e085148d2d6cccb401 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 15 Oct 2025 12:52:35 +1100 Subject: [PATCH 80/81] Add `max_blobs_per_block` check to data column gossip validation (#8198) Addresses this spec change https://github.com/ethereum/consensus-specs/pull/4650 Add `max_blobs_per_block` to gossip data column check so we reject large columns before processing. 
(we currently do this check during processing) Co-Authored-By: Jimmy Chen --- .../src/data_column_verification.rs | 115 ++++++++++++++++-- beacon_node/beacon_chain/src/test_utils.rs | 2 +- .../gossip_methods.rs | 1 + 3 files changed, 107 insertions(+), 11 deletions(-) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index fad7771f01..01e79c49aa 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -161,6 +161,15 @@ pub enum GossipDataColumnError { /// /// The column sidecar is invalid and the peer is faulty InconsistentProofsLength { cells_len: usize, proofs_len: usize }, + /// The number of KZG commitments exceeds the maximum number of blobs allowed for the fork. The + /// sidecar is invalid. + /// + /// ## Peer scoring + /// The column sidecar is invalid and the peer is faulty + MaxBlobsPerBlockExceeded { + max_blobs_per_block: usize, + commitments_len: usize, + }, } impl From for GossipDataColumnError { @@ -220,7 +229,7 @@ impl GossipVerifiedDataColumn column_sidecar: Arc>, chain: &BeaconChain, ) -> Result { - verify_data_column_sidecar(&column_sidecar)?; + verify_data_column_sidecar(&column_sidecar, &chain.spec)?; // Check if the data column is already in the DA checker cache. This happens when data columns // are made available through the `engine_getBlobs` method. 
If it exists in the cache, we know @@ -475,7 +484,7 @@ pub fn validate_data_column_sidecar_for_gossip, ) -> Result, GossipDataColumnError> { let column_slot = data_column.slot(); - verify_data_column_sidecar(&data_column)?; + verify_data_column_sidecar(&data_column, &chain.spec)?; verify_index_matches_subnet(&data_column, subnet, &chain.spec)?; verify_sidecar_not_from_future_slot(chain, column_slot)?; verify_slot_greater_than_latest_finalized_slot(chain, column_slot)?; @@ -529,6 +538,7 @@ pub fn validate_data_column_sidecar_for_gossip( data_column: &DataColumnSidecar, + spec: &ChainSpec, ) -> Result<(), GossipDataColumnError> { if data_column.index >= E::number_of_columns() as u64 { return Err(GossipDataColumnError::InvalidColumnIndex(data_column.index)); @@ -540,6 +550,14 @@ fn verify_data_column_sidecar( let cells_len = data_column.column.len(); let commitments_len = data_column.kzg_commitments.len(); let proofs_len = data_column.kzg_proofs.len(); + let max_blobs_per_block = spec.max_blobs_per_block(data_column.epoch()) as usize; + + if commitments_len > max_blobs_per_block { + return Err(GossipDataColumnError::MaxBlobsPerBlockExceeded { + max_blobs_per_block, + commitments_len, + }); + } if cells_len != commitments_len { return Err(GossipDataColumnError::InconsistentCommitmentsLength { @@ -782,16 +800,22 @@ pub fn observe_gossip_data_column( #[cfg(test)] mod test { use crate::data_column_verification::{ - GossipDataColumnError, validate_data_column_sidecar_for_gossip, + GossipDataColumnError, GossipVerifiedDataColumn, validate_data_column_sidecar_for_gossip, }; use crate::observed_data_sidecars::Observe; - use crate::test_utils::BeaconChainHarness; + use crate::test_utils::{ + BeaconChainHarness, EphemeralHarnessType, generate_data_column_sidecars_from_block, + }; + use eth2::types::BlobsBundle; + use execution_layer::test_utils::generate_blobs; + use std::sync::Arc; use types::{DataColumnSidecar, DataColumnSubnetId, EthSpec, ForkName, MainnetEthSpec}; type E = 
MainnetEthSpec; #[tokio::test] - async fn empty_data_column_sidecars_fails_validation() { + async fn test_validate_data_column_sidecar_for_gossip() { + // Setting up harness is slow, we initialise once and use it for all gossip validation tests. let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); let harness = BeaconChainHarness::builder(E::default()) .spec(spec.into()) @@ -801,6 +825,44 @@ mod test { .build(); harness.advance_slot(); + let verify_fn = |column_sidecar: DataColumnSidecar| { + let col_index = column_sidecar.index; + validate_data_column_sidecar_for_gossip::<_, Observe>( + column_sidecar.into(), + DataColumnSubnetId::from_column_index(col_index, &harness.spec), + &harness.chain, + ) + }; + empty_data_column_sidecars_fails_validation(&harness, &verify_fn).await; + data_column_sidecar_commitments_exceed_max_blobs_per_block(&harness, &verify_fn).await; + } + + #[tokio::test] + async fn test_new_for_block_publishing() { + // Setting up harness is slow, we initialise once and use it for all gossip validation tests. 
+ let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.into()) + .deterministic_keypairs(64) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + harness.advance_slot(); + + let verify_fn = |column_sidecar: DataColumnSidecar| { + GossipVerifiedDataColumn::<_>::new_for_block_publishing( + column_sidecar.into(), + &harness.chain, + ) + }; + empty_data_column_sidecars_fails_validation(&harness, &verify_fn).await; + data_column_sidecar_commitments_exceed_max_blobs_per_block(&harness, &verify_fn).await; + } + + async fn empty_data_column_sidecars_fails_validation( + harness: &BeaconChainHarness>, + verify_fn: &impl Fn(DataColumnSidecar) -> Result, + ) { let slot = harness.get_current_slot(); let state = harness.get_current_state(); let ((block, _blobs_opt), _state) = harness @@ -823,14 +885,47 @@ mod test { .unwrap(), }; - let result = validate_data_column_sidecar_for_gossip::<_, Observe>( - column_sidecar.into(), - DataColumnSubnetId::from_column_index(index, &harness.spec), - &harness.chain, - ); + let result = verify_fn(column_sidecar); assert!(matches!( result.err(), Some(GossipDataColumnError::UnexpectedDataColumn) )); } + + async fn data_column_sidecar_commitments_exceed_max_blobs_per_block( + harness: &BeaconChainHarness>, + verify_fn: &impl Fn(DataColumnSidecar) -> Result, + ) { + let slot = harness.get_current_slot(); + let epoch = slot.epoch(E::slots_per_epoch()); + let state = harness.get_current_state(); + let max_blobs_per_block = harness.spec.max_blobs_per_block(epoch) as usize; + let fork = harness.spec.fork_name_at_epoch(epoch); + + // Generate data column sidecar with blob count exceeding max_blobs_per_block. 
+ let blob_count = max_blobs_per_block + 1; + let BlobsBundle:: { + commitments: preloaded_commitments_single, + proofs: _, + blobs: _, + } = generate_blobs(1, fork).unwrap().0; + + let ((block, _blobs_opt), _state) = harness + .make_block_with_modifier(state, slot, |block| { + *block.body_mut().blob_kzg_commitments_mut().unwrap() = + vec![preloaded_commitments_single[0]; blob_count].into(); + }) + .await; + + let column_sidecar = generate_data_column_sidecars_from_block(&block, &harness.spec) + .into_iter() + .next() + .unwrap(); + + let result = verify_fn(Arc::try_unwrap(column_sidecar).unwrap()); + assert!(matches!( + result.err(), + Some(GossipDataColumnError::MaxBlobsPerBlockExceeded { .. }) + )); + } } diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index c2230ba057..1d57550156 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -3380,7 +3380,7 @@ pub fn generate_rand_block_and_data_columns( } /// Generate data column sidecars from pre-computed cells and proofs. -fn generate_data_column_sidecars_from_block( +pub fn generate_data_column_sidecars_from_block( block: &SignedBeaconBlock, spec: &ChainSpec, ) -> DataColumnSidecarList { diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index 5fc94c2958..fa6b5fd243 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -708,6 +708,7 @@ impl NetworkBeaconProcessor { | GossipDataColumnError::InvalidKzgProof { .. } | GossipDataColumnError::UnexpectedDataColumn | GossipDataColumnError::InvalidColumnIndex(_) + | GossipDataColumnError::MaxBlobsPerBlockExceeded { .. } | GossipDataColumnError::InconsistentCommitmentsLength { .. } | GossipDataColumnError::InconsistentProofsLength { .. 
} | GossipDataColumnError::NotFinalizedDescendant { .. } => { From 345faf52cbff602919e7b882758617555ea1f294 Mon Sep 17 00:00:00 2001 From: Mac L Date: Wed, 15 Oct 2025 10:03:46 +0400 Subject: [PATCH 81/81] Remove `safe_arith` and import from crates.io (#8191) Use the recently published `safe_arith` and remove it from Lighthouse https://crates.io/crates/safe_arith Co-Authored-By: Mac L --- Cargo.lock | 6 +- Cargo.toml | 3 +- consensus/safe_arith/Cargo.toml | 8 -- consensus/safe_arith/src/iter.rs | 70 ------------- consensus/safe_arith/src/lib.rs | 166 ------------------------------- 5 files changed, 5 insertions(+), 248 deletions(-) delete mode 100644 consensus/safe_arith/Cargo.toml delete mode 100644 consensus/safe_arith/src/iter.rs delete mode 100644 consensus/safe_arith/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 13a4412796..31cccc6a98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2162,7 +2162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "18e4fdb82bd54a12e42fb58a800dcae6b9e13982238ce2296dc3570b92148e1f" dependencies = [ "data-encoding", - "syn 1.0.109", + "syn 2.0.100", ] [[package]] @@ -7374,7 +7374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.100", @@ -8269,6 +8269,8 @@ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "safe_arith" version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b147bb6111014916d3ef9d4c85173124a8e12193a67f6176d67244afd558d6c1" [[package]] name = "salsa20" diff --git a/Cargo.toml b/Cargo.toml index a5f01a498d..a46dc355e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,7 +56,6 @@ members = [ "consensus/int_to_bytes", "consensus/merkle_proof", "consensus/proto_array", - "consensus/safe_arith", 
"consensus/state_processing", "consensus/swap_or_not_shuffle", "consensus/types", @@ -225,7 +224,7 @@ ring = "0.17" rpds = "0.11" rusqlite = { version = "0.28", features = ["bundled"] } rust_eth_kzg = "0.9" -safe_arith = { path = "consensus/safe_arith" } +safe_arith = "0.1" sensitive_url = { path = "common/sensitive_url" } serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/consensus/safe_arith/Cargo.toml b/consensus/safe_arith/Cargo.toml deleted file mode 100644 index 9ac9fe28d3..0000000000 --- a/consensus/safe_arith/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "safe_arith" -version = "0.1.0" -authors = ["Michael Sproul "] -edition = { workspace = true } -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] diff --git a/consensus/safe_arith/src/iter.rs b/consensus/safe_arith/src/iter.rs deleted file mode 100644 index d5ee51b588..0000000000 --- a/consensus/safe_arith/src/iter.rs +++ /dev/null @@ -1,70 +0,0 @@ -use crate::{Result, SafeArith}; - -/// Extension trait for iterators, providing a safe replacement for `sum`. 
-pub trait SafeArithIter { - fn safe_sum(self) -> Result; -} - -impl SafeArithIter for I -where - I: Iterator + Sized, - T: SafeArith, -{ - fn safe_sum(mut self) -> Result { - self.try_fold(T::ZERO, |acc, x| acc.safe_add(x)) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::ArithError; - - #[test] - fn empty_sum() { - let v: Vec = vec![]; - assert_eq!(v.into_iter().safe_sum(), Ok(0)); - } - - #[test] - fn unsigned_sum_small() { - let arr = [400u64, 401, 402, 403, 404, 405, 406]; - assert_eq!( - arr.iter().copied().safe_sum().unwrap(), - arr.iter().copied().sum() - ); - } - - #[test] - fn unsigned_sum_overflow() { - let v = vec![u64::MAX, 1]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_small() { - let v = vec![-1i64, -2i64, -3i64, 3, 2, 1]; - assert_eq!(v.into_iter().safe_sum(), Ok(0)); - } - - #[test] - fn signed_sum_overflow_above() { - let v = vec![1, 2, 3, 4, i16::MAX, 0, 1, 2, 3]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_overflow_below() { - let v = vec![i16::MIN, -1]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_almost_overflow() { - let arr = [i64::MIN, 1, -1i64, i64::MAX, i64::MAX, 1]; - assert_eq!( - arr.iter().copied().safe_sum().unwrap(), - arr.iter().copied().sum() - ); - } -} diff --git a/consensus/safe_arith/src/lib.rs b/consensus/safe_arith/src/lib.rs deleted file mode 100644 index aa397c0603..0000000000 --- a/consensus/safe_arith/src/lib.rs +++ /dev/null @@ -1,166 +0,0 @@ -//! Library for safe arithmetic on integers, avoiding overflow and division by zero. -mod iter; - -pub use iter::SafeArithIter; - -/// Error representing the failure of an arithmetic operation. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ArithError { - Overflow, - DivisionByZero, -} - -pub type Result = std::result::Result; - -macro_rules! 
assign_method { - ($name:ident, $op:ident, $doc_op:expr) => { - assign_method!($name, $op, Self, $doc_op); - }; - ($name:ident, $op:ident, $rhs_ty:ty, $doc_op:expr) => { - #[doc = "Safe variant of `"] - #[doc = $doc_op] - #[doc = "`."] - #[inline] - fn $name(&mut self, other: $rhs_ty) -> Result<()> { - *self = self.$op(other)?; - Ok(()) - } - }; -} - -/// Trait providing safe arithmetic operations for built-in types. -pub trait SafeArith: Sized + Copy { - const ZERO: Self; - const ONE: Self; - - /// Safe variant of `+` that guards against overflow. - fn safe_add(&self, other: Rhs) -> Result; - - /// Safe variant of `-` that guards against overflow. - fn safe_sub(&self, other: Rhs) -> Result; - - /// Safe variant of `*` that guards against overflow. - fn safe_mul(&self, other: Rhs) -> Result; - - /// Safe variant of `/` that guards against division by 0. - fn safe_div(&self, other: Rhs) -> Result; - - /// Safe variant of `%` that guards against division by 0. - fn safe_rem(&self, other: Rhs) -> Result; - - /// Safe variant of `<<` that guards against overflow. - fn safe_shl(&self, other: u32) -> Result; - - /// Safe variant of `>>` that guards against overflow. - fn safe_shr(&self, other: u32) -> Result; - - assign_method!(safe_add_assign, safe_add, Rhs, "+="); - assign_method!(safe_sub_assign, safe_sub, Rhs, "-="); - assign_method!(safe_mul_assign, safe_mul, Rhs, "*="); - assign_method!(safe_div_assign, safe_div, Rhs, "/="); - assign_method!(safe_rem_assign, safe_rem, Rhs, "%="); - assign_method!(safe_shl_assign, safe_shl, u32, "<<="); - assign_method!(safe_shr_assign, safe_shr, u32, ">>="); -} - -macro_rules! 
impl_safe_arith { - ($typ:ty) => { - impl SafeArith for $typ { - const ZERO: Self = 0; - const ONE: Self = 1; - - #[inline] - fn safe_add(&self, other: Self) -> Result { - self.checked_add(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_sub(&self, other: Self) -> Result { - self.checked_sub(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_mul(&self, other: Self) -> Result { - self.checked_mul(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_div(&self, other: Self) -> Result { - self.checked_div(other).ok_or(ArithError::DivisionByZero) - } - - #[inline] - fn safe_rem(&self, other: Self) -> Result { - self.checked_rem(other).ok_or(ArithError::DivisionByZero) - } - - #[inline] - fn safe_shl(&self, other: u32) -> Result { - self.checked_shl(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_shr(&self, other: u32) -> Result { - self.checked_shr(other).ok_or(ArithError::Overflow) - } - } - }; -} - -impl_safe_arith!(u8); -impl_safe_arith!(u16); -impl_safe_arith!(u32); -impl_safe_arith!(u64); -impl_safe_arith!(usize); -impl_safe_arith!(i8); -impl_safe_arith!(i16); -impl_safe_arith!(i32); -impl_safe_arith!(i64); -impl_safe_arith!(isize); - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn basic() { - let x = 10u32; - let y = 11; - assert_eq!(x.safe_add(y), Ok(x + y)); - assert_eq!(y.safe_sub(x), Ok(y - x)); - assert_eq!(x.safe_mul(y), Ok(x * y)); - assert_eq!(x.safe_div(y), Ok(x / y)); - assert_eq!(x.safe_rem(y), Ok(x % y)); - - assert_eq!(x.safe_shl(1), Ok(x << 1)); - assert_eq!(x.safe_shr(1), Ok(x >> 1)); - } - - #[test] - fn mutate() { - let mut x = 0u8; - x.safe_add_assign(2).unwrap(); - assert_eq!(x, 2); - x.safe_sub_assign(1).unwrap(); - assert_eq!(x, 1); - x.safe_shl_assign(1).unwrap(); - assert_eq!(x, 2); - x.safe_mul_assign(3).unwrap(); - assert_eq!(x, 6); - x.safe_div_assign(4).unwrap(); - assert_eq!(x, 1); - x.safe_shr_assign(1).unwrap(); - assert_eq!(x, 0); - } - - #[test] - fn errors() { - 
assert!(u32::MAX.safe_add(1).is_err()); - assert!(u32::MIN.safe_sub(1).is_err()); - assert!(u32::MAX.safe_mul(2).is_err()); - assert!(u32::MAX.safe_div(0).is_err()); - assert!(u32::MAX.safe_rem(0).is_err()); - assert!(u32::MAX.safe_shl(32).is_err()); - assert!(u32::MAX.safe_shr(32).is_err()); - } -}