From 5527125f5e13456d821998027b84a8abc9bd8be4 Mon Sep 17 00:00:00 2001 From: Tim Fan Date: Tue, 22 Apr 2025 17:40:36 +0800 Subject: [PATCH 01/22] Fix GitHub releases page looks bad in GitHub dark theme (#7340) #2573 Change release page display in dark theme. Before image After: image Others stay unchanged --- .github/workflows/release.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d8a52dd010..04e8a534da 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -236,13 +236,13 @@ jobs: | System | Architecture | Binary | PGP Signature | |:---:|:---:|:---:|:---| - | | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz.asc) | - | | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz.asc) | - | | aarch64 | [lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz.asc) | - | | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ 
env.VERSION }}-x86_64-windows.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz.asc) | + | Apple logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz.asc) | + | Linux logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz.asc) | + | Raspberrypi logo | aarch64 | [lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz.asc) | + | Windows logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz.asc) | | | | | | | **System** | **Option** | - | **Resource** | - | | Docker | [${{ env.VERSION }}](https://hub.docker.com/r/${{ env.IMAGE_NAME }}/tags?page=1&ordering=last_updated&name=${{ env.VERSION }}) | [${{ env.IMAGE_NAME }}](https://hub.docker.com/r/${{ 
env.IMAGE_NAME }}) | + | Docker logo | Docker | [${{ env.VERSION }}](https://hub.docker.com/r/${{ env.IMAGE_NAME }}/tags?page=1&ordering=last_updated&name=${{ env.VERSION }}) | [${{ env.IMAGE_NAME }}](https://hub.docker.com/r/${{ env.IMAGE_NAME }}) | ENDBODY ) assets=(./lighthouse-*.tar.gz*/lighthouse-*.tar.gz*) From c13e069c9c633535336bca6d6f5f7b30720df629 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Wed, 23 Apr 2025 06:46:30 +0800 Subject: [PATCH 02/22] Revise logging when `queue is full` (#7324) --- .../beacon_processor/src/work_reprocessing_queue.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/beacon_node/beacon_processor/src/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/work_reprocessing_queue.rs index a4f539aea0..2b6e72ae0c 100644 --- a/beacon_node/beacon_processor/src/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/work_reprocessing_queue.rs @@ -452,7 +452,7 @@ impl ReprocessQueue { if self.early_block_debounce.elapsed() { warn!( queue_size = MAXIMUM_QUEUED_BLOCKS, - msg = "check system clock", + msg = "system resources may be saturated", "Early blocks queue is full" ); } @@ -500,7 +500,7 @@ impl ReprocessQueue { if self.rpc_block_debounce.elapsed() { warn!( queue_size = MAXIMUM_QUEUED_BLOCKS, - msg = "check system clock", + msg = "system resources may be saturated", "RPC blocks queue is full" ); } @@ -540,7 +540,7 @@ impl ReprocessQueue { if self.attestation_delay_debounce.elapsed() { error!( queue_size = MAXIMUM_QUEUED_ATTESTATIONS, - msg = "check system clock", + msg = "system resources may be saturated", "Aggregate attestation delay queue is full" ); } @@ -572,7 +572,7 @@ impl ReprocessQueue { if self.attestation_delay_debounce.elapsed() { error!( queue_size = MAXIMUM_QUEUED_ATTESTATIONS, - msg = "check system clock", + msg = "system resources may be saturated", "Attestation delay queue is full" ); } @@ -606,7 +606,7 @@ impl ReprocessQueue { if 
self.lc_update_delay_debounce.elapsed() { error!( queue_size = MAXIMUM_QUEUED_LIGHT_CLIENT_UPDATES, - msg = "check system clock", + msg = "system resources may be saturated", "Light client updates delay queue is full" ); } From 1dd37048b9d15c27755d0a9ef73eee3f70393bae Mon Sep 17 00:00:00 2001 From: Robert Mordzon Date: Thu, 24 Apr 2025 00:31:46 +0200 Subject: [PATCH 03/22] Enable cross-compiling for riscv64 architecture (#7346) Lighthouse currently lacks support for cross-compilation targeting the `riscv64` architecture. This PR introduces initial support for cross-compiling Lighthouse to `riscv64`. The following changes were made: - **Makefile**: Updated to support `cross` with `riscv64` as a target. - **Cross.toml**: Added configuration specific to `riscv64`. - **Documentation**: List 'build-riscv64' in `book/src/installation_cross_compiling.md`. --- Cross.toml | 5 +++++ Makefile | 9 +++++++++ book/src/installation_cross_compiling.md | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Cross.toml b/Cross.toml index 8181967f32..391e8751c8 100644 --- a/Cross.toml +++ b/Cross.toml @@ -4,6 +4,11 @@ pre-build = ["apt-get install -y cmake clang-5.0"] [target.aarch64-unknown-linux-gnu] pre-build = ["apt-get install -y cmake clang-5.0"] +[target.riscv64gc-unknown-linux-gnu] +pre-build = ["apt-get install -y cmake clang"] +# Use the most recent Cross image for RISCV because the stable 0.2.5 image doesn't work +image = "ghcr.io/cross-rs/riscv64gc-unknown-linux-gnu:main" + # Allow setting page size limits for jemalloc at build time: # For certain architectures (like aarch64), we must compile # jemalloc with support for large page sizes, otherwise the host's diff --git a/Makefile b/Makefile index f621f38a63..03bf33a6d8 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,8 @@ X86_64_TAG = "x86_64-unknown-linux-gnu" BUILD_PATH_X86_64 = "target/$(X86_64_TAG)/release" AARCH64_TAG = "aarch64-unknown-linux-gnu" BUILD_PATH_AARCH64 = "target/$(AARCH64_TAG)/release" 
+RISCV64_TAG = "riscv64gc-unknown-linux-gnu" +BUILD_PATH_RISCV64 = "target/$(RISCV64_TAG)/release" PINNED_NIGHTLY ?= nightly @@ -67,6 +69,8 @@ build-aarch64: # pages, which are commonly used by aarch64 systems. # See: https://github.com/sigp/lighthouse/issues/5244 JEMALLOC_SYS_WITH_LG_PAGE=16 cross build --bin lighthouse --target aarch64-unknown-linux-gnu --features "portable,$(CROSS_FEATURES)" --profile "$(CROSS_PROFILE)" --locked +build-riscv64: + cross build --bin lighthouse --target riscv64gc-unknown-linux-gnu --features "portable,$(CROSS_FEATURES)" --profile "$(CROSS_PROFILE)" --locked build-lcli-x86_64: cross build --bin lcli --target x86_64-unknown-linux-gnu --features "portable" --profile "$(CROSS_PROFILE)" --locked @@ -75,6 +79,8 @@ build-lcli-aarch64: # pages, which are commonly used by aarch64 systems. # See: https://github.com/sigp/lighthouse/issues/5244 JEMALLOC_SYS_WITH_LG_PAGE=16 cross build --bin lcli --target aarch64-unknown-linux-gnu --features "portable" --profile "$(CROSS_PROFILE)" --locked +build-lcli-riscv64: + cross build --bin lcli --target riscv64gc-unknown-linux-gnu --features "portable" --profile "$(CROSS_PROFILE)" --locked # Create a `.tar.gz` containing a binary for a specific target. define tarball_release_binary @@ -95,6 +101,9 @@ build-release-tarballs: $(call tarball_release_binary,$(BUILD_PATH_X86_64),$(X86_64_TAG),"") $(MAKE) build-aarch64 $(call tarball_release_binary,$(BUILD_PATH_AARCH64),$(AARCH64_TAG),"") + $(MAKE) build-riscv64 + $(call tarball_release_binary,$(BUILD_PATH_RISCV64),$(RISCV64_TAG),"") + # Runs the full workspace tests in **release**, without downloading any additional # test vectors. diff --git a/book/src/installation_cross_compiling.md b/book/src/installation_cross_compiling.md index 4f6ba9af38..59fa3762c2 100644 --- a/book/src/installation_cross_compiling.md +++ b/book/src/installation_cross_compiling.md @@ -18,7 +18,8 @@ project. 
The `Makefile` in the project contains two targets for cross-compiling: - `build-x86_64`: builds an optimized version for x86_64 processors (suitable for most users). -- `build-aarch64`: builds an optimized version for 64-bit ARM processors (suitable for Raspberry Pi 4). +- `build-aarch64`: builds an optimized version for 64-bit ARM processors (suitable for Raspberry Pi 4/5). +- `build-riscv64`: builds an optimized version for 64-bit RISC-V processors. ### Example From 402a81cdd78ebd9ddd2f5e522fac7464a408cdb3 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Thu, 24 Apr 2025 10:55:10 +0800 Subject: [PATCH 04/22] Fix Kurtosis testnet (#7350) `spamoor_blob` is removed in https://github.com/ethpandaops/ethereum-package/pull/972. When attempting to start local testnet, it will error: ` Evaluation error: fail: Invalid additional_services spamoor_blob, allowed fields: ["assertoor", "broadcaster", "tx_fuzz", "custom_flood", "forkmon", "blockscout", "dora", "full_beaconchain_explorer", "prometheus_grafana", "blobscan", "dugtrio", "blutgang", "forky", "apache", "tracoor", "spamoor"] ` This PR changes `spamoor_blob` to `spamoor`. 
--- scripts/local_testnet/network_params.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/local_testnet/network_params.yaml b/scripts/local_testnet/network_params.yaml index 87ffeb8d22..e671340afb 100644 --- a/scripts/local_testnet/network_params.yaml +++ b/scripts/local_testnet/network_params.yaml @@ -14,5 +14,5 @@ global_log_level: debug snooper_enabled: false additional_services: - dora - - spamoor_blob + - spamoor - prometheus_grafana From 1324d3d3c4c20914545f6dadcb018a5b442a95a5 Mon Sep 17 00:00:00 2001 From: Akihito Nakano Date: Thu, 24 Apr 2025 12:46:16 +0900 Subject: [PATCH 05/22] Delayed RPC Send Using Tokens (#5923) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes https://github.com/sigp/lighthouse/issues/5785 The diagram below shows the differences in how the receiver (responder) behaves before and after this PR. The following sentences will detail the changes. ```mermaid flowchart TD subgraph "*** After ***" Start2([START]) --> AA[Receive request] AA --> COND1{Is there already an active request
with the same protocol?} COND1 --> |Yes| CC[Send error response] CC --> End2([END]) %% COND1 --> |No| COND2{Request is too large?} %% COND2 --> |Yes| CC COND1 --> |No| DD[Process request] DD --> EE{Rate limit reached?} EE --> |Yes| FF[Wait until tokens are regenerated] FF --> EE EE --> |No| GG[Send response] GG --> End2 end subgraph "*** Before ***" Start([START]) --> A[Receive request] A --> B{Rate limit reached
or
request is too large?} B -->|Yes| C[Send error response] C --> End([END]) B -->|No| E[Process request] E --> F[Send response] F --> End end ``` ### `Is there already an active request with the same protocol?` This check is not performed in `Before`. This is taken from the PR in the consensus-spec, which proposes updates regarding rate limiting and response timeout. https://github.com/ethereum/consensus-specs/pull/3767/files > The requester MUST NOT make more than two concurrent requests with the same ID. The PR mentions the requester side. In this PR, I introduced the `ActiveRequestsLimiter` for the `responder` side to restrict more than two requests from running simultaneously on the same protocol per peer. If the limiter disallows a request, the responder sends a rate-limited error and penalizes the requester. ### `Rate limit reached?` and `Wait until tokens are regenerated` UPDATE: I moved the limiter logic to the behaviour side. https://github.com/sigp/lighthouse/pull/5923#issuecomment-2379535927 ~~The rate limiter is shared between the behaviour and the handler. (`Arc>>`) The handler checks the rate limit and queues the response if the limit is reached. The behaviour handles pruning.~~ ~~I considered not sharing the rate limiter between the behaviour and the handler, and performing all of these either within the behaviour or handler. However, I decided against this for the following reasons:~~ - ~~Regarding performing everything within the behaviour: The behaviour is unable to recognize the response protocol when `RPC::send_response()` is called, especially when the response is `RPCCodedResponse::Error`. Therefore, the behaviour can't rate limit responses based on the response protocol.~~ - ~~Regarding performing everything within the handler: When multiple connections are established with a peer, there could be multiple handlers interacting with that peer. Thus, we cannot enforce rate limiting per peer solely within the handler. (Any ideas? 
🤔 )~~ --- beacon_node/lighthouse_network/src/metrics.rs | 14 + .../lighthouse_network/src/rpc/handler.rs | 4 +- .../lighthouse_network/src/rpc/methods.rs | 14 + beacon_node/lighthouse_network/src/rpc/mod.rs | 271 ++++++++------ .../src/rpc/rate_limiter.rs | 13 +- .../src/rpc/response_limiter.rs | 177 +++++++++ .../src/rpc/self_limiter.rs | 339 +++++++++++++++--- .../lighthouse_network/tests/common.rs | 31 +- .../lighthouse_network/tests/rpc_tests.rs | 276 +++++++++++++- 9 files changed, 976 insertions(+), 163 deletions(-) create mode 100644 beacon_node/lighthouse_network/src/rpc/response_limiter.rs diff --git a/beacon_node/lighthouse_network/src/metrics.rs b/beacon_node/lighthouse_network/src/metrics.rs index b36cb8075d..da986f2884 100644 --- a/beacon_node/lighthouse_network/src/metrics.rs +++ b/beacon_node/lighthouse_network/src/metrics.rs @@ -206,6 +206,20 @@ pub static REPORT_PEER_MSGS: LazyLock> = LazyLock::new(|| ) }); +pub static OUTBOUND_REQUEST_IDLING: LazyLock> = LazyLock::new(|| { + try_create_histogram( + "outbound_request_idling_seconds", + "The time our own request remained idle in the self-limiter", + ) +}); + +pub static RESPONSE_IDLING: LazyLock> = LazyLock::new(|| { + try_create_histogram( + "response_idling_seconds", + "The time our response remained idle in the response limiter", + ) +}); + pub fn scrape_discovery_metrics() { let metrics = discv5::metrics::Metrics::from(discv5::Discv5::::raw_metrics()); diff --git a/beacon_node/lighthouse_network/src/rpc/handler.rs b/beacon_node/lighthouse_network/src/rpc/handler.rs index b86e2b3a6f..33c5521c3b 100644 --- a/beacon_node/lighthouse_network/src/rpc/handler.rs +++ b/beacon_node/lighthouse_network/src/rpc/handler.rs @@ -141,7 +141,7 @@ where /// Waker, to be sure the handler gets polled when needed. waker: Option, - /// Timeout that will me used for inbound and outbound responses. + /// Timeout that will be used for inbound and outbound responses. 
resp_timeout: Duration, } @@ -314,6 +314,7 @@ where } return; }; + // If the response we are sending is an error, report back for handling if let RpcResponse::Error(ref code, ref reason) = response { self.events_out.push(HandlerEvent::Err(HandlerErr::Inbound { @@ -331,6 +332,7 @@ where "Response not sent. Deactivated handler"); return; } + inbound_info.pending_items.push_back(response); } } diff --git a/beacon_node/lighthouse_network/src/rpc/methods.rs b/beacon_node/lighthouse_network/src/rpc/methods.rs index b748ab11c0..e6939e36d8 100644 --- a/beacon_node/lighthouse_network/src/rpc/methods.rs +++ b/beacon_node/lighthouse_network/src/rpc/methods.rs @@ -606,6 +606,20 @@ pub enum ResponseTermination { LightClientUpdatesByRange, } +impl ResponseTermination { + pub fn as_protocol(&self) -> Protocol { + match self { + ResponseTermination::BlocksByRange => Protocol::BlocksByRange, + ResponseTermination::BlocksByRoot => Protocol::BlocksByRoot, + ResponseTermination::BlobsByRange => Protocol::BlobsByRange, + ResponseTermination::BlobsByRoot => Protocol::BlobsByRoot, + ResponseTermination::DataColumnsByRoot => Protocol::DataColumnsByRoot, + ResponseTermination::DataColumnsByRange => Protocol::DataColumnsByRange, + ResponseTermination::LightClientUpdatesByRange => Protocol::LightClientUpdatesByRange, + } + } +} + /// The structured response containing a result/code indicating success or failure /// and the contents of the response #[derive(Debug, Clone)] diff --git a/beacon_node/lighthouse_network/src/rpc/mod.rs b/beacon_node/lighthouse_network/src/rpc/mod.rs index 0f23da7f38..8cb720132a 100644 --- a/beacon_node/lighthouse_network/src/rpc/mod.rs +++ b/beacon_node/lighthouse_network/src/rpc/mod.rs @@ -4,7 +4,6 @@ //! direct peer-to-peer communication primarily for sending/receiving chain information for //! syncing. 
-use futures::future::FutureExt; use handler::RPCHandler; use libp2p::core::transport::PortUse; use libp2p::swarm::{ @@ -13,13 +12,12 @@ use libp2p::swarm::{ }; use libp2p::swarm::{ConnectionClosed, FromSwarm, SubstreamProtocol, THandlerInEvent}; use libp2p::PeerId; -use logging::crit; -use rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr}; +use std::collections::HashMap; use std::marker::PhantomData; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; -use tracing::{debug, instrument, trace}; +use tracing::{debug, error, instrument, trace}; use types::{EthSpec, ForkContext}; pub(crate) use handler::{HandlerErr, HandlerEvent}; @@ -28,6 +26,11 @@ pub(crate) use methods::{ }; pub use protocol::RequestType; +use self::config::{InboundRateLimiterConfig, OutboundRateLimiterConfig}; +use self::protocol::RPCProtocol; +use self::self_limiter::SelfRateLimiter; +use crate::rpc::rate_limiter::RateLimiterItem; +use crate::rpc::response_limiter::ResponseLimiter; pub use handler::SubstreamId; pub use methods::{ BlocksByRangeRequest, BlocksByRootRequest, GoodbyeReason, LightClientBootstrapRequest, @@ -35,10 +38,6 @@ pub use methods::{ }; pub use protocol::{Protocol, RPCError}; -use self::config::{InboundRateLimiterConfig, OutboundRateLimiterConfig}; -use self::protocol::RPCProtocol; -use self::self_limiter::SelfRateLimiter; - pub(crate) mod codec; pub mod config; mod handler; @@ -46,8 +45,12 @@ pub mod methods; mod outbound; mod protocol; mod rate_limiter; +mod response_limiter; mod self_limiter; +// Maximum number of concurrent requests per protocol ID that a client may issue. +const MAX_CONCURRENT_REQUESTS: usize = 2; + /// Composite trait for a request id. 
pub trait ReqId: Send + 'static + std::fmt::Debug + Copy + Clone {} impl ReqId for T where T: Send + 'static + std::fmt::Debug + Copy + Clone {} @@ -144,10 +147,12 @@ pub struct NetworkParams { /// Implements the libp2p `NetworkBehaviour` trait and therefore manages network-level /// logic. pub struct RPC { - /// Rate limiter - limiter: Option, + /// Rate limiter for our responses. + response_limiter: Option>, /// Rate limiter for our own requests. - self_limiter: Option>, + outbound_request_limiter: SelfRateLimiter, + /// Active inbound requests that are awaiting a response. + active_inbound_requests: HashMap)>, /// Queue of events to be processed. events: Vec>, fork_context: Arc, @@ -173,20 +178,20 @@ impl RPC { network_params: NetworkParams, seq_number: u64, ) -> Self { - let inbound_limiter = inbound_rate_limiter_config.map(|config| { - debug!(?config, "Using inbound rate limiting params"); - RateLimiter::new_with_config(config.0, fork_context.clone()) + let response_limiter = inbound_rate_limiter_config.map(|config| { + debug!(?config, "Using response rate limiting params"); + ResponseLimiter::new(config, fork_context.clone()) .expect("Inbound limiter configuration parameters are valid") }); - let self_limiter = outbound_rate_limiter_config.map(|config| { - SelfRateLimiter::new(config, fork_context.clone()) - .expect("Configuration parameters are valid") - }); + let outbound_request_limiter: SelfRateLimiter = + SelfRateLimiter::new(outbound_rate_limiter_config, fork_context.clone()) + .expect("Outbound limiter configuration parameters are valid"); RPC { - limiter: inbound_limiter, - self_limiter, + response_limiter, + outbound_request_limiter, + active_inbound_requests: HashMap::new(), events: Vec::new(), fork_context, enable_light_client_server, @@ -210,6 +215,44 @@ impl RPC { request_id: InboundRequestId, response: RpcResponse, ) { + let Some((_peer_id, request_type)) = self.active_inbound_requests.remove(&request_id) + else { + error!(%peer_id, ?request_id, 
%response, "Request not found in active_inbound_requests. Response not sent"); + return; + }; + + // Add the request back to active requests if the response is `Success` and requires stream + // termination. + if request_type.protocol().terminator().is_some() + && matches!(response, RpcResponse::Success(_)) + { + self.active_inbound_requests + .insert(request_id, (peer_id, request_type.clone())); + } + + self.send_response_inner(peer_id, request_type.protocol(), request_id, response); + } + + fn send_response_inner( + &mut self, + peer_id: PeerId, + protocol: Protocol, + request_id: InboundRequestId, + response: RpcResponse, + ) { + if let Some(response_limiter) = self.response_limiter.as_mut() { + if !response_limiter.allows( + peer_id, + protocol, + request_id.connection_id, + request_id.substream_id, + response.clone(), + ) { + // Response is logged and queued internally in the response limiter. + return; + } + } + self.events.push(ToSwarm::NotifyHandler { peer_id, handler: NotifyHandler::One(request_id.connection_id), @@ -227,23 +270,19 @@ impl RPC { skip_all )] pub fn send_request(&mut self, peer_id: PeerId, request_id: Id, req: RequestType) { - let event = if let Some(self_limiter) = self.self_limiter.as_mut() { - match self_limiter.allows(peer_id, request_id, req) { - Ok(event) => event, - Err(_e) => { - // Request is logged and queued internally in the self rate limiter. - return; - } + match self + .outbound_request_limiter + .allows(peer_id, request_id, req) + { + Ok(event) => self.events.push(BehaviourAction::NotifyHandler { + peer_id, + handler: NotifyHandler::Any, + event, + }), + Err(_e) => { + // Request is logged and queued internally in the self rate limiter. } - } else { - RPCSend::Request(request_id, req) - }; - - self.events.push(BehaviourAction::NotifyHandler { - peer_id, - handler: NotifyHandler::Any, - event, - }); + } } /// Lighthouse wishes to disconnect from this peer by sending a Goodbye message. 
This @@ -373,20 +412,27 @@ where if remaining_established > 0 { return; } + // Get a list of pending requests from the self rate limiter - if let Some(limiter) = self.self_limiter.as_mut() { - for (id, proto) in limiter.peer_disconnected(peer_id) { - let error_msg = ToSwarm::GenerateEvent(RPCMessage { - peer_id, - connection_id, - message: Err(HandlerErr::Outbound { - id, - proto, - error: RPCError::Disconnected, - }), - }); - self.events.push(error_msg); - } + for (id, proto) in self.outbound_request_limiter.peer_disconnected(peer_id) { + let error_msg = ToSwarm::GenerateEvent(RPCMessage { + peer_id, + connection_id, + message: Err(HandlerErr::Outbound { + id, + proto, + error: RPCError::Disconnected, + }), + }); + self.events.push(error_msg); + } + + self.active_inbound_requests.retain( + |_inbound_request_id, (request_peer_id, _request_type)| *request_peer_id != peer_id, + ); + + if let Some(limiter) = self.response_limiter.as_mut() { + limiter.peer_disconnected(peer_id); } // Replace the pending Requests to the disconnected peer @@ -420,57 +466,39 @@ where ) { match event { HandlerEvent::Ok(RPCReceived::Request(request_id, request_type)) => { - if let Some(limiter) = self.limiter.as_mut() { - // check if the request is conformant to the quota - match limiter.allows(&peer_id, &request_type) { - Err(RateLimitedErr::TooLarge) => { - // we set the batch sizes, so this is a coding/config err for most protocols - let protocol = request_type.versioned_protocol().protocol(); - if matches!( - protocol, - Protocol::BlocksByRange - | Protocol::BlobsByRange - | Protocol::DataColumnsByRange - | Protocol::BlocksByRoot - | Protocol::BlobsByRoot - | Protocol::DataColumnsByRoot - ) { - debug!(request = %request_type, %protocol, "Request too large to process"); - } else { - // Other protocols shouldn't be sending large messages, we should flag the peer kind - crit!(%protocol, "Request size too large to ever be processed"); - } - // send an error code to the peer. 
- // the handler upon receiving the error code will send it back to the behaviour - self.send_response( - peer_id, - request_id, - RpcResponse::Error( - RpcErrorResponse::RateLimited, - "Rate limited. Request too large".into(), - ), - ); - return; - } - Err(RateLimitedErr::TooSoon(wait_time)) => { - debug!(request = %request_type, %peer_id, wait_time_ms = wait_time.as_millis(), "Request exceeds the rate limit"); - // send an error code to the peer. - // the handler upon receiving the error code will send it back to the behaviour - self.send_response( - peer_id, - request_id, - RpcResponse::Error( - RpcErrorResponse::RateLimited, - format!("Wait {:?}", wait_time).into(), - ), - ); - return; - } - // No rate limiting, continue. - Ok(()) => {} - } + let is_concurrent_request_limit_exceeded = self + .active_inbound_requests + .iter() + .filter( + |(_inbound_request_id, (request_peer_id, active_request_type))| { + *request_peer_id == peer_id + && active_request_type.protocol() == request_type.protocol() + }, + ) + .count() + >= MAX_CONCURRENT_REQUESTS; + + // Restricts more than MAX_CONCURRENT_REQUESTS inbound requests from running simultaneously on the same protocol per peer. + if is_concurrent_request_limit_exceeded { + // There is already an active request with the same protocol. Send an error code to the peer. + debug!(request = %request_type, protocol = %request_type.protocol(), %peer_id, "There is an active request with the same protocol"); + self.send_response_inner( + peer_id, + request_type.protocol(), + request_id, + RpcResponse::Error( + RpcErrorResponse::RateLimited, + format!("Rate limited. There are already {MAX_CONCURRENT_REQUESTS} active requests with the same protocol") + .into(), + ), + ); + return; } + // Requests that are below the limit on the number of simultaneous requests are added to the active inbound requests. 
+ self.active_inbound_requests + .insert(request_id, (peer_id, request_type.clone())); + // If we received a Ping, we queue a Pong response. if let RequestType::Ping(_) = request_type { trace!(connection_id = %connection_id, %peer_id, "Received Ping, queueing Pong"); @@ -489,14 +517,38 @@ where message: Ok(RPCReceived::Request(request_id, request_type)), })); } - HandlerEvent::Ok(rpc) => { + HandlerEvent::Ok(RPCReceived::Response(id, response)) => { + if response.protocol().terminator().is_none() { + // Inform the limiter that a response has been received. + self.outbound_request_limiter + .request_completed(&peer_id, response.protocol()); + } + self.events.push(ToSwarm::GenerateEvent(RPCMessage { peer_id, connection_id, - message: Ok(rpc), + message: Ok(RPCReceived::Response(id, response)), + })); + } + HandlerEvent::Ok(RPCReceived::EndOfStream(id, response_termination)) => { + // Inform the limiter that a response has been received. + self.outbound_request_limiter + .request_completed(&peer_id, response_termination.as_protocol()); + + self.events.push(ToSwarm::GenerateEvent(RPCMessage { + peer_id, + connection_id, + message: Ok(RPCReceived::EndOfStream(id, response_termination)), })); } HandlerEvent::Err(err) => { + // Inform the limiter that the request has ended with an error. + let protocol = match err { + HandlerErr::Inbound { proto, .. } | HandlerErr::Outbound { proto, .. } => proto, + }; + self.outbound_request_limiter + .request_completed(&peer_id, protocol); + self.events.push(ToSwarm::GenerateEvent(RPCMessage { peer_id, connection_id, @@ -514,15 +566,20 @@ where } fn poll(&mut self, cx: &mut Context) -> Poll>> { - // let the rate limiter prune. 
- if let Some(limiter) = self.limiter.as_mut() { - let _ = limiter.poll_unpin(cx); + if let Some(response_limiter) = self.response_limiter.as_mut() { + if let Poll::Ready(responses) = response_limiter.poll_ready(cx) { + for response in responses { + self.events.push(ToSwarm::NotifyHandler { + peer_id: response.peer_id, + handler: NotifyHandler::One(response.connection_id), + event: RPCSend::Response(response.substream_id, response.response), + }); + } + } } - if let Some(self_limiter) = self.self_limiter.as_mut() { - if let Poll::Ready(event) = self_limiter.poll_ready(cx) { - self.events.push(event) - } + if let Poll::Ready(event) = self.outbound_request_limiter.poll_ready(cx) { + self.events.push(event) } if !self.events.is_empty() { diff --git a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs index b9e82a5f1e..f666c30d52 100644 --- a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs @@ -149,7 +149,7 @@ pub struct RPCRateLimiterBuilder { lcbootstrap_quota: Option, /// Quota for the LightClientOptimisticUpdate protocol. lc_optimistic_update_quota: Option, - /// Quota for the LightClientOptimisticUpdate protocol. + /// Quota for the LightClientFinalityUpdate protocol. lc_finality_update_quota: Option, /// Quota for the LightClientUpdatesByRange protocol. lc_updates_by_range_quota: Option, @@ -275,6 +275,17 @@ impl RateLimiterItem for super::RequestType { } } +impl RateLimiterItem for (super::RpcResponse, Protocol) { + fn protocol(&self) -> Protocol { + self.1 + } + + fn max_responses(&self, _current_fork: ForkName, _spec: &ChainSpec) -> u64 { + // A response chunk consumes one token of the rate limiter. 
+ 1 + } +} + impl RPCRateLimiter { pub fn new_with_config( config: RateLimiterConfig, diff --git a/beacon_node/lighthouse_network/src/rpc/response_limiter.rs b/beacon_node/lighthouse_network/src/rpc/response_limiter.rs new file mode 100644 index 0000000000..c583baaadd --- /dev/null +++ b/beacon_node/lighthouse_network/src/rpc/response_limiter.rs @@ -0,0 +1,177 @@ +use crate::rpc::config::InboundRateLimiterConfig; +use crate::rpc::rate_limiter::{RPCRateLimiter, RateLimitedErr}; +use crate::rpc::self_limiter::timestamp_now; +use crate::rpc::{Protocol, RpcResponse, SubstreamId}; +use crate::PeerId; +use futures::FutureExt; +use libp2p::swarm::ConnectionId; +use logging::crit; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::Duration; +use tokio_util::time::DelayQueue; +use tracing::debug; +use types::{EthSpec, ForkContext}; + +/// A response that was rate limited or waiting on rate limited responses for the same peer and +/// protocol. +#[derive(Clone)] +pub(super) struct QueuedResponse { + pub peer_id: PeerId, + pub connection_id: ConnectionId, + pub substream_id: SubstreamId, + pub response: RpcResponse, + pub protocol: Protocol, + pub queued_at: Duration, +} + +pub(super) struct ResponseLimiter { + /// Rate limiter for our responses. + limiter: RPCRateLimiter, + /// Responses queued for sending. These responses are stored when the response limiter rejects them. + delayed_responses: HashMap<(PeerId, Protocol), VecDeque>>, + /// The delay required to allow a peer's outbound response per protocol. + next_response: DelayQueue<(PeerId, Protocol)>, +} + +impl ResponseLimiter { + /// Creates a new [`ResponseLimiter`] based on configuration values. 
+ pub fn new( + config: InboundRateLimiterConfig, + fork_context: Arc, + ) -> Result { + Ok(ResponseLimiter { + limiter: RPCRateLimiter::new_with_config(config.0, fork_context)?, + delayed_responses: HashMap::new(), + next_response: DelayQueue::new(), + }) + } + + /// Checks if the rate limiter allows the response. When not allowed, the response is delayed + /// until it can be sent. + pub fn allows( + &mut self, + peer_id: PeerId, + protocol: Protocol, + connection_id: ConnectionId, + substream_id: SubstreamId, + response: RpcResponse, + ) -> bool { + // First check that there are not already other responses waiting to be sent. + if let Some(queue) = self.delayed_responses.get_mut(&(peer_id, protocol)) { + debug!(%peer_id, %protocol, "Response rate limiting since there are already other responses waiting to be sent"); + queue.push_back(QueuedResponse { + peer_id, + connection_id, + substream_id, + response, + protocol, + queued_at: timestamp_now(), + }); + return false; + } + + if let Err(wait_time) = + Self::try_limiter(&mut self.limiter, peer_id, response.clone(), protocol) + { + self.delayed_responses + .entry((peer_id, protocol)) + .or_default() + .push_back(QueuedResponse { + peer_id, + connection_id, + substream_id, + response, + protocol, + queued_at: timestamp_now(), + }); + self.next_response.insert((peer_id, protocol), wait_time); + return false; + } + + true + } + + /// Checks if the limiter allows the response. If the response should be delayed, the duration + /// to wait is returned. + fn try_limiter( + limiter: &mut RPCRateLimiter, + peer_id: PeerId, + response: RpcResponse, + protocol: Protocol, + ) -> Result<(), Duration> { + match limiter.allows(&peer_id, &(response.clone(), protocol)) { + Ok(()) => Ok(()), + Err(e) => match e { + RateLimitedErr::TooLarge => { + // This should never happen with default parameters. Let's just send the response. + // Log a crit since this is a config issue. 
+ crit!( + %protocol, + "Response rate limiting error for a batch that will never fit. Sending response anyway. Check configuration parameters." + ); + Ok(()) + } + RateLimitedErr::TooSoon(wait_time) => { + debug!(%peer_id, %protocol, wait_time_ms = wait_time.as_millis(), "Response rate limiting"); + Err(wait_time) + } + }, + } + } + + /// Informs the limiter that a peer has disconnected. This removes any pending responses. + pub fn peer_disconnected(&mut self, peer_id: PeerId) { + self.delayed_responses + .retain(|(map_peer_id, _protocol), _queue| map_peer_id != &peer_id); + } + + /// When a peer and protocol are allowed to send a next response, this function checks the + /// queued responses and attempts marking as ready as many as the limiter allows. + pub fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll>> { + let mut responses = vec![]; + while let Poll::Ready(Some(expired)) = self.next_response.poll_expired(cx) { + let (peer_id, protocol) = expired.into_inner(); + + if let Entry::Occupied(mut entry) = self.delayed_responses.entry((peer_id, protocol)) { + let queue = entry.get_mut(); + // Take delayed responses from the queue, as long as the limiter allows it. + while let Some(response) = queue.pop_front() { + match Self::try_limiter( + &mut self.limiter, + response.peer_id, + response.response.clone(), + response.protocol, + ) { + Ok(()) => { + metrics::observe_duration( + &crate::metrics::RESPONSE_IDLING, + timestamp_now().saturating_sub(response.queued_at), + ); + responses.push(response) + } + Err(wait_time) => { + // The response was taken from the queue, but the limiter didn't allow it. + queue.push_front(response); + self.next_response.insert((peer_id, protocol), wait_time); + break; + } + } + } + if queue.is_empty() { + entry.remove(); + } + } + } + + // Prune the rate limiter. 
+ let _ = self.limiter.poll_unpin(cx); + + if !responses.is_empty() { + return Poll::Ready(responses); + } + Poll::Pending + } +} diff --git a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs index e4af977a6c..e5b685676f 100644 --- a/beacon_node/lighthouse_network/src/rpc/self_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/self_limiter.rs @@ -1,3 +1,10 @@ +use super::{ + config::OutboundRateLimiterConfig, + rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr}, + BehaviourAction, Protocol, RPCSend, ReqId, RequestType, MAX_CONCURRENT_REQUESTS, +}; +use crate::rpc::rate_limiter::RateLimiterItem; +use std::time::{SystemTime, UNIX_EPOCH}; use std::{ collections::{hash_map::Entry, HashMap, VecDeque}, sync::Arc, @@ -13,30 +20,31 @@ use tokio_util::time::DelayQueue; use tracing::debug; use types::{EthSpec, ForkContext}; -use super::{ - config::OutboundRateLimiterConfig, - rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr}, - BehaviourAction, Protocol, RPCSend, ReqId, RequestType, -}; - /// A request that was rate limited or waiting on rate limited requests for the same peer and /// protocol. struct QueuedRequest { req: RequestType, request_id: Id, + queued_at: Duration, } +/// The number of milliseconds requests delayed due to the concurrent request limit stay in the queue. +const WAIT_TIME_DUE_TO_CONCURRENT_REQUESTS: u64 = 100; + +#[allow(clippy::type_complexity)] pub(crate) struct SelfRateLimiter { - /// Requests queued for sending per peer. This requests are stored when the self rate + /// Active requests that are awaiting a response. + active_requests: HashMap>, + /// Requests queued for sending per peer. These requests are stored when the self rate /// limiter rejects them. Rate limiting is based on a Peer and Protocol basis, therefore /// are stored in the same way. 
delayed_requests: HashMap<(PeerId, Protocol), VecDeque>>, /// The delay required to allow a peer's outbound request per protocol. next_peer_request: DelayQueue<(PeerId, Protocol)>, /// Rate limiter for our own requests. - limiter: RateLimiter, + rate_limiter: Option, /// Requests that are ready to be sent. - ready_requests: SmallVec<[(PeerId, RPCSend); 3]>, + ready_requests: SmallVec<[(PeerId, RPCSend, Duration); 3]>, } /// Error returned when the rate limiter does not accept a request. @@ -49,18 +57,23 @@ pub enum Error { } impl SelfRateLimiter { - /// Creates a new [`SelfRateLimiter`] based on configration values. + /// Creates a new [`SelfRateLimiter`] based on configuration values. pub fn new( - config: OutboundRateLimiterConfig, + config: Option, fork_context: Arc, ) -> Result { debug!(?config, "Using self rate limiting params"); - let limiter = RateLimiter::new_with_config(config.0, fork_context)?; + let rate_limiter = if let Some(c) = config { + Some(RateLimiter::new_with_config(c.0, fork_context)?) + } else { + None + }; Ok(SelfRateLimiter { + active_requests: Default::default(), delayed_requests: Default::default(), next_peer_request: Default::default(), - limiter, + rate_limiter, ready_requests: Default::default(), }) } @@ -77,11 +90,21 @@ impl SelfRateLimiter { let protocol = req.versioned_protocol().protocol(); // First check that there are not already other requests waiting to be sent. 
if let Some(queued_requests) = self.delayed_requests.get_mut(&(peer_id, protocol)) { - queued_requests.push_back(QueuedRequest { req, request_id }); - + debug!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent"); + queued_requests.push_back(QueuedRequest { + req, + request_id, + queued_at: timestamp_now(), + }); return Err(Error::PendingRequests); } - match Self::try_send_request(&mut self.limiter, peer_id, request_id, req) { + match Self::try_send_request( + &mut self.active_requests, + &mut self.rate_limiter, + peer_id, + request_id, + req, + ) { Err((rate_limited_req, wait_time)) => { let key = (peer_id, protocol); self.next_peer_request.insert(key, wait_time); @@ -99,33 +122,71 @@ impl SelfRateLimiter { /// Auxiliary function to deal with self rate limiting outcomes. If the rate limiter allows the /// request, the [`ToSwarm`] that should be emitted is returned. If the request /// should be delayed, it's returned with the duration to wait. + #[allow(clippy::result_large_err)] fn try_send_request( - limiter: &mut RateLimiter, + active_requests: &mut HashMap>, + rate_limiter: &mut Option, peer_id: PeerId, request_id: Id, req: RequestType, ) -> Result, (QueuedRequest, Duration)> { - match limiter.allows(&peer_id, &req) { - Ok(()) => Ok(RPCSend::Request(request_id, req)), - Err(e) => { - let protocol = req.versioned_protocol(); - match e { - RateLimitedErr::TooLarge => { - // this should never happen with default parameters. Let's just send the request. - // Log a crit since this is a config issue. - crit!( - protocol = %req.versioned_protocol().protocol(), - "Self rate limiting error for a batch that will never fit. Sending request anyway. Check configuration parameters." 
- ); - Ok(RPCSend::Request(request_id, req)) - } - RateLimitedErr::TooSoon(wait_time) => { - debug!(protocol = %protocol.protocol(), wait_time_ms = wait_time.as_millis(), %peer_id, "Self rate limiting"); - Err((QueuedRequest { req, request_id }, wait_time)) + if let Some(active_request) = active_requests.get(&peer_id) { + if let Some(count) = active_request.get(&req.protocol()) { + if *count >= MAX_CONCURRENT_REQUESTS { + debug!( + %peer_id, + protocol = %req.protocol(), + "Self rate limiting due to the number of concurrent requests" + ); + return Err(( + QueuedRequest { + req, + request_id, + queued_at: timestamp_now(), + }, + Duration::from_millis(WAIT_TIME_DUE_TO_CONCURRENT_REQUESTS), + )); + } + } + } + + if let Some(limiter) = rate_limiter.as_mut() { + match limiter.allows(&peer_id, &req) { + Ok(()) => {} + Err(e) => { + let protocol = req.versioned_protocol(); + match e { + RateLimitedErr::TooLarge => { + // this should never happen with default parameters. Let's just send the request. + // Log a crit since this is a config issue. + crit!( + protocol = %req.versioned_protocol().protocol(), + "Self rate limiting error for a batch that will never fit. Sending request anyway. 
Check configuration parameters.", + ); + } + RateLimitedErr::TooSoon(wait_time) => { + debug!(protocol = %protocol.protocol(), wait_time_ms = wait_time.as_millis(), %peer_id, "Self rate limiting"); + return Err(( + QueuedRequest { + req, + request_id, + queued_at: timestamp_now(), + }, + wait_time, + )); + } } } } } + + *active_requests + .entry(peer_id) + .or_default() + .entry(req.protocol()) + .or_default() += 1; + + Ok(RPCSend::Request(request_id, req)) } /// When a peer and protocol are allowed to send a next request, this function checks the @@ -133,16 +194,32 @@ impl SelfRateLimiter { fn next_peer_request_ready(&mut self, peer_id: PeerId, protocol: Protocol) { if let Entry::Occupied(mut entry) = self.delayed_requests.entry((peer_id, protocol)) { let queued_requests = entry.get_mut(); - while let Some(QueuedRequest { req, request_id }) = queued_requests.pop_front() { - match Self::try_send_request(&mut self.limiter, peer_id, request_id, req) { - Err((rate_limited_req, wait_time)) => { + while let Some(QueuedRequest { + req, + request_id, + queued_at, + }) = queued_requests.pop_front() + { + match Self::try_send_request( + &mut self.active_requests, + &mut self.rate_limiter, + peer_id, + request_id, + req.clone(), + ) { + Err((_rate_limited_req, wait_time)) => { let key = (peer_id, protocol); self.next_peer_request.insert(key, wait_time); - queued_requests.push_front(rate_limited_req); + // Don't push `rate_limited_req` here to prevent `queued_at` from being updated. + queued_requests.push_front(QueuedRequest { + req, + request_id, + queued_at, + }); // If one fails just wait for the next window that allows sending requests. return; } - Ok(event) => self.ready_requests.push((peer_id, event)), + Ok(event) => self.ready_requests.push((peer_id, event, queued_at)), } } if queued_requests.is_empty() { @@ -156,6 +233,8 @@ impl SelfRateLimiter { /// Informs the limiter that a peer has disconnected. This removes any pending requests and /// returns their IDs. 
pub fn peer_disconnected(&mut self, peer_id: PeerId) -> Vec<(Id, Protocol)> { + self.active_requests.remove(&peer_id); + // It's not ideal to iterate this map, but the key is (PeerId, Protocol) and this map // should never really be large. So we iterate for simplicity let mut failed_requests = Vec::new(); @@ -177,19 +256,39 @@ impl SelfRateLimiter { failed_requests } + /// Informs the limiter that a response has been received. + pub fn request_completed(&mut self, peer_id: &PeerId, protocol: Protocol) { + if let Some(active_requests) = self.active_requests.get_mut(peer_id) { + if let Entry::Occupied(mut entry) = active_requests.entry(protocol) { + if *entry.get() > 1 { + *entry.get_mut() -= 1; + } else { + entry.remove(); + } + } + } + } + pub fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { // First check the requests that were self rate limited, since those might add events to - // the queue. Also do this this before rate limiter prunning to avoid removing and + // the queue. Also do this before rate limiter pruning to avoid removing and // immediately adding rate limiting keys. if let Poll::Ready(Some(expired)) = self.next_peer_request.poll_expired(cx) { let (peer_id, protocol) = expired.into_inner(); self.next_peer_request_ready(peer_id, protocol); } + // Prune the rate limiter. - let _ = self.limiter.poll_unpin(cx); + if let Some(limiter) = self.rate_limiter.as_mut() { + let _ = limiter.poll_unpin(cx); + } // Finally return any queued events. - if let Some((peer_id, event)) = self.ready_requests.pop() { + if let Some((peer_id, event, queued_at)) = self.ready_requests.pop() { + metrics::observe_duration( + &crate::metrics::OUTBOUND_REQUEST_IDLING, + timestamp_now().saturating_sub(queued_at), + ); return Poll::Ready(BehaviourAction::NotifyHandler { peer_id, handler: NotifyHandler::Any, @@ -201,12 +300,19 @@ impl SelfRateLimiter { } } +/// Returns the duration since the unix epoch. 
+pub fn timestamp_now() -> Duration { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_else(|_| Duration::from_secs(0)) +} + #[cfg(test)] mod tests { use crate::rpc::config::{OutboundRateLimiterConfig, RateLimiterConfig}; use crate::rpc::rate_limiter::Quota; use crate::rpc::self_limiter::SelfRateLimiter; - use crate::rpc::{Ping, Protocol, RequestType}; + use crate::rpc::{Ping, Protocol, RPCSend, RequestType}; use crate::service::api_types::{AppRequestId, SingleLookupReqId, SyncRequestId}; use libp2p::PeerId; use logging::create_test_tracing_subscriber; @@ -227,7 +333,7 @@ mod tests { &MainnetEthSpec::default_spec(), )); let mut limiter: SelfRateLimiter = - SelfRateLimiter::new(config, fork_context).unwrap(); + SelfRateLimiter::new(Some(config), fork_context).unwrap(); let peer_id = PeerId::random(); let lookup_id = 0; @@ -290,4 +396,149 @@ mod tests { assert_eq!(limiter.ready_requests.len(), 1); } } + + /// Test that `next_peer_request_ready` correctly maintains the queue when using the self-limiter without rate limiting. + #[tokio::test] + async fn test_next_peer_request_ready_concurrent_requests() { + let fork_context = std::sync::Arc::new(ForkContext::new::( + Slot::new(0), + Hash256::ZERO, + &MainnetEthSpec::default_spec(), + )); + let mut limiter: SelfRateLimiter = + SelfRateLimiter::new(None, fork_context).unwrap(); + let peer_id = PeerId::random(); + + for i in 1..=5u32 { + let result = limiter.allows( + peer_id, + AppRequestId::Sync(SyncRequestId::SingleBlock { + id: SingleLookupReqId { + lookup_id: i, + req_id: i, + }, + }), + RequestType::Ping(Ping { data: i as u64 }), + ); + + // Check that the limiter allows the first two requests. 
+ if i <= 2 { + assert!(result.is_ok()); + } else { + assert!(result.is_err()); + } + } + + let queue = limiter + .delayed_requests + .get(&(peer_id, Protocol::Ping)) + .unwrap(); + assert_eq!(3, queue.len()); + + // The delayed requests remain even after the next_peer_request_ready call because the responses have not been received. + limiter.next_peer_request_ready(peer_id, Protocol::Ping); + let queue = limiter + .delayed_requests + .get(&(peer_id, Protocol::Ping)) + .unwrap(); + assert_eq!(3, queue.len()); + + limiter.request_completed(&peer_id, Protocol::Ping); + limiter.next_peer_request_ready(peer_id, Protocol::Ping); + + let queue = limiter + .delayed_requests + .get(&(peer_id, Protocol::Ping)) + .unwrap(); + assert_eq!(2, queue.len()); + + limiter.request_completed(&peer_id, Protocol::Ping); + limiter.request_completed(&peer_id, Protocol::Ping); + limiter.next_peer_request_ready(peer_id, Protocol::Ping); + + let queue = limiter.delayed_requests.get(&(peer_id, Protocol::Ping)); + assert!(queue.is_none()); + + // Check that the three delayed requests have moved to ready_requests. + let mut it = limiter.ready_requests.iter(); + for i in 3..=5u32 { + let (_peer_id, RPCSend::Request(request_id, _), _) = it.next().unwrap() else { + unreachable!() + }; + + assert!(matches!( + request_id, + AppRequestId::Sync(SyncRequestId::SingleBlock { + id: SingleLookupReqId { req_id, .. 
}, + }) if *req_id == i + )); + } + } + + #[tokio::test] + async fn test_peer_disconnected() { + let fork_context = std::sync::Arc::new(ForkContext::new::( + Slot::new(0), + Hash256::ZERO, + &MainnetEthSpec::default_spec(), + )); + let mut limiter: SelfRateLimiter = + SelfRateLimiter::new(None, fork_context).unwrap(); + let peer1 = PeerId::random(); + let peer2 = PeerId::random(); + + for peer in [peer1, peer2] { + for i in 1..=5u32 { + let result = limiter.allows( + peer, + AppRequestId::Sync(SyncRequestId::SingleBlock { + id: SingleLookupReqId { + lookup_id: i, + req_id: i, + }, + }), + RequestType::Ping(Ping { data: i as u64 }), + ); + + // Check that the limiter allows the first two requests. + if i <= 2 { + assert!(result.is_ok()); + } else { + assert!(result.is_err()); + } + } + } + + assert!(limiter.active_requests.contains_key(&peer1)); + assert!(limiter + .delayed_requests + .contains_key(&(peer1, Protocol::Ping))); + assert!(limiter.active_requests.contains_key(&peer2)); + assert!(limiter + .delayed_requests + .contains_key(&(peer2, Protocol::Ping))); + + // Check that the limiter returns the IDs of pending requests and that the IDs are ordered correctly. + let mut failed_requests = limiter.peer_disconnected(peer1); + for i in 3..=5u32 { + let (request_id, _) = failed_requests.remove(0); + assert!(matches!( + request_id, + AppRequestId::Sync(SyncRequestId::SingleBlock { + id: SingleLookupReqId { req_id, .. }, + }) if req_id == i + )); + } + + // Check that peer1’s active and delayed requests have been removed. 
+ assert!(!limiter.active_requests.contains_key(&peer1)); + assert!(!limiter + .delayed_requests + .contains_key(&(peer1, Protocol::Ping))); + + assert!(limiter.active_requests.contains_key(&peer2)); + assert!(limiter + .delayed_requests + .contains_key(&(peer2, Protocol::Ping))); + } } diff --git a/beacon_node/lighthouse_network/tests/common.rs b/beacon_node/lighthouse_network/tests/common.rs index d686885ff7..d979ef9265 100644 --- a/beacon_node/lighthouse_network/tests/common.rs +++ b/beacon_node/lighthouse_network/tests/common.rs @@ -16,6 +16,7 @@ use types::{ type E = MinimalEthSpec; +use lighthouse_network::rpc::config::InboundRateLimiterConfig; use tempfile::Builder as TempBuilder; /// Returns a dummy fork context @@ -77,7 +78,11 @@ pub fn build_tracing_subscriber(level: &str, enabled: bool) { } } -pub fn build_config(mut boot_nodes: Vec) -> Arc { +pub fn build_config( + mut boot_nodes: Vec, + disable_peer_scoring: bool, + inbound_rate_limiter: Option, +) -> Arc { let mut config = NetworkConfig::default(); // Find unused ports by using the 0 port. 
@@ -93,6 +98,8 @@ pub fn build_config(mut boot_nodes: Vec) -> Arc { config.enr_address = (Some(std::net::Ipv4Addr::LOCALHOST), None); config.boot_nodes_enr.append(&mut boot_nodes); config.network_dir = path.into_path(); + config.disable_peer_scoring = disable_peer_scoring; + config.inbound_rate_limiter_config = inbound_rate_limiter; Arc::new(config) } @@ -102,8 +109,10 @@ pub async fn build_libp2p_instance( fork_name: ForkName, chain_spec: Arc, service_name: String, + disable_peer_scoring: bool, + inbound_rate_limiter: Option, ) -> Libp2pInstance { - let config = build_config(boot_nodes); + let config = build_config(boot_nodes, disable_peer_scoring, inbound_rate_limiter); // launch libp2p service let (signal, exit) = async_channel::bounded(1); @@ -144,6 +153,8 @@ pub async fn build_node_pair( fork_name: ForkName, spec: Arc, protocol: Protocol, + disable_peer_scoring: bool, + inbound_rate_limiter: Option, ) -> (Libp2pInstance, Libp2pInstance) { let mut sender = build_libp2p_instance( rt.clone(), @@ -151,10 +162,20 @@ pub async fn build_node_pair( fork_name, spec.clone(), "sender".to_string(), + disable_peer_scoring, + inbound_rate_limiter.clone(), + ) + .await; + let mut receiver = build_libp2p_instance( + rt, + vec![], + fork_name, + spec.clone(), + "receiver".to_string(), + disable_peer_scoring, + inbound_rate_limiter, ) .await; - let mut receiver = - build_libp2p_instance(rt, vec![], fork_name, spec.clone(), "receiver".to_string()).await; // let the two nodes set up listeners let sender_fut = async { @@ -235,6 +256,8 @@ pub async fn build_linear( fork_name, spec.clone(), "linear".to_string(), + false, + None, ) .await, ); diff --git a/beacon_node/lighthouse_network/tests/rpc_tests.rs b/beacon_node/lighthouse_network/tests/rpc_tests.rs index 7a0eb4602b..9b43e8b581 100644 --- a/beacon_node/lighthouse_network/tests/rpc_tests.rs +++ b/beacon_node/lighthouse_network/tests/rpc_tests.rs @@ -9,10 +9,10 @@ use lighthouse_network::{NetworkEvent, ReportSource, Response}; 
use ssz::Encode; use ssz_types::VariableList; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio::runtime::Runtime; use tokio::time::sleep; -use tracing::{debug, warn}; +use tracing::{debug, error, warn}; use types::{ BeaconBlock, BeaconBlockAltair, BeaconBlockBase, BeaconBlockBellatrix, BlobSidecar, ChainSpec, EmptyBlock, Epoch, EthSpec, FixedBytesExtended, ForkName, Hash256, MinimalEthSpec, @@ -64,8 +64,15 @@ fn test_tcp_status_rpc() { rt.block_on(async { // get sender/receiver - let (mut sender, mut receiver) = - common::build_node_pair(Arc::downgrade(&rt), ForkName::Base, spec, Protocol::Tcp).await; + let (mut sender, mut receiver) = common::build_node_pair( + Arc::downgrade(&rt), + ForkName::Base, + spec, + Protocol::Tcp, + false, + None, + ) + .await; // Dummy STATUS RPC message let rpc_request = RequestType::Status(StatusMessage { @@ -168,6 +175,8 @@ fn test_tcp_blocks_by_range_chunked_rpc() { ForkName::Bellatrix, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -311,6 +320,8 @@ fn test_blobs_by_range_chunked_rpc() { ForkName::Deneb, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -430,6 +441,8 @@ fn test_tcp_blocks_by_range_over_limit() { ForkName::Bellatrix, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -533,6 +546,8 @@ fn test_tcp_blocks_by_range_chunked_rpc_terminates_correctly() { ForkName::Base, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -665,6 +680,8 @@ fn test_tcp_blocks_by_range_single_empty_rpc() { ForkName::Base, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -785,6 +802,8 @@ fn test_tcp_blocks_by_root_chunked_rpc() { ForkName::Bellatrix, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -929,6 +948,8 @@ fn test_tcp_blocks_by_root_chunked_rpc_terminates_correctly() { ForkName::Base, spec.clone(), Protocol::Tcp, + false, + None, ) .await; @@ -1065,8 +1086,15 @@ fn goodbye_test(log_level: &str, enable_logging: bool, protocol: Protocol) { 
// get sender/receiver rt.block_on(async { - let (mut sender, mut receiver) = - common::build_node_pair(Arc::downgrade(&rt), ForkName::Base, spec, protocol).await; + let (mut sender, mut receiver) = common::build_node_pair( + Arc::downgrade(&rt), + ForkName::Base, + spec, + protocol, + false, + None, + ) + .await; // build the sender future let sender_future = async { @@ -1127,3 +1155,239 @@ fn quic_test_goodbye_rpc() { let enabled_logging = false; goodbye_test(log_level, enabled_logging, Protocol::Quic); } + +// Test that the receiver delays the responses during response rate-limiting. +#[test] +fn test_delayed_rpc_response() { + let rt = Arc::new(Runtime::new().unwrap()); + let spec = Arc::new(E::default_spec()); + + // Allow 1 token to be used every 3 seconds. + const QUOTA_SEC: u64 = 3; + + rt.block_on(async { + // get sender/receiver + let (mut sender, mut receiver) = common::build_node_pair( + Arc::downgrade(&rt), + ForkName::Base, + spec, + Protocol::Tcp, + false, + // Configure a quota for STATUS responses of 1 token every 3 seconds.
+ Some(format!("status:1/{QUOTA_SEC}").parse().unwrap()), + ) + .await; + + // Dummy STATUS RPC message + let rpc_request = RequestType::Status(StatusMessage { + fork_digest: [0; 4], + finalized_root: Hash256::from_low_u64_be(0), + finalized_epoch: Epoch::new(1), + head_root: Hash256::from_low_u64_be(0), + head_slot: Slot::new(1), + }); + + // Dummy STATUS RPC message + let rpc_response = Response::Status(StatusMessage { + fork_digest: [0; 4], + finalized_root: Hash256::from_low_u64_be(0), + finalized_epoch: Epoch::new(1), + head_root: Hash256::from_low_u64_be(0), + head_slot: Slot::new(1), + }); + + // build the sender future + let sender_future = async { + let mut request_id = 1; + let mut request_sent_at = Instant::now(); + loop { + match sender.next_event().await { + NetworkEvent::PeerConnectedOutgoing(peer_id) => { + debug!(%request_id, "Sending RPC request"); + sender + .send_request(peer_id, AppRequestId::Router, rpc_request.clone()) + .unwrap(); + request_sent_at = Instant::now(); + } + NetworkEvent::ResponseReceived { + peer_id, + app_request_id: _, + response, + } => { + debug!(%request_id, "Sender received"); + assert_eq!(response, rpc_response); + + match request_id { + 1 => { + // The first response is returned instantly. + assert!(request_sent_at.elapsed() < Duration::from_millis(100)); + } + 2..=5 => { + // The second and subsequent responses are delayed due to the response rate-limiter on the receiver side. + // Adding a slight margin to the elapsed time check to account for potential timing issues caused by system + // scheduling or execution delays during testing. 
+ assert!( + request_sent_at.elapsed() + > (Duration::from_secs(QUOTA_SEC) + - Duration::from_millis(100)) + ); + if request_id == 5 { + // End the test + return; + } + } + _ => unreachable!(), + } + + request_id += 1; + debug!(%request_id, "Sending RPC request"); + sender + .send_request(peer_id, AppRequestId::Router, rpc_request.clone()) + .unwrap(); + request_sent_at = Instant::now(); + } + NetworkEvent::RPCFailed { + app_request_id: _, + peer_id: _, + error, + } => { + error!(?error, "RPC Failed"); + panic!("Rpc failed."); + } + _ => {} + } + } + }; + + // build the receiver future + let receiver_future = async { + loop { + if let NetworkEvent::RequestReceived { + peer_id, + inbound_request_id, + request_type, + } = receiver.next_event().await + { + assert_eq!(request_type, rpc_request); + debug!("Receiver received request"); + receiver.send_response(peer_id, inbound_request_id, rpc_response.clone()); + } + } + }; + + tokio::select! { + _ = sender_future => {} + _ = receiver_future => {} + _ = sleep(Duration::from_secs(30)) => { + panic!("Future timed out"); + } + } + }) +} + +// Test that a rate-limited error doesn't occur even if the sender attempts to send many requests at +// once, thanks to the self-limiter on the sender side. +#[test] +fn test_active_requests() { + let rt = Arc::new(Runtime::new().unwrap()); + let spec = Arc::new(E::default_spec()); + + rt.block_on(async { + // Get sender/receiver. + let (mut sender, mut receiver) = common::build_node_pair( + Arc::downgrade(&rt), + ForkName::Base, + spec, + Protocol::Tcp, + false, + None, + ) + .await; + + // Dummy STATUS RPC request. + let rpc_request = RequestType::Status(StatusMessage { + fork_digest: [0; 4], + finalized_root: Hash256::from_low_u64_be(0), + finalized_epoch: Epoch::new(1), + head_root: Hash256::from_low_u64_be(0), + head_slot: Slot::new(1), + }); + + // Dummy STATUS RPC response. 
+ let rpc_response = Response::Status(StatusMessage { + fork_digest: [0; 4], + finalized_root: Hash256::zero(), + finalized_epoch: Epoch::new(1), + head_root: Hash256::zero(), + head_slot: Slot::new(1), + }); + + // Number of requests. + const REQUESTS: u8 = 10; + + // Build the sender future. + let sender_future = async { + let mut response_received = 0; + loop { + match sender.next_event().await { + NetworkEvent::PeerConnectedOutgoing(peer_id) => { + debug!("Sending RPC request"); + // Send requests in quick succession to intentionally trigger request queueing in the self-limiter. + for _ in 0..REQUESTS { + sender + .send_request(peer_id, AppRequestId::Router, rpc_request.clone()) + .unwrap(); + } + } + NetworkEvent::ResponseReceived { response, .. } => { + debug!(?response, "Sender received response"); + if matches!(response, Response::Status(_)) { + response_received += 1; + } + } + NetworkEvent::RPCFailed { + app_request_id: _, + peer_id: _, + error, + } => panic!("RPC failed: {:?}", error), + _ => {} + } + + if response_received == REQUESTS { + return; + } + } + }; + + // Build the receiver future. + let receiver_future = async { + let mut received_requests = vec![]; + loop { + tokio::select! { + event = receiver.next_event() => { + if let NetworkEvent::RequestReceived { peer_id, inbound_request_id, request_type } = event { + debug!(?request_type, "Receiver received request"); + if matches!(request_type, RequestType::Status(_)) { + received_requests.push((peer_id, inbound_request_id)); + } + } + } + // Introduce a delay in sending responses to trigger request queueing on the sender side. + _ = sleep(Duration::from_secs(3)) => { + for (peer_id, inbound_request_id) in received_requests.drain(..) { + receiver.send_response(peer_id, inbound_request_id, rpc_response.clone()); + } + } + } + } + }; + + tokio::select! 
{ + _ = sender_future => {} + _ = receiver_future => {} + _ = sleep(Duration::from_secs(30)) => { + panic!("Future timed out"); + } + } + }) +} From 6fad18644bbee80f71cbc61baa2f773cdb88ed1d Mon Sep 17 00:00:00 2001 From: Hamdi Allam Date: Thu, 24 Apr 2025 01:38:05 -0400 Subject: [PATCH 06/22] feat: presign for validator account (#6747) #6746 Add a --presign flag to emit the JSON output to stdout instead of publishing the exit --- Cargo.lock | 1 + account_manager/Cargo.toml | 1 + account_manager/src/validator/exit.rs | 35 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b98e096718..53592b11bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,7 @@ dependencies = [ "filesystem", "safe_arith", "sensitive_url", + "serde_json", "slashing_protection", "slot_clock", "tempfile", diff --git a/account_manager/Cargo.toml b/account_manager/Cargo.toml index a7752d621f..071e2681dd 100644 --- a/account_manager/Cargo.toml +++ b/account_manager/Cargo.toml @@ -22,6 +22,7 @@ eth2_wallet_manager = { path = "../common/eth2_wallet_manager" } filesystem = { workspace = true } safe_arith = { workspace = true } sensitive_url = { workspace = true } +serde_json = { workspace = true } slashing_protection = { workspace = true } slot_clock = { workspace = true } tokio = { workspace = true } diff --git a/account_manager/src/validator/exit.rs b/account_manager/src/validator/exit.rs index 8a2cdb8400..1393d0f152 100644 --- a/account_manager/src/validator/exit.rs +++ b/account_manager/src/validator/exit.rs @@ -11,6 +11,7 @@ use eth2_keystore::Keystore; use eth2_network_config::Eth2NetworkConfig; use safe_arith::SafeArith; use sensitive_url::SensitiveUrl; +use serde_json; use slot_clock::{SlotClock, SystemTimeSlotClock}; use std::path::{Path, PathBuf}; use std::time::Duration; @@ -24,6 +25,7 @@ pub const BEACON_SERVER_FLAG: &str = "beacon-node"; pub const NO_WAIT: &str = "no-wait"; pub const NO_CONFIRMATION: &str =
"no-confirmation"; pub const PASSWORD_PROMPT: &str = "Enter the keystore password"; +pub const PRESIGN: &str = "presign"; pub const DEFAULT_BEACON_NODE: &str = "http://localhost:5052/"; pub const CONFIRMATION_PHRASE: &str = "Exit my validator"; @@ -74,6 +76,15 @@ pub fn cli_app() -> Command { .action(ArgAction::SetTrue) .help_heading(FLAG_HEADER) ) + .arg( + Arg::new(PRESIGN) + .long(PRESIGN) + .help("Only presign the voluntary exit message without publishing it") + .default_value("false") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .display_order(0) + ) } pub fn cli_run(matches: &ArgMatches, env: Environment) -> Result<(), String> { @@ -84,6 +95,7 @@ pub fn cli_run(matches: &ArgMatches, env: Environment) -> Result< let stdin_inputs = cfg!(windows) || matches.get_flag(STDIN_INPUTS_FLAG); let no_wait = matches.get_flag(NO_WAIT); let no_confirmation = matches.get_flag(NO_CONFIRMATION); + let presign = matches.get_flag(PRESIGN); let spec = env.eth2_config().spec.clone(); let server_url: String = clap_utils::parse_required(matches, BEACON_SERVER_FLAG)?; @@ -107,6 +119,7 @@ pub fn cli_run(matches: &ArgMatches, env: Environment) -> Result< &eth2_network_config, no_wait, no_confirmation, + presign, ))?; Ok(()) @@ -123,6 +136,7 @@ async fn publish_voluntary_exit( eth2_network_config: &Eth2NetworkConfig, no_wait: bool, no_confirmation: bool, + presign: bool, ) -> Result<(), String> { let genesis_data = get_geneisis_data(client).await?; let testnet_genesis_root = eth2_network_config @@ -154,6 +168,23 @@ async fn publish_voluntary_exit( validator_index, }; + // Sign the voluntary exit. We sign ahead of the prompt as that step is only important for the broadcast + let signed_voluntary_exit = + voluntary_exit.sign(&keypair.sk, genesis_data.genesis_validators_root, spec); + if presign { + eprintln!( + "Successfully pre-signed voluntary exit for validator {}.
Not publishing.", + keypair.pk + ); + + // Convert to JSON and print + let string_output = serde_json::to_string_pretty(&signed_voluntary_exit) + .map_err(|e| format!("Unable to convert to JSON: {}", e))?; + + println!("{}", string_output); + return Ok(()); + } + eprintln!( "Publishing a voluntary exit for validator: {} \n", keypair.pk @@ -174,9 +205,7 @@ async fn publish_voluntary_exit( }; if confirmation == CONFIRMATION_PHRASE { - // Sign and publish the voluntary exit to network - let signed_voluntary_exit = - voluntary_exit.sign(&keypair.sk, genesis_data.genesis_validators_root, spec); + // Publish the voluntary exit to network client .post_beacon_pool_voluntary_exits(&signed_voluntary_exit) .await From 2e2b0d2176e07870085a240f80bf340830fc6528 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Mon, 28 Apr 2025 17:43:41 +0800 Subject: [PATCH 07/22] Revise consolidation info in Lighthouse book (#7351) One of the information in the consolidation section in Lighthouse book is wrong. I realise this after reading https://ethereum.org/en/roadmap/pectra/maxeb/ and a further look at [EIP 7251](https://eips.ethereum.org/EIPS/eip-7251) which states: ` Note: the system contract uses the EVM CALLER operation (Solidity: msg.sender) to get the address used in the consolidation request, i.e. the address that calls the system contract must match the 0x01 withdrawal credential recorded in the beacon state. ` So the withdrawal credentials of both source and target validators need not be the same. 
--- book/src/validator_consolidation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/book/src/validator_consolidation.md b/book/src/validator_consolidation.md index 10ab5bd97d..3c9860a514 100644 --- a/book/src/validator_consolidation.md +++ b/book/src/validator_consolidation.md @@ -19,11 +19,11 @@ Let's take a look at an example: Initially, validators A and B are both with 0x0 - validator A has 64 ETH - validator B has 0 ETH (i.e., validator B has exited the beacon chain) -The consolidation process can be repeated to consolidate more validators into validator A. +The consolidation process can be repeated to consolidate more validators into validator A. The request is made by signing a transaction using the **withdrawal address** of the source validator. The withdrawal credential of the target validator can be different from that of the source validator. It is important to note that there are some conditions required to perform consolidation, a few common ones are: -- the **withdrawal address** of the source and target validators **must be the same**. +- both source and target validators **must be active** (i.e., not exiting or slashed). - the _target validator_ **must** have a withdrawal credential **type 0x02**. The source validator could have a 0x01 or 0x02 withdrawal credential. - the source validator must be active for at least 256 epochs to be able to perform consolidation.
From 63a10eaaea62eff645dfccbe16bfb1aa87ce130e Mon Sep 17 00:00:00 2001 From: hopinheimer <48147533+hopinheimer@users.noreply.github.com> Date: Mon, 28 Apr 2025 05:43:46 -0400 Subject: [PATCH 08/22] Changing `boot_enr.yaml` to expect `bootstrap_nodes.yaml` for pectra devnet (#6956) --- common/eth2_config/src/lib.rs | 2 +- .../chiado/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 .../gnosis/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 .../holesky/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 .../hoodi/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 .../mainnet/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 .../sepolia/{boot_enr.yaml => bootstrap_nodes.yaml} | 0 common/eth2_network_config/src/lib.rs | 2 +- 8 files changed, 2 insertions(+), 2 deletions(-) rename common/eth2_network_config/built_in_network_configs/chiado/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) rename common/eth2_network_config/built_in_network_configs/gnosis/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) rename common/eth2_network_config/built_in_network_configs/holesky/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) rename common/eth2_network_config/built_in_network_configs/hoodi/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) rename common/eth2_network_config/built_in_network_configs/mainnet/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) rename common/eth2_network_config/built_in_network_configs/sepolia/{boot_enr.yaml => bootstrap_nodes.yaml} (100%) diff --git a/common/eth2_config/src/lib.rs b/common/eth2_config/src/lib.rs index 017bdf288d..544138f0fa 100644 --- a/common/eth2_config/src/lib.rs +++ b/common/eth2_config/src/lib.rs @@ -212,7 +212,7 @@ macro_rules!
define_net { "../", "deposit_contract_block.txt" ), - boot_enr: $this_crate::$include_file!($this_crate, "../", "boot_enr.yaml"), + boot_enr: $this_crate::$include_file!($this_crate, "../", "bootstrap_nodes.yaml"), genesis_state_bytes: $this_crate::$include_file!($this_crate, "../", "genesis.ssz"), } }}; diff --git a/common/eth2_network_config/built_in_network_configs/chiado/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/chiado/bootstrap_nodes.yaml similarity index 100% rename from common/eth2_network_config/built_in_network_configs/chiado/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/chiado/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/built_in_network_configs/gnosis/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/gnosis/bootstrap_nodes.yaml similarity index 100% rename from common/eth2_network_config/built_in_network_configs/gnosis/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/gnosis/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/built_in_network_configs/holesky/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/holesky/bootstrap_nodes.yaml similarity index 100% rename from common/eth2_network_config/built_in_network_configs/holesky/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/holesky/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/built_in_network_configs/hoodi/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/hoodi/bootstrap_nodes.yaml similarity index 100% rename from common/eth2_network_config/built_in_network_configs/hoodi/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/hoodi/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/built_in_network_configs/mainnet/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/mainnet/bootstrap_nodes.yaml similarity index 100% rename from 
common/eth2_network_config/built_in_network_configs/mainnet/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/mainnet/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/built_in_network_configs/sepolia/boot_enr.yaml b/common/eth2_network_config/built_in_network_configs/sepolia/bootstrap_nodes.yaml similarity index 100% rename from common/eth2_network_config/built_in_network_configs/sepolia/boot_enr.yaml rename to common/eth2_network_config/built_in_network_configs/sepolia/bootstrap_nodes.yaml diff --git a/common/eth2_network_config/src/lib.rs b/common/eth2_network_config/src/lib.rs index 0bb12c4187..ac488ed2a3 100644 --- a/common/eth2_network_config/src/lib.rs +++ b/common/eth2_network_config/src/lib.rs @@ -31,7 +31,7 @@ use url::Url; pub use eth2_config::GenesisStateSource; pub const DEPLOY_BLOCK_FILE: &str = "deposit_contract_block.txt"; -pub const BOOT_ENR_FILE: &str = "boot_enr.yaml"; +pub const BOOT_ENR_FILE: &str = "bootstrap_nodes.yaml"; pub const GENESIS_STATE_FILE: &str = "genesis.ssz"; pub const BASE_CONFIG_FILE: &str = "config.yaml"; From 34a6c3a9302907316f67ad5b95e1718075ddb626 Mon Sep 17 00:00:00 2001 From: Roman Krasiuk Date: Wed, 30 Apr 2025 03:15:39 +0200 Subject: [PATCH 09/22] vc: increase default gas limit (#6897) Increases default gas limit to 36M. --- book/src/help_vc.md | 2 +- lighthouse/tests/validator_client.rs | 2 +- validator_client/src/cli.rs | 2 +- validator_client/validator_store/src/lib.rs | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/book/src/help_vc.md b/book/src/help_vc.md index c32104b17a..15b5c209a7 100644 --- a/book/src/help_vc.md +++ b/book/src/help_vc.md @@ -40,7 +40,7 @@ Options: The gas limit to be used in all builder proposals for all validators managed by this validator client. Note this will not necessarily be used if the gas limit set here moves too far from the previous block's - gas limit. [default: 30000000] + gas limit. 
[default: 36000000] --genesis-state-url A URL of a beacon-API compatible server from which to download the genesis state. Checkpoint sync server URLs can generally be used with diff --git a/lighthouse/tests/validator_client.rs b/lighthouse/tests/validator_client.rs index b9edeceaee..6e579f63c1 100644 --- a/lighthouse/tests/validator_client.rs +++ b/lighthouse/tests/validator_client.rs @@ -497,7 +497,7 @@ fn no_doppelganger_protection_flag() { fn no_gas_limit_flag() { CommandLineTest::new() .run() - .with_config(|config| assert!(config.validator_store.gas_limit == Some(30_000_000))); + .with_config(|config| assert!(config.validator_store.gas_limit == Some(36_000_000))); } #[test] fn gas_limit_flag() { diff --git a/validator_client/src/cli.rs b/validator_client/src/cli.rs index 3dd138619b..950d08a028 100644 --- a/validator_client/src/cli.rs +++ b/validator_client/src/cli.rs @@ -387,7 +387,7 @@ pub struct ValidatorClient { #[clap( long, value_name = "INTEGER", - default_value_t = 30_000_000, + default_value_t = 36_000_000, requires = "builder_proposals", help = "The gas limit to be used in all builder proposals for all validators managed \ by this validator client. Note this will not necessarily be used if the gas limit \ diff --git a/validator_client/validator_store/src/lib.rs b/validator_client/validator_store/src/lib.rs index 6b472332a1..015b321d43 100644 --- a/validator_client/validator_store/src/lib.rs +++ b/validator_client/validator_store/src/lib.rs @@ -74,8 +74,8 @@ const SLASHING_PROTECTION_HISTORY_EPOCHS: u64 = 512; /// Currently used as the default gas limit in execution clients. /// -/// https://github.com/ethereum/builder-specs/issues/17 -pub const DEFAULT_GAS_LIMIT: u64 = 30_000_000; +/// https://ethresear.ch/t/on-increasing-the-block-gas-limit-technical-considerations-path-forward/21225. 
+pub const DEFAULT_GAS_LIMIT: u64 = 36_000_000; pub struct ValidatorStore { validators: Arc>, From 94ccd7608ea89b6f35eabf7611e81da8413cb336 Mon Sep 17 00:00:00 2001 From: Mac L Date: Wed, 30 Apr 2025 12:07:07 +1000 Subject: [PATCH 10/22] Add documentation for VC API `/lighthouse/beacon/health` (#6653) Changes the endpoint to get fallback health information from `/lighthouse/ui/fallback_health` to `/lighthouse/beacon/health`. This more accurately describes that the endpoint is related to the connected beacon nodes and also matched the `/lighthouse/beacon/update` endpoint being added in #6551. Adds documentation for both fallback health and the endpoint to the Lighthouse book. --- book/src/advanced_redundancy.md | 19 ++++++++-- book/src/api_vc_endpoints.md | 54 ++++++++++++++++++++++++++++ validator_client/http_api/src/lib.rs | 10 +++--- 3 files changed, 75 insertions(+), 8 deletions(-) diff --git a/book/src/advanced_redundancy.md b/book/src/advanced_redundancy.md index 4582866657..4c231ed6ab 100644 --- a/book/src/advanced_redundancy.md +++ b/book/src/advanced_redundancy.md @@ -39,9 +39,6 @@ There are a few interesting properties about the list of `--beacon-nodes`: earlier in the list. - *Synced is preferred*: the validator client prefers a synced beacon node over one that is still syncing. -- *Failure is sticky*: if a beacon node fails, it will be flagged as offline - and won't be retried again for the rest of the slot (12 seconds). This helps prevent the impact - of time-outs and other lengthy errors. > Note: When supplying multiple beacon nodes the `http://localhost:5052` address must be explicitly > provided (if it is desired). It will only be used as default if no `--beacon-nodes` flag is @@ -76,6 +73,22 @@ Prior to v3.2.0 fallback beacon nodes also required the `--subscribe-all-subnets now broadcast subscriptions to all connected beacon nodes by default. This broadcast behaviour can be disabled using the `--broadcast none` flag for `lighthouse vc`. 
+### Fallback Health + +Since v6.0.0, the validator client will be more aggressive in switching to a fallback node. To do this, +it uses the concept of "Health". Every slot, the validator client checks each connected beacon node +to determine which node is the "Healthiest". In general, the validator client will prefer nodes +which are synced, have synced execution layers and which are not currently optimistically +syncing. + +Sync distance is separated out into 4 tiers: "Synced", "Small", "Medium", "Large". Nodes are then +sorted into tiers based on sync distance and execution layer status. You can use the +`--beacon-nodes-sync-tolerances` flag to change how many slots wide each tier is. In the case where +multiple nodes fall into the same tier, user order is used to tie-break. + +To see health information for each connected node, you can use the +[`/lighthouse/beacon/health` API endpoint](./api_vc_endpoints.md#get-lighthousebeaconhealth). + ### Broadcast modes Since v4.6.0, the Lighthouse VC can be configured to broadcast messages to all configured beacon diff --git a/book/src/api_vc_endpoints.md b/book/src/api_vc_endpoints.md index a7c6f0ad5e..e51f5d29ae 100644 --- a/book/src/api_vc_endpoints.md +++ b/book/src/api_vc_endpoints.md @@ -18,6 +18,7 @@ | [`POST /lighthouse/validators/mnemonic`](#post-lighthousevalidatorsmnemonic) | Create a new validator from an existing mnemonic. | | [`POST /lighthouse/validators/web3signer`](#post-lighthousevalidatorsweb3signer) | Add web3signer validators. | | [`GET /lighthouse/logs`](#get-lighthouselogs) | Get logs | +| [`GET /lighthouse/beacon/health`](#get-lighthousebeaconhealth) | Get health information for each connected beacon node. | The query to Lighthouse API endpoints requires authorization, see [Authorization Header](./api_vc_auth_header.md). @@ -816,3 +817,56 @@ logs emitted are INFO level or higher.
} } ``` + +## `GET /lighthouse/beacon/health` + +Provides information about the sync status and execution layer health of each connected beacon node. +For more information about how to interpret the beacon node health, see [Fallback Health](./advanced_redundancy.md#fallback-health). + +### HTTP Specification + +| Property | Specification | +|-------------------|--------------------------------------------| +| Path | `/lighthouse/beacon/health` | +| Method | GET | +| Required Headers | [`Authorization`](./api_vc_auth_header.md) | +| Typical Responses | 200, 400 | + +Command: + +```bash +DATADIR=/var/lib/lighthouse +curl -X GET http://localhost:5062/lighthouse/beacon/health \ + -H "Authorization: Bearer $(cat ${DATADIR}/validators/api-token.txt)" | jq + ``` + +### Example Response Body + +```json +{ + "data": { + "beacon_nodes": [ + { + "index": 0, + "endpoint": "http://localhost:5052", + "health": { + "user_index": 0, + "head": 10500000, + "optimistic_status": "No", + "execution_status": "Healthy", + "health_tier": { + "tier": 1, + "sync_distance": 0, + "distance_tier": "Synced" + } + } + }, + { + "index": 1, + "endpoint": "http://fallbacks-r.us", + "health": "Offline" + } + ] + } +} +``` diff --git a/validator_client/http_api/src/lib.rs b/validator_client/http_api/src/lib.rs index 5bb4747bfe..a6c9eba752 100644 --- a/validator_client/http_api/src/lib.rs +++ b/validator_client/http_api/src/lib.rs @@ -418,10 +418,10 @@ pub fn serve( }, ); - // GET lighthouse/ui/fallback_health - let get_lighthouse_ui_fallback_health = warp::path("lighthouse") - .and(warp::path("ui")) - .and(warp::path("fallback_health")) + // GET lighthouse/beacon/health + let get_lighthouse_beacon_health = warp::path("lighthouse") + .and(warp::path("beacon")) + .and(warp::path("health")) .and(warp::path::end()) .and(block_service_filter.clone()) .then(|block_filter: BlockService| async move { @@ -1294,7 +1294,7 @@ pub fn serve( .or(get_lighthouse_validators_pubkey) .or(get_lighthouse_ui_health) 
.or(get_lighthouse_ui_graffiti) - .or(get_lighthouse_ui_fallback_health) + .or(get_lighthouse_beacon_health) .or(get_fee_recipient) .or(get_gas_limit) .or(get_graffiti) From 9779b4ba2c045a401a5894bfeedefcc9a0c1bf68 Mon Sep 17 00:00:00 2001 From: Eitan Seri-Levi Date: Tue, 29 Apr 2025 21:36:50 -0700 Subject: [PATCH 11/22] Optimize `validate_data_columns` (#7326) --- beacon_node/beacon_chain/src/kzg_utils.rs | 53 +++++++++-------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index eaaa23130d..704fb3663f 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -8,9 +8,9 @@ use std::sync::Arc; use types::beacon_block_body::KzgCommitments; use types::data_column_sidecar::{Cell, DataColumn, DataColumnSidecarError}; use types::{ - Blob, BlobSidecar, BlobSidecarList, ChainSpec, ColumnIndex, DataColumnSidecar, - DataColumnSidecarList, EthSpec, Hash256, KzgCommitment, KzgProof, SignedBeaconBlock, - SignedBeaconBlockHeader, SignedBlindedBeaconBlock, + Blob, BlobSidecar, BlobSidecarList, ChainSpec, DataColumnSidecar, DataColumnSidecarList, + EthSpec, Hash256, KzgCommitment, KzgProof, SignedBeaconBlock, SignedBeaconBlockHeader, + SignedBlindedBeaconBlock, }; /// Converts a blob ssz List object to an array to be used with the kzg @@ -79,38 +79,27 @@ pub fn validate_data_columns<'a, E: EthSpec, I>( where I: Iterator>> + Clone, { - let cells = data_column_iter - .clone() - .flat_map(|data_column| data_column.column.iter().map(ssz_cell_to_crypto_cell::)) - .collect::, KzgError>>()?; + let mut cells = Vec::new(); + let mut proofs = Vec::new(); + let mut column_indices = Vec::new(); + let mut commitments = Vec::new(); - let proofs = data_column_iter - .clone() - .flat_map(|data_column| { - data_column - .kzg_proofs - .iter() - .map(|&proof| Bytes48::from(proof)) - }) - .collect::>(); + for data_column in data_column_iter { + 
let col_index = data_column.index; - let column_indices = data_column_iter - .clone() - .flat_map(|data_column| { - let col_index = data_column.index; - data_column.column.iter().map(move |_| col_index) - }) - .collect::>(); + for cell in &data_column.column { + cells.push(ssz_cell_to_crypto_cell::(cell)?); + column_indices.push(col_index); + } - let commitments = data_column_iter - .clone() - .flat_map(|data_column| { - data_column - .kzg_commitments - .iter() - .map(|&commitment| Bytes48::from(commitment)) - }) - .collect::>(); + for &proof in &data_column.kzg_proofs { + proofs.push(Bytes48::from(proof)); + } + + for &commitment in &data_column.kzg_commitments { + commitments.push(Bytes48::from(commitment)); + } + } kzg.verify_cell_proof_batch(&cells, &proofs, column_indices, &commitments) } From 93ec9df13760a36133b029510cedb9c0f261ad3e Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 1 May 2025 11:30:42 +1000 Subject: [PATCH 12/22] Compute proposer shuffling only once in gossip verification (#7304) When we perform data column gossip verification, we sometimes see multiple proposer shuffling cache miss simultaneously and this results in multiple threads computing the shuffling cache and potentially slows down the gossip verification. Proposal here is to use a `OnceCell` for each shuffling key to make sure it's only computed once. 
I have only implemented this in data column verification as a PoC, but this can also be applied to blob and block verification Related issues: - https://github.com/sigp/lighthouse/issues/4447 - https://github.com/sigp/lighthouse/issues/7203 --- Cargo.lock | 3 +- Cargo.toml | 1 + beacon_node/beacon_chain/Cargo.toml | 1 + .../beacon_chain/src/beacon_proposer_cache.rs | 49 +++++++++++++------ .../src/data_column_verification.rs | 41 +++++++++------- common/logging/Cargo.toml | 2 - 6 files changed, 62 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53592b11bd..40c331b659 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -813,6 +813,7 @@ dependencies = [ "maplit", "merkle_proof", "metrics", + "once_cell", "oneshot_broadcast", "operation_pool", "parking_lot 0.12.3", @@ -5731,8 +5732,6 @@ dependencies = [ "chrono", "logroller", "metrics", - "once_cell", - "parking_lot 0.12.3", "serde", "serde_json", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 31f50068dc..5afed88ee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -161,6 +161,7 @@ maplit = "1" milhouse = "0.5" mockito = "1.5.0" num_cpus = "1" +once_cell = "1.17.1" parking_lot = "0.12" paste = "1" prometheus = { version = "0.13", default-features = false } diff --git a/beacon_node/beacon_chain/Cargo.toml b/beacon_node/beacon_chain/Cargo.toml index 0cf9ae1a10..18b40cab7e 100644 --- a/beacon_node/beacon_chain/Cargo.toml +++ b/beacon_node/beacon_chain/Cargo.toml @@ -47,6 +47,7 @@ logging = { workspace = true } lru = { workspace = true } merkle_proof = { workspace = true } metrics = { workspace = true } +once_cell = { workspace = true } oneshot_broadcast = { path = "../../common/oneshot_broadcast/" } operation_pool = { workspace = true } parking_lot = { workspace = true } diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index 567433caee..56b13b0b77 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ 
b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -11,10 +11,12 @@ use crate::{BeaconChain, BeaconChainError, BeaconChainTypes}; use fork_choice::ExecutionStatus; use lru::LruCache; +use once_cell::sync::OnceCell; use smallvec::SmallVec; use state_processing::state_advance::partial_state_advance; use std::cmp::Ordering; use std::num::NonZeroUsize; +use std::sync::Arc; use types::non_zero_usize::new_non_zero_usize; use types::{ BeaconState, BeaconStateError, ChainSpec, Epoch, EthSpec, Fork, Hash256, Slot, Unsigned, @@ -39,21 +41,21 @@ pub struct Proposer { /// their signatures. pub struct EpochBlockProposers { /// The epoch to which the proposers pertain. - epoch: Epoch, + pub(crate) epoch: Epoch, /// The fork that should be used to verify proposer signatures. - fork: Fork, + pub(crate) fork: Fork, /// A list of length `T::EthSpec::slots_per_epoch()`, representing the proposers for each slot /// in that epoch. /// /// E.g., if `self.epoch == 1`, then `self.proposers[0]` contains the proposer for slot `32`. - proposers: SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>, + pub(crate) proposers: SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>, } /// A cache to store the proposers for some epoch. /// /// See the module-level documentation for more information. pub struct BeaconProposerCache { - cache: LruCache<(Epoch, Hash256), EpochBlockProposers>, + cache: LruCache<(Epoch, Hash256), Arc>>, } impl Default for BeaconProposerCache { @@ -74,7 +76,8 @@ impl BeaconProposerCache { ) -> Option { let epoch = slot.epoch(E::slots_per_epoch()); let key = (epoch, shuffling_decision_block); - if let Some(cache) = self.cache.get(&key) { + let cache_opt = self.cache.get(&key).and_then(|cell| cell.get()); + if let Some(cache) = cache_opt { // This `if` statement is likely unnecessary, but it feels like good practice. 
if epoch == cache.epoch { cache @@ -103,7 +106,26 @@ impl BeaconProposerCache { epoch: Epoch, ) -> Option<&SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>> { let key = (epoch, shuffling_decision_block); - self.cache.get(&key).map(|cache| &cache.proposers) + self.cache + .get(&key) + .and_then(|cache_once_cell| cache_once_cell.get().map(|proposers| &proposers.proposers)) + } + + /// Returns the `OnceCell` for the given `(epoch, shuffling_decision_block)` key, + /// inserting an empty one if it doesn't exist. + /// + /// The returned `OnceCell` allows the caller to initialise the value externally + /// using `get_or_try_init`, enabling deferred computation without holding a mutable + /// reference to the cache. + pub fn get_or_insert_key( + &mut self, + epoch: Epoch, + shuffling_decision_block: Hash256, + ) -> Arc> { + let key = (epoch, shuffling_decision_block); + self.cache + .get_or_insert(key, || Arc::new(OnceCell::new())) + .clone() } /// Insert the proposers into the cache. @@ -120,14 +142,13 @@ impl BeaconProposerCache { ) -> Result<(), BeaconStateError> { let key = (epoch, shuffling_decision_block); if !self.cache.contains(&key) { - self.cache.put( - key, - EpochBlockProposers { - epoch, - fork, - proposers: proposers.into(), - }, - ); + let epoch_proposers = EpochBlockProposers { + epoch, + fork, + proposers: proposers.into(), + }; + self.cache + .put(key, Arc::new(OnceCell::with_value(epoch_proposers))); } Ok(()) diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 57efbb0a77..7d22bcf341 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -1,3 +1,4 @@ +use crate::beacon_proposer_cache::EpochBlockProposers; use crate::block_verification::{ cheap_state_advance_to_obtain_committees, get_validator_pubkey_cache, process_block_slash_info, BlockSlashInfo, @@ -602,14 +603,19 @@ fn 
verify_proposer_and_signature( parent_block.root }; - let proposer_opt = chain + // We lock the cache briefly to get or insert a OnceCell, then drop the lock + // before doing proposer shuffling calculation via `OnceCell::get_or_try_init`. This avoids + // holding the lock during the computation, while still ensuring the result is cached and + // initialised only once. + // + // This approach exposes the cache internals (`OnceCell` & `EpochBlockProposers`) + // as a trade-off for avoiding lock contention. + let epoch_proposers_cell = chain .beacon_proposer_cache .lock() - .get_slot::(proposer_shuffling_root, column_slot); + .get_or_insert_key(column_epoch, proposer_shuffling_root); - let (proposer_index, fork) = if let Some(proposer) = proposer_opt { - (proposer.index, proposer.fork) - } else { + let epoch_proposers = epoch_proposers_cell.get_or_try_init(move || { debug!( %block_root, index = %column_index, @@ -633,19 +639,20 @@ fn verify_proposer_and_signature( )?; let proposers = state.get_beacon_proposer_indices(&chain.spec)?; - let proposer_index = *proposers - .get(column_slot.as_usize() % T::EthSpec::slots_per_epoch() as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(column_slot))?; - // Prime the proposer shuffling cache with the newly-learned value. - chain.beacon_proposer_cache.lock().insert( - column_epoch, - proposer_shuffling_root, - proposers, - state.fork(), - )?; - (proposer_index, state.fork()) - }; + Ok::<_, GossipDataColumnError>(EpochBlockProposers { + epoch: column_epoch, + fork: state.fork(), + proposers: proposers.into(), + }) + })?; + + let proposer_index = *epoch_proposers + .proposers + .get(column_slot.as_usize() % T::EthSpec::slots_per_epoch() as usize) + .ok_or_else(|| BeaconChainError::NoProposerForSlot(column_slot))?; + + let fork = epoch_proposers.fork; // Signature verify the signed block header. 
let signature_is_valid = { diff --git a/common/logging/Cargo.toml b/common/logging/Cargo.toml index 6975e04505..41c82dbd61 100644 --- a/common/logging/Cargo.toml +++ b/common/logging/Cargo.toml @@ -11,8 +11,6 @@ test_logger = [] # Print log output to stderr when running tests instead of drop chrono = { version = "0.4", default-features = false, features = ["clock", "std"] } logroller = { workspace = true } metrics = { workspace = true } -once_cell = "1.17.1" -parking_lot = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tokio = { workspace = true, features = [ "time" ] } From 2aa5d5c25e22e2f48177ee95bb90a84241e73ba4 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Thu, 1 May 2025 16:53:29 -0300 Subject: [PATCH 13/22] Make sure to log SyncingChain ID (#7359) Debugging a sync issue from @pawanjay176 I'm missing some key info where instead of logging the ID of the SyncingChain we just log "Finalized" (the sync type). This looks like some typo or something was lost in translation when refactoring things. ``` Apr 17 12:12:00.707 DEBUG Syncing new finalized chain chain: Finalized, component: "range_sync" ``` This log should include more info about the new chain but just logs "Finalized" ``` Apr 17 12:12:00.810 DEBUG New chain added to sync peer_id: "16Uiu2HAmHP8QLYQJwZ4cjMUEyRgxzpkJF87qPgNecLTpUdruYbdA", sync_type: Finalized, new_chain: Finalized, component: "range_sync" ``` - Remove the Display impl and log the ID explicitly for all logs.
- Log more details when creating a new SyncingChain --- .../network/src/sync/range_sync/chain.rs | 13 +--------- .../src/sync/range_sync/chain_collection.rs | 24 ++++++++++++------- .../network/src/sync/range_sync/range.rs | 6 ++--- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 24045e901b..813eb7a0c7 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -13,7 +13,6 @@ use logging::crit; use rand::seq::SliceRandom; use rand::Rng; use std::collections::{btree_map::Entry, BTreeMap, HashSet}; -use std::fmt; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -116,16 +115,6 @@ pub struct SyncingChain { current_processing_batch: Option, } -impl fmt::Display for SyncingChain { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self.chain_type { - SyncingChainType::Head => write!(f, "Head"), - SyncingChainType::Finalized => write!(f, "Finalized"), - SyncingChainType::Backfill => write!(f, "Backfill"), - } - } -} - #[derive(PartialEq, Debug)] pub enum ChainSyncingState { /// The chain is not being synced. @@ -177,7 +166,7 @@ impl SyncingChain { /// Get the chain's id. 
#[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn get_id(&self) -> ChainId { + pub fn id(&self) -> ChainId { self.id } diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index c6be3de576..9f500c61e0 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -293,8 +293,8 @@ impl ChainCollection { .expect("Chain exists"); match old_id { - Some(Some(old_id)) => debug!(old_id, %chain, "Switching finalized chains"), - None => debug!(%chain, "Syncing new finalized chain"), + Some(Some(old_id)) => debug!(old_id, id = chain.id(), "Switching finalized chains"), + None => debug!(id = chain.id(), "Syncing new finalized chain"), Some(None) => { // this is the same chain. We try to advance it. } @@ -359,7 +359,7 @@ impl ChainCollection { if syncing_chains.len() < PARALLEL_HEAD_CHAINS { // start this chain if it's not already syncing if !chain.is_syncing() { - debug!(%chain, "New head chain started syncing"); + debug!(id = chain.id(), "New head chain started syncing"); } if let Err(remove_reason) = chain.start_syncing(network, local_epoch, local_head_epoch) @@ -421,7 +421,7 @@ impl ChainCollection { if is_outdated(&chain.target_head_slot, &chain.target_head_root) || chain.available_peers() == 0 { - debug!(%chain, "Purging out of finalized chain"); + debug!(id, "Purging out of finalized chain"); Some((*id, chain.is_syncing(), RangeSyncType::Finalized)) } else { None @@ -432,7 +432,7 @@ impl ChainCollection { if is_outdated(&chain.target_head_slot, &chain.target_head_root) || chain.available_peers() == 0 { - debug!(%chain, "Purging out of date head chain"); + debug!(id, "Purging out of date head chain"); Some((*id, chain.is_syncing(), RangeSyncType::Head)) } else { None @@ -478,9 +478,9 @@ impl ChainCollection { debug_assert_eq!(chain.target_head_slot, 
target_head_slot); if let Err(remove_reason) = chain.add_peer(network, peer) { if remove_reason.is_critical() { - crit!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer"); + crit!(id, reason = ?remove_reason, "Chain removed after adding peer"); } else { - error!(chain = %id, reason = ?remove_reason, "Chain removed after adding peer"); + error!(id, reason = ?remove_reason, "Chain removed after adding peer"); } let is_syncing = chain.is_syncing(); collection.remove(&id); @@ -499,7 +499,15 @@ impl ChainCollection { sync_type.into(), ); - debug!(peer_id = peer_rpr, ?sync_type, %new_chain, "New chain added to sync"); + debug!( + peer_id = peer_rpr, + ?sync_type, + id, + %start_epoch, + %target_head_slot, + ?target_head_root, + "New chain added to sync" + ); collection.insert(id, new_chain); metrics::inc_counter_vec(&metrics::SYNCING_CHAINS_ADDED, &[sync_type.as_str()]); self.update_metrics(); diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index ab9a88e4ac..c87418b87b 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -386,15 +386,15 @@ where op: &'static str, ) { if remove_reason.is_critical() { - crit!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed"); + crit!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); } else { - debug!(?sync_type, %chain, reason = ?remove_reason,op, "Chain removed"); + debug!(id = chain.id(), ?sync_type, reason = ?remove_reason, op, "Chain removed"); } if let RemoveChain::ChainFailed { blacklist, .. } = remove_reason { if RangeSyncType::Finalized == sync_type && blacklist { warn!( - %chain, + id = chain.id(), "Chain failed! 
Syncing to its head won't be retried for at least the next {} seconds", FAILED_CHAINS_EXPIRY_SECONDS ); From c8224c8d5e19add319ac36cd53547d75fb2280b4 Mon Sep 17 00:00:00 2001 From: GarmashAlex Date: Mon, 5 May 2025 16:56:09 +0300 Subject: [PATCH 14/22] docs: fix broken link to voluntary exit guide (#7387) --- book/src/faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/src/faq.md b/book/src/faq.md index b0dd696902..62a93166b1 100644 --- a/book/src/faq.md +++ b/book/src/faq.md @@ -240,7 +240,7 @@ Another possible reason for missing the head vote is due to a chain "reorg". A r ### Can I submit a voluntary exit message without running a beacon node? -Yes. Beaconcha.in provides the tool to broadcast the message. You can create the voluntary exit message file with [ethdo](https://github.com/wealdtech/ethdo/releases) and submit the message via the [beaconcha.in](https://beaconcha.in/tools/broadcast) website. A guide on how to use `ethdo` to perform voluntary exit can be found [here](https://github.com/eth-educators/ethstaker-guides/blob/main/docs/validator_voluntary_exit.md). +Yes. Beaconcha.in provides the tool to broadcast the message. You can create the voluntary exit message file with [ethdo](https://github.com/wealdtech/ethdo/releases) and submit the message via the [beaconcha.in](https://beaconcha.in/tools/broadcast) website. A guide on how to use `ethdo` to perform voluntary exit can be found [here](https://github.com/eth-educators/ethstaker-guides/blob/main/docs/voluntary-exit.md). It is also noted that you can submit your BLS-to-execution-change message to update your withdrawal credentials from type `0x00` to `0x01` using the same link. 
From 43c38a6fa0cc357075d09f900d4c1b89a6b43742 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Tue, 6 May 2025 21:06:40 +0800 Subject: [PATCH 15/22] Change slog to tracing in comments (#7378) * #7196 --- common/logging/src/sse_logging_components.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/logging/src/sse_logging_components.rs b/common/logging/src/sse_logging_components.rs index a25b5be6c5..d526f2b040 100644 --- a/common/logging/src/sse_logging_components.rs +++ b/common/logging/src/sse_logging_components.rs @@ -1,5 +1,4 @@ -// TODO(tracing) fix the comments below and remove reference of slog::Drain -//! This module provides an implementation of `slog::Drain` that optionally writes to a channel if +//! This module provides an implementation of `tracing_subscriber::layer::Layer` that optionally writes to a channel if //! there are subscribers to a HTTP SSE stream. use serde_json::json; From beb0ce68bdf62a417a0dba8f11e907bf1c2d6f17 Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 6 May 2025 23:03:07 -0300 Subject: [PATCH 16/22] Make range sync peer loadbalancing PeerDAS-friendly (#6922) - Re-opens https://github.com/sigp/lighthouse/pull/6864 targeting unstable Range sync and backfill sync still assume that each batch request is done by a single peer. This assumption breaks with PeerDAS, where we request custody columns from N peers. Issues with current unstable: - Peer prioritization counts batch requests per peer. This accounting is broken now, data columns by range requests are not accounted for - Peer selection for data columns by range ignores the set of peers on a syncing chain, instead draws from the global pool of peers - The implementation is very strict when we have no peers to request from. After PeerDAS this case is very common and we want to be flexible and handle that case more gracefully than just hard-failing everything.
- [x] Upstream peer prioritization to the network context, it knows exactly how many active requests a peer has (including columns by range) - [x] Upstream peer selection to the network context, now `block_components_by_range_request` gets a set of peers to choose from instead of a single peer. If it can't find a peer, it returns the error `RpcRequestSendError::NoPeer` - [ ] Range sync and backfill sync handle `RpcRequestSendError::NoPeer` explicitly - [ ] Range sync: leaves the batch in `AwaitingDownload` state and does nothing. **TODO**: we should have some mechanism to fail the chain if it's stale for too long - **EDIT**: Not done in this PR - [x] Backfill sync: pauses the sync until another peer joins - **EDIT**: Same logic as unstable ### TODOs - [ ] Add tests :) - [x] Manually test backfill sync Note: this touches the mainnet path! --- .../src/peer_manager/peerdb.rs | 19 +- .../lighthouse_network/src/types/globals.rs | 14 + .../src/network_beacon_processor/mod.rs | 2 +- .../network/src/sync/backfill_sync/mod.rs | 303 +++++++--------- beacon_node/network/src/sync/manager.rs | 4 +- .../network/src/sync/network_context.rs | 331 +++++++++++++----- .../src/sync/network_context/custody.rs | 32 +- .../src/sync/network_context/requests.rs | 4 + .../network/src/sync/range_sync/batch.rs | 43 +-- .../network/src/sync/range_sync/chain.rs | 249 +++++-------- .../network/src/sync/range_sync/range.rs | 5 +- beacon_node/network/src/sync/tests/lookups.rs | 7 +- 12 files changed, 541 insertions(+), 472 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 083887046a..95a4e82fa2 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -1,6 +1,8 @@ use crate::discovery::enr::PEERDAS_CUSTODY_GROUP_COUNT_ENR_KEY; use crate::discovery::{peer_id_to_node_id, CombinedKey}; -use crate::{metrics,
multiaddr::Multiaddr, types::Subnet, Enr, EnrExt, Gossipsub, PeerId}; +use crate::{ + metrics, multiaddr::Multiaddr, types::Subnet, Enr, EnrExt, Gossipsub, PeerId, SyncInfo, +}; use itertools::Itertools; use logging::crit; use peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; @@ -15,7 +17,7 @@ use std::{ use sync_status::SyncStatus; use tracing::{debug, error, trace, warn}; use types::data_column_custody_group::compute_subnets_for_node; -use types::{ChainSpec, DataColumnSubnetId, EthSpec}; +use types::{ChainSpec, DataColumnSubnetId, Epoch, EthSpec, Hash256, Slot}; pub mod client; pub mod peer_info; @@ -735,6 +737,19 @@ impl PeerDB { }, ); + self.update_sync_status( + &peer_id, + SyncStatus::Synced { + // Fill in mock SyncInfo, only for the peer to return `is_synced() == true`. + info: SyncInfo { + head_slot: Slot::new(0), + head_root: Hash256::ZERO, + finalized_epoch: Epoch::new(0), + finalized_root: Hash256::ZERO, + }, + }, + ); + if supernode { let peer_info = self.peers.get_mut(&peer_id).expect("peer exists"); let all_subnets = (0..spec.data_column_sidecar_subnet_count) diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index 3031a0dff7..fd99d93589 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -206,6 +206,20 @@ impl NetworkGlobals { .collect::>() } + /// Returns true if the peer is known and is a custodian of `column_index` + pub fn is_custody_peer_of(&self, column_index: ColumnIndex, peer_id: &PeerId) -> bool { + self.peers + .read() + .peer_info(peer_id) + .map(|info| { + info.is_assigned_to_custody_subnet(&DataColumnSubnetId::from_column_index( + column_index, + &self.spec, + )) + }) + .unwrap_or(false) + } + /// Returns the TopicConfig to compute the set of Gossip topics for a given fork pub fn as_topic_config(&self) -> TopicConfig { TopicConfig { diff --git 
a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 9a8edbfa4c..cfd5c24f99 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -1141,7 +1141,7 @@ use { }; #[cfg(test)] -type TestBeaconChainType = +pub(crate) type TestBeaconChainType = Witness, E, MemoryStore, MemoryStore>; #[cfg(test)] diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 509caf7316..fcef06271f 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -10,7 +10,9 @@ use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::manager::BatchProcessResult; -use crate::sync::network_context::{RangeRequestId, RpcResponseError, SyncNetworkContext}; +use crate::sync::network_context::{ + RangeRequestId, RpcRequestSendError, RpcResponseError, SyncNetworkContext, +}; use crate::sync::range_sync::{ BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, }; @@ -20,10 +22,9 @@ use lighthouse_network::service::api_types::Id; use lighthouse_network::types::{BackFillState, NetworkGlobals}; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use rand::seq::SliceRandom; use std::collections::{ btree_map::{BTreeMap, Entry}, - HashMap, HashSet, + HashSet, }; use std::sync::Arc; use tracing::{debug, error, info, instrument, warn}; @@ -121,9 +122,6 @@ pub struct BackFillSync { /// Sorted map of batches undergoing some kind of processing. batches: BTreeMap>, - /// List of peers we are currently awaiting a response for. - active_requests: HashMap>, - /// The current processing batch, if any. 
current_processing_batch: Option, @@ -176,7 +174,6 @@ impl BackFillSync { let bfs = BackFillSync { batches: BTreeMap::new(), - active_requests: HashMap::new(), processing_target: current_start, current_start, last_batch_downloaded: false, @@ -314,45 +311,11 @@ impl BackFillSync { skip_all )] #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"] - pub fn peer_disconnected( - &mut self, - peer_id: &PeerId, - network: &mut SyncNetworkContext, - ) -> Result<(), BackFillError> { + pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> { if matches!(self.state(), BackFillState::Failed) { return Ok(()); } - if let Some(batch_ids) = self.active_requests.remove(peer_id) { - // fail the batches. - for id in batch_ids { - if let Some(batch) = self.batches.get_mut(&id) { - match batch.download_failed(false) { - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(id))?; - } - Ok(BatchOperationOutcome::Continue) => {} - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?; - } - } - // If we have run out of peers in which to retry this batch, the backfill state - // transitions to a paused state. - // We still need to reset the state for all the affected batches, so we should not - // short circuit early. 
- if self.retry_batch_download(network, id).is_err() { - debug!( - batch_id = %id, - error = "no synced peers", - "Batch could not be retried" - ); - } - } else { - debug!(peer = %peer_id, batch = %id, "Batch not found while removing peer"); - } - } - } - // Remove the peer from the participation list self.participating_peers.remove(peer_id); Ok(()) @@ -386,15 +349,12 @@ impl BackFillSync { return Ok(()); } debug!(batch_epoch = %batch_id, error = ?err, "Batch download failed"); - if let Some(active_requests) = self.active_requests.get_mut(peer_id) { - active_requests.remove(&batch_id); - } - match batch.download_failed(true) { + match batch.download_failed(Some(*peer_id)) { Err(e) => self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)), Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { self.fail_sync(BackFillError::BatchDownloadFailed(batch_id)) } - Ok(BatchOperationOutcome::Continue) => self.retry_batch_download(network, batch_id), + Ok(BatchOperationOutcome::Continue) => self.send_batch(network, batch_id), } } else { // this could be an error for an old batch, removed when the chain advances @@ -435,19 +395,11 @@ impl BackFillSync { // sending an error /timeout) if the peer is removed from the chain for other // reasons. Check that this block belongs to the expected peer, and that the // request_id matches - // TODO(das): removed peer_id matching as the node may request a different peer for data - // columns. if !batch.is_expecting_block(&request_id) { return Ok(ProcessResult::Successful); } - // A stream termination has been sent. This batch has ended. Process a completed batch. 
- // Remove the request from the peer's active batches - self.active_requests - .get_mut(peer_id) - .map(|active_requests| active_requests.remove(&batch_id)); - - match batch.download_completed(blocks) { + match batch.download_completed(blocks, *peer_id) { Ok(received) => { let awaiting_batches = self.processing_target.saturating_sub(batch_id) / BACKFILL_EPOCHS_PER_BATCH; @@ -488,7 +440,6 @@ impl BackFillSync { self.set_state(BackFillState::Failed); // Remove all batches and active requests and participating peers. self.batches.clear(); - self.active_requests.clear(); self.participating_peers.clear(); self.restart_failed_sync = false; @@ -622,7 +573,7 @@ impl BackFillSync { } }; - let Some(peer) = batch.current_peer() else { + let Some(peer) = batch.processing_peer() else { self.fail_sync(BackFillError::BatchInvalidState( batch_id, String::from("Peer does not exist"), @@ -698,6 +649,8 @@ impl BackFillSync { ); for peer in self.participating_peers.drain() { + // TODO(das): `participating_peers` only includes block peers. Should we + // penalize the custody column peers too? network.report_peer(peer, *penalty, "backfill_batch_failed"); } self.fail_sync(BackFillError::BatchProcessingFailed(batch_id)) @@ -723,7 +676,7 @@ impl BackFillSync { { self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))?; } - self.retry_batch_download(network, batch_id)?; + self.send_batch(network, batch_id)?; Ok(ProcessResult::Successful) } } @@ -864,12 +817,7 @@ impl BackFillSync { } } } - BatchState::Downloading(peer, ..) => { - // remove this batch from the peer's active requests - if let Some(active_requests) = self.active_requests.get_mut(peer) { - active_requests.remove(&id); - } - } + BatchState::Downloading(..) 
=> {} BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { crit!("batch indicates inconsistent chain state while advancing chain") } @@ -951,57 +899,10 @@ impl BackFillSync { self.processing_target = self.current_start; for id in redownload_queue { - self.retry_batch_download(network, id)?; + self.send_batch(network, id)?; } // finally, re-request the failed batch. - self.retry_batch_download(network, batch_id) - } - - /// Sends and registers the request of a batch awaiting download. - #[instrument(parent = None, - level = "info", - fields(service = "backfill_sync"), - name = "backfill_sync", - skip_all - )] - fn retry_batch_download( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> Result<(), BackFillError> { - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Ok(()); - }; - - // Find a peer to request the batch - let failed_peers = batch.failed_peers(); - - let new_peer = self - .network_globals - .peers - .read() - .synced_peers() - .map(|peer| { - ( - failed_peers.contains(peer), - self.active_requests.get(peer).map(|v| v.len()).unwrap_or(0), - rand::random::(), - *peer, - ) - }) - // Sort peers prioritizing unrelated peers with less active requests. - .min() - .map(|(_, _, _, peer)| peer); - - if let Some(peer) = new_peer { - self.participating_peers.insert(peer); - self.send_batch(network, batch_id, peer) - } else { - // If we are here the chain has no more synced peers - info!(reason = "insufficient_synced_peers", "Backfill sync paused"); - self.set_state(BackFillState::Paused); - Err(BackFillError::Paused) - } + self.send_batch(network, batch_id) } /// Requests the batch assigned to the given id from a given peer. 
@@ -1015,53 +916,65 @@ impl BackFillSync { &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: PeerId, ) -> Result<(), BackFillError> { if let Some(batch) = self.batches.get_mut(&batch_id) { + let synced_peers = self + .network_globals + .peers + .read() + .synced_peers() + .cloned() + .collect::>(); + let (request, is_blob_batch) = batch.to_blocks_by_range_request(); + let failed_peers = batch.failed_peers(); match network.block_components_by_range_request( - peer, is_blob_batch, request, RangeRequestId::BackfillSync { batch_id }, + &synced_peers, + &failed_peers, ) { Ok(request_id) => { // inform the batch about the new request - if let Err(e) = batch.start_downloading_from_peer(peer, request_id) { + if let Err(e) = batch.start_downloading(request_id) { return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); } debug!(epoch = %batch_id, %batch, "Requesting batch"); - // register the batch for this peer - self.active_requests - .entry(peer) - .or_default() - .insert(batch_id); return Ok(()); } - Err(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = ?e, %batch,"Could not send batch request"); - // register the failed download and check if the batch can be retried - if let Err(e) = batch.start_downloading_from_peer(peer, 1) { - return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); + Err(e) => match e { + RpcRequestSendError::NoPeer(no_peer) => { + // If we are here the chain has no more synced peers + info!( + "reason" = format!("insufficient_synced_peers({no_peer:?})"), + "Backfill sync paused" + ); + self.set_state(BackFillState::Paused); + return Err(BackFillError::Paused); } - self.active_requests - .get_mut(&peer) - .map(|request| request.remove(&batch_id)); + RpcRequestSendError::InternalError(e) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(%batch_id, error = ?e, %batch,"Could not send batch 
request"); + // register the failed download and check if the batch can be retried + if let Err(e) = batch.start_downloading(1) { + return self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0)); + } - match batch.download_failed(true) { - Err(e) => { - self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? - } - Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { - self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? - } - Ok(BatchOperationOutcome::Continue) => { - return self.retry_batch_download(network, batch_id) + match batch.download_failed(None) { + Err(e) => { + self.fail_sync(BackFillError::BatchInvalidState(batch_id, e.0))? + } + Ok(BatchOperationOutcome::Failed { blacklist: _ }) => { + self.fail_sync(BackFillError::BatchDownloadFailed(batch_id))? + } + Ok(BatchOperationOutcome::Continue) => { + return self.send_batch(network, batch_id) + } } } - } + }, } } @@ -1093,7 +1006,7 @@ impl BackFillSync { .collect::>(); for batch_id in batch_ids_to_retry { - self.retry_batch_download(network, batch_id)?; + self.send_batch(network, batch_id)?; } Ok(()) } @@ -1115,34 +1028,16 @@ impl BackFillSync { } // find the next pending batch and request it from the peer - - // randomize the peers for load balancing - let mut rng = rand::thread_rng(); - let mut idle_peers = self - .network_globals - .peers - .read() - .synced_peers() - .filter(|peer_id| { - self.active_requests - .get(peer_id) - .map(|requests| requests.is_empty()) - .unwrap_or(true) - }) - .cloned() - .collect::>(); - - idle_peers.shuffle(&mut rng); - - while let Some(peer) = idle_peers.pop() { - if let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id, peer)?; - } else { - // No more batches, simply stop - return Ok(()); - } + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the 
`BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. + while let Some(batch_id) = self.include_next_batch(network) { + // send the batch + self.send_batch(network, batch_id)?; } + + // No more batches, simply stop Ok(()) } @@ -1296,3 +1191,73 @@ enum ResetEpochError { /// The chain has already completed. SyncCompleted, } + +#[cfg(test)] +mod tests { + use super::*; + use beacon_chain::test_utils::BeaconChainHarness; + use bls::Hash256; + use lighthouse_network::{NetworkConfig, SyncInfo, SyncStatus}; + use rand::prelude::StdRng; + use rand::SeedableRng; + use types::MinimalEthSpec; + + #[test] + fn request_batches_should_not_loop_infinitely() { + let harness = BeaconChainHarness::builder(MinimalEthSpec) + .default_spec() + .deterministic_keypairs(4) + .fresh_ephemeral_store() + .build(); + + let beacon_chain = harness.chain.clone(); + let slots_per_epoch = MinimalEthSpec::slots_per_epoch(); + + let network_globals = Arc::new(NetworkGlobals::new_test_globals( + vec![], + Arc::new(NetworkConfig::default()), + beacon_chain.spec.clone(), + )); + + { + let mut rng = StdRng::seed_from_u64(0xDEADBEEF0BAD5EEDu64); + let peer_id = network_globals + .peers + .write() + .__add_connected_peer_testing_only( + true, + &beacon_chain.spec, + k256::ecdsa::SigningKey::random(&mut rng).into(), + ); + + // Simulate finalized epoch and head being 2 epochs ahead + let finalized_epoch = Epoch::new(40); + let head_epoch = finalized_epoch + 2; + let head_slot = head_epoch.start_slot(slots_per_epoch) + 1; + + network_globals.peers.write().update_sync_status( + &peer_id, + SyncStatus::Synced { + info: SyncInfo { + head_slot, + head_root: Hash256::random(), + finalized_epoch, + finalized_root: Hash256::random(), + }, + }, + ); + } + + let mut network = SyncNetworkContext::new_for_testing( + beacon_chain.clone(), + network_globals.clone(), + harness.runtime.task_executor.clone(), + ); + + let mut backfill = BackFillSync::new(beacon_chain, network_globals); + 
backfill.set_state(BackFillState::Syncing); + + // if this ends up running into an infinite loop, the test will overflow the stack pretty quickly. + let _ = backfill.request_batches(&mut network); + } +} diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 84e492c04f..9119b1652c 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -515,9 +515,7 @@ impl SyncManager { // Remove peer from all data structures self.range_sync.peer_disconnect(&mut self.network, peer_id); - let _ = self - .backfill_sync - .peer_disconnected(peer_id, &mut self.network); + let _ = self.backfill_sync.peer_disconnected(peer_id); self.block_lookups.peer_disconnected(peer_id); // Regardless of the outcome, we update the sync status. diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 2cb5ec9a0a..d9eda651e7 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -9,6 +9,8 @@ use super::range_sync::ByRangeRequestType; use super::SyncMessage; use crate::metrics; use crate::network_beacon_processor::NetworkBeaconProcessor; +#[cfg(test)] +use crate::network_beacon_processor::TestBeaconChainType; use crate::service::NetworkMessage; use crate::status::ToStatusMessage; use crate::sync::block_lookups::SingleLookupId; @@ -27,18 +29,20 @@ use lighthouse_network::service::api_types::{ }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; use parking_lot::RwLock; -use rand::prelude::IteratorRandom; -use rand::thread_rng; pub use requests::LookupVerifyError; use requests::{ ActiveRequests, BlobsByRangeRequestItems, BlobsByRootRequestItems, BlocksByRangeRequestItems, BlocksByRootRequestItems, DataColumnsByRangeRequestItems, DataColumnsByRootRequestItems, }; +#[cfg(test)] +use slot_clock::SlotClock; use std::collections::hash_map::Entry; use 
std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; use std::time::Duration; +#[cfg(test)] +use task_executor::TaskExecutor; use tokio::sync::mpsc; use tracing::{debug, error, span, warn, Level}; use types::blob_sidecar::FixedBlobSidecarList; @@ -82,24 +86,18 @@ pub enum RpcResponseError { #[derive(Debug, PartialEq, Eq)] pub enum RpcRequestSendError { - /// Network channel send failed - NetworkSendError, - NoCustodyPeers, - CustodyRequestError(custody::Error), - SlotClockError, + /// No peer available matching the required criteria + NoPeer(NoPeerError), + /// These errors should never happen, including unreachable custody errors or network send + /// errors. + InternalError(String), } -impl std::fmt::Display for RpcRequestSendError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match self { - RpcRequestSendError::NetworkSendError => write!(f, "Network send error"), - RpcRequestSendError::NoCustodyPeers => write!(f, "No custody peers"), - RpcRequestSendError::CustodyRequestError(e) => { - write!(f, "Custody request error: {:?}", e) - } - RpcRequestSendError::SlotClockError => write!(f, "Slot clock error"), - } - } +/// Type of peer missing that caused a `RpcRequestSendError::NoPeers` +#[derive(Debug, PartialEq, Eq)] +pub enum NoPeerError { + BlockPeer, + CustodyPeer(ColumnIndex), } #[derive(Debug, PartialEq, Eq)] @@ -232,6 +230,35 @@ pub enum RangeBlockComponent { ), } +#[cfg(test)] +impl SyncNetworkContext> { + pub fn new_for_testing( + beacon_chain: Arc>>, + network_globals: Arc>, + task_executor: TaskExecutor, + ) -> Self { + let fork_context = Arc::new(ForkContext::new::( + beacon_chain.slot_clock.now().unwrap_or(Slot::new(0)), + beacon_chain.genesis_validators_root, + &beacon_chain.spec, + )); + let (network_tx, _network_rx) = mpsc::unbounded_channel(); + let (beacon_processor, _) = NetworkBeaconProcessor::null_for_testing( + network_globals, + mpsc::unbounded_channel().0, + beacon_chain.clone(), + 
task_executor, + ); + + SyncNetworkContext::new( + network_tx, + Arc::new(beacon_processor), + beacon_chain, + fork_context, + ) + } +} + impl SyncNetworkContext { pub fn new( network_send: mpsc::UnboundedSender>, @@ -331,12 +358,6 @@ impl SyncNetworkContext { .custody_peers_for_column(column_index) } - pub fn get_random_custodial_peer(&self, column_index: ColumnIndex) -> Option { - self.get_custodial_peers(column_index) - .into_iter() - .choose(&mut thread_rng()) - } - pub fn network_globals(&self) -> &NetworkGlobals { &self.network_beacon_processor.network_globals } @@ -381,34 +402,102 @@ impl SyncNetworkContext { } } + fn active_request_count_by_peer(&self) -> HashMap { + let Self { + network_send: _, + request_id: _, + blocks_by_root_requests, + blobs_by_root_requests, + data_columns_by_root_requests, + blocks_by_range_requests, + blobs_by_range_requests, + data_columns_by_range_requests, + // custody_by_root_requests is a meta request of data_columns_by_root_requests + custody_by_root_requests: _, + // components_by_range_requests is a meta request of various _by_range requests + components_by_range_requests: _, + execution_engine_state: _, + network_beacon_processor: _, + chain: _, + fork_context: _, + // Don't use a fallback match. 
We want to be sure that all requests are considered when + // adding new ones + } = self; + + let mut active_request_count_by_peer = HashMap::::new(); + + for peer_id in blocks_by_root_requests + .iter_request_peers() + .chain(blobs_by_root_requests.iter_request_peers()) + .chain(data_columns_by_root_requests.iter_request_peers()) + .chain(blocks_by_range_requests.iter_request_peers()) + .chain(blobs_by_range_requests.iter_request_peers()) + .chain(data_columns_by_range_requests.iter_request_peers()) + { + *active_request_count_by_peer.entry(peer_id).or_default() += 1; + } + + active_request_count_by_peer + } + /// A blocks by range request sent by the range sync algorithm pub fn block_components_by_range_request( &mut self, - peer_id: PeerId, batch_type: ByRangeRequestType, request: BlocksByRangeRequest, requester: RangeRequestId, + peers: &HashSet, + peers_to_deprioritize: &HashSet, ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); + + let Some(block_peer) = peers + .iter() + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { + // Backfill and forward sync handle this condition gracefully. + // - Backfill sync: will pause waiting for more peers to join + // - Forward sync: can never happen as the chain is dropped when removing the last peer. 
+ return Err(RpcRequestSendError::NoPeer(NoPeerError::BlockPeer)); + }; + + // Attempt to find all required custody peers before sending any request or creating an ID + let columns_by_range_peers_to_request = + if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { + let column_indexes = self.network_globals().sampling_columns.clone(); + Some(self.select_columns_by_range_peers_to_request( + &column_indexes, + peers, + active_request_count_by_peer, + peers_to_deprioritize, + )?) + } else { + None + }; + // Create the overall components_by_range request ID before its individual components let id = ComponentsByRangeRequestId { id: self.next_id(), requester, }; - // Compute custody column peers before sending the blocks_by_range request. If we don't have - // enough peers, error here. - let data_column_requests = if matches!(batch_type, ByRangeRequestType::BlocksAndColumns) { - let column_indexes = self.network_globals().sampling_columns.clone(); - Some(self.make_columns_by_range_requests(request.clone(), &column_indexes)?) 
- } else { - None - }; - - let blocks_req_id = self.send_blocks_by_range_request(peer_id, request.clone(), id)?; + let blocks_req_id = self.send_blocks_by_range_request(block_peer, request.clone(), id)?; let blobs_req_id = if matches!(batch_type, ByRangeRequestType::BlocksAndBlobs) { Some(self.send_blobs_by_range_request( - peer_id, + block_peer, BlobsByRangeRequest { start_slot: *request.start_slot(), count: *request.count(), @@ -419,64 +508,98 @@ impl SyncNetworkContext { None }; - let data_columns = if let Some(data_column_requests) = data_column_requests { - let data_column_requests = data_column_requests - .into_iter() - .map(|(peer_id, columns_by_range_request)| { - self.send_data_columns_by_range_request(peer_id, columns_by_range_request, id) - }) - .collect::, _>>()?; + let data_column_requests = columns_by_range_peers_to_request + .map(|columns_by_range_peers_to_request| { + columns_by_range_peers_to_request + .into_iter() + .map(|(peer_id, columns)| { + self.send_data_columns_by_range_request( + peer_id, + DataColumnsByRangeRequest { + start_slot: *request.start_slot(), + count: *request.count(), + columns, + }, + id, + ) + }) + .collect::, _>>() + }) + .transpose()?; - Some(( - data_column_requests, - self.network_globals() - .sampling_columns - .iter() - .cloned() - .collect::>(), - )) - } else { - None - }; - - let info = RangeBlockComponentsRequest::new(blocks_req_id, blobs_req_id, data_columns); + let info = RangeBlockComponentsRequest::new( + blocks_req_id, + blobs_req_id, + data_column_requests.map(|data_column_requests| { + ( + data_column_requests, + self.network_globals() + .sampling_columns + .clone() + .iter() + .copied() + .collect(), + ) + }), + ); self.components_by_range_requests.insert(id, info); Ok(id.id) } - fn make_columns_by_range_requests( + fn select_columns_by_range_peers_to_request( &self, - request: BlocksByRangeRequest, custody_indexes: &HashSet, - ) -> Result, RpcRequestSendError> { - let mut peer_id_to_request_map = 
HashMap::new(); + peers: &HashSet, + active_request_count_by_peer: HashMap, + peers_to_deprioritize: &HashSet, + ) -> Result>, RpcRequestSendError> { + let mut columns_to_request_by_peer = HashMap::>::new(); for column_index in custody_indexes { - // TODO(das): The peer selection logic here needs to be improved - we should probably - // avoid retrying from failed peers, however `BatchState` currently only tracks the peer - // serving the blocks. - let Some(custody_peer) = self.get_random_custodial_peer(*column_index) else { + // Strictly consider peers that are custodials of this column AND are part of this + // syncing chain. If the forward range sync chain has few peers, it's likely that this + // function will not be able to find peers on our custody columns. + let Some(custody_peer) = peers + .iter() + .filter(|peer| { + self.network_globals() + .is_custody_peer_of(*column_index, peer) + }) + .map(|peer| { + ( + // If contains -> 1 (order after), not contains -> 0 (order first) + peers_to_deprioritize.contains(peer), + // Prefer peers with less overall requests + // Also account for requests that are not yet issued tracked in `columns_to_request_by_peer` + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. + active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, _, peer)| *peer) + else { // TODO(das): this will be pretty bad UX. To improve we should: - // - Attempt to fetch custody requests first, before requesting blocks // - Handle the no peers case gracefully, maybe add some timeout and give a few // minutes / seconds to the peer manager to locate peers on this subnet before // abandoing progress on the chain completely. 
- return Err(RpcRequestSendError::NoCustodyPeers); + return Err(RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer( + *column_index, + ))); }; - let columns_by_range_request = peer_id_to_request_map + columns_to_request_by_peer .entry(custody_peer) - .or_insert_with(|| DataColumnsByRangeRequest { - start_slot: *request.start_slot(), - count: *request.count(), - columns: vec![], - }); - - columns_by_range_request.columns.push(*column_index); + .or_default() + .push(*column_index); } - Ok(peer_id_to_request_map) + Ok(columns_to_request_by_peer) } /// Received a blocks by range or blobs by range response for a request that couples blocks ' @@ -536,11 +659,21 @@ impl SyncNetworkContext { lookup_peers: Arc>>, block_root: Hash256, ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() .iter() - .choose(&mut rand::thread_rng()) - .copied() + .map(|peer| { + ( + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) else { // Allow lookup to not have any peers and do nothing. 
This is an optimization to not // lose progress of lookups created from a block with unknown parent before we receive @@ -597,7 +730,7 @@ impl SyncNetworkContext { request: RequestType::BlocksByRoot(request.into_request(&self.fork_context)), app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlock { id }), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( method = "BlocksByRoot", @@ -632,11 +765,21 @@ impl SyncNetworkContext { block_root: Hash256, expected_blobs: usize, ) -> Result { + let active_request_count_by_peer = self.active_request_count_by_peer(); let Some(peer_id) = lookup_peers .read() .iter() - .choose(&mut rand::thread_rng()) - .copied() + .map(|peer| { + ( + // Prefer peers with less overall requests + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties + rand::random::(), + peer, + ) + }) + .min() + .map(|(_, _, peer)| *peer) else { // Allow lookup to not have any peers and do nothing. 
This is an optimization to not // lose progress of lookups created from a block with unknown parent before we receive @@ -686,7 +829,7 @@ impl SyncNetworkContext { request: RequestType::BlobsByRoot(request.clone().into_request(&self.fork_context)), app_request_id: AppRequestId::Sync(SyncRequestId::SingleBlob { id }), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( method = "BlobsByRoot", @@ -821,7 +964,25 @@ impl SyncNetworkContext { self.custody_by_root_requests.insert(requester, request); Ok(LookupRequestResult::RequestSent(id.req_id)) } - Err(e) => Err(RpcRequestSendError::CustodyRequestError(e)), + Err(e) => Err(match e { + CustodyRequestError::NoPeer(column_index) => { + RpcRequestSendError::NoPeer(NoPeerError::CustodyPeer(column_index)) + } + // - TooManyFailures: Should never happen, `request` has just been created, it's + // count of download_failures is 0 here + // - BadState: Should never happen, a bad state can only happen when handling a + // network response + // - UnexpectedRequestId: Never happens: this Err is only constructed handling a + // download or processing response + // - SendFailed: Should never happen unless in a bad drop sequence when shutting + // down the node + e @ (CustodyRequestError::TooManyFailures + | CustodyRequestError::BadState { .. } + | CustodyRequestError::UnexpectedRequestId { .. } + | CustodyRequestError::SendFailed { .. 
}) => { + RpcRequestSendError::InternalError(format!("{e:?}")) + } + }), } } @@ -841,7 +1002,7 @@ impl SyncNetworkContext { request: RequestType::BlocksByRange(request.clone().into()), app_request_id: AppRequestId::Sync(SyncRequestId::BlocksByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( method = "BlocksByRange", @@ -882,7 +1043,7 @@ impl SyncNetworkContext { request: RequestType::BlobsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::BlobsByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( method = "BlobsByRange", @@ -921,7 +1082,7 @@ impl SyncNetworkContext { request: RequestType::DataColumnsByRange(request.clone()), app_request_id: AppRequestId::Sync(SyncRequestId::DataColumnsByRange(id)), }) - .map_err(|_| RpcRequestSendError::NetworkSendError)?; + .map_err(|_| RpcRequestSendError::InternalError("network send error".to_owned()))?; debug!( method = "DataColumnsByRange", diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index e7e6e62349..f4d010b881 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -45,7 +45,7 @@ pub enum Error { SendFailed(&'static str), TooManyFailures, BadState(String), - NoPeers(ColumnIndex), + NoPeer(ColumnIndex), /// Received a download result for a different request id than the in-flight request. /// There should only exist a single request at a time. Having multiple requests is a bug and /// can result in undefined state, so it's treated as a hard error and the lookup is dropped. 
@@ -56,7 +56,6 @@ pub enum Error { } struct ActiveBatchColumnsRequest { - peer_id: PeerId, indices: Vec, } @@ -220,6 +219,7 @@ impl ActiveCustodyRequest { return Ok(Some((columns, peer_group, max_seen_timestamp))); } + let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); @@ -238,15 +238,11 @@ impl ActiveCustodyRequest { // only query the peers on that fork. Should this case be handled? How to handle it? let custodial_peers = cx.get_custodial_peers(*column_index); - // TODO(das): cache this computation in a OneCell or similar to prevent having to - // run it every loop - let mut active_requests_by_peer = HashMap::::new(); - for batch_request in self.active_batch_columns_requests.values() { - *active_requests_by_peer - .entry(batch_request.peer_id) - .or_default() += 1; - } - + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation / status / block message claiming to have imported the + // lookup. The frequency of those messages is low, so drawing only from lookup_peers + // could cause many lookups to take much longer or fail as they don't have enough + // custody peers on a given column let mut priorized_peers = custodial_peers .iter() .map(|peer| { @@ -256,9 +252,12 @@ impl ActiveCustodyRequest { // De-prioritize peers that have failed to successfully respond to // requests recently self.failed_peers.contains(peer), - // Prefer peers with less requests to load balance across peers - active_requests_by_peer.get(peer).copied().unwrap_or(0), - // Final random factor to give all peers a shot in each retry + // Prefer peers with fewer requests to load balance across peers. + // We batch requests to the same peer, so count existence in the + // `columns_to_request_by_peer` as a single 1 request. 
+ active_request_count_by_peer.get(peer).copied().unwrap_or(0) + + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), + // Random factor to break ties, otherwise the PeerID breaks ties rand::thread_rng().gen::(), *peer, ) @@ -276,7 +275,7 @@ impl ActiveCustodyRequest { // `MAX_STALE_NO_PEERS_DURATION`, else error and drop the request. Note that // lookup will naturally retry when other peers send us attestations for // descendants of this un-available lookup. - return Err(Error::NoPeers(*column_index)); + return Err(Error::NoPeer(*column_index)); } else { // Do not issue requests if there is no custody peer on this column } @@ -306,13 +305,14 @@ impl ActiveCustodyRequest { let column_request = self .column_requests .get_mut(column_index) + // Should never happen: column_index is iterated from column_requests .ok_or(Error::BadState("unknown column_index".to_owned()))?; column_request.on_download_start(req_id)?; } self.active_batch_columns_requests - .insert(req_id, ActiveBatchColumnsRequest { indices, peer_id }); + .insert(req_id, ActiveBatchColumnsRequest { indices }); } LookupRequestResult::NoRequestNeeded(_) => unreachable!(), LookupRequestResult::Pending(_) => unreachable!(), diff --git a/beacon_node/network/src/sync/network_context/requests.rs b/beacon_node/network/src/sync/network_context/requests.rs index c9b85e47b6..963b633ed6 100644 --- a/beacon_node/network/src/sync/network_context/requests.rs +++ b/beacon_node/network/src/sync/network_context/requests.rs @@ -179,6 +179,10 @@ impl ActiveRequests { .collect() } + pub fn iter_request_peers(&self) -> impl Iterator + '_ { + self.requests.values().map(|request| request.peer_id) + } + pub fn len(&self) -> usize { self.requests.len() } diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index c1ad550376..264f83ee82 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -107,7 
+107,7 @@ pub struct BatchInfo { /// Number of processing attempts that have failed but we do not count. non_faulty_processing_attempts: u8, /// The number of download retries this batch has undergone due to a failed request. - failed_download_attempts: Vec, + failed_download_attempts: Vec>, /// State of the batch. state: BatchState, /// Whether this batch contains all blocks or all blocks and blobs. @@ -132,7 +132,7 @@ pub enum BatchState { /// The batch has failed either downloading or processing, but can be requested again. AwaitingDownload, /// The batch is being downloaded. - Downloading(PeerId, Id), + Downloading(Id), /// The batch has been completely downloaded and is ready for processing. AwaitingProcessing(PeerId, Vec>, Instant), /// The batch is being processed. @@ -197,8 +197,8 @@ impl BatchInfo { peers.insert(attempt.peer_id); } - for download in &self.failed_download_attempts { - peers.insert(*download); + for peer in self.failed_download_attempts.iter().flatten() { + peers.insert(*peer); } peers @@ -206,18 +206,17 @@ impl BatchInfo { /// Verifies if an incoming block belongs to this batch. pub fn is_expecting_block(&self, request_id: &Id) -> bool { - if let BatchState::Downloading(_, expected_id) = &self.state { + if let BatchState::Downloading(expected_id) = &self.state { return expected_id == request_id; } false } /// Returns the peer that is currently responsible for progressing the state of the batch. - pub fn current_peer(&self) -> Option<&PeerId> { + pub fn processing_peer(&self) -> Option<&PeerId> { match &self.state { - BatchState::AwaitingDownload | BatchState::Failed => None, - BatchState::Downloading(peer_id, _) - | BatchState::AwaitingProcessing(peer_id, _, _) + BatchState::AwaitingDownload | BatchState::Failed | BatchState::Downloading(..) => None, + BatchState::AwaitingProcessing(peer_id, _, _) | BatchState::Processing(Attempt { peer_id, .. }) | BatchState::AwaitingValidation(Attempt { peer_id, .. 
}) => Some(peer_id), BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -276,9 +275,10 @@ impl BatchInfo { pub fn download_completed( &mut self, blocks: Vec>, + peer: PeerId, ) -> Result { match self.state.poison() { - BatchState::Downloading(peer, _request_id) => { + BatchState::Downloading(_) => { let received = blocks.len(); self.state = BatchState::AwaitingProcessing(peer, blocks, Instant::now()); Ok(received) @@ -297,19 +297,18 @@ impl BatchInfo { /// Mark the batch as failed and return whether we can attempt a re-download. /// /// This can happen if a peer disconnects or some error occurred that was not the peers fault. - /// THe `mark_failed` parameter, when set to false, does not increment the failed attempts of + /// The `peer` parameter, when set to None, does not increment the failed attempts of /// this batch and register the peer, rather attempts a re-download. #[must_use = "Batch may have failed"] pub fn download_failed( &mut self, - mark_failed: bool, + peer: Option, ) -> Result { match self.state.poison() { - BatchState::Downloading(peer, _request_id) => { + BatchState::Downloading(_) => { // register the attempt and check if the batch can be tried again - if mark_failed { - self.failed_download_attempts.push(peer); - } + self.failed_download_attempts.push(peer); + self.state = if self.failed_download_attempts.len() >= B::max_batch_download_attempts() as usize { @@ -331,14 +330,10 @@ impl BatchInfo { } } - pub fn start_downloading_from_peer( - &mut self, - peer: PeerId, - request_id: Id, - ) -> Result<(), WrongState> { + pub fn start_downloading(&mut self, request_id: Id) -> Result<(), WrongState> { match self.state.poison() { BatchState::AwaitingDownload => { - self.state = BatchState::Downloading(peer, request_id); + self.state = BatchState::Downloading(request_id); Ok(()) } BatchState::Poisoned => unreachable!("Poisoned batch"), @@ -477,8 +472,8 @@ impl std::fmt::Debug for BatchState { BatchState::AwaitingProcessing(ref peer, ref blocks, 
_) => { write!(f, "AwaitingProcessing({}, {} blocks)", peer, blocks.len()) } - BatchState::Downloading(peer, request_id) => { - write!(f, "Downloading({}, {})", peer, request_id) + BatchState::Downloading(request_id) => { + write!(f, "Downloading({})", request_id) } BatchState::Poisoned => f.write_str("Poisoned"), } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 813eb7a0c7..be01734417 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -2,16 +2,13 @@ use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::RangeSyncType; use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; -use crate::sync::network_context::{RangeRequestId, RpcResponseError}; +use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; use beacon_chain::block_verification_types::RpcBlock; use beacon_chain::BeaconChainTypes; -use fnv::FnvHashMap; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; use logging::crit; -use rand::seq::SliceRandom; -use rand::Rng; use std::collections::{btree_map::Entry, BTreeMap, HashSet}; use strum::IntoStaticStr; use tracing::{debug, instrument, warn}; @@ -91,7 +88,7 @@ pub struct SyncingChain { /// The peers that agree on the `target_head_slot` and `target_head_root` as a canonical chain /// and thus available to download this chain from, as well as the batches we are currently /// requesting. - peers: FnvHashMap>, + peers: HashSet, /// Starting epoch of the next batch that needs to be downloaded. 
to_be_downloaded: BatchId, @@ -133,9 +130,6 @@ impl SyncingChain { peer_id: PeerId, chain_type: SyncingChainType, ) -> Self { - let mut peers = FnvHashMap::default(); - peers.insert(peer_id, Default::default()); - SyncingChain { id, chain_type, @@ -143,7 +137,7 @@ impl SyncingChain { target_head_slot, target_head_root, batches: BTreeMap::new(), - peers, + peers: HashSet::from_iter([peer_id]), to_be_downloaded: start_epoch, processing_target: start_epoch, optimistic_start: None, @@ -173,7 +167,7 @@ impl SyncingChain { /// Peers currently syncing this chain. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn peers(&self) -> impl Iterator + '_ { - self.peers.keys().cloned() + self.peers.iter().cloned() } /// Progress in epochs made by the chain @@ -196,29 +190,8 @@ impl SyncingChain { /// Removes a peer from the chain. /// If the peer has active batches, those are considered failed and re-requested. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn remove_peer( - &mut self, - peer_id: &PeerId, - network: &mut SyncNetworkContext, - ) -> ProcessingResult { - if let Some(batch_ids) = self.peers.remove(peer_id) { - // fail the batches. - for id in batch_ids { - if let Some(batch) = self.batches.get_mut(&id) { - if let BatchOperationOutcome::Failed { blacklist } = - batch.download_failed(true)? - { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: id, - }); - } - self.retry_batch_download(network, id)?; - } else { - debug!(%peer_id, batch = ?id, "Batch not found while removing peer") - } - } - } + pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { + self.peers.remove(peer_id); if self.peers.is_empty() { Err(RemoveChain::EmptyPeerPool) @@ -270,11 +243,9 @@ impl SyncingChain { // A stream termination has been sent. This batch has ended. Process a completed batch. 
// Remove the request from the peer's active batches - self.peers - .get_mut(peer_id) - .map(|active_requests| active_requests.remove(&batch_id)); - let received = batch.download_completed(blocks)?; + // TODO(das): should use peer group here https://github.com/sigp/lighthouse/issues/6258 + let received = batch.download_completed(blocks, *peer_id)?; let awaiting_batches = batch_id .saturating_sub(self.optimistic_start.unwrap_or(self.processing_target)) / EPOCHS_PER_BATCH; @@ -476,7 +447,7 @@ impl SyncingChain { } }; - let peer = batch.current_peer().cloned().ok_or_else(|| { + let peer = batch.processing_peer().cloned().ok_or_else(|| { RemoveChain::WrongBatchState(format!( "Processing target is in wrong state: {:?}", batch.state(), @@ -582,7 +553,7 @@ impl SyncingChain { "Batch failed to download. Dropping chain scoring peers" ); - for (peer, _) in self.peers.drain() { + for peer in self.peers.drain() { network.report_peer(peer, *penalty, "faulty_chain"); } Err(RemoveChain::ChainFailed { @@ -595,7 +566,7 @@ impl SyncingChain { BatchProcessResult::NonFaultyFailure => { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; // Simply redownload the batch. - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } } } @@ -616,7 +587,7 @@ impl SyncingChain { debug!(%epoch, reason, "Rejected optimistic batch left for future use"); // this batch is now treated as any other batch, and re-requested for future use if redownload { - return self.retry_batch_download(network, epoch); + return self.send_batch(network, epoch); } } else { debug!(%epoch, reason, "Rejected optimistic batch"); @@ -696,12 +667,7 @@ impl SyncingChain { } } } - BatchState::Downloading(peer, ..) => { - // remove this batch from the peer's active requests - if let Some(active_batches) = self.peers.get_mut(peer) { - active_batches.remove(&id); - } - } + BatchState::Downloading(..) 
=> {} BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { crit!("batch indicates inconsistent chain state while advancing chain") } @@ -790,10 +756,10 @@ impl SyncingChain { self.processing_target = self.start_epoch; for id in redownload_queue { - self.retry_batch_download(network, id)?; + self.send_batch(network, id)?; } // finally, re-request the failed batch. - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } pub fn stop_syncing(&mut self) { @@ -849,13 +815,8 @@ impl SyncingChain { network: &mut SyncNetworkContext, peer_id: PeerId, ) -> ProcessingResult { - // add the peer without overwriting its active requests - if self.peers.entry(peer_id).or_default().is_empty() { - // Either new or not, this peer is idle, try to request more batches - self.request_batches(network) - } else { - Ok(KeepChain) - } + self.peers.insert(peer_id); + self.request_batches(network) } /// An RPC error has occurred. @@ -896,16 +857,15 @@ impl SyncingChain { %request_id, "Batch download error" ); - if let Some(active_requests) = self.peers.get_mut(peer_id) { - active_requests.remove(&batch_id); - } - if let BatchOperationOutcome::Failed { blacklist } = batch.download_failed(true)? { + if let BatchOperationOutcome::Failed { blacklist } = + batch.download_failed(Some(*peer_id))? + { return Err(RemoveChain::ChainFailed { blacklist, failing_batch: batch_id, }); } - self.retry_batch_download(network, batch_id) + self.send_batch(network, batch_id) } else { debug!( batch_epoch = %batch_id, @@ -919,66 +879,42 @@ impl SyncingChain { } } - /// Sends and registers the request of a batch awaiting download. 
- #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] - pub fn retry_batch_download( - &mut self, - network: &mut SyncNetworkContext, - batch_id: BatchId, - ) -> ProcessingResult { - let Some(batch) = self.batches.get_mut(&batch_id) else { - return Ok(KeepChain); - }; - - // Find a peer to request the batch - let failed_peers = batch.failed_peers(); - - let new_peer = self - .peers - .iter() - .map(|(peer, requests)| { - ( - failed_peers.contains(peer), - requests.len(), - rand::thread_rng().gen::(), - *peer, - ) - }) - // Sort peers prioritizing unrelated peers with less active requests. - .min() - .map(|(_, _, _, peer)| peer); - - if let Some(peer) = new_peer { - self.send_batch(network, batch_id, peer) - } else { - // If we are here the chain has no more peers - Err(RemoveChain::EmptyPeerPool) - } - } - /// Requests the batch assigned to the given id from a given peer. #[instrument(parent = None,level = "info", fields(chain = self.id , service = "range_sync"), skip_all)] pub fn send_batch( &mut self, network: &mut SyncNetworkContext, batch_id: BatchId, - peer: PeerId, ) -> ProcessingResult { let batch_state = self.visualize_batch_state(); if let Some(batch) = self.batches.get_mut(&batch_id) { let (request, batch_type) = batch.to_blocks_by_range_request(); + let failed_peers = batch.failed_peers(); + + // TODO(das): we should request only from peers that are part of this SyncingChain. + // However, then we hit the NoPeer error frequently which causes the batch to fail and + // the SyncingChain to be dropped. We need to handle this case more gracefully. 
+ let synced_peers = network + .network_globals() + .peers + .read() + .synced_peers() + .cloned() + .collect::>(); + match network.block_components_by_range_request( - peer, batch_type, request, RangeRequestId::RangeSync { chain_id: self.id, batch_id, }, + &synced_peers, + &failed_peers, ) { Ok(request_id) => { // inform the batch about the new request - batch.start_downloading_from_peer(peer, request_id)?; + batch.start_downloading(request_id)?; if self .optimistic_start .map(|epoch| epoch == batch_id) @@ -988,41 +924,34 @@ impl SyncingChain { } else { debug!(epoch = %batch_id, %batch, %batch_state, "Requesting batch"); } - // register the batch for this peer - return self - .peers - .get_mut(&peer) - .map(|requests| { - requests.insert(batch_id); - Ok(KeepChain) - }) - .unwrap_or_else(|| { - Err(RemoveChain::WrongChainState(format!( - "Sending batch to a peer that is not in the chain: {}", - peer - ))) - }); + return Ok(KeepChain); } - Err(e) => { - // NOTE: under normal conditions this shouldn't happen but we handle it anyway - warn!(%batch_id, error = %e, %batch, "Could not send batch request"); - // register the failed download and check if the batch can be retried - batch.start_downloading_from_peer(peer, 1)?; // fake request_id is not relevant - self.peers - .get_mut(&peer) - .map(|request| request.remove(&batch_id)); - match batch.download_failed(true)? { - BatchOperationOutcome::Failed { blacklist } => { - return Err(RemoveChain::ChainFailed { - blacklist, - failing_batch: batch_id, - }) - } - BatchOperationOutcome::Continue => { - return self.retry_batch_download(network, batch_id) + Err(e) => match e { + // TODO(das): Handle the NoPeer case explicitly and don't drop the batch. For + // sync to work properly it must be okay to have "stalled" batches in + // AwaitingDownload state. Currently it will error with invalid state if + // that happens. Sync manager must periodically prune stalled batches like + // we do for lookup sync. 
Then we can deprecate the redundant + // `good_peers_on_sampling_subnets` checks. + e + @ (RpcRequestSendError::NoPeer(_) | RpcRequestSendError::InternalError(_)) => { + // NOTE: under normal conditions this shouldn't happen but we handle it anyway + warn!(%batch_id, error = ?e, "batch_id" = %batch_id, %batch, "Could not send batch request"); + // register the failed download and check if the batch can be retried + batch.start_downloading(1)?; // fake request_id = 1 is not relevant + match batch.download_failed(None)? { + BatchOperationOutcome::Failed { blacklist } => { + return Err(RemoveChain::ChainFailed { + blacklist, + failing_batch: batch_id, + }) + } + BatchOperationOutcome::Continue => { + return self.send_batch(network, batch_id) + } } } - } + }, } } @@ -1061,21 +990,6 @@ impl SyncingChain { // find the next pending batch and request it from the peer - // randomize the peers for load balancing - let mut rng = rand::thread_rng(); - let mut idle_peers = self - .peers - .iter() - .filter_map(|(peer, requests)| { - if requests.is_empty() { - Some(*peer) - } else { - None - } - }) - .collect::>(); - idle_peers.shuffle(&mut rng); - // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. 
if let Some(epoch) = self.optimistic_start { @@ -1085,26 +999,25 @@ impl SyncingChain { } if let Entry::Vacant(entry) = self.batches.entry(epoch) { - if let Some(peer) = idle_peers.pop() { - let batch_type = network.batch_type(epoch); - let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); - entry.insert(optimistic_batch); - self.send_batch(network, epoch, peer)?; - } + let batch_type = network.batch_type(epoch); + let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); + entry.insert(optimistic_batch); + self.send_batch(network, epoch)?; } return Ok(KeepChain); } - while let Some(peer) = idle_peers.pop() { - if let Some(batch_id) = self.include_next_batch(network) { - // send the batch - self.send_batch(network, batch_id, peer)?; - } else { - // No more batches, simply stop - return Ok(KeepChain); - } + // find the next pending batch and request it from the peer + // Note: for this function to not infinite loop we must: + // - If `include_next_batch` returns Some we MUST increase the count of batches that are + // accounted in the `BACKFILL_BATCH_BUFFER_SIZE` limit in the `matches!` statement of + // that function. + while let Some(batch_id) = self.include_next_batch(network) { + // send the batch + self.send_batch(network, batch_id)?; } + // No more batches, simply stop Ok(KeepChain) } @@ -1149,6 +1062,7 @@ impl SyncingChain { { return None; } + // only request batches up to the buffer size limit // NOTE: we don't count batches in the AwaitingValidation state, to prevent stalling sync // if the current processing window is contained in a long range of skip slots. 
@@ -1177,19 +1091,20 @@ impl SyncingChain { return None; } - let batch_id = self.to_be_downloaded; + // If no batch needs a retry, attempt to send the batch of the next epoch to download + let next_batch_id = self.to_be_downloaded; // this batch could have been included already being an optimistic batch - match self.batches.entry(batch_id) { + match self.batches.entry(next_batch_id) { Entry::Occupied(_) => { // this batch doesn't need downloading, let this same function decide the next batch self.to_be_downloaded += EPOCHS_PER_BATCH; self.include_next_batch(network) } Entry::Vacant(entry) => { - let batch_type = network.batch_type(batch_id); - entry.insert(BatchInfo::new(&batch_id, EPOCHS_PER_BATCH, batch_type)); + let batch_type = network.batch_type(next_batch_id); + entry.insert(BatchInfo::new(&next_batch_id, EPOCHS_PER_BATCH, batch_type)); self.to_be_downloaded += EPOCHS_PER_BATCH; - Some(batch_id) + Some(next_batch_id) } } } diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index c87418b87b..1ec1440991 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -317,9 +317,8 @@ where skip_all )] fn remove_peer(&mut self, network: &mut SyncNetworkContext, peer_id: &PeerId) { - for (removed_chain, sync_type, remove_reason) in self - .chains - .call_all(|chain| chain.remove_peer(peer_id, network)) + for (removed_chain, sync_type, remove_reason) in + self.chains.call_all(|chain| chain.remove_peer(peer_id)) { self.on_chain_removed( removed_chain, diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index 84c95b2a4c..565d7bc9f8 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -357,10 +357,13 @@ impl TestRig { pub fn new_connected_peer(&mut self) -> PeerId { let key = self.determinstic_key(); - self.network_globals + let peer_id = 
self + .network_globals .peers .write() - .__add_connected_peer_testing_only(false, &self.harness.spec, key) + .__add_connected_peer_testing_only(false, &self.harness.spec, key); + self.log(&format!("Added new peer for testing {peer_id:?}")); + peer_id } pub fn new_connected_supernode_peer(&mut self) -> PeerId { From 3d92e3663b74556e4c3f72c7c7ae6024fa0962a9 Mon Sep 17 00:00:00 2001 From: Daniel Knopik <107140945+dknopik@users.noreply.github.com> Date: Wed, 7 May 2025 05:43:33 +0200 Subject: [PATCH 17/22] Modularize validator store (#6705) - Create trait `ValidatorStore` with all functions used by the `validator_services` - Make `validator_services` generic on `S: ValidatorStore` - Introduce `LighthouseValidatorStore`, which has identical functionality to the old `ValidatorStore` - Remove dependencies (especially `environment`) from `validator_services` and `beacon_node_fallback` in order to be able to cleanly use them in Anchor --- Cargo.lock | 49 +- Cargo.toml | 5 +- consensus/types/src/attestation.rs | 2 +- consensus/types/src/payload.rs | 16 +- testing/web3signer_tests/Cargo.toml | 1 + testing/web3signer_tests/src/lib.rs | 34 +- validator_client/Cargo.toml | 1 + .../beacon_node_fallback/Cargo.toml | 3 +- .../beacon_node_fallback/src/lib.rs | 54 +- .../doppelganger_service/Cargo.toml | 1 + .../doppelganger_service/src/lib.rs | 92 +- validator_client/http_api/Cargo.toml | 31 +- .../src/create_signed_voluntary_exit.rs | 3 +- .../http_api/src/create_validator.rs | 9 +- validator_client/http_api/src/graffiti.rs | 8 +- validator_client/http_api/src/keystores.rs | 14 +- validator_client/http_api/src/lib.rs | 105 +- validator_client/http_api/src/remotekeys.rs | 18 +- validator_client/http_api/src/test_utils.rs | 15 +- validator_client/http_api/src/tests.rs | 23 +- .../http_api/src/tests/keystores.rs | 3 +- validator_client/http_metrics/Cargo.toml | 2 +- validator_client/http_metrics/src/lib.rs | 12 +- .../lighthouse_validator_store/Cargo.toml | 30 + 
.../lighthouse_validator_store/src/lib.rs | 1130 ++++++++++++++++ validator_client/signing_method/src/lib.rs | 17 +- .../slashing_protection/src/lib.rs | 2 +- .../src/signed_attestation.rs | 2 +- .../slashing_protection/src/signed_block.rs | 2 +- validator_client/src/config.rs | 2 +- validator_client/src/latency.rs | 2 +- validator_client/src/lib.rs | 91 +- validator_client/src/notifier.rs | 5 +- .../validator_services/Cargo.toml | 5 +- .../src/attestation_service.rs | 101 +- .../validator_services/src/block_service.rs | 158 ++- .../validator_services/src/duties_service.rs | 240 +++- .../src/preparation_service.rs | 69 +- .../validator_services/src/sync.rs | 64 +- .../src/sync_committee_service.rs | 60 +- validator_client/validator_store/Cargo.toml | 15 - validator_client/validator_store/src/lib.rs | 1136 +++-------------- 42 files changed, 2010 insertions(+), 1622 deletions(-) create mode 100644 validator_client/lighthouse_validator_store/Cargo.toml create mode 100644 validator_client/lighthouse_validator_store/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 40c331b659..ff87c32783 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -879,14 +879,13 @@ name = "beacon_node_fallback" version = "0.1.0" dependencies = [ "clap", - "environment", "eth2", "futures", "itertools 0.10.5", - "logging", "serde", "slot_clock", "strum", + "task_executor", "tokio", "tracing", "types", @@ -2364,6 +2363,7 @@ dependencies = [ "tokio", "tracing", "types", + "validator_store", ] [[package]] @@ -5635,6 +5635,32 @@ dependencies = [ "unused_port", ] +[[package]] +name = "lighthouse_validator_store" +version = "0.1.0" +dependencies = [ + "account_utils", + "beacon_node_fallback", + "doppelganger_service", + "either", + "environment", + "eth2", + "futures", + "initialized_validators", + "logging", + "parking_lot 0.12.3", + "serde", + "signing_method", + "slashing_protection", + "slot_clock", + "task_executor", + "tokio", + "tracing", + "types", + "validator_metrics", + "validator_store", +] + 
[[package]] name = "lighthouse_version" version = "0.1.0" @@ -9667,6 +9693,7 @@ dependencies = [ "graffiti_file", "hyper 1.6.0", "initialized_validators", + "lighthouse_validator_store", "metrics", "monitoring_api", "parking_lot 0.12.3", @@ -9722,6 +9749,7 @@ dependencies = [ "health_metrics", "initialized_validators", "itertools 0.10.5", + "lighthouse_validator_store", "lighthouse_version", "logging", "parking_lot 0.12.3", @@ -9754,6 +9782,7 @@ name = "validator_http_metrics" version = "0.1.0" dependencies = [ "health_metrics", + "lighthouse_validator_store", "lighthouse_version", "logging", "malloc_utils", @@ -9765,7 +9794,6 @@ dependencies = [ "types", "validator_metrics", "validator_services", - "validator_store", "warp", "warp_utils", ] @@ -9808,9 +9836,7 @@ version = "0.1.0" dependencies = [ "beacon_node_fallback", "bls", - "doppelganger_service", "either", - "environment", "eth2", "futures", "graffiti_file", @@ -9818,6 +9844,7 @@ dependencies = [ "parking_lot 0.12.3", "safe_arith", "slot_clock", + "task_executor", "tokio", "tracing", "tree_hash", @@ -9830,19 +9857,8 @@ dependencies = [ name = "validator_store" version = "0.1.0" dependencies = [ - "account_utils", - "doppelganger_service", - "initialized_validators", - "logging", - "parking_lot 0.12.3", - "serde", - "signing_method", "slashing_protection", - "slot_clock", - "task_executor", - "tracing", "types", - "validator_metrics", ] [[package]] @@ -10100,6 +10116,7 @@ dependencies = [ "eth2_network_config", "futures", "initialized_validators", + "lighthouse_validator_store", "logging", "parking_lot 0.12.3", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index 5afed88ee8..075552b281 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -96,11 +96,11 @@ members = [ "validator_client/http_api", "validator_client/http_metrics", "validator_client/initialized_validators", + "validator_client/lighthouse_validator_store", "validator_client/signing_method", "validator_client/slashing_protection", 
"validator_client/validator_metrics", "validator_client/validator_services", - "validator_client/validator_store", "validator_manager", ] @@ -228,7 +228,6 @@ compare_fields = { path = "common/compare_fields" } deposit_contract = { path = "common/deposit_contract" } directory = { path = "common/directory" } doppelganger_service = { path = "validator_client/doppelganger_service" } -validator_services = { path = "validator_client/validator_services" } environment = { path = "lighthouse/environment" } eth1 = { path = "beacon_node/eth1" } eth1_test_rig = { path = "testing/eth1_test_rig" } @@ -250,6 +249,7 @@ int_to_bytes = { path = "consensus/int_to_bytes" } kzg = { path = "crypto/kzg" } metrics = { path = "common/metrics" } lighthouse_network = { path = "beacon_node/lighthouse_network" } +lighthouse_validator_store = { path = "validator_client/lighthouse_validator_store" } lighthouse_version = { path = "common/lighthouse_version" } workspace_members = { path = "common/workspace_members" } lockfile = { path = "common/lockfile" } @@ -281,6 +281,7 @@ validator_dir = { path = "common/validator_dir" } validator_http_api = { path = "validator_client/http_api" } validator_http_metrics = { path = "validator_client/http_metrics" } validator_metrics = { path = "validator_client/validator_metrics" } +validator_services = { path = "validator_client/validator_services" } validator_store = { path = "validator_client/validator_store" } validator_test_rig = { path = "testing/validator_test_rig" } warp_utils = { path = "common/warp_utils" } diff --git a/consensus/types/src/attestation.rs b/consensus/types/src/attestation.rs index e769057182..e2973132b0 100644 --- a/consensus/types/src/attestation.rs +++ b/consensus/types/src/attestation.rs @@ -16,7 +16,7 @@ use super::{ Signature, SignedRoot, }; -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum Error { SszTypesError(ssz_types::Error), BitfieldError(ssz::BitfieldError), diff --git a/consensus/types/src/payload.rs 
b/consensus/types/src/payload.rs index abc9afd34c..c0262a2cf8 100644 --- a/consensus/types/src/payload.rs +++ b/consensus/types/src/payload.rs @@ -85,6 +85,7 @@ pub trait AbstractExecPayload: + TryInto + TryInto + TryInto + + Sync { type Ref<'a>: ExecPayload + Copy @@ -97,23 +98,28 @@ pub trait AbstractExecPayload: type Bellatrix: OwnedExecPayload + Into + for<'a> From>> - + TryFrom>; + + TryFrom> + + Sync; type Capella: OwnedExecPayload + Into + for<'a> From>> - + TryFrom>; + + TryFrom> + + Sync; type Deneb: OwnedExecPayload + Into + for<'a> From>> - + TryFrom>; + + TryFrom> + + Sync; type Electra: OwnedExecPayload + Into + for<'a> From>> - + TryFrom>; + + TryFrom> + + Sync; type Fulu: OwnedExecPayload + Into + for<'a> From>> - + TryFrom>; + + TryFrom> + + Sync; } #[superstruct( diff --git a/testing/web3signer_tests/Cargo.toml b/testing/web3signer_tests/Cargo.toml index 376aa13406..f68fa56e16 100644 --- a/testing/web3signer_tests/Cargo.toml +++ b/testing/web3signer_tests/Cargo.toml @@ -14,6 +14,7 @@ eth2_keystore = { workspace = true } eth2_network_config = { workspace = true } futures = { workspace = true } initialized_validators = { workspace = true } +lighthouse_validator_store = { workspace = true } logging = { workspace = true } parking_lot = { workspace = true } reqwest = { workspace = true } diff --git a/testing/web3signer_tests/src/lib.rs b/testing/web3signer_tests/src/lib.rs index 1eb14cf1d5..8678eff0ee 100644 --- a/testing/web3signer_tests/src/lib.rs +++ b/testing/web3signer_tests/src/lib.rs @@ -25,6 +25,7 @@ mod tests { use initialized_validators::{ load_pem_certificate, load_pkcs12_identity, InitializedValidators, }; + use lighthouse_validator_store::LighthouseValidatorStore; use parking_lot::Mutex; use reqwest::Client; use serde::Serialize; @@ -44,7 +45,7 @@ mod tests { use tokio::time::sleep; use types::{attestation::AttestationBase, *}; use url::Url; - use validator_store::{Error as ValidatorStoreError, ValidatorStore}; + use validator_store::{Error 
as ValidatorStoreError, SignedBlock, ValidatorStore}; /// If the we are unable to reach the Web3Signer HTTP API within this time out then we will /// assume it failed to start. @@ -73,6 +74,7 @@ mod tests { impl SignedObject for Signature {} impl SignedObject for Attestation {} impl SignedObject for SignedBeaconBlock {} + impl SignedObject for SignedBlock {} impl SignedObject for SignedAggregateAndProof {} impl SignedObject for SelectionProof {} impl SignedObject for SyncSelectionProof {} @@ -301,7 +303,7 @@ mod tests { /// A testing rig which holds a `ValidatorStore`. struct ValidatorStoreRig { - validator_store: Arc>, + validator_store: Arc>, _validator_dir: TempDir, runtime: Arc, _runtime_shutdown: async_channel::Sender<()>, @@ -352,12 +354,12 @@ mod tests { let slot_clock = TestingSlotClock::new(Slot::new(0), Duration::from_secs(0), Duration::from_secs(1)); - let config = validator_store::Config { + let config = lighthouse_validator_store::Config { enable_web3signer_slashing_protection: slashing_protection_config.local, ..Default::default() }; - let validator_store = ValidatorStore::<_, E>::new( + let validator_store = LighthouseValidatorStore::<_, E>::new( initialized_validators, slashing_protection, Hash256::repeat_byte(42), @@ -481,7 +483,7 @@ mod tests { generate_sig: F, ) -> Self where - F: Fn(PublicKeyBytes, Arc>) -> R, + F: Fn(PublicKeyBytes, Arc>) -> R, R: Future, // We use the `SignedObject` trait to white-list objects for comparison. This avoids // accidentally comparing something meaningless like a `()`. 
@@ -516,8 +518,8 @@ mod tests { web3signer_should_sign: bool, ) -> Self where - F: Fn(PublicKeyBytes, Arc>) -> R, - R: Future>, + F: Fn(PublicKeyBytes, Arc>) -> R, + R: Future>, { for validator_rig in &self.validator_rigs { let result = @@ -591,10 +593,10 @@ mod tests { .assert_signatures_match("beacon_block_base", |pubkey, validator_store| { let spec = spec.clone(); async move { - let block = BeaconBlock::Base(BeaconBlockBase::empty(&spec)); + let block = BeaconBlock::::Base(BeaconBlockBase::empty(&spec)); let block_slot = block.slot(); validator_store - .sign_block(pubkey, block, block_slot) + .sign_block(pubkey, block.into(), block_slot) .await .unwrap() } @@ -664,7 +666,11 @@ mod tests { let mut altair_block = BeaconBlockAltair::empty(&spec); altair_block.slot = altair_fork_slot; validator_store - .sign_block(pubkey, BeaconBlock::Altair(altair_block), altair_fork_slot) + .sign_block( + pubkey, + BeaconBlock::::Altair(altair_block).into(), + altair_fork_slot, + ) .await .unwrap() } @@ -749,7 +755,7 @@ mod tests { validator_store .sign_block( pubkey, - BeaconBlock::Bellatrix(bellatrix_block), + BeaconBlock::::Bellatrix(bellatrix_block).into(), bellatrix_fork_slot, ) .await @@ -805,7 +811,7 @@ mod tests { }; let first_block = || { - let mut bellatrix_block = BeaconBlockBellatrix::empty(&spec); + let mut bellatrix_block = BeaconBlockBellatrix::::empty(&spec); bellatrix_block.slot = bellatrix_fork_slot; BeaconBlock::Bellatrix(bellatrix_block) }; @@ -871,7 +877,7 @@ mod tests { let block = first_block(); let slot = block.slot(); validator_store - .sign_block(pubkey, block, slot) + .sign_block(pubkey, block.into(), slot) .await .unwrap() }) @@ -882,7 +888,7 @@ mod tests { let block = double_vote_block(); let slot = block.slot(); validator_store - .sign_block(pubkey, block, slot) + .sign_block(pubkey, block.into(), slot) .await .map(|_| ()) }, diff --git a/validator_client/Cargo.toml b/validator_client/Cargo.toml index 85517682bb..a8c8fd59f1 100644 --- 
a/validator_client/Cargo.toml +++ b/validator_client/Cargo.toml @@ -22,6 +22,7 @@ fdlimit = "0.3.0" graffiti_file = { workspace = true } hyper = { workspace = true } initialized_validators = { workspace = true } +lighthouse_validator_store = { workspace = true } metrics = { workspace = true } monitoring_api = { workspace = true } parking_lot = { workspace = true } diff --git a/validator_client/beacon_node_fallback/Cargo.toml b/validator_client/beacon_node_fallback/Cargo.toml index 4297bae15f..3bcb0d7034 100644 --- a/validator_client/beacon_node_fallback/Cargo.toml +++ b/validator_client/beacon_node_fallback/Cargo.toml @@ -10,18 +10,17 @@ path = "src/lib.rs" [dependencies] clap = { workspace = true } -environment = { workspace = true } eth2 = { workspace = true } futures = { workspace = true } itertools = { workspace = true } serde = { workspace = true } slot_clock = { workspace = true } strum = { workspace = true } +task_executor = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } types = { workspace = true } validator_metrics = { workspace = true } [dev-dependencies] -logging = { workspace = true } validator_test_rig = { workspace = true } diff --git a/validator_client/beacon_node_fallback/src/lib.rs b/validator_client/beacon_node_fallback/src/lib.rs index befc18c563..8d022f8e75 100644 --- a/validator_client/beacon_node_fallback/src/lib.rs +++ b/validator_client/beacon_node_fallback/src/lib.rs @@ -8,7 +8,6 @@ use beacon_node_health::{ IsOptimistic, SyncDistanceTier, }; use clap::ValueEnum; -use environment::RuntimeContext; use eth2::BeaconNodeHttpClient; use futures::future; use serde::{ser::SerializeStruct, Deserialize, Serialize, Serializer}; @@ -17,11 +16,11 @@ use std::cmp::Ordering; use std::fmt; use std::fmt::Debug; use std::future::Future; -use std::marker::PhantomData; use std::sync::Arc; use std::time::{Duration, Instant}; use std::vec::Vec; use strum::EnumVariantNames; +use task_executor::TaskExecutor; use 
tokio::{sync::RwLock, time::sleep}; use tracing::{debug, error, warn}; use types::{ChainSpec, Config as ConfigSpec, EthSpec, Slot}; @@ -61,17 +60,16 @@ pub struct LatencyMeasurement { /// /// See `SLOT_LOOKAHEAD` for information about when this should run. pub fn start_fallback_updater_service( - context: RuntimeContext, - beacon_nodes: Arc>, + executor: TaskExecutor, + beacon_nodes: Arc>, ) -> Result<(), &'static str> { - let executor = context.executor; if beacon_nodes.slot_clock.is_none() { return Err("Cannot start fallback updater without slot clock"); } let future = async move { loop { - beacon_nodes.update_all_candidates().await; + beacon_nodes.update_all_candidates::().await; let sleep_time = beacon_nodes .slot_clock @@ -186,29 +184,27 @@ impl Serialize for CandidateInfo { /// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used /// for a query. #[derive(Clone, Debug)] -pub struct CandidateBeaconNode { +pub struct CandidateBeaconNode { pub index: usize, pub beacon_node: BeaconNodeHttpClient, pub health: Arc>>, - _phantom: PhantomData, } -impl PartialEq for CandidateBeaconNode { +impl PartialEq for CandidateBeaconNode { fn eq(&self, other: &Self) -> bool { self.index == other.index && self.beacon_node == other.beacon_node } } -impl Eq for CandidateBeaconNode {} +impl Eq for CandidateBeaconNode {} -impl CandidateBeaconNode { +impl CandidateBeaconNode { /// Instantiate a new node. 
pub fn new(beacon_node: BeaconNodeHttpClient, index: usize) -> Self { Self { index, beacon_node, health: Arc::new(RwLock::new(Err(CandidateError::Uninitialized))), - _phantom: PhantomData, } } @@ -217,13 +213,13 @@ impl CandidateBeaconNode { *self.health.read().await } - pub async fn refresh_health( + pub async fn refresh_health( &self, distance_tiers: &BeaconNodeSyncDistanceTiers, slot_clock: Option<&T>, spec: &ChainSpec, ) -> Result<(), CandidateError> { - if let Err(e) = self.is_compatible(spec).await { + if let Err(e) = self.is_compatible::(spec).await { *self.health.write().await = Err(e); return Err(e); } @@ -287,7 +283,7 @@ impl CandidateBeaconNode { } /// Checks if the node has the correct specification. - async fn is_compatible(&self, spec: &ChainSpec) -> Result<(), CandidateError> { + async fn is_compatible(&self, spec: &ChainSpec) -> Result<(), CandidateError> { let config = self .beacon_node .get_config_spec::() @@ -372,17 +368,17 @@ impl CandidateBeaconNode { /// behaviour, where the failure of one candidate results in the next candidate receiving an /// identical query. #[derive(Clone, Debug)] -pub struct BeaconNodeFallback { - pub candidates: Arc>>>, +pub struct BeaconNodeFallback { + pub candidates: Arc>>, distance_tiers: BeaconNodeSyncDistanceTiers, slot_clock: Option, broadcast_topics: Vec, spec: Arc, } -impl BeaconNodeFallback { +impl BeaconNodeFallback { pub fn new( - candidates: Vec>, + candidates: Vec, config: Config, broadcast_topics: Vec, spec: Arc, @@ -464,7 +460,7 @@ impl BeaconNodeFallback { /// It is possible for a node to return an unsynced status while continuing to serve /// low quality responses. To route around this it's best to poll all connected beacon nodes. /// A previous implementation of this function polled only the unavailable BNs. - pub async fn update_all_candidates(&self) { + pub async fn update_all_candidates(&self) { // Clone the vec, so we release the read lock immediately. 
// `candidate.health` is behind an Arc, so this would still allow us to mutate the values. let candidates = self.candidates.read().await.clone(); @@ -472,7 +468,7 @@ impl BeaconNodeFallback { let mut nodes = Vec::with_capacity(candidates.len()); for candidate in candidates.iter() { - futures.push(candidate.refresh_health( + futures.push(candidate.refresh_health::( &self.distance_tiers, self.slot_clock.as_ref(), &self.spec, @@ -675,7 +671,7 @@ impl BeaconNodeFallback { } /// Helper functions to allow sorting candidate nodes by health. -async fn sort_nodes_by_health(nodes: &mut Vec>) { +async fn sort_nodes_by_health(nodes: &mut Vec) { // Fetch all health values. let health_results: Vec> = future::join_all(nodes.iter().map(|node| node.health())).await; @@ -693,7 +689,7 @@ async fn sort_nodes_by_health(nodes: &mut Vec }); // Reorder candidates based on the sorted indices. - let sorted_nodes: Vec> = indices_with_health + let sorted_nodes: Vec = indices_with_health .into_iter() .map(|(index, _)| nodes[index].clone()) .collect(); @@ -752,7 +748,7 @@ mod tests { let optimistic_status = IsOptimistic::No; let execution_status = ExecutionEngineHealth::Healthy; - fn new_candidate(index: usize) -> CandidateBeaconNode { + fn new_candidate(index: usize) -> CandidateBeaconNode { let beacon_node = BeaconNodeHttpClient::new( SensitiveUrl::parse(&format!("http://example_{index}.com")).unwrap(), Timeouts::set_all(Duration::from_secs(index as u64)), @@ -859,21 +855,21 @@ mod tests { async fn new_mock_beacon_node( index: usize, spec: &ChainSpec, - ) -> (MockBeaconNode, CandidateBeaconNode) { + ) -> (MockBeaconNode, CandidateBeaconNode) { let mut mock_beacon_node = MockBeaconNode::::new().await; mock_beacon_node.mock_config_spec(spec); let beacon_node = - CandidateBeaconNode::::new(mock_beacon_node.beacon_api_client.clone(), index); + CandidateBeaconNode::new(mock_beacon_node.beacon_api_client.clone(), index); (mock_beacon_node, beacon_node) } fn create_beacon_node_fallback( - 
candidates: Vec>, + candidates: Vec, topics: Vec, spec: Arc, - ) -> BeaconNodeFallback { + ) -> BeaconNodeFallback { let mut beacon_node_fallback = BeaconNodeFallback::new(candidates, Config::default(), topics, spec); @@ -929,7 +925,7 @@ mod tests { sync_distance: Slot::new(0), }); - beacon_node_fallback.update_all_candidates().await; + beacon_node_fallback.update_all_candidates::().await; let candidates = beacon_node_fallback.candidates.read().await; assert_eq!( diff --git a/validator_client/doppelganger_service/Cargo.toml b/validator_client/doppelganger_service/Cargo.toml index 803dd94322..e5b183570d 100644 --- a/validator_client/doppelganger_service/Cargo.toml +++ b/validator_client/doppelganger_service/Cargo.toml @@ -15,6 +15,7 @@ task_executor = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } types = { workspace = true } +validator_store = { workspace = true } [dev-dependencies] futures = { workspace = true } diff --git a/validator_client/doppelganger_service/src/lib.rs b/validator_client/doppelganger_service/src/lib.rs index cb81b3ffc2..e3c7ce78b4 100644 --- a/validator_client/doppelganger_service/src/lib.rs +++ b/validator_client/doppelganger_service/src/lib.rs @@ -42,68 +42,7 @@ use task_executor::ShutdownReason; use tokio::time::sleep; use tracing::{error, info}; use types::{Epoch, EthSpec, PublicKeyBytes, Slot}; - -/// A wrapper around `PublicKeyBytes` which encodes information about the status of a validator -/// pubkey with regards to doppelganger protection. -#[derive(Debug, PartialEq)] -pub enum DoppelgangerStatus { - /// Doppelganger protection has approved this for signing. - /// - /// This is because the service has waited some period of time to - /// detect other instances of this key on the network. - SigningEnabled(PublicKeyBytes), - /// Doppelganger protection is still waiting to detect other instances. - /// - /// Do not use this pubkey for signing slashable messages!! 
- /// - /// However, it can safely be used for other non-slashable operations (e.g., collecting duties - /// or subscribing to subnets). - SigningDisabled(PublicKeyBytes), - /// This pubkey is unknown to the doppelganger service. - /// - /// This represents a serious internal error in the program. This validator will be permanently - /// disabled! - UnknownToDoppelganger(PublicKeyBytes), -} - -impl DoppelgangerStatus { - /// Only return a pubkey if it is explicitly safe for doppelganger protection. - /// - /// If `Some(pubkey)` is returned, doppelganger has declared it safe for signing. - /// - /// ## Note - /// - /// "Safe" is only best-effort by doppelganger. There is no guarantee that a doppelganger - /// doesn't exist. - pub fn only_safe(self) -> Option { - match self { - DoppelgangerStatus::SigningEnabled(pubkey) => Some(pubkey), - DoppelgangerStatus::SigningDisabled(_) => None, - DoppelgangerStatus::UnknownToDoppelganger(_) => None, - } - } - - /// Returns a key regardless of whether or not doppelganger has approved it. Such a key might be - /// used for signing non-slashable messages, duties collection or other activities. - /// - /// If the validator is unknown to doppelganger then `None` will be returned. - pub fn ignored(self) -> Option { - match self { - DoppelgangerStatus::SigningEnabled(pubkey) => Some(pubkey), - DoppelgangerStatus::SigningDisabled(pubkey) => Some(pubkey), - DoppelgangerStatus::UnknownToDoppelganger(_) => None, - } - } - - /// Only return a pubkey if it will not be used for signing due to doppelganger detection. 
- pub fn only_unsafe(self) -> Option { - match self { - DoppelgangerStatus::SigningEnabled(_) => None, - DoppelgangerStatus::SigningDisabled(pubkey) => Some(pubkey), - DoppelgangerStatus::UnknownToDoppelganger(pubkey) => Some(pubkey), - } - } -} +use validator_store::{DoppelgangerStatus, ValidatorStore}; struct LivenessResponses { current_epoch_responses: Vec, @@ -114,13 +53,6 @@ struct LivenessResponses { /// validators on the network. pub const DEFAULT_REMAINING_DETECTION_EPOCHS: u64 = 1; -/// This crate cannot depend on ValidatorStore as validator_store depends on this crate and -/// initialises the doppelganger protection. For this reason, we abstract the validator store -/// functions this service needs through the following trait -pub trait DoppelgangerValidatorStore { - fn get_validator_index(&self, pubkey: &PublicKeyBytes) -> Option; -} - /// Store the per-validator status of doppelganger checking. #[derive(Debug, PartialEq)] pub struct DoppelgangerState { @@ -163,8 +95,8 @@ impl DoppelgangerState { /// If the BN fails to respond to either of these requests, simply return an empty response. /// This behaviour is to help prevent spurious failures on the BN from needlessly preventing /// doppelganger progression. -async fn beacon_node_liveness( - beacon_nodes: Arc>, +async fn beacon_node_liveness( + beacon_nodes: Arc>, current_epoch: Epoch, validator_indices: Vec, ) -> LivenessResponses { @@ -280,20 +212,20 @@ impl DoppelgangerService { service: Arc, context: RuntimeContext, validator_store: Arc, - beacon_nodes: Arc>, + beacon_nodes: Arc>, slot_clock: T, ) -> Result<(), String> where E: EthSpec, T: 'static + SlotClock, - V: DoppelgangerValidatorStore + Send + Sync + 'static, + V: ValidatorStore + Send + Sync + 'static, { // Define the `get_index` function as one that uses the validator store. 
- let get_index = move |pubkey| validator_store.get_validator_index(&pubkey); + let get_index = move |pubkey| validator_store.validator_index(&pubkey); // Define the `get_liveness` function as one that queries the beacon node API. let get_liveness = move |current_epoch, validator_indices| { - beacon_node_liveness(beacon_nodes.clone(), current_epoch, validator_indices) + beacon_node_liveness::(beacon_nodes.clone(), current_epoch, validator_indices) }; let mut shutdown_sender = context.executor.shutdown_sender(); @@ -378,17 +310,18 @@ impl DoppelgangerService { /// /// Validators added during the genesis epoch will not have doppelganger protection applied to /// them. - pub fn register_new_validator( + pub fn register_new_validator( &self, validator: PublicKeyBytes, slot_clock: &T, + slots_per_epoch: u64, ) -> Result<(), String> { let current_epoch = slot_clock // If registering before genesis, use the genesis slot. .now_or_genesis() .ok_or_else(|| "Unable to read slot clock when registering validator".to_string())? - .epoch(E::slots_per_epoch()); - let genesis_epoch = slot_clock.genesis_slot().epoch(E::slots_per_epoch()); + .epoch(slots_per_epoch); + let genesis_epoch = slot_clock.genesis_slot().epoch(slots_per_epoch); let remaining_epochs = if current_epoch <= genesis_epoch { // Disable doppelganger protection when the validator was initialized before genesis. 
@@ -673,6 +606,7 @@ mod test { test_utils::{SeedableRng, TestRandom, XorShiftRng}, MainnetEthSpec, }; + use validator_store::DoppelgangerStatus; const DEFAULT_VALIDATORS: usize = 8; @@ -773,7 +707,7 @@ mod test { .expect("index should exist"); self.doppelganger - .register_new_validator::(pubkey, &self.slot_clock) + .register_new_validator(pubkey, &self.slot_clock, E::slots_per_epoch()) .unwrap(); self.doppelganger .doppelganger_states diff --git a/validator_client/http_api/Cargo.toml b/validator_client/http_api/Cargo.toml index 482212d890..588aa2ca93 100644 --- a/validator_client/http_api/Cargo.toml +++ b/validator_client/http_api/Cargo.toml @@ -16,13 +16,14 @@ deposit_contract = { workspace = true } directory = { workspace = true } dirs = { workspace = true } doppelganger_service = { workspace = true } -eth2 = { workspace = true } -eth2_keystore = { workspace = true } +eth2 = { workspace = true } +eth2_keystore = { workspace = true } ethereum_serde_utils = { workspace = true } filesystem = { workspace = true } graffiti_file = { workspace = true } health_metrics = { workspace = true } initialized_validators = { workspace = true } +lighthouse_validator_store = { workspace = true } lighthouse_version = { workspace = true } logging = { workspace = true } parking_lot = { workspace = true } @@ -32,19 +33,19 @@ serde = { workspace = true } serde_json = { workspace = true } signing_method = { workspace = true } slashing_protection = { workspace = true } -slot_clock = { workspace = true } -sysinfo = { workspace = true } -system_health = { workspace = true } -task_executor = { workspace = true } -tempfile = { workspace = true } -tokio = { workspace = true } -tokio-stream = { workspace = true } -tracing = { workspace = true } -types = { workspace = true } -url = { workspace = true } -validator_dir = { workspace = true } -validator_services = { workspace = true } -validator_store = { workspace = true } +slot_clock = { workspace = true } +sysinfo = { workspace = true } 
+system_health = { workspace = true } +task_executor = { workspace = true } +tempfile = { workspace = true } +tokio = { workspace = true } +tokio-stream = { workspace = true } +tracing = { workspace = true } +types = { workspace = true } +url = { workspace = true } +validator_dir = { workspace = true } +validator_services = { workspace = true } +validator_store = { workspace = true } warp = { workspace = true } warp_utils = { workspace = true } zeroize = { workspace = true } diff --git a/validator_client/http_api/src/create_signed_voluntary_exit.rs b/validator_client/http_api/src/create_signed_voluntary_exit.rs index 7a9dc798d6..b536a6aa7a 100644 --- a/validator_client/http_api/src/create_signed_voluntary_exit.rs +++ b/validator_client/http_api/src/create_signed_voluntary_exit.rs @@ -1,5 +1,6 @@ use bls::{PublicKey, PublicKeyBytes}; use eth2::types::GenericResponse; +use lighthouse_validator_store::LighthouseValidatorStore; use slot_clock::SlotClock; use std::sync::Arc; use tracing::info; @@ -9,7 +10,7 @@ use validator_store::ValidatorStore; pub async fn create_signed_voluntary_exit( pubkey: PublicKey, maybe_epoch: Option, - validator_store: Arc>, + validator_store: Arc>, slot_clock: T, ) -> Result, warp::Rejection> { let epoch = match maybe_epoch { diff --git a/validator_client/http_api/src/create_validator.rs b/validator_client/http_api/src/create_validator.rs index f90a1057a4..278274198d 100644 --- a/validator_client/http_api/src/create_validator.rs +++ b/validator_client/http_api/src/create_validator.rs @@ -5,12 +5,11 @@ use account_utils::{ random_mnemonic, random_password, }; use eth2::lighthouse_vc::types::{self as api_types}; +use lighthouse_validator_store::LighthouseValidatorStore; use slot_clock::SlotClock; use std::path::{Path, PathBuf}; -use types::ChainSpec; -use types::EthSpec; +use types::{ChainSpec, EthSpec}; use validator_dir::{keystore_password_path, Builder as ValidatorDirBuilder}; -use validator_store::ValidatorStore; use zeroize::Zeroizing; 
/// Create some validator EIP-2335 keystores and store them on disk. Then, enroll the validators in @@ -30,7 +29,7 @@ pub async fn create_validators_mnemonic, T: 'static + SlotClock, validator_requests: &[api_types::ValidatorRequest], validator_dir: P, secrets_dir: Option, - validator_store: &ValidatorStore, + validator_store: &LighthouseValidatorStore, spec: &ChainSpec, ) -> Result<(Vec, Mnemonic), warp::Rejection> { let mnemonic = mnemonic_opt.unwrap_or_else(random_mnemonic); @@ -178,7 +177,7 @@ pub async fn create_validators_mnemonic, T: 'static + SlotClock, pub async fn create_validators_web3signer( validators: Vec, - validator_store: &ValidatorStore, + validator_store: &LighthouseValidatorStore, ) -> Result<(), warp::Rejection> { for validator in validators { validator_store diff --git a/validator_client/http_api/src/graffiti.rs b/validator_client/http_api/src/graffiti.rs index 86238a697c..4372b14b04 100644 --- a/validator_client/http_api/src/graffiti.rs +++ b/validator_client/http_api/src/graffiti.rs @@ -1,12 +1,12 @@ use bls::PublicKey; +use lighthouse_validator_store::LighthouseValidatorStore; use slot_clock::SlotClock; use std::sync::Arc; use types::{graffiti::GraffitiString, EthSpec, Graffiti}; -use validator_store::ValidatorStore; pub fn get_graffiti( validator_pubkey: PublicKey, - validator_store: Arc>, + validator_store: Arc>, graffiti_flag: Option, ) -> Result { let initialized_validators_rw_lock = validator_store.initialized_validators(); @@ -29,7 +29,7 @@ pub fn get_graffiti( pub fn set_graffiti( validator_pubkey: PublicKey, graffiti: GraffitiString, - validator_store: Arc>, + validator_store: Arc>, ) -> Result<(), warp::Rejection> { let initialized_validators_rw_lock = validator_store.initialized_validators(); let mut initialized_validators = initialized_validators_rw_lock.write(); @@ -55,7 +55,7 @@ pub fn set_graffiti( pub fn delete_graffiti( validator_pubkey: PublicKey, - validator_store: Arc>, + validator_store: Arc>, ) -> Result<(), 
warp::Rejection> { let initialized_validators_rw_lock = validator_store.initialized_validators(); let mut initialized_validators = initialized_validators_rw_lock.write(); diff --git a/validator_client/http_api/src/keystores.rs b/validator_client/http_api/src/keystores.rs index c2bcfe5ab4..302b21d7d8 100644 --- a/validator_client/http_api/src/keystores.rs +++ b/validator_client/http_api/src/keystores.rs @@ -10,6 +10,7 @@ use eth2::lighthouse_vc::{ }; use eth2_keystore::Keystore; use initialized_validators::{Error, InitializedValidators}; +use lighthouse_validator_store::LighthouseValidatorStore; use signing_method::SigningMethod; use slot_clock::SlotClock; use std::path::PathBuf; @@ -19,13 +20,12 @@ use tokio::runtime::Handle; use tracing::{info, warn}; use types::{EthSpec, PublicKeyBytes}; use validator_dir::{keystore_password_path, Builder as ValidatorDirBuilder}; -use validator_store::ValidatorStore; use warp::Rejection; use warp_utils::reject::{custom_bad_request, custom_server_error}; use zeroize::Zeroizing; pub fn list( - validator_store: Arc>, + validator_store: Arc>, ) -> ListKeystoresResponse { let initialized_validators_rwlock = validator_store.initialized_validators(); let initialized_validators = initialized_validators_rwlock.read(); @@ -62,7 +62,7 @@ pub fn import( request: ImportKeystoresRequest, validator_dir: PathBuf, secrets_dir: Option, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor, ) -> Result { // Check request validity. This is the only cases in which we should return a 4xx code. @@ -117,7 +117,7 @@ pub fn import( ) } else if let Some(handle) = task_executor.handle() { // Import the keystore. 
- match import_single_keystore( + match import_single_keystore::<_, E>( keystore, password, validator_dir.clone(), @@ -164,7 +164,7 @@ fn import_single_keystore( password: Zeroizing, validator_dir_path: PathBuf, secrets_dir: Option, - validator_store: &ValidatorStore, + validator_store: &LighthouseValidatorStore, handle: Handle, ) -> Result { // Check if the validator key already exists, erroring if it is a remote signer validator. @@ -234,7 +234,7 @@ fn import_single_keystore( pub fn delete( request: DeleteKeystoresRequest, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor, ) -> Result { let export_response = export(request, validator_store, task_executor)?; @@ -265,7 +265,7 @@ pub fn delete( pub fn export( request: DeleteKeystoresRequest, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor, ) -> Result { // Remove from initialized validators. diff --git a/validator_client/http_api/src/lib.rs b/validator_client/http_api/src/lib.rs index a6c9eba752..aebe179567 100644 --- a/validator_client/http_api/src/lib.rs +++ b/validator_client/http_api/src/lib.rs @@ -13,6 +13,7 @@ use graffiti::{delete_graffiti, get_graffiti, set_graffiti}; use create_signed_voluntary_exit::create_signed_voluntary_exit; use graffiti_file::{determine_graffiti, GraffitiFile}; +use lighthouse_validator_store::LighthouseValidatorStore; use validator_store::ValidatorStore; use account_utils::{ @@ -41,7 +42,6 @@ use serde::{Deserialize, Serialize}; use slot_clock::SlotClock; use std::collections::HashMap; use std::future::Future; -use std::marker::PhantomData; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::PathBuf; use std::sync::Arc; @@ -77,11 +77,11 @@ impl From for Error { /// A wrapper around all the items required to spawn the HTTP server. /// /// The server will gracefully handle the case where any fields are `None`. 
-pub struct Context { +pub struct Context { pub task_executor: TaskExecutor, pub api_secret: ApiSecret, - pub block_service: Option>, - pub validator_store: Option>>, + pub block_service: Option, T>>, + pub validator_store: Option>>, pub validator_dir: Option, pub secrets_dir: Option, pub graffiti_file: Option, @@ -90,7 +90,6 @@ pub struct Context { pub config: Config, pub sse_logging_components: Option, pub slot_clock: T, - pub _phantom: PhantomData, } /// Configuration for the HTTP server. @@ -320,7 +319,7 @@ pub fn serve( .and(warp::path("validators")) .and(warp::path::end()) .and(validator_store_filter.clone()) - .then(|validator_store: Arc>| { + .then(|validator_store: Arc>| { blocking_json_task(move || { let validators = validator_store .initialized_validators() @@ -345,7 +344,7 @@ pub fn serve( .and(warp::path::end()) .and(validator_store_filter.clone()) .then( - |validator_pubkey: PublicKey, validator_store: Arc>| { + |validator_pubkey: PublicKey, validator_store: Arc>| { blocking_json_task(move || { let validator = validator_store .initialized_validators() @@ -395,7 +394,7 @@ pub fn serve( .and(graffiti_file_filter.clone()) .and(graffiti_flag_filter) .then( - |validator_store: Arc>, + |validator_store: Arc>, graffiti_file: Option, graffiti_flag: Option| { blocking_json_task(move || { @@ -424,33 +423,35 @@ pub fn serve( .and(warp::path("health")) .and(warp::path::end()) .and(block_service_filter.clone()) - .then(|block_filter: BlockService| async move { - let mut result: HashMap> = HashMap::new(); + .then( + |block_filter: BlockService, T>| async move { + let mut result: HashMap> = HashMap::new(); - let mut beacon_nodes = Vec::new(); - for node in &*block_filter.beacon_nodes.candidates.read().await { - beacon_nodes.push(CandidateInfo { - index: node.index, - endpoint: node.beacon_node.to_string(), - health: *node.health.read().await, - }); - } - result.insert("beacon_nodes".to_string(), beacon_nodes); - - if let Some(proposer_nodes_list) = 
&block_filter.proposer_nodes { - let mut proposer_nodes = Vec::new(); - for node in &*proposer_nodes_list.candidates.read().await { - proposer_nodes.push(CandidateInfo { + let mut beacon_nodes = Vec::new(); + for node in &*block_filter.beacon_nodes.candidates.read().await { + beacon_nodes.push(CandidateInfo { index: node.index, endpoint: node.beacon_node.to_string(), health: *node.health.read().await, }); } - result.insert("proposer_nodes".to_string(), proposer_nodes); - } + result.insert("beacon_nodes".to_string(), beacon_nodes); - blocking_json_task(move || Ok(api_types::GenericResponse::from(result))).await - }); + if let Some(proposer_nodes_list) = &block_filter.proposer_nodes { + let mut proposer_nodes = Vec::new(); + for node in &*proposer_nodes_list.candidates.read().await { + proposer_nodes.push(CandidateInfo { + index: node.index, + endpoint: node.beacon_node.to_string(), + health: *node.health.read().await, + }); + } + result.insert("proposer_nodes".to_string(), proposer_nodes); + } + + blocking_json_task(move || Ok(api_types::GenericResponse::from(result))).await + }, + ); // POST lighthouse/validators/ let post_validators = warp::path("lighthouse") @@ -466,14 +467,14 @@ pub fn serve( move |body: Vec, validator_dir: PathBuf, secrets_dir: PathBuf, - validator_store: Arc>, + validator_store: Arc>, spec: Arc, task_executor: TaskExecutor| { blocking_json_task(move || { let secrets_dir = store_passwords_in_secrets_dir.then_some(secrets_dir); if let Some(handle) = task_executor.handle() { let (validators, mnemonic) = - handle.block_on(create_validators_mnemonic( + handle.block_on(create_validators_mnemonic::<_, _, E>( None, None, &body, @@ -511,7 +512,7 @@ pub fn serve( move |body: api_types::CreateValidatorsMnemonicRequest, validator_dir: PathBuf, secrets_dir: PathBuf, - validator_store: Arc>, + validator_store: Arc>, spec: Arc, task_executor: TaskExecutor| { blocking_json_task(move || { @@ -525,7 +526,7 @@ pub fn serve( )) })?; let (validators, _mnemonic) = 
- handle.block_on(create_validators_mnemonic( + handle.block_on(create_validators_mnemonic::<_, _, E>( Some(mnemonic), Some(body.key_derivation_path_offset), &body.validators, @@ -558,7 +559,7 @@ pub fn serve( move |body: api_types::KeystoreValidatorsPostRequest, validator_dir: PathBuf, secrets_dir: PathBuf, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor| { blocking_json_task(move || { // Check to ensure the password is correct. @@ -644,7 +645,7 @@ pub fn serve( .and(task_executor_filter.clone()) .then( |body: Vec, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor| { blocking_json_task(move || { if let Some(handle) = task_executor.handle() { @@ -672,7 +673,7 @@ pub fn serve( ), }) .collect(); - handle.block_on(create_validators_web3signer( + handle.block_on(create_validators_web3signer::<_, E>( web3signers, &validator_store, ))?; @@ -698,7 +699,7 @@ pub fn serve( .then( |validator_pubkey: PublicKey, body: api_types::ValidatorPatchRequest, - validator_store: Arc>, + validator_store: Arc>, graffiti_file: Option, task_executor: TaskExecutor| { blocking_json_task(move || { @@ -851,7 +852,7 @@ pub fn serve( .and(warp::path::end()) .and(validator_store_filter.clone()) .then( - |validator_pubkey: PublicKey, validator_store: Arc>| { + |validator_pubkey: PublicKey, validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -892,7 +893,7 @@ pub fn serve( .then( |validator_pubkey: PublicKey, request: api_types::UpdateFeeRecipientRequest, - validator_store: Arc>| { + validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -928,7 +929,7 @@ pub fn serve( .and(warp::path::end()) .and(validator_store_filter.clone()) .then( - |validator_pubkey: PublicKey, validator_store: Arc>| { + |validator_pubkey: PublicKey, validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -964,7 +965,7 @@ pub fn 
serve( .and(warp::path::end()) .and(validator_store_filter.clone()) .then( - |validator_pubkey: PublicKey, validator_store: Arc>| { + |validator_pubkey: PublicKey, validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -997,7 +998,7 @@ pub fn serve( .then( |validator_pubkey: PublicKey, request: api_types::UpdateGasLimitRequest, - validator_store: Arc>| { + validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -1033,7 +1034,7 @@ pub fn serve( .and(warp::path::end()) .and(validator_store_filter.clone()) .then( - |validator_pubkey: PublicKey, validator_store: Arc>| { + |validator_pubkey: PublicKey, validator_store: Arc>| { blocking_json_task(move || { if validator_store .initialized_validators() @@ -1074,13 +1075,13 @@ pub fn serve( .then( |pubkey: PublicKey, query: api_types::VoluntaryExitQuery, - validator_store: Arc>, + validator_store: Arc>, slot_clock: T, task_executor: TaskExecutor| { blocking_json_task(move || { if let Some(handle) = task_executor.handle() { let signed_voluntary_exit = - handle.block_on(create_signed_voluntary_exit( + handle.block_on(create_signed_voluntary_exit::( pubkey, query.epoch, validator_store, @@ -1106,7 +1107,7 @@ pub fn serve( .and(graffiti_flag_filter) .then( |pubkey: PublicKey, - validator_store: Arc>, + validator_store: Arc>, graffiti_flag: Option| { blocking_json_task(move || { let graffiti = get_graffiti(pubkey.clone(), validator_store, graffiti_flag)?; @@ -1130,7 +1131,7 @@ pub fn serve( .then( |pubkey: PublicKey, query: SetGraffitiRequest, - validator_store: Arc>, + validator_store: Arc>, graffiti_file: Option| { blocking_json_task(move || { if graffiti_file.is_some() { @@ -1155,7 +1156,7 @@ pub fn serve( .and(graffiti_file_filter.clone()) .then( |pubkey: PublicKey, - validator_store: Arc>, + validator_store: Arc>, graffiti_file: Option| { blocking_json_task(move || { if graffiti_file.is_some() { @@ -1172,7 +1173,7 @@ pub fn serve( 
// GET /eth/v1/keystores let get_std_keystores = std_keystores.and(validator_store_filter.clone()).then( - |validator_store: Arc>| { + |validator_store: Arc>| { blocking_json_task(move || Ok(keystores::list(validator_store))) }, ); @@ -1188,7 +1189,7 @@ pub fn serve( move |request, validator_dir, secrets_dir, validator_store, task_executor| { let secrets_dir = store_passwords_in_secrets_dir.then_some(secrets_dir); blocking_json_task(move || { - keystores::import( + keystores::import::<_, E>( request, validator_dir, secrets_dir, @@ -1210,7 +1211,7 @@ pub fn serve( // GET /eth/v1/remotekeys let get_std_remotekeys = std_remotekeys.and(validator_store_filter.clone()).then( - |validator_store: Arc>| { + |validator_store: Arc>| { blocking_json_task(move || Ok(remotekeys::list(validator_store))) }, ); @@ -1221,7 +1222,9 @@ pub fn serve( .and(validator_store_filter.clone()) .and(task_executor_filter.clone()) .then(|request, validator_store, task_executor| { - blocking_json_task(move || remotekeys::import(request, validator_store, task_executor)) + blocking_json_task(move || { + remotekeys::import::<_, E>(request, validator_store, task_executor) + }) }); // DELETE /eth/v1/remotekeys diff --git a/validator_client/http_api/src/remotekeys.rs b/validator_client/http_api/src/remotekeys.rs index 49d666f303..5aa63baac3 100644 --- a/validator_client/http_api/src/remotekeys.rs +++ b/validator_client/http_api/src/remotekeys.rs @@ -8,6 +8,7 @@ use eth2::lighthouse_vc::std_types::{ ListRemotekeysResponse, SingleListRemotekeysResponse, Status, }; use initialized_validators::{Error, InitializedValidators}; +use lighthouse_validator_store::LighthouseValidatorStore; use slot_clock::SlotClock; use std::sync::Arc; use task_executor::TaskExecutor; @@ -15,12 +16,11 @@ use tokio::runtime::Handle; use tracing::{info, warn}; use types::{EthSpec, PublicKeyBytes}; use url::Url; -use validator_store::ValidatorStore; use warp::Rejection; use warp_utils::reject::custom_server_error; pub fn list( - 
validator_store: Arc>, + validator_store: Arc>, ) -> ListRemotekeysResponse { let initialized_validators_rwlock = validator_store.initialized_validators(); let initialized_validators = initialized_validators_rwlock.read(); @@ -50,7 +50,7 @@ pub fn list( pub fn import( request: ImportRemotekeysRequest, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor, ) -> Result { info!( @@ -63,8 +63,12 @@ pub fn import( for remotekey in request.remote_keys { let status = if let Some(handle) = task_executor.handle() { // Import the keystore. - match import_single_remotekey(remotekey.pubkey, remotekey.url, &validator_store, handle) - { + match import_single_remotekey::<_, E>( + remotekey.pubkey, + remotekey.url, + &validator_store, + handle, + ) { Ok(status) => Status::ok(status), Err(e) => { warn!( @@ -89,7 +93,7 @@ pub fn import( fn import_single_remotekey( pubkey: PublicKeyBytes, url: String, - validator_store: &ValidatorStore, + validator_store: &LighthouseValidatorStore, handle: Handle, ) -> Result { if let Err(url_err) = Url::parse(&url) { @@ -143,7 +147,7 @@ fn import_single_remotekey( pub fn delete( request: DeleteRemotekeysRequest, - validator_store: Arc>, + validator_store: Arc>, task_executor: TaskExecutor, ) -> Result { info!( diff --git a/validator_client/http_api/src/test_utils.rs b/validator_client/http_api/src/test_utils.rs index 4a5d3b6cc7..08447a82ce 100644 --- a/validator_client/http_api/src/test_utils.rs +++ b/validator_client/http_api/src/test_utils.rs @@ -14,19 +14,19 @@ use eth2::{ use eth2_keystore::KeystoreBuilder; use initialized_validators::key_cache::{KeyCache, CACHE_FILENAME}; use initialized_validators::{InitializedValidators, OnDecryptFailure}; +use lighthouse_validator_store::{Config as ValidatorStoreConfig, LighthouseValidatorStore}; use parking_lot::RwLock; use sensitive_url::SensitiveUrl; use slashing_protection::{SlashingDatabase, SLASHING_PROTECTION_FILENAME}; use slot_clock::{SlotClock, TestingSlotClock}; use 
std::future::Future; -use std::marker::PhantomData; use std::net::{IpAddr, Ipv4Addr}; use std::sync::Arc; use std::time::Duration; use task_executor::test_utils::TestRuntime; use tempfile::{tempdir, TempDir}; use tokio::sync::oneshot; -use validator_store::{Config as ValidatorStoreConfig, ValidatorStore}; +use validator_services::block_service::BlockService; use zeroize::Zeroizing; pub const PASSWORD_BYTES: &[u8] = &[42, 50, 37]; @@ -54,7 +54,7 @@ pub struct Web3SignerValidatorScenario { pub struct ApiTester { pub client: ValidatorClientHttpClient, pub initialized_validators: Arc>, - pub validator_store: Arc>, + pub validator_store: Arc>, pub url: SensitiveUrl, pub api_token: String, pub test_runtime: TestRuntime, @@ -101,7 +101,7 @@ impl ApiTester { let test_runtime = TestRuntime::default(); - let validator_store = Arc::new(ValidatorStore::<_, E>::new( + let validator_store = Arc::new(LighthouseValidatorStore::new( initialized_validators, slashing_protection, Hash256::repeat_byte(42), @@ -121,7 +121,7 @@ impl ApiTester { let context = Arc::new(Context { task_executor: test_runtime.task_executor.clone(), api_secret, - block_service: None, + block_service: None::, _>>, validator_dir: Some(validator_dir.path().into()), secrets_dir: Some(secrets_dir.path().into()), validator_store: Some(validator_store.clone()), @@ -131,7 +131,6 @@ impl ApiTester { config: http_config, sse_logging_components: None, slot_clock, - _phantom: PhantomData, }); let ctx = context; let (shutdown_tx, shutdown_rx) = oneshot::channel(); @@ -139,7 +138,7 @@ impl ApiTester { // It's not really interesting why this triggered, just that it happened. 
let _ = shutdown_rx.await; }; - let (listening_socket, server) = super::serve(ctx, server_shutdown).unwrap(); + let (listening_socket, server) = super::serve::<_, E>(ctx, server_shutdown).unwrap(); tokio::spawn(server); @@ -638,7 +637,7 @@ impl ApiTester { assert_eq!( self.validator_store - .get_builder_proposals(&validator.voting_pubkey), + .get_builder_proposals_testing_only(&validator.voting_pubkey), builder_proposals ); diff --git a/validator_client/http_api/src/tests.rs b/validator_client/http_api/src/tests.rs index 5468718fb5..4b1a3c0059 100644 --- a/validator_client/http_api/src/tests.rs +++ b/validator_client/http_api/src/tests.rs @@ -18,12 +18,12 @@ use eth2::{ Error as ApiError, }; use eth2_keystore::KeystoreBuilder; +use lighthouse_validator_store::{Config as ValidatorStoreConfig, LighthouseValidatorStore}; use parking_lot::RwLock; use sensitive_url::SensitiveUrl; use slashing_protection::{SlashingDatabase, SLASHING_PROTECTION_FILENAME}; use slot_clock::{SlotClock, TestingSlotClock}; use std::future::Future; -use std::marker::PhantomData; use std::net::{IpAddr, Ipv4Addr}; use std::str::FromStr; use std::sync::Arc; @@ -31,7 +31,7 @@ use std::time::Duration; use task_executor::test_utils::TestRuntime; use tempfile::{tempdir, TempDir}; use types::graffiti::GraffitiString; -use validator_store::{Config as ValidatorStoreConfig, ValidatorStore}; +use validator_store::ValidatorStore; use zeroize::Zeroizing; const PASSWORD_BYTES: &[u8] = &[42, 50, 37]; @@ -42,7 +42,7 @@ type E = MainnetEthSpec; struct ApiTester { client: ValidatorClientHttpClient, initialized_validators: Arc>, - validator_store: Arc>, + validator_store: Arc>, url: SensitiveUrl, slot_clock: TestingSlotClock, _validator_dir: TempDir, @@ -91,7 +91,7 @@ impl ApiTester { let test_runtime = TestRuntime::default(); - let validator_store = Arc::new(ValidatorStore::<_, E>::new( + let validator_store = Arc::new(LighthouseValidatorStore::<_, E>::new( initialized_validators, slashing_protection, 
Hash256::repeat_byte(42), @@ -129,11 +129,10 @@ impl ApiTester { }, sse_logging_components: None, slot_clock: slot_clock.clone(), - _phantom: PhantomData, }); let ctx = context.clone(); let (listening_socket, server) = - super::serve(ctx, test_runtime.task_executor.exit()).unwrap(); + super::serve::<_, E>(ctx, test_runtime.task_executor.exit()).unwrap(); tokio::spawn(server); @@ -670,7 +669,7 @@ impl ApiTester { assert_eq!( self.validator_store - .get_builder_proposals(&validator.voting_pubkey), + .get_builder_proposals_testing_only(&validator.voting_pubkey), builder_proposals ); @@ -686,7 +685,7 @@ impl ApiTester { assert_eq!( self.validator_store - .get_builder_boost_factor(&validator.voting_pubkey), + .get_builder_boost_factor_testing_only(&validator.voting_pubkey), builder_boost_factor ); @@ -702,7 +701,7 @@ impl ApiTester { assert_eq!( self.validator_store - .determine_validator_builder_boost_factor(&validator.voting_pubkey), + .determine_builder_boost_factor(&validator.voting_pubkey), builder_boost_factor ); @@ -712,7 +711,7 @@ impl ApiTester { pub fn assert_default_builder_boost_factor(self, builder_boost_factor: Option) -> Self { assert_eq!( self.validator_store - .determine_default_builder_boost_factor(), + .determine_builder_boost_factor(&PublicKeyBytes::empty()), builder_boost_factor ); @@ -728,7 +727,7 @@ impl ApiTester { assert_eq!( self.validator_store - .get_prefer_builder_proposals(&validator.voting_pubkey), + .get_prefer_builder_proposals_testing_only(&validator.voting_pubkey), prefer_builder_proposals ); @@ -1159,7 +1158,7 @@ async fn validator_derived_builder_boost_factor_with_process_defaults() { }) .await .assert_default_builder_boost_factor(Some(80)) - .assert_validator_derived_builder_boost_factor(0, None) + .assert_validator_derived_builder_boost_factor(0, Some(80)) .await .set_builder_proposals(0, false) .await diff --git a/validator_client/http_api/src/tests/keystores.rs b/validator_client/http_api/src/tests/keystores.rs index 
13494e5fa6..37f7513f37 100644 --- a/validator_client/http_api/src/tests/keystores.rs +++ b/validator_client/http_api/src/tests/keystores.rs @@ -8,12 +8,13 @@ use eth2::lighthouse_vc::{ types::Web3SignerValidatorRequest, }; use itertools::Itertools; +use lighthouse_validator_store::DEFAULT_GAS_LIMIT; use rand::{rngs::SmallRng, Rng, SeedableRng}; use slashing_protection::interchange::{Interchange, InterchangeMetadata}; use std::{collections::HashMap, path::Path}; use tokio::runtime::Handle; use types::{attestation::AttestationBase, Address}; -use validator_store::DEFAULT_GAS_LIMIT; +use validator_store::ValidatorStore; use zeroize::Zeroizing; fn new_keystore(password: Zeroizing) -> Keystore { diff --git a/validator_client/http_metrics/Cargo.toml b/validator_client/http_metrics/Cargo.toml index f2684da4b1..24cbff7cde 100644 --- a/validator_client/http_metrics/Cargo.toml +++ b/validator_client/http_metrics/Cargo.toml @@ -6,6 +6,7 @@ authors = ["Sigma Prime "] [dependencies] health_metrics = { workspace = true } +lighthouse_validator_store = { workspace = true } lighthouse_version = { workspace = true } logging = { workspace = true } malloc_utils = { workspace = true } @@ -17,6 +18,5 @@ tracing = { workspace = true } types = { workspace = true } validator_metrics = { workspace = true } validator_services = { workspace = true } -validator_store = { workspace = true } warp = { workspace = true } warp_utils = { workspace = true } diff --git a/validator_client/http_metrics/src/lib.rs b/validator_client/http_metrics/src/lib.rs index 6bf18e7b93..7441939957 100644 --- a/validator_client/http_metrics/src/lib.rs +++ b/validator_client/http_metrics/src/lib.rs @@ -2,6 +2,7 @@ //! //! For other endpoints, see the `http_api` crate. 
+use lighthouse_validator_store::LighthouseValidatorStore; use lighthouse_version::version_with_platform; use logging::crit; use malloc_utils::scrape_allocator_metrics; @@ -15,7 +16,6 @@ use std::time::{SystemTime, UNIX_EPOCH}; use tracing::info; use types::EthSpec; use validator_services::duties_service::DutiesService; -use validator_store::ValidatorStore; use warp::{http::Response, Filter}; #[derive(Debug)] @@ -36,17 +36,19 @@ impl From for Error { } } +type ValidatorStore = LighthouseValidatorStore; + /// Contains objects which have shared access from inside/outside of the metrics server. -pub struct Shared { - pub validator_store: Option>>, - pub duties_service: Option>>, +pub struct Shared { + pub validator_store: Option>>, + pub duties_service: Option, SystemTimeSlotClock>>>, pub genesis_time: Option, } /// A wrapper around all the items required to spawn the HTTP server. /// /// The server will gracefully handle the case where any fields are `None`. -pub struct Context { +pub struct Context { pub config: Config, pub shared: RwLock>, } diff --git a/validator_client/lighthouse_validator_store/Cargo.toml b/validator_client/lighthouse_validator_store/Cargo.toml new file mode 100644 index 0000000000..0f8220bdc9 --- /dev/null +++ b/validator_client/lighthouse_validator_store/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "lighthouse_validator_store" +version = "0.1.0" +edition = { workspace = true } +authors = ["Sigma Prime "] + +[dependencies] +account_utils = { workspace = true } +beacon_node_fallback = { workspace = true } +doppelganger_service = { workspace = true } +either = { workspace = true } +environment = { workspace = true } +eth2 = { workspace = true } +initialized_validators = { workspace = true } +logging = { workspace = true } +parking_lot = { workspace = true } +serde = { workspace = true } +signing_method = { workspace = true } +slashing_protection = { workspace = true } +slot_clock = { workspace = true } +task_executor = { workspace = true } 
+tokio = { workspace = true } +tracing = { workspace = true } +types = { workspace = true } +validator_metrics = { workspace = true } +validator_store = { workspace = true } + +[dev-dependencies] +futures = { workspace = true } +logging = { workspace = true } diff --git a/validator_client/lighthouse_validator_store/src/lib.rs b/validator_client/lighthouse_validator_store/src/lib.rs new file mode 100644 index 0000000000..d07f95f11c --- /dev/null +++ b/validator_client/lighthouse_validator_store/src/lib.rs @@ -0,0 +1,1130 @@ +use account_utils::validator_definitions::{PasswordStorage, ValidatorDefinition}; +use doppelganger_service::DoppelgangerService; +use initialized_validators::InitializedValidators; +use logging::crit; +use parking_lot::{Mutex, RwLock}; +use serde::{Deserialize, Serialize}; +use signing_method::Error as SigningError; +use signing_method::{SignableMessage, SigningContext, SigningMethod}; +use slashing_protection::{ + interchange::Interchange, InterchangeError, NotSafe, Safe, SlashingDatabase, +}; +use slot_clock::SlotClock; +use std::marker::PhantomData; +use std::path::Path; +use std::sync::Arc; +use task_executor::TaskExecutor; +use tracing::{error, info, warn}; +use types::{ + graffiti::GraffitiString, AbstractExecPayload, Address, AggregateAndProof, Attestation, + BeaconBlock, BlindedPayload, ChainSpec, ContributionAndProof, Domain, Epoch, EthSpec, Fork, + Graffiti, Hash256, PublicKeyBytes, SelectionProof, Signature, SignedAggregateAndProof, + SignedBeaconBlock, SignedContributionAndProof, SignedRoot, SignedValidatorRegistrationData, + SignedVoluntaryExit, Slot, SyncAggregatorSelectionData, SyncCommitteeContribution, + SyncCommitteeMessage, SyncSelectionProof, SyncSubnetId, ValidatorRegistrationData, + VoluntaryExit, +}; +use validator_store::{ + DoppelgangerStatus, Error as ValidatorStoreError, ProposalData, SignedBlock, UnsignedBlock, + ValidatorStore, +}; + +pub type Error = ValidatorStoreError; + +#[derive(Debug, Default, Clone, 
Serialize, Deserialize)] +pub struct Config { + /// Fallback fee recipient address. + pub fee_recipient: Option
, + /// Fallback gas limit. + pub gas_limit: Option, + /// Enable use of the blinded block endpoints during proposals. + pub builder_proposals: bool, + /// Enable slashing protection even while using web3signer keys. + pub enable_web3signer_slashing_protection: bool, + /// If true, Lighthouse will prefer builder proposals, if available. + pub prefer_builder_proposals: bool, + /// Specifies the boost factor, a percentage multiplier to apply to the builder's payload value. + pub builder_boost_factor: Option, +} + +/// Number of epochs of slashing protection history to keep. +/// +/// This acts as a maximum safe-guard against clock drift. +const SLASHING_PROTECTION_HISTORY_EPOCHS: u64 = 512; + +/// Currently used as the default gas limit in execution clients. +/// +/// https://ethresear.ch/t/on-increasing-the-block-gas-limit-technical-considerations-path-forward/21225. +pub const DEFAULT_GAS_LIMIT: u64 = 36_000_000; + +pub struct LighthouseValidatorStore { + validators: Arc>, + slashing_protection: SlashingDatabase, + slashing_protection_last_prune: Arc>, + genesis_validators_root: Hash256, + spec: Arc, + doppelganger_service: Option>, + slot_clock: T, + fee_recipient_process: Option
, + gas_limit: Option, + builder_proposals: bool, + enable_web3signer_slashing_protection: bool, + prefer_builder_proposals: bool, + builder_boost_factor: Option, + task_executor: TaskExecutor, + _phantom: PhantomData, +} + +impl LighthouseValidatorStore { + // All arguments are different types. Making the fields `pub` is undesired. A builder seems + // unnecessary. + #[allow(clippy::too_many_arguments)] + pub fn new( + validators: InitializedValidators, + slashing_protection: SlashingDatabase, + genesis_validators_root: Hash256, + spec: Arc, + doppelganger_service: Option>, + slot_clock: T, + config: &Config, + task_executor: TaskExecutor, + ) -> Self { + Self { + validators: Arc::new(RwLock::new(validators)), + slashing_protection, + slashing_protection_last_prune: Arc::new(Mutex::new(Epoch::new(0))), + genesis_validators_root, + spec, + doppelganger_service, + slot_clock, + fee_recipient_process: config.fee_recipient, + gas_limit: config.gas_limit, + builder_proposals: config.builder_proposals, + enable_web3signer_slashing_protection: config.enable_web3signer_slashing_protection, + prefer_builder_proposals: config.prefer_builder_proposals, + builder_boost_factor: config.builder_boost_factor, + task_executor, + _phantom: PhantomData, + } + } + + /// Register all local validators in doppelganger protection to try and prevent instances of + /// duplicate validators operating on the network at the same time. + /// + /// This function has no effect if doppelganger protection is disabled. + pub fn register_all_in_doppelganger_protection_if_enabled(&self) -> Result<(), String> { + if let Some(doppelganger_service) = &self.doppelganger_service { + for pubkey in self.validators.read().iter_voting_pubkeys() { + doppelganger_service.register_new_validator( + *pubkey, + &self.slot_clock, + E::slots_per_epoch(), + )? + } + } + + Ok(()) + } + + /// Returns `true` if doppelganger protection is enabled, or else `false`. 
+ pub fn doppelganger_protection_enabled(&self) -> bool { + self.doppelganger_service.is_some() + } + + pub fn initialized_validators(&self) -> Arc> { + self.validators.clone() + } + + /// Indicates if the `voting_public_key` exists in self and is enabled. + pub fn has_validator(&self, voting_public_key: &PublicKeyBytes) -> bool { + self.validators + .read() + .validator(voting_public_key) + .is_some() + } + + /// Insert a new validator to `self`, where the validator is represented by an EIP-2335 + /// keystore on the filesystem. + #[allow(clippy::too_many_arguments)] + pub async fn add_validator_keystore>( + &self, + voting_keystore_path: P, + password_storage: PasswordStorage, + enable: bool, + graffiti: Option, + suggested_fee_recipient: Option
, + gas_limit: Option, + builder_proposals: Option, + builder_boost_factor: Option, + prefer_builder_proposals: Option, + ) -> Result { + let mut validator_def = ValidatorDefinition::new_keystore_with_password( + voting_keystore_path, + password_storage, + graffiti, + suggested_fee_recipient, + gas_limit, + builder_proposals, + builder_boost_factor, + prefer_builder_proposals, + ) + .map_err(|e| format!("failed to create validator definitions: {:?}", e))?; + + validator_def.enabled = enable; + + self.add_validator(validator_def).await + } + + /// Insert a new validator to `self`. + /// + /// This function includes: + /// + /// - Adding the validator definition to the YAML file, saving it to the filesystem. + /// - Enabling the validator with the slashing protection database. + /// - If `enable == true`, starting to perform duties for the validator. + // FIXME: ignore this clippy lint until the validator store is refactored to use async locks + #[allow(clippy::await_holding_lock)] + pub async fn add_validator( + &self, + validator_def: ValidatorDefinition, + ) -> Result { + let validator_pubkey = validator_def.voting_public_key.compress(); + + self.slashing_protection + .register_validator(validator_pubkey) + .map_err(|e| format!("failed to register validator: {:?}", e))?; + + if let Some(doppelganger_service) = &self.doppelganger_service { + doppelganger_service.register_new_validator( + validator_pubkey, + &self.slot_clock, + E::slots_per_epoch(), + )?; + } + + self.validators + .write() + .add_definition_replace_disabled(validator_def.clone()) + .await + .map_err(|e| format!("Unable to add definition: {:?}", e))?; + + Ok(validator_def) + } + + /// Returns doppelganger statuses for all enabled validators. + #[allow(clippy::needless_collect)] // Collect is required to avoid holding a lock. + pub fn doppelganger_statuses(&self) -> Vec { + // Collect all the pubkeys first to avoid interleaving locks on `self.validators` and + // `self.doppelganger_service`. 
+ let pubkeys = self + .validators + .read() + .iter_voting_pubkeys() + .cloned() + .collect::>(); + + pubkeys + .into_iter() + .map(|pubkey| { + self.doppelganger_service + .as_ref() + .map(|doppelganger_service| doppelganger_service.validator_status(pubkey)) + // Allow signing on all pubkeys if doppelganger protection is disabled. + .unwrap_or_else(|| DoppelgangerStatus::SigningEnabled(pubkey)) + }) + .collect() + } + + fn fork(&self, epoch: Epoch) -> Fork { + self.spec.fork_at_epoch(epoch) + } + + /// Returns a `SigningMethod` for `validator_pubkey` *only if* that validator is considered safe + /// by doppelganger protection. + fn doppelganger_checked_signing_method( + &self, + validator_pubkey: PublicKeyBytes, + ) -> Result, Error> { + if self.doppelganger_protection_allows_signing(validator_pubkey) { + self.validators + .read() + .signing_method(&validator_pubkey) + .ok_or(Error::UnknownPubkey(validator_pubkey)) + } else { + Err(Error::DoppelgangerProtected(validator_pubkey)) + } + } + + /// Returns a `SigningMethod` for `validator_pubkey` regardless of that validators doppelganger + /// protection status. + /// + /// ## Warning + /// + /// This method should only be used for signing non-slashable messages. 
+ fn doppelganger_bypassed_signing_method( + &self, + validator_pubkey: PublicKeyBytes, + ) -> Result, Error> { + self.validators + .read() + .signing_method(&validator_pubkey) + .ok_or(Error::UnknownPubkey(validator_pubkey)) + } + + fn signing_context(&self, domain: Domain, signing_epoch: Epoch) -> SigningContext { + if domain == Domain::VoluntaryExit { + if self.spec.fork_name_at_epoch(signing_epoch).deneb_enabled() { + // EIP-7044 + SigningContext { + domain, + epoch: signing_epoch, + fork: Fork { + previous_version: self.spec.capella_fork_version, + current_version: self.spec.capella_fork_version, + epoch: signing_epoch, + }, + genesis_validators_root: self.genesis_validators_root, + } + } else { + SigningContext { + domain, + epoch: signing_epoch, + fork: self.fork(signing_epoch), + genesis_validators_root: self.genesis_validators_root, + } + } + } else { + SigningContext { + domain, + epoch: signing_epoch, + fork: self.fork(signing_epoch), + genesis_validators_root: self.genesis_validators_root, + } + } + } + + pub fn get_fee_recipient_defaulting(&self, fee_recipient: Option
) -> Option
{ + // If there's nothing in the file, try the process-level default value. + fee_recipient.or(self.fee_recipient_process) + } + + /// Returns the suggested_fee_recipient from `validator_definitions.yml` if any. + /// This has been pulled into a private function so the read lock is dropped easily + fn suggested_fee_recipient(&self, validator_pubkey: &PublicKeyBytes) -> Option
{ + self.validators + .read() + .suggested_fee_recipient(validator_pubkey) + } + + /// Returns the gas limit for the given public key. The priority order for fetching + /// the gas limit is: + /// + /// 1. validator_definitions.yml + /// 2. process level gas limit + /// 3. `DEFAULT_GAS_LIMIT` + pub fn get_gas_limit(&self, validator_pubkey: &PublicKeyBytes) -> u64 { + self.get_gas_limit_defaulting(self.validators.read().gas_limit(validator_pubkey)) + } + + fn get_gas_limit_defaulting(&self, gas_limit: Option) -> u64 { + // If there is a `gas_limit` in the validator definitions yaml + // file, use that value. + gas_limit + // If there's nothing in the file, try the process-level default value. + .or(self.gas_limit) + // If there's no process-level default, use the `DEFAULT_GAS_LIMIT`. + .unwrap_or(DEFAULT_GAS_LIMIT) + } + + /// Returns a `bool` for the given public key that denotes whether this validator should use the + /// builder API. The priority order for fetching this value is: + /// + /// 1. validator_definitions.yml + /// 2. process level flag + /// + /// This function is currently only used in tests because in prod it is translated and combined + /// with other flags into a builder boost factor (see `determine_builder_boost_factor`). + pub fn get_builder_proposals_testing_only(&self, validator_pubkey: &PublicKeyBytes) -> bool { + // If there is a `suggested_fee_recipient` in the validator definitions yaml + // file, use that value. + self.get_builder_proposals_defaulting( + self.validators.read().builder_proposals(validator_pubkey), + ) + } + + fn get_builder_proposals_defaulting(&self, builder_proposals: Option) -> bool { + builder_proposals + // If there's nothing in the file, try the process-level default value. + .unwrap_or(self.builder_proposals) + } + + /// Returns a `u64` for the given public key that denotes the builder boost factor. The priority order for fetching this value is: + /// + /// 1. validator_definitions.yml + /// 2. 
process level flag + /// + /// This function is currently only used in tests because in prod it is translated and combined + /// with other flags into a builder boost factor (see `determine_builder_boost_factor`). + pub fn get_builder_boost_factor_testing_only( + &self, + validator_pubkey: &PublicKeyBytes, + ) -> Option { + self.validators + .read() + .builder_boost_factor(validator_pubkey) + .or(self.builder_boost_factor) + } + + /// Returns a `bool` for the given public key that denotes whether this validator should prefer a + /// builder payload. The priority order for fetching this value is: + /// + /// 1. validator_definitions.yml + /// 2. process level flag + /// + /// This function is currently only used in tests because in prod it is translated and combined + /// with other flags into a builder boost factor (see `determine_builder_boost_factor`). + pub fn get_prefer_builder_proposals_testing_only( + &self, + validator_pubkey: &PublicKeyBytes, + ) -> bool { + self.validators + .read() + .prefer_builder_proposals(validator_pubkey) + .unwrap_or(self.prefer_builder_proposals) + } + + pub fn import_slashing_protection( + &self, + interchange: Interchange, + ) -> Result<(), InterchangeError> { + self.slashing_protection + .import_interchange_info(interchange, self.genesis_validators_root)?; + Ok(()) + } + + /// Export slashing protection data while also disabling the given keys in the database. + /// + /// If any key is unknown to the slashing protection database it will be silently omitted + /// from the result. It is the caller's responsibility to check whether all keys provided + /// had data returned for them. 
+ pub fn export_slashing_protection_for_keys( + &self, + pubkeys: &[PublicKeyBytes], + ) -> Result { + self.slashing_protection.with_transaction(|txn| { + let known_pubkeys = pubkeys + .iter() + .filter_map(|pubkey| { + let validator_id = self + .slashing_protection + .get_validator_id_ignoring_status(txn, pubkey) + .ok()?; + + Some( + self.slashing_protection + .update_validator_status(txn, validator_id, false) + .map(|()| *pubkey), + ) + }) + .collect::, _>>()?; + self.slashing_protection.export_interchange_info_in_txn( + self.genesis_validators_root, + Some(&known_pubkeys), + txn, + ) + }) + } + + async fn sign_abstract_block>( + &self, + validator_pubkey: PublicKeyBytes, + block: BeaconBlock, + current_slot: Slot, + ) -> Result, Error> { + // Make sure the block slot is not higher than the current slot to avoid potential attacks. + if block.slot() > current_slot { + warn!( + block_slot = block.slot().as_u64(), + current_slot = current_slot.as_u64(), + "Not signing block with slot greater than current slot" + ); + return Err(Error::GreaterThanCurrentSlot { + slot: block.slot(), + current_slot, + }); + } + + let signing_epoch = block.epoch(); + let signing_context = self.signing_context(Domain::BeaconProposer, signing_epoch); + let domain_hash = signing_context.domain_hash(&self.spec); + + let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; + + // Check for slashing conditions. + let slashing_status = if signing_method + .requires_local_slashing_protection(self.enable_web3signer_slashing_protection) + { + self.slashing_protection.check_and_insert_block_proposal( + &validator_pubkey, + &block.block_header(), + domain_hash, + ) + } else { + Ok(Safe::Valid) + }; + + match slashing_status { + // We can safely sign this block without slashing. 
+ Ok(Safe::Valid) => { + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_BLOCKS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + let signature = signing_method + .get_signature( + SignableMessage::BeaconBlock(&block), + signing_context, + &self.spec, + &self.task_executor, + ) + .await?; + Ok(SignedBeaconBlock::from_block(block, signature)) + } + Ok(Safe::SameData) => { + warn!("Skipping signing of previously signed block"); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_BLOCKS_TOTAL, + &[validator_metrics::SAME_DATA], + ); + Err(Error::SameData) + } + Err(NotSafe::UnregisteredValidator(pk)) => { + warn!( + msg = "Carefully consider running with --init-slashing-protection (see --help)", + public_key = ?pk, + "Not signing block for unregistered validator" + ); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_BLOCKS_TOTAL, + &[validator_metrics::UNREGISTERED], + ); + Err(Error::Slashable(NotSafe::UnregisteredValidator(pk))) + } + Err(e) => { + crit!( + error = ?e, + "Not signing slashable block" + ); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_BLOCKS_TOTAL, + &[validator_metrics::SLASHABLE], + ); + Err(Error::Slashable(e)) + } + } + } + + pub async fn sign_voluntary_exit( + &self, + validator_pubkey: PublicKeyBytes, + voluntary_exit: VoluntaryExit, + ) -> Result { + let signing_epoch = voluntary_exit.epoch; + let signing_context = self.signing_context(Domain::VoluntaryExit, signing_epoch); + let signing_method = self.doppelganger_bypassed_signing_method(validator_pubkey)?; + + let signature = signing_method + .get_signature::>( + SignableMessage::VoluntaryExit(&voluntary_exit), + signing_context, + &self.spec, + &self.task_executor, + ) + .await?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_VOLUNTARY_EXITS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(SignedVoluntaryExit { + message: voluntary_exit, + signature, + }) + } +} + +impl ValidatorStore for 
LighthouseValidatorStore { + type Error = SigningError; + type E = E; + + /// Attempts to resolve the pubkey to a validator index. + /// + /// It may return `None` if the `pubkey` is: + /// + /// - Unknown. + /// - Known, but with an unknown index. + fn validator_index(&self, pubkey: &PublicKeyBytes) -> Option { + self.validators.read().get_index(pubkey) + } + + /// Returns all voting pubkeys for all enabled validators. + /// + /// The `filter_func` allows for filtering pubkeys based upon their `DoppelgangerStatus`. There + /// are two primary functions used here: + /// + /// - `DoppelgangerStatus::only_safe`: only returns pubkeys which have passed doppelganger + /// protection and are safe-enough to sign messages. + /// - `DoppelgangerStatus::ignored`: returns all the pubkeys from `only_safe` *plus* those still + /// undergoing protection. This is useful for collecting duties or other non-signing tasks. + #[allow(clippy::needless_collect)] // Collect is required to avoid holding a lock. + fn voting_pubkeys(&self, filter_func: F) -> I + where + I: FromIterator, + F: Fn(DoppelgangerStatus) -> Option, + { + // Collect all the pubkeys first to avoid interleaving locks on `self.validators` and + // `self.doppelganger_service()`. + let pubkeys = self + .validators + .read() + .iter_voting_pubkeys() + .cloned() + .collect::>(); + + pubkeys + .into_iter() + .map(|pubkey| { + self.doppelganger_service + .as_ref() + .map(|doppelganger_service| doppelganger_service.validator_status(pubkey)) + // Allow signing on all pubkeys if doppelganger protection is disabled. + .unwrap_or_else(|| DoppelgangerStatus::SigningEnabled(pubkey)) + }) + .filter_map(filter_func) + .collect() + } + + /// Check if the `validator_pubkey` is permitted by the doppleganger protection to sign + /// messages. 
+ fn doppelganger_protection_allows_signing(&self, validator_pubkey: PublicKeyBytes) -> bool { + self.doppelganger_service + .as_ref() + // If there's no doppelganger service then we assume it is purposefully disabled and + // declare that all keys are safe with regard to it. + .is_none_or(|doppelganger_service| { + doppelganger_service + .validator_status(validator_pubkey) + .only_safe() + .is_some() + }) + } + + fn num_voting_validators(&self) -> usize { + self.validators.read().num_enabled() + } + + fn graffiti(&self, validator_pubkey: &PublicKeyBytes) -> Option { + self.validators.read().graffiti(validator_pubkey) + } + + /// Returns the fee recipient for the given public key. The priority order for fetching + /// the fee recipient is: + /// 1. validator_definitions.yml + /// 2. process level fee recipient + fn get_fee_recipient(&self, validator_pubkey: &PublicKeyBytes) -> Option
{ + // If there is a `suggested_fee_recipient` in the validator definitions yaml + // file, use that value. + self.get_fee_recipient_defaulting(self.suggested_fee_recipient(validator_pubkey)) + } + + /// Translate the per validator and per process `builder_proposals`, `builder_boost_factor` and + /// `prefer_builder_proposals` configurations to a boost factor, if available. + /// + /// Priority is given to per-validator values, and then if no preference is established by + /// these the process-level defaults are used. For both types of config, the logic is the same: + /// + /// - If `prefer_builder_proposals` is true, set boost factor to `u64::MAX` to indicate a + /// preference for builder payloads. + /// - If `builder_boost_factor` is a value other than None, return its value as the boost factor. + /// - If `builder_proposals` is set to false, set boost factor to 0 to indicate a preference for + /// local payloads. + /// - Else return `None` to indicate no preference between builder and local payloads. + fn determine_builder_boost_factor(&self, validator_pubkey: &PublicKeyBytes) -> Option { + let validator_prefer_builder_proposals = self + .validators + .read() + .prefer_builder_proposals(validator_pubkey); + + if matches!(validator_prefer_builder_proposals, Some(true)) { + return Some(u64::MAX); + } + + let factor = self + .validators + .read() + .builder_boost_factor(validator_pubkey) + .or_else(|| { + if matches!( + self.validators.read().builder_proposals(validator_pubkey), + Some(false) + ) { + return Some(0); + } + None + }); + + factor + .or_else(|| { + if self.prefer_builder_proposals { + return Some(u64::MAX); + } + self.builder_boost_factor.or({ + if !self.builder_proposals { + Some(0) + } else { + None + } + }) + }) + .and_then(|factor| { + // If builder boost factor is set to 100 it should be treated + // as None to prevent unnecessary calculations that could + // lead to loss of information. 
+ if factor == 100 { + None + } else { + Some(factor) + } + }) + } + + async fn randao_reveal( + &self, + validator_pubkey: PublicKeyBytes, + signing_epoch: Epoch, + ) -> Result { + let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; + let signing_context = self.signing_context(Domain::Randao, signing_epoch); + + let signature = signing_method + .get_signature::>( + SignableMessage::RandaoReveal(signing_epoch), + signing_context, + &self.spec, + &self.task_executor, + ) + .await?; + + Ok(signature) + } + + fn set_validator_index(&self, validator_pubkey: &PublicKeyBytes, index: u64) { + self.initialized_validators() + .write() + .set_index(validator_pubkey, index); + } + + async fn sign_block( + &self, + validator_pubkey: PublicKeyBytes, + block: UnsignedBlock, + current_slot: Slot, + ) -> Result, Error> { + match block { + UnsignedBlock::Full(block) => self + .sign_abstract_block(validator_pubkey, block, current_slot) + .await + .map(SignedBlock::Full), + UnsignedBlock::Blinded(block) => self + .sign_abstract_block(validator_pubkey, block, current_slot) + .await + .map(SignedBlock::Blinded), + } + } + + async fn sign_attestation( + &self, + validator_pubkey: PublicKeyBytes, + validator_committee_position: usize, + attestation: &mut Attestation, + current_epoch: Epoch, + ) -> Result<(), Error> { + // Make sure the target epoch is not higher than the current epoch to avoid potential attacks. + if attestation.data().target.epoch > current_epoch { + return Err(Error::GreaterThanCurrentEpoch { + epoch: attestation.data().target.epoch, + current_epoch, + }); + } + + // Get the signing method and check doppelganger protection. + let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; + + // Checking for slashing conditions. 
+ let signing_epoch = attestation.data().target.epoch; + let signing_context = self.signing_context(Domain::BeaconAttester, signing_epoch); + let domain_hash = signing_context.domain_hash(&self.spec); + let slashing_status = if signing_method + .requires_local_slashing_protection(self.enable_web3signer_slashing_protection) + { + self.slashing_protection.check_and_insert_attestation( + &validator_pubkey, + attestation.data(), + domain_hash, + ) + } else { + Ok(Safe::Valid) + }; + + match slashing_status { + // We can safely sign this attestation. + Ok(Safe::Valid) => { + let signature = signing_method + .get_signature::>( + SignableMessage::AttestationData(attestation.data()), + signing_context, + &self.spec, + &self.task_executor, + ) + .await?; + attestation + .add_signature(&signature, validator_committee_position) + .map_err(Error::UnableToSignAttestation)?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(()) + } + Ok(Safe::SameData) => { + warn!("Skipping signing of previously signed attestation"); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, + &[validator_metrics::SAME_DATA], + ); + Err(Error::SameData) + } + Err(NotSafe::UnregisteredValidator(pk)) => { + warn!( + msg = "Carefully consider running with --init-slashing-protection (see --help)", + public_key = format!("{:?}", pk), + "Not signing attestation for unregistered validator" + ); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, + &[validator_metrics::UNREGISTERED], + ); + Err(Error::Slashable(NotSafe::UnregisteredValidator(pk))) + } + Err(e) => { + crit!( + attestation = format!("{:?}", attestation.data()), + error = format!("{:?}", e), + "Not signing slashable attestation" + ); + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, + &[validator_metrics::SLASHABLE], + ); + Err(Error::Slashable(e)) + } + } + 
} + + async fn sign_validator_registration_data( + &self, + validator_registration_data: ValidatorRegistrationData, + ) -> Result { + let domain_hash = self.spec.get_builder_domain(); + let signing_root = validator_registration_data.signing_root(domain_hash); + + let signing_method = + self.doppelganger_bypassed_signing_method(validator_registration_data.pubkey)?; + let signature = signing_method + .get_signature_from_root::>( + SignableMessage::ValidatorRegistration(&validator_registration_data), + signing_root, + &self.task_executor, + None, + ) + .await?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_VALIDATOR_REGISTRATIONS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(SignedValidatorRegistrationData { + message: validator_registration_data, + signature, + }) + } + + /// Signs an `AggregateAndProof` for a given validator. + /// + /// The resulting `SignedAggregateAndProof` is sent on the aggregation channel and cannot be + /// modified by actors other than the signing validator. 
+ async fn produce_signed_aggregate_and_proof( + &self, + validator_pubkey: PublicKeyBytes, + aggregator_index: u64, + aggregate: Attestation, + selection_proof: SelectionProof, + ) -> Result, Error> { + let signing_epoch = aggregate.data().target.epoch; + let signing_context = self.signing_context(Domain::AggregateAndProof, signing_epoch); + + let message = + AggregateAndProof::from_attestation(aggregator_index, aggregate, selection_proof); + + let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; + let signature = signing_method + .get_signature::>( + SignableMessage::SignedAggregateAndProof(message.to_ref()), + signing_context, + &self.spec, + &self.task_executor, + ) + .await?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_AGGREGATES_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(SignedAggregateAndProof::from_aggregate_and_proof( + message, signature, + )) + } + + /// Produces a `SelectionProof` for the `slot`, signed by with corresponding secret key to + /// `validator_pubkey`. + async fn produce_selection_proof( + &self, + validator_pubkey: PublicKeyBytes, + slot: Slot, + ) -> Result { + let signing_epoch = slot.epoch(E::slots_per_epoch()); + let signing_context = self.signing_context(Domain::SelectionProof, signing_epoch); + + // Bypass the `with_validator_signing_method` function. + // + // This is because we don't care about doppelganger protection when it comes to selection + // proofs. They are not slashable and we need them to subscribe to subnets on the BN. + // + // As long as we disallow `SignedAggregateAndProof` then these selection proofs will never + // be published on the network. 
+ let signing_method = self.doppelganger_bypassed_signing_method(validator_pubkey)?; + + let signature = signing_method + .get_signature::>( + SignableMessage::SelectionProof(slot), + signing_context, + &self.spec, + &self.task_executor, + ) + .await + .map_err(Error::SpecificError)?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_SELECTION_PROOFS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(signature.into()) + } + + /// Produce a `SyncSelectionProof` for `slot` signed by the secret key of `validator_pubkey`. + async fn produce_sync_selection_proof( + &self, + validator_pubkey: &PublicKeyBytes, + slot: Slot, + subnet_id: SyncSubnetId, + ) -> Result { + let signing_epoch = slot.epoch(E::slots_per_epoch()); + let signing_context = + self.signing_context(Domain::SyncCommitteeSelectionProof, signing_epoch); + + // Bypass `with_validator_signing_method`: sync committee messages are not slashable. + let signing_method = self.doppelganger_bypassed_signing_method(*validator_pubkey)?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_SYNC_SELECTION_PROOFS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + let message = SyncAggregatorSelectionData { + slot, + subcommittee_index: subnet_id.into(), + }; + + let signature = signing_method + .get_signature::>( + SignableMessage::SyncSelectionProof(&message), + signing_context, + &self.spec, + &self.task_executor, + ) + .await + .map_err(Error::SpecificError)?; + + Ok(signature.into()) + } + + async fn produce_sync_committee_signature( + &self, + slot: Slot, + beacon_block_root: Hash256, + validator_index: u64, + validator_pubkey: &PublicKeyBytes, + ) -> Result { + let signing_epoch = slot.epoch(E::slots_per_epoch()); + let signing_context = self.signing_context(Domain::SyncCommittee, signing_epoch); + + // Bypass `with_validator_signing_method`: sync committee messages are not slashable. 
+ let signing_method = self.doppelganger_bypassed_signing_method(*validator_pubkey)?; + + let signature = signing_method + .get_signature::>( + SignableMessage::SyncCommitteeSignature { + beacon_block_root, + slot, + }, + signing_context, + &self.spec, + &self.task_executor, + ) + .await + .map_err(Error::SpecificError)?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_SYNC_COMMITTEE_MESSAGES_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(SyncCommitteeMessage { + slot, + beacon_block_root, + validator_index, + signature, + }) + } + + async fn produce_signed_contribution_and_proof( + &self, + aggregator_index: u64, + aggregator_pubkey: PublicKeyBytes, + contribution: SyncCommitteeContribution, + selection_proof: SyncSelectionProof, + ) -> Result, Error> { + let signing_epoch = contribution.slot.epoch(E::slots_per_epoch()); + let signing_context = self.signing_context(Domain::ContributionAndProof, signing_epoch); + + // Bypass `with_validator_signing_method`: sync committee messages are not slashable. + let signing_method = self.doppelganger_bypassed_signing_method(aggregator_pubkey)?; + + let message = ContributionAndProof { + aggregator_index, + contribution, + selection_proof: selection_proof.into(), + }; + + let signature = signing_method + .get_signature::>( + SignableMessage::SignedContributionAndProof(&message), + signing_context, + &self.spec, + &self.task_executor, + ) + .await + .map_err(Error::SpecificError)?; + + validator_metrics::inc_counter_vec( + &validator_metrics::SIGNED_SYNC_COMMITTEE_CONTRIBUTIONS_TOTAL, + &[validator_metrics::SUCCESS], + ); + + Ok(SignedContributionAndProof { message, signature }) + } + + /// Prune the slashing protection database so that it remains performant. + /// + /// This function will only do actual pruning periodically, so it should usually be + /// cheap to call. The `first_run` flag can be used to print a more verbose message when pruning + /// runs. 
+ fn prune_slashing_protection_db(&self, current_epoch: Epoch, first_run: bool) { + // Attempt to prune every SLASHING_PROTECTION_HISTORY_EPOCHs, with a tolerance for + // missing the epoch that aligns exactly. + let mut last_prune = self.slashing_protection_last_prune.lock(); + if current_epoch / SLASHING_PROTECTION_HISTORY_EPOCHS + <= *last_prune / SLASHING_PROTECTION_HISTORY_EPOCHS + { + return; + } + + if first_run { + info!( + epoch = %current_epoch, + msg = "pruning may take several minutes the first time it runs", + "Pruning slashing protection DB" + ); + } else { + info!(epoch = %current_epoch, "Pruning slashing protection DB"); + } + + let _timer = + validator_metrics::start_timer(&validator_metrics::SLASHING_PROTECTION_PRUNE_TIMES); + + let new_min_target_epoch = current_epoch.saturating_sub(SLASHING_PROTECTION_HISTORY_EPOCHS); + let new_min_slot = new_min_target_epoch.start_slot(E::slots_per_epoch()); + + let all_pubkeys: Vec<_> = self.voting_pubkeys(DoppelgangerStatus::ignored); + + if let Err(e) = self + .slashing_protection + .prune_all_signed_attestations(all_pubkeys.iter(), new_min_target_epoch) + { + error!( + error = ?e, + "Error during pruning of signed attestations" + ); + return; + } + + if let Err(e) = self + .slashing_protection + .prune_all_signed_blocks(all_pubkeys.iter(), new_min_slot) + { + error!( + error = ?e, + "Error during pruning of signed blocks" + ); + return; + } + + *last_prune = current_epoch; + + info!("Completed pruning of slashing protection DB"); + } + + /// Returns `ProposalData` for the provided `pubkey` if it exists in `InitializedValidators`. + /// `ProposalData` fields include defaulting logic described in `get_fee_recipient_defaulting`, + /// `get_gas_limit_defaulting`, and `get_builder_proposals_defaulting`. 
+ fn proposal_data(&self, pubkey: &PublicKeyBytes) -> Option { + self.validators + .read() + .validator(pubkey) + .map(|validator| ProposalData { + validator_index: validator.get_index(), + fee_recipient: self + .get_fee_recipient_defaulting(validator.get_suggested_fee_recipient()), + gas_limit: self.get_gas_limit_defaulting(validator.get_gas_limit()), + builder_proposals: self + .get_builder_proposals_defaulting(validator.get_builder_proposals()), + }) + } +} diff --git a/validator_client/signing_method/src/lib.rs b/validator_client/signing_method/src/lib.rs index f3b62c9500..316c1d2205 100644 --- a/validator_client/signing_method/src/lib.rs +++ b/validator_client/signing_method/src/lib.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use task_executor::TaskExecutor; use types::*; use url::Url; -use web3signer::{ForkInfo, SigningRequest, SigningResponse}; +use web3signer::{ForkInfo, MessageType, SigningRequest, SigningResponse}; pub use web3signer::Web3SignerObject; @@ -152,8 +152,13 @@ impl SigningMethod { genesis_validators_root, }); - self.get_signature_from_root(signable_message, signing_root, executor, fork_info) - .await + self.get_signature_from_root::( + signable_message, + signing_root, + executor, + fork_info, + ) + .await } pub async fn get_signature_from_root>( @@ -227,11 +232,7 @@ impl SigningMethod { // Determine the Web3Signer message type. let message_type = object.message_type(); - - if matches!( - object, - Web3SignerObject::Deposit { .. 
} | Web3SignerObject::ValidatorRegistration(_) - ) && fork_info.is_some() + if matches!(message_type, MessageType::ValidatorRegistration) && fork_info.is_some() { return Err(Error::GenesisForkVersionRequired); } diff --git a/validator_client/slashing_protection/src/lib.rs b/validator_client/slashing_protection/src/lib.rs index 51dd3e3164..825a34cabc 100644 --- a/validator_client/slashing_protection/src/lib.rs +++ b/validator_client/slashing_protection/src/lib.rs @@ -27,7 +27,7 @@ pub const SLASHING_PROTECTION_FILENAME: &str = "slashing_protection.sqlite"; /// The attestation or block is not safe to sign. /// /// This could be because it's slashable, or because an error occurred. -#[derive(PartialEq, Debug)] +#[derive(PartialEq, Debug, Clone)] pub enum NotSafe { UnregisteredValidator(PublicKeyBytes), DisabledValidator(PublicKeyBytes), diff --git a/validator_client/slashing_protection/src/signed_attestation.rs b/validator_client/slashing_protection/src/signed_attestation.rs index 779b5f770a..332f80c704 100644 --- a/validator_client/slashing_protection/src/signed_attestation.rs +++ b/validator_client/slashing_protection/src/signed_attestation.rs @@ -10,7 +10,7 @@ pub struct SignedAttestation { } /// Reasons why an attestation may be slashable (or invalid). -#[derive(PartialEq, Debug)] +#[derive(PartialEq, Debug, Clone)] pub enum InvalidAttestation { /// The attestation has the same target epoch as an attestation from the DB (enclosed). DoubleVote(SignedAttestation), diff --git a/validator_client/slashing_protection/src/signed_block.rs b/validator_client/slashing_protection/src/signed_block.rs index 92ec2dcbe8..d46872529e 100644 --- a/validator_client/slashing_protection/src/signed_block.rs +++ b/validator_client/slashing_protection/src/signed_block.rs @@ -9,7 +9,7 @@ pub struct SignedBlock { } /// Reasons why a block may be slashable. 
-#[derive(PartialEq, Debug)] +#[derive(PartialEq, Debug, Clone)] pub enum InvalidBlock { DoubleBlockProposal(SignedBlock), SlotViolatesLowerBound { block_slot: Slot, bound_slot: Slot }, diff --git a/validator_client/src/config.rs b/validator_client/src/config.rs index cfc88969c9..726aa96cf9 100644 --- a/validator_client/src/config.rs +++ b/validator_client/src/config.rs @@ -10,6 +10,7 @@ use directory::{ use eth2::types::Graffiti; use graffiti_file::GraffitiFile; use initialized_validators::Config as InitializedValidatorsConfig; +use lighthouse_validator_store::Config as ValidatorStoreConfig; use sensitive_url::SensitiveUrl; use serde::{Deserialize, Serialize}; use std::fs; @@ -20,7 +21,6 @@ use tracing::{info, warn}; use types::GRAFFITI_BYTES_LEN; use validator_http_api::{self, PK_FILENAME}; use validator_http_metrics; -use validator_store::Config as ValidatorStoreConfig; pub const DEFAULT_BEACON_NODE: &str = "http://localhost:5052/"; diff --git a/validator_client/src/latency.rs b/validator_client/src/latency.rs index edd8daa731..2382d350af 100644 --- a/validator_client/src/latency.rs +++ b/validator_client/src/latency.rs @@ -15,7 +15,7 @@ pub const SLOT_DELAY_DENOMINATOR: u32 = 12; pub fn start_latency_service( context: RuntimeContext, slot_clock: T, - beacon_nodes: Arc>, + beacon_nodes: Arc>, ) { let future = async move { loop { diff --git a/validator_client/src/lib.rs b/validator_client/src/lib.rs index 7171dea57b..100f896f8e 100644 --- a/validator_client/src/lib.rs +++ b/validator_client/src/lib.rs @@ -20,6 +20,7 @@ use doppelganger_service::DoppelgangerService; use environment::RuntimeContext; use eth2::{reqwest::ClientBuilder, BeaconNodeHttpClient, StatusCode, Timeouts}; use initialized_validators::Error::UnableToOpenVotingKeystore; +use lighthouse_validator_store::LighthouseValidatorStore; use notifier::spawn_notifier; use parking_lot::RwLock; use reqwest::Certificate; @@ -27,7 +28,6 @@ use slot_clock::SlotClock; use slot_clock::SystemTimeSlotClock; use 
std::fs::File; use std::io::Read; -use std::marker::PhantomData; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; @@ -42,12 +42,11 @@ use validator_http_api::ApiSecret; use validator_services::{ attestation_service::{AttestationService, AttestationServiceBuilder}, block_service::{BlockService, BlockServiceBuilder}, - duties_service::{self, DutiesService}, + duties_service::{self, DutiesService, DutiesServiceBuilder}, preparation_service::{PreparationService, PreparationServiceBuilder}, - sync::SyncDutiesMap, sync_committee_service::SyncCommitteeService, }; -use validator_store::ValidatorStore; +use validator_store::ValidatorStore as ValidatorStoreTrait; /// The interval between attempts to contact the beacon node during startup. const RETRY_DELAY: Duration = Duration::from_secs(2); @@ -72,20 +71,22 @@ const HTTP_GET_VALIDATOR_BLOCK_TIMEOUT_QUOTIENT: u32 = 4; const DOPPELGANGER_SERVICE_NAME: &str = "doppelganger"; +type ValidatorStore = LighthouseValidatorStore; + #[derive(Clone)] pub struct ProductionValidatorClient { context: RuntimeContext, - duties_service: Arc>, - block_service: BlockService, - attestation_service: AttestationService, - sync_committee_service: SyncCommitteeService, + duties_service: Arc, SystemTimeSlotClock>>, + block_service: BlockService, SystemTimeSlotClock>, + attestation_service: AttestationService, SystemTimeSlotClock>, + sync_committee_service: SyncCommitteeService, SystemTimeSlotClock>, doppelganger_service: Option>, - preparation_service: PreparationService, - validator_store: Arc>, + preparation_service: PreparationService, SystemTimeSlotClock>, + validator_store: Arc>, slot_clock: SystemTimeSlotClock, http_api_listen_addr: Option, config: Config, - beacon_nodes: Arc>, + beacon_nodes: Arc>, genesis_time: u64, } @@ -367,14 +368,14 @@ impl ProductionValidatorClient { // Initialize the number of connected, avaliable beacon nodes to 0. 
set_gauge(&validator_metrics::AVAILABLE_BEACON_NODES_COUNT, 0); - let mut beacon_nodes: BeaconNodeFallback<_, E> = BeaconNodeFallback::new( + let mut beacon_nodes: BeaconNodeFallback<_> = BeaconNodeFallback::new( candidates, config.beacon_node_fallback, config.broadcast_topics.clone(), context.eth2_config.spec.clone(), ); - let mut proposer_nodes: BeaconNodeFallback<_, E> = BeaconNodeFallback::new( + let mut proposer_nodes: BeaconNodeFallback<_> = BeaconNodeFallback::new( proposer_candidates, config.beacon_node_fallback, config.broadcast_topics.clone(), @@ -383,7 +384,7 @@ impl ProductionValidatorClient { // Perform some potentially long-running initialization tasks. let (genesis_time, genesis_validators_root) = tokio::select! { - tuple = init_from_beacon_node(&beacon_nodes, &proposer_nodes) => tuple?, + tuple = init_from_beacon_node::(&beacon_nodes, &proposer_nodes) => tuple?, () = context.executor.exit() => return Err("Shutting down".to_string()) }; @@ -402,10 +403,10 @@ impl ProductionValidatorClient { proposer_nodes.set_slot_clock(slot_clock.clone()); let beacon_nodes = Arc::new(beacon_nodes); - start_fallback_updater_service(context.clone(), beacon_nodes.clone())?; + start_fallback_updater_service::<_, E>(context.executor.clone(), beacon_nodes.clone())?; let proposer_nodes = Arc::new(proposer_nodes); - start_fallback_updater_service(context.clone(), proposer_nodes.clone())?; + start_fallback_updater_service::<_, E>(context.executor.clone(), proposer_nodes.clone())?; let doppelganger_service = if config.enable_doppelganger_protection { Some(Arc::new(DoppelgangerService::default())) @@ -413,7 +414,7 @@ impl ProductionValidatorClient { None }; - let validator_store = Arc::new(ValidatorStore::new( + let validator_store = Arc::new(LighthouseValidatorStore::new( validators, slashing_protection, genesis_validators_root, @@ -439,21 +440,18 @@ impl ProductionValidatorClient { validator_store.prune_slashing_protection_db(slot.epoch(E::slots_per_epoch()), true); } - let 
duties_context = context.service_context("duties".into()); - let duties_service = Arc::new(DutiesService { - attesters: <_>::default(), - proposers: <_>::default(), - sync_duties: SyncDutiesMap::new(config.distributed), - slot_clock: slot_clock.clone(), - beacon_nodes: beacon_nodes.clone(), - validator_store: validator_store.clone(), - unknown_validator_next_poll_slots: <_>::default(), - spec: context.eth2_config.spec.clone(), - context: duties_context, - enable_high_validator_count_metrics: config.enable_high_validator_count_metrics, - distributed: config.distributed, - disable_attesting: config.disable_attesting, - }); + let duties_service = Arc::new( + DutiesServiceBuilder::new() + .slot_clock(slot_clock.clone()) + .beacon_nodes(beacon_nodes.clone()) + .validator_store(validator_store.clone()) + .spec(context.eth2_config.spec.clone()) + .executor(context.executor.clone()) + .enable_high_validator_count_metrics(config.enable_high_validator_count_metrics) + .distributed(config.distributed) + .disable_attesting(config.disable_attesting) + .build()?, + ); // Update the metrics server. 
if let Some(ctx) = &validator_metrics_ctx { @@ -465,7 +463,8 @@ impl ProductionValidatorClient { .slot_clock(slot_clock.clone()) .validator_store(validator_store.clone()) .beacon_nodes(beacon_nodes.clone()) - .runtime_context(context.service_context("block".into())) + .executor(context.executor.clone()) + .chain_spec(context.eth2_config.spec.clone()) .graffiti(config.graffiti) .graffiti_file(config.graffiti_file.clone()); @@ -481,7 +480,8 @@ impl ProductionValidatorClient { .slot_clock(slot_clock.clone()) .validator_store(validator_store.clone()) .beacon_nodes(beacon_nodes.clone()) - .runtime_context(context.service_context("attestation".into())) + .executor(context.executor.clone()) + .chain_spec(context.eth2_config.spec.clone()) .disable(config.disable_attesting) .build()?; @@ -489,7 +489,7 @@ impl ProductionValidatorClient { .slot_clock(slot_clock.clone()) .validator_store(validator_store.clone()) .beacon_nodes(beacon_nodes.clone()) - .runtime_context(context.service_context("preparation".into())) + .executor(context.executor.clone()) .builder_registration_timestamp_override(config.builder_registration_timestamp_override) .validator_registration_batch_size(config.validator_registration_batch_size) .build()?; @@ -499,7 +499,7 @@ impl ProductionValidatorClient { validator_store.clone(), slot_clock.clone(), beacon_nodes.clone(), - context.service_context("sync_committee".into()), + context.executor.clone(), ); Ok(Self { @@ -542,12 +542,11 @@ impl ProductionValidatorClient { config: self.config.http_api.clone(), sse_logging_components: self.context.sse_logging_components.clone(), slot_clock: self.slot_clock.clone(), - _phantom: PhantomData, }); let exit = self.context.executor.exit(); - let (listen_addr, server) = validator_http_api::serve(ctx, exit) + let (listen_addr, server) = validator_http_api::serve::<_, E>(ctx, exit) .map_err(|e| format!("Unable to start HTTP API server: {:?}", e))?; self.context @@ -615,12 +614,12 @@ impl ProductionValidatorClient { } async 
fn init_from_beacon_node( - beacon_nodes: &BeaconNodeFallback, - proposer_nodes: &BeaconNodeFallback, + beacon_nodes: &BeaconNodeFallback, + proposer_nodes: &BeaconNodeFallback, ) -> Result<(u64, Hash256), String> { loop { - beacon_nodes.update_all_candidates().await; - proposer_nodes.update_all_candidates().await; + beacon_nodes.update_all_candidates::().await; + proposer_nodes.update_all_candidates::().await; let num_available = beacon_nodes.num_available().await; let num_total = beacon_nodes.num_total().await; @@ -697,8 +696,8 @@ async fn init_from_beacon_node( Ok((genesis.genesis_time, genesis.genesis_validators_root)) } -async fn wait_for_genesis( - beacon_nodes: &BeaconNodeFallback, +async fn wait_for_genesis( + beacon_nodes: &BeaconNodeFallback, genesis_time: u64, ) -> Result<(), String> { let now = SystemTime::now() @@ -740,8 +739,8 @@ async fn wait_for_genesis( /// Request the version from the node, looping back and trying again on failure. Exit once the node /// has been contacted. -async fn poll_whilst_waiting_for_genesis( - beacon_nodes: &BeaconNodeFallback, +async fn poll_whilst_waiting_for_genesis( + beacon_nodes: &BeaconNodeFallback, genesis_time: Duration, ) -> Result<(), String> { loop { diff --git a/validator_client/src/notifier.rs b/validator_client/src/notifier.rs index 75b3d46457..05f1c919d2 100644 --- a/validator_client/src/notifier.rs +++ b/validator_client/src/notifier.rs @@ -1,4 +1,5 @@ use crate::{DutiesService, ProductionValidatorClient}; +use lighthouse_validator_store::LighthouseValidatorStore; use metrics::set_gauge; use slot_clock::SlotClock; use tokio::time::{sleep, Duration}; @@ -32,7 +33,9 @@ pub fn spawn_notifier(client: &ProductionValidatorClient) -> Resu } /// Performs a single notification routine. 
-async fn notify(duties_service: &DutiesService) { +async fn notify( + duties_service: &DutiesService, T>, +) { let (candidate_info, num_available, num_synced) = duties_service.beacon_nodes.get_notifier_info().await; let num_total = candidate_info.len(); diff --git a/validator_client/validator_services/Cargo.toml b/validator_client/validator_services/Cargo.toml index 4b023bb40a..86208dadef 100644 --- a/validator_client/validator_services/Cargo.toml +++ b/validator_client/validator_services/Cargo.toml @@ -6,10 +6,8 @@ authors = ["Sigma Prime "] [dependencies] beacon_node_fallback = { workspace = true } -bls = { workspace = true } -doppelganger_service = { workspace = true } +bls = { workspace = true } either = { workspace = true } -environment = { workspace = true } eth2 = { workspace = true } futures = { workspace = true } graffiti_file = { workspace = true } @@ -17,6 +15,7 @@ logging = { workspace = true } parking_lot = { workspace = true } safe_arith = { workspace = true } slot_clock = { workspace = true } +task_executor = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } tree_hash = { workspace = true } diff --git a/validator_client/validator_services/src/attestation_service.rs b/validator_client/validator_services/src/attestation_service.rs index 8e098b81b0..c1e96a2808 100644 --- a/validator_client/validator_services/src/attestation_service.rs +++ b/validator_client/validator_services/src/attestation_service.rs @@ -1,13 +1,13 @@ use crate::duties_service::{DutiesService, DutyAndProof}; use beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; use either::Either; -use environment::RuntimeContext; use futures::future::join_all; use logging::crit; use slot_clock::SlotClock; use std::collections::HashMap; use std::ops::Deref; use std::sync::Arc; +use task_executor::TaskExecutor; use tokio::time::{sleep, sleep_until, Duration, Instant}; use tracing::{debug, error, info, trace, warn}; use tree_hash::TreeHash; @@ -16,33 +16,35 @@ use 
validator_store::{Error as ValidatorStoreError, ValidatorStore}; /// Builds an `AttestationService`. #[derive(Default)] -pub struct AttestationServiceBuilder { - duties_service: Option>>, - validator_store: Option>>, +pub struct AttestationServiceBuilder { + duties_service: Option>>, + validator_store: Option>, slot_clock: Option, - beacon_nodes: Option>>, - context: Option>, + beacon_nodes: Option>>, + executor: Option, + chain_spec: Option>, disable: bool, } -impl AttestationServiceBuilder { +impl AttestationServiceBuilder { pub fn new() -> Self { Self { duties_service: None, validator_store: None, slot_clock: None, beacon_nodes: None, - context: None, + executor: None, + chain_spec: None, disable: false, } } - pub fn duties_service(mut self, service: Arc>) -> Self { + pub fn duties_service(mut self, service: Arc>) -> Self { self.duties_service = Some(service); self } - pub fn validator_store(mut self, store: Arc>) -> Self { + pub fn validator_store(mut self, store: Arc) -> Self { self.validator_store = Some(store); self } @@ -52,13 +54,18 @@ impl AttestationServiceBuilder { self } - pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { + pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { self.beacon_nodes = Some(beacon_nodes); self } - pub fn runtime_context(mut self, context: RuntimeContext) -> Self { - self.context = Some(context); + pub fn executor(mut self, executor: TaskExecutor) -> Self { + self.executor = Some(executor); + self + } + + pub fn chain_spec(mut self, chain_spec: Arc) -> Self { + self.chain_spec = Some(chain_spec); self } @@ -67,7 +74,7 @@ impl AttestationServiceBuilder { self } - pub fn build(self) -> Result, String> { + pub fn build(self) -> Result, String> { Ok(AttestationService { inner: Arc::new(Inner { duties_service: self @@ -82,9 +89,12 @@ impl AttestationServiceBuilder { beacon_nodes: self .beacon_nodes .ok_or("Cannot build AttestationService without beacon_nodes")?, - context: self - .context - .ok_or("Cannot build 
AttestationService without runtime_context")?, + executor: self + .executor + .ok_or("Cannot build AttestationService without executor")?, + chain_spec: self + .chain_spec + .ok_or("Cannot build AttestationService without chain_spec")?, disable: self.disable, }), }) @@ -92,12 +102,13 @@ impl AttestationServiceBuilder { } /// Helper to minimise `Arc` usage. -pub struct Inner { - duties_service: Arc>, - validator_store: Arc>, +pub struct Inner { + duties_service: Arc>, + validator_store: Arc, slot_clock: T, - beacon_nodes: Arc>, - context: RuntimeContext, + beacon_nodes: Arc>, + executor: TaskExecutor, + chain_spec: Arc, disable: bool, } @@ -106,11 +117,11 @@ pub struct Inner { /// If any validators are on the same committee, a single attestation will be downloaded and /// returned to the beacon node. This attestation will have a signature from each of the /// validators. -pub struct AttestationService { - inner: Arc>, +pub struct AttestationService { + inner: Arc>, } -impl Clone for AttestationService { +impl Clone for AttestationService { fn clone(&self) -> Self { Self { inner: self.inner.clone(), @@ -118,15 +129,15 @@ impl Clone for AttestationService { } } -impl Deref for AttestationService { - type Target = Inner; +impl Deref for AttestationService { + type Target = Inner; fn deref(&self) -> &Self::Target { self.inner.deref() } } -impl AttestationService { +impl AttestationService { /// Starts the service which periodically produces attestations. pub fn start_update_service(self, spec: &ChainSpec) -> Result<(), String> { if self.disable { @@ -145,7 +156,7 @@ impl AttestationService { "Attestation production service started" ); - let executor = self.context.executor.clone(); + let executor = self.executor.clone(); let interval_fut = async move { loop { @@ -205,7 +216,7 @@ impl AttestationService { .into_iter() .for_each(|(committee_index, validator_duties)| { // Spawn a separate task for each attestation. 
- self.inner.context.executor.spawn_ignoring_error( + self.inner.executor.spawn_ignoring_error( self.clone().publish_attestations_and_aggregates( slot, committee_index, @@ -332,7 +343,7 @@ impl AttestationService { .slot_clock .now() .ok_or("Unable to determine current slot from clock")? - .epoch(E::slots_per_epoch()); + .epoch(S::E::slots_per_epoch()); let attestation_data = self .beacon_nodes @@ -357,7 +368,7 @@ impl AttestationService { let attestation_data = attestation_data_ref; // Ensure that the attestation matches the duties. - if !duty.match_attestation_data::(attestation_data, &self.context.eth2_config.spec) { + if !duty.match_attestation_data::(attestation_data, &self.chain_spec) { crit!( validator = ?duty.pubkey, duty_slot = %duty.slot, @@ -369,14 +380,14 @@ impl AttestationService { return None; } - let mut attestation = match Attestation::::empty_for_signing( + let mut attestation = match Attestation::empty_for_signing( duty.committee_index, duty.committee_length as usize, attestation_data.slot, attestation_data.beacon_block_root, attestation_data.source, attestation_data.target, - &self.context.eth2_config.spec, + &self.chain_spec, ) { Ok(attestation) => attestation, Err(err) => { @@ -439,10 +450,8 @@ impl AttestationService { return Ok(None); } let fork_name = self - .context - .eth2_config - .spec - .fork_name_at_slot::(attestation_data.slot); + .chain_spec + .fork_name_at_slot::(attestation_data.slot); // Post the attestations to the BN. 
match self @@ -476,7 +485,7 @@ impl AttestationService { .collect::>(); beacon_node - .post_beacon_pool_attestations_v2::( + .post_beacon_pool_attestations_v2::( Either::Right(single_attestations), fork_name, ) @@ -538,10 +547,8 @@ impl AttestationService { } let fork_name = self - .context - .eth2_config - .spec - .fork_name_at_slot::(attestation_data.slot); + .chain_spec + .fork_name_at_slot::(attestation_data.slot); let aggregated_attestation = &self .beacon_nodes @@ -585,7 +592,7 @@ impl AttestationService { let duty = &duty_and_proof.duty; let selection_proof = duty_and_proof.selection_proof.as_ref()?; - if !duty.match_attestation_data::(attestation_data, &self.context.eth2_config.spec) { + if !duty.match_attestation_data::(attestation_data, &self.chain_spec) { crit!("Inconsistent validator duties during signing"); return None; } @@ -689,11 +696,11 @@ impl AttestationService { /// Start the task at `pruning_instant` to avoid interference with other tasks. fn spawn_slashing_protection_pruning_task(&self, slot: Slot, pruning_instant: Instant) { let attestation_service = self.clone(); - let executor = self.inner.context.executor.clone(); - let current_epoch = slot.epoch(E::slots_per_epoch()); + let executor = self.inner.executor.clone(); + let current_epoch = slot.epoch(S::E::slots_per_epoch()); // Wait for `pruning_instant` in a regular task, and then switch to a blocking one. 
- self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { sleep_until(pruning_instant).await; diff --git a/validator_client/validator_services/src/block_service.rs b/validator_client/validator_services/src/block_service.rs index d2dbbb656e..2f29c1feb7 100644 --- a/validator_client/validator_services/src/block_service.rs +++ b/validator_client/validator_services/src/block_service.rs @@ -1,6 +1,5 @@ use beacon_node_fallback::{ApiTopic, BeaconNodeFallback, Error as FallbackError, Errors}; use bls::SignatureBytes; -use environment::RuntimeContext; use eth2::types::{FullBlockContents, PublishBlockRequest}; use eth2::{BeaconNodeHttpClient, StatusCode}; use graffiti_file::{determine_graffiti, GraffitiFile}; @@ -11,11 +10,12 @@ use std::future::Future; use std::ops::Deref; use std::sync::Arc; use std::time::Duration; +use task_executor::TaskExecutor; use tokio::sync::mpsc; use tracing::{debug, error, info, trace, warn}; use types::{ - BlindedBeaconBlock, BlockType, EthSpec, Graffiti, PublicKeyBytes, SignedBlindedBeaconBlock, - Slot, + BlindedBeaconBlock, BlockType, ChainSpec, EthSpec, Graffiti, PublicKeyBytes, + SignedBlindedBeaconBlock, Slot, }; use validator_store::{Error as ValidatorStoreError, ValidatorStore}; @@ -45,30 +45,32 @@ impl From> for BlockError { /// Builds a `BlockService`. 
#[derive(Default)] -pub struct BlockServiceBuilder { - validator_store: Option>>, +pub struct BlockServiceBuilder { + validator_store: Option>, slot_clock: Option>, - beacon_nodes: Option>>, - proposer_nodes: Option>>, - context: Option>, + beacon_nodes: Option>>, + proposer_nodes: Option>>, + executor: Option, + chain_spec: Option>, graffiti: Option, graffiti_file: Option, } -impl BlockServiceBuilder { +impl BlockServiceBuilder { pub fn new() -> Self { Self { validator_store: None, slot_clock: None, beacon_nodes: None, proposer_nodes: None, - context: None, + executor: None, + chain_spec: None, graffiti: None, graffiti_file: None, } } - pub fn validator_store(mut self, store: Arc>) -> Self { + pub fn validator_store(mut self, store: Arc) -> Self { self.validator_store = Some(store); self } @@ -78,18 +80,23 @@ impl BlockServiceBuilder { self } - pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { + pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { self.beacon_nodes = Some(beacon_nodes); self } - pub fn proposer_nodes(mut self, proposer_nodes: Arc>) -> Self { + pub fn proposer_nodes(mut self, proposer_nodes: Arc>) -> Self { self.proposer_nodes = Some(proposer_nodes); self } - pub fn runtime_context(mut self, context: RuntimeContext) -> Self { - self.context = Some(context); + pub fn executor(mut self, executor: TaskExecutor) -> Self { + self.executor = Some(executor); + self + } + + pub fn chain_spec(mut self, chain_spec: Arc) -> Self { + self.chain_spec = Some(chain_spec); self } @@ -103,7 +110,7 @@ impl BlockServiceBuilder { self } - pub fn build(self) -> Result, String> { + pub fn build(self) -> Result, String> { Ok(BlockService { inner: Arc::new(Inner { validator_store: self @@ -115,9 +122,12 @@ impl BlockServiceBuilder { beacon_nodes: self .beacon_nodes .ok_or("Cannot build BlockService without beacon_node")?, - context: self - .context - .ok_or("Cannot build BlockService without runtime_context")?, + executor: self + .executor + 
.ok_or("Cannot build BlockService without executor")?, + chain_spec: self + .chain_spec + .ok_or("Cannot build BlockService without chain_spec")?, proposer_nodes: self.proposer_nodes, graffiti: self.graffiti, graffiti_file: self.graffiti_file, @@ -128,12 +138,12 @@ impl BlockServiceBuilder { // Combines a set of non-block-proposing `beacon_nodes` and only-block-proposing // `proposer_nodes`. -pub struct ProposerFallback { - beacon_nodes: Arc>, - proposer_nodes: Option>>, +pub struct ProposerFallback { + beacon_nodes: Arc>, + proposer_nodes: Option>>, } -impl ProposerFallback { +impl ProposerFallback { // Try `func` on `self.proposer_nodes` first. If that doesn't work, try `self.beacon_nodes`. pub async fn request_proposers_first(&self, func: F) -> Result<(), Errors> where @@ -178,22 +188,23 @@ impl ProposerFallback { } /// Helper to minimise `Arc` usage. -pub struct Inner { - validator_store: Arc>, +pub struct Inner { + validator_store: Arc, slot_clock: Arc, - pub beacon_nodes: Arc>, - pub proposer_nodes: Option>>, - context: RuntimeContext, + pub beacon_nodes: Arc>, + pub proposer_nodes: Option>>, + executor: TaskExecutor, + chain_spec: Arc, graffiti: Option, graffiti_file: Option, } /// Attempts to produce attestations for any block producer(s) at the start of the epoch. 
-pub struct BlockService { - inner: Arc>, +pub struct BlockService { + inner: Arc>, } -impl Clone for BlockService { +impl Clone for BlockService { fn clone(&self) -> Self { Self { inner: self.inner.clone(), @@ -201,8 +212,8 @@ impl Clone for BlockService { } } -impl Deref for BlockService { - type Target = Inner; +impl Deref for BlockService { + type Target = Inner; fn deref(&self) -> &Self::Target { self.inner.deref() @@ -215,14 +226,14 @@ pub struct BlockServiceNotification { pub block_proposers: Vec, } -impl BlockService { +impl BlockService { pub fn start_update_service( self, mut notification_rx: mpsc::Receiver, ) -> Result<(), String> { info!("Block production service started"); - let executor = self.inner.context.executor.clone(); + let executor = self.inner.executor.clone(); executor.spawn( async move { @@ -258,7 +269,7 @@ impl BlockService { return Ok(()); } - if slot == self.context.eth2_config.spec.genesis_slot { + if slot == self.chain_spec.genesis_slot { debug!( proposers = format!("{:?}", notification.block_proposers), "Not producing block at genesis slot" @@ -285,9 +296,11 @@ impl BlockService { } for validator_pubkey in proposers { - let builder_boost_factor = self.get_builder_boost_factor(&validator_pubkey); + let builder_boost_factor = self + .validator_store + .determine_builder_boost_factor(&validator_pubkey); let service = self.clone(); - self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { let result = service .publish_block(slot, validator_pubkey, builder_boost_factor) @@ -314,30 +327,35 @@ impl BlockService { #[allow(clippy::too_many_arguments)] async fn sign_and_publish_block( &self, - proposer_fallback: ProposerFallback, + proposer_fallback: ProposerFallback, slot: Slot, graffiti: Option, validator_pubkey: &PublicKeyBytes, - unsigned_block: UnsignedBlock, + unsigned_block: UnsignedBlock, ) -> Result<(), BlockError> { let signing_timer = validator_metrics::start_timer(&validator_metrics::BLOCK_SIGNING_TIMES); - let 
res = match unsigned_block { + let (block, maybe_blobs) = match unsigned_block { UnsignedBlock::Full(block_contents) => { let (block, maybe_blobs) = block_contents.deconstruct(); - self.validator_store - .sign_block(*validator_pubkey, block, slot) - .await - .map(|b| SignedBlock::Full(PublishBlockRequest::new(Arc::new(b), maybe_blobs))) + (block.into(), maybe_blobs) } - UnsignedBlock::Blinded(block) => self - .validator_store - .sign_block(*validator_pubkey, block, slot) - .await - .map(Arc::new) - .map(SignedBlock::Blinded), + UnsignedBlock::Blinded(block) => (block.into(), None), }; + let res = self + .validator_store + .sign_block(*validator_pubkey, block, slot) + .await + .map(|block| match block { + validator_store::SignedBlock::Full(block) => { + SignedBlock::Full(PublishBlockRequest::new(Arc::new(block), maybe_blobs)) + } + validator_store::SignedBlock::Blinded(block) => { + SignedBlock::Blinded(Arc::new(block)) + } + }); + let signed_block = match res { Ok(block) => block, Err(ValidatorStoreError::UnknownPubkey(pubkey)) => { @@ -404,7 +422,7 @@ impl BlockService { let randao_reveal = match self .validator_store - .randao_reveal(validator_pubkey, slot.epoch(E::slots_per_epoch())) + .randao_reveal(validator_pubkey, slot.epoch(S::E::slots_per_epoch())) .await { Ok(signature) => signature.into(), @@ -487,7 +505,7 @@ impl BlockService { async fn publish_signed_block_contents( &self, - signed_block: &SignedBlock, + signed_block: &SignedBlock, beacon_node: BeaconNodeHttpClient, ) -> Result<(), BlockError> { let slot = signed_block.slot(); @@ -523,9 +541,9 @@ impl BlockService { graffiti: Option, proposer_index: Option, builder_boost_factor: Option, - ) -> Result, BlockError> { + ) -> Result, BlockError> { let (block_response, _) = beacon_node - .get_validator_blocks_v3::( + .get_validator_blocks_v3::( slot, randao_reveal_ref, graffiti.as_ref(), @@ -553,36 +571,6 @@ impl BlockService { Ok::<_, BlockError>(unsigned_block) } - - /// Returns the builder boost factor 
of the given public key. - /// The priority order for fetching this value is: - /// - /// 1. validator_definitions.yml - /// 2. process level flag - fn get_builder_boost_factor(&self, validator_pubkey: &PublicKeyBytes) -> Option { - // Apply per validator configuration first. - let validator_builder_boost_factor = self - .validator_store - .determine_validator_builder_boost_factor(validator_pubkey); - - // Fallback to process-wide configuration if needed. - let maybe_builder_boost_factor = validator_builder_boost_factor.or_else(|| { - self.validator_store - .determine_default_builder_boost_factor() - }); - - if let Some(builder_boost_factor) = maybe_builder_boost_factor { - // if builder boost factor is set to 100 it should be treated - // as None to prevent unnecessary calculations that could - // lead to loss of information. - if builder_boost_factor == 100 { - return None; - } - return Some(builder_boost_factor); - } - - None - } } pub enum UnsignedBlock { diff --git a/validator_client/validator_services/src/duties_service.rs b/validator_client/validator_services/src/duties_service.rs index 0921f95298..b4d9bae273 100644 --- a/validator_client/validator_services/src/duties_service.rs +++ b/validator_client/validator_services/src/duties_service.rs @@ -10,8 +10,6 @@ use crate::block_service::BlockServiceNotification; use crate::sync::poll_sync_committee_duties; use crate::sync::SyncDutiesMap; use beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; -use doppelganger_service::DoppelgangerStatus; -use environment::RuntimeContext; use eth2::types::{ AttesterData, BeaconCommitteeSubscription, DutiesResponse, ProposerData, StateId, ValidatorId, }; @@ -24,11 +22,12 @@ use std::collections::{hash_map, BTreeMap, HashMap, HashSet}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; +use task_executor::TaskExecutor; use tokio::{sync::mpsc::Sender, time::sleep}; use tracing::{debug, error, info, warn}; use types::{ChainSpec, Epoch, 
EthSpec, Hash256, PublicKeyBytes, SelectionProof, Slot}; use validator_metrics::{get_int_gauge, set_int_gauge, ATTESTATION_DUTY}; -use validator_store::{Error as ValidatorStoreError, ValidatorStore}; +use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, ValidatorStore}; /// Only retain `HISTORICAL_DUTIES_EPOCHS` duties prior to the current epoch. const HISTORICAL_DUTIES_EPOCHS: u64 = 2; @@ -87,16 +86,16 @@ const _: () = assert!(ATTESTATION_SUBSCRIPTION_OFFSETS[0] > MIN_ATTESTATION_SUBS // The info in the enum variants is displayed in logging, clippy thinks it's dead code. #[derive(Debug)] -pub enum Error { +pub enum Error { UnableToReadSlotClock, FailedToDownloadAttesters(#[allow(dead_code)] String), - FailedToProduceSelectionProof(#[allow(dead_code)] ValidatorStoreError), + FailedToProduceSelectionProof(#[allow(dead_code)] ValidatorStoreError), InvalidModulo(#[allow(dead_code)] ArithError), Arith(#[allow(dead_code)] ArithError), SyncDutiesNotFound(#[allow(dead_code)] u64), } -impl From for Error { +impl From for Error { fn from(e: ArithError) -> Self { Self::Arith(e) } @@ -125,11 +124,11 @@ pub struct SubscriptionSlots { /// Create a selection proof for `duty`. /// /// Return `Ok(None)` if the attesting validator is not an aggregator. -async fn make_selection_proof( +async fn make_selection_proof( duty: &AttesterData, - validator_store: &ValidatorStore, + validator_store: &S, spec: &ChainSpec, -) -> Result, Error> { +) -> Result, Error> { let selection_proof = validator_store .produce_selection_proof(duty.pubkey, duty.slot) .await @@ -205,25 +204,132 @@ type DependentRoot = Hash256; type AttesterMap = HashMap>; type ProposerMap = HashMap)>; +pub struct DutiesServiceBuilder { + /// Provides the canonical list of locally-managed validators. + validator_store: Option>, + /// Tracks the current slot. + slot_clock: Option, + /// Provides HTTP access to remote beacon nodes. + beacon_nodes: Option>>, + /// The runtime for spawning tasks. 
+ executor: Option, + /// The current chain spec. + spec: Option>, + //// Whether we permit large validator counts in the metrics. + enable_high_validator_count_metrics: bool, + /// If this validator is running in distributed mode. + distributed: bool, + disable_attesting: bool, +} + +impl Default for DutiesServiceBuilder { + fn default() -> Self { + Self::new() + } +} + +impl DutiesServiceBuilder { + pub fn new() -> Self { + Self { + validator_store: None, + slot_clock: None, + beacon_nodes: None, + executor: None, + spec: None, + enable_high_validator_count_metrics: false, + distributed: false, + disable_attesting: false, + } + } + + pub fn validator_store(mut self, validator_store: Arc) -> Self { + self.validator_store = Some(validator_store); + self + } + + pub fn slot_clock(mut self, slot_clock: T) -> Self { + self.slot_clock = Some(slot_clock); + self + } + + pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { + self.beacon_nodes = Some(beacon_nodes); + self + } + + pub fn executor(mut self, executor: TaskExecutor) -> Self { + self.executor = Some(executor); + self + } + + pub fn spec(mut self, spec: Arc) -> Self { + self.spec = Some(spec); + self + } + + pub fn enable_high_validator_count_metrics( + mut self, + enable_high_validator_count_metrics: bool, + ) -> Self { + self.enable_high_validator_count_metrics = enable_high_validator_count_metrics; + self + } + + pub fn distributed(mut self, distributed: bool) -> Self { + self.distributed = distributed; + self + } + + pub fn disable_attesting(mut self, disable_attesting: bool) -> Self { + self.disable_attesting = disable_attesting; + self + } + + pub fn build(self) -> Result, String> { + Ok(DutiesService { + attesters: Default::default(), + proposers: Default::default(), + sync_duties: SyncDutiesMap::new(self.distributed), + validator_store: self + .validator_store + .ok_or("Cannot build DutiesService without validator_store")?, + unknown_validator_next_poll_slots: Default::default(), + slot_clock: 
self + .slot_clock + .ok_or("Cannot build DutiesService without slot_clock")?, + beacon_nodes: self + .beacon_nodes + .ok_or("Cannot build DutiesService without beacon_nodes")?, + executor: self + .executor + .ok_or("Cannot build DutiesService without executor")?, + spec: self.spec.ok_or("Cannot build DutiesService without spec")?, + enable_high_validator_count_metrics: self.enable_high_validator_count_metrics, + distributed: self.distributed, + disable_attesting: self.disable_attesting, + }) + } +} + /// See the module-level documentation. -pub struct DutiesService { +pub struct DutiesService { /// Maps a validator public key to their duties for each epoch. pub attesters: RwLock, /// Maps an epoch to all *local* proposers in this epoch. Notably, this does not contain /// proposals for any validators which are not registered locally. pub proposers: RwLock, /// Map from validator index to sync committee duties. - pub sync_duties: SyncDutiesMap, + pub sync_duties: SyncDutiesMap, /// Provides the canonical list of locally-managed validators. - pub validator_store: Arc>, + pub validator_store: Arc, /// Maps unknown validator pubkeys to the next slot time when a poll should be conducted again. pub unknown_validator_next_poll_slots: RwLock>, /// Tracks the current slot. pub slot_clock: T, /// Provides HTTP access to remote beacon nodes. - pub beacon_nodes: Arc>, + pub beacon_nodes: Arc>, /// The runtime for spawning tasks. - pub context: RuntimeContext, + pub executor: TaskExecutor, /// The current chain spec. pub spec: Arc, //// Whether we permit large validator counts in the metrics. @@ -233,7 +339,7 @@ pub struct DutiesService { pub disable_attesting: bool, } -impl DutiesService { +impl DutiesService { /// Returns the total number of validators known to the duties service. 
pub fn total_validator_count(&self) -> usize { self.validator_store.num_voting_validators() @@ -284,7 +390,7 @@ impl DutiesService { /// It is possible that multiple validators have an identical proposal slot, however that is /// likely the result of heavy forking (lol) or inconsistent beacon node connections. pub fn block_proposers(&self, slot: Slot) -> HashSet { - let epoch = slot.epoch(E::slots_per_epoch()); + let epoch = slot.epoch(S::E::slots_per_epoch()); // Only collect validators that are considered safe in terms of doppelganger protection. let signing_pubkeys: HashSet<_> = self @@ -309,7 +415,7 @@ impl DutiesService { /// Returns all `ValidatorDuty` for the given `slot`. pub fn attesters(&self, slot: Slot) -> Vec { - let epoch = slot.epoch(E::slots_per_epoch()); + let epoch = slot.epoch(S::E::slots_per_epoch()); // Only collect validators that are considered safe in terms of doppelganger protection. let signing_pubkeys: HashSet<_> = self @@ -347,15 +453,15 @@ impl DutiesService { /// process every slot, which has the chance of creating a theoretically unlimited backlog of tasks. /// It was a conscious decision to choose to drop tasks on an overloaded/latent system rather than /// overload it even more. -pub fn start_update_service( - core_duties_service: Arc>, +pub fn start_update_service( + core_duties_service: Arc>, mut block_service_tx: Sender, ) { /* * Spawn the task which updates the map of pubkey to validator index. */ let duties_service = core_duties_service.clone(); - core_duties_service.context.executor.spawn( + core_duties_service.executor.spawn( async move { loop { // Run this poll before the wait, this should hopefully download all the indices @@ -378,7 +484,7 @@ pub fn start_update_service( * Spawn the task which keeps track of local block proposal duties. 
*/ let duties_service = core_duties_service.clone(); - core_duties_service.context.executor.spawn( + core_duties_service.executor.spawn( async move { loop { if let Some(duration) = duties_service.slot_clock.duration_to_next_slot() { @@ -411,7 +517,7 @@ pub fn start_update_service( * Spawn the task which keeps track of local attestation duties. */ let duties_service = core_duties_service.clone(); - core_duties_service.context.executor.spawn( + core_duties_service.executor.spawn( async move { loop { if let Some(duration) = duties_service.slot_clock.duration_to_next_slot() { @@ -436,7 +542,7 @@ pub fn start_update_service( // Spawn the task which keeps track of local sync committee duties. let duties_service = core_duties_service.clone(); - core_duties_service.context.executor.spawn( + core_duties_service.executor.spawn( async move { loop { if let Err(e) = poll_sync_committee_duties(&duties_service).await { @@ -466,8 +572,8 @@ pub fn start_update_service( /// Iterate through all the voting pubkeys in the `ValidatorStore` and attempt to learn any unknown /// validator indices. -async fn poll_validator_indices( - duties_service: &DutiesService, +async fn poll_validator_indices( + duties_service: &DutiesService, ) { let _timer = validator_metrics::start_timer_vec( &validator_metrics::DUTIES_SERVICE_TIMES, @@ -486,16 +592,14 @@ async fn poll_validator_indices( // This is on its own line to avoid some weirdness with locks and if statements. 
let is_known = duties_service .validator_store - .initialized_validators() - .read() - .get_index(&pubkey) + .validator_index(&pubkey) .is_some(); if !is_known { let current_slot_opt = duties_service.slot_clock.now(); if let Some(current_slot) = current_slot_opt { - let is_first_slot_of_epoch = current_slot % E::slots_per_epoch() == 0; + let is_first_slot_of_epoch = current_slot % S::E::slots_per_epoch() == 0; // Query an unknown validator later if it was queried within the last epoch, or if // the current slot is the first slot of an epoch. @@ -546,9 +650,7 @@ async fn poll_validator_indices( ); duties_service .validator_store - .initialized_validators() - .write() - .set_index(&pubkey, response.data.index); + .set_validator_index(&pubkey, response.data.index); duties_service .unknown_validator_next_poll_slots @@ -559,7 +661,7 @@ async fn poll_validator_indices( // the beacon chain. Ok(None) => { if let Some(current_slot) = current_slot_opt { - let next_poll_slot = current_slot.saturating_add(E::slots_per_epoch()); + let next_poll_slot = current_slot.saturating_add(S::E::slots_per_epoch()); duties_service .unknown_validator_next_poll_slots .write() @@ -590,9 +692,9 @@ async fn poll_validator_indices( /// 2. As above, but for the next-epoch. /// 3. Push out any attestation subnet subscriptions to the BN. /// 4. Prune old entries from `duties_service.attesters`. 
-async fn poll_beacon_attesters( - duties_service: &Arc>, -) -> Result<(), Error> { +async fn poll_beacon_attesters( + duties_service: &Arc>, +) -> Result<(), Error> { let current_epoch_timer = validator_metrics::start_timer_vec( &validator_metrics::DUTIES_SERVICE_TIMES, &[validator_metrics::UPDATE_ATTESTERS_CURRENT_EPOCH], @@ -602,7 +704,7 @@ async fn poll_beacon_attesters( .slot_clock .now() .ok_or(Error::UnableToReadSlotClock)?; - let current_epoch = current_slot.epoch(E::slots_per_epoch()); + let current_epoch = current_slot.epoch(S::E::slots_per_epoch()); let next_epoch = current_epoch + 1; // Collect *all* pubkeys, even those undergoing doppelganger protection. @@ -616,10 +718,8 @@ async fn poll_beacon_attesters( let local_indices = { let mut local_indices = Vec::with_capacity(local_pubkeys.len()); - let vals_ref = duties_service.validator_store.initialized_validators(); - let vals = vals_ref.read(); for &pubkey in &local_pubkeys { - if let Some(validator_index) = vals.get_index(&pubkey) { + if let Some(validator_index) = duties_service.validator_store.validator_index(&pubkey) { local_indices.push(validator_index) } } @@ -643,7 +743,7 @@ async fn poll_beacon_attesters( ) } - update_per_validator_duty_metrics::(duties_service, current_epoch, current_slot); + update_per_validator_duty_metrics(duties_service, current_epoch, current_slot); drop(current_epoch_timer); let next_epoch_timer = validator_metrics::start_timer_vec( @@ -664,7 +764,7 @@ async fn poll_beacon_attesters( ) } - update_per_validator_duty_metrics::(duties_service, next_epoch, current_slot); + update_per_validator_duty_metrics(duties_service, next_epoch, current_slot); drop(next_epoch_timer); let subscriptions_timer = validator_metrics::start_timer_vec( @@ -685,7 +785,7 @@ async fn poll_beacon_attesters( * std::cmp::max( 1, local_pubkeys.len() * ATTESTATION_SUBSCRIPTION_OFFSETS.len() - / E::slots_per_epoch() as usize, + / S::E::slots_per_epoch() as usize, ) / overallocation_denominator; let mut 
subscriptions = Vec::with_capacity(num_expected_subscriptions); @@ -781,12 +881,12 @@ async fn poll_beacon_attesters( /// For the given `local_indices` and `local_pubkeys`, download the duties for the given `epoch` and /// store them in `duties_service.attesters`. -async fn poll_beacon_attesters_for_epoch( - duties_service: &Arc>, +async fn poll_beacon_attesters_for_epoch( + duties_service: &Arc>, epoch: Epoch, local_indices: &[u64], local_pubkeys: &HashSet, -) -> Result<(), Error> { +) -> Result<(), Error> { // No need to bother the BN if we don't have any validators. if local_indices.is_empty() { debug!( @@ -930,7 +1030,7 @@ async fn poll_beacon_attesters_for_epoch( // Spawn the background task to compute selection proofs. let subservice = duties_service.clone(); - duties_service.context.executor.spawn( + duties_service.executor.spawn( async move { fill_in_selection_proofs(subservice, new_duties, dependent_root).await; }, @@ -941,8 +1041,8 @@ async fn poll_beacon_attesters_for_epoch( } /// Get a filtered list of local validators for which we don't already know their duties for that epoch -fn get_uninitialized_validators( - duties_service: &Arc>, +fn get_uninitialized_validators( + duties_service: &Arc>, epoch: &Epoch, local_pubkeys: &HashSet, ) -> Vec { @@ -958,8 +1058,8 @@ fn get_uninitialized_validators( .collect::>() } -fn update_per_validator_duty_metrics( - duties_service: &Arc>, +fn update_per_validator_duty_metrics( + duties_service: &Arc>, epoch: Epoch, current_slot: Slot, ) { @@ -974,14 +1074,14 @@ fn update_per_validator_duty_metrics( get_int_gauge(&ATTESTATION_DUTY, &[&validator_index.to_string()]) { let existing_slot = Slot::new(existing_slot_gauge.get() as u64); - let existing_epoch = existing_slot.epoch(E::slots_per_epoch()); + let existing_epoch = existing_slot.epoch(S::E::slots_per_epoch()); // First condition ensures that we switch to the next epoch duty slot // once the current epoch duty slot passes. 
// Second condition is to ensure that next epoch duties don't override // current epoch duties. if existing_slot < current_slot - || (duty_slot.epoch(E::slots_per_epoch()) <= existing_epoch + || (duty_slot.epoch(S::E::slots_per_epoch()) <= existing_epoch && duty_slot > current_slot && duty_slot != existing_slot) { @@ -999,11 +1099,11 @@ fn update_per_validator_duty_metrics( } } -async fn post_validator_duties_attester( - duties_service: &Arc>, +async fn post_validator_duties_attester( + duties_service: &Arc>, epoch: Epoch, validator_indices: &[u64], -) -> Result>, Error> { +) -> Result>, Error> { duties_service .beacon_nodes .first_success(|beacon_node| async move { @@ -1023,8 +1123,8 @@ async fn post_validator_duties_attester( /// /// Duties are computed in batches each slot. If a re-org is detected then the process will /// terminate early as it is assumed the selection proofs from `duties` are no longer relevant. -async fn fill_in_selection_proofs( - duties_service: Arc>, +async fn fill_in_selection_proofs( + duties_service: Arc>, duties: Vec, dependent_root: Hash256, ) { @@ -1075,7 +1175,7 @@ async fn fill_in_selection_proofs( .then(|duty| async { let opt_selection_proof = make_selection_proof( &duty, - &duties_service.validator_store, + duties_service.validator_store.as_ref(), &duties_service.spec, ) .await?; @@ -1114,7 +1214,7 @@ async fn fill_in_selection_proofs( }; let attester_map = attesters.entry(duty.pubkey).or_default(); - let epoch = duty.slot.epoch(E::slots_per_epoch()); + let epoch = duty.slot.epoch(S::E::slots_per_epoch()); match attester_map.entry(epoch) { hash_map::Entry::Occupied(mut entry) => { // No need to update duties for which no proof was computed. @@ -1191,10 +1291,10 @@ async fn fill_in_selection_proofs( /// through the slow path every time. I.e., the proposal will only happen after we've been able to /// download and process the duties from the BN. This means it is very important to ensure this /// function is as fast as possible. 
-async fn poll_beacon_proposers( - duties_service: &DutiesService, +async fn poll_beacon_proposers( + duties_service: &DutiesService, block_service_tx: &mut Sender, -) -> Result<(), Error> { +) -> Result<(), Error> { let _timer = validator_metrics::start_timer_vec( &validator_metrics::DUTIES_SERVICE_TIMES, &[validator_metrics::UPDATE_PROPOSERS], @@ -1204,17 +1304,17 @@ async fn poll_beacon_proposers( .slot_clock .now() .ok_or(Error::UnableToReadSlotClock)?; - let current_epoch = current_slot.epoch(E::slots_per_epoch()); + let current_epoch = current_slot.epoch(S::E::slots_per_epoch()); // Notify the block proposal service for any proposals that we have in our cache. // // See the function-level documentation for more information. let initial_block_proposers = duties_service.block_proposers(current_slot); - notify_block_production_service( + notify_block_production_service::( current_slot, &initial_block_proposers, block_service_tx, - &duties_service.validator_store, + duties_service.validator_store.as_ref(), ) .await; @@ -1296,11 +1396,11 @@ async fn poll_beacon_proposers( // // See the function-level documentation for more reasoning about this behaviour. if !additional_block_producers.is_empty() { - notify_block_production_service( + notify_block_production_service::( current_slot, &additional_block_producers, block_service_tx, - &duties_service.validator_store, + duties_service.validator_store.as_ref(), ) .await; debug!( @@ -1321,11 +1421,11 @@ async fn poll_beacon_proposers( } /// Notify the block service if it should produce a block. 
-async fn notify_block_production_service( +async fn notify_block_production_service( current_slot: Slot, block_proposers: &HashSet, block_service_tx: &mut Sender, - validator_store: &ValidatorStore, + validator_store: &S, ) { let non_doppelganger_proposers = block_proposers .iter() diff --git a/validator_client/validator_services/src/preparation_service.rs b/validator_client/validator_services/src/preparation_service.rs index 3367f2d6ca..b59e3266dc 100644 --- a/validator_client/validator_services/src/preparation_service.rs +++ b/validator_client/validator_services/src/preparation_service.rs @@ -1,7 +1,5 @@ use beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; use bls::PublicKeyBytes; -use doppelganger_service::DoppelgangerStatus; -use environment::RuntimeContext; use parking_lot::RwLock; use slot_clock::SlotClock; use std::collections::HashMap; @@ -9,13 +7,16 @@ use std::hash::Hash; use std::ops::Deref; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; +use task_executor::TaskExecutor; use tokio::time::{sleep, Duration}; use tracing::{debug, error, info, warn}; use types::{ Address, ChainSpec, EthSpec, ProposerPreparationData, SignedValidatorRegistrationData, ValidatorRegistrationData, }; -use validator_store::{Error as ValidatorStoreError, ProposalData, ValidatorStore}; +use validator_store::{ + DoppelgangerStatus, Error as ValidatorStoreError, ProposalData, ValidatorStore, +}; /// Number of epochs before the Bellatrix hard fork to begin posting proposer preparations. const PROPOSER_PREPARATION_LOOKAHEAD_EPOCHS: u64 = 2; @@ -25,28 +26,28 @@ const EPOCHS_PER_VALIDATOR_REGISTRATION_SUBMISSION: u64 = 1; /// Builds an `PreparationService`. 
#[derive(Default)] -pub struct PreparationServiceBuilder { - validator_store: Option>>, +pub struct PreparationServiceBuilder { + validator_store: Option>, slot_clock: Option, - beacon_nodes: Option>>, - context: Option>, + beacon_nodes: Option>>, + executor: Option, builder_registration_timestamp_override: Option, validator_registration_batch_size: Option, } -impl PreparationServiceBuilder { +impl PreparationServiceBuilder { pub fn new() -> Self { Self { validator_store: None, slot_clock: None, beacon_nodes: None, - context: None, + executor: None, builder_registration_timestamp_override: None, validator_registration_batch_size: None, } } - pub fn validator_store(mut self, store: Arc>) -> Self { + pub fn validator_store(mut self, store: Arc) -> Self { self.validator_store = Some(store); self } @@ -56,13 +57,13 @@ impl PreparationServiceBuilder { self } - pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { + pub fn beacon_nodes(mut self, beacon_nodes: Arc>) -> Self { self.beacon_nodes = Some(beacon_nodes); self } - pub fn runtime_context(mut self, context: RuntimeContext) -> Self { - self.context = Some(context); + pub fn executor(mut self, executor: TaskExecutor) -> Self { + self.executor = Some(executor); self } @@ -82,7 +83,7 @@ impl PreparationServiceBuilder { self } - pub fn build(self) -> Result, String> { + pub fn build(self) -> Result, String> { Ok(PreparationService { inner: Arc::new(Inner { validator_store: self @@ -94,9 +95,9 @@ impl PreparationServiceBuilder { beacon_nodes: self .beacon_nodes .ok_or("Cannot build PreparationService without beacon_nodes")?, - context: self - .context - .ok_or("Cannot build PreparationService without runtime_context")?, + executor: self + .executor + .ok_or("Cannot build PreparationService without executor")?, builder_registration_timestamp_override: self .builder_registration_timestamp_override, validator_registration_batch_size: self.validator_registration_batch_size.ok_or( @@ -109,11 +110,11 @@ impl 
PreparationServiceBuilder { } /// Helper to minimise `Arc` usage. -pub struct Inner { - validator_store: Arc>, +pub struct Inner { + validator_store: Arc, slot_clock: T, - beacon_nodes: Arc>, - context: RuntimeContext, + beacon_nodes: Arc>, + executor: TaskExecutor, builder_registration_timestamp_override: Option, // Used to track unpublished validator registration changes. validator_registration_cache: @@ -145,11 +146,11 @@ impl From for ValidatorRegistrationKey { } /// Attempts to produce proposer preparations for all known validators at the beginning of each epoch. -pub struct PreparationService { - inner: Arc>, +pub struct PreparationService { + inner: Arc>, } -impl Clone for PreparationService { +impl Clone for PreparationService { fn clone(&self) -> Self { Self { inner: self.inner.clone(), @@ -157,15 +158,15 @@ impl Clone for PreparationService { } } -impl Deref for PreparationService { - type Target = Inner; +impl Deref for PreparationService { + type Target = Inner; fn deref(&self) -> &Self::Target { self.inner.deref() } } -impl PreparationService { +impl PreparationService { pub fn start_update_service(self, spec: &ChainSpec) -> Result<(), String> { self.clone().start_validator_registration_service(spec)?; self.start_proposer_prepare_service(spec) @@ -176,7 +177,7 @@ impl PreparationService { let slot_duration = Duration::from_secs(spec.seconds_per_slot); info!("Proposer preparation service started"); - let executor = self.context.executor.clone(); + let executor = self.executor.clone(); let spec = spec.clone(); let interval_fut = async move { @@ -215,7 +216,7 @@ impl PreparationService { let spec = spec.clone(); let slot_duration = Duration::from_secs(spec.seconds_per_slot); - let executor = self.context.executor.clone(); + let executor = self.executor.clone(); let validator_registration_fut = async move { loop { @@ -243,10 +244,9 @@ impl PreparationService { /// This avoids spamming the BN with preparations before the Bellatrix fork epoch, which may /// 
cause errors if it doesn't support the preparation API. fn should_publish_at_current_slot(&self, spec: &ChainSpec) -> bool { - let current_epoch = self - .slot_clock - .now() - .map_or(E::genesis_epoch(), |slot| slot.epoch(E::slots_per_epoch())); + let current_epoch = self.slot_clock.now().map_or(S::E::genesis_epoch(), |slot| { + slot.epoch(S::E::slots_per_epoch()) + }); spec.bellatrix_fork_epoch.is_some_and(|fork_epoch| { current_epoch + PROPOSER_PREPARATION_LOOKAHEAD_EPOCHS >= fork_epoch }) @@ -367,7 +367,8 @@ impl PreparationService { // Check if any have changed or it's been `EPOCHS_PER_VALIDATOR_REGISTRATION_SUBMISSION`. if let Some(slot) = self.slot_clock.now() { - if slot % (E::slots_per_epoch() * EPOCHS_PER_VALIDATOR_REGISTRATION_SUBMISSION) == 0 { + if slot % (S::E::slots_per_epoch() * EPOCHS_PER_VALIDATOR_REGISTRATION_SUBMISSION) == 0 + { self.publish_validator_registration_data(registration_keys) .await?; } else if !changed_keys.is_empty() { diff --git a/validator_client/validator_services/src/sync.rs b/validator_client/validator_services/src/sync.rs index 5151633514..c13b70db80 100644 --- a/validator_client/validator_services/src/sync.rs +++ b/validator_client/validator_services/src/sync.rs @@ -1,15 +1,13 @@ use crate::duties_service::{DutiesService, Error}; -use doppelganger_service::DoppelgangerStatus; use futures::future::join_all; use logging::crit; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; use std::collections::{HashMap, HashSet}; -use std::marker::PhantomData; use std::sync::Arc; use tracing::{debug, info, warn}; use types::{ChainSpec, EthSpec, PublicKeyBytes, Slot, SyncDuty, SyncSelectionProof, SyncSubnetId}; -use validator_store::Error as ValidatorStoreError; +use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, ValidatorStore}; /// Number of epochs in advance to compute selection proofs when not in `distributed` mode. 
pub const AGGREGATION_PRE_COMPUTE_EPOCHS: u64 = 2; @@ -28,12 +26,11 @@ pub const AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED: u64 = 1; /// 2. One-at-a-time locking. For the innermost locks on the aggregator duties, all of the functions /// in this file take care to only lock one validator at a time. We never hold a lock while /// trying to obtain another one (hence no lock ordering issues). -pub struct SyncDutiesMap { +pub struct SyncDutiesMap { /// Map from sync committee period to duties for members of that sync committee. committees: RwLock>, /// Whether we are in `distributed` mode and using reduced lookahead for aggregate pre-compute. distributed: bool, - _phantom: PhantomData, } /// Duties for a single sync committee period. @@ -81,12 +78,11 @@ pub struct SlotDuties { pub aggregators: HashMap>, } -impl SyncDutiesMap { +impl SyncDutiesMap { pub fn new(distributed: bool) -> Self { Self { committees: RwLock::new(HashMap::new()), distributed, - _phantom: PhantomData, } } @@ -104,7 +100,7 @@ impl SyncDutiesMap { } /// Number of slots in advance to compute selection proofs - fn aggregation_pre_compute_slots(&self) -> u64 { + fn aggregation_pre_compute_slots(&self) -> u64 { if self.distributed { AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED } else { @@ -117,7 +113,7 @@ impl SyncDutiesMap { /// Return the slot up to which proofs should be pre-computed, as well as a vec of /// `(previous_pre_compute_slot, sync_duty)` pairs for all validators which need to have proofs /// computed. See `fill_in_aggregation_proofs` for the actual calculation. 
- fn prepare_for_aggregator_pre_compute( + fn prepare_for_aggregator_pre_compute( &self, committee_period: u64, current_slot: Slot, @@ -127,7 +123,7 @@ impl SyncDutiesMap { current_slot, first_slot_of_period::(committee_period, spec), ); - let pre_compute_lookahead_slots = self.aggregation_pre_compute_slots(); + let pre_compute_lookahead_slots = self.aggregation_pre_compute_slots::(); let pre_compute_slot = std::cmp::min( current_slot + pre_compute_lookahead_slots, last_slot_of_period::(committee_period, spec), @@ -187,7 +183,7 @@ impl SyncDutiesMap { /// Get duties for all validators for the given `wall_clock_slot`. /// /// This is the entry-point for the sync committee service. - pub fn get_duties_for_slot( + pub fn get_duties_for_slot( &self, wall_clock_slot: Slot, spec: &ChainSpec, @@ -284,16 +280,16 @@ fn last_slot_of_period(sync_committee_period: u64, spec: &ChainSpec) first_slot_of_period::(sync_committee_period + 1, spec) - 1 } -pub async fn poll_sync_committee_duties( - duties_service: &Arc>, -) -> Result<(), Error> { +pub async fn poll_sync_committee_duties( + duties_service: &Arc>, +) -> Result<(), Error> { let sync_duties = &duties_service.sync_duties; let spec = &duties_service.spec; let current_slot = duties_service .slot_clock .now() .ok_or(Error::UnableToReadSlotClock)?; - let current_epoch = current_slot.epoch(E::slots_per_epoch()); + let current_epoch = current_slot.epoch(S::E::slots_per_epoch()); // If the Altair fork is yet to be activated, do not attempt to poll for duties. 
if spec @@ -317,10 +313,8 @@ pub async fn poll_sync_committee_duties( let local_indices = { let mut local_indices = Vec::with_capacity(local_pubkeys.len()); - let vals_ref = duties_service.validator_store.initialized_validators(); - let vals = vals_ref.read(); for &pubkey in &local_pubkeys { - if let Some(validator_index) = vals.get_index(&pubkey) { + if let Some(validator_index) = duties_service.validator_store.validator_index(&pubkey) { local_indices.push(validator_index) } } @@ -342,11 +336,15 @@ pub async fn poll_sync_committee_duties( // Pre-compute aggregator selection proofs for the current period. let (current_pre_compute_slot, new_pre_compute_duties) = sync_duties - .prepare_for_aggregator_pre_compute(current_sync_committee_period, current_slot, spec); + .prepare_for_aggregator_pre_compute::( + current_sync_committee_period, + current_slot, + spec, + ); if !new_pre_compute_duties.is_empty() { let sub_duties_service = duties_service.clone(); - duties_service.context.executor.spawn( + duties_service.executor.spawn( async move { fill_in_aggregation_proofs( sub_duties_service, @@ -379,18 +377,22 @@ pub async fn poll_sync_committee_duties( } // Pre-compute aggregator selection proofs for the next period. - let aggregate_pre_compute_lookahead_slots = sync_duties.aggregation_pre_compute_slots(); + let aggregate_pre_compute_lookahead_slots = sync_duties.aggregation_pre_compute_slots::(); if (current_slot + aggregate_pre_compute_lookahead_slots) - .epoch(E::slots_per_epoch()) + .epoch(S::E::slots_per_epoch()) .sync_committee_period(spec)? 
== next_sync_committee_period { let (pre_compute_slot, new_pre_compute_duties) = sync_duties - .prepare_for_aggregator_pre_compute(next_sync_committee_period, current_slot, spec); + .prepare_for_aggregator_pre_compute::( + next_sync_committee_period, + current_slot, + spec, + ); if !new_pre_compute_duties.is_empty() { let sub_duties_service = duties_service.clone(); - duties_service.context.executor.spawn( + duties_service.executor.spawn( async move { fill_in_aggregation_proofs( sub_duties_service, @@ -409,11 +411,11 @@ pub async fn poll_sync_committee_duties( Ok(()) } -pub async fn poll_sync_committee_duties_for_period( - duties_service: &Arc>, +pub async fn poll_sync_committee_duties_for_period( + duties_service: &Arc>, local_indices: &[u64], sync_committee_period: u64, -) -> Result<(), Error> { +) -> Result<(), Error> { let spec = &duties_service.spec; // no local validators don't need to poll for sync committee @@ -496,8 +498,8 @@ pub async fn poll_sync_committee_duties_for_period( - duties_service: Arc>, +pub async fn fill_in_aggregation_proofs( + duties_service: Arc>, pre_compute_duties: &[(Slot, SyncDuty)], sync_committee_period: u64, current_slot: Slot, @@ -519,7 +521,7 @@ pub async fn fill_in_aggregation_proofs( continue; } - let subnet_ids = match duty.subnet_ids::() { + let subnet_ids = match duty.subnet_ids::() { Ok(subnet_ids) => subnet_ids, Err(e) => { crit!( @@ -564,7 +566,7 @@ pub async fn fill_in_aggregation_proofs( } }; - match proof.is_aggregator::() { + match proof.is_aggregator::() { Ok(true) => { debug!( validator_index = duty.validator_index, diff --git a/validator_client/validator_services/src/sync_committee_service.rs b/validator_client/validator_services/src/sync_committee_service.rs index d99c0d3107..be9e2918a4 100644 --- a/validator_client/validator_services/src/sync_committee_service.rs +++ b/validator_client/validator_services/src/sync_committee_service.rs @@ -1,6 +1,5 @@ use crate::duties_service::DutiesService; use 
beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; -use environment::RuntimeContext; use eth2::types::BlockId; use futures::future::join_all; use futures::future::FutureExt; @@ -10,6 +9,7 @@ use std::collections::HashMap; use std::ops::Deref; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use task_executor::TaskExecutor; use tokio::time::{sleep, sleep_until, Duration, Instant}; use tracing::{debug, error, info, trace, warn}; use types::{ @@ -20,11 +20,11 @@ use validator_store::{Error as ValidatorStoreError, ValidatorStore}; pub const SUBSCRIPTION_LOOKAHEAD_EPOCHS: u64 = 4; -pub struct SyncCommitteeService { - inner: Arc>, +pub struct SyncCommitteeService { + inner: Arc>, } -impl Clone for SyncCommitteeService { +impl Clone for SyncCommitteeService { fn clone(&self) -> Self { Self { inner: self.inner.clone(), @@ -32,33 +32,33 @@ impl Clone for SyncCommitteeService { } } -impl Deref for SyncCommitteeService { - type Target = Inner; +impl Deref for SyncCommitteeService { + type Target = Inner; fn deref(&self) -> &Self::Target { self.inner.deref() } } -pub struct Inner { - duties_service: Arc>, - validator_store: Arc>, +pub struct Inner { + duties_service: Arc>, + validator_store: Arc, slot_clock: T, - beacon_nodes: Arc>, - context: RuntimeContext, + beacon_nodes: Arc>, + executor: TaskExecutor, /// Boolean to track whether the service has posted subscriptions to the BN at least once. /// /// This acts as a latch that fires once upon start-up, and then never again. 
first_subscription_done: AtomicBool, } -impl SyncCommitteeService { +impl SyncCommitteeService { pub fn new( - duties_service: Arc>, - validator_store: Arc>, + duties_service: Arc>, + validator_store: Arc, slot_clock: T, - beacon_nodes: Arc>, - context: RuntimeContext, + beacon_nodes: Arc>, + executor: TaskExecutor, ) -> Self { Self { inner: Arc::new(Inner { @@ -66,7 +66,7 @@ impl SyncCommitteeService { validator_store, slot_clock, beacon_nodes, - context, + executor, first_subscription_done: AtomicBool::new(false), }), } @@ -80,7 +80,7 @@ impl SyncCommitteeService { .spec .altair_fork_epoch .and_then(|fork_epoch| { - let current_epoch = self.slot_clock.now()?.epoch(E::slots_per_epoch()); + let current_epoch = self.slot_clock.now()?.epoch(S::E::slots_per_epoch()); Some(current_epoch >= fork_epoch) }) .unwrap_or(false) @@ -103,7 +103,7 @@ impl SyncCommitteeService { "Sync committee service started" ); - let executor = self.context.executor.clone(); + let executor = self.executor.clone(); let interval_fut = async move { loop { @@ -156,7 +156,7 @@ impl SyncCommitteeService { let Some(slot_duties) = self .duties_service .sync_duties - .get_duties_for_slot(slot, &self.duties_service.spec) + .get_duties_for_slot::(slot, &self.duties_service.spec) else { debug!("No duties known for slot {}", slot); return Ok(()); @@ -202,7 +202,7 @@ impl SyncCommitteeService { // Spawn one task to publish all of the sync committee signatures. 
let validator_duties = slot_duties.duties; let service = self.clone(); - self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { service .publish_sync_committee_signatures(slot, block_root, validator_duties) @@ -214,7 +214,7 @@ impl SyncCommitteeService { let aggregators = slot_duties.aggregators; let service = self.clone(); - self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { service .publish_sync_committee_aggregates( @@ -316,7 +316,7 @@ impl SyncCommitteeService { ) { for (subnet_id, subnet_aggregators) in aggregators { let service = self.clone(); - self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { service .publish_sync_committee_aggregate_for_subnet( @@ -354,7 +354,7 @@ impl SyncCommitteeService { }; beacon_node - .get_validator_sync_committee_contribution::(&sync_contribution_data) + .get_validator_sync_committee_contribution(&sync_contribution_data) .await }) .await @@ -440,7 +440,7 @@ impl SyncCommitteeService { fn spawn_subscription_tasks(&self) { let service = self.clone(); - self.inner.context.executor.spawn( + self.inner.executor.spawn( async move { service.publish_subscriptions().await.unwrap_or_else(|e| { error!( @@ -463,10 +463,10 @@ impl SyncCommitteeService { // At the start of every epoch during the current period, re-post the subscriptions // to the beacon node. This covers the case where the BN has forgotten the subscriptions // due to a restart, or where the VC has switched to a fallback BN. - let current_period = sync_period_of_slot::(slot, spec)?; + let current_period = sync_period_of_slot::(slot, spec)?; if !self.first_subscription_done.load(Ordering::Relaxed) - || slot.as_u64() % E::slots_per_epoch() == 0 + || slot.as_u64() % S::E::slots_per_epoch() == 0 { duty_slots.push((slot, current_period)); } @@ -474,9 +474,9 @@ impl SyncCommitteeService { // Near the end of the current period, push subscriptions for the next period to the // beacon node. 
We aggressively push every slot in the lead-up, as this is the main way // that we want to ensure that the BN is subscribed (well in advance). - let lookahead_slot = slot + SUBSCRIPTION_LOOKAHEAD_EPOCHS * E::slots_per_epoch(); + let lookahead_slot = slot + SUBSCRIPTION_LOOKAHEAD_EPOCHS * S::E::slots_per_epoch(); - let lookahead_period = sync_period_of_slot::(lookahead_slot, spec)?; + let lookahead_period = sync_period_of_slot::(lookahead_slot, spec)?; if lookahead_period > current_period { duty_slots.push((lookahead_slot, lookahead_period)); @@ -494,7 +494,7 @@ impl SyncCommitteeService { match self .duties_service .sync_duties - .get_duties_for_slot(duty_slot, spec) + .get_duties_for_slot::(duty_slot, spec) { Some(duties) => subscriptions.extend(subscriptions_from_sync_duties( duties.duties, diff --git a/validator_client/validator_store/Cargo.toml b/validator_client/validator_store/Cargo.toml index 1338c2a07e..91df9dc3ab 100644 --- a/validator_client/validator_store/Cargo.toml +++ b/validator_client/validator_store/Cargo.toml @@ -4,21 +4,6 @@ version = "0.1.0" edition = { workspace = true } authors = ["Sigma Prime "] -[lib] -name = "validator_store" -path = "src/lib.rs" - [dependencies] -account_utils = { workspace = true } -doppelganger_service = { workspace = true } -initialized_validators = { workspace = true } -logging = { workspace = true } -parking_lot = { workspace = true } -serde = { workspace = true } -signing_method = { workspace = true } slashing_protection = { workspace = true } -slot_clock = { workspace = true } -task_executor = { workspace = true } -tracing = { workspace = true } types = { workspace = true } -validator_metrics = { workspace = true } diff --git a/validator_client/validator_store/src/lib.rs b/validator_client/validator_store/src/lib.rs index 015b321d43..9de3a6d66a 100644 --- a/validator_client/validator_store/src/lib.rs +++ b/validator_client/validator_store/src/lib.rs @@ -1,31 +1,16 @@ -use 
account_utils::validator_definitions::{PasswordStorage, ValidatorDefinition}; -use doppelganger_service::{DoppelgangerService, DoppelgangerStatus, DoppelgangerValidatorStore}; -use initialized_validators::InitializedValidators; -use logging::crit; -use parking_lot::{Mutex, RwLock}; -use serde::{Deserialize, Serialize}; -use signing_method::{Error as SigningError, SignableMessage, SigningContext, SigningMethod}; -use slashing_protection::{ - interchange::Interchange, InterchangeError, NotSafe, Safe, SlashingDatabase, -}; -use slot_clock::SlotClock; -use std::marker::PhantomData; -use std::path::Path; -use std::sync::Arc; -use task_executor::TaskExecutor; -use tracing::{error, info, warn}; +use slashing_protection::NotSafe; +use std::fmt::Debug; +use std::future::Future; use types::{ - attestation::Error as AttestationError, graffiti::GraffitiString, AbstractExecPayload, Address, - AggregateAndProof, Attestation, BeaconBlock, BlindedPayload, ChainSpec, ContributionAndProof, - Domain, Epoch, EthSpec, Fork, Graffiti, Hash256, PublicKeyBytes, SelectionProof, Signature, - SignedAggregateAndProof, SignedBeaconBlock, SignedContributionAndProof, SignedRoot, - SignedValidatorRegistrationData, SignedVoluntaryExit, Slot, SyncAggregatorSelectionData, - SyncCommitteeContribution, SyncCommitteeMessage, SyncSelectionProof, SyncSubnetId, - ValidatorRegistrationData, VoluntaryExit, + Address, Attestation, AttestationError, BeaconBlock, BlindedBeaconBlock, Epoch, EthSpec, + Graffiti, Hash256, PublicKeyBytes, SelectionProof, Signature, SignedAggregateAndProof, + SignedBeaconBlock, SignedBlindedBeaconBlock, SignedContributionAndProof, + SignedValidatorRegistrationData, Slot, SyncCommitteeContribution, SyncCommitteeMessage, + SyncSelectionProof, SyncSubnetId, ValidatorRegistrationData, }; -#[derive(Debug, PartialEq)] -pub enum Error { +#[derive(Debug, PartialEq, Clone)] +pub enum Error { DoppelgangerProtected(PublicKeyBytes), UnknownToDoppelgangerService(PublicKeyBytes), 
UnknownPubkey(PublicKeyBytes), @@ -34,31 +19,15 @@ pub enum Error { GreaterThanCurrentSlot { slot: Slot, current_slot: Slot }, GreaterThanCurrentEpoch { epoch: Epoch, current_epoch: Epoch }, UnableToSignAttestation(AttestationError), - UnableToSign(SigningError), + SpecificError(T), } -impl From for Error { - fn from(e: SigningError) -> Self { - Error::UnableToSign(e) +impl From for Error { + fn from(e: T) -> Self { + Error::SpecificError(e) } } -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct Config { - /// Fallback fee recipient address. - pub fee_recipient: Option
, - /// Fallback gas limit. - pub gas_limit: Option, - /// Enable use of the blinded block endpoints during proposals. - pub builder_proposals: bool, - /// Enable slashing protection even while using web3signer keys. - pub enable_web3signer_slashing_protection: bool, - /// If true, Lighthouse will prefer builder proposals, if available. - pub prefer_builder_proposals: bool, - /// Specifies the boost factor, a percentage multiplier to apply to the builder's payload value. - pub builder_boost_factor: Option, -} - /// A helper struct, used for passing data from the validator store to services. pub struct ProposalData { pub validator_index: Option, @@ -67,185 +36,9 @@ pub struct ProposalData { pub builder_proposals: bool, } -/// Number of epochs of slashing protection history to keep. -/// -/// This acts as a maximum safe-guard against clock drift. -const SLASHING_PROTECTION_HISTORY_EPOCHS: u64 = 512; - -/// Currently used as the default gas limit in execution clients. -/// -/// https://ethresear.ch/t/on-increasing-the-block-gas-limit-technical-considerations-path-forward/21225. -pub const DEFAULT_GAS_LIMIT: u64 = 36_000_000; - -pub struct ValidatorStore { - validators: Arc>, - slashing_protection: SlashingDatabase, - slashing_protection_last_prune: Arc>, - genesis_validators_root: Hash256, - spec: Arc, - doppelganger_service: Option>, - slot_clock: T, - fee_recipient_process: Option
, - gas_limit: Option, - builder_proposals: bool, - enable_web3signer_slashing_protection: bool, - prefer_builder_proposals: bool, - builder_boost_factor: Option, - task_executor: TaskExecutor, - _phantom: PhantomData, -} - -impl DoppelgangerValidatorStore for ValidatorStore { - fn get_validator_index(&self, pubkey: &PublicKeyBytes) -> Option { - self.validator_index(pubkey) - } -} - -impl ValidatorStore { - // All arguments are different types. Making the fields `pub` is undesired. A builder seems - // unnecessary. - #[allow(clippy::too_many_arguments)] - pub fn new( - validators: InitializedValidators, - slashing_protection: SlashingDatabase, - genesis_validators_root: Hash256, - spec: Arc, - doppelganger_service: Option>, - slot_clock: T, - config: &Config, - task_executor: TaskExecutor, - ) -> Self { - Self { - validators: Arc::new(RwLock::new(validators)), - slashing_protection, - slashing_protection_last_prune: Arc::new(Mutex::new(Epoch::new(0))), - genesis_validators_root, - spec, - doppelganger_service, - slot_clock, - fee_recipient_process: config.fee_recipient, - gas_limit: config.gas_limit, - builder_proposals: config.builder_proposals, - enable_web3signer_slashing_protection: config.enable_web3signer_slashing_protection, - prefer_builder_proposals: config.prefer_builder_proposals, - builder_boost_factor: config.builder_boost_factor, - task_executor, - _phantom: PhantomData, - } - } - - /// Register all local validators in doppelganger protection to try and prevent instances of - /// duplicate validators operating on the network at the same time. - /// - /// This function has no effect if doppelganger protection is disabled. - pub fn register_all_in_doppelganger_protection_if_enabled(&self) -> Result<(), String> { - if let Some(doppelganger_service) = &self.doppelganger_service { - for pubkey in self.validators.read().iter_voting_pubkeys() { - doppelganger_service.register_new_validator::(*pubkey, &self.slot_clock)? 
- } - } - - Ok(()) - } - - /// Returns `true` if doppelganger protection is enabled, or else `false`. - pub fn doppelganger_protection_enabled(&self) -> bool { - self.doppelganger_service.is_some() - } - - pub fn initialized_validators(&self) -> Arc> { - self.validators.clone() - } - - /// Indicates if the `voting_public_key` exists in self and is enabled. - pub fn has_validator(&self, voting_public_key: &PublicKeyBytes) -> bool { - self.validators - .read() - .validator(voting_public_key) - .is_some() - } - - /// Insert a new validator to `self`, where the validator is represented by an EIP-2335 - /// keystore on the filesystem. - #[allow(clippy::too_many_arguments)] - pub async fn add_validator_keystore>( - &self, - voting_keystore_path: P, - password_storage: PasswordStorage, - enable: bool, - graffiti: Option, - suggested_fee_recipient: Option
, - gas_limit: Option, - builder_proposals: Option, - builder_boost_factor: Option, - prefer_builder_proposals: Option, - ) -> Result { - let mut validator_def = ValidatorDefinition::new_keystore_with_password( - voting_keystore_path, - password_storage, - graffiti, - suggested_fee_recipient, - gas_limit, - builder_proposals, - builder_boost_factor, - prefer_builder_proposals, - ) - .map_err(|e| format!("failed to create validator definitions: {:?}", e))?; - - validator_def.enabled = enable; - - self.add_validator(validator_def).await - } - - /// Insert a new validator to `self`. - /// - /// This function includes: - /// - /// - Adding the validator definition to the YAML file, saving it to the filesystem. - /// - Enabling the validator with the slashing protection database. - /// - If `enable == true`, starting to perform duties for the validator. - // FIXME: ignore this clippy lint until the validator store is refactored to use async locks - #[allow(clippy::await_holding_lock)] - pub async fn add_validator( - &self, - validator_def: ValidatorDefinition, - ) -> Result { - let validator_pubkey = validator_def.voting_public_key.compress(); - - self.slashing_protection - .register_validator(validator_pubkey) - .map_err(|e| format!("failed to register validator: {:?}", e))?; - - if let Some(doppelganger_service) = &self.doppelganger_service { - doppelganger_service - .register_new_validator::(validator_pubkey, &self.slot_clock)?; - } - - self.validators - .write() - .add_definition_replace_disabled(validator_def.clone()) - .await - .map_err(|e| format!("Unable to add definition: {:?}", e))?; - - Ok(validator_def) - } - - /// Returns `ProposalData` for the provided `pubkey` if it exists in `InitializedValidators`. - /// `ProposalData` fields include defaulting logic described in `get_fee_recipient_defaulting`, - /// `get_gas_limit_defaulting`, and `get_builder_proposals_defaulting`. 
- pub fn proposal_data(&self, pubkey: &PublicKeyBytes) -> Option { - self.validators - .read() - .validator(pubkey) - .map(|validator| ProposalData { - validator_index: validator.get_index(), - fee_recipient: self - .get_fee_recipient_defaulting(validator.get_suggested_fee_recipient()), - gas_limit: self.get_gas_limit_defaulting(validator.get_gas_limit()), - builder_proposals: self - .get_builder_proposals_defaulting(validator.get_builder_proposals()), - }) - } +pub trait ValidatorStore: Send + Sync { + type Error: Debug + Send + Sync; + type E: EthSpec; /// Attempts to resolve the pubkey to a validator index. /// @@ -253,9 +46,7 @@ impl ValidatorStore { /// /// - Unknown. /// - Known, but with an unknown index. - pub fn validator_index(&self, pubkey: &PublicKeyBytes) -> Option { - self.validators.read().get_index(pubkey) - } + fn validator_index(&self, pubkey: &PublicKeyBytes) -> Option; /// Returns all voting pubkeys for all enabled validators. /// @@ -266,255 +57,25 @@ impl ValidatorStore { /// protection and are safe-enough to sign messages. /// - `DoppelgangerStatus::ignored`: returns all the pubkeys from `only_safe` *plus* those still /// undergoing protection. This is useful for collecting duties or other non-signing tasks. - #[allow(clippy::needless_collect)] // Collect is required to avoid holding a lock. - pub fn voting_pubkeys(&self, filter_func: F) -> I + fn voting_pubkeys(&self, filter_func: F) -> I where I: FromIterator, - F: Fn(DoppelgangerStatus) -> Option, - { - // Collect all the pubkeys first to avoid interleaving locks on `self.validators` and - // `self.doppelganger_service()`. - let pubkeys = self - .validators - .read() - .iter_voting_pubkeys() - .cloned() - .collect::>(); - - pubkeys - .into_iter() - .map(|pubkey| { - self.doppelganger_service - .as_ref() - .map(|doppelganger_service| doppelganger_service.validator_status(pubkey)) - // Allow signing on all pubkeys if doppelganger protection is disabled. 
- .unwrap_or_else(|| DoppelgangerStatus::SigningEnabled(pubkey)) - }) - .filter_map(filter_func) - .collect() - } - - /// Returns doppelganger statuses for all enabled validators. - #[allow(clippy::needless_collect)] // Collect is required to avoid holding a lock. - pub fn doppelganger_statuses(&self) -> Vec { - // Collect all the pubkeys first to avoid interleaving locks on `self.validators` and - // `self.doppelganger_service`. - let pubkeys = self - .validators - .read() - .iter_voting_pubkeys() - .cloned() - .collect::>(); - - pubkeys - .into_iter() - .map(|pubkey| { - self.doppelganger_service - .as_ref() - .map(|doppelganger_service| doppelganger_service.validator_status(pubkey)) - // Allow signing on all pubkeys if doppelganger protection is disabled. - .unwrap_or_else(|| DoppelgangerStatus::SigningEnabled(pubkey)) - }) - .collect() - } + F: Fn(DoppelgangerStatus) -> Option; /// Check if the `validator_pubkey` is permitted by the doppleganger protection to sign /// messages. - pub fn doppelganger_protection_allows_signing(&self, validator_pubkey: PublicKeyBytes) -> bool { - self.doppelganger_service - .as_ref() - // If there's no doppelganger service then we assume it is purposefully disabled and - // declare that all keys are safe with regard to it. - .is_none_or(|doppelganger_service| { - doppelganger_service - .validator_status(validator_pubkey) - .only_safe() - .is_some() - }) - } + fn doppelganger_protection_allows_signing(&self, validator_pubkey: PublicKeyBytes) -> bool; - pub fn num_voting_validators(&self) -> usize { - self.validators.read().num_enabled() - } - - fn fork(&self, epoch: Epoch) -> Fork { - self.spec.fork_at_epoch(epoch) - } - - /// Returns a `SigningMethod` for `validator_pubkey` *only if* that validator is considered safe - /// by doppelganger protection. 
- fn doppelganger_checked_signing_method( - &self, - validator_pubkey: PublicKeyBytes, - ) -> Result, Error> { - if self.doppelganger_protection_allows_signing(validator_pubkey) { - self.validators - .read() - .signing_method(&validator_pubkey) - .ok_or(Error::UnknownPubkey(validator_pubkey)) - } else { - Err(Error::DoppelgangerProtected(validator_pubkey)) - } - } - - /// Returns a `SigningMethod` for `validator_pubkey` regardless of that validators doppelganger - /// protection status. - /// - /// ## Warning - /// - /// This method should only be used for signing non-slashable messages. - fn doppelganger_bypassed_signing_method( - &self, - validator_pubkey: PublicKeyBytes, - ) -> Result, Error> { - self.validators - .read() - .signing_method(&validator_pubkey) - .ok_or(Error::UnknownPubkey(validator_pubkey)) - } - - fn signing_context(&self, domain: Domain, signing_epoch: Epoch) -> SigningContext { - if domain == Domain::VoluntaryExit { - if self.spec.fork_name_at_epoch(signing_epoch).deneb_enabled() { - // EIP-7044 - SigningContext { - domain, - epoch: signing_epoch, - fork: Fork { - previous_version: self.spec.capella_fork_version, - current_version: self.spec.capella_fork_version, - epoch: signing_epoch, - }, - genesis_validators_root: self.genesis_validators_root, - } - } else { - SigningContext { - domain, - epoch: signing_epoch, - fork: self.fork(signing_epoch), - genesis_validators_root: self.genesis_validators_root, - } - } - } else { - SigningContext { - domain, - epoch: signing_epoch, - fork: self.fork(signing_epoch), - genesis_validators_root: self.genesis_validators_root, - } - } - } - - pub async fn randao_reveal( - &self, - validator_pubkey: PublicKeyBytes, - signing_epoch: Epoch, - ) -> Result { - let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; - let signing_context = self.signing_context(Domain::Randao, signing_epoch); - - let signature = signing_method - .get_signature::>( - 
SignableMessage::RandaoReveal(signing_epoch), - signing_context, - &self.spec, - &self.task_executor, - ) - .await?; - - Ok(signature) - } - - pub fn graffiti(&self, validator_pubkey: &PublicKeyBytes) -> Option { - self.validators.read().graffiti(validator_pubkey) - } + fn num_voting_validators(&self) -> usize; + fn graffiti(&self, validator_pubkey: &PublicKeyBytes) -> Option; /// Returns the fee recipient for the given public key. The priority order for fetching /// the fee recipient is: /// 1. validator_definitions.yml /// 2. process level fee recipient - pub fn get_fee_recipient(&self, validator_pubkey: &PublicKeyBytes) -> Option
{ - // If there is a `suggested_fee_recipient` in the validator definitions yaml - // file, use that value. - self.get_fee_recipient_defaulting(self.suggested_fee_recipient(validator_pubkey)) - } + fn get_fee_recipient(&self, validator_pubkey: &PublicKeyBytes) -> Option
; - pub fn get_fee_recipient_defaulting(&self, fee_recipient: Option
) -> Option
{ - // If there's nothing in the file, try the process-level default value. - fee_recipient.or(self.fee_recipient_process) - } - - /// Returns the suggested_fee_recipient from `validator_definitions.yml` if any. - /// This has been pulled into a private function so the read lock is dropped easily - fn suggested_fee_recipient(&self, validator_pubkey: &PublicKeyBytes) -> Option
{ - self.validators - .read() - .suggested_fee_recipient(validator_pubkey) - } - - /// Returns the gas limit for the given public key. The priority order for fetching - /// the gas limit is: - /// - /// 1. validator_definitions.yml - /// 2. process level gas limit - /// 3. `DEFAULT_GAS_LIMIT` - pub fn get_gas_limit(&self, validator_pubkey: &PublicKeyBytes) -> u64 { - self.get_gas_limit_defaulting(self.validators.read().gas_limit(validator_pubkey)) - } - - fn get_gas_limit_defaulting(&self, gas_limit: Option) -> u64 { - // If there is a `gas_limit` in the validator definitions yaml - // file, use that value. - gas_limit - // If there's nothing in the file, try the process-level default value. - .or(self.gas_limit) - // If there's no process-level default, use the `DEFAULT_GAS_LIMIT`. - .unwrap_or(DEFAULT_GAS_LIMIT) - } - - /// Returns a `bool` for the given public key that denotes whether this validator should use the - /// builder API. The priority order for fetching this value is: - /// - /// 1. validator_definitions.yml - /// 2. process level flag - pub fn get_builder_proposals(&self, validator_pubkey: &PublicKeyBytes) -> bool { - // If there is a `suggested_fee_recipient` in the validator definitions yaml - // file, use that value. - self.get_builder_proposals_defaulting( - self.validators.read().builder_proposals(validator_pubkey), - ) - } - - /// Returns a `u64` for the given public key that denotes the builder boost factor. The priority order for fetching this value is: - /// - /// 1. validator_definitions.yml - /// 2. process level flag - pub fn get_builder_boost_factor(&self, validator_pubkey: &PublicKeyBytes) -> Option { - self.validators - .read() - .builder_boost_factor(validator_pubkey) - .or(self.builder_boost_factor) - } - - /// Returns a `bool` for the given public key that denotes whether this validator should prefer a - /// builder payload. The priority order for fetching this value is: - /// - /// 1. validator_definitions.yml - /// 2. 
process level flag - pub fn get_prefer_builder_proposals(&self, validator_pubkey: &PublicKeyBytes) -> bool { - self.validators - .read() - .prefer_builder_proposals(validator_pubkey) - .unwrap_or(self.prefer_builder_proposals) - } - - fn get_builder_proposals_defaulting(&self, builder_proposals: Option) -> bool { - builder_proposals - // If there's nothing in the file, try the process-level default value. - .unwrap_or(self.builder_proposals) - } - - /// Translate the per validator `builder_proposals`, `builder_boost_factor` and + /// Translate the `builder_proposals`, `builder_boost_factor` and /// `prefer_builder_proposals` to a boost factor, if available. /// - If `prefer_builder_proposals` is true, set boost factor to `u64::MAX` to indicate a /// preference for builder payloads. @@ -522,576 +83,187 @@ impl ValidatorStore { /// - If `builder_proposals` is set to false, set boost factor to 0 to indicate a preference for /// local payloads. /// - Else return `None` to indicate no preference between builder and local payloads. - pub fn determine_validator_builder_boost_factor( - &self, - validator_pubkey: &PublicKeyBytes, - ) -> Option { - let validator_prefer_builder_proposals = self - .validators - .read() - .prefer_builder_proposals(validator_pubkey); + fn determine_builder_boost_factor(&self, validator_pubkey: &PublicKeyBytes) -> Option; - if matches!(validator_prefer_builder_proposals, Some(true)) { - return Some(u64::MAX); - } - - self.validators - .read() - .builder_boost_factor(validator_pubkey) - .or_else(|| { - if matches!( - self.validators.read().builder_proposals(validator_pubkey), - Some(false) - ) { - return Some(0); - } - None - }) - } - - /// Translate the process-wide `builder_proposals`, `builder_boost_factor` and - /// `prefer_builder_proposals` configurations to a boost factor. - /// - If `prefer_builder_proposals` is true, set boost factor to `u64::MAX` to indicate a - /// preference for builder payloads. 
- /// - If `builder_boost_factor` is a value other than None, return its value as the boost factor. - /// - If `builder_proposals` is set to false, set boost factor to 0 to indicate a preference for - /// local payloads. - /// - Else return `None` to indicate no preference between builder and local payloads. - pub fn determine_default_builder_boost_factor(&self) -> Option { - if self.prefer_builder_proposals { - return Some(u64::MAX); - } - self.builder_boost_factor.or({ - if !self.builder_proposals { - Some(0) - } else { - None - } - }) - } - - pub async fn sign_block>( + fn randao_reveal( &self, validator_pubkey: PublicKeyBytes, - block: BeaconBlock, + signing_epoch: Epoch, + ) -> impl Future>> + Send; + + fn set_validator_index(&self, validator_pubkey: &PublicKeyBytes, index: u64); + + fn sign_block( + &self, + validator_pubkey: PublicKeyBytes, + block: UnsignedBlock, current_slot: Slot, - ) -> Result, Error> { - // Make sure the block slot is not higher than the current slot to avoid potential attacks. - if block.slot() > current_slot { - warn!( - block_slot = block.slot().as_u64(), - current_slot = current_slot.as_u64(), - "Not signing block with slot greater than current slot" - ); - return Err(Error::GreaterThanCurrentSlot { - slot: block.slot(), - current_slot, - }); - } + ) -> impl Future, Error>> + Send; - let signing_epoch = block.epoch(); - let signing_context = self.signing_context(Domain::BeaconProposer, signing_epoch); - let domain_hash = signing_context.domain_hash(&self.spec); - - let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; - - // Check for slashing conditions. 
- let slashing_status = if signing_method - .requires_local_slashing_protection(self.enable_web3signer_slashing_protection) - { - self.slashing_protection.check_and_insert_block_proposal( - &validator_pubkey, - &block.block_header(), - domain_hash, - ) - } else { - Ok(Safe::Valid) - }; - - match slashing_status { - // We can safely sign this block without slashing. - Ok(Safe::Valid) => { - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_BLOCKS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - let signature = signing_method - .get_signature::( - SignableMessage::BeaconBlock(&block), - signing_context, - &self.spec, - &self.task_executor, - ) - .await?; - Ok(SignedBeaconBlock::from_block(block, signature)) - } - Ok(Safe::SameData) => { - warn!("Skipping signing of previously signed block"); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_BLOCKS_TOTAL, - &[validator_metrics::SAME_DATA], - ); - Err(Error::SameData) - } - Err(NotSafe::UnregisteredValidator(pk)) => { - warn!( - msg = "Carefully consider running with --init-slashing-protection (see --help)", - public_key = format!("{:?}", pk), - "Not signing block for unregistered validator" - ); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_BLOCKS_TOTAL, - &[validator_metrics::UNREGISTERED], - ); - Err(Error::Slashable(NotSafe::UnregisteredValidator(pk))) - } - Err(e) => { - crit!(error = format!("{:?}", e), "Not signing slashable block"); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_BLOCKS_TOTAL, - &[validator_metrics::SLASHABLE], - ); - Err(Error::Slashable(e)) - } - } - } - - pub async fn sign_attestation( + fn sign_attestation( &self, validator_pubkey: PublicKeyBytes, validator_committee_position: usize, - attestation: &mut Attestation, + attestation: &mut Attestation, current_epoch: Epoch, - ) -> Result<(), Error> { - // Make sure the target epoch is not higher than the current epoch to avoid potential attacks. 
- if attestation.data().target.epoch > current_epoch { - return Err(Error::GreaterThanCurrentEpoch { - epoch: attestation.data().target.epoch, - current_epoch, - }); - } + ) -> impl Future>> + Send; - // Get the signing method and check doppelganger protection. - let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; - - // Checking for slashing conditions. - let signing_epoch = attestation.data().target.epoch; - let signing_context = self.signing_context(Domain::BeaconAttester, signing_epoch); - let domain_hash = signing_context.domain_hash(&self.spec); - let slashing_status = if signing_method - .requires_local_slashing_protection(self.enable_web3signer_slashing_protection) - { - self.slashing_protection.check_and_insert_attestation( - &validator_pubkey, - attestation.data(), - domain_hash, - ) - } else { - Ok(Safe::Valid) - }; - - match slashing_status { - // We can safely sign this attestation. - Ok(Safe::Valid) => { - let signature = signing_method - .get_signature::>( - SignableMessage::AttestationData(attestation.data()), - signing_context, - &self.spec, - &self.task_executor, - ) - .await?; - attestation - .add_signature(&signature, validator_committee_position) - .map_err(Error::UnableToSignAttestation)?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(()) - } - Ok(Safe::SameData) => { - warn!("Skipping signing of previously signed attestation"); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, - &[validator_metrics::SAME_DATA], - ); - Err(Error::SameData) - } - Err(NotSafe::UnregisteredValidator(pk)) => { - warn!( - msg = "Carefully consider running with --init-slashing-protection (see --help)", - public_key = format!("{:?}", pk), - "Not signing attestation for unregistered validator" - ); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, - 
&[validator_metrics::UNREGISTERED], - ); - Err(Error::Slashable(NotSafe::UnregisteredValidator(pk))) - } - Err(e) => { - crit!( - attestation = format!("{:?}", attestation.data()), - error = format!("{:?}", e), - "Not signing slashable attestation" - ); - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_ATTESTATIONS_TOTAL, - &[validator_metrics::SLASHABLE], - ); - Err(Error::Slashable(e)) - } - } - } - - pub async fn sign_voluntary_exit( - &self, - validator_pubkey: PublicKeyBytes, - voluntary_exit: VoluntaryExit, - ) -> Result { - let signing_epoch = voluntary_exit.epoch; - let signing_context = self.signing_context(Domain::VoluntaryExit, signing_epoch); - let signing_method = self.doppelganger_bypassed_signing_method(validator_pubkey)?; - - let signature = signing_method - .get_signature::>( - SignableMessage::VoluntaryExit(&voluntary_exit), - signing_context, - &self.spec, - &self.task_executor, - ) - .await?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_VOLUNTARY_EXITS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(SignedVoluntaryExit { - message: voluntary_exit, - signature, - }) - } - - pub async fn sign_validator_registration_data( + fn sign_validator_registration_data( &self, validator_registration_data: ValidatorRegistrationData, - ) -> Result { - let domain_hash = self.spec.get_builder_domain(); - let signing_root = validator_registration_data.signing_root(domain_hash); - - let signing_method = - self.doppelganger_bypassed_signing_method(validator_registration_data.pubkey)?; - let signature = signing_method - .get_signature_from_root::>( - SignableMessage::ValidatorRegistration(&validator_registration_data), - signing_root, - &self.task_executor, - None, - ) - .await?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_VALIDATOR_REGISTRATIONS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(SignedValidatorRegistrationData { - message: validator_registration_data, - signature, - }) - } + ) 
-> impl Future>> + Send; /// Signs an `AggregateAndProof` for a given validator. /// /// The resulting `SignedAggregateAndProof` is sent on the aggregation channel and cannot be /// modified by actors other than the signing validator. - pub async fn produce_signed_aggregate_and_proof( + fn produce_signed_aggregate_and_proof( &self, validator_pubkey: PublicKeyBytes, aggregator_index: u64, - aggregate: Attestation, + aggregate: Attestation, selection_proof: SelectionProof, - ) -> Result, Error> { - let signing_epoch = aggregate.data().target.epoch; - let signing_context = self.signing_context(Domain::AggregateAndProof, signing_epoch); - - let message = - AggregateAndProof::from_attestation(aggregator_index, aggregate, selection_proof); - - let signing_method = self.doppelganger_checked_signing_method(validator_pubkey)?; - let signature = signing_method - .get_signature::>( - SignableMessage::SignedAggregateAndProof(message.to_ref()), - signing_context, - &self.spec, - &self.task_executor, - ) - .await?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_AGGREGATES_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(SignedAggregateAndProof::from_aggregate_and_proof( - message, signature, - )) - } + ) -> impl Future, Error>> + Send; /// Produces a `SelectionProof` for the `slot`, signed by with corresponding secret key to /// `validator_pubkey`. - pub async fn produce_selection_proof( + fn produce_selection_proof( &self, validator_pubkey: PublicKeyBytes, slot: Slot, - ) -> Result { - let signing_epoch = slot.epoch(E::slots_per_epoch()); - let signing_context = self.signing_context(Domain::SelectionProof, signing_epoch); - - // Bypass the `with_validator_signing_method` function. - // - // This is because we don't care about doppelganger protection when it comes to selection - // proofs. They are not slashable and we need them to subscribe to subnets on the BN. 
- // - // As long as we disallow `SignedAggregateAndProof` then these selection proofs will never - // be published on the network. - let signing_method = self.doppelganger_bypassed_signing_method(validator_pubkey)?; - - let signature = signing_method - .get_signature::>( - SignableMessage::SelectionProof(slot), - signing_context, - &self.spec, - &self.task_executor, - ) - .await - .map_err(Error::UnableToSign)?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_SELECTION_PROOFS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(signature.into()) - } + ) -> impl Future>> + Send; /// Produce a `SyncSelectionProof` for `slot` signed by the secret key of `validator_pubkey`. - pub async fn produce_sync_selection_proof( + fn produce_sync_selection_proof( &self, validator_pubkey: &PublicKeyBytes, slot: Slot, subnet_id: SyncSubnetId, - ) -> Result { - let signing_epoch = slot.epoch(E::slots_per_epoch()); - let signing_context = - self.signing_context(Domain::SyncCommitteeSelectionProof, signing_epoch); + ) -> impl Future>> + Send; - // Bypass `with_validator_signing_method`: sync committee messages are not slashable. 
- let signing_method = self.doppelganger_bypassed_signing_method(*validator_pubkey)?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_SYNC_SELECTION_PROOFS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - let message = SyncAggregatorSelectionData { - slot, - subcommittee_index: subnet_id.into(), - }; - - let signature = signing_method - .get_signature::>( - SignableMessage::SyncSelectionProof(&message), - signing_context, - &self.spec, - &self.task_executor, - ) - .await - .map_err(Error::UnableToSign)?; - - Ok(signature.into()) - } - - pub async fn produce_sync_committee_signature( + fn produce_sync_committee_signature( &self, slot: Slot, beacon_block_root: Hash256, validator_index: u64, validator_pubkey: &PublicKeyBytes, - ) -> Result { - let signing_epoch = slot.epoch(E::slots_per_epoch()); - let signing_context = self.signing_context(Domain::SyncCommittee, signing_epoch); + ) -> impl Future>> + Send; - // Bypass `with_validator_signing_method`: sync committee messages are not slashable. 
- let signing_method = self.doppelganger_bypassed_signing_method(*validator_pubkey)?; - - let signature = signing_method - .get_signature::>( - SignableMessage::SyncCommitteeSignature { - beacon_block_root, - slot, - }, - signing_context, - &self.spec, - &self.task_executor, - ) - .await - .map_err(Error::UnableToSign)?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_SYNC_COMMITTEE_MESSAGES_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(SyncCommitteeMessage { - slot, - beacon_block_root, - validator_index, - signature, - }) - } - - pub async fn produce_signed_contribution_and_proof( + fn produce_signed_contribution_and_proof( &self, aggregator_index: u64, aggregator_pubkey: PublicKeyBytes, - contribution: SyncCommitteeContribution, + contribution: SyncCommitteeContribution, selection_proof: SyncSelectionProof, - ) -> Result, Error> { - let signing_epoch = contribution.slot.epoch(E::slots_per_epoch()); - let signing_context = self.signing_context(Domain::ContributionAndProof, signing_epoch); - - // Bypass `with_validator_signing_method`: sync committee messages are not slashable. 
- let signing_method = self.doppelganger_bypassed_signing_method(aggregator_pubkey)?; - - let message = ContributionAndProof { - aggregator_index, - contribution, - selection_proof: selection_proof.into(), - }; - - let signature = signing_method - .get_signature::>( - SignableMessage::SignedContributionAndProof(&message), - signing_context, - &self.spec, - &self.task_executor, - ) - .await - .map_err(Error::UnableToSign)?; - - validator_metrics::inc_counter_vec( - &validator_metrics::SIGNED_SYNC_COMMITTEE_CONTRIBUTIONS_TOTAL, - &[validator_metrics::SUCCESS], - ); - - Ok(SignedContributionAndProof { message, signature }) - } - - pub fn import_slashing_protection( - &self, - interchange: Interchange, - ) -> Result<(), InterchangeError> { - self.slashing_protection - .import_interchange_info(interchange, self.genesis_validators_root)?; - Ok(()) - } - - /// Export slashing protection data while also disabling the given keys in the database. - /// - /// If any key is unknown to the slashing protection database it will be silently omitted - /// from the result. It is the caller's responsibility to check whether all keys provided - /// had data returned for them. - pub fn export_slashing_protection_for_keys( - &self, - pubkeys: &[PublicKeyBytes], - ) -> Result { - self.slashing_protection.with_transaction(|txn| { - let known_pubkeys = pubkeys - .iter() - .filter_map(|pubkey| { - let validator_id = self - .slashing_protection - .get_validator_id_ignoring_status(txn, pubkey) - .ok()?; - - Some( - self.slashing_protection - .update_validator_status(txn, validator_id, false) - .map(|()| *pubkey), - ) - }) - .collect::, _>>()?; - self.slashing_protection.export_interchange_info_in_txn( - self.genesis_validators_root, - Some(&known_pubkeys), - txn, - ) - }) - } + ) -> impl Future, Error>> + Send; /// Prune the slashing protection database so that it remains performant. /// /// This function will only do actual pruning periodically, so it should usually be /// cheap to call. 
The `first_run` flag can be used to print a more verbose message when pruning /// runs. - pub fn prune_slashing_protection_db(&self, current_epoch: Epoch, first_run: bool) { - // Attempt to prune every SLASHING_PROTECTION_HISTORY_EPOCHs, with a tolerance for - // missing the epoch that aligns exactly. - let mut last_prune = self.slashing_protection_last_prune.lock(); - if current_epoch / SLASHING_PROTECTION_HISTORY_EPOCHS - <= *last_prune / SLASHING_PROTECTION_HISTORY_EPOCHS - { - return; - } + fn prune_slashing_protection_db(&self, current_epoch: Epoch, first_run: bool); - if first_run { - info!( - epoch = %current_epoch, - msg = "pruning may take several minutes the first time it runs", - "Pruning slashing protection DB" - ); - } else { - info!(epoch = %current_epoch, "Pruning slashing protection DB"); - } + /// Returns `ProposalData` for the provided `pubkey` if it exists in `InitializedValidators`. + /// `ProposalData` fields include defaulting logic described in `get_fee_recipient_defaulting`, + /// `get_gas_limit_defaulting`, and `get_builder_proposals_defaulting`. 
+ fn proposal_data(&self, pubkey: &PublicKeyBytes) -> Option; +} - let _timer = - validator_metrics::start_timer(&validator_metrics::SLASHING_PROTECTION_PRUNE_TIMES); +#[derive(Clone, Debug, PartialEq)] +pub enum UnsignedBlock { + Full(BeaconBlock), + Blinded(BlindedBeaconBlock), +} - let new_min_target_epoch = current_epoch.saturating_sub(SLASHING_PROTECTION_HISTORY_EPOCHS); - let new_min_slot = new_min_target_epoch.start_slot(E::slots_per_epoch()); - - let all_pubkeys: Vec<_> = self.voting_pubkeys(DoppelgangerStatus::ignored); - - if let Err(e) = self - .slashing_protection - .prune_all_signed_attestations(all_pubkeys.iter(), new_min_target_epoch) - { - error!( - error = ?e, - "Error during pruning of signed attestations" - ); - return; - } - - if let Err(e) = self - .slashing_protection - .prune_all_signed_blocks(all_pubkeys.iter(), new_min_slot) - { - error!( - error = ?e, - "Error during pruning of signed blocks" - ); - return; - } - - *last_prune = current_epoch; - - info!("Completed pruning of slashing protection DB"); +impl From> for UnsignedBlock { + fn from(block: BeaconBlock) -> Self { + UnsignedBlock::Full(block) + } +} + +impl From> for UnsignedBlock { + fn from(block: BlindedBeaconBlock) -> Self { + UnsignedBlock::Blinded(block) + } +} + +#[derive(Clone, Debug, PartialEq)] +pub enum SignedBlock { + Full(SignedBeaconBlock), + Blinded(SignedBlindedBeaconBlock), +} + +impl From> for SignedBlock { + fn from(block: SignedBeaconBlock) -> Self { + SignedBlock::Full(block) + } +} + +impl From> for SignedBlock { + fn from(block: SignedBlindedBeaconBlock) -> Self { + SignedBlock::Blinded(block) + } +} + +/// A wrapper around `PublicKeyBytes` which encodes information about the status of a validator +/// pubkey with regards to doppelganger protection. +#[derive(Debug, PartialEq)] +pub enum DoppelgangerStatus { + /// Doppelganger protection has approved this for signing. 
+ /// + /// This is because the service has waited some period of time to + /// detect other instances of this key on the network. + SigningEnabled(PublicKeyBytes), + /// Doppelganger protection is still waiting to detect other instances. + /// + /// Do not use this pubkey for signing slashable messages!! + /// + /// However, it can safely be used for other non-slashable operations (e.g., collecting duties + /// or subscribing to subnets). + SigningDisabled(PublicKeyBytes), + /// This pubkey is unknown to the doppelganger service. + /// + /// This represents a serious internal error in the program. This validator will be permanently + /// disabled! + UnknownToDoppelganger(PublicKeyBytes), +} + +impl DoppelgangerStatus { + /// Only return a pubkey if it is explicitly safe for doppelganger protection. + /// + /// If `Some(pubkey)` is returned, doppelganger has declared it safe for signing. + /// + /// ## Note + /// + /// "Safe" is only best-effort by doppelganger. There is no guarantee that a doppelganger + /// doesn't exist. + pub fn only_safe(self) -> Option { + match self { + DoppelgangerStatus::SigningEnabled(pubkey) => Some(pubkey), + DoppelgangerStatus::SigningDisabled(_) => None, + DoppelgangerStatus::UnknownToDoppelganger(_) => None, + } + } + + /// Returns a key regardless of whether or not doppelganger has approved it. Such a key might be + /// used for signing non-slashable messages, duties collection or other activities. + /// + /// If the validator is unknown to doppelganger then `None` will be returned. + pub fn ignored(self) -> Option { + match self { + DoppelgangerStatus::SigningEnabled(pubkey) => Some(pubkey), + DoppelgangerStatus::SigningDisabled(pubkey) => Some(pubkey), + DoppelgangerStatus::UnknownToDoppelganger(_) => None, + } + } + + /// Only return a pubkey if it will not be used for signing due to doppelganger detection. 
+ pub fn only_unsafe(self) -> Option { + match self { + DoppelgangerStatus::SigningEnabled(_) => None, + DoppelgangerStatus::SigningDisabled(pubkey) => Some(pubkey), + DoppelgangerStatus::UnknownToDoppelganger(pubkey) => Some(pubkey), + } } } From 058dae064184b63071828024a894b2590f7ea31a Mon Sep 17 00:00:00 2001 From: Yeongjong Pyo Date: Wed, 7 May 2025 14:00:56 +0900 Subject: [PATCH 18/22] Add requires --http when using vc subcommands --http-port (#7405) Prevent running `lighthouse vc --http-port ` without `--http`. Issue: https://github.com/sigp/lighthouse/issues/7402 Added requires `--http` when using `lighthouse vc --http-port `. Implemented a test code for this issue. --- lighthouse/tests/validator_client.rs | 8 ++++++++ validator_client/src/cli.rs | 1 + 2 files changed, 9 insertions(+) diff --git a/lighthouse/tests/validator_client.rs b/lighthouse/tests/validator_client.rs index 6e579f63c1..f99fc3c460 100644 --- a/lighthouse/tests/validator_client.rs +++ b/lighthouse/tests/validator_client.rs @@ -317,6 +317,14 @@ fn missing_unencrypted_http_transport_flag() { .with_config(|config| assert_eq!(config.http_api.listen_addr, addr)); } #[test] +#[should_panic] +fn missing_http_http_port_flag() { + CommandLineTest::new() + .flag("http-port", Some("9090")) + .run() + .with_config(|config| assert_eq!(config.http_api.listen_port, 9090)); +} +#[test] fn http_port_flag() { CommandLineTest::new() .flag("http", None) diff --git a/validator_client/src/cli.rs b/validator_client/src/cli.rs index 950d08a028..cdbf9f8472 100644 --- a/validator_client/src/cli.rs +++ b/validator_client/src/cli.rs @@ -219,6 +219,7 @@ pub struct ValidatorClient { #[clap( long, + requires = "http", value_name = "PORT", default_value_t = 5062, help = "Set the listen TCP port for the RESTful HTTP API server.", From 0f13029c7d516ef931d8043436b811c41de3cba4 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 8 May 2025 09:24:48 +1000 Subject: [PATCH 19/22] Don't publish data columns reconstructed from 
RPC columns to the gossip network (#7409) Don't publish data columns reconstructed from RPC columns to the gossip network, as this may result in peer downscoring if we're sending columns from past slots. --- .../src/network_beacon_processor/gossip_methods.rs | 3 ++- beacon_node/network/src/network_beacon_processor/mod.rs | 8 +++++++- .../network/src/network_beacon_processor/sync_methods.rs | 8 ++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index d61ea58377..cf0e98cda8 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -1160,7 +1160,8 @@ impl NetworkBeaconProcessor { "Processed data column, waiting for other components" ); - self.attempt_data_column_reconstruction(block_root).await; + self.attempt_data_column_reconstruction(block_root, true) + .await; } }, Err(BlockError::DuplicateFullyImported(_)) => { diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index cfd5c24f99..ba681eed14 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -918,9 +918,13 @@ impl NetworkBeaconProcessor { /// /// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed, /// otherwise returns `None`. + /// + /// The `publish_columns` parameter controls whether reconstructed columns should be published + /// to the gossip network. 
async fn attempt_data_column_reconstruction( self: &Arc, block_root: Hash256, + publish_columns: bool, ) -> Option { // Only supernodes attempt reconstruction if !self.network_globals.is_supernode() { @@ -930,7 +934,9 @@ impl NetworkBeaconProcessor { let result = self.chain.reconstruct_data_columns(block_root).await; match result { Ok(Some((availability_processing_status, data_columns_to_publish))) => { - self.publish_data_columns_gradually(data_columns_to_publish, block_root); + if publish_columns { + self.publish_data_columns_gradually(data_columns_to_publish, block_root); + } match &availability_processing_status { AvailabilityProcessingStatus::Imported(hash) => { debug!( diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index 48ae26c826..31b17a41a4 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -383,8 +383,12 @@ impl NetworkBeaconProcessor { ); // Attempt reconstruction here before notifying sync, to avoid sending out more requests // that we may no longer need. - if let Some(availability) = - self.attempt_data_column_reconstruction(block_root).await + // We don't publish columns reconstructed from rpc columns to the gossip network, + // as these are likely historic columns. + let publish_columns = false; + if let Some(availability) = self + .attempt_data_column_reconstruction(block_root, publish_columns) + .await { result = Ok(availability) } From 8dc3d23af083abee0f7d0d12892ef7134cf8ac9f Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 8 May 2025 12:08:32 +1000 Subject: [PATCH 20/22] Add a default timeout to all `BeaconNodeHttpClient` requests (#7400) Add a default request timeout to all `BeaconNodeHttpClient` requests to ensure that no HTTP request can hang indefinitely. 
--- common/eth2/src/lib.rs | 41 ++++++++++++++++++++++--------------- validator_client/src/lib.rs | 2 ++ 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index d114d037eb..fc12a4c5f3 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -144,6 +144,7 @@ pub struct Timeouts { pub get_debug_beacon_states: Duration, pub get_deposit_snapshot: Duration, pub get_validator_block: Duration, + pub default: Duration, } impl Timeouts { @@ -161,6 +162,7 @@ impl Timeouts { get_debug_beacon_states: timeout, get_deposit_snapshot: timeout, get_validator_block: timeout, + default: timeout, } } } @@ -235,7 +237,9 @@ impl BeaconNodeHttpClient { url: U, builder: impl FnOnce(RequestBuilder) -> RequestBuilder, ) -> Result { - let response = builder(self.client.get(url)).send().await?; + let response = builder(self.client.get(url).timeout(self.timeouts.default)) + .send() + .await?; ok_or_error(response).await } @@ -398,11 +402,10 @@ impl BeaconNodeHttpClient { body: &T, timeout: Option, ) -> Result { - let mut builder = self.client.post(url); - if let Some(timeout) = timeout { - builder = builder.timeout(timeout); - } - + let builder = self + .client + .post(url) + .timeout(timeout.unwrap_or(self.timeouts.default)); let response = builder.json(body).send().await?; ok_or_error(response).await } @@ -415,10 +418,10 @@ impl BeaconNodeHttpClient { timeout: Option, fork: ForkName, ) -> Result { - let mut builder = self.client.post(url); - if let Some(timeout) = timeout { - builder = builder.timeout(timeout); - } + let builder = self + .client + .post(url) + .timeout(timeout.unwrap_or(self.timeouts.default)); let response = builder .header(CONSENSUS_VERSION_HEADER, fork.to_string()) .json(body) @@ -433,7 +436,7 @@ impl BeaconNodeHttpClient { url: U, body: &T, ) -> Result { - let builder = self.client.post(url); + let builder = self.client.post(url).timeout(self.timeouts.default); let mut headers = HeaderMap::new(); 
headers.insert( @@ -452,10 +455,10 @@ impl BeaconNodeHttpClient { timeout: Option, fork: ForkName, ) -> Result { - let mut builder = self.client.post(url); - if let Some(timeout) = timeout { - builder = builder.timeout(timeout); - } + let builder = self + .client + .post(url) + .timeout(timeout.unwrap_or(self.timeouts.default)); let mut headers = HeaderMap::new(); headers.insert( CONSENSUS_VERSION_HEADER, @@ -1868,7 +1871,13 @@ impl BeaconNodeHttpClient { .push("node") .push("health"); - let status = self.client.get(path).send().await?.status(); + let status = self + .client + .get(path) + .timeout(self.timeouts.default) + .send() + .await? + .status(); if status == StatusCode::OK || status == StatusCode::PARTIAL_CONTENT { Ok(status) } else { diff --git a/validator_client/src/lib.rs b/validator_client/src/lib.rs index 100f896f8e..a7993dc879 100644 --- a/validator_client/src/lib.rs +++ b/validator_client/src/lib.rs @@ -68,6 +68,7 @@ const HTTP_GET_BEACON_BLOCK_SSZ_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT: u32 = 4; const HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT: u32 = 4; const HTTP_GET_VALIDATOR_BLOCK_TIMEOUT_QUOTIENT: u32 = 4; +const HTTP_DEFAULT_TIMEOUT_QUOTIENT: u32 = 4; const DOPPELGANGER_SERVICE_NAME: &str = "doppelganger"; @@ -307,6 +308,7 @@ impl ProductionValidatorClient { get_debug_beacon_states: slot_duration / HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT, get_deposit_snapshot: slot_duration / HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT, get_validator_block: slot_duration / HTTP_GET_VALIDATOR_BLOCK_TIMEOUT_QUOTIENT, + default: slot_duration / HTTP_DEFAULT_TIMEOUT_QUOTIENT, } } else { Timeouts::set_all(slot_duration.saturating_mul(config.long_timeouts_multiplier)) From e90fcbe6577cb9e999e5b2372a55ed2dc2882232 Mon Sep 17 00:00:00 2001 From: chonghe <44791194+chong-he@users.noreply.github.com> Date: Thu, 8 May 2025 14:12:57 +0800 Subject: [PATCH 21/22] Add ARM binary for macOS in release (#7416) * #5410 --- .github/workflows/release.yml | 9 +++++++++ 1 
file changed, 9 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 04e8a534da..de4fd29409 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,7 @@ jobs: arch: [aarch64-unknown-linux-gnu, x86_64-unknown-linux-gnu, x86_64-apple-darwin, + aarch64-apple-darwin, x86_64-windows] include: - arch: aarch64-unknown-linux-gnu @@ -44,6 +45,9 @@ jobs: - arch: x86_64-apple-darwin runner: macos-13 profile: maxperf + - arch: aarch64-apple-darwin + runner: macos-14 + profile: maxperf - arch: x86_64-windows runner: ${{ github.repository == 'sigp/lighthouse' && fromJson('["self-hosted", "windows", "release"]') || 'windows-2019' }} profile: maxperf @@ -94,6 +98,10 @@ jobs: if: matrix.arch == 'x86_64-apple-darwin' run: cargo install --path lighthouse --force --locked --features portable,gnosis --profile ${{ matrix.profile }} + - name: Build Lighthouse for aarch64-apple-darwin + if: matrix.arch == 'aarch64-apple-darwin' + run: cargo install --path lighthouse --force --locked --features portable,gnosis --profile ${{ matrix.profile }} + - name: Build Lighthouse for Windows if: matrix.arch == 'x86_64-windows' run: cargo install --path lighthouse --force --locked --features portable,gnosis --profile ${{ matrix.profile }} @@ -237,6 +245,7 @@ jobs: | System | Architecture | Binary | PGP Signature | |:---:|:---:|:---:|:---| | Apple logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-apple-darwin.tar.gz.asc) | + | Apple logo | aarch64 | [lighthouse-${{ env.VERSION }}-aarch64-apple-darwin.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION 
}}-aarch64-apple-darwin.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-apple-darwin.tar.gz.asc) | | Linux logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-unknown-linux-gnu.tar.gz.asc) | | Raspberrypi logo | aarch64 | [lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-aarch64-unknown-linux-gnu.tar.gz.asc) | | Windows logo | x86_64 | [lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz) | [PGP Signature](https://github.com/${{ env.REPO_NAME }}/releases/download/${{ env.VERSION }}/lighthouse-${{ env.VERSION }}-x86_64-windows.tar.gz.asc) | From 4b9c16fc717546189a4a136d43666db5bec4d38a Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 8 May 2025 18:43:44 +1000 Subject: [PATCH 22/22] Add Electra forks to basic sim tests (#7199) This PR adds transitions to Electra ~~and Fulu~~ fork epochs in the simulator tests. 
~~It also covers blob inclusion verification and data column syncing on a full node in Fulu.~~ UPDATE: Remove fulu fork from sim tests due to https://github.com/sigp/lighthouse/pull/7199#issuecomment-2852281176 --- .github/workflows/test-suite.yml | 2 +- .../overflow_lru_cache.rs | 2 +- .../src/data_column_verification.rs | 21 ++++++------- testing/node_test_rig/src/lib.rs | 3 +- testing/simulator/src/basic_sim.rs | 21 +++++++------ testing/simulator/src/checks.rs | 31 +++++++++++++------ 6 files changed, 46 insertions(+), 34 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 817fd9524d..64a93ab5ae 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -295,7 +295,7 @@ jobs: with: channel: stable cache-target: release - - name: Run a basic beacon chain sim that starts from Bellatrix + - name: Run a basic beacon chain sim that starts from Deneb run: cargo run --release --bin simulator basic-sim fallback-simulator-ubuntu: name: fallback-simulator-ubuntu diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index f5fd24483a..5b5a6fcc0d 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -331,7 +331,7 @@ impl PendingComponents { format!( "block {} blobs {}/{}", block_count, - self.verified_blobs.len(), + self.verified_blobs.iter().flatten().count(), num_expected_blobs ) } diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index 7d22bcf341..20b5c9aa02 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -10,7 +10,6 @@ use derivative::Derivative; use fork_choice::ProtoBlock; use kzg::{Error as 
KzgError, Kzg}; use proto_array::Block; -use slasher::test_utils::E; use slot_clock::SlotClock; use ssz_derive::{Decode, Encode}; use std::iter; @@ -589,19 +588,19 @@ fn verify_proposer_and_signature( chain: &BeaconChain, ) -> Result<(), GossipDataColumnError> { let column_slot = data_column.slot(); - let column_epoch = column_slot.epoch(E::slots_per_epoch()); + let slots_per_epoch = T::EthSpec::slots_per_epoch(); + let column_epoch = column_slot.epoch(slots_per_epoch); let column_index = data_column.index; let block_root = data_column.block_root(); let block_parent_root = data_column.block_parent_root(); - let proposer_shuffling_root = - if parent_block.slot.epoch(T::EthSpec::slots_per_epoch()) == column_epoch { - parent_block - .next_epoch_shuffling_id - .shuffling_decision_block - } else { - parent_block.root - }; + let proposer_shuffling_root = if parent_block.slot.epoch(slots_per_epoch) == column_epoch { + parent_block + .next_epoch_shuffling_id + .shuffling_decision_block + } else { + parent_block.root + }; // We lock the cache briefly to get or insert a OnceCell, then drop the lock // before doing proposer shuffling calculation via `OnceCell::get_or_try_init`. 
This avoids @@ -649,7 +648,7 @@ fn verify_proposer_and_signature( let proposer_index = *epoch_proposers .proposers - .get(column_slot.as_usize() % T::EthSpec::slots_per_epoch() as usize) + .get(column_slot.as_usize() % slots_per_epoch as usize) .ok_or_else(|| BeaconChainError::NoProposerForSlot(column_slot))?; let fork = epoch_proposers.fork; diff --git a/testing/node_test_rig/src/lib.rs b/testing/node_test_rig/src/lib.rs index 6e632ccf54..4021a6d2c5 100644 --- a/testing/node_test_rig/src/lib.rs +++ b/testing/node_test_rig/src/lib.rs @@ -7,7 +7,6 @@ use environment::RuntimeContext; use eth2::{reqwest::ClientBuilder, BeaconNodeHttpClient, Timeouts}; use sensitive_url::SensitiveUrl; use std::path::PathBuf; -use std::sync::Arc; use std::time::Duration; use std::time::{SystemTime, UNIX_EPOCH}; use tempfile::{Builder as TempBuilder, TempDir}; @@ -249,7 +248,7 @@ impl LocalExecutionNode { if let Err(e) = std::fs::write(jwt_file_path, config.jwt_key.hex_string()) { panic!("Failed to write jwt file {}", e); } - let spec = Arc::new(E::default_spec()); + let spec = context.eth2_config.spec.clone(); Self { server: MockServer::new_with_config( &context.executor.handle().unwrap(), diff --git a/testing/simulator/src/basic_sim.rs b/testing/simulator/src/basic_sim.rs index 6afc7771d4..4e9ad86dc6 100644 --- a/testing/simulator/src/basic_sim.rs +++ b/testing/simulator/src/basic_sim.rs @@ -18,6 +18,7 @@ use environment::tracing_common; use tracing_subscriber::prelude::*; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; +use logging::build_workspace_filter; use tokio::time::sleep; use types::{Epoch, EthSpec, MinimalEthSpec}; @@ -25,10 +26,9 @@ const END_EPOCH: u64 = 16; const GENESIS_DELAY: u64 = 32; const ALTAIR_FORK_EPOCH: u64 = 0; const BELLATRIX_FORK_EPOCH: u64 = 0; -const CAPELLA_FORK_EPOCH: u64 = 1; -const DENEB_FORK_EPOCH: u64 = 2; -// const ELECTRA_FORK_EPOCH: u64 = 3; -// const FULU_FORK_EPOCH: u64 = 4; +const CAPELLA_FORK_EPOCH: u64 = 0; +const 
DENEB_FORK_EPOCH: u64 = 0; +const ELECTRA_FORK_EPOCH: u64 = 2; const SUGGESTED_FEE_RECIPIENT: [u8; 20] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]; @@ -116,7 +116,11 @@ pub fn run_basic_sim(matches: &ArgMatches) -> Result<(), String> { ); if let Err(e) = tracing_subscriber::registry() - .with(stdout_logging_layer.with_filter(logger_config.debug_level)) + .with( + stdout_logging_layer + .with_filter(logger_config.debug_level) + .with_filter(build_workspace_filter()?), + ) .try_init() { eprintln!("Failed to initialize dependency logging: {e}"); @@ -130,8 +134,8 @@ pub fn run_basic_sim(matches: &ArgMatches) -> Result<(), String> { let genesis_delay = GENESIS_DELAY; // Convenience variables. Update these values when adding a newer fork. - let latest_fork_version = spec.deneb_fork_version; - let latest_fork_start_epoch = DENEB_FORK_EPOCH; + let latest_fork_version = spec.electra_fork_version; + let latest_fork_start_epoch = ELECTRA_FORK_EPOCH; spec.seconds_per_slot /= speed_up_factor; spec.seconds_per_slot = max(1, spec.seconds_per_slot); @@ -142,8 +146,7 @@ pub fn run_basic_sim(matches: &ArgMatches) -> Result<(), String> { spec.bellatrix_fork_epoch = Some(Epoch::new(BELLATRIX_FORK_EPOCH)); spec.capella_fork_epoch = Some(Epoch::new(CAPELLA_FORK_EPOCH)); spec.deneb_fork_epoch = Some(Epoch::new(DENEB_FORK_EPOCH)); - //spec.electra_fork_epoch = Some(Epoch::new(ELECTRA_FORK_EPOCH)); - //spec.fulu_fork_epoch = Some(Epoch::new(FULU_FORK_EPOCH)); + spec.electra_fork_epoch = Some(Epoch::new(ELECTRA_FORK_EPOCH)); let spec = Arc::new(spec); env.eth2_config.spec = spec.clone(); diff --git a/testing/simulator/src/checks.rs b/testing/simulator/src/checks.rs index 35c2508b53..cd0e2e726e 100644 --- a/testing/simulator/src/checks.rs +++ b/testing/simulator/src/checks.rs @@ -128,17 +128,23 @@ pub async fn verify_full_block_production_up_to( slot_delay(slot, slot_duration).await; let beacon_nodes = network.beacon_nodes.read(); let beacon_chain = 
beacon_nodes[0].client.beacon_chain().unwrap(); - let num_blocks = beacon_chain + let block_slots = beacon_chain .chain_dump() .unwrap() .iter() .take_while(|s| s.beacon_block.slot() <= slot) - .count(); + .map(|s| s.beacon_block.slot().as_usize()) + .collect::>(); + let num_blocks = block_slots.len(); if num_blocks != slot.as_usize() + 1 { + let missed_slots = (0..slot.as_usize()) + .filter(|slot| !block_slots.contains(slot)) + .collect::>(); return Err(format!( - "There wasn't a block produced at every slot, got: {}, expected: {}", + "There wasn't a block produced at every slot, got: {}, expected: {}, missed: {:?}", num_blocks, - slot.as_usize() + 1 + slot.as_usize() + 1, + missed_slots )); } Ok(()) @@ -185,12 +191,17 @@ pub async fn verify_full_sync_aggregates_up_to( .get_beacon_blocks::(BlockId::Slot(Slot::new(slot))) .await .map(|resp| { - resp.unwrap() - .data - .message() - .body() - .sync_aggregate() - .map(|agg| agg.num_set_bits()) + resp.unwrap_or_else(|| { + panic!( + "Beacon block for slot {} not returned from Beacon API", + slot + ) + }) + .data + .message() + .body() + .sync_aggregate() + .map(|agg| agg.num_set_bits()) }) .map_err(|e| format!("Error while getting beacon block: {:?}", e))? .map_err(|_| format!("Altair block {} should have sync aggregate", slot))?;