From 4588971085840dc56cedc85ba0f12bcaa99be8ed Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 19 Feb 2026 12:57:53 +1100 Subject: [PATCH] Add sync batch state metrics (#8847) Co-Authored-By: Jimmy Chen --- beacon_node/network/src/metrics.rs | 7 +++++ .../network/src/sync/backfill_sync/mod.rs | 20 +++++++++++++- beacon_node/network/src/sync/batch.rs | 26 ++++++++++++++++++- .../src/sync/custody_backfill_sync/mod.rs | 21 +++++++++++++-- beacon_node/network/src/sync/manager.rs | 3 +++ .../network/src/sync/range_sync/chain.rs | 11 +++++++- .../src/sync/range_sync/chain_collection.rs | 21 +++++++++++++++ .../network/src/sync/range_sync/range.rs | 4 +++ 8 files changed, 108 insertions(+), 5 deletions(-) diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index cea06a28c8..0fa95b4758 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -462,6 +462,13 @@ pub static SYNCING_CHAIN_BATCH_AWAITING_PROCESSING: LazyLock> ]), ) }); +pub static SYNCING_CHAIN_BATCHES: LazyLock> = LazyLock::new(|| { + try_create_int_gauge_vec( + "sync_batches", + "Number of batches in sync chains by sync type and state", + &["sync_type", "state"], + ) +}); pub static SYNC_SINGLE_BLOCK_LOOKUPS: LazyLock> = LazyLock::new(|| { try_create_int_gauge( "sync_single_block_lookups", diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 9802ec56a1..f18d31863b 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -8,9 +8,11 @@ //! If a batch fails, the backfill sync cannot progress. In this scenario, we mark the backfill //! sync as failed, log an error and attempt to retry once a new peer joins the node. +use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::batch::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + BatchConfig, BatchId, BatchInfo, BatchMetricsState, BatchOperationOutcome, + BatchProcessingResult, BatchState, }; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::manager::BatchProcessResult; @@ -31,6 +33,7 @@ use std::collections::{ use std::hash::{Hash, Hasher}; use std::marker::PhantomData; use std::sync::Arc; +use strum::IntoEnumIterator; use tracing::{debug, error, info, warn}; use types::{ColumnIndex, Epoch, EthSpec}; @@ -1181,6 +1184,21 @@ impl BackFillSync { .epoch(T::EthSpec::slots_per_epoch()) } + pub fn register_metrics(&self) { + for state in BatchMetricsState::iter() { + let count = self + .batches + .values() + .filter(|b| b.state().metrics_state() == state) + .count(); + metrics::set_gauge_vec( + &metrics::SYNCING_CHAIN_BATCHES, + &["backfill", state.into()], + count as i64, + ); + } + } + /// Updates the global network state indicating the current state of a backfill sync. fn set_state(&self, state: BackFillState) { *self.network_globals.backfill_state.write() = state; diff --git a/beacon_node/network/src/sync/batch.rs b/beacon_node/network/src/sync/batch.rs index 8de386f5be..8f8d39ca4b 100644 --- a/beacon_node/network/src/sync/batch.rs +++ b/beacon_node/network/src/sync/batch.rs @@ -10,10 +10,22 @@ use std::marker::PhantomData; use std::ops::Sub; use std::time::Duration; use std::time::Instant; -use strum::Display; +use strum::{Display, EnumIter, IntoStaticStr}; use types::Slot; use types::{DataColumnSidecarList, Epoch, EthSpec}; +/// Batch states used as metrics labels. +#[derive(Debug, Clone, Copy, PartialEq, Eq, EnumIter, IntoStaticStr)] +#[strum(serialize_all = "snake_case")] +pub enum BatchMetricsState { + AwaitingDownload, + Downloading, + AwaitingProcessing, + Processing, + AwaitingValidation, + Failed, +} + pub type BatchId = Epoch; /// Type of expected batch. @@ -142,6 +154,18 @@ impl BatchState { pub fn poison(&mut self) -> BatchState { std::mem::replace(self, BatchState::Poisoned) } + + /// Returns the metrics state for this batch. + pub fn metrics_state(&self) -> BatchMetricsState { + match self { + BatchState::AwaitingDownload => BatchMetricsState::AwaitingDownload, + BatchState::Downloading(_) => BatchMetricsState::Downloading, + BatchState::AwaitingProcessing(..) => BatchMetricsState::AwaitingProcessing, + BatchState::Processing(_) => BatchMetricsState::Processing, + BatchState::AwaitingValidation(_) => BatchMetricsState::AwaitingValidation, + BatchState::Poisoned | BatchState::Failed => BatchMetricsState::Failed, + } + } } impl BatchInfo { diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs index fa8b70c8b4..893aa849d3 100644 --- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs @@ -12,14 +12,16 @@ use lighthouse_network::{ }; use logging::crit; use std::hash::{DefaultHasher, Hash, Hasher}; +use strum::IntoEnumIterator; use tracing::{debug, error, info, info_span, warn}; use types::{DataColumnSidecarList, Epoch, EthSpec}; +use crate::metrics; use crate::sync::{ backfill_sync::{BACKFILL_EPOCHS_PER_BATCH, ProcessResult, SyncStart}, batch::{ - BatchConfig, BatchId, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, - ByRangeRequestType, + BatchConfig, BatchId, BatchInfo, BatchMetricsState, BatchOperationOutcome, + BatchProcessingResult, BatchState, ByRangeRequestType, }, block_sidecar_coupling::CouplingError, manager::CustodyBatchProcessResult, @@ -1114,6 +1116,21 @@ impl CustodyBackFillSync { *self.network_globals.custody_sync_state.write() = state; } + pub fn register_metrics(&self) { + for state in BatchMetricsState::iter() { + let count = self + .batches + .values() + .filter(|b| b.state().metrics_state() == state) + .count(); + metrics::set_gauge_vec( + &metrics::SYNCING_CHAIN_BATCHES, + &["custody_backfill", state.into()], + count as i64, + ); + } + } + /// A fully synced peer has joined us. /// If we are in a failed state, update a local variable to indicate we are able to restart /// the failed sync on the next attempt. diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 096ed9c328..c2faff5b62 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -784,6 +784,9 @@ impl SyncManager { } _ = register_metrics_interval.tick() => { self.network.register_metrics(); + self.range_sync.register_metrics(); + self.backfill_sync.register_metrics(); + self.custody_backfill_sync.register_metrics(); } _ = epoch_interval.tick() => { self.update_sync_state(); diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index d67d6468a9..61161ae6f4 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -3,7 +3,8 @@ use crate::metrics; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::batch::BatchId; use crate::sync::batch::{ - BatchConfig, BatchInfo, BatchOperationOutcome, BatchProcessingResult, BatchState, + BatchConfig, BatchInfo, BatchMetricsState, BatchOperationOutcome, BatchProcessingResult, + BatchState, }; use crate::sync::block_sidecar_coupling::CouplingError; use crate::sync::network_context::{RangeRequestId, RpcRequestSendError, RpcResponseError}; @@ -234,6 +235,14 @@ impl SyncingChain { .sum() } + /// Returns the number of batches in the given metrics state. + pub fn count_batches_in_state(&self, state: BatchMetricsState) -> usize { + self.batches + .values() + .filter(|b| b.state().metrics_state() == state) + .count() + } + /// Removes a peer from the chain. /// If the peer has active batches, those are considered failed and re-requested. pub fn remove_peer(&mut self, peer_id: &PeerId) -> ProcessingResult { diff --git a/beacon_node/network/src/sync/range_sync/chain_collection.rs b/beacon_node/network/src/sync/range_sync/chain_collection.rs index bd4dd6c181..b430b7c572 100644 --- a/beacon_node/network/src/sync/range_sync/chain_collection.rs +++ b/beacon_node/network/src/sync/range_sync/chain_collection.rs @@ -6,6 +6,7 @@ use super::chain::{ChainId, ProcessingResult, RemoveChain, SyncingChain}; use super::sync_type::RangeSyncType; use crate::metrics; +use crate::sync::batch::BatchMetricsState; use crate::sync::network_context::SyncNetworkContext; use beacon_chain::{BeaconChain, BeaconChainTypes}; use fnv::FnvHashMap; @@ -17,6 +18,7 @@ use smallvec::SmallVec; use std::collections::HashMap; use std::collections::hash_map::Entry; use std::sync::Arc; +use strum::IntoEnumIterator; use tracing::{debug, error}; use types::EthSpec; use types::{Epoch, Hash256, Slot}; @@ -516,6 +518,25 @@ impl ChainCollection { } } + pub fn register_metrics(&self) { + for (sync_type, chains) in [ + ("range_finalized", &self.finalized_chains), + ("range_head", &self.head_chains), + ] { + for state in BatchMetricsState::iter() { + let count: usize = chains + .values() + .map(|chain| chain.count_batches_in_state(state)) + .sum(); + metrics::set_gauge_vec( + &metrics::SYNCING_CHAIN_BATCHES, + &[sync_type, state.into()], + count as i64, + ); + } + } + } + fn update_metrics(&self) { metrics::set_gauge_vec( &metrics::SYNCING_CHAINS_COUNT, diff --git a/beacon_node/network/src/sync/range_sync/range.rs b/beacon_node/network/src/sync/range_sync/range.rs index c9656ad1d0..4c2123451a 100644 --- a/beacon_node/network/src/sync/range_sync/range.rs +++ b/beacon_node/network/src/sync/range_sync/range.rs @@ -371,6 +371,10 @@ where .update(network, &local, &mut self.awaiting_head_peers); } + pub fn register_metrics(&self) { + self.chains.register_metrics(); + } + /// Kickstarts sync. pub fn resume(&mut self, network: &mut SyncNetworkContext) { for (removed_chain, sync_type, remove_reason) in