From d60388134d074068c837f5bf3bf4edf8f8b194df Mon Sep 17 00:00:00 2001 From: Lion - dapplion <35266934+dapplion@users.noreply.github.com> Date: Tue, 11 Feb 2025 03:56:38 -0300 Subject: [PATCH] Add PeerDAS metrics to track subnets without peers (#6928) Currently we set a key metric, `PEERS_PER_COLUMN_SUBNET`, inside the getter `good_peers_on_sampling_subnets`. Another PR, https://github.com/sigp/lighthouse/pull/6922, deletes that function, so we have to move the metric anyway. This PR moves that metric computation to the spawned metrics task, which is refreshed every 5 seconds. I also added a few more useful metrics. The full set of metrics and their intended usage is: - `sync_peers_per_column_subnet`: Tracks the health of all column subnets as seen by your node. - `sync_peers_per_custody_column_subnet`: Tracks the health of the subnets your node needs. We should track this metric closely in our dashboards with a heatmap and a bar plot. - ~~`sync_column_subnets_with_zero_peers`: Equivalent to the Grafana query `count(sync_peers_per_column_subnet == 0) by (instance)`. 
We may prefer to skip it, but I believe it's the most important metric as if `sync_column_subnets_with_zero_peers > 0` your node stalls.~~ - ~~`sync_custody_column_subnets_with_zero_peers`: `count(sync_peers_per_custody_column_subnet == 0) by (instance)`~~ --- .../src/peer_manager/peerdb/peer_info.rs | 5 ++ beacon_node/network/src/metrics.rs | 48 ++++++++++++++++--- .../network/src/sync/range_sync/chain.rs | 7 --- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index 2e8f462565..4cbff59ce2 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -234,6 +234,11 @@ impl PeerInfo { self.custody_subnets.contains(subnet) } + /// Returns an iterator on this peer's custody subnets + pub fn custody_subnets_iter(&self) -> impl Iterator { + self.custody_subnets.iter() + } + /// Returns true if the peer is connected to a long-lived subnet. 
pub fn has_long_lived_subnet(&self) -> bool { // Check the meta_data diff --git a/beacon_node/network/src/metrics.rs b/beacon_node/network/src/metrics.rs index 154a59eade..7c38ae9d75 100644 --- a/beacon_node/network/src/metrics.rs +++ b/beacon_node/network/src/metrics.rs @@ -14,6 +14,7 @@ pub use metrics::*; use std::sync::{Arc, LazyLock}; use strum::AsRefStr; use strum::IntoEnumIterator; +use types::DataColumnSubnetId; use types::EthSpec; pub const SUCCESS: &str = "SUCCESS"; @@ -374,11 +375,18 @@ pub static PEERS_PER_SYNC_TYPE: LazyLock> = LazyLock::new(|| }); pub static PEERS_PER_COLUMN_SUBNET: LazyLock> = LazyLock::new(|| { try_create_int_gauge_vec( - "peers_per_column_subnet", + "sync_peers_per_column_subnet", "Number of connected peers per column subnet", &["subnet_id"], ) }); +pub static PEERS_PER_CUSTODY_COLUMN_SUBNET: LazyLock> = LazyLock::new(|| { + try_create_int_gauge_vec( + "sync_peers_per_custody_column_subnet", + "Number of connected peers per custody column subnet", + &["subnet_id"], + ) +}); pub static SYNCING_CHAINS_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_gauge_vec( "sync_range_chains", @@ -746,16 +754,42 @@ pub fn update_sync_metrics(network_globals: &Arc>) // count per sync status, the number of connected peers let mut peers_per_sync_type = FnvHashMap::default(); - for sync_type in network_globals - .peers - .read() - .connected_peers() - .map(|(_peer_id, info)| info.sync_status().as_str()) - { + let mut peers_per_column_subnet = FnvHashMap::default(); + + for (_, info) in network_globals.peers.read().connected_peers() { + let sync_type = info.sync_status().as_str(); *peers_per_sync_type.entry(sync_type).or_default() += 1; + + for subnet in info.custody_subnets_iter() { + *peers_per_column_subnet.entry(*subnet).or_default() += 1; + } } for (sync_type, peer_count) in peers_per_sync_type { set_gauge_entry(&PEERS_PER_SYNC_TYPE, &[sync_type], peer_count); } + + let all_column_subnets = + 
(0..network_globals.spec.data_column_sidecar_subnet_count).map(DataColumnSubnetId::new); + let custody_column_subnets = network_globals.sampling_subnets.iter(); + + // Iterate all subnet values to set to zero the empty entries in peers_per_column_subnet + for subnet in all_column_subnets { + set_gauge_entry( + &PEERS_PER_COLUMN_SUBNET, + &[&format!("{subnet}")], + peers_per_column_subnet.get(&subnet).copied().unwrap_or(0), + ); + } + + // Registering this metric is a duplicate for supernodes but helpful for fullnodes. This way + // operators can monitor the health of only the subnets of their interest without complex + // Grafana queries. + for subnet in custody_column_subnets { + set_gauge_entry( + &PEERS_PER_CUSTODY_COLUMN_SUBNET, + &[&format!("{subnet}")], + peers_per_column_subnet.get(subnet).copied().unwrap_or(0), + ); + } } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index f02262e4b5..cab08dd278 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -1,7 +1,6 @@ use super::batch::{BatchInfo, BatchProcessingResult, BatchState}; use super::RangeSyncType; use crate::metrics; -use crate::metrics::PEERS_PER_COLUMN_SUBNET; use crate::network_beacon_processor::ChainSegmentProcessId; use crate::sync::network_context::RangeRequestId; use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult}; @@ -10,7 +9,6 @@ use beacon_chain::BeaconChainTypes; use fnv::FnvHashMap; use lighthouse_network::service::api_types::Id; use lighthouse_network::{PeerAction, PeerId}; -use metrics::set_int_gauge; use rand::seq::SliceRandom; use rand::Rng; use slog::{crit, debug, o, warn}; @@ -1106,11 +1104,6 @@ impl SyncingChain { .good_custody_subnet_peer(*subnet_id) .count(); - set_int_gauge( - &PEERS_PER_COLUMN_SUBNET, - &[&subnet_id.to_string()], - peer_count as i64, - ); peer_count > 0 }); peers_on_all_custody_subnets