mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-03 00:31:50 +00:00
Add PeerDAS metrics to track subnets without peers (#6928)
Currently we track a key metric, `PEERS_PER_COLUMN_SUBNET`, inside the getter `good_peers_on_sampling_subnets`. Another PR, https://github.com/sigp/lighthouse/pull/6922, deletes that function, so the metric has to move anyway. This PR moves the metric computation to the metrics spawned task, which is refreshed every 5 seconds. I also added a few more useful metrics. The total set and intended usage is:
- `sync_peers_per_column_subnet`: Tracks the health of all column subnets as seen by your node.
- `sync_peers_per_custody_column_subnet`: Tracks the health of the subnets your node needs. We should track this metric closely in our dashboards with a heatmap and a bar plot.
- ~~`sync_column_subnets_with_zero_peers`: Equivalent to the Grafana query `count(sync_peers_per_column_subnet == 0) by (instance)`. We may prefer to skip it, but I believe it is the most important metric, because if `sync_column_subnets_with_zero_peers > 0` your node stalls.~~
- ~~`sync_custody_column_subnets_with_zero_peers`: Equivalent to `count(sync_peers_per_custody_column_subnet == 0) by (instance)`.~~
This commit is contained in:
@@ -234,6 +234,11 @@ impl<E: EthSpec> PeerInfo<E> {
|
|||||||
self.custody_subnets.contains(subnet)
|
self.custody_subnets.contains(subnet)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator over this peer's custody subnets
|
||||||
|
pub fn custody_subnets_iter(&self) -> impl Iterator<Item = &DataColumnSubnetId> {
|
||||||
|
self.custody_subnets.iter()
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns true if the peer is connected to a long-lived subnet.
|
/// Returns true if the peer is connected to a long-lived subnet.
|
||||||
pub fn has_long_lived_subnet(&self) -> bool {
|
pub fn has_long_lived_subnet(&self) -> bool {
|
||||||
// Check the meta_data
|
// Check the meta_data
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ pub use metrics::*;
|
|||||||
use std::sync::{Arc, LazyLock};
|
use std::sync::{Arc, LazyLock};
|
||||||
use strum::AsRefStr;
|
use strum::AsRefStr;
|
||||||
use strum::IntoEnumIterator;
|
use strum::IntoEnumIterator;
|
||||||
|
use types::DataColumnSubnetId;
|
||||||
use types::EthSpec;
|
use types::EthSpec;
|
||||||
|
|
||||||
pub const SUCCESS: &str = "SUCCESS";
|
pub const SUCCESS: &str = "SUCCESS";
|
||||||
@@ -374,11 +375,18 @@ pub static PEERS_PER_SYNC_TYPE: LazyLock<Result<IntGaugeVec>> = LazyLock::new(||
|
|||||||
});
|
});
|
||||||
pub static PEERS_PER_COLUMN_SUBNET: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {
|
pub static PEERS_PER_COLUMN_SUBNET: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {
|
||||||
try_create_int_gauge_vec(
|
try_create_int_gauge_vec(
|
||||||
"peers_per_column_subnet",
|
"sync_peers_per_column_subnet",
|
||||||
"Number of connected peers per column subnet",
|
"Number of connected peers per column subnet",
|
||||||
&["subnet_id"],
|
&["subnet_id"],
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
pub static PEERS_PER_CUSTODY_COLUMN_SUBNET: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {
|
||||||
|
try_create_int_gauge_vec(
|
||||||
|
"sync_peers_per_custody_column_subnet",
|
||||||
|
"Number of connected peers per custody column subnet",
|
||||||
|
&["subnet_id"],
|
||||||
|
)
|
||||||
|
});
|
||||||
pub static SYNCING_CHAINS_COUNT: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {
|
pub static SYNCING_CHAINS_COUNT: LazyLock<Result<IntGaugeVec>> = LazyLock::new(|| {
|
||||||
try_create_int_gauge_vec(
|
try_create_int_gauge_vec(
|
||||||
"sync_range_chains",
|
"sync_range_chains",
|
||||||
@@ -746,16 +754,42 @@ pub fn update_sync_metrics<E: EthSpec>(network_globals: &Arc<NetworkGlobals<E>>)
|
|||||||
|
|
||||||
// count per sync status, the number of connected peers
|
// count per sync status, the number of connected peers
|
||||||
let mut peers_per_sync_type = FnvHashMap::default();
|
let mut peers_per_sync_type = FnvHashMap::default();
|
||||||
for sync_type in network_globals
|
let mut peers_per_column_subnet = FnvHashMap::default();
|
||||||
.peers
|
|
||||||
.read()
|
for (_, info) in network_globals.peers.read().connected_peers() {
|
||||||
.connected_peers()
|
let sync_type = info.sync_status().as_str();
|
||||||
.map(|(_peer_id, info)| info.sync_status().as_str())
|
|
||||||
{
|
|
||||||
*peers_per_sync_type.entry(sync_type).or_default() += 1;
|
*peers_per_sync_type.entry(sync_type).or_default() += 1;
|
||||||
|
|
||||||
|
for subnet in info.custody_subnets_iter() {
|
||||||
|
*peers_per_column_subnet.entry(*subnet).or_default() += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (sync_type, peer_count) in peers_per_sync_type {
|
for (sync_type, peer_count) in peers_per_sync_type {
|
||||||
set_gauge_entry(&PEERS_PER_SYNC_TYPE, &[sync_type], peer_count);
|
set_gauge_entry(&PEERS_PER_SYNC_TYPE, &[sync_type], peer_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let all_column_subnets =
|
||||||
|
(0..network_globals.spec.data_column_sidecar_subnet_count).map(DataColumnSubnetId::new);
|
||||||
|
let custody_column_subnets = network_globals.sampling_subnets.iter();
|
||||||
|
|
||||||
|
// Iterate all subnet values to set to zero the empty entries in peers_per_column_subnet
|
||||||
|
for subnet in all_column_subnets {
|
||||||
|
set_gauge_entry(
|
||||||
|
&PEERS_PER_COLUMN_SUBNET,
|
||||||
|
&[&format!("{subnet}")],
|
||||||
|
peers_per_column_subnet.get(&subnet).copied().unwrap_or(0),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Registering this metric is a duplicate for supernodes but helpful for fullnodes. This way
|
||||||
|
// operators can monitor the health of only the subnets of their interest without complex
|
||||||
|
// Grafana queries.
|
||||||
|
for subnet in custody_column_subnets {
|
||||||
|
set_gauge_entry(
|
||||||
|
&PEERS_PER_CUSTODY_COLUMN_SUBNET,
|
||||||
|
&[&format!("{subnet}")],
|
||||||
|
peers_per_column_subnet.get(subnet).copied().unwrap_or(0),
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
|
use super::batch::{BatchInfo, BatchProcessingResult, BatchState};
|
||||||
use super::RangeSyncType;
|
use super::RangeSyncType;
|
||||||
use crate::metrics;
|
use crate::metrics;
|
||||||
use crate::metrics::PEERS_PER_COLUMN_SUBNET;
|
|
||||||
use crate::network_beacon_processor::ChainSegmentProcessId;
|
use crate::network_beacon_processor::ChainSegmentProcessId;
|
||||||
use crate::sync::network_context::RangeRequestId;
|
use crate::sync::network_context::RangeRequestId;
|
||||||
use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult};
|
use crate::sync::{network_context::SyncNetworkContext, BatchOperationOutcome, BatchProcessResult};
|
||||||
@@ -10,7 +9,6 @@ use beacon_chain::BeaconChainTypes;
|
|||||||
use fnv::FnvHashMap;
|
use fnv::FnvHashMap;
|
||||||
use lighthouse_network::service::api_types::Id;
|
use lighthouse_network::service::api_types::Id;
|
||||||
use lighthouse_network::{PeerAction, PeerId};
|
use lighthouse_network::{PeerAction, PeerId};
|
||||||
use metrics::set_int_gauge;
|
|
||||||
use rand::seq::SliceRandom;
|
use rand::seq::SliceRandom;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use slog::{crit, debug, o, warn};
|
use slog::{crit, debug, o, warn};
|
||||||
@@ -1106,11 +1104,6 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
|
|||||||
.good_custody_subnet_peer(*subnet_id)
|
.good_custody_subnet_peer(*subnet_id)
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
set_int_gauge(
|
|
||||||
&PEERS_PER_COLUMN_SUBNET,
|
|
||||||
&[&subnet_id.to_string()],
|
|
||||||
peer_count as i64,
|
|
||||||
);
|
|
||||||
peer_count > 0
|
peer_count > 0
|
||||||
});
|
});
|
||||||
peers_on_all_custody_subnets
|
peers_on_all_custody_subnets
|
||||||
|
|||||||
Reference in New Issue
Block a user