Additional networking metrics (#2549)

Adds additional metrics for network monitoring and evaluation.


Co-authored-by: Mark Mackey <mark@sigmaprime.io>
This commit is contained in:
Age Manning
2021-12-22 06:17:14 +00:00
parent 60d917d9e9
commit 81c667b58e
29 changed files with 877 additions and 1158 deletions

View File

@@ -16,6 +16,8 @@ pub struct Config {
/* Peer count related configurations */
/// Whether discovery is enabled.
pub discovery_enabled: bool,
/// Whether metrics are enabled.
pub metrics_enabled: bool,
/// Target number of peers to connect to.
pub target_peer_count: usize,
@@ -34,6 +36,7 @@ impl Default for Config {
fn default() -> Self {
Config {
discovery_enabled: true,
metrics_enabled: false,
target_peer_count: DEFAULT_TARGET_PEERS,
status_interval: DEFAULT_STATUS_INTERVAL,
ping_interval_inbound: DEFAULT_PING_INTERVAL_INBOUND,

View File

@@ -8,13 +8,14 @@ use crate::{Subnet, SubnetDiscovery};
use discv5::Enr;
use hashset_delay::HashSetDelay;
use libp2p::identify::IdentifyInfo;
use peerdb::{BanOperation, BanResult, ScoreUpdateResult};
use peerdb::{client::ClientKind, BanOperation, BanResult, ScoreUpdateResult};
use slog::{debug, error, warn};
use smallvec::SmallVec;
use std::{
sync::Arc,
time::{Duration, Instant},
};
use strum::IntoEnumIterator;
use types::{EthSpec, SyncSubnetId};
pub use libp2p::core::{identity::Keypair, Multiaddr};
@@ -71,6 +72,8 @@ pub struct PeerManager<TSpec: EthSpec> {
heartbeat: tokio::time::Interval,
/// Keeps track of whether the discovery service is enabled or not.
discovery_enabled: bool,
/// Keeps track if the current instance is reporting metrics or not.
metrics_enabled: bool,
/// The logger associated with the `PeerManager`.
log: slog::Logger,
}
@@ -111,6 +114,7 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
) -> error::Result<Self> {
let config::Config {
discovery_enabled,
metrics_enabled,
target_peer_count,
status_interval,
ping_interval_inbound,
@@ -130,6 +134,7 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
sync_committee_subnets: Default::default(),
heartbeat,
discovery_enabled,
metrics_enabled,
log: log.clone(),
})
}
@@ -378,19 +383,21 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
"protocols" => ?info.protocols
);
// update the peer client kind metric
if let Some(v) = metrics::get_int_gauge(
&metrics::PEERS_PER_CLIENT,
&[&peer_info.client().kind.to_string()],
// update the peer client kind metric if the peer is connected
if matches!(
peer_info.connection_status(),
PeerConnectionStatus::Connected { .. }
| PeerConnectionStatus::Disconnecting { .. }
) {
v.inc()
};
if let Some(v) = metrics::get_int_gauge(
&metrics::PEERS_PER_CLIENT,
&[&previous_kind.to_string()],
) {
v.dec()
};
metrics::inc_gauge_vec(
&metrics::PEERS_PER_CLIENT,
&[&peer_info.client().kind.to_string()],
);
metrics::dec_gauge_vec(
&metrics::PEERS_PER_CLIENT,
&[&previous_kind.to_string()],
);
}
}
} else {
error!(self.log, "Received an Identify response from an unknown peer"; "peer_id" => peer_id.to_string());
@@ -606,6 +613,46 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
}
}
// This function updates metrics for all connected peers.
fn update_connected_peer_metrics(&self) {
// Do nothing if we don't have metrics enabled.
if !self.metrics_enabled {
return;
}
let mut connected_peer_count = 0;
let mut inbound_connected_peers = 0;
let mut outbound_connected_peers = 0;
let mut clients_per_peer = HashMap::new();
for (_peer, peer_info) in self.network_globals.peers.read().connected_peers() {
connected_peer_count += 1;
if let PeerConnectionStatus::Connected { n_in, .. } = peer_info.connection_status() {
if *n_in > 0 {
inbound_connected_peers += 1;
} else {
outbound_connected_peers += 1;
}
}
*clients_per_peer
.entry(peer_info.client().kind.to_string())
.or_default() += 1;
}
metrics::set_gauge(&metrics::PEERS_CONNECTED, connected_peer_count);
metrics::set_gauge(&metrics::NETWORK_INBOUND_PEERS, inbound_connected_peers);
metrics::set_gauge(&metrics::NETWORK_OUTBOUND_PEERS, outbound_connected_peers);
for client_kind in ClientKind::iter() {
let value = clients_per_peer.get(&client_kind.to_string()).unwrap_or(&0);
metrics::set_gauge_vec(
&metrics::PEERS_PER_CLIENT,
&[&client_kind.to_string()],
*value as i64,
);
}
}
/* Internal functions */
/// Sets a peer as connected as long as their reputation allows it
@@ -705,22 +752,6 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
// increment prometheus metrics
metrics::inc_counter(&metrics::PEER_CONNECT_EVENT_COUNT);
metrics::set_gauge(&metrics::PEERS_CONNECTED, connected_peers);
metrics::set_gauge(&metrics::PEERS_CONNECTED_INTEROP, connected_peers);
// Increment the PEERS_PER_CLIENT metric
if let Some(kind) = self
.network_globals
.peers
.read()
.peer_info(peer_id)
.map(|peer_info| peer_info.client().kind.clone())
{
if let Some(v) =
metrics::get_int_gauge(&metrics::PEERS_PER_CLIENT, &[&kind.to_string()])
{
v.inc()
};
}
true
}
@@ -802,6 +833,9 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
self.handle_score_action(&peer_id, action, None);
}
// Update peer score metrics;
self.update_peer_score_metrics();
// Maintain minimum count for sync committee peers.
self.maintain_sync_committee_peers();
@@ -840,6 +874,75 @@ impl<TSpec: EthSpec> PeerManager<TSpec> {
self.disconnect_peer(peer_id, GoodbyeReason::TooManyPeers);
}
}
// Update metrics related to peer scoring.
fn update_peer_score_metrics(&self) {
if !self.metrics_enabled {
return;
}
// reset the gauges
let _ = metrics::PEER_SCORE_DISTRIBUTION
.as_ref()
.map(|gauge| gauge.reset());
let _ = metrics::PEER_SCORE_PER_CLIENT
.as_ref()
.map(|gauge| gauge.reset());
let mut avg_score_per_client: HashMap<String, (f64, usize)> = HashMap::with_capacity(5);
{
let peers_db_read_lock = self.network_globals.peers.read();
let connected_peers = peers_db_read_lock.best_peers_by_status(PeerInfo::is_connected);
let total_peers = connected_peers.len();
for (id, (_peer, peer_info)) in connected_peers.into_iter().enumerate() {
// First quartile
if id == 0 {
metrics::set_gauge_vec(
&metrics::PEER_SCORE_DISTRIBUTION,
&["1st"],
peer_info.score().score() as i64,
);
} else if id == (total_peers * 3 / 4).saturating_sub(1) {
metrics::set_gauge_vec(
&metrics::PEER_SCORE_DISTRIBUTION,
&["3/4"],
peer_info.score().score() as i64,
);
} else if id == (total_peers / 2).saturating_sub(1) {
metrics::set_gauge_vec(
&metrics::PEER_SCORE_DISTRIBUTION,
&["1/2"],
peer_info.score().score() as i64,
);
} else if id == (total_peers / 4).saturating_sub(1) {
metrics::set_gauge_vec(
&metrics::PEER_SCORE_DISTRIBUTION,
&["1/4"],
peer_info.score().score() as i64,
);
} else if id == total_peers.saturating_sub(1) {
metrics::set_gauge_vec(
&metrics::PEER_SCORE_DISTRIBUTION,
&["last"],
peer_info.score().score() as i64,
);
}
let mut score_peers: &mut (f64, usize) = avg_score_per_client
.entry(peer_info.client().kind.to_string())
.or_default();
score_peers.0 += peer_info.score().score();
score_peers.1 += 1;
}
} // read lock ended
for (client, (score, peers)) in avg_score_per_client {
metrics::set_float_gauge_vec(
&metrics::PEER_SCORE_PER_CLIENT,
&[&client.to_string()],
score / (peers as f64),
);
}
}
}
enum ConnectingType {

View File

@@ -111,8 +111,11 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
endpoint: &ConnectedPoint,
_failed_addresses: Option<&Vec<Multiaddr>>,
) {
// Log the connection
debug!(self.log, "Connection established"; "peer_id" => %peer_id, "connection" => ?endpoint.to_endpoint());
// Check NAT if metrics are enabled
if self.network_globals.local_enr.read().udp().is_some() {
metrics::check_nat();
}
// Check to make sure the peer is not supposed to be banned
match self.ban_status(peer_id) {
@@ -150,10 +153,8 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
return;
}
// Register the newly connected peer (regardless if we are about to disconnect them).
// NOTE: We don't register peers that we are disconnecting immediately. The network service
// does not need to know about these peers.
// let enr
match endpoint {
ConnectedPoint::Listener { send_back_addr, .. } => {
self.inject_connect_ingoing(peer_id, send_back_addr.clone(), None);
@@ -167,12 +168,9 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
}
}
let connected_peers = self.network_globals.connected_peers() as i64;
// increment prometheus metrics
self.update_connected_peer_metrics();
metrics::inc_counter(&metrics::PEER_CONNECT_EVENT_COUNT);
metrics::set_gauge(&metrics::PEERS_CONNECTED, connected_peers);
metrics::set_gauge(&metrics::PEERS_CONNECTED_INTEROP, connected_peers);
}
fn inject_disconnected(&mut self, peer_id: &PeerId) {
@@ -190,21 +188,6 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
self.events
.push(PeerManagerEvent::PeerDisconnected(*peer_id));
debug!(self.log, "Peer disconnected"; "peer_id" => %peer_id);
// Decrement the PEERS_PER_CLIENT metric
if let Some(kind) = self
.network_globals
.peers
.read()
.peer_info(peer_id)
.map(|info| info.client().kind.clone())
{
if let Some(v) =
metrics::get_int_gauge(&metrics::PEERS_PER_CLIENT, &[&kind.to_string()])
{
v.dec()
};
}
}
// NOTE: It may be the case that a rejected node, due to too many peers is disconnected
@@ -212,12 +195,9 @@ impl<TSpec: EthSpec> NetworkBehaviour for PeerManager<TSpec> {
// reference so that peer manager can track this peer.
self.inject_disconnect(peer_id);
let connected_peers = self.network_globals.connected_peers() as i64;
// Update the prometheus metrics
self.update_connected_peer_metrics();
metrics::inc_counter(&metrics::PEER_DISCONNECT_EVENT_COUNT);
metrics::set_gauge(&metrics::PEERS_CONNECTED, connected_peers);
metrics::set_gauge(&metrics::PEERS_CONNECTED_INTEROP, connected_peers);
}
fn inject_address_change(

View File

@@ -4,7 +4,7 @@
use libp2p::identify::IdentifyInfo;
use serde::Serialize;
use strum::{AsRefStr, AsStaticStr};
use strum::{AsRefStr, AsStaticStr, EnumIter};
/// Various client and protocol information related to a node.
#[derive(Clone, Debug, Serialize)]
@@ -21,7 +21,7 @@ pub struct Client {
pub agent_string: Option<String>,
}
#[derive(Clone, Debug, Serialize, PartialEq, AsRefStr, AsStaticStr)]
#[derive(Clone, Debug, Serialize, PartialEq, AsRefStr, AsStaticStr, EnumIter)]
pub enum ClientKind {
/// A lighthouse node (the best kind).
Lighthouse,

View File

@@ -19,8 +19,6 @@ use PeerConnectionStatus::*;
#[derive(Clone, Debug, Serialize)]
#[serde(bound = "T: EthSpec")]
pub struct PeerInfo<T: EthSpec> {
/// The connection status of the peer
_status: PeerStatus,
/// The peers reputation
score: Score,
/// Client managing this peer
@@ -57,7 +55,6 @@ pub struct PeerInfo<T: EthSpec> {
impl<TSpec: EthSpec> Default for PeerInfo<TSpec> {
fn default() -> PeerInfo<TSpec> {
PeerInfo {
_status: Default::default(),
score: Score::default(),
client: Client::default(),
connection_status: Default::default(),
@@ -387,21 +384,6 @@ impl<T: EthSpec> PeerInfo<T> {
}
}
#[derive(Clone, Debug, Serialize)]
/// The current health status of the peer.
pub enum PeerStatus {
/// The peer is healthy.
Healthy,
/// The peer is clogged. It has not been responding to requests on time.
_Clogged,
}
impl Default for PeerStatus {
fn default() -> Self {
PeerStatus::Healthy
}
}
/// Connection Direction of connection.
#[derive(Debug, Clone, Serialize, AsRefStr)]
#[strum(serialize_all = "snake_case")]