Extended Gossipsub metrics (#1577)

## Issue Addressed

N/A

## Proposed Changes

Adds extended metrics to give a better picture of what is happening at the gossipsub layer of Lighthouse. They provide per-topic mesh statistics, subscription information and peer scores.

## Additional Info
This commit is contained in:
Age Manning
2020-09-01 06:59:14 +00:00
parent 8301a984eb
commit fb9d828e5e
11 changed files with 445 additions and 89 deletions

View File

@@ -1,36 +1,91 @@
use beacon_chain::attestation_verification::Error as AttnError;
pub use lighthouse_metrics::*;
lazy_static! {
    /*
     * Gossip subnets and scoring
     *
     * Every metric in this block is a labelled vector: one time series is exposed per
     * protocol, subnet id or main-topic kind.
     */

    // Peer count, labelled by protocol.
    pub static ref PEERS_PER_PROTOCOL: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_peers_per_protocol",
        "Peers via supported protocol",
        &["protocol"]
    );

    // Per-subnet gauge for the attestation subnet topics the local node is subscribed to.
    pub static ref GOSSIPSUB_SUBSCRIBED_SUBNET_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_subscribed_subnets",
        "Subnets currently subscribed to",
        &["subnet"]
    );

    // Number of peers subscribed to each attestation subnet topic.
    pub static ref GOSSIPSUB_SUBSCRIBED_PEERS_SUBNET_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_peers_per_subnet_topic_count",
        "Peers subscribed per subnet topic",
        &["subnet"]
    );

    // Mesh size of each non-subnet ("main") topic.
    pub static ref MESH_PEERS_PER_MAIN_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_mesh_peers_per_main_topic",
        "Mesh peers per main topic",
        &["topic_hash"]
    );

    // Mesh size of each attestation subnet topic.
    pub static ref MESH_PEERS_PER_SUBNET_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_mesh_peers_per_subnet_topic",
        "Mesh peers per subnet topic",
        &["subnet"]
    );

    // Average gossipsub score of the peers subscribed to each main topic.
    pub static ref AVG_GOSSIPSUB_PEER_SCORE_PER_MAIN_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_avg_peer_score_per_topic",
        "Average peer's score per topic",
        &["topic_hash"]
    );

    // Average gossipsub score of the peers subscribed to each attestation subnet topic.
    pub static ref AVG_GOSSIPSUB_PEER_SCORE_PER_SUBNET_TOPIC: Result<IntGaugeVec> = try_create_int_gauge_vec(
        "gossipsub_avg_peer_score_per_subnet_topic",
        "Average peer's score per subnet topic",
        &["subnet"]
    );

    // Counts every unaggregated attestation handed to gossipsub for publishing, per subnet.
    // The help text previously described *failed* publishes, which is not what this counter
    // records.
    pub static ref ATTESTATIONS_PUBLISHED_PER_SUBNET_PER_SLOT: Result<IntCounterVec> = try_create_int_counter_vec(
        "gossipsub_attestations_published_per_subnet_per_slot",
        "Attestations published per subnet per slot",
        &["subnet"]
    );
}
lazy_static! {
/*
* Gossip Rx
*/
pub static ref GOSSIP_BLOCKS_RX: Result<IntCounter> = try_create_int_counter(
"network_gossip_blocks_rx_total",
"gossipsub_blocks_rx_total",
"Count of gossip blocks received"
);
pub static ref GOSSIP_UNAGGREGATED_ATTESTATIONS_RX: Result<IntCounter> = try_create_int_counter(
"network_gossip_unaggregated_attestations_rx_total",
"gossipsub_unaggregated_attestations_rx_total",
"Count of gossip unaggregated attestations received"
);
pub static ref GOSSIP_AGGREGATED_ATTESTATIONS_RX: Result<IntCounter> = try_create_int_counter(
"network_gossip_aggregated_attestations_rx_total",
"gossipsub_aggregated_attestations_rx_total",
"Count of gossip aggregated attestations received"
);
/*
* Gossip Tx
*/
pub static ref GOSSIP_BLOCKS_TX: Result<IntCounter> = try_create_int_counter(
"network_gossip_blocks_tx_total",
"gossipsub_blocks_tx_total",
"Count of gossip blocks transmitted"
);
pub static ref GOSSIP_UNAGGREGATED_ATTESTATIONS_TX: Result<IntCounter> = try_create_int_counter(
"network_gossip_unaggregated_attestations_tx_total",
"gossipsub_unaggregated_attestations_tx_total",
"Count of gossip unaggregated attestations transmitted"
);
pub static ref GOSSIP_AGGREGATED_ATTESTATIONS_TX: Result<IntCounter> = try_create_int_counter(
"network_gossip_aggregated_attestations_tx_total",
"gossipsub_aggregated_attestations_tx_total",
"Count of gossip aggregated attestations transmitted"
);
@@ -38,11 +93,11 @@ lazy_static! {
* Attestation subnet subscriptions
*/
pub static ref SUBNET_SUBSCRIPTION_REQUESTS: Result<IntCounter> = try_create_int_counter(
"network_subnet_subscriptions_total",
"gossipsub_subnet_subscriptions_total",
"Count of validator subscription requests."
);
pub static ref SUBNET_SUBSCRIPTION_AGGREGATOR_REQUESTS: Result<IntCounter> = try_create_int_counter(
"network_subnet_subscriptions_aggregator_total",
"gossipsub_subnet_subscriptions_aggregator_total",
"Count of validator subscription requests where the subscriber is an aggregator."
);
@@ -194,95 +249,95 @@ lazy_static! {
* Attestation Errors
*/
pub static ref GOSSIP_ATTESTATION_ERROR_FUTURE_EPOCH: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_future_epoch",
"gossipsub_attestation_error_future_epoch",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_PAST_EPOCH: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_past_epoch",
"gossipsub_attestation_error_past_epoch",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_FUTURE_SLOT: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_future_slot",
"gossipsub_attestation_error_future_slot",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_PAST_SLOT: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_past_slot",
"gossipsub_attestation_error_past_slot",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_INVALID_SELECTION_PROOF: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_invalid_selection_proof",
"gossipsub_attestation_error_invalid_selection_proof",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_INVALID_SIGNATURE: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_invalid_signature",
"gossipsub_attestation_error_invalid_signature",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_EMPTY_AGGREGATION_BITFIELD: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_empty_aggregation_bitfield",
"gossipsub_attestation_error_empty_aggregation_bitfield",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_AGGREGATOR_PUBKEY_UNKNOWN: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_aggregator_pubkey_unknown",
"gossipsub_attestation_error_aggregator_pubkey_unknown",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_AGGREGATOR_NOT_IN_COMMITTEE: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_aggregator_not_in_committee",
"gossipsub_attestation_error_aggregator_not_in_committee",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_ATTESTATION_ALREADY_KNOWN: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_attestation_already_known",
"gossipsub_attestation_error_attestation_already_known",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_AGGREGATOR_ALREADY_KNOWN: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_aggregator_already_known",
"gossipsub_attestation_error_aggregator_already_known",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_PRIOR_ATTESTATION_KNOWN: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_prior_attestation_known",
"gossipsub_attestation_error_prior_attestation_known",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_VALIDATOR_INDEX_TOO_HIGH: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_validator_index_too_high",
"gossipsub_attestation_error_validator_index_too_high",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_UNKNOWN_HEAD_BLOCK: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_unknown_head_block",
"gossipsub_attestation_error_unknown_head_block",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_UNKNOWN_TARGET_ROOT: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_unknown_target_root",
"gossipsub_attestation_error_unknown_target_root",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_BAD_TARGET_EPOCH: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_bad_target_epoch",
"gossipsub_attestation_error_bad_target_epoch",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_NO_COMMITTEE_FOR_SLOT_AND_INDEX: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_no_committee_for_slot_and_index",
"gossipsub_attestation_error_no_committee_for_slot_and_index",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_NOT_EXACTLY_ONE_AGGREGATION_BIT_SET: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_not_exactly_one_aggregation_bit_set",
"gossipsub_attestation_error_not_exactly_one_aggregation_bit_set",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_ATTESTS_TO_FUTURE_BLOCK: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_attests_to_future_block",
"gossipsub_attestation_error_attests_to_future_block",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_INVALID_SUBNET_ID: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_invalid_subnet_id",
"gossipsub_attestation_error_invalid_subnet_id",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_INVALID_STATE_PROCESSING: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_invalid_state_processing",
"gossipsub_attestation_error_invalid_state_processing",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_INVALID_TOO_MANY_SKIPPED_SLOTS: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_invalid_too_many_skipped_slots",
"gossipsub_attestation_error_invalid_too_many_skipped_slots",
"Count of a specific error type (see metric name)"
);
pub static ref GOSSIP_ATTESTATION_ERROR_BEACON_CHAIN_ERROR: Result<IntCounter> = try_create_int_counter(
"gossip_attestation_error_beacon_chain_error",
"gossipsub_attestation_error_beacon_chain_error",
"Count of a specific error type (see metric name)"
);
}

View File

@@ -10,13 +10,14 @@ use eth2_libp2p::{
rpc::{GoodbyeReason, RPCResponseErrorCode, RequestId},
Libp2pEvent, PeerAction, PeerRequestId, PubsubMessage, Request, Response,
};
use eth2_libp2p::{BehaviourEvent, MessageId, NetworkGlobals, PeerId};
use eth2_libp2p::{
types::GossipKind, BehaviourEvent, GossipTopic, MessageId, NetworkGlobals, PeerId, TopicHash,
};
use eth2_libp2p::{MessageAcceptance, Service as LibP2PService};
use futures::prelude::*;
use rest_types::ValidatorSubscription;
use slog::{debug, error, info, o, trace, warn};
use std::sync::Arc;
use std::time::Duration;
use std::{collections::HashMap, sync::Arc, time::Duration};
use store::HotColdDB;
use tokio::sync::mpsc;
use tokio::time::Delay;
@@ -24,6 +25,9 @@ use types::EthSpec;
mod tests;
/// The interval (in seconds) at which various network metrics are updated.
const METRIC_UPDATE_INTERVAL: u64 = 1;
/// Types of messages that the network service can receive.
#[derive(Debug)]
pub enum NetworkMessage<T: EthSpec> {
@@ -91,6 +95,8 @@ pub struct NetworkService<T: BeaconChainTypes> {
network_globals: Arc<NetworkGlobals<T::EthSpec>>,
/// A delay that expires when a new fork takes place.
next_fork_update: Option<Delay>,
/// A timer for updating various network metrics.
metrics_update: tokio::time::Interval,
/// The logger for the network service.
log: slog::Logger,
}
@@ -146,6 +152,9 @@ impl<T: BeaconChainTypes> NetworkService<T> {
let attestation_service =
AttestationService::new(beacon_chain.clone(), network_globals.clone(), &network_log);
// create a timer for updating network metrics
let metrics_update = tokio::time::interval(Duration::from_secs(METRIC_UPDATE_INTERVAL));
// create the network service and spawn the task
let network_log = network_log.new(o!("service" => "network"));
let network_service = NetworkService {
@@ -157,6 +166,7 @@ impl<T: BeaconChainTypes> NetworkService<T> {
store,
network_globals: network_globals.clone(),
next_fork_update,
metrics_update,
log: network_log,
};
@@ -175,9 +185,8 @@ fn spawn_service<T: BeaconChainTypes>(
// spawn on the current executor
executor.spawn_without_exit(async move {
// TODO: there is something with this code that prevents cargo fmt from doing anything at
// all. Ok, it is worse, the compiler doesn't show errors over this code beyond ast
// checking
let mut metric_update_counter = 0;
loop {
// build the futures to check simultaneously
tokio::select! {
@@ -206,6 +215,17 @@ fn spawn_service<T: BeaconChainTypes>(
info!(service.log, "Network service shutdown");
return;
}
_ = service.metrics_update.next() => {
// update various network metrics
metric_update_counter +=1;
if metric_update_counter* 1000 % T::EthSpec::default_spec().milliseconds_per_slot == 0 {
// if a slot has occurred, reset the metrics
let _ = metrics::ATTESTATIONS_PUBLISHED_PER_SUBNET_PER_SLOT
.as_ref()
.map(|gauge| gauge.reset());
}
update_gossip_metrics::<T::EthSpec>(&service.libp2p.swarm.gs());
}
// handle a message sent to the network
Some(message) = service.network_recv.recv() => {
match message {
@@ -424,7 +444,11 @@ fn expose_publish_metrics<T: EthSpec>(messages: &[PubsubMessage<T>]) {
for message in messages {
match message {
PubsubMessage::BeaconBlock(_) => metrics::inc_counter(&metrics::GOSSIP_BLOCKS_TX),
PubsubMessage::Attestation(_) => {
PubsubMessage::Attestation(subnet_id) => {
metrics::inc_counter_vec(
&metrics::ATTESTATIONS_PUBLISHED_PER_SUBNET_PER_SLOT,
&[&subnet_id.0.to_string()],
);
metrics::inc_counter(&metrics::GOSSIP_UNAGGREGATED_ATTESTATIONS_TX)
}
PubsubMessage::AggregateAndProofAttestation(_) => {
@@ -448,3 +472,163 @@ fn expose_receive_metrics<T: EthSpec>(message: &PubsubMessage<T>) {
_ => {}
}
}
fn update_gossip_metrics<T: EthSpec>(gossipsub: &eth2_libp2p::Gossipsub) {
// Clear the metrics
let _ = metrics::PEERS_PER_PROTOCOL
.as_ref()
.map(|gauge| gauge.reset());
let _ = metrics::PEERS_PER_PROTOCOL
.as_ref()
.map(|gauge| gauge.reset());
let _ = metrics::MESH_PEERS_PER_MAIN_TOPIC
.as_ref()
.map(|gauge| gauge.reset());
let _ = metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_MAIN_TOPIC
.as_ref()
.map(|gauge| gauge.reset());
let _ = metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_SUBNET_TOPIC
.as_ref()
.map(|gauge| gauge.reset());
// reset the mesh peers, showing all subnets
for subnet_id in 0..T::default_spec().attestation_subnet_count {
let _ = metrics::get_int_gauge(
&metrics::MESH_PEERS_PER_SUBNET_TOPIC,
&[&subnet_id.to_string()],
)
.map(|v| v.set(0));
let _ = metrics::get_int_gauge(
&metrics::GOSSIPSUB_SUBSCRIBED_SUBNET_TOPIC,
&[&subnet_id.to_string()],
)
.map(|v| v.set(0));
let _ = metrics::get_int_gauge(
&metrics::GOSSIPSUB_SUBSCRIBED_PEERS_SUBNET_TOPIC,
&[&subnet_id.to_string()],
)
.map(|v| v.set(0));
}
// Subnet topics subscribed to
for topic_hash in gossipsub.topics() {
if let Ok(topic) = GossipTopic::decode(topic_hash.as_str()) {
if let GossipKind::Attestation(subnet_id) = topic.kind() {
let _ = metrics::get_int_gauge(
&metrics::GOSSIPSUB_SUBSCRIBED_SUBNET_TOPIC,
&[&subnet_id.to_string()],
)
.map(|v| v.set(1));
}
}
}
// Peers per subscribed subnet
let mut peers_per_topic: HashMap<TopicHash, usize> = HashMap::new();
for (peer_id, topics) in gossipsub.all_peers() {
for topic_hash in topics {
*peers_per_topic.entry(topic_hash.clone()).or_default() += 1;
if let Ok(topic) = GossipTopic::decode(topic_hash.as_str()) {
match topic.kind() {
GossipKind::Attestation(subnet_id) => {
if let Some(v) = metrics::get_int_gauge(
&metrics::GOSSIPSUB_SUBSCRIBED_PEERS_SUBNET_TOPIC,
&[&subnet_id.to_string()],
) {
v.inc()
};
// average peer scores
if let Some(score) = gossipsub.peer_score(peer_id) {
if let Some(v) = metrics::get_int_gauge(
&metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_SUBNET_TOPIC,
&[&subnet_id.to_string()],
) {
v.add(score as i64)
};
}
}
kind => {
// main topics
if let Some(score) = gossipsub.peer_score(peer_id) {
if let Some(v) = metrics::get_int_gauge(
&metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_MAIN_TOPIC,
&[&format!("{:?}", kind)],
) {
v.add(score as i64)
};
}
}
}
}
}
}
// adjust to average scores by dividing by number of peers
for (topic_hash, peers) in peers_per_topic.iter() {
if let Ok(topic) = GossipTopic::decode(topic_hash.as_str()) {
match topic.kind() {
GossipKind::Attestation(subnet_id) => {
// average peer scores
if let Some(v) = metrics::get_int_gauge(
&metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_SUBNET_TOPIC,
&[&subnet_id.to_string()],
) {
v.set(v.get() / (*peers as i64))
};
}
kind => {
// main topics
if let Some(v) = metrics::get_int_gauge(
&metrics::AVG_GOSSIPSUB_PEER_SCORE_PER_MAIN_TOPIC,
&[&format!("{:?}", kind)],
) {
v.set(v.get() / (*peers as i64))
};
}
}
}
}
// mesh peers
for topic_hash in gossipsub.topics() {
let peers = gossipsub.mesh_peers(&topic_hash).count();
if let Ok(topic) = GossipTopic::decode(topic_hash.as_str()) {
match topic.kind() {
GossipKind::Attestation(subnet_id) => {
if let Some(v) = metrics::get_int_gauge(
&metrics::MESH_PEERS_PER_SUBNET_TOPIC,
&[&subnet_id.to_string()],
) {
v.set(peers as i64)
};
}
kind => {
// main topics
if let Some(v) = metrics::get_int_gauge(
&metrics::MESH_PEERS_PER_MAIN_TOPIC,
&[&format!("{:?}", kind)],
) {
v.set(peers as i64)
};
}
}
}
}
// protocol peers
let mut peers_per_protocol: HashMap<String, i64> = HashMap::new();
for (_peer, protocol) in gossipsub.peer_protocol() {
*peers_per_protocol.entry(protocol.to_string()).or_default() += 1;
}
for (protocol, peers) in peers_per_protocol.iter() {
if let Some(v) =
metrics::get_int_gauge(&metrics::PEERS_PER_PROTOCOL, &[&protocol.to_string()])
{
v.set(*peers)
};
}
}