Monitoring service api (#2251)

## Issue Addressed

N/A

## Proposed Changes

Adds a client side api for collecting system and process metrics and pushing it to a monitoring service.
This commit is contained in:
Pawan Dhananjay
2021-05-26 05:58:41 +00:00
parent 55aada006f
commit fdaeec631b
30 changed files with 1108 additions and 65 deletions

View File

@@ -64,4 +64,5 @@ scrypt = { version = "0.5.0", default-features = false }
lighthouse_metrics = { path = "../common/lighthouse_metrics" }
lazy_static = "1.4.0"
fallback = { path = "../common/fallback" }
monitoring_api = { path = "../common/monitoring_api" }
sensitive_url = { path = "../common/sensitive_url" }

View File

@@ -302,7 +302,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
}
/// The count of candidates, regardless of their state.
pub async fn num_total(&self) -> usize {
pub fn num_total(&self) -> usize {
self.candidates.len()
}
@@ -317,6 +317,17 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
n
}
/// The count of synced and ready fallbacks excluding the primary beacon node candidate.
pub async fn num_synced_fallback(&self) -> usize {
let mut n = 0;
for candidate in self.candidates.iter().skip(1) {
if candidate.status(RequireSynced::Yes).await.is_ok() {
n += 1
}
}
n
}
/// The count of candidates that are online and compatible, but not necessarily synced.
pub async fn num_available(&self) -> usize {
let mut n = 0;

View File

@@ -181,4 +181,19 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
address of this server (e.g., http://localhost:5064).")
.takes_value(true),
)
/*
* Explorer metrics
*/
.arg(
Arg::with_name("monitoring-endpoint")
.long("monitoring-endpoint")
.value_name("ADDRESS")
.help("Enables the monitoring service for sending system metrics to a remote endpoint. \
This can be used to monitor your setup on certain services (e.g. beaconcha.in). \
This flag sets the endpoint where the beacon node metrics will be sent. \
Note: This will send information to a remote sever which may identify and associate your \
validators, IP address and other personal information. Always use a HTTPS connection \
and never provide an untrusted URL.")
.takes_value(true),
)
}

View File

@@ -43,6 +43,8 @@ pub struct Config {
pub http_api: http_api::Config,
/// Configuration for the HTTP REST API.
pub http_metrics: http_metrics::Config,
/// Configuration for sending metrics to a remote explorer endpoint.
pub monitoring_api: Option<monitoring_api::Config>,
}
impl Default for Config {
@@ -70,6 +72,7 @@ impl Default for Config {
graffiti_file: None,
http_api: <_>::default(),
http_metrics: <_>::default(),
monitoring_api: None,
}
}
}
@@ -233,6 +236,16 @@ impl Config {
config.http_metrics.allow_origin = Some(allow_origin.to_string());
}
/*
* Explorer metrics
*/
if let Some(monitoring_endpoint) = cli_args.value_of("monitoring-endpoint") {
config.monitoring_api = Some(monitoring_api::Config {
db_path: None,
freezer_db_path: None,
monitoring_endpoint: monitoring_endpoint.to_string(),
});
}
Ok(config)
}

View File

@@ -108,6 +108,16 @@ lazy_static::lazy_static! {
"The number of beacon node requests for each endpoint",
&["endpoint"]
);
pub static ref ETH2_FALLBACK_CONFIGURED: Result<IntGauge> = try_create_int_gauge(
"sync_eth2_fallback_configured",
"The number of configured eth2 fallbacks",
);
pub static ref ETH2_FALLBACK_CONNECTED: Result<IntGauge> = try_create_int_gauge(
"sync_eth2_fallback_connected",
"Set to 1 if connected to atleast one synced eth2 fallback node, otherwise set to 0",
);
}
pub fn gather_prometheus_metrics<T: EthSpec>(
@@ -126,20 +136,6 @@ pub fn gather_prometheus_metrics<T: EthSpec>(
}
}
if let Some(validator_store) = &shared.validator_store {
let initialized_validators_lock = validator_store.initialized_validators();
let initialized_validators = initialized_validators_lock.read();
set_gauge(
&ENABLED_VALIDATORS_COUNT,
initialized_validators.num_enabled() as i64,
);
set_gauge(
&TOTAL_VALIDATORS_COUNT,
initialized_validators.num_total() as i64,
);
}
if let Some(duties_service) = &shared.duties_service {
if let Some(slot) = duties_service.slot_clock.now() {
let current_epoch = slot.epoch(T::slots_per_epoch());

View File

@@ -14,6 +14,7 @@ use account_utils::{
ZeroizeString,
};
use eth2_keystore::Keystore;
use lighthouse_metrics::set_gauge;
use lockfile::{Lockfile, LockfileError};
use slog::{debug, error, info, warn, Logger};
use std::collections::{HashMap, HashSet};
@@ -609,6 +610,16 @@ impl InitializedValidators {
} else {
debug!(log, "Key cache not modified");
}
// Update the enabled and total validator counts
set_gauge(
&crate::http_metrics::metrics::ENABLED_VALIDATORS_COUNT,
self.num_enabled() as i64,
);
set_gauge(
&crate::http_metrics::metrics::TOTAL_VALIDATORS_COUNT,
self.num_total() as i64,
);
Ok(())
}
}

View File

@@ -17,6 +17,8 @@ pub mod http_api;
pub use cli::cli_app;
pub use config::Config;
use lighthouse_metrics::set_gauge;
use monitoring_api::{MonitoringHttpClient, ProcessType};
use crate::beacon_node_fallback::{
start_fallback_updater_service, BeaconNodeFallback, CandidateBeaconNode, RequireSynced,
@@ -125,6 +127,17 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
None
};
// Start the explorer client which periodically sends validator process
// and system metrics to the configured endpoint.
if let Some(monitoring_config) = &config.monitoring_api {
let monitoring_client =
MonitoringHttpClient::new(monitoring_config, context.log().clone())?;
monitoring_client.auto_update(
context.executor.clone(),
vec![ProcessType::Validator, ProcessType::System],
);
};
let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
.map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;
@@ -225,10 +238,19 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
})
.collect::<Result<Vec<BeaconNodeHttpClient>, String>>()?;
let num_nodes = beacon_nodes.len();
let candidates = beacon_nodes
.into_iter()
.map(CandidateBeaconNode::new)
.collect();
// Set the count for beacon node fallbacks excluding the primary beacon node
set_gauge(
&http_metrics::metrics::ETH2_FALLBACK_CONFIGURED,
num_nodes.saturating_sub(1) as i64,
);
// Initialize the number of connected, synced fallbacks to 0.
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
let mut beacon_nodes: BeaconNodeFallback<_, T> =
BeaconNodeFallback::new(candidates, context.eth2_config.spec.clone(), log.clone());
@@ -409,7 +431,7 @@ async fn init_from_beacon_node<E: EthSpec>(
loop {
beacon_nodes.update_unready_candidates().await;
let num_available = beacon_nodes.num_available().await;
let num_total = beacon_nodes.num_total().await;
let num_total = beacon_nodes.num_total();
if num_available > 0 {
info!(
context.log(),

View File

@@ -1,4 +1,6 @@
use crate::http_metrics;
use crate::{DutiesService, ProductionValidatorClient};
use lighthouse_metrics::set_gauge;
use slog::{error, info, Logger};
use slot_clock::SlotClock;
use tokio::time::{sleep, Duration};
@@ -39,7 +41,7 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
) {
let num_available = duties_service.beacon_nodes.num_available().await;
let num_synced = duties_service.beacon_nodes.num_synced().await;
let num_total = duties_service.beacon_nodes.num_total().await;
let num_total = duties_service.beacon_nodes.num_total();
if num_synced > 0 {
info!(
log,
@@ -57,6 +59,12 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
"synced" => num_synced,
)
}
let num_synced_fallback = duties_service.beacon_nodes.num_synced_fallback().await;
if num_synced_fallback > 0 {
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 1);
} else {
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
}
if let Some(slot) = duties_service.slot_clock.now() {
let epoch = slot.epoch(E::slots_per_epoch());