Add metrics to VC (#1954)

## Issue Addressed

NA

## Proposed Changes

- Adds a HTTP server to the VC which provides Prometheus metrics.
- Moves the health metrics into the `lighthouse_metrics` crate so it can be shared between BN/VC.
- Sprinkle some metrics around the VC.
- Update the book to indicate that we now have VC metrics.
- Shifts the "waiting for genesis" logic later in the `ProductionValidatorClient::new_from_cli`
  - This is worth attention during the review.

## Additional Info

- ~~`clippy` has some new lints that are failing. I'll deal with that in another PR.~~
This commit is contained in:
Paul Hauner
2020-11-26 01:10:51 +00:00
parent 50558e61f7
commit 26741944b1
18 changed files with 571 additions and 73 deletions

View File

@@ -4,6 +4,7 @@ mod cli;
mod config;
mod duties_service;
mod fork_service;
mod http_metrics;
mod initialized_validators;
mod is_synced;
mod key_cache;
@@ -28,6 +29,7 @@ use futures::channel::mpsc;
use http_api::ApiSecret;
use initialized_validators::InitializedValidators;
use notifier::spawn_notifier;
use parking_lot::RwLock;
use slashing_protection::{SlashingDatabase, SLASHING_PROTECTION_FILENAME};
use slog::{error, info, warn, Logger};
use slot_clock::SlotClock;
@@ -49,6 +51,7 @@ const WAITING_FOR_GENESIS_POLL_TIME: Duration = Duration::from_secs(12);
/// The global timeout for HTTP requests to the beacon node.
const HTTP_TIMEOUT: Duration = Duration::from_secs(12);
#[derive(Clone)]
pub struct ProductionValidatorClient<T: EthSpec> {
context: RuntimeContext<T>,
duties_service: DutiesService<SystemTimeSlotClock, T>,
@@ -57,6 +60,7 @@ pub struct ProductionValidatorClient<T: EthSpec> {
attestation_service: AttestationService<SystemTimeSlotClock, T>,
validator_store: ValidatorStore<SystemTimeSlotClock, T>,
http_api_listen_addr: Option<SocketAddr>,
http_metrics_ctx: Option<Arc<http_metrics::Context<T>>>,
config: Config,
}
@@ -84,6 +88,36 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
"validator_dir" => format!("{:?}", config.validator_dir),
);
// Optionally start the metrics server.
let http_metrics_ctx = if config.http_metrics.enabled {
let shared = http_metrics::Shared {
validator_store: None,
genesis_time: None,
duties_service: None,
};
let ctx: Arc<http_metrics::Context<T>> = Arc::new(http_metrics::Context {
config: config.http_metrics.clone(),
shared: RwLock::new(shared),
log: log.clone(),
});
let exit = context.executor.exit();
let (_listen_addr, server) = http_metrics::serve(ctx.clone(), exit)
.map_err(|e| format!("Unable to start metrics API server: {:?}", e))?;
context
.clone()
.executor
.spawn_without_exit(async move { server.await }, "metrics-api");
Some(ctx)
} else {
info!(log, "HTTP metrics server is disabled");
None
};
let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
.map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;
@@ -186,6 +220,11 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
() = context.executor.exit() => return Err("Shutting down".to_string())
};
// Update the metrics server.
if let Some(ctx) = &http_metrics_ctx {
ctx.shared.write().genesis_time = Some(genesis_time);
}
let slot_clock = SystemTimeSlotClock::new(
context.eth2_config.spec.genesis_slot,
Duration::from_secs(genesis_time),
@@ -221,6 +260,12 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
.allow_unsynced_beacon_node(config.allow_unsynced_beacon_node)
.build()?;
// Update the metrics server.
if let Some(ctx) = &http_metrics_ctx {
ctx.shared.write().validator_store = Some(validator_store.clone());
ctx.shared.write().duties_service = Some(duties_service.clone());
}
let block_service = BlockServiceBuilder::new()
.slot_clock(slot_clock.clone())
.validator_store(validator_store.clone())
@@ -233,10 +278,16 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
.duties_service(duties_service.clone())
.slot_clock(slot_clock)
.validator_store(validator_store.clone())
.beacon_node(beacon_node)
.beacon_node(beacon_node.clone())
.runtime_context(context.service_context("attestation".into()))
.build()?;
// Wait until genesis has occured.
//
// It seems most sensible to move this into the `start_service` function, but I'm caution
// of making too many changes this close to genesis (<1 week).
wait_for_genesis(&beacon_node, genesis_time, &context).await?;
Ok(Self {
context,
duties_service,
@@ -246,6 +297,7 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
validator_store,
config,
http_api_listen_addr: None,
http_metrics_ctx,
})
}
@@ -320,7 +372,7 @@ async fn init_from_beacon_node<E: EthSpec>(
context: &RuntimeContext<E>,
) -> Result<(u64, Hash256), String> {
// Wait for the beacon node to come online.
wait_for_node(beacon_node, context.log()).await?;
wait_for_connectivity(beacon_node, context.log()).await?;
let yaml_config = beacon_node
.get_config_spec()
@@ -367,10 +419,18 @@ async fn init_from_beacon_node<E: EthSpec>(
delay_for(RETRY_DELAY).await;
};
Ok((genesis.genesis_time, genesis.genesis_validators_root))
}
async fn wait_for_genesis<E: EthSpec>(
beacon_node: &BeaconNodeHttpClient,
genesis_time: u64,
context: &RuntimeContext<E>,
) -> Result<(), String> {
let now = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
let genesis_time = Duration::from_secs(genesis.genesis_time);
let genesis_time = Duration::from_secs(genesis_time);
// If the time now is less than (prior to) genesis, then delay until the
// genesis instant.
@@ -404,12 +464,15 @@ async fn init_from_beacon_node<E: EthSpec>(
);
}
Ok((genesis.genesis_time, genesis.genesis_validators_root))
Ok(())
}
/// Request the version from the node, looping back and trying again on failure. Exit once the node
/// has been contacted.
async fn wait_for_node(beacon_node: &BeaconNodeHttpClient, log: &Logger) -> Result<(), String> {
async fn wait_for_connectivity(
beacon_node: &BeaconNodeHttpClient,
log: &Logger,
) -> Result<(), String> {
// Try to get the version string from the node, looping until success is returned.
loop {
let log = log.clone();