mirror of
https://github.com/sigp/lighthouse.git
synced 2026-04-16 12:28:24 +00:00
Add metrics to VC (#1954)
## Issue Addressed

NA

## Proposed Changes

- Adds an HTTP server to the VC which provides Prometheus metrics.
- Moves the health metrics into the `lighthouse_metrics` crate so it can be shared between BN/VC.
- Sprinkles some metrics around the VC.
- Updates the book to indicate that we now have VC metrics.
- Shifts the "waiting for genesis" logic later in the `ProductionValidatorClient::new_from_cli`
  - This is worth attention during the review.

## Additional Info

- ~~`clippy` has some new lints that are failing. I'll deal with that in another PR.~~
This commit is contained in:
@@ -4,6 +4,7 @@ mod cli;
|
||||
mod config;
|
||||
mod duties_service;
|
||||
mod fork_service;
|
||||
mod http_metrics;
|
||||
mod initialized_validators;
|
||||
mod is_synced;
|
||||
mod key_cache;
|
||||
@@ -28,6 +29,7 @@ use futures::channel::mpsc;
|
||||
use http_api::ApiSecret;
|
||||
use initialized_validators::InitializedValidators;
|
||||
use notifier::spawn_notifier;
|
||||
use parking_lot::RwLock;
|
||||
use slashing_protection::{SlashingDatabase, SLASHING_PROTECTION_FILENAME};
|
||||
use slog::{error, info, warn, Logger};
|
||||
use slot_clock::SlotClock;
|
||||
@@ -49,6 +51,7 @@ const WAITING_FOR_GENESIS_POLL_TIME: Duration = Duration::from_secs(12);
|
||||
/// The global timeout for HTTP requests to the beacon node.
|
||||
const HTTP_TIMEOUT: Duration = Duration::from_secs(12);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ProductionValidatorClient<T: EthSpec> {
|
||||
context: RuntimeContext<T>,
|
||||
duties_service: DutiesService<SystemTimeSlotClock, T>,
|
||||
@@ -57,6 +60,7 @@ pub struct ProductionValidatorClient<T: EthSpec> {
|
||||
attestation_service: AttestationService<SystemTimeSlotClock, T>,
|
||||
validator_store: ValidatorStore<SystemTimeSlotClock, T>,
|
||||
http_api_listen_addr: Option<SocketAddr>,
|
||||
http_metrics_ctx: Option<Arc<http_metrics::Context<T>>>,
|
||||
config: Config,
|
||||
}
|
||||
|
||||
@@ -84,6 +88,36 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
||||
"validator_dir" => format!("{:?}", config.validator_dir),
|
||||
);
|
||||
|
||||
// Optionally start the metrics server.
|
||||
let http_metrics_ctx = if config.http_metrics.enabled {
|
||||
let shared = http_metrics::Shared {
|
||||
validator_store: None,
|
||||
genesis_time: None,
|
||||
duties_service: None,
|
||||
};
|
||||
|
||||
let ctx: Arc<http_metrics::Context<T>> = Arc::new(http_metrics::Context {
|
||||
config: config.http_metrics.clone(),
|
||||
shared: RwLock::new(shared),
|
||||
log: log.clone(),
|
||||
});
|
||||
|
||||
let exit = context.executor.exit();
|
||||
|
||||
let (_listen_addr, server) = http_metrics::serve(ctx.clone(), exit)
|
||||
.map_err(|e| format!("Unable to start metrics API server: {:?}", e))?;
|
||||
|
||||
context
|
||||
.clone()
|
||||
.executor
|
||||
.spawn_without_exit(async move { server.await }, "metrics-api");
|
||||
|
||||
Some(ctx)
|
||||
} else {
|
||||
info!(log, "HTTP metrics server is disabled");
|
||||
None
|
||||
};
|
||||
|
||||
let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
|
||||
.map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;
|
||||
|
||||
@@ -186,6 +220,11 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
||||
() = context.executor.exit() => return Err("Shutting down".to_string())
|
||||
};
|
||||
|
||||
// Update the metrics server.
|
||||
if let Some(ctx) = &http_metrics_ctx {
|
||||
ctx.shared.write().genesis_time = Some(genesis_time);
|
||||
}
|
||||
|
||||
let slot_clock = SystemTimeSlotClock::new(
|
||||
context.eth2_config.spec.genesis_slot,
|
||||
Duration::from_secs(genesis_time),
|
||||
@@ -221,6 +260,12 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
||||
.allow_unsynced_beacon_node(config.allow_unsynced_beacon_node)
|
||||
.build()?;
|
||||
|
||||
// Update the metrics server.
|
||||
if let Some(ctx) = &http_metrics_ctx {
|
||||
ctx.shared.write().validator_store = Some(validator_store.clone());
|
||||
ctx.shared.write().duties_service = Some(duties_service.clone());
|
||||
}
|
||||
|
||||
let block_service = BlockServiceBuilder::new()
|
||||
.slot_clock(slot_clock.clone())
|
||||
.validator_store(validator_store.clone())
|
||||
@@ -233,10 +278,16 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
||||
.duties_service(duties_service.clone())
|
||||
.slot_clock(slot_clock)
|
||||
.validator_store(validator_store.clone())
|
||||
.beacon_node(beacon_node)
|
||||
.beacon_node(beacon_node.clone())
|
||||
.runtime_context(context.service_context("attestation".into()))
|
||||
.build()?;
|
||||
|
||||
// Wait until genesis has occurred.
|
||||
//
|
||||
// It seems most sensible to move this into the `start_service` function, but I'm cautious
|
||||
// of making too many changes this close to genesis (<1 week).
|
||||
wait_for_genesis(&beacon_node, genesis_time, &context).await?;
|
||||
|
||||
Ok(Self {
|
||||
context,
|
||||
duties_service,
|
||||
@@ -246,6 +297,7 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
|
||||
validator_store,
|
||||
config,
|
||||
http_api_listen_addr: None,
|
||||
http_metrics_ctx,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -320,7 +372,7 @@ async fn init_from_beacon_node<E: EthSpec>(
|
||||
context: &RuntimeContext<E>,
|
||||
) -> Result<(u64, Hash256), String> {
|
||||
// Wait for the beacon node to come online.
|
||||
wait_for_node(beacon_node, context.log()).await?;
|
||||
wait_for_connectivity(beacon_node, context.log()).await?;
|
||||
|
||||
let yaml_config = beacon_node
|
||||
.get_config_spec()
|
||||
@@ -367,10 +419,18 @@ async fn init_from_beacon_node<E: EthSpec>(
|
||||
delay_for(RETRY_DELAY).await;
|
||||
};
|
||||
|
||||
Ok((genesis.genesis_time, genesis.genesis_validators_root))
|
||||
}
|
||||
|
||||
async fn wait_for_genesis<E: EthSpec>(
|
||||
beacon_node: &BeaconNodeHttpClient,
|
||||
genesis_time: u64,
|
||||
context: &RuntimeContext<E>,
|
||||
) -> Result<(), String> {
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.map_err(|e| format!("Unable to read system time: {:?}", e))?;
|
||||
let genesis_time = Duration::from_secs(genesis.genesis_time);
|
||||
let genesis_time = Duration::from_secs(genesis_time);
|
||||
|
||||
// If the time now is less than (prior to) genesis, then delay until the
|
||||
// genesis instant.
|
||||
@@ -404,12 +464,15 @@ async fn init_from_beacon_node<E: EthSpec>(
|
||||
);
|
||||
}
|
||||
|
||||
Ok((genesis.genesis_time, genesis.genesis_validators_root))
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Request the version from the node, looping back and trying again on failure. Exit once the node
|
||||
/// has been contacted.
|
||||
async fn wait_for_node(beacon_node: &BeaconNodeHttpClient, log: &Logger) -> Result<(), String> {
|
||||
async fn wait_for_connectivity(
|
||||
beacon_node: &BeaconNodeHttpClient,
|
||||
log: &Logger,
|
||||
) -> Result<(), String> {
|
||||
// Try to get the version string from the node, looping until success is returned.
|
||||
loop {
|
||||
let log = log.clone();
|
||||
|
||||
Reference in New Issue
Block a user