Add metrics to VC (#1954)

## Issue Addressed NA ## Proposed Changes - Adds an HTTP server to the VC which provides Prometheus metrics. - Moves the health metrics into the `lighthouse_metrics` crate so it can be shared between BN/VC. - Sprinkles some metrics around the VC. - Updates the book to indicate that we now have VC metrics. - Shifts the "waiting for genesis" logic later in the `ProductionValidatorClient::new_from_cli` - This is worth attention during the review. ## Additional Info - ~~`clippy` has some new lints that are failing. I'll deal with that in another PR.~~
2026-06-01 05:37:05 +00:00 · 2020-11-26 01:10:51 +00:00
parent 50558e61f7
commit 26741944b1
18 changed files with 571 additions and 73 deletions
--- a/validator_client/src/lib.rs
+++ b/validator_client/src/lib.rs
@@ -4,6 +4,7 @@ mod cli;
 mod config;
 mod duties_service;
 mod fork_service;
+mod http_metrics;
 mod initialized_validators;
 mod is_synced;
 mod key_cache;
@@ -28,6 +29,7 @@ use futures::channel::mpsc;
 use http_api::ApiSecret;
 use initialized_validators::InitializedValidators;
 use notifier::spawn_notifier;
+use parking_lot::RwLock;
 use slashing_protection::{SlashingDatabase, SLASHING_PROTECTION_FILENAME};
 use slog::{error, info, warn, Logger};
 use slot_clock::SlotClock;
@@ -49,6 +51,7 @@ const WAITING_FOR_GENESIS_POLL_TIME: Duration = Duration::from_secs(12);
 /// The global timeout for HTTP requests to the beacon node.
 const HTTP_TIMEOUT: Duration = Duration::from_secs(12);

+#[derive(Clone)]
 pub struct ProductionValidatorClient<T: EthSpec> {
    context: RuntimeContext<T>,
    duties_service: DutiesService<SystemTimeSlotClock, T>,
@@ -57,6 +60,7 @@ pub struct ProductionValidatorClient<T: EthSpec> {
    attestation_service: AttestationService<SystemTimeSlotClock, T>,
    validator_store: ValidatorStore<SystemTimeSlotClock, T>,
    http_api_listen_addr: Option<SocketAddr>,
+    http_metrics_ctx: Option<Arc<http_metrics::Context<T>>>,
    config: Config,
 }

@@ -84,6 +88,36 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            "validator_dir" => format!("{:?}", config.validator_dir),
        );

+        // Optionally start the metrics server.
+        let http_metrics_ctx = if config.http_metrics.enabled {
+            let shared = http_metrics::Shared {
+                validator_store: None,
+                genesis_time: None,
+                duties_service: None,
+            };
+
+            let ctx: Arc<http_metrics::Context<T>> = Arc::new(http_metrics::Context {
+                config: config.http_metrics.clone(),
+                shared: RwLock::new(shared),
+                log: log.clone(),
+            });
+
+            let exit = context.executor.exit();
+
+            let (_listen_addr, server) = http_metrics::serve(ctx.clone(), exit)
+                .map_err(|e| format!("Unable to start metrics API server: {:?}", e))?;
+
+            context
+                .clone()
+                .executor
+                .spawn_without_exit(async move { server.await }, "metrics-api");
+
+            Some(ctx)
+        } else {
+            info!(log, "HTTP metrics server is disabled");
+            None
+        };
+
        let mut validator_defs = ValidatorDefinitions::open_or_create(&config.validator_dir)
            .map_err(|e| format!("Unable to open or create validator definitions: {:?}", e))?;

@@ -186,6 +220,11 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            () = context.executor.exit() => return Err("Shutting down".to_string())
        };

+        // Update the metrics server.
+        if let Some(ctx) = &http_metrics_ctx {
+            ctx.shared.write().genesis_time = Some(genesis_time);
+        }
+
        let slot_clock = SystemTimeSlotClock::new(
            context.eth2_config.spec.genesis_slot,
            Duration::from_secs(genesis_time),
@@ -221,6 +260,12 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            .allow_unsynced_beacon_node(config.allow_unsynced_beacon_node)
            .build()?;

+        // Update the metrics server.
+        if let Some(ctx) = &http_metrics_ctx {
+            ctx.shared.write().validator_store = Some(validator_store.clone());
+            ctx.shared.write().duties_service = Some(duties_service.clone());
+        }
+
        let block_service = BlockServiceBuilder::new()
            .slot_clock(slot_clock.clone())
            .validator_store(validator_store.clone())
@@ -233,10 +278,16 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            .duties_service(duties_service.clone())
            .slot_clock(slot_clock)
            .validator_store(validator_store.clone())
-            .beacon_node(beacon_node)
+            .beacon_node(beacon_node.clone())
            .runtime_context(context.service_context("attestation".into()))
            .build()?;

+        // Wait until genesis has occurred.
+        //
+        // It seems most sensible to move this into the `start_service` function, but I'm
+        // cautious of making too many changes this close to genesis (<1 week).
+        wait_for_genesis(&beacon_node, genesis_time, &context).await?;
+
        Ok(Self {
            context,
            duties_service,
@@ -246,6 +297,7 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            validator_store,
            config,
            http_api_listen_addr: None,
+            http_metrics_ctx,
        })
    }

@@ -320,7 +372,7 @@ async fn init_from_beacon_node<E: EthSpec>(
    context: &RuntimeContext<E>,
 ) -> Result<(u64, Hash256), String> {
    // Wait for the beacon node to come online.
-    wait_for_node(beacon_node, context.log()).await?;
+    wait_for_connectivity(beacon_node, context.log()).await?;

    let yaml_config = beacon_node
        .get_config_spec()
@@ -367,10 +419,18 @@ async fn init_from_beacon_node<E: EthSpec>(
        delay_for(RETRY_DELAY).await;
    };

+    Ok((genesis.genesis_time, genesis.genesis_validators_root))
+}
+
+async fn wait_for_genesis<E: EthSpec>(
+    beacon_node: &BeaconNodeHttpClient,
+    genesis_time: u64,
+    context: &RuntimeContext<E>,
+) -> Result<(), String> {
    let now = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_err(|e| format!("Unable to read system time: {:?}", e))?;
-    let genesis_time = Duration::from_secs(genesis.genesis_time);
+    let genesis_time = Duration::from_secs(genesis_time);

    // If the time now is less than (prior to) genesis, then delay until the
    // genesis instant.
@@ -404,12 +464,15 @@ async fn init_from_beacon_node<E: EthSpec>(
        );
    }

-    Ok((genesis.genesis_time, genesis.genesis_validators_root))
+    Ok(())
 }

 /// Request the version from the node, looping back and trying again on failure. Exit once the node
 /// has been contacted.
-async fn wait_for_node(beacon_node: &BeaconNodeHttpClient, log: &Logger) -> Result<(), String> {
+async fn wait_for_connectivity(
+    beacon_node: &BeaconNodeHttpClient,
+    log: &Logger,
+) -> Result<(), String> {
    // Try to get the version string from the node, looping until success is returned.
    loop {
        let log = log.clone();