Implement el_offline and use it in the VC (#4295)

## Issue Addressed Closes https://github.com/sigp/lighthouse/issues/4291, part of #3613. ## Proposed Changes - Implement the `el_offline` field on `/eth/v1/node/syncing`. We set `el_offline=true` if: - The EL's internal status is `Offline` or `AuthFailed`, _or_ - The most recent call to `newPayload` resulted in an error (more on this in a moment). - Use the `el_offline` field in the VC to mark nodes with offline ELs as _unsynced_. These nodes will still be used, but only after synced nodes. - Overhaul the usage of `RequireSynced` so that `::No` is used almost everywhere. The `--allow-unsynced` flag was broken and had the opposite effect to intended, so it has been deprecated. - Add tests for the EL being offline on the upcheck call, and being offline due to the newPayload check. ## Why track `newPayload` errors? Tracking the EL's online/offline status is too coarse-grained to be useful in practice, because: - If the EL is timing out to some calls, it's unlikely to timeout on the `upcheck` call, which is _just_ `eth_syncing`. Every failed call is followed by an upcheck [here](693886b941/beacon_node/execution_layer/src/engines.rs (L372-L380)), which would have the effect of masking the failure and keeping the status _online_. - The `newPayload` call is the most likely to time out. It's the call in which ELs tend to do most of their work (often 1-2 seconds), with `forkchoiceUpdated` usually returning much faster (<50ms). - If `newPayload` is failing consistently (e.g. timing out) then this is a good indication that either the node's EL is in trouble, or the network as a whole is. In the first case validator clients _should_ prefer other BNs if they have one available. In the second case, all of their BNs will likely report `el_offline` and they'll just have to proceed with trying to use them. ## Additional Changes - Add utility method `ForkName::latest` which is quite convenient for test writing, but probably other things too. - Delete some stale comments from when we used to support multiple execution nodes.
2026-04-29 18:53:32 +00:00 · 2023-05-17 05:51:56 +00:00
parent aaa118ff0e
commit 3052db29fe
21 changed files with 307 additions and 116 deletions
--- a/validator_client/src/beacon_node_fallback.rs
+++ b/validator_client/src/beacon_node_fallback.rs
@@ -28,7 +28,7 @@ const UPDATE_REQUIRED_LOG_HINT: &str = "this VC or the remote BN may need updati
 /// too early, we risk switching nodes between the time of publishing an attestation and publishing
 /// an aggregate; this may result in a missed aggregation. If we set this time too late, we risk not
 /// having the correct nodes up and running prior to the start of the slot.
-const SLOT_LOOKAHEAD: Duration = Duration::from_secs(1);
+const SLOT_LOOKAHEAD: Duration = Duration::from_secs(2);

 /// Indicates a measurement of latency between the VC and a BN.
 pub struct LatencyMeasurement {
@@ -52,7 +52,7 @@ pub fn start_fallback_updater_service<T: SlotClock + 'static, E: EthSpec>(

    let future = async move {
        loop {
-            beacon_nodes.update_unready_candidates().await;
+            beacon_nodes.update_all_candidates().await;

            let sleep_time = beacon_nodes
                .slot_clock
@@ -385,33 +385,21 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
        n
    }

-    /// Loop through any `self.candidates` that we don't think are online, compatible or synced and
-    /// poll them to see if their status has changed.
+    /// Loop through ALL candidates in `self.candidates` and update their sync status.
    ///
-    /// We do not poll nodes that are synced to avoid sending additional requests when everything is
-    /// going smoothly.
-    pub async fn update_unready_candidates(&self) {
-        let mut futures = Vec::new();
-        for candidate in &self.candidates {
-            // There is a potential race condition between having the read lock and the write
-            // lock. The worst case of this race is running `try_become_ready` twice, which is
-            // acceptable.
-            //
-            // Note: `RequireSynced` is always set to false here. This forces us to recheck the sync
-            // status of nodes that were previously not-synced.
-            if candidate.status(RequireSynced::Yes).await.is_err() {
-                // There exists a race-condition that could result in `refresh_status` being called
-                // when the status does not require refreshing anymore. This is deemed an
-                // acceptable inefficiency.
-                futures.push(candidate.refresh_status(
-                    self.slot_clock.as_ref(),
-                    &self.spec,
-                    &self.log,
-                ));
-            }
-        }
+    /// It is possible for a node to return an unsynced status while continuing to serve
+    /// low quality responses. To route around this it's best to poll all connected beacon nodes.
+    /// A previous implementation of this function polled only the unavailable BNs.
+    pub async fn update_all_candidates(&self) {
+        let futures = self
+            .candidates
+            .iter()
+            .map(|candidate| {
+                candidate.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
+            })
+            .collect::<Vec<_>>();

-        //run all updates concurrently and ignore results
+        // run all updates concurrently and ignore errors
        let _ = future::join_all(futures).await;
    }

--- a/validator_client/src/check_synced.rs
+++ b/validator_client/src/check_synced.rs
@@ -36,7 +36,10 @@ pub async fn check_synced<T: SlotClock>(
        }
    };

-    let is_synced = !resp.data.is_syncing || (resp.data.sync_distance.as_u64() < SYNC_TOLERANCE);
+    // Default EL status to "online" for backwards-compatibility with BNs that don't include it.
+    let el_offline = resp.data.el_offline.unwrap_or(false);
+    let bn_is_synced = !resp.data.is_syncing || (resp.data.sync_distance.as_u64() < SYNC_TOLERANCE);
+    let is_synced = bn_is_synced && !el_offline;

    if let Some(log) = log_opt {
        if !is_synced {
@@ -52,6 +55,7 @@ pub async fn check_synced<T: SlotClock>(
                "sync_distance" => resp.data.sync_distance.as_u64(),
                "head_slot" => resp.data.head_slot.as_u64(),
                "endpoint" => %beacon_node,
+                "el_offline" => el_offline,
            );
        }

--- a/validator_client/src/cli.rs
+++ b/validator_client/src/cli.rs
@@ -109,10 +109,7 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
        .arg(
            Arg::with_name("allow-unsynced")
                .long("allow-unsynced")
-                .help(
-                    "If present, the validator client will still poll for duties if the beacon
-                      node is not synced.",
-                ),
+                .help("DEPRECATED: this flag does nothing"),
        )
        .arg(
            Arg::with_name("use-long-timeouts")
--- a/validator_client/src/config.rs
+++ b/validator_client/src/config.rs
@@ -205,7 +205,13 @@ impl Config {
            );
        }

-        config.allow_unsynced_beacon_node = cli_args.is_present("allow-unsynced");
+        if cli_args.is_present("allow-unsynced") {
+            warn!(
+                log,
+                "The --allow-unsynced flag is deprecated";
+                "msg" => "it no longer has any effect",
+            );
+        }
        config.disable_run_on_all = cli_args.is_present("disable-run-on-all");
        config.disable_auto_discover = cli_args.is_present("disable-auto-discover");
        config.init_slashing_protection = cli_args.is_present("init-slashing-protection");
--- a/validator_client/src/duties_service.rs
+++ b/validator_client/src/duties_service.rs
@@ -147,11 +147,6 @@ pub struct DutiesService<T, E: EthSpec> {
    pub slot_clock: T,
    /// Provides HTTP access to remote beacon nodes.
    pub beacon_nodes: Arc<BeaconNodeFallback<T, E>>,
-    /// Controls whether or not this function will refuse to interact with non-synced beacon nodes.
-    ///
-    /// This functionality is a little redundant since most BNs will likely reject duties when they
-    /// aren't synced, but we keep it around for an emergency.
-    pub require_synced: RequireSynced,
    pub enable_high_validator_count_metrics: bool,
    pub context: RuntimeContext<E>,
    pub spec: ChainSpec,
@@ -421,7 +416,7 @@ async fn poll_validator_indices<T: SlotClock + 'static, E: EthSpec>(
            let download_result = duties_service
                .beacon_nodes
                .first_success(
-                    duties_service.require_synced,
+                    RequireSynced::No,
                    OfflineOnFailure::Yes,
                    |beacon_node| async move {
                        let _timer = metrics::start_timer_vec(
@@ -618,7 +613,7 @@ async fn poll_beacon_attesters<T: SlotClock + 'static, E: EthSpec>(
        if let Err(e) = duties_service
            .beacon_nodes
            .run(
-                duties_service.require_synced,
+                RequireSynced::No,
                OfflineOnFailure::Yes,
                |beacon_node| async move {
                    let _timer = metrics::start_timer_vec(
@@ -856,7 +851,7 @@ async fn post_validator_duties_attester<T: SlotClock + 'static, E: EthSpec>(
    duties_service
        .beacon_nodes
        .first_success(
-            duties_service.require_synced,
+            RequireSynced::No,
            OfflineOnFailure::Yes,
            |beacon_node| async move {
                let _timer = metrics::start_timer_vec(
@@ -1063,7 +1058,7 @@ async fn poll_beacon_proposers<T: SlotClock + 'static, E: EthSpec>(
        let download_result = duties_service
            .beacon_nodes
            .first_success(
-                duties_service.require_synced,
+                RequireSynced::No,
                OfflineOnFailure::Yes,
                |beacon_node| async move {
                    let _timer = metrics::start_timer_vec(
--- a/validator_client/src/duties_service/sync.rs
+++ b/validator_client/src/duties_service/sync.rs
@@ -1,4 +1,4 @@
-use crate::beacon_node_fallback::OfflineOnFailure;
+use crate::beacon_node_fallback::{OfflineOnFailure, RequireSynced};
 use crate::{
    doppelganger_service::DoppelgangerStatus,
    duties_service::{DutiesService, Error},
@@ -422,7 +422,7 @@ pub async fn poll_sync_committee_duties_for_period<T: SlotClock + 'static, E: Et
    let duties_response = duties_service
        .beacon_nodes
        .first_success(
-            duties_service.require_synced,
+            RequireSynced::No,
            OfflineOnFailure::Yes,
            |beacon_node| async move {
                beacon_node
--- a/validator_client/src/lib.rs
+++ b/validator_client/src/lib.rs
@@ -446,11 +446,6 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
            slot_clock: slot_clock.clone(),
            beacon_nodes: beacon_nodes.clone(),
            validator_store: validator_store.clone(),
-            require_synced: if config.allow_unsynced_beacon_node {
-                RequireSynced::Yes
-            } else {
-                RequireSynced::No
-            },
            spec: context.eth2_config.spec.clone(),
            context: duties_context,
            enable_high_validator_count_metrics: config.enable_high_validator_count_metrics,
@@ -620,8 +615,8 @@ async fn init_from_beacon_node<E: EthSpec>(
    context: &RuntimeContext<E>,
 ) -> Result<(u64, Hash256), String> {
    loop {
-        beacon_nodes.update_unready_candidates().await;
-        proposer_nodes.update_unready_candidates().await;
+        beacon_nodes.update_all_candidates().await;
+        proposer_nodes.update_all_candidates().await;

        let num_available = beacon_nodes.num_available().await;
        let num_total = beacon_nodes.num_total();
--- a/validator_client/src/preparation_service.rs
+++ b/validator_client/src/preparation_service.rs
@@ -332,7 +332,7 @@ impl<T: SlotClock + 'static, E: EthSpec> PreparationService<T, E> {
        match self
            .beacon_nodes
            .run(
-                RequireSynced::Yes,
+                RequireSynced::No,
                OfflineOnFailure::Yes,
                |beacon_node| async move {
                    beacon_node
@@ -451,7 +451,7 @@ impl<T: SlotClock + 'static, E: EthSpec> PreparationService<T, E> {
                match self
                    .beacon_nodes
                    .first_success(
-                        RequireSynced::Yes,
+                        RequireSynced::No,
                        OfflineOnFailure::No,
                        |beacon_node| async move {
                            beacon_node.post_validator_register_validator(batch).await
--- a/validator_client/src/sync_committee_service.rs
+++ b/validator_client/src/sync_committee_service.rs
@@ -178,7 +178,7 @@ impl<T: SlotClock + 'static, E: EthSpec> SyncCommitteeService<T, E> {
        let response = self
            .beacon_nodes
            .first_success(
-                RequireSynced::Yes,
+                RequireSynced::No,
                OfflineOnFailure::Yes,
                |beacon_node| async move {
                    match beacon_node.get_beacon_blocks_root(BlockId::Head).await {