Implement el_offline and use it in the VC (#4295)

## Issue Addressed

Closes https://github.com/sigp/lighthouse/issues/4291, part of #3613.

## Proposed Changes

- Implement the `el_offline` field on `/eth/v1/node/syncing`. We set `el_offline=true` if:
  - The EL's internal status is `Offline` or `AuthFailed`, _or_
  - The most recent call to `newPayload` resulted in an error (more on this in a moment).
- Use the `el_offline` field in the VC to mark nodes with offline ELs as _unsynced_. These nodes will still be used, but only after synced nodes.
- Overhaul the usage of `RequireSynced` so that `::No` is used almost everywhere. The `--allow-unsynced` flag was broken and had the opposite effect to intended, so it has been deprecated.
- Add tests for the EL being offline on the upcheck call, and being offline due to the newPayload check.

## Why track `newPayload` errors?

Tracking the EL's online/offline status is too coarse-grained to be useful in practice, because:

- If the EL is timing out on some calls, it's unlikely to time out on the `upcheck` call, which is _just_ `eth_syncing`. Every failed call is followed by an upcheck [here](https://github.com/sigp/lighthouse/blob/693886b941/beacon_node/execution_layer/src/engines.rs#L372-L380), which would have the effect of masking the failure and keeping the status _online_.
- The `newPayload` call is the most likely to time out. It's the call in which ELs tend to do most of their work (often 1-2 seconds), with `forkchoiceUpdated` usually returning much faster (<50ms).
- If `newPayload` is failing consistently (e.g. timing out) then this is a good indication that either the node's EL is in trouble, or the network as a whole is. In the first case validator clients _should_ prefer other BNs if they have one available. In the second case, all of their BNs will likely report `el_offline` and they'll just have to proceed with trying to use them.

## Additional Changes

- Add utility method `ForkName::latest` which is quite convenient for test writing, but probably other things too.
- Delete some stale comments from when we used to support multiple execution nodes.
2026-04-25 08:48:25 +00:00 · 2023-05-17 05:51:56 +00:00
parent aaa118ff0e
commit 3052db29fe
21 changed files with 307 additions and 116 deletions
--- a/beacon_node/execution_layer/src/engines.rs
+++ b/beacon_node/execution_layer/src/engines.rs
@@ -238,6 +238,11 @@ impl Engine {
        **self.state.read().await == EngineStateInternal::Synced
    }

+    /// Returns `true` if the engine has a status other than synced or syncing.
+    pub async fn is_offline(&self) -> bool {
+        EngineState::from(**self.state.read().await) == EngineState::Offline
+    }
+
    /// Run the `EngineApi::upcheck` function if the node's last known state is not synced. This
    /// might be used to recover the node if offline.
    pub async fn upcheck(&self) {
--- a/beacon_node/execution_layer/src/lib.rs
+++ b/beacon_node/execution_layer/src/lib.rs
@@ -222,6 +222,11 @@ struct Inner<E: EthSpec> {
    builder_profit_threshold: Uint256,
    log: Logger,
    always_prefer_builder_payload: bool,
+    /// Track whether the last `newPayload` call errored.
+    ///
+    /// This is used *only* in the informational sync status endpoint, so that a VC using this
+    /// node can prefer another node with a healthier EL.
+    last_new_payload_errored: RwLock<bool>,
 }

 #[derive(Debug, Default, Clone, Serialize, Deserialize)]
@@ -350,6 +355,7 @@ impl<T: EthSpec> ExecutionLayer<T> {
            builder_profit_threshold: Uint256::from(builder_profit_threshold),
            log,
            always_prefer_builder_payload,
+            last_new_payload_errored: RwLock::new(false),
        };

        Ok(Self {
@@ -542,6 +548,15 @@ impl<T: EthSpec> ExecutionLayer<T> {
        synced
    }

+    /// Return `true` if the execution layer is offline or returning errors on `newPayload`.
+    ///
+    /// This function should never be used to prevent any operation in the beacon node, but can
+    /// be used to give an indication on the HTTP API that the node's execution layer is struggling,
+    /// which can in turn be used by the VC.
+    pub async fn is_offline_or_erroring(&self) -> bool {
+        self.engine().is_offline().await || *self.inner.last_new_payload_errored.read().await
+    }
+
    /// Updates the proposer preparation data provided by validators
    pub async fn update_proposer_preparation(
        &self,
@@ -1116,18 +1131,6 @@ impl<T: EthSpec> ExecutionLayer<T> {
    }

    /// Maps to the `engine_newPayload` JSON-RPC call.
-    ///
-    /// ## Fallback Behaviour
-    ///
-    /// The request will be broadcast to all nodes, simultaneously. It will await a response (or
-    /// failure) from all nodes and then return based on the first of these conditions which
-    /// returns true:
-    ///
-    /// - Error::ConsensusFailure if some nodes return valid and some return invalid
-    /// - Valid, if any nodes return valid.
-    /// - Invalid, if any nodes return invalid.
-    /// - Syncing, if any nodes return syncing.
-    /// - An error, if all nodes return an error.
    pub async fn notify_new_payload(
        &self,
        execution_payload: &ExecutionPayload<T>,
@@ -1156,12 +1159,18 @@ impl<T: EthSpec> ExecutionLayer<T> {
                &["new_payload", status.status.into()],
            );
        }
+        *self.inner.last_new_payload_errored.write().await = result.is_err();

        process_payload_status(execution_payload.block_hash(), result, self.log())
            .map_err(Box::new)
            .map_err(Error::EngineError)
    }

+    /// Update engine sync status.
+    pub async fn upcheck(&self) {
+        self.engine().upcheck().await;
+    }
+
    /// Register that the given `validator_index` is going to produce a block at `slot`.
    ///
    /// The block will be built atop `head_block_root` and the EL will need to prepare an
@@ -1221,18 +1230,6 @@ impl<T: EthSpec> ExecutionLayer<T> {
    }

    /// Maps to the `engine_consensusValidated` JSON-RPC call.
-    ///
-    /// ## Fallback Behaviour
-    ///
-    /// The request will be broadcast to all nodes, simultaneously. It will await a response (or
-    /// failure) from all nodes and then return based on the first of these conditions which
-    /// returns true:
-    ///
-    /// - Error::ConsensusFailure if some nodes return valid and some return invalid
-    /// - Valid, if any nodes return valid.
-    /// - Invalid, if any nodes return invalid.
-    /// - Syncing, if any nodes return syncing.
-    /// - An error, if all nodes return an error.
    pub async fn notify_forkchoice_updated(
        &self,
        head_block_hash: ExecutionBlockHash,
--- a/beacon_node/execution_layer/src/test_utils/handle_rpc.rs
+++ b/beacon_node/execution_layer/src/test_utils/handle_rpc.rs
@@ -30,7 +30,12 @@ pub async fn handle_rpc<T: EthSpec>(
        .map_err(|s| (s, GENERIC_ERROR_CODE))?;

    match method {
-        ETH_SYNCING => Ok(JsonValue::Bool(false)),
+        ETH_SYNCING => ctx
+            .syncing_response
+            .lock()
+            .clone()
+            .map(JsonValue::Bool)
+            .map_err(|message| (message, GENERIC_ERROR_CODE)),
        ETH_GET_BLOCK_BY_NUMBER => {
            let tag = params
                .get(0)
@@ -145,7 +150,9 @@ pub async fn handle_rpc<T: EthSpec>(

            // Canned responses set by block hash take priority.
            if let Some(status) = ctx.get_new_payload_status(request.block_hash()) {
-                return Ok(serde_json::to_value(JsonPayloadStatusV1::from(status)).unwrap());
+                return status
+                    .map(|status| serde_json::to_value(JsonPayloadStatusV1::from(status)).unwrap())
+                    .map_err(|message| (message, GENERIC_ERROR_CODE));
            }

            let (static_response, should_import) =
@@ -320,11 +327,15 @@ pub async fn handle_rpc<T: EthSpec>(

            // Canned responses set by block hash take priority.
            if let Some(status) = ctx.get_fcu_payload_status(&head_block_hash) {
-                let response = JsonForkchoiceUpdatedV1Response {
-                    payload_status: JsonPayloadStatusV1::from(status),
-                    payload_id: None,
-                };
-                return Ok(serde_json::to_value(response).unwrap());
+                return status
+                    .map(|status| {
+                        let response = JsonForkchoiceUpdatedV1Response {
+                            payload_status: JsonPayloadStatusV1::from(status),
+                            payload_id: None,
+                        };
+                        serde_json::to_value(response).unwrap()
+                    })
+                    .map_err(|message| (message, GENERIC_ERROR_CODE));
            }

            let mut response = ctx
--- a/beacon_node/execution_layer/src/test_utils/mod.rs
+++ b/beacon_node/execution_layer/src/test_utils/mod.rs
@@ -126,6 +126,7 @@ impl<T: EthSpec> MockServer<T> {
            hook: <_>::default(),
            new_payload_statuses: <_>::default(),
            fcu_payload_statuses: <_>::default(),
+            syncing_response: Arc::new(Mutex::new(Ok(false))),
            engine_capabilities: Arc::new(RwLock::new(DEFAULT_ENGINE_CAPABILITIES)),
            _phantom: PhantomData,
        });
@@ -414,14 +415,25 @@ impl<T: EthSpec> MockServer<T> {
        self.ctx
            .new_payload_statuses
            .lock()
-            .insert(block_hash, status);
+            .insert(block_hash, Ok(status));
    }

    pub fn set_fcu_payload_status(&self, block_hash: ExecutionBlockHash, status: PayloadStatusV1) {
        self.ctx
            .fcu_payload_statuses
            .lock()
-            .insert(block_hash, status);
+            .insert(block_hash, Ok(status));
+    }
+
+    pub fn set_new_payload_error(&self, block_hash: ExecutionBlockHash, error: String) {
+        self.ctx
+            .new_payload_statuses
+            .lock()
+            .insert(block_hash, Err(error));
+    }
+
+    pub fn set_syncing_response(&self, res: Result<bool, String>) {
+        *self.ctx.syncing_response.lock() = res;
    }
 }

@@ -478,8 +490,11 @@ pub struct Context<T: EthSpec> {
    //
    // This is a more flexible and less stateful alternative to `static_new_payload_response`
    // and `preloaded_responses`.
-    pub new_payload_statuses: Arc<Mutex<HashMap<ExecutionBlockHash, PayloadStatusV1>>>,
-    pub fcu_payload_statuses: Arc<Mutex<HashMap<ExecutionBlockHash, PayloadStatusV1>>>,
+    pub new_payload_statuses:
+        Arc<Mutex<HashMap<ExecutionBlockHash, Result<PayloadStatusV1, String>>>>,
+    pub fcu_payload_statuses:
+        Arc<Mutex<HashMap<ExecutionBlockHash, Result<PayloadStatusV1, String>>>>,
+    pub syncing_response: Arc<Mutex<Result<bool, String>>>,

    pub engine_capabilities: Arc<RwLock<EngineCapabilities>>,
    pub _phantom: PhantomData<T>,
@@ -489,14 +504,14 @@ impl<T: EthSpec> Context<T> {
    pub fn get_new_payload_status(
        &self,
        block_hash: &ExecutionBlockHash,
-    ) -> Option<PayloadStatusV1> {
+    ) -> Option<Result<PayloadStatusV1, String>> {
        self.new_payload_statuses.lock().get(block_hash).cloned()
    }

    pub fn get_fcu_payload_status(
        &self,
        block_hash: &ExecutionBlockHash,
-    ) -> Option<PayloadStatusV1> {
+    ) -> Option<Result<PayloadStatusV1, String>> {
        self.fcu_payload_statuses.lock().get(block_hash).cloned()
    }
 }