Implement el_offline and use it in the VC (#4295)

## Issue Addressed Closes https://github.com/sigp/lighthouse/issues/4291, part of #3613. ## Proposed Changes - Implement the `el_offline` field on `/eth/v1/node/syncing`. We set `el_offline=true` if: - The EL's internal status is `Offline` or `AuthFailed`, _or_ - The most recent call to `newPayload` resulted in an error (more on this in a moment). - Use the `el_offline` field in the VC to mark nodes with offline ELs as _unsynced_. These nodes will still be used, but only after synced nodes. - Overhaul the usage of `RequireSynced` so that `::No` is used almost everywhere. The `--allow-unsynced` flag was broken and had the opposite effect to intended, so it has been deprecated. - Add tests for the EL being offline on the upcheck call, and being offline due to the newPayload check. ## Why track `newPayload` errors? Tracking the EL's online/offline status is too coarse-grained to be useful in practice, because: - If the EL is timing out to some calls, it's unlikely to timeout on the `upcheck` call, which is _just_ `eth_syncing`. Every failed call is followed by an upcheck [here](693886b941/beacon_node/execution_layer/src/engines.rs (L372-L380)), which would have the effect of masking the failure and keeping the status _online_. - The `newPayload` call is the most likely to time out. It's the call in which ELs tend to do most of their work (often 1-2 seconds), with `forkchoiceUpdated` usually returning much faster (<50ms). - If `newPayload` is failing consistently (e.g. timing out) then this is a good indication that either the node's EL is in trouble, or the network as a whole is. In the first case validator clients _should_ prefer other BNs if they have one available. In the second case, all of their BNs will likely report `el_offline` and they'll just have to proceed with trying to use them. ## Additional Changes - Add utility method `ForkName::latest` which is quite convenient for test writing, but probably other things too. - Delete some stale comments from when we used to support multiple execution nodes.
2026-06-15 09:48:20 +00:00 · 2023-05-17 05:51:56 +00:00
parent aaa118ff0e
commit 3052db29fe
21 changed files with 307 additions and 116 deletions
--- a/beacon_node/http_api/tests/main.rs
+++ b/beacon_node/http_api/tests/main.rs
@@ -2,4 +2,5 @@

 pub mod fork_tests;
 pub mod interactive_tests;
+pub mod status_tests;
 pub mod tests;
--- a/beacon_node/http_api/tests/status_tests.rs
+++ b/beacon_node/http_api/tests/status_tests.rs
@@ -0,0 +1,145 @@
+//! Tests related to the beacon node's sync status
+use beacon_chain::{
+    test_utils::{AttestationStrategy, BlockStrategy, SyncCommitteeStrategy},
+    BlockError,
+};
+use execution_layer::{PayloadStatusV1, PayloadStatusV1Status};
+use http_api::test_utils::InteractiveTester;
+use types::{EthSpec, ExecPayload, ForkName, MinimalEthSpec, Slot};
+
+type E = MinimalEthSpec;
+
+/// Create a new test environment that is post-merge with `chain_depth` blocks.
+async fn post_merge_tester(chain_depth: u64, validator_count: u64) -> InteractiveTester<E> {
+    // Test using latest fork so that we simulate conditions as similar to mainnet as possible.
+    let mut spec = ForkName::latest().make_genesis_spec(E::default_spec());
+    spec.terminal_total_difficulty = 1.into();
+
+    let tester = InteractiveTester::<E>::new(Some(spec), validator_count as usize).await;
+    let harness = &tester.harness;
+    let mock_el = harness.mock_execution_layer.as_ref().unwrap();
+    let execution_ctx = mock_el.server.ctx.clone();
+
+    // Move to terminal block.
+    mock_el.server.all_payloads_valid();
+    execution_ctx
+        .execution_block_generator
+        .write()
+        .move_to_terminal_block()
+        .unwrap();
+
+    // Create some chain depth.
+    harness.advance_slot();
+    harness
+        .extend_chain_with_sync(
+            chain_depth as usize,
+            BlockStrategy::OnCanonicalHead,
+            AttestationStrategy::AllValidators,
+            SyncCommitteeStrategy::AllValidators,
+        )
+        .await;
+    tester
+}
+
+/// Check `syncing` endpoint when the EL is syncing.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn el_syncing_then_synced() {
+    let num_blocks = E::slots_per_epoch() / 2;
+    let num_validators = E::slots_per_epoch();
+    let tester = post_merge_tester(num_blocks, num_validators).await;
+    let harness = &tester.harness;
+    let mock_el = harness.mock_execution_layer.as_ref().unwrap();
+
+    // EL syncing
+    mock_el.server.set_syncing_response(Ok(true));
+    mock_el.el.upcheck().await;
+
+    let api_response = tester.client.get_node_syncing().await.unwrap().data;
+    assert_eq!(api_response.el_offline, Some(false));
+    assert_eq!(api_response.is_optimistic, Some(false));
+    assert_eq!(api_response.is_syncing, false);
+
+    // EL synced
+    mock_el.server.set_syncing_response(Ok(false));
+    mock_el.el.upcheck().await;
+
+    let api_response = tester.client.get_node_syncing().await.unwrap().data;
+    assert_eq!(api_response.el_offline, Some(false));
+    assert_eq!(api_response.is_optimistic, Some(false));
+    assert_eq!(api_response.is_syncing, false);
+}
+
+/// Check `syncing` endpoint when the EL is offline (errors on upcheck).
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn el_offline() {
+    let num_blocks = E::slots_per_epoch() / 2;
+    let num_validators = E::slots_per_epoch();
+    let tester = post_merge_tester(num_blocks, num_validators).await;
+    let harness = &tester.harness;
+    let mock_el = harness.mock_execution_layer.as_ref().unwrap();
+
+    // EL offline
+    mock_el.server.set_syncing_response(Err("offline".into()));
+    mock_el.el.upcheck().await;
+
+    let api_response = tester.client.get_node_syncing().await.unwrap().data;
+    assert_eq!(api_response.el_offline, Some(true));
+    assert_eq!(api_response.is_optimistic, Some(false));
+    assert_eq!(api_response.is_syncing, false);
+}
+
+/// Check `syncing` endpoint when the EL errors on newPaylod but is not fully offline.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn el_error_on_new_payload() {
+    let num_blocks = E::slots_per_epoch() / 2;
+    let num_validators = E::slots_per_epoch();
+    let tester = post_merge_tester(num_blocks, num_validators).await;
+    let harness = &tester.harness;
+    let mock_el = harness.mock_execution_layer.as_ref().unwrap();
+
+    // Make a block.
+    let pre_state = harness.get_current_state();
+    let (block, _) = harness
+        .make_block(pre_state, Slot::new(num_blocks + 1))
+        .await;
+    let block_hash = block
+        .message()
+        .body()
+        .execution_payload()
+        .unwrap()
+        .block_hash();
+
+    // Make sure `newPayload` errors for the new block.
+    mock_el
+        .server
+        .set_new_payload_error(block_hash, "error".into());
+
+    // Attempt to process the block, which should error.
+    harness.advance_slot();
+    assert!(matches!(
+        harness.process_block_result(block.clone()).await,
+        Err(BlockError::ExecutionPayloadError(_))
+    ));
+
+    // The EL should now be *offline* according to the API.
+    let api_response = tester.client.get_node_syncing().await.unwrap().data;
+    assert_eq!(api_response.el_offline, Some(true));
+    assert_eq!(api_response.is_optimistic, Some(false));
+    assert_eq!(api_response.is_syncing, false);
+
+    // Processing a block successfully should remove the status.
+    mock_el.server.set_new_payload_status(
+        block_hash,
+        PayloadStatusV1 {
+            status: PayloadStatusV1Status::Valid,
+            latest_valid_hash: Some(block_hash),
+            validation_error: None,
+        },
+    );
+    harness.process_block_result(block).await.unwrap();
+
+    let api_response = tester.client.get_node_syncing().await.unwrap().data;
+    assert_eq!(api_response.el_offline, Some(false));
+    assert_eq!(api_response.is_optimistic, Some(false));
+    assert_eq!(api_response.is_syncing, false);
+}
--- a/beacon_node/http_api/tests/tests.rs
+++ b/beacon_node/http_api/tests/tests.rs
@@ -1721,6 +1721,8 @@ impl ApiTester {
        let expected = SyncingData {
            is_syncing: false,
            is_optimistic: Some(false),
+            // these tests run without the Bellatrix fork enabled
+            el_offline: Some(true),
            head_slot,
            sync_distance,
        };