Fix simulator

2026-05-01 19:53:32 +00:00 · 2023-07-21 17:49:52 +10:00
parent 61bc700fdf
commit 129568565e
4 changed files with 94 additions and 46 deletions
--- a/testing/simulator/src/checks.rs
+++ b/testing/simulator/src/checks.rs
@@ -1,4 +1,5 @@
 use crate::local_network::LocalNetwork;
+use crate::ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE;
 use node_test_rig::eth2::types::{BlockId, StateId};
 use std::time::Duration;
 use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned};
@@ -244,30 +245,42 @@ pub async fn verify_transition_block_finalized<E: EthSpec>(
    }
 }

+// Causes the execution node at `node_index` to disconnect from the execution layer 1 epoch after
+// the merge transition.
 pub async fn disconnect_from_execution_layer<E: EthSpec>(
    network: LocalNetwork<E>,
    transition_epoch: Epoch,
    slot_duration: Duration,
+    node_index: usize,
 ) -> Result<(), String> {
    epoch_delay(transition_epoch + 1, slot_duration, E::slots_per_epoch()).await;

    eprintln!("Disabling Execution Layer");

-    // Take the execution node at position 0 and force it to return the `syncing` status.
-    network.execution_nodes.read()[0]
+    // Force the execution node to return the `syncing` status.
+    network.execution_nodes.read()[node_index]
        .server
        .all_payloads_syncing(false);
+    Ok(())
+}

-    // Run for 2 epochs with the 0th execution node stalled.
+pub async fn reconnect_to_execution_layer<E: EthSpec>(
+    network: LocalNetwork<E>,
+    transition_epoch: Epoch,
+    slot_duration: Duration,
+    node_index: usize,
+    epochs_offline: u64,
+) -> Result<(), String> {
+    // Ensure this is configurable by only reconnecting after `epochs_offline`.
    epoch_delay(
-        transition_epoch + 1 + 2,
+        transition_epoch + epochs_offline,
        slot_duration,
        E::slots_per_epoch(),
    )
    .await;

-    // Restore the functionality of the 0th execution node.
-    network.execution_nodes.read()[0]
+    // Restore the functionality of the execution node.
+    network.execution_nodes.read()[node_index]
        .server
        .all_payloads_valid();

@@ -278,32 +291,76 @@ pub async fn disconnect_from_execution_layer<E: EthSpec>(
 /// Ensure all validators have attested correctly.
 pub async fn check_attestation_correctness<E: EthSpec>(
    network: LocalNetwork<E>,
+    start_epoch: Epoch,
+    // Must be 2 epochs less than the end of the simulation.
    upto_epoch: Epoch,
    slots_per_epoch: u64,
    slot_duration: Duration,
+    // Select which node to query. Will use this node to determine the global network performance.
+    node_index: usize,
 ) -> Result<(), String> {
    let upto_slot = upto_epoch.start_slot(slots_per_epoch);
    slot_delay(upto_slot, slot_duration).await;

-    let remote_node = &network.remote_nodes()?[1];
+    let remote_node = &network.remote_nodes()?[node_index];

    let results = remote_node
        .get_lighthouse_analysis_attestation_performance(
-            Epoch::new(2),
+            start_epoch,
            upto_epoch - 2,
            "global".to_string(),
        )
        .await
        .map_err(|e| format!("Unable to get attestation performance: {e}"))?;

+    let mut active_successes: f64 = 0.0;
+    let mut head_successes: f64 = 0.0;
+    let mut target_successes: f64 = 0.0;
+    let mut source_successes: f64 = 0.0;
+
+    let mut total: f64 = 0.0;
+
    for result in results {
        for epochs in result.epochs.values() {
-            assert!(epochs.active);
-            assert!(epochs.head);
-            assert!(epochs.target);
-            assert!(epochs.source);
+            total += 1.0;
+
+            if epochs.active {
+                active_successes += 1.0;
+            }
+            if epochs.head {
+                head_successes += 1.0;
+            }
+            if epochs.target {
+                target_successes += 1.0;
+            }
+            if epochs.source {
+                source_successes += 1.0;
+            }
        }
    }
+    let active_percent = active_successes / total * 100.0;
+    let head_percent = head_successes / total * 100.0;
+    let target_percent = target_successes / total * 100.0;
+    let source_percent = source_successes / total * 100.0;
+
+    eprintln!("Total Attestations: {}", total);
+    eprintln!("Active: {}: {}%", active_successes, active_percent);
+    eprintln!("Head: {}: {}%", head_successes, head_percent);
+    eprintln!("Target: {}: {}%", target_successes, target_percent);
+    eprintln!("Source: {}: {}%", source_successes, source_percent);
+
+    if active_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Active percent was below required level".to_string());
+    }
+    if head_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Head percent was below required level".to_string());
+    }
+    if target_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Target percent was below required level".to_string());
+    }
+    if source_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Source percent was below required level".to_string());
+    }

    Ok(())
 }
--- a/testing/simulator/src/cli.rs
+++ b/testing/simulator/src/cli.rs
@@ -148,11 +148,6 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
                    .takes_value(true)
                    .default_value("3")
                    .help("Speed up factor. Please use a divisor of 12."))
-                .arg(Arg::with_name("post-merge")
-                    .short("m")
-                    .long("post-merge")
-                    .takes_value(false)
-                    .help("Simulate the merge transition"))
                .arg(Arg::with_name("continue_after_checks")
                    .short("c")
                    .long("continue_after_checks")
--- a/testing/simulator/src/fallback_sim.rs
+++ b/testing/simulator/src/fallback_sim.rs
@@ -21,7 +21,7 @@ use sensitive_url::SensitiveUrl;
 use tokio::time::sleep;
 use types::{Epoch, EthSpec, MinimalEthSpec};

-const END_EPOCH: u64 = 20;
+const END_EPOCH: u64 = 16;
 const ALTAIR_FORK_EPOCH: u64 = 1;
 const BELLATRIX_FORK_EPOCH: u64 = 2;

@@ -36,7 +36,6 @@ pub fn run_fallback_sim(matches: &ArgMatches) -> Result<(), String> {
        value_t!(matches, "validators_per_vc", usize).expect("missing validators_per_vc default");
    let bns_per_vc = value_t!(matches, "bns_per_vc", usize).expect("missing bns_per_vc default");
    let continue_after_checks = matches.is_present("continue_after_checks");
-    //let post_merge_sim = matches.is_present("post-merge");
    let post_merge_sim = true;

    println!("Fallback Simulator:");
@@ -106,8 +105,6 @@ fn fallback_sim(

    let total_validator_count = validators_per_vc * vc_count;
    let node_count = vc_count * bns_per_vc;
-    //let altair_fork_version = spec.altair_fork_version;
-    //let bellatrix_fork_version = spec.bellatrix_fork_version;

    spec.seconds_per_slot /= speed_up_factor;
    spec.seconds_per_slot = max(1, spec.seconds_per_slot);
@@ -266,41 +263,32 @@ fn fallback_sim(
         * breakage by changes to the VC.
         */

-        let (
-            //finalization,
-            //block_prod,
-            //validator_count,
-            //onboarding,
-            fallback,
-            check_attestations,
-            //fork,
-            //sync_aggregate,
-            //transition,
-        ) = futures::join!(
-            //checks::verify_first_finalization(network.clone(), slot_duration),
+        let (disconnect, reconnect, check_attestations) = futures::join!(
            checks::disconnect_from_execution_layer(
                network.clone(),
                Epoch::new(BELLATRIX_FORK_EPOCH),
-                slot_duration
+                slot_duration,
+                0
+            ),
+            checks::reconnect_to_execution_layer(
+                network.clone(),
+                Epoch::new(BELLATRIX_FORK_EPOCH),
+                slot_duration,
+                0,
+                2,
            ),
            checks::check_attestation_correctness(
                network.clone(),
-                Epoch::new(END_EPOCH),
+                Epoch::new(0),
+                Epoch::new(END_EPOCH - 2),
                MinimalEthSpec::slots_per_epoch(),
-                slot_duration
+                slot_duration,
+                1,
            ),
-            //checks::stall_node(network.clone(), 0, 30, seconds_per_slot),
        );
-
-        //block_prod?;
-        //finalization?;
-        //validator_count?;
-        //onboarding?;
-        fallback?;
+        disconnect?;
+        reconnect?;
        check_attestations?;
-        //fork?;
-        //sync_aggregate?;
-        //transition?;

        // The `final_future` either completes immediately or never completes, depending on the value
        // of `continue_after_checks`.
--- a/testing/simulator/src/main.rs
+++ b/testing/simulator/src/main.rs
@@ -30,6 +30,14 @@ use env_logger::{Builder, Env};
 use local_network::LocalNetwork;
 use types::MinimalEthSpec;

+// Since simulator tests are non-deterministic and there is a non-zero chance of missed
+// attestations, define an acceptable network-wide attestation performance.
+//
+// This has potential to block CI so it should be set conservatively enough that spurious failures
+// don't become very common, but not so conservatively that regressions to the fallback mechanism
+// cannot be detected.
+pub(crate) const ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE: f64 = 99.0;
+
 pub type E = MinimalEthSpec;

 fn main() {