diff --git a/testing/simulator/src/checks.rs b/testing/simulator/src/checks.rs
index ac672cbc21..c0be219d30 100644
--- a/testing/simulator/src/checks.rs
+++ b/testing/simulator/src/checks.rs
@@ -1,4 +1,5 @@
 use crate::local_network::LocalNetwork;
+use crate::ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE;
 use node_test_rig::eth2::types::{BlockId, StateId};
 use std::time::Duration;
 use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned};
@@ -244,30 +245,48 @@ pub async fn verify_transition_block_finalized(
     }
 }
 
+/// Causes the execution node at `node_index` to disconnect from the execution layer 1 epoch
+/// after the merge transition.
 pub async fn disconnect_from_execution_layer(
     network: LocalNetwork,
     transition_epoch: Epoch,
     slot_duration: Duration,
+    node_index: usize,
 ) -> Result<(), String> {
     epoch_delay(transition_epoch + 1, slot_duration, E::slots_per_epoch()).await;
 
     eprintln!("Disabling Execution Layer");
 
-    // Take the execution node at position 0 and force it to return the `syncing` status.
-    network.execution_nodes.read()[0]
+    // Force the execution node to return the `syncing` status.
+    network.execution_nodes.read()[node_index]
         .server
         .all_payloads_syncing(false);
+    Ok(())
+}
 
-    // Run for 2 epochs with the 0th execution node stalled.
+/// Restores the execution node at `node_index` after it has been offline for `epochs_offline`
+/// epochs.
+///
+/// `disconnect_from_execution_layer` stalls the node at `transition_epoch + 1`, so it is
+/// restored at `transition_epoch + 1 + epochs_offline`.
+pub async fn reconnect_to_execution_layer(
+    network: LocalNetwork,
+    transition_epoch: Epoch,
+    slot_duration: Duration,
+    node_index: usize,
+    epochs_offline: u64,
+) -> Result<(), String> {
+    // Count `epochs_offline` from the disconnect epoch (`transition_epoch + 1`) rather than from
+    // the transition itself, otherwise the outage is one epoch shorter than requested.
     epoch_delay(
-        transition_epoch + 1 + 2,
+        transition_epoch + 1 + epochs_offline,
         slot_duration,
         E::slots_per_epoch(),
     )
     .await;
 
-    // Restore the functionality of the 0th execution node.
-    network.execution_nodes.read()[0]
+    // Restore the functionality of the execution node.
+    network.execution_nodes.read()[node_index]
         .server
         .all_payloads_valid();
 
@@ -278,32 +297,88 @@ pub async fn disconnect_from_execution_layer(
 /// Ensure all validators have attested correctly.
+///
+/// After `slot_delay` has run for the start slot of `upto_epoch`, the node at `node_index` is
+/// queried for "global" attestation performance from `start_epoch` to `upto_epoch - 2`. Returns
+/// `Err` if that query fails, if it yields no samples, or if the active, head, target or source
+/// hit rate is below `ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE`.
 pub async fn check_attestation_correctness(
     network: LocalNetwork,
+    start_epoch: Epoch,
+    // Must be 2 epochs less than the end of the simulation.
     upto_epoch: Epoch,
     slots_per_epoch: u64,
     slot_duration: Duration,
+    // Select which node to query. Will use this node to determine the global network performance.
+    node_index: usize,
 ) -> Result<(), String> {
     let upto_slot = upto_epoch.start_slot(slots_per_epoch);
     slot_delay(upto_slot, slot_duration).await;
 
-    let remote_node = &network.remote_nodes()?[1];
+    let remote_node = &network.remote_nodes()?[node_index];
 
     let results = remote_node
         .get_lighthouse_analysis_attestation_performance(
-            Epoch::new(2),
+            start_epoch,
             upto_epoch - 2,
             "global".to_string(),
         )
         .await
         .map_err(|e| format!("Unable to get attestation performance: {e}"))?;
 
+    let mut active_successes: f64 = 0.0;
+    let mut head_successes: f64 = 0.0;
+    let mut target_successes: f64 = 0.0;
+    let mut source_successes: f64 = 0.0;
+
+    let mut total: f64 = 0.0;
+
     for result in results {
         for epochs in result.epochs.values() {
-            assert!(epochs.active);
-            assert!(epochs.head);
-            assert!(epochs.target);
-            assert!(epochs.source);
+            total += 1.0;
+
+            if epochs.active {
+                active_successes += 1.0;
+            }
+            if epochs.head {
+                head_successes += 1.0;
+            }
+            if epochs.target {
+                target_successes += 1.0;
+            }
+            if epochs.source {
+                source_successes += 1.0;
+            }
         }
     }
+
+    // With no samples every ratio below is `0.0 / 0.0 = NaN`, and `NaN < threshold` is always
+    // false, so an empty response would otherwise pass the check.
+    if total == 0.0 {
+        return Err("No attestation performance data was returned".to_string());
+    }
+
+    let active_percent = active_successes / total * 100.0;
+    let head_percent = head_successes / total * 100.0;
+    let target_percent = target_successes / total * 100.0;
+    let source_percent = source_successes / total * 100.0;
+
+    eprintln!("Total Attestations: {}", total);
+    eprintln!("Active: {}: {}%", active_successes, active_percent);
+    eprintln!("Head: {}: {}%", head_successes, head_percent);
+    eprintln!("Target: {}: {}%", target_successes, target_percent);
+    eprintln!("Source: {}: {}%", source_successes, source_percent);
+
+    if active_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Active percent was below required level".to_string());
+    }
+    if head_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Head percent was below required level".to_string());
+    }
+    if target_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Target percent was below required level".to_string());
+    }
+    if source_percent < ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE {
+        return Err("Source percent was below required level".to_string());
+    }
 
     Ok(())
 }
diff --git a/testing/simulator/src/cli.rs b/testing/simulator/src/cli.rs
index 0b888314cf..2eff6c5243 100644
--- a/testing/simulator/src/cli.rs
+++ b/testing/simulator/src/cli.rs
@@ -148,11 +148,6 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
                     .takes_value(true)
                     .default_value("3")
                     .help("Speed up factor. Please use a divisor of 12."))
-                .arg(Arg::with_name("post-merge")
-                    .short("m")
-                    .long("post-merge")
-                    .takes_value(false)
-                    .help("Simulate the merge transition"))
                 .arg(Arg::with_name("continue_after_checks")
                     .short("c")
                     .long("continue_after_checks")
diff --git a/testing/simulator/src/fallback_sim.rs b/testing/simulator/src/fallback_sim.rs
index 63b50d5c3c..c122df7b33 100644
--- a/testing/simulator/src/fallback_sim.rs
+++ b/testing/simulator/src/fallback_sim.rs
@@ -21,7 +21,7 @@ use sensitive_url::SensitiveUrl;
 use tokio::time::sleep;
 use types::{Epoch, EthSpec, MinimalEthSpec};
 
-const END_EPOCH: u64 = 20;
+const END_EPOCH: u64 = 16;
 const ALTAIR_FORK_EPOCH: u64 = 1;
 const BELLATRIX_FORK_EPOCH: u64 = 2;
 
@@ -36,7 +36,6 @@ pub fn run_fallback_sim(matches: &ArgMatches) -> Result<(), String> {
         value_t!(matches, "validators_per_vc", usize).expect("missing validators_per_vc default");
     let bns_per_vc = value_t!(matches, "bns_per_vc", usize).expect("missing bns_per_vc default");
     let continue_after_checks = matches.is_present("continue_after_checks");
-    //let post_merge_sim = matches.is_present("post-merge");
     let post_merge_sim = true;
 
     println!("Fallback Simulator:");
@@ -106,8 +105,6 @@ fn fallback_sim(
 
     let total_validator_count = validators_per_vc * vc_count;
     let node_count = vc_count * bns_per_vc;
-    //let altair_fork_version = spec.altair_fork_version;
-    //let bellatrix_fork_version = spec.bellatrix_fork_version;
 
     spec.seconds_per_slot /= speed_up_factor;
     spec.seconds_per_slot = max(1, spec.seconds_per_slot);
@@ -266,41 +263,32 @@ fn fallback_sim(
          * breakage by changes to the VC.
          */
 
-        let (
-            //finalization,
-            //block_prod,
-            //validator_count,
-            //onboarding,
-            fallback,
-            check_attestations,
-            //fork,
-            //sync_aggregate,
-            //transition,
-        ) = futures::join!(
-            //checks::verify_first_finalization(network.clone(), slot_duration),
+        let (disconnect, reconnect, check_attestations) = futures::join!(
             checks::disconnect_from_execution_layer(
                 network.clone(),
                 Epoch::new(BELLATRIX_FORK_EPOCH),
-                slot_duration
+                slot_duration,
+                0
+            ),
+            checks::reconnect_to_execution_layer(
+                network.clone(),
+                Epoch::new(BELLATRIX_FORK_EPOCH),
+                slot_duration,
+                0,
+                2,
             ),
             checks::check_attestation_correctness(
                 network.clone(),
-                Epoch::new(END_EPOCH),
+                Epoch::new(0),
+                Epoch::new(END_EPOCH - 2),
                 MinimalEthSpec::slots_per_epoch(),
-                slot_duration
+                slot_duration,
+                1,
             ),
-            //checks::stall_node(network.clone(), 0, 30, seconds_per_slot),
         );
-
-        //block_prod?;
-        //finalization?;
-        //validator_count?;
-        //onboarding?;
-        fallback?;
+        disconnect?;
+        reconnect?;
         check_attestations?;
-        //fork?;
-        //sync_aggregate?;
-        //transition?;
 
         // The `final_future` either completes immediately or never completes, depending on the value
         // of `continue_after_checks`.
diff --git a/testing/simulator/src/main.rs b/testing/simulator/src/main.rs
index 83ca1135ef..4b27184f74 100644
--- a/testing/simulator/src/main.rs
+++ b/testing/simulator/src/main.rs
@@ -30,6 +30,14 @@ use env_logger::{Builder, Env};
 use local_network::LocalNetwork;
 use types::MinimalEthSpec;
 
+/// Lowest network-wide attestation hit rate, in percent, that the fallback simulator accepts.
+///
+/// Simulator tests are non-deterministic and there is a non-zero chance of missed attestations,
+/// so an acceptable level is defined here. Because this can block CI, keep the threshold low
+/// enough that spurious failures stay rare, but not so low that regressions to the fallback
+/// mechanism cannot be detected.
+pub(crate) const ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE: f64 = 99.0;
+
 pub type E = MinimalEthSpec;
 
 fn main() {