Rework Validator Client fallback mechanism (#4393)

* Rework Validator Client fallback mechanism

* Add CI workflow for fallback simulator

* Tie-break with sync distance for non-synced nodes

* Fix simulator

* Cleanup unused code

* More improvements

* Add IsOptimistic enum for readability

* Use configurable sync distance tiers

* Fix tests

* Combine status and health and improve logging

* Fix nodes not being marked as available

* Fix simulator

* Fix tests again

* Increase fallback simulator tolerance

* Add http api endpoint

* Fix todos and tests

* Update simulator

* Merge branch 'unstable' into vc-fallback

* Add suggestions

* Add id to ui endpoint

* Remove unnecessary clones

* Formatting

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Fix flag tests

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Fix conflicts

* Merge branch 'unstable' into vc-fallback

* Remove unnecessary pubs

* Simplify `compute_distance_tier` and reduce notifier awaits

* Use the more descriptive `user_index` instead of `id`

* Combine sync distance tolerance flags into one

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* wip

* Use new simulator from unstable

* Fix cli text

* Remove leftover files

* Remove old commented code

* Merge branch 'unstable' into vc-fallback

* Update cli text

* Silence candidate errors when pre-genesis

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Retry on failure

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Remove disable_run_on_all

* Remove unused error variant

* Fix out of date comment

* Merge branch 'unstable' into vc-fallback

* Remove unnecessary as_u64

* Remove more out of date comments

* Use tokio RwLock and remove parking_lot

* Merge branch 'unstable' into vc-fallback

* Formatting

* Ensure nodes are still added to total when not available

* Allow VC to detect when BN comes online

* Fix ui endpoint

* Don't have block_service as an Option

* Merge branch 'unstable' into vc-fallback

* Clean up lifetimes and futures

* Revert "Don't have block_service as an Option"

This reverts commit b5445a09e9.

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Improve rwlock sanitation using clones

* Merge branch 'unstable' into vc-fallback

* Drop read lock immediately by cloning the vec.
This commit is contained in:
Mac L
2024-10-03 09:57:12 +04:00
committed by GitHub
parent 17849b58ec
commit f870b66f49
24 changed files with 1316 additions and 778 deletions

View File

@@ -1,80 +1,27 @@
use crate::beacon_node_fallback::CandidateError;
use eth2::BeaconNodeHttpClient;
use slog::{debug, error, warn, Logger};
use slot_clock::SlotClock;
use eth2::{types::Slot, BeaconNodeHttpClient};
use slog::{warn, Logger};
/// A distance in slots.
const SYNC_TOLERANCE: u64 = 4;
/// Returns
///
/// `Ok(())` if the beacon node is synced and ready for action,
/// `Err(CandidateError::Offline)` if the beacon node is unreachable,
/// `Err(CandidateError::NotSynced)` if the beacon node indicates that it is syncing **AND**
/// it is more than `SYNC_TOLERANCE` behind the highest
/// known slot.
///
/// The second condition means that even if the beacon node thinks that it's syncing, we'll still
/// try to use it if it's close enough to the head.
pub async fn check_synced<T: SlotClock>(
pub async fn check_node_health(
beacon_node: &BeaconNodeHttpClient,
slot_clock: &T,
log_opt: Option<&Logger>,
) -> Result<(), CandidateError> {
log: &Logger,
) -> Result<(Slot, bool, bool), CandidateError> {
let resp = match beacon_node.get_node_syncing().await {
Ok(resp) => resp,
Err(e) => {
if let Some(log) = log_opt {
warn!(
log,
"Unable connect to beacon node";
"error" => %e
)
}
warn!(
log,
"Unable connect to beacon node";
"error" => %e
);
return Err(CandidateError::Offline);
}
};
let bn_is_synced = !resp.data.is_syncing || (resp.data.sync_distance.as_u64() < SYNC_TOLERANCE);
let is_synced = bn_is_synced && !resp.data.el_offline;
if let Some(log) = log_opt {
if !is_synced {
debug!(
log,
"Beacon node sync status";
"status" => format!("{:?}", resp),
);
warn!(
log,
"Beacon node is not synced";
"sync_distance" => resp.data.sync_distance.as_u64(),
"head_slot" => resp.data.head_slot.as_u64(),
"endpoint" => %beacon_node,
"el_offline" => resp.data.el_offline,
);
}
if let Some(local_slot) = slot_clock.now() {
let remote_slot = resp.data.head_slot + resp.data.sync_distance;
if remote_slot + 1 < local_slot || local_slot + 1 < remote_slot {
error!(
log,
"Time discrepancy with beacon node";
"msg" => "check the system time on this host and the beacon node",
"beacon_node_slot" => remote_slot,
"local_slot" => local_slot,
"endpoint" => %beacon_node,
);
}
}
}
if is_synced {
Ok(())
} else {
Err(CandidateError::NotSynced)
}
Ok((
resp.data.head_slot,
resp.data.is_optimistic,
resp.data.el_offline,
))
}