Rework Validator Client fallback mechanism (#4393)

* Rework Validator Client fallback mechanism

* Add CI workflow for fallback simulator

* Tie-break with sync distance for non-synced nodes

* Fix simulator

* Cleanup unused code

* More improvements

* Add IsOptimistic enum for readability

* Use configurable sync distance tiers

* Fix tests

* Combine status and health and improve logging

* Fix nodes not being marked as available

* Fix simulator

* Fix tests again

* Increase fallback simulator tolerance

* Add http api endpoint

* Fix todos and tests

* Update simulator

* Merge branch 'unstable' into vc-fallback

* Add suggestions

* Add id to ui endpoint

* Remove unnecessary clones

* Formatting

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Fix flag tests

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Fix conflicts

* Merge branch 'unstable' into vc-fallback

* Remove unnecessary pubs

* Simplify `compute_distance_tier` and reduce notifier awaits

* Use the more descriptive `user_index` instead of `id`

* Combine sync distance tolerance flags into one

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* wip

* Use new simulator from unstable

* Fix cli text

* Remove leftover files

* Remove old commented code

* Merge branch 'unstable' into vc-fallback

* Update cli text

* Silence candidate errors when pre-genesis

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Retry on failure

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Remove disable_run_on_all

* Remove unused error variant

* Fix out of date comment

* Merge branch 'unstable' into vc-fallback

* Remove unnecessary as_u64

* Remove more out of date comments

* Use tokio RwLock and remove parking_lot

* Merge branch 'unstable' into vc-fallback

* Formatting

* Ensure nodes are still added to total when not available

* Allow VC to detect when BN comes online

* Fix ui endpoint

* Don't have block_service as an Option

* Merge branch 'unstable' into vc-fallback

* Clean up lifetimes and futures

* Revert "Don't have block_service as an Option"

This reverts commit b5445a09e9.

* Merge branch 'unstable' into vc-fallback

* Merge branch 'unstable' into vc-fallback

* Improve rwlock sanitation using clones

* Merge branch 'unstable' into vc-fallback

* Drop read lock immediately by cloning the vec.
This commit is contained in:
Mac L
2024-10-03 09:57:12 +04:00
committed by GitHub
parent 17849b58ec
commit f870b66f49
24 changed files with 1316 additions and 778 deletions

View File

@@ -1,7 +1,7 @@
use crate::http_metrics;
use crate::{DutiesService, ProductionValidatorClient};
use lighthouse_metrics::set_gauge;
use slog::{error, info, Logger};
use slog::{debug, error, info, Logger};
use slot_clock::SlotClock;
use tokio::time::{sleep, Duration};
use types::EthSpec;
@@ -39,25 +39,32 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
duties_service: &DutiesService<T, E>,
log: &Logger,
) {
let num_available = duties_service.beacon_nodes.num_available().await;
let (candidate_info, num_available, num_synced) =
duties_service.beacon_nodes.get_notifier_info().await;
let num_total = candidate_info.len();
let num_synced_fallback = num_synced.saturating_sub(1);
set_gauge(
&http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT,
num_available as i64,
);
let num_synced = duties_service.beacon_nodes.num_synced().await;
set_gauge(
&http_metrics::metrics::SYNCED_BEACON_NODES_COUNT,
num_synced as i64,
);
let num_total = duties_service.beacon_nodes.num_total();
set_gauge(
&http_metrics::metrics::TOTAL_BEACON_NODES_COUNT,
num_total as i64,
);
if num_synced > 0 {
let primary = candidate_info
.first()
.map(|candidate| candidate.endpoint.as_str())
.unwrap_or("None");
info!(
log,
"Connected to beacon node(s)";
"primary" => primary,
"total" => num_total,
"available" => num_available,
"synced" => num_synced,
@@ -71,13 +78,36 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
"synced" => num_synced,
)
}
let num_synced_fallback = duties_service.beacon_nodes.num_synced_fallback().await;
if num_synced_fallback > 0 {
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 1);
} else {
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
}
for info in candidate_info {
if let Ok(health) = info.health {
debug!(
log,
"Beacon node info";
"status" => "Connected",
"index" => info.index,
"endpoint" => info.endpoint,
"head_slot" => %health.head,
"is_optimistic" => ?health.optimistic_status,
"execution_engine_status" => ?health.execution_status,
"health_tier" => %health.health_tier,
);
} else {
debug!(
log,
"Beacon node info";
"status" => "Disconnected",
"index" => info.index,
"endpoint" => info.endpoint,
);
}
}
if let Some(slot) = duties_service.slot_clock.now() {
let epoch = slot.epoch(E::slots_per_epoch());