Rework Validator Client fallback mechanism

This commit is contained in:
Mac L
2023-06-13 16:43:38 +10:00
parent 687c58fde0
commit 71ee4cf2d8
17 changed files with 1303 additions and 175 deletions

View File

@@ -2,13 +2,19 @@
//! "fallback" behaviour; it will try a request on all of the nodes until one or none of them
//! succeed.
use crate::check_synced::check_synced;
use crate::beacon_node_health::{
BeaconNodeHealth, BeaconNodeSyncDistanceTiers, ExecutionEngineHealth, SyncDistanceTier,
};
use crate::check_synced::{check_node_health, check_synced};
use crate::http_metrics::metrics::{inc_counter_vec, ENDPOINT_ERRORS, ENDPOINT_REQUESTS};
use environment::RuntimeContext;
use eth2::BeaconNodeHttpClient;
use futures::future;
use parking_lot::RwLock as PLRwLock;
use serde_derive::{Deserialize, Serialize};
use slog::{debug, error, info, warn, Logger};
use slot_clock::SlotClock;
use std::cmp::Ordering;
use std::fmt;
use std::fmt::Debug;
use std::future::Future;
@@ -16,7 +22,7 @@ use std::marker::PhantomData;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::{sync::RwLock, time::sleep};
use types::{ChainSpec, Config, EthSpec};
use types::{ChainSpec, Config as ConfigSpec, EthSpec};
/// Message emitted when the VC detects the BN is using a different spec.
const UPDATE_REQUIRED_LOG_HINT: &str = "this VC or the remote BN may need updating";
@@ -30,6 +36,16 @@ const UPDATE_REQUIRED_LOG_HINT: &str = "this VC or the remote BN may need updati
/// having the correct nodes up and running prior to the start of the slot.
const SLOT_LOOKAHEAD: Duration = Duration::from_secs(2);
/// Configuration for the Beacon Node fallback.
#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
pub struct Config {
    /// Disables publishing http api requests to all beacon nodes for select api calls.
    pub disable_run_on_all: bool,
    /// Sets the number of slots behind the head a beacon node is allowed to be to still be
    /// considered `synced`. `None` means the compiled-in default tolerance is used.
    pub sync_tolerance: Option<u64>,
}
/// Indicates a measurement of latency between the VC and a BN.
pub struct LatencyMeasurement {
/// An identifier for the beacon node (e.g. the URL).
@@ -139,21 +155,52 @@ pub enum CandidateError {
Offline,
Incompatible,
NotSynced,
TimeDiscrepancy,
}
/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
/// for a query.
#[derive(Debug)]
pub struct CandidateBeaconNode<E> {
    /// Position of this node in the `--beacon-nodes` list. Used for identity
    /// comparisons and as an ordering tie-breaker.
    id: usize,
    /// The HTTP client used to query this node.
    beacon_node: BeaconNodeHttpClient,
    /// The most recent health reading; `None` until measured, or after a
    /// failed health check (sorted last).
    health: PLRwLock<Option<BeaconNodeHealth>>,
    /// Coarse availability status of the node.
    status: RwLock<Result<(), CandidateError>>,
    _phantom: PhantomData<E>,
}
impl<E: EthSpec> PartialEq for CandidateBeaconNode<E> {
    /// Two candidates are equal when they share an ID and point at the same
    /// beacon node; mutable state (`health`, `status`) is deliberately ignored.
    fn eq(&self, other: &Self) -> bool {
        self.id == other.id && self.beacon_node == other.beacon_node
    }
}

impl<E: EthSpec> Eq for CandidateBeaconNode<E> {}
/// Orders candidates by their most recent health reading: the healthiest
/// candidate sorts first and candidates without a reading sort last.
impl<E: EthSpec> Ord for CandidateBeaconNode<E> {
    fn cmp(&self, other: &Self) -> Ordering {
        match (&(*self.health.read()), &(*other.health.read())) {
            // Neither node has a health reading: fall back to the node ID so
            // `cmp` never reports `Equal` for nodes that `PartialEq` treats as
            // distinct (the `Ord` contract requires consistency with `Eq`).
            (None, None) => self.id.cmp(&other.id),
            // A node without a health reading sorts after any node with one.
            (None, _) => Ordering::Greater,
            (_, None) => Ordering::Less,
            (Some(health_1), Some(health_2)) => health_1.cmp(health_2),
        }
    }
}
impl<E: EthSpec> PartialOrd for CandidateBeaconNode<E> {
    // Delegates to `Ord`: the ordering is total, so a comparison always exists.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl<E: EthSpec> CandidateBeaconNode<E> {
/// Instantiate a new node.
pub fn new(beacon_node: BeaconNodeHttpClient) -> Self {
pub fn new(beacon_node: BeaconNodeHttpClient, id: usize) -> Self {
Self {
id,
beacon_node,
health: PLRwLock::new(None),
status: RwLock::new(Err(CandidateError::Uninitialized)),
_phantom: PhantomData,
}
@@ -204,6 +251,64 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
new_status
}
pub async fn refresh_health<T: SlotClock>(
&self,
distance_tiers: &BeaconNodeSyncDistanceTiers,
slot_clock: Option<&T>,
spec: &ChainSpec,
log: &Logger,
) -> Result<(), CandidateError> {
if let Err(e) = self.is_compatible(spec, log).await {
*self.status.write().await = Err(e);
return Ok(());
}
if let Some(slot_clock) = slot_clock {
match check_node_health(&self.beacon_node, log).await {
Ok((head, is_optimistic, el_offline)) => {
// Currently ExecutionEngineHealth is solely determined by online status.
let execution_status = if el_offline {
ExecutionEngineHealth::Unhealthy
} else {
ExecutionEngineHealth::Healthy
};
let new_health = BeaconNodeHealth::from_status(
self.id,
head,
is_optimistic,
execution_status,
distance_tiers,
slot_clock,
);
warn!(
log,
"Health of Beacon Node: {}, updated. Health tier: {}",
new_health.get_id(),
new_health.get_health_tier()
);
*self.health.write() = Some(new_health);
*self.status.write().await = Ok(());
Ok(())
}
Err(status) => {
// Set the health as None which is sorted last in the list.
*self.health.write() = None;
*self.status.write().await = Err(status);
Ok(())
}
}
} else {
// Slot clock will only be None at startup.
// Assume compatible nodes are available.
*self.status.write().await = Ok(());
Ok(())
}
}
/// Checks if the node is reachable.
async fn is_online(&self, was_offline: bool, log: &Logger) -> Result<(), CandidateError> {
let result = self
@@ -240,7 +345,7 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
async fn is_compatible(&self, spec: &ChainSpec, log: &Logger) -> Result<(), CandidateError> {
let config = self
.beacon_node
.get_config_spec::<Config>()
.get_config_spec::<ConfigSpec>()
.await
.map_err(|e| {
error!(
@@ -319,10 +424,12 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
/// A collection of `CandidateBeaconNode` that can be used to perform requests with "fallback"
/// behaviour, where the failure of one candidate results in the next candidate receiving an
/// identical query.
#[derive(Clone, Debug)]
pub struct BeaconNodeFallback<T, E> {
candidates: Vec<CandidateBeaconNode<E>>,
slot_clock: Option<T>,
candidates: Arc<RwLock<Vec<CandidateBeaconNode<E>>>>,
disable_run_on_all: bool,
distance_tiers: BeaconNodeSyncDistanceTiers,
slot_clock: Option<T>,
spec: ChainSpec,
log: Logger,
}
@@ -330,14 +437,16 @@ pub struct BeaconNodeFallback<T, E> {
impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
pub fn new(
candidates: Vec<CandidateBeaconNode<E>>,
disable_run_on_all: bool,
config: Config,
spec: ChainSpec,
log: Logger,
) -> Self {
let distance_tiers = BeaconNodeSyncDistanceTiers::from_config(&config);
Self {
candidates,
candidates: Arc::new(RwLock::new(candidates)),
disable_run_on_all: config.disable_run_on_all,
distance_tiers,
slot_clock: None,
disable_run_on_all,
spec,
log,
}
@@ -353,16 +462,22 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
}
/// The count of candidates, regardless of their state.
pub fn num_total(&self) -> usize {
self.candidates.len()
pub async fn num_total(&self) -> usize {
self.candidates.read().await.len()
}
/// The count of synced and ready candidates.
pub async fn num_synced(&self) -> usize {
let mut n = 0;
for candidate in &self.candidates {
if candidate.status(RequireSynced::Yes).await.is_ok() {
n += 1
for candidate in self.candidates.read().await.iter() {
if let Some(cand) = candidate.health.read().as_ref() {
if self
.distance_tiers
.distance_tier(cand.health_tier.sync_distance)
== SyncDistanceTier::Synced
{
n += 1
}
}
}
n
@@ -371,9 +486,15 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// The count of synced and ready fallbacks excluding the primary beacon node candidate.
pub async fn num_synced_fallback(&self) -> usize {
let mut n = 0;
for candidate in self.candidates.iter().skip(1) {
if candidate.status(RequireSynced::Yes).await.is_ok() {
n += 1
for candidate in self.candidates.read().await.iter().skip(1) {
if let Some(cand) = candidate.health.read().as_ref() {
if self
.distance_tiers
.distance_tier(cand.health_tier.sync_distance)
== SyncDistanceTier::Synced
{
n += 1
}
}
}
n
@@ -382,7 +503,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// The count of candidates that are online and compatible, but not necessarily synced.
pub async fn num_available(&self) -> usize {
let mut n = 0;
for candidate in &self.candidates {
for candidate in self.candidates.read().await.iter() {
if candidate.status(RequireSynced::No).await.is_ok() {
n += 1
}
@@ -396,24 +517,36 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// low quality responses. To route around this it's best to poll all connected beacon nodes.
/// A previous implementation of this function polled only the unavailable BNs.
pub async fn update_all_candidates(&self) {
let futures = self
.candidates
let candidates = self.candidates.read().await;
let futures = candidates
.iter()
.map(|candidate| {
candidate.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
candidate.refresh_health(
&self.distance_tiers,
self.slot_clock.as_ref(),
&self.spec,
&self.log,
)
})
.collect::<Vec<_>>();
// run all updates concurrently and ignore errors
// Run all updates concurrently and ignore errors.
let _ = future::join_all(futures).await;
drop(candidates);
// Sort the list to put the healthiest candidate first.
let mut write = self.candidates.write().await;
write.sort();
}
/// Concurrently send a request to all candidates (regardless of
/// offline/online) status and attempt to collect a rough reading on the
/// latency between the VC and candidate.
pub async fn measure_latency(&self) -> Vec<LatencyMeasurement> {
let futures: Vec<_> = self
.candidates
let candidates = self.candidates.read().await;
let futures: Vec<_> = candidates
.iter()
.map(|candidate| async {
let beacon_node_id = candidate.beacon_node.to_string();
@@ -455,20 +588,18 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// First this function will try all nodes with a suitable status. If no candidates are suitable
/// or all the requests fail, it will try updating the status of all unsuitable nodes and
/// re-running `func` again.
pub async fn first_success<'a, F, O, Err, R>(
&'a self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
pub async fn first_success<F, O, Err, R>(
&self,
_require_synced: RequireSynced,
_offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<O, Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
F: Fn(BeaconNodeHttpClient) -> R,
R: Future<Output = Result<O, Err>>,
Err: Debug,
{
let mut errors = vec![];
let mut to_retry = vec![];
let mut retry_unsynced = vec![];
let log = &self.log.clone();
// Run `func` using a `candidate`, returning the value or capturing errors.
@@ -481,7 +612,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// There exists a race condition where `func` may be called when the candidate is
// actually not ready. We deem this an acceptable inefficiency.
match func(&$candidate.beacon_node).await {
match func($candidate.beacon_node.clone()).await {
Ok(val) => return Ok(val),
Err(e) => {
debug!(
@@ -495,9 +626,9 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// There exists a race condition where the candidate may have been marked
// as ready between the `func` call and now. We deem this an acceptable
// inefficiency.
if matches!(offline_on_failure, OfflineOnFailure::Yes) {
$candidate.set_offline().await;
}
//if matches!(offline_on_failure, OfflineOnFailure::Yes) {
// $candidate.set_offline().await;
//}
errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
}
@@ -508,53 +639,9 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// First pass: try `func` on all synced and ready candidates.
//
// This ensures that we always choose a synced node if it is available.
for candidate in &self.candidates {
match candidate.status(RequireSynced::Yes).await {
Err(e @ CandidateError::NotSynced) if require_synced == false => {
// This client is unsynced we will try it after trying all synced clients
retry_unsynced.push(candidate);
errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
}
Err(e) => {
// This client was not ready on the first pass, we might try it again later.
to_retry.push(candidate);
errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
}
_ => try_func!(candidate),
}
}
// Second pass: try `func` on ready unsynced candidates. This only runs if we permit
// unsynced candidates.
//
// Due to async race-conditions, it is possible that we will send a request to a candidate
// that has been set to an offline/unready status. This is acceptable.
if require_synced == false {
for candidate in retry_unsynced {
try_func!(candidate);
}
}
// Third pass: try again, attempting to make non-ready clients become ready.
for candidate in to_retry {
// If the candidate hasn't luckily transferred into the correct state in the meantime,
// force an update of the state.
let new_status = match candidate.status(require_synced).await {
Ok(()) => Ok(()),
Err(_) => {
candidate
.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
.await
}
};
match new_status {
Ok(()) => try_func!(candidate),
Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
Err(e) => {
errors.push((candidate.beacon_node.to_string(), Error::Unavailable(e)));
}
}
let candidates = self.candidates.read().await;
for candidate in candidates.iter() {
try_func!(candidate);
}
// There were no candidates already ready and we were unable to make any of them ready.
@@ -571,19 +658,17 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// It returns a list of errors along with the beacon node id that failed for `func`.
/// Since this ignores the actual result of `func`, this function should only be used for beacon
/// node calls whose results we do not care about, only that they completed successfully.
pub async fn run_on_all<'a, F, O, Err, R>(
&'a self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
pub async fn run_on_all<F, O, Err, R>(
&self,
_require_synced: RequireSynced,
_offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<(), Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
F: Fn(BeaconNodeHttpClient) -> R,
R: Future<Output = Result<O, Err>>,
{
let mut results = vec![];
let mut to_retry = vec![];
let mut retry_unsynced = vec![];
// Run `func` using a `candidate`, returning the value or capturing errors.
//
@@ -595,7 +680,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// There exists a race condition where `func` may be called when the candidate is
// actually not ready. We deem this an acceptable inefficiency.
match func(&$candidate.beacon_node).await {
match func($candidate.beacon_node.clone()).await {
Ok(val) => results.push(Ok(val)),
Err(e) => {
// If we have an error on this function, make the client as not-ready.
@@ -603,9 +688,9 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// There exists a race condition where the candidate may have been marked
// as ready between the `func` call and now. We deem this an acceptable
// inefficiency.
if matches!(offline_on_failure, OfflineOnFailure::Yes) {
$candidate.set_offline().await;
}
//if matches!(offline_on_failure, OfflineOnFailure::Yes) {
// $candidate.set_offline().await;
//}
results.push(Err((
$candidate.beacon_node.to_string(),
Error::RequestFailed(e),
@@ -619,54 +704,9 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
// First pass: try `func` on all synced and ready candidates.
//
// This ensures that we always choose a synced node if it is available.
for candidate in &self.candidates {
match candidate.status(RequireSynced::Yes).await {
Err(CandidateError::NotSynced) if require_synced == false => {
// This client is unsynced we will try it after trying all synced clients
retry_unsynced.push(candidate);
}
Err(_) => {
// This client was not ready on the first pass, we might try it again later.
to_retry.push(candidate);
}
Ok(_) => try_func!(candidate),
}
}
// Second pass: try `func` on ready unsynced candidates. This only runs if we permit
// unsynced candidates.
//
// Due to async race-conditions, it is possible that we will send a request to a candidate
// that has been set to an offline/unready status. This is acceptable.
if require_synced == false {
for candidate in retry_unsynced {
try_func!(candidate);
}
}
// Third pass: try again, attempting to make non-ready clients become ready.
for candidate in to_retry {
// If the candidate hasn't luckily transferred into the correct state in the meantime,
// force an update of the state.
let new_status = match candidate.status(require_synced).await {
Ok(()) => Ok(()),
Err(_) => {
candidate
.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
.await
}
};
match new_status {
Ok(()) => try_func!(candidate),
Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
Err(e) => {
results.push(Err((
candidate.beacon_node.to_string(),
Error::Unavailable(e),
)));
}
}
let candidates = self.candidates.read().await;
for candidate in candidates.iter() {
try_func!(candidate);
}
let errors: Vec<_> = results.into_iter().filter_map(|res| res.err()).collect();
@@ -680,14 +720,14 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// Call `func` on first beacon node that returns success or on all beacon nodes
/// depending on the value of `disable_run_on_all`.
pub async fn run<'a, F, Err, R>(
&'a self,
pub async fn run<F, Err, R>(
&self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<(), Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
F: Fn(BeaconNodeHttpClient) -> R,
R: Future<Output = Result<(), Err>>,
Err: Debug,
{
@@ -701,3 +741,154 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::beacon_node_health::BeaconNodeHealthTier;
    use crate::SensitiveUrl;
    use eth2::Timeouts;
    use types::{MainnetEthSpec, Slot};

    type E = MainnetEthSpec;

    /// Build a candidate pointing at `http://example_{n}.com` with an
    /// `n`-second timeout and the given candidate ID.
    fn candidate(n: u64, id: usize) -> CandidateBeaconNode<E> {
        CandidateBeaconNode::new(
            BeaconNodeHttpClient::new(
                SensitiveUrl::parse(&format!("http://example_{}.com", n)).unwrap(),
                Timeouts::set_all(Duration::from_secs(n)),
            ),
            id,
        )
    }

    /// Build a health reading. Ordering is determined solely by `health_tier`
    /// (and `id` for tie-breaks); all other fields are irrelevant.
    fn health(
        id: usize,
        tier: u8,
        distance: u64,
        execution_status: ExecutionEngineHealth,
    ) -> BeaconNodeHealth {
        BeaconNodeHealth {
            id,
            head: Slot::new(99),
            optimistic_status: false,
            execution_status,
            health_tier: BeaconNodeHealthTier::new(tier, Slot::new(distance)),
        }
    }

    #[test]
    fn check_candidate_order() {
        // The duplicate candidate ID (3) on candidates 4 and 5 is deliberate:
        // ordering must be driven by health, not by the candidate ID.
        let candidate_1 = candidate(1, 1);
        let candidate_2 = candidate(2, 2);
        let candidate_3 = candidate(3, 3);
        let candidate_4 = candidate(4, 3);
        let candidate_5 = candidate(5, 3);

        let expected_candidate_1 = candidate(1, 1);
        let expected_candidate_2 = candidate(2, 2);
        let expected_candidate_3 = candidate(3, 3);
        let expected_candidate_4 = candidate(4, 3);
        let expected_candidate_5 = candidate(5, 3);

        // Candidates 4 and 5 share tier 4; the tie is broken by the health ID.
        *candidate_1.health.write() = Some(health(1, 1, 1, ExecutionEngineHealth::Healthy));
        *candidate_2.health.write() = Some(health(2, 2, 1, ExecutionEngineHealth::Healthy));
        *candidate_3.health.write() = Some(health(3, 3, 1, ExecutionEngineHealth::Healthy));
        *candidate_4.health.write() = Some(health(4, 4, 1, ExecutionEngineHealth::Healthy));
        *candidate_5.health.write() = Some(health(5, 4, 5, ExecutionEngineHealth::Unhealthy));

        let mut candidates = vec![
            candidate_3,
            candidate_5,
            candidate_1,
            candidate_4,
            candidate_2,
        ];
        let expected_candidates = vec![
            expected_candidate_1,
            expected_candidate_2,
            expected_candidate_3,
            expected_candidate_4,
            expected_candidate_5,
        ];

        candidates.sort();

        assert_eq!(candidates, expected_candidates);
    }
}

View File

@@ -0,0 +1,363 @@
use crate::beacon_node_fallback::Config;
use slot_clock::SlotClock;
use std::cmp::Ordering;
use std::fmt::{Debug, Display, Formatter};
use types::Slot;
// Sync distances between 0 and DEFAULT_SYNC_TOLERANCE are considered `synced`.
// Sync distance tiers are determined by the different modifiers.
const DEFAULT_SYNC_TOLERANCE: Slot = Slot::new(4);
const SYNC_DISTANCE_SMALL_MODIFIER: Slot = Slot::new(7);
const SYNC_DISTANCE_MEDIUM_MODIFIER: Slot = Slot::new(31);
type HealthTier = u8;
type SyncDistance = Slot;
type OptimisticStatus = bool;
/// Helpful enum which is used when pattern matching to determine health tier.
#[derive(PartialEq, Debug)]
pub enum SyncDistanceTier {
    /// Distance is within the configured sync tolerance.
    Synced,
    /// Slightly behind: within `SYNC_DISTANCE_SMALL_MODIFIER` slots past the tolerance.
    Small,
    /// Moderately behind: within `SYNC_DISTANCE_MEDIUM_MODIFIER` slots past the tolerance.
    Medium,
    /// Further behind than the medium boundary.
    Large,
}
/// Contains the different sync distance tiers which are determined at runtime by the
/// `sync_tolerance` CLI flag.
#[derive(Clone, Debug)]
pub struct BeaconNodeSyncDistanceTiers {
    /// Upper bound (inclusive) of the `Synced` tier.
    synced: SyncDistance,
    /// Upper bound (inclusive) of the `Small` tier.
    small: SyncDistance,
    /// Upper bound (inclusive) of the `Medium` tier.
    medium: SyncDistance,
}
impl BeaconNodeSyncDistanceTiers {
    /// Build the tier boundaries from the CLI config, falling back to the
    /// compiled-in defaults when `sync_tolerance` is not set.
    pub fn from_config(config: &Config) -> Self {
        config
            .sync_tolerance
            .map_or_else(Self::default, |sync_tolerance| {
                let synced = Slot::new(sync_tolerance);
                Self {
                    synced,
                    small: synced + SYNC_DISTANCE_SMALL_MODIFIER,
                    medium: synced + SYNC_DISTANCE_MEDIUM_MODIFIER,
                }
            })
    }

    /// Takes a given sync distance and determines its tier based on the
    /// `sync_tolerance` defined by the CLI.
    ///
    /// Each boundary is inclusive: a distance equal to `self.synced` is still
    /// `Synced`, and so on. Comparing `Slot`s directly (rather than building
    /// exclusive `u64` ranges with `+ 1`) also avoids overflow when a
    /// boundary sits at `Slot::max_value()`.
    pub fn distance_tier(&self, distance: SyncDistance) -> SyncDistanceTier {
        if distance <= self.synced {
            SyncDistanceTier::Synced
        } else if distance <= self.small {
            SyncDistanceTier::Small
        } else if distance <= self.medium {
            SyncDistanceTier::Medium
        } else {
            SyncDistanceTier::Large
        }
    }
}
impl Default for BeaconNodeSyncDistanceTiers {
    /// Boundaries derived from the compile-time defaults:
    /// synced <= 4, small <= 11, medium <= 35 slots behind.
    fn default() -> Self {
        Self {
            synced: DEFAULT_SYNC_TOLERANCE,
            small: DEFAULT_SYNC_TOLERANCE + SYNC_DISTANCE_SMALL_MODIFIER,
            medium: DEFAULT_SYNC_TOLERANCE + SYNC_DISTANCE_MEDIUM_MODIFIER,
        }
    }
}
/// Execution Node health metrics.
///
/// Variant order matters for the derived `Ord`: `Healthy` compares less than
/// (i.e. sorts before) `Unhealthy`.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
#[allow(dead_code)]
pub enum ExecutionEngineHealth {
    Healthy,
    Unhealthy,
}
/// A node's overall health rank (`tier`, lower is healthier) together with
/// the sync distance that produced it.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct BeaconNodeHealthTier {
    pub tier: HealthTier,
    pub sync_distance: SyncDistance,
}
impl Display for BeaconNodeHealthTier {
    // Formats as e.g. `Tier1(4)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "Tier{}({})", self.tier, self.sync_distance)
    }
}
impl Ord for BeaconNodeHealthTier {
    // NOTE(review): this ordering compares only `tier`, while the derived
    // `PartialEq` also compares `sync_distance`, so `cmp` can return `Equal`
    // for values that are not `==`. Confirm no caller relies on `Ord`/`Eq`
    // consistency before changing either side.
    fn cmp(&self, other: &Self) -> Ordering {
        self.tier.cmp(&other.tier)
    }
}

impl PartialOrd for BeaconNodeHealthTier {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl BeaconNodeHealthTier {
    /// Construct a tier together with the sync distance that produced it.
    pub fn new(tier: HealthTier, sync_distance: SyncDistance) -> Self {
        Self {
            tier,
            sync_distance,
        }
    }
}
/// Beacon Node Health metrics.
#[derive(Debug, PartialEq, Eq)]
pub struct BeaconNodeHealth {
    /// The ID of the Beacon Node. This should correspond with its position in the `--beacon-nodes`
    /// list. Note that the ID field is used to tie-break nodes with the same health so that nodes
    /// with a lower ID are preferred.
    pub id: usize,
    /// The slot number of the head.
    pub head: Slot,
    /// Whether the node is optimistically synced.
    pub optimistic_status: OptimisticStatus,
    /// The status of the nodes connected Execution Engine.
    pub execution_status: ExecutionEngineHealth,
    /// The overall health tier of the Beacon Node. Used to rank the nodes for the purposes of
    /// fallbacks.
    pub health_tier: BeaconNodeHealthTier,
}
impl Ord for BeaconNodeHealth {
    // Orders primarily by health tier (lower = healthier), breaking ties by
    // node ID so that earlier-listed nodes are preferred.
    //
    // NOTE(review): the derived `PartialEq` compares all fields (including
    // `head`), so `cmp` may return `Equal` for values that are not `==` —
    // confirm no caller relies on `Ord`/`Eq` consistency.
    fn cmp(&self, other: &Self) -> Ordering {
        let ordering = self.health_tier.cmp(&other.health_tier);
        if ordering == Ordering::Equal {
            // Tie-break node health by ID.
            self.id.cmp(&other.id)
        } else {
            ordering
        }
    }
}

impl PartialOrd for BeaconNodeHealth {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl BeaconNodeHealth {
    /// Compute a full health reading for node `id` from its reported head
    /// slot, optimistic status and execution engine health, using the local
    /// slot clock to derive the sync distance.
    pub fn from_status<T: SlotClock>(
        id: usize,
        head: Slot,
        optimistic_status: OptimisticStatus,
        execution_status: ExecutionEngineHealth,
        distance_tiers: &BeaconNodeSyncDistanceTiers,
        slot_clock: &T,
    ) -> Self {
        let sync_distance = BeaconNodeHealth::compute_sync_distance(head, slot_clock);
        let health_tier = BeaconNodeHealth::compute_health_tier(
            sync_distance,
            optimistic_status,
            execution_status,
            distance_tiers,
        );

        Self {
            id,
            head,
            optimistic_status,
            execution_status,
            health_tier,
        }
    }

    /// Returns the node's ID (its position in the `--beacon-nodes` list).
    pub fn get_id(&self) -> usize {
        self.id
    }

    /// Returns the overall health tier used to rank this node.
    pub fn get_health_tier(&self) -> BeaconNodeHealthTier {
        self.health_tier
    }

    /// How many slots the node's `head` is behind the local slot clock.
    /// A head ahead of the clock saturates to a distance of 0; if the clock
    /// cannot produce a current slot, the distance is `Slot::max_value()`
    /// (i.e. maximally unhealthy).
    fn compute_sync_distance<T: SlotClock>(head: Slot, slot_clock: &T) -> SyncDistance {
        // TODO(mac) May be worth distinguishing between nodes that are ahead of the `slot_clock`.
        slot_clock
            .now()
            .map(|head_slot| head_slot.saturating_sub(head))
            .unwrap_or(Slot::max_value())
    }

    /// Maps (sync distance tier, optimistic status, execution health) onto a
    /// health tier in 1..=16, where lower is healthier. The table encodes the
    /// policy of preferring well-synced, non-optimistic nodes with healthy
    /// execution engines.
    fn compute_health_tier(
        sync_distance: SyncDistance,
        optimistic_status: OptimisticStatus,
        execution_status: ExecutionEngineHealth,
        sync_distance_tiers: &BeaconNodeSyncDistanceTiers,
    ) -> BeaconNodeHealthTier {
        let sync_distance_tier = sync_distance_tiers.distance_tier(sync_distance);
        let health = (sync_distance_tier, optimistic_status, execution_status);

        match health {
            (SyncDistanceTier::Synced, false, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(1, sync_distance)
            }
            (SyncDistanceTier::Small, false, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(2, sync_distance)
            }
            (SyncDistanceTier::Synced, false, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(3, sync_distance)
            }
            (SyncDistanceTier::Medium, false, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(4, sync_distance)
            }
            (SyncDistanceTier::Synced, true, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(5, sync_distance)
            }
            (SyncDistanceTier::Synced, true, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(6, sync_distance)
            }
            (SyncDistanceTier::Small, false, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(7, sync_distance)
            }
            (SyncDistanceTier::Small, true, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(8, sync_distance)
            }
            (SyncDistanceTier::Small, true, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(9, sync_distance)
            }
            (SyncDistanceTier::Large, false, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(10, sync_distance)
            }
            (SyncDistanceTier::Medium, false, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(11, sync_distance)
            }
            (SyncDistanceTier::Medium, true, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(12, sync_distance)
            }
            (SyncDistanceTier::Medium, true, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(13, sync_distance)
            }
            (SyncDistanceTier::Large, false, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(14, sync_distance)
            }
            (SyncDistanceTier::Large, true, ExecutionEngineHealth::Healthy) => {
                BeaconNodeHealthTier::new(15, sync_distance)
            }
            (SyncDistanceTier::Large, true, ExecutionEngineHealth::Unhealthy) => {
                BeaconNodeHealthTier::new(16, sync_distance)
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::ExecutionEngineHealth::{Healthy, Unhealthy};
    use super::{BeaconNodeHealth, BeaconNodeSyncDistanceTiers, SyncDistanceTier};
    use crate::beacon_node_fallback::Config;
    use slot_clock::{SlotClock, TestingSlotClock};
    use std::time::Duration;
    use types::Slot;

    /// Exhaustively generates every (head slot, optimistic, EE health)
    /// combination for heads 0..=64 and checks that each resulting tier is
    /// consistent with the tier table in `compute_health_tier`.
    #[test]
    fn all_possible_health_tiers() {
        let current_head = Slot::new(64);
        let config = Config::default();
        let beacon_node_sync_distance_tiers = BeaconNodeSyncDistanceTiers::from_config(&config);
        let slot_clock =
            TestingSlotClock::new(current_head, Duration::from_secs(0), Duration::from_secs(1));

        let mut health_vec = vec![];

        for head_slot in (0..=64).rev() {
            for optimistic_status in &[false, true] {
                for ee_health in &[Healthy, Unhealthy] {
                    let health = BeaconNodeHealth::from_status(
                        0,
                        Slot::new(head_slot),
                        *optimistic_status,
                        *ee_health,
                        &beacon_node_sync_distance_tiers,
                        &slot_clock,
                    );
                    health_vec.push(health);
                }
            }
        }

        for health in health_vec {
            let health_tier = health.get_health_tier();
            let tier = health_tier.tier;
            let distance = health_tier.sync_distance;
            let distance_tier = beacon_node_sync_distance_tiers.distance_tier(distance);

            // Check sync distance: these tier groups must map back to the
            // distance tier that produced them.
            if [1, 3, 5, 6].contains(&tier) {
                assert!(distance_tier == SyncDistanceTier::Synced)
            } else if [2, 7, 8, 9].contains(&tier) {
                assert!(distance_tier == SyncDistanceTier::Small);
            } else if [4, 11, 12, 13].contains(&tier) {
                assert!(distance_tier == SyncDistanceTier::Medium);
            } else {
                assert!(distance_tier == SyncDistanceTier::Large);
            }

            // Check optimistic status.
            if [1, 2, 3, 4, 7, 10, 11, 14].contains(&tier) {
                assert!(!health.optimistic_status);
            } else {
                assert!(health.optimistic_status);
            }

            // Check execution health.
            if [3, 6, 7, 9, 11, 13, 14, 16].contains(&tier) {
                assert_eq!(health.execution_status, Unhealthy);
            } else {
                assert_eq!(health.execution_status, Healthy);
            }
        }
    }

    /// Checks tier boundaries with a custom `sync_tolerance` of 8:
    /// synced = 0..=8, small = 9..=15, medium = 16..=39, large = 40+.
    #[test]
    fn sync_tolerance() {
        let config = Config {
            disable_run_on_all: false,
            sync_tolerance: Some(8),
        };

        let distance_tiers = BeaconNodeSyncDistanceTiers::from_config(&config);

        let synced_low =
            BeaconNodeHealth::compute_health_tier(Slot::new(0), false, Healthy, &distance_tiers);
        let synced_high =
            BeaconNodeHealth::compute_health_tier(Slot::new(8), false, Healthy, &distance_tiers);
        let small_low =
            BeaconNodeHealth::compute_health_tier(Slot::new(9), false, Healthy, &distance_tiers);
        let small_high =
            BeaconNodeHealth::compute_health_tier(Slot::new(15), false, Healthy, &distance_tiers);
        let medium_low =
            BeaconNodeHealth::compute_health_tier(Slot::new(16), false, Healthy, &distance_tiers);
        let medium_high =
            BeaconNodeHealth::compute_health_tier(Slot::new(39), false, Healthy, &distance_tiers);
        let large =
            BeaconNodeHealth::compute_health_tier(Slot::new(40), false, Healthy, &distance_tiers);

        assert!(synced_low.tier == 1);
        assert!(synced_high.tier == 1);
        assert!(small_low.tier == 2);
        assert!(small_high.tier == 2);
        assert!(medium_low.tier == 4);
        assert!(medium_high.tier == 4);
        assert!(large.tier == 10);
    }
}

View File

@@ -145,14 +145,14 @@ pub struct ProposerFallback<T, E: EthSpec> {
impl<T: SlotClock, E: EthSpec> ProposerFallback<T, E> {
// Try `func` on `self.proposer_nodes` first. If that doesn't work, try `self.beacon_nodes`.
pub async fn first_success_try_proposers_first<'a, F, O, Err, R>(
&'a self,
pub async fn first_success_try_proposers_first<F, O, Err, R>(
&self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<O, Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R + Clone,
F: Fn(BeaconNodeHttpClient) -> R + Clone,
R: Future<Output = Result<O, Err>>,
Err: Debug,
{
@@ -173,14 +173,14 @@ impl<T: SlotClock, E: EthSpec> ProposerFallback<T, E> {
}
// Try `func` on `self.beacon_nodes` first. If that doesn't work, try `self.proposer_nodes`.
pub async fn first_success_try_proposers_last<'a, F, O, Err, R>(
&'a self,
pub async fn first_success_try_proposers_last<F, O, Err, R>(
&self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<O, Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R + Clone,
F: Fn(BeaconNodeHttpClient) -> R + Clone,
R: Future<Output = Result<O, Err>>,
Err: Debug,
{

View File

@@ -1,5 +1,5 @@
use crate::beacon_node_fallback::CandidateError;
use eth2::BeaconNodeHttpClient;
use eth2::{types::Slot, BeaconNodeHttpClient};
use slog::{debug, error, warn, Logger};
use slot_clock::SlotClock;
@@ -70,6 +70,8 @@ pub async fn check_synced<T: SlotClock>(
"local_slot" => local_slot,
"endpoint" => %beacon_node,
);
return Err(CandidateError::TimeDiscrepancy);
}
}
}
@@ -80,3 +82,29 @@ pub async fn check_synced<T: SlotClock>(
Err(CandidateError::NotSynced)
}
}
pub async fn check_node_health(
beacon_node: &BeaconNodeHttpClient,
log: &Logger,
) -> Result<(Slot, bool, bool), CandidateError> {
let resp = match beacon_node.get_node_syncing().await {
Ok(resp) => resp,
Err(e) => {
warn!(
log,
"Unable connect to beacon node";
"error" => %e
);
return Err(CandidateError::Offline);
}
};
Ok((
resp.data.head_slot,
// Note that optimistic and EL status will both default to their healthy variants which may
// be undesirable.
resp.data.is_optimistic.unwrap_or(false),
resp.data.el_offline.unwrap_or(false),
))
}

View File

@@ -363,6 +363,14 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
.default_value("500")
.takes_value(true),
)
.arg(
Arg::with_name("beacon-node-sync-tolerance")
.long("beacon-node-sync-tolerance")
.help("Sets the number of slots behind the head that each connected Beacon Node can be \
to still be considered synced. Effectively this gives more priority to the first \
connected Beacon Node.")
.takes_value(true),
)
/*
* Experimental/development options.
*/

View File

@@ -1,5 +1,5 @@
use crate::graffiti_file::GraffitiFile;
use crate::{http_api, http_metrics};
use crate::{beacon_node_fallback, http_api, http_metrics};
use clap::ArgMatches;
use clap_utils::{flags::DISABLE_MALLOC_TUNING_FLAG, parse_optional, parse_required};
use directory::{
@@ -19,7 +19,7 @@ use types::{Address, GRAFFITI_BYTES_LEN};
pub const DEFAULT_BEACON_NODE: &str = "http://localhost:5052/";
/// Stores the core configuration for this validator instance.
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Config {
/// The data directory, which stores all validator databases
pub validator_dir: PathBuf,
@@ -50,6 +50,8 @@ pub struct Config {
pub http_api: http_api::Config,
/// Configuration for the HTTP REST API.
pub http_metrics: http_metrics::Config,
/// Configuration for the Beacon Node fallback.
pub beacon_node_fallback: beacon_node_fallback::Config,
/// Configuration for sending metrics to a remote explorer endpoint.
pub monitoring_api: Option<monitoring_api::Config>,
/// If true, enable functionality that monitors the network for attestations or proposals from
@@ -73,8 +75,6 @@ pub struct Config {
///
/// This is *not* recommended in prod and should only be used for testing.
pub block_delay: Option<Duration>,
/// Disables publishing http api requests to all beacon nodes for select api calls.
pub disable_run_on_all: bool,
/// Enables a service which attempts to measure latency between the VC and BNs.
pub enable_latency_measurement_service: bool,
/// Defines the number of validators per `validator/register_validator` request sent to the BN.
@@ -109,6 +109,7 @@ impl Default for Config {
fee_recipient: None,
http_api: <_>::default(),
http_metrics: <_>::default(),
beacon_node_fallback: <_>::default(),
monitoring_api: None,
enable_doppelganger_protection: false,
enable_high_validator_count_metrics: false,
@@ -117,7 +118,6 @@ impl Default for Config {
builder_proposals: false,
builder_registration_timestamp_override: None,
gas_limit: None,
disable_run_on_all: false,
enable_latency_measurement_service: true,
validator_registration_batch_size: 500,
}
@@ -215,7 +215,6 @@ impl Config {
"msg" => "it no longer has any effect",
);
}
config.disable_run_on_all = cli_args.is_present("disable-run-on-all");
config.disable_auto_discover = cli_args.is_present("disable-auto-discover");
config.init_slashing_protection = cli_args.is_present("init-slashing-protection");
config.use_long_timeouts = cli_args.is_present("use-long-timeouts");
@@ -258,6 +257,20 @@ impl Config {
config.beacon_nodes_tls_certs = Some(tls_certs.split(',').map(PathBuf::from).collect());
}
/*
* Beacon node fallback
*/
config.beacon_node_fallback.disable_run_on_all = cli_args.is_present("disable-run-on-all");
if let Some(sync_tolerance) = cli_args.value_of("beacon-node-sync-tolerance") {
config.beacon_node_fallback.sync_tolerance = Some(
sync_tolerance
.parse::<u64>()
.map_err(|_| "beacon-node-sync-tolerance is not a valid u64.")?,
);
}
/*
* Http API server
*/

View File

@@ -1,5 +1,6 @@
mod attestation_service;
mod beacon_node_fallback;
mod beacon_node_health;
mod block_service;
mod check_synced;
mod cli;
@@ -334,15 +335,18 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
.collect::<Result<Vec<BeaconNodeHttpClient>, String>>()?;
let num_nodes = beacon_nodes.len();
let candidates = beacon_nodes
.into_iter()
.map(CandidateBeaconNode::new)
.zip(0..num_nodes)
.map(|(node, id)| CandidateBeaconNode::new(node, id))
.collect();
let proposer_nodes_num = proposer_nodes.len();
let proposer_candidates = proposer_nodes
.into_iter()
.map(CandidateBeaconNode::new)
.zip(0..num_nodes)
.map(|(node, id)| CandidateBeaconNode::new(node, id))
.collect();
// Set the count for beacon node fallbacks excluding the primary beacon node.
@@ -364,14 +368,14 @@ impl<T: EthSpec> ProductionValidatorClient<T> {
let mut beacon_nodes: BeaconNodeFallback<_, T> = BeaconNodeFallback::new(
candidates,
config.disable_run_on_all,
config.beacon_node_fallback,
context.eth2_config.spec.clone(),
log.clone(),
);
let mut proposer_nodes: BeaconNodeFallback<_, T> = BeaconNodeFallback::new(
proposer_candidates,
config.disable_run_on_all,
config.beacon_node_fallback,
context.eth2_config.spec.clone(),
log.clone(),
);
@@ -625,10 +629,10 @@ async fn init_from_beacon_node<E: EthSpec>(
proposer_nodes.update_all_candidates().await;
let num_available = beacon_nodes.num_available().await;
let num_total = beacon_nodes.num_total();
let num_total = beacon_nodes.num_total().await;
let proposer_available = proposer_nodes.num_available().await;
let proposer_total = proposer_nodes.num_total();
let proposer_total = proposer_nodes.num_total().await;
if proposer_total > 0 && proposer_available == 0 {
warn!(

View File

@@ -49,7 +49,7 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
&http_metrics::metrics::SYNCED_BEACON_NODES_COUNT,
num_synced as i64,
);
let num_total = duties_service.beacon_nodes.num_total();
let num_total = duties_service.beacon_nodes.num_total().await;
set_gauge(
&http_metrics::metrics::TOTAL_BEACON_NODES_COUNT,
num_total as i64,