Publish subscriptions to all beacon nodes (#3529)

## Issue Addressed

Resolves #3516 

## Proposed Changes

Adds a beacon fallback function for running a beacon node http query on all available fallbacks instead of returning on a first successful result. Uses the new `run_on_all` method for attestation and sync committee subscriptions. 

## Additional Info

Please provide any additional information. For example, future considerations
or information useful for reviewers.
This commit is contained in:
Pawan Dhananjay
2022-09-28 19:53:35 +00:00
parent abcebf276f
commit 6779912fe4
9 changed files with 198 additions and 16 deletions

View File

@@ -105,11 +105,13 @@ impl<E> Error<E> {
}
/// The list of errors encountered whilst attempting to perform a query.
pub struct AllErrored<E>(pub Vec<(String, Error<E>)>);
pub struct Errors<E>(pub Vec<(String, Error<E>)>);
impl<E: Debug> fmt::Display for AllErrored<E> {
impl<E: Debug> fmt::Display for Errors<E> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "All endpoints failed")?;
if !self.0.is_empty() {
write!(f, "Some endpoints failed, num_failed: {}", self.0.len())?;
}
for (i, (id, error)) in self.0.iter().enumerate() {
let comma = if i + 1 < self.0.len() { "," } else { "" };
@@ -294,15 +296,22 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
pub struct BeaconNodeFallback<T, E> {
candidates: Vec<CandidateBeaconNode<E>>,
slot_clock: Option<T>,
disable_run_on_all: bool,
spec: ChainSpec,
log: Logger,
}
impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
pub fn new(candidates: Vec<CandidateBeaconNode<E>>, spec: ChainSpec, log: Logger) -> Self {
pub fn new(
candidates: Vec<CandidateBeaconNode<E>>,
disable_run_on_all: bool,
spec: ChainSpec,
log: Logger,
) -> Self {
Self {
candidates,
slot_clock: None,
disable_run_on_all,
spec,
log,
}
@@ -396,7 +405,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<O, AllErrored<Err>>
) -> Result<O, Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
R: Future<Output = Result<O, Err>>,
@@ -486,6 +495,145 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
}
// There were no candidates already ready and we were unable to make any of them ready.
Err(AllErrored(errors))
Err(Errors(errors))
}
/// Run `func` against all candidates in `self`, collecting the result of `func` against each
/// candidate.
///
/// First this function will try all nodes with a suitable status. If no candidates are suitable
/// it will try updating the status of all unsuitable nodes and re-running `func` again.
///
/// Note: This function returns `Ok(())` if `func` returned successfully on all beacon nodes.
/// It returns a list of errors along with the beacon node id that failed for `func`.
/// Since this ignores the actual result of `func`, this function should only be used for beacon
/// node calls whose results we do not care about, only that they completed successfully.
pub async fn run_on_all<'a, F, O, Err, R>(
&'a self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<(), Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
R: Future<Output = Result<O, Err>>,
{
let mut results = vec![];
let mut to_retry = vec![];
let mut retry_unsynced = vec![];
// Run `func` using a `candidate`, returning the value or capturing errors.
//
// We use a macro instead of a closure here since it is not trivial to move `func` into a
// closure.
macro_rules! try_func {
($candidate: ident) => {{
inc_counter_vec(&ENDPOINT_REQUESTS, &[$candidate.beacon_node.as_ref()]);
// There exists a race condition where `func` may be called when the candidate is
// actually not ready. We deem this an acceptable inefficiency.
match func(&$candidate.beacon_node).await {
Ok(val) => results.push(Ok(val)),
Err(e) => {
// If we have an error on this function, make the client as not-ready.
//
// There exists a race condition where the candidate may have been marked
// as ready between the `func` call and now. We deem this an acceptable
// inefficiency.
if matches!(offline_on_failure, OfflineOnFailure::Yes) {
$candidate.set_offline().await;
}
results.push(Err((
$candidate.beacon_node.to_string(),
Error::RequestFailed(e),
)));
inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
}
}
}};
}
// First pass: try `func` on all synced and ready candidates.
//
// This ensures that we always choose a synced node if it is available.
for candidate in &self.candidates {
match candidate.status(RequireSynced::Yes).await {
Err(CandidateError::NotSynced) if require_synced == false => {
// This client is unsynced we will try it after trying all synced clients
retry_unsynced.push(candidate);
}
Err(_) => {
// This client was not ready on the first pass, we might try it again later.
to_retry.push(candidate);
}
Ok(_) => try_func!(candidate),
}
}
// Second pass: try `func` on ready unsynced candidates. This only runs if we permit
// unsynced candidates.
//
// Due to async race-conditions, it is possible that we will send a request to a candidate
// that has been set to an offline/unready status. This is acceptable.
if require_synced == false {
for candidate in retry_unsynced {
try_func!(candidate);
}
}
// Third pass: try again, attempting to make non-ready clients become ready.
for candidate in to_retry {
// If the candidate hasn't luckily transferred into the correct state in the meantime,
// force an update of the state.
let new_status = match candidate.status(require_synced).await {
Ok(()) => Ok(()),
Err(_) => {
candidate
.refresh_status(self.slot_clock.as_ref(), &self.spec, &self.log)
.await
}
};
match new_status {
Ok(()) => try_func!(candidate),
Err(CandidateError::NotSynced) if require_synced == false => try_func!(candidate),
Err(e) => {
results.push(Err((
candidate.beacon_node.to_string(),
Error::Unavailable(e),
)));
}
}
}
let errors: Vec<_> = results.into_iter().filter_map(|res| res.err()).collect();
if !errors.is_empty() {
Err(Errors(errors))
} else {
Ok(())
}
}
/// Call `func` on first beacon node that returns success or on all beacon nodes
/// depending on the value of `disable_run_on_all`.
pub async fn run<'a, F, Err, R>(
&'a self,
require_synced: RequireSynced,
offline_on_failure: OfflineOnFailure,
func: F,
) -> Result<(), Errors<Err>>
where
F: Fn(&'a BeaconNodeHttpClient) -> R,
R: Future<Output = Result<(), Err>>,
{
if self.disable_run_on_all {
self.first_success(require_synced, offline_on_failure, func)
.await?;
Ok(())
} else {
self.run_on_all(require_synced, offline_on_failure, func)
.await
}
}
}