Combine status and health and improve logging

This commit is contained in:
Mac L
2023-06-21 15:51:37 +10:00
parent d839620be1
commit 5153aff196
5 changed files with 106 additions and 65 deletions

View File

@@ -1,6 +1,5 @@
use crate::local_network::LocalNetwork; use crate::local_network::LocalNetwork;
use node_test_rig::eth2::types::{BlockId, StateId}; use node_test_rig::eth2::types::{BlockId, StateId};
use std::time::Duration; use std::time::Duration;
use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned}; use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned};

View File

@@ -64,7 +64,7 @@ pub fn run_eth1_sim(matches: &ArgMatches) -> Result<(), String> {
let mut env = EnvironmentBuilder::minimal() let mut env = EnvironmentBuilder::minimal()
.initialize_logger(LoggerConfig { .initialize_logger(LoggerConfig {
path: None, path: None,
debug_level: String::from("info"), debug_level: String::from("debug"),
logfile_debug_level: String::from("debug"), logfile_debug_level: String::from("debug"),
log_format: None, log_format: None,
logfile_format: None, logfile_format: None,

View File

@@ -137,18 +137,23 @@ pub enum CandidateError {
Uninitialized, Uninitialized,
Offline, Offline,
Incompatible, Incompatible,
NotSynced,
TimeDiscrepancy, TimeDiscrepancy,
} }
#[derive(Debug, Clone)]
pub struct CandidateInfo {
pub id: usize,
pub node: String,
pub health: Option<BeaconNodeHealth>,
}
/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used /// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
/// for a query. /// for a query.
#[derive(Debug)] #[derive(Debug)]
pub struct CandidateBeaconNode<E> { pub struct CandidateBeaconNode<E> {
id: usize, id: usize,
beacon_node: BeaconNodeHttpClient, beacon_node: BeaconNodeHttpClient,
health: PLRwLock<Option<BeaconNodeHealth>>, health: PLRwLock<Result<BeaconNodeHealth, CandidateError>>,
status: RwLock<Result<(), CandidateError>>,
_phantom: PhantomData<E>, _phantom: PhantomData<E>,
} }
@@ -162,11 +167,11 @@ impl<E: EthSpec> Eq for CandidateBeaconNode<E> {}
impl<E: EthSpec> Ord for CandidateBeaconNode<E> { impl<E: EthSpec> Ord for CandidateBeaconNode<E> {
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
match (&(*self.health.read()), &(*other.health.read())) { match (&(self.health()), &(other.health())) {
(None, None) => Ordering::Equal, (Err(_), Err(_)) => Ordering::Equal,
(None, _) => Ordering::Greater, (Err(_), _) => Ordering::Greater,
(_, None) => Ordering::Less, (_, Err(_)) => Ordering::Less,
(Some(health_1), Some(health_2)) => health_1.cmp(health_2), (Ok(health_1), Ok(health_2)) => health_1.cmp(health_2),
} }
} }
} }
@@ -183,15 +188,14 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
Self { Self {
id, id,
beacon_node, beacon_node,
health: PLRwLock::new(None), health: PLRwLock::new(Err(CandidateError::Uninitialized)),
status: RwLock::new(Err(CandidateError::Uninitialized)),
_phantom: PhantomData, _phantom: PhantomData,
} }
} }
/// Returns the status of `self`. /// Returns the health of `self`.
pub async fn status(&self) -> Result<(), CandidateError> { pub fn health(&self) -> Result<BeaconNodeHealth, CandidateError> {
*self.status.read().await *self.health.read()
} }
pub async fn refresh_health<T: SlotClock>( pub async fn refresh_health<T: SlotClock>(
@@ -202,7 +206,7 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
log: &Logger, log: &Logger,
) -> Result<(), CandidateError> { ) -> Result<(), CandidateError> {
if let Err(e) = self.is_compatible(spec, log).await { if let Err(e) = self.is_compatible(spec, log).await {
*self.status.write().await = Err(e); *self.health.write() = Err(e);
return Ok(()); return Ok(());
} }
@@ -231,29 +235,20 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
slot_clock, slot_clock,
); );
warn!( // TODO(mac): Set metric here.
log,
"Health of Beacon Node: {}, updated. Health tier: {}",
new_health.get_id(),
new_health.get_health_tier()
);
*self.health.write() = Some(new_health);
*self.status.write().await = Ok(());
*self.health.write() = Ok(new_health);
Ok(()) Ok(())
} }
Err(status) => { Err(e) => {
// Set the health as None which is sorted last in the list. // Set the health as `Err` which is sorted last in the list.
*self.health.write() = None; *self.health.write() = Err(e);
*self.status.write().await = Err(status);
Ok(()) Ok(())
} }
} }
} else { } else {
// Slot clock will only be None at startup. // Slot clock will only be `None` at startup.
// Assume compatible nodes are available. // Assume compatible nodes are available.
*self.status.write().await = Ok(());
Ok(()) Ok(())
} }
} }
@@ -373,7 +368,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
pub async fn num_synced(&self) -> usize { pub async fn num_synced(&self) -> usize {
let mut n = 0; let mut n = 0;
for candidate in self.candidates.read().await.iter() { for candidate in self.candidates.read().await.iter() {
if let Some(cand) = candidate.health.read().as_ref() { if let Ok(cand) = candidate.health().as_ref() {
if self if self
.distance_tiers .distance_tiers
.distance_tier(cand.health_tier.sync_distance) .distance_tier(cand.health_tier.sync_distance)
@@ -390,7 +385,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
pub async fn num_synced_fallback(&self) -> usize { pub async fn num_synced_fallback(&self) -> usize {
let mut n = 0; let mut n = 0;
for candidate in self.candidates.read().await.iter().skip(1) { for candidate in self.candidates.read().await.iter().skip(1) {
if let Some(cand) = candidate.health.read().as_ref() { if let Ok(cand) = candidate.health().as_ref() {
if self if self
.distance_tiers .distance_tiers
.distance_tier(cand.health_tier.sync_distance) .distance_tier(cand.health_tier.sync_distance)
@@ -407,13 +402,26 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
pub async fn num_available(&self) -> usize { pub async fn num_available(&self) -> usize {
let mut n = 0; let mut n = 0;
for candidate in self.candidates.read().await.iter() { for candidate in self.candidates.read().await.iter() {
if candidate.status().await.is_ok() { if candidate.health().is_ok() {
n += 1 n += 1
} }
} }
n n
} }
pub async fn get_all_candidate_info(&self) -> Vec<CandidateInfo> {
let candidates = self.candidates.read().await;
let mut results = Vec::with_capacity(candidates.len());
for candidate in candidates.iter() {
let id = candidate.id;
let node = candidate.beacon_node.to_string();
let health = candidate.health().ok();
let info = CandidateInfo { id, node, health };
results.push(info);
}
results
}
/// Loop through ALL candidates in `self.candidates` and update their sync status. /// Loop through ALL candidates in `self.candidates` and update their sync status.
/// ///
/// It is possible for a node to return an unsynced status while continuing to serve /// It is possible for a node to return an unsynced status while continuing to serve
@@ -421,21 +429,33 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
/// A previous implementation of this function polled only the unavailable BNs. /// A previous implementation of this function polled only the unavailable BNs.
pub async fn update_all_candidates(&self) { pub async fn update_all_candidates(&self) {
let candidates = self.candidates.read().await; let candidates = self.candidates.read().await;
let mut futures = Vec::with_capacity(candidates.len());
let mut nodes = Vec::with_capacity(candidates.len());
let futures = candidates for candidate in candidates.iter() {
.iter() futures.push(candidate.refresh_health(
.map(|candidate| { &self.distance_tiers,
candidate.refresh_health( self.slot_clock.as_ref(),
&self.distance_tiers, &self.spec,
self.slot_clock.as_ref(), &self.log,
&self.spec, ));
&self.log, nodes.push(candidate.beacon_node.to_string());
) }
})
.collect::<Vec<_>>();
// Run all updates concurrently and ignore errors. // Run all updates concurrently.
let _ = future::join_all(futures).await; let future_results = future::join_all(futures).await;
let results = future_results.iter().zip(nodes);
for (result, node) in results {
if let Err(e) = result {
warn!(
self.log,
"A connected beacon node errored during routine health check.";
"error" => ?e,
"endpoint" => node,
);
}
}
drop(candidates); drop(candidates);
@@ -519,11 +539,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
"node" => $candidate.beacon_node.to_string(), "node" => $candidate.beacon_node.to_string(),
"error" => ?e, "error" => ?e,
); );
// If we have an error on this function, make the client as not-ready.
//
// There exists a race condition where the candidate may have been marked
// as ready between the `func` call and now. We deem this an acceptable
// inefficiency.
errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e))); errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]); inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
} }
@@ -573,11 +589,6 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
match func($candidate.beacon_node.clone()).await { match func($candidate.beacon_node.clone()).await {
Ok(val) => results.push(Ok(val)), Ok(val) => results.push(Ok(val)),
Err(e) => { Err(e) => {
// If we have an error on this function, make the client as not-ready.
//
// There exists a race condition where the candidate may have been marked
// as ready between the `func` call and now. We deem this an acceptable
// inefficiency.
results.push(Err(( results.push(Err((
$candidate.beacon_node.to_string(), $candidate.beacon_node.to_string(),
Error::RequestFailed(e), Error::RequestFailed(e),
@@ -714,12 +725,12 @@ mod tests {
health_tier: BeaconNodeHealthTier::new(4, Slot::new(10), small), health_tier: BeaconNodeHealthTier::new(4, Slot::new(10), small),
}; };
*candidate_1.health.write() = Some(health_1); *candidate_1.health.write() = Ok(health_1);
*candidate_2.health.write() = Some(health_2); *candidate_2.health.write() = Ok(health_2);
*candidate_3.health.write() = Some(health_3); *candidate_3.health.write() = Ok(health_3);
*candidate_4.health.write() = Some(health_4); *candidate_4.health.write() = Ok(health_4);
*candidate_5.health.write() = Some(health_5); *candidate_5.health.write() = Ok(health_5);
*candidate_6.health.write() = Some(health_6); *candidate_6.health.write() = Ok(health_6);
let mut candidates = vec![ let mut candidates = vec![
candidate_3, candidate_3,

View File

@@ -158,7 +158,7 @@ impl BeaconNodeHealthTier {
} }
/// Beacon Node Health metrics. /// Beacon Node Health metrics.
#[derive(Debug, PartialEq, Eq)] #[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct BeaconNodeHealth { pub struct BeaconNodeHealth {
// The ID of the Beacon Node. This should correspond with its position in the `--beacon-nodes` // The ID of the Beacon Node. This should correspond with its position in the `--beacon-nodes`
// list. Note that the ID field is used to tie-break nodes with the same health so that nodes // list. Note that the ID field is used to tie-break nodes with the same health so that nodes

View File

@@ -1,7 +1,7 @@
use crate::http_metrics; use crate::http_metrics;
use crate::{DutiesService, ProductionValidatorClient}; use crate::{DutiesService, ProductionValidatorClient};
use lighthouse_metrics::set_gauge; use lighthouse_metrics::set_gauge;
use slog::{error, info, Logger}; use slog::{debug, error, info, Logger};
use slot_clock::SlotClock; use slot_clock::SlotClock;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
use types::EthSpec; use types::EthSpec;
@@ -39,6 +39,7 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
duties_service: &DutiesService<T, E>, duties_service: &DutiesService<T, E>,
log: &Logger, log: &Logger,
) { ) {
let candidate_info = duties_service.beacon_nodes.get_all_candidate_info().await;
let num_available = duties_service.beacon_nodes.num_available().await; let num_available = duties_service.beacon_nodes.num_available().await;
set_gauge( set_gauge(
&http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT, &http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT,
@@ -55,9 +56,14 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
num_total as i64, num_total as i64,
); );
if num_synced > 0 { if num_synced > 0 {
let primary = candidate_info
.get(0)
.map(|candidate| candidate.node.as_str())
.unwrap_or("None");
info!( info!(
log, log,
"Connected to beacon node(s)"; "Connected to beacon node(s)";
"primary" => primary,
"total" => num_total, "total" => num_total,
"available" => num_available, "available" => num_available,
"synced" => num_synced, "synced" => num_synced,
@@ -78,6 +84,31 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0); set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
} }
// TODO(mac) Store all connected node info into metrics.
for info in candidate_info {
if let Some(health) = info.health {
debug!(
log,
"Beacon node info";
"status" => "Connected",
"id" => info.id,
"endpoint" => info.node,
"head_slot" => %health.head,
"is_optimistic" => ?health.optimistic_status,
"execution_engine_status" => ?health.execution_status,
"health_tier" => ?health.health_tier,
);
} else {
debug!(
log,
"Beacon node info";
"status" => "Disconnected",
"id" => info.id,
"endpoint" => info.node,
);
}
}
if let Some(slot) = duties_service.slot_clock.now() { if let Some(slot) = duties_service.slot_clock.now() {
let epoch = slot.epoch(E::slots_per_epoch()); let epoch = slot.epoch(E::slots_per_epoch());