mirror of
https://github.com/sigp/lighthouse.git
synced 2026-05-07 16:55:46 +00:00
Combine status and health and improve logging
This commit is contained in:
@@ -1,6 +1,5 @@
|
|||||||
use crate::local_network::LocalNetwork;
|
use crate::local_network::LocalNetwork;
|
||||||
use node_test_rig::eth2::types::{BlockId, StateId};
|
use node_test_rig::eth2::types::{BlockId, StateId};
|
||||||
|
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned};
|
use types::{Epoch, EthSpec, ExecPayload, ExecutionBlockHash, Hash256, Slot, Unsigned};
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ pub fn run_eth1_sim(matches: &ArgMatches) -> Result<(), String> {
|
|||||||
let mut env = EnvironmentBuilder::minimal()
|
let mut env = EnvironmentBuilder::minimal()
|
||||||
.initialize_logger(LoggerConfig {
|
.initialize_logger(LoggerConfig {
|
||||||
path: None,
|
path: None,
|
||||||
debug_level: String::from("info"),
|
debug_level: String::from("debug"),
|
||||||
logfile_debug_level: String::from("debug"),
|
logfile_debug_level: String::from("debug"),
|
||||||
log_format: None,
|
log_format: None,
|
||||||
logfile_format: None,
|
logfile_format: None,
|
||||||
|
|||||||
@@ -137,18 +137,23 @@ pub enum CandidateError {
|
|||||||
Uninitialized,
|
Uninitialized,
|
||||||
Offline,
|
Offline,
|
||||||
Incompatible,
|
Incompatible,
|
||||||
NotSynced,
|
|
||||||
TimeDiscrepancy,
|
TimeDiscrepancy,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct CandidateInfo {
|
||||||
|
pub id: usize,
|
||||||
|
pub node: String,
|
||||||
|
pub health: Option<BeaconNodeHealth>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
|
/// Represents a `BeaconNodeHttpClient` inside a `BeaconNodeFallback` that may or may not be used
|
||||||
/// for a query.
|
/// for a query.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct CandidateBeaconNode<E> {
|
pub struct CandidateBeaconNode<E> {
|
||||||
id: usize,
|
id: usize,
|
||||||
beacon_node: BeaconNodeHttpClient,
|
beacon_node: BeaconNodeHttpClient,
|
||||||
health: PLRwLock<Option<BeaconNodeHealth>>,
|
health: PLRwLock<Result<BeaconNodeHealth, CandidateError>>,
|
||||||
status: RwLock<Result<(), CandidateError>>,
|
|
||||||
_phantom: PhantomData<E>,
|
_phantom: PhantomData<E>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,11 +167,11 @@ impl<E: EthSpec> Eq for CandidateBeaconNode<E> {}
|
|||||||
|
|
||||||
impl<E: EthSpec> Ord for CandidateBeaconNode<E> {
|
impl<E: EthSpec> Ord for CandidateBeaconNode<E> {
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
match (&(*self.health.read()), &(*other.health.read())) {
|
match (&(self.health()), &(other.health())) {
|
||||||
(None, None) => Ordering::Equal,
|
(Err(_), Err(_)) => Ordering::Equal,
|
||||||
(None, _) => Ordering::Greater,
|
(Err(_), _) => Ordering::Greater,
|
||||||
(_, None) => Ordering::Less,
|
(_, Err(_)) => Ordering::Less,
|
||||||
(Some(health_1), Some(health_2)) => health_1.cmp(health_2),
|
(Ok(health_1), Ok(health_2)) => health_1.cmp(health_2),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -183,15 +188,14 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
|
|||||||
Self {
|
Self {
|
||||||
id,
|
id,
|
||||||
beacon_node,
|
beacon_node,
|
||||||
health: PLRwLock::new(None),
|
health: PLRwLock::new(Err(CandidateError::Uninitialized)),
|
||||||
status: RwLock::new(Err(CandidateError::Uninitialized)),
|
|
||||||
_phantom: PhantomData,
|
_phantom: PhantomData,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the status of `self`.
|
/// Returns the health of `self`.
|
||||||
pub async fn status(&self) -> Result<(), CandidateError> {
|
pub fn health(&self) -> Result<BeaconNodeHealth, CandidateError> {
|
||||||
*self.status.read().await
|
*self.health.read()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn refresh_health<T: SlotClock>(
|
pub async fn refresh_health<T: SlotClock>(
|
||||||
@@ -202,7 +206,7 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
|
|||||||
log: &Logger,
|
log: &Logger,
|
||||||
) -> Result<(), CandidateError> {
|
) -> Result<(), CandidateError> {
|
||||||
if let Err(e) = self.is_compatible(spec, log).await {
|
if let Err(e) = self.is_compatible(spec, log).await {
|
||||||
*self.status.write().await = Err(e);
|
*self.health.write() = Err(e);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -231,29 +235,20 @@ impl<E: EthSpec> CandidateBeaconNode<E> {
|
|||||||
slot_clock,
|
slot_clock,
|
||||||
);
|
);
|
||||||
|
|
||||||
warn!(
|
// TODO(mac): Set metric here.
|
||||||
log,
|
|
||||||
"Health of Beacon Node: {}, updated. Health tier: {}",
|
|
||||||
new_health.get_id(),
|
|
||||||
new_health.get_health_tier()
|
|
||||||
);
|
|
||||||
|
|
||||||
*self.health.write() = Some(new_health);
|
|
||||||
*self.status.write().await = Ok(());
|
|
||||||
|
|
||||||
|
*self.health.write() = Ok(new_health);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
Err(status) => {
|
Err(e) => {
|
||||||
// Set the health as None which is sorted last in the list.
|
// Set the health as `Err` which is sorted last in the list.
|
||||||
*self.health.write() = None;
|
*self.health.write() = Err(e);
|
||||||
*self.status.write().await = Err(status);
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Slot clock will only be None at startup.
|
// Slot clock will only be `None` at startup.
|
||||||
// Assume compatible nodes are available.
|
// Assume compatible nodes are available.
|
||||||
*self.status.write().await = Ok(());
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -373,7 +368,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
pub async fn num_synced(&self) -> usize {
|
pub async fn num_synced(&self) -> usize {
|
||||||
let mut n = 0;
|
let mut n = 0;
|
||||||
for candidate in self.candidates.read().await.iter() {
|
for candidate in self.candidates.read().await.iter() {
|
||||||
if let Some(cand) = candidate.health.read().as_ref() {
|
if let Ok(cand) = candidate.health().as_ref() {
|
||||||
if self
|
if self
|
||||||
.distance_tiers
|
.distance_tiers
|
||||||
.distance_tier(cand.health_tier.sync_distance)
|
.distance_tier(cand.health_tier.sync_distance)
|
||||||
@@ -390,7 +385,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
pub async fn num_synced_fallback(&self) -> usize {
|
pub async fn num_synced_fallback(&self) -> usize {
|
||||||
let mut n = 0;
|
let mut n = 0;
|
||||||
for candidate in self.candidates.read().await.iter().skip(1) {
|
for candidate in self.candidates.read().await.iter().skip(1) {
|
||||||
if let Some(cand) = candidate.health.read().as_ref() {
|
if let Ok(cand) = candidate.health().as_ref() {
|
||||||
if self
|
if self
|
||||||
.distance_tiers
|
.distance_tiers
|
||||||
.distance_tier(cand.health_tier.sync_distance)
|
.distance_tier(cand.health_tier.sync_distance)
|
||||||
@@ -407,13 +402,26 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
pub async fn num_available(&self) -> usize {
|
pub async fn num_available(&self) -> usize {
|
||||||
let mut n = 0;
|
let mut n = 0;
|
||||||
for candidate in self.candidates.read().await.iter() {
|
for candidate in self.candidates.read().await.iter() {
|
||||||
if candidate.status().await.is_ok() {
|
if candidate.health().is_ok() {
|
||||||
n += 1
|
n += 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n
|
n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn get_all_candidate_info(&self) -> Vec<CandidateInfo> {
|
||||||
|
let candidates = self.candidates.read().await;
|
||||||
|
let mut results = Vec::with_capacity(candidates.len());
|
||||||
|
for candidate in candidates.iter() {
|
||||||
|
let id = candidate.id;
|
||||||
|
let node = candidate.beacon_node.to_string();
|
||||||
|
let health = candidate.health().ok();
|
||||||
|
let info = CandidateInfo { id, node, health };
|
||||||
|
results.push(info);
|
||||||
|
}
|
||||||
|
results
|
||||||
|
}
|
||||||
|
|
||||||
/// Loop through ALL candidates in `self.candidates` and update their sync status.
|
/// Loop through ALL candidates in `self.candidates` and update their sync status.
|
||||||
///
|
///
|
||||||
/// It is possible for a node to return an unsynced status while continuing to serve
|
/// It is possible for a node to return an unsynced status while continuing to serve
|
||||||
@@ -421,21 +429,33 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
/// A previous implementation of this function polled only the unavailable BNs.
|
/// A previous implementation of this function polled only the unavailable BNs.
|
||||||
pub async fn update_all_candidates(&self) {
|
pub async fn update_all_candidates(&self) {
|
||||||
let candidates = self.candidates.read().await;
|
let candidates = self.candidates.read().await;
|
||||||
|
let mut futures = Vec::with_capacity(candidates.len());
|
||||||
|
let mut nodes = Vec::with_capacity(candidates.len());
|
||||||
|
|
||||||
let futures = candidates
|
for candidate in candidates.iter() {
|
||||||
.iter()
|
futures.push(candidate.refresh_health(
|
||||||
.map(|candidate| {
|
&self.distance_tiers,
|
||||||
candidate.refresh_health(
|
self.slot_clock.as_ref(),
|
||||||
&self.distance_tiers,
|
&self.spec,
|
||||||
self.slot_clock.as_ref(),
|
&self.log,
|
||||||
&self.spec,
|
));
|
||||||
&self.log,
|
nodes.push(candidate.beacon_node.to_string());
|
||||||
)
|
}
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
// Run all updates concurrently and ignore errors.
|
// Run all updates concurrently.
|
||||||
let _ = future::join_all(futures).await;
|
let future_results = future::join_all(futures).await;
|
||||||
|
let results = future_results.iter().zip(nodes);
|
||||||
|
|
||||||
|
for (result, node) in results {
|
||||||
|
if let Err(e) = result {
|
||||||
|
warn!(
|
||||||
|
self.log,
|
||||||
|
"A connected beacon node errored during routine health check.";
|
||||||
|
"error" => ?e,
|
||||||
|
"endpoint" => node,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
drop(candidates);
|
drop(candidates);
|
||||||
|
|
||||||
@@ -519,11 +539,7 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
"node" => $candidate.beacon_node.to_string(),
|
"node" => $candidate.beacon_node.to_string(),
|
||||||
"error" => ?e,
|
"error" => ?e,
|
||||||
);
|
);
|
||||||
// If we have an error on this function, make the client as not-ready.
|
|
||||||
//
|
|
||||||
// There exists a race condition where the candidate may have been marked
|
|
||||||
// as ready between the `func` call and now. We deem this an acceptable
|
|
||||||
// inefficiency.
|
|
||||||
errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
|
errors.push(($candidate.beacon_node.to_string(), Error::RequestFailed(e)));
|
||||||
inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
|
inc_counter_vec(&ENDPOINT_ERRORS, &[$candidate.beacon_node.as_ref()]);
|
||||||
}
|
}
|
||||||
@@ -573,11 +589,6 @@ impl<T: SlotClock, E: EthSpec> BeaconNodeFallback<T, E> {
|
|||||||
match func($candidate.beacon_node.clone()).await {
|
match func($candidate.beacon_node.clone()).await {
|
||||||
Ok(val) => results.push(Ok(val)),
|
Ok(val) => results.push(Ok(val)),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
// If we have an error on this function, make the client as not-ready.
|
|
||||||
//
|
|
||||||
// There exists a race condition where the candidate may have been marked
|
|
||||||
// as ready between the `func` call and now. We deem this an acceptable
|
|
||||||
// inefficiency.
|
|
||||||
results.push(Err((
|
results.push(Err((
|
||||||
$candidate.beacon_node.to_string(),
|
$candidate.beacon_node.to_string(),
|
||||||
Error::RequestFailed(e),
|
Error::RequestFailed(e),
|
||||||
@@ -714,12 +725,12 @@ mod tests {
|
|||||||
health_tier: BeaconNodeHealthTier::new(4, Slot::new(10), small),
|
health_tier: BeaconNodeHealthTier::new(4, Slot::new(10), small),
|
||||||
};
|
};
|
||||||
|
|
||||||
*candidate_1.health.write() = Some(health_1);
|
*candidate_1.health.write() = Ok(health_1);
|
||||||
*candidate_2.health.write() = Some(health_2);
|
*candidate_2.health.write() = Ok(health_2);
|
||||||
*candidate_3.health.write() = Some(health_3);
|
*candidate_3.health.write() = Ok(health_3);
|
||||||
*candidate_4.health.write() = Some(health_4);
|
*candidate_4.health.write() = Ok(health_4);
|
||||||
*candidate_5.health.write() = Some(health_5);
|
*candidate_5.health.write() = Ok(health_5);
|
||||||
*candidate_6.health.write() = Some(health_6);
|
*candidate_6.health.write() = Ok(health_6);
|
||||||
|
|
||||||
let mut candidates = vec![
|
let mut candidates = vec![
|
||||||
candidate_3,
|
candidate_3,
|
||||||
|
|||||||
@@ -158,7 +158,7 @@ impl BeaconNodeHealthTier {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Beacon Node Health metrics.
|
/// Beacon Node Health metrics.
|
||||||
#[derive(Debug, PartialEq, Eq)]
|
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||||
pub struct BeaconNodeHealth {
|
pub struct BeaconNodeHealth {
|
||||||
// The ID of the Beacon Node. This should correspond with its position in the `--beacon-nodes`
|
// The ID of the Beacon Node. This should correspond with its position in the `--beacon-nodes`
|
||||||
// list. Note that the ID field is used to tie-break nodes with the same health so that nodes
|
// list. Note that the ID field is used to tie-break nodes with the same health so that nodes
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use crate::http_metrics;
|
use crate::http_metrics;
|
||||||
use crate::{DutiesService, ProductionValidatorClient};
|
use crate::{DutiesService, ProductionValidatorClient};
|
||||||
use lighthouse_metrics::set_gauge;
|
use lighthouse_metrics::set_gauge;
|
||||||
use slog::{error, info, Logger};
|
use slog::{debug, error, info, Logger};
|
||||||
use slot_clock::SlotClock;
|
use slot_clock::SlotClock;
|
||||||
use tokio::time::{sleep, Duration};
|
use tokio::time::{sleep, Duration};
|
||||||
use types::EthSpec;
|
use types::EthSpec;
|
||||||
@@ -39,6 +39,7 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
|
|||||||
duties_service: &DutiesService<T, E>,
|
duties_service: &DutiesService<T, E>,
|
||||||
log: &Logger,
|
log: &Logger,
|
||||||
) {
|
) {
|
||||||
|
let candidate_info = duties_service.beacon_nodes.get_all_candidate_info().await;
|
||||||
let num_available = duties_service.beacon_nodes.num_available().await;
|
let num_available = duties_service.beacon_nodes.num_available().await;
|
||||||
set_gauge(
|
set_gauge(
|
||||||
&http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT,
|
&http_metrics::metrics::AVAILABLE_BEACON_NODES_COUNT,
|
||||||
@@ -55,9 +56,14 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
|
|||||||
num_total as i64,
|
num_total as i64,
|
||||||
);
|
);
|
||||||
if num_synced > 0 {
|
if num_synced > 0 {
|
||||||
|
let primary = candidate_info
|
||||||
|
.get(0)
|
||||||
|
.map(|candidate| candidate.node.as_str())
|
||||||
|
.unwrap_or("None");
|
||||||
info!(
|
info!(
|
||||||
log,
|
log,
|
||||||
"Connected to beacon node(s)";
|
"Connected to beacon node(s)";
|
||||||
|
"primary" => primary,
|
||||||
"total" => num_total,
|
"total" => num_total,
|
||||||
"available" => num_available,
|
"available" => num_available,
|
||||||
"synced" => num_synced,
|
"synced" => num_synced,
|
||||||
@@ -78,6 +84,31 @@ async fn notify<T: SlotClock + 'static, E: EthSpec>(
|
|||||||
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
|
set_gauge(&http_metrics::metrics::ETH2_FALLBACK_CONNECTED, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO(mac) Store all connected node info into metrics.
|
||||||
|
for info in candidate_info {
|
||||||
|
if let Some(health) = info.health {
|
||||||
|
debug!(
|
||||||
|
log,
|
||||||
|
"Beacon node info";
|
||||||
|
"status" => "Connected",
|
||||||
|
"id" => info.id,
|
||||||
|
"endpoint" => info.node,
|
||||||
|
"head_slot" => %health.head,
|
||||||
|
"is_optimistic" => ?health.optimistic_status,
|
||||||
|
"execution_engine_status" => ?health.execution_status,
|
||||||
|
"health_tier" => ?health.health_tier,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
debug!(
|
||||||
|
log,
|
||||||
|
"Beacon node info";
|
||||||
|
"status" => "Disconnected",
|
||||||
|
"id" => info.id,
|
||||||
|
"endpoint" => info.node,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(slot) = duties_service.slot_clock.now() {
|
if let Some(slot) = duties_service.slot_clock.now() {
|
||||||
let epoch = slot.epoch(E::slots_per_epoch());
|
let epoch = slot.epoch(E::slots_per_epoch());
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user