Optimize validator duties (#2243)

## Issue Addressed

Closes #2052

## Proposed Changes

- Refactor the attester/proposer duties endpoints in the BN
    - Performance improvements
    - Fixes some potential inconsistencies with the dependent root fields.
    - Removes `http_api::beacon_proposer_cache` and just uses the one on the `BeaconChain` instead.
    - Moves the code for the proposer/attester duties endpoints into separate files, for readability.
- Refactor the `DutiesService` in the VC
    - Required to reduce the delay on broadcasting new blocks.
    - Gets rid of the `ValidatorDuty` shim struct that came about when we adopted the standard API.
    - Separates the block and attestation duty tasks so that they don't block each other when one is slow.
- In the VC, use `PublicKeyBytes` to represent validators instead of `PublicKey`. `PublicKey` is a real cryptographic object, whilst `PublicKeyBytes` is just a byte array; the latter is much faster to clone and hash, and this change has had a significant impact on runtimes (a rough sketch of the idea follows this list).
    - Unfortunately this has created lots of dust changes.
- In the BN, store `PublicKeyBytes` in the `beacon_proposer_cache` and allow access to them. The HTTP API always sends `PublicKeyBytes` over the wire, and the conversion from `PublicKey` -> `PublicKeyBytes` is non-trivial, especially when queries involve hundreds or thousands of validators (like Pyrmont).
- Add the `state_processing::state_advance` mod, which deduplicates a lot of the "apply `n` skip slots to the state" code (a sketch of the pattern follows this list).
    - This also fixes a bug where some functions were failing to include a state root, as per [this comment](072695284f/consensus/state_processing/src/state_advance.rs (L69-L74)). I couldn't find any instance of this bug that resulted in anything more severe than keying a shuffling cache by the wrong block root.
- Swap the VC block service to use `mpsc` from `tokio` instead of `futures`. This is consistent with the rest of the codebase (a sketch of the channel pattern also follows this list).
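
As a rough illustration of the `PublicKeyBytes` point above (a hand-rolled sketch with simplified types, not Lighthouse's actual `bls` crate): a fixed-size byte array derives `Clone` and `Hash` trivially, so there is no cryptographic work on the hot path.

```rust
use std::collections::HashSet;

/// Simplified stand-in for the real type: a compressed BLS public key is
/// 48 bytes, so this is just a dumb byte array.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct PublicKeyBytes([u8; 48]);

fn main() {
    let pk = PublicKeyBytes([0u8; 48]);

    // Cloning is a memcpy and hashing is a plain byte hash; no elliptic
    // curve deserialization is required, unlike with a full `PublicKey`.
    let mut seen = HashSet::new();
    seen.insert(pk);
    assert!(seen.contains(&pk.clone()));
}
```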
    
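For the skip-slot deduplication, here is a hypothetical sketch of the general shape, with stub types standing in for the real `types`/`state_processing` items (names and signatures are illustrative, not the actual Lighthouse API). It also shows why a missing state root matters: passing `None` forces the root to be recomputed.

```rust
/// Stub standing in for the real 32-byte root type.
type Hash256 = [u8; 32];

/// Stub state: only the slot matters for this sketch.
struct BeaconState {
    slot: u64,
}

/// Expensive in the real implementation (a full tree-hash of the state).
fn compute_state_root(_state: &BeaconState) -> Hash256 {
    [0u8; 32]
}

/// Advance the state by one slot. If the caller already knows the state
/// root it can be supplied, avoiding the expensive fallback.
fn per_slot_processing(state: &mut BeaconState, state_root: Option<Hash256>) {
    let _root = state_root.unwrap_or_else(|| compute_state_root(state));
    // ...cache `_root`, process the slot/epoch transition...
    state.slot += 1;
}

/// The deduplicated "apply `n` skip slots" helper: only the first iteration
/// can reuse a root supplied by the caller; later roots must be computed.
fn state_advance(state: &mut BeaconState, mut known_root: Option<Hash256>, target_slot: u64) {
    while state.slot < target_slot {
        per_slot_processing(state, known_root.take());
    }
}

fn main() {
    let mut state = BeaconState { slot: 10 };
    state_advance(&mut state, Some([1u8; 32]), 13);
    assert_eq!(state.slot, 13);
}
```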
~~This PR *reduces* the size of the codebase 🎉~~ It *used* to reduce the size of the codebase before I added more comments.
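
For the channel swap, a minimal sketch of the `tokio::sync::mpsc` pattern (hypothetical message type and flow, not the actual block service code):

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // The duties service notifies the block service when a block should be
    // produced; a bounded tokio channel replaces the old `futures` one.
    let (tx, mut rx) = mpsc::channel::<u64>(16);

    tokio::spawn(async move {
        // Imagine this fires when the slot clock ticks over to our slot.
        tx.send(42).await.expect("block service should be listening");
    });

    while let Some(slot) = rx.recv().await {
        println!("producing block for slot {}", slot);
    }
}
```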

## Observations on Pyrmont

- Proposer duties times are down from peaks of 450ms to a consistent <1ms.
- Current-epoch attester duties times are down from peaks of >1s to a consistent 20-30ms.
- Block production times are down from 600ms+ to 100-200ms.

## Additional Info

- ~~Blocked on #2241~~
- ~~Blocked on #2234~~

## TODO

- [x] ~~Refactor this into some smaller PRs?~~ Leaving this as-is for now.
- [x] Address `per_slot_processing` roots.
- [x] Investigate slow next epoch times. Not getting added to cache on block processing?
- [x] Consider [this](072695284f/beacon_node/store/src/hot_cold_store.rs (L811-L812)) in the scenario of replacing the state roots


Co-authored-by: pawan <pawandhananjay@gmail.com>
Co-authored-by: Michael Sproul <michael@sigmaprime.io>

```diff
@@ -1,7 +1,7 @@
-use crate::ProductionValidatorClient;
-use slog::{error, info};
+use crate::{DutiesService, ProductionValidatorClient};
+use slog::{error, info, Logger};
 use slot_clock::SlotClock;
-use tokio::time::{interval_at, Duration, Instant};
+use tokio::time::{sleep, Duration};
 use types::EthSpec;
 
 /// Spawns a notifier service which periodically logs information about the node.
@@ -11,86 +11,19 @@ pub fn spawn_notifier<T: EthSpec>(client: &ProductionValidatorClient<T>) -> Resu
     let duties_service = client.duties_service.clone();
 
     let slot_duration = Duration::from_secs(context.eth2_config.spec.seconds_per_slot);
-    let duration_to_next_slot = duties_service
-        .slot_clock
-        .duration_to_next_slot()
-        .ok_or("slot_notifier unable to determine time to next slot")?;
-
-    // Run the notifier half way through each slot.
-    let start_instant = Instant::now() + duration_to_next_slot + (slot_duration / 2);
-    let mut interval = interval_at(start_instant, slot_duration);
 
     let interval_fut = async move {
         let log = context.log();
 
         loop {
-            interval.tick().await;
-
-            let num_available = duties_service.beacon_nodes.num_available().await;
-            let num_synced = duties_service.beacon_nodes.num_synced().await;
-            let num_total = duties_service.beacon_nodes.num_total().await;
-
-            if num_synced > 0 {
-                info!(
-                    log,
-                    "Connected to beacon node(s)";
-                    "total" => num_total,
-                    "available" => num_available,
-                    "synced" => num_synced,
-                )
+            if let Some(duration_to_next_slot) = duties_service.slot_clock.duration_to_next_slot() {
+                sleep(duration_to_next_slot + slot_duration / 2).await;
+                notify(&duties_service, &log).await;
             } else {
-                error!(
-                    log,
-                    "No synced beacon nodes";
-                    "total" => num_total,
-                    "available" => num_available,
-                    "synced" => num_synced,
-                )
-            }
-
-            if let Some(slot) = duties_service.slot_clock.now() {
-                let epoch = slot.epoch(T::slots_per_epoch());
-
-                let total_validators = duties_service.total_validator_count();
-                let proposing_validators = duties_service.proposer_count(epoch);
-                let attesting_validators = duties_service.attester_count(epoch);
-
-                if total_validators == 0 {
-                    info!(
-                        log,
-                        "No validators present";
-                        "msg" => "see `lighthouse account validator create --help` \
-                                  or the HTTP API documentation"
-                    )
-                } else if total_validators == attesting_validators {
-                    info!(
-                        log,
-                        "All validators active";
-                        "proposers" => proposing_validators,
-                        "active_validators" => attesting_validators,
-                        "total_validators" => total_validators,
-                        "epoch" => format!("{}", epoch),
-                        "slot" => format!("{}", slot),
-                    );
-                } else if attesting_validators > 0 {
-                    info!(
-                        log,
-                        "Some validators active";
-                        "proposers" => proposing_validators,
-                        "active_validators" => attesting_validators,
-                        "total_validators" => total_validators,
-                        "epoch" => format!("{}", epoch),
-                        "slot" => format!("{}", slot),
-                    );
-                } else {
-                    info!(
-                        log,
-                        "Awaiting activation";
-                        "validators" => total_validators,
-                        "epoch" => format!("{}", epoch),
-                        "slot" => format!("{}", slot),
-                    );
-                }
-            } else {
-                error!(log, "Unable to read slot clock");
+                error!(log, "Failed to read slot clock");
+                // If we can't read the slot clock, just wait another slot.
+                sleep(slot_duration).await;
+                continue;
             }
         }
     };
@@ -98,3 +31,77 @@ pub fn spawn_notifier<T: EthSpec>(client: &ProductionValidatorClient<T>) -> Resu
     executor.spawn(interval_fut, "validator_notifier");
     Ok(())
 }
+
+/// Performs a single notification routine.
+async fn notify<T: SlotClock + 'static, E: EthSpec>(
+    duties_service: &DutiesService<T, E>,
+    log: &Logger,
+) {
+    let num_available = duties_service.beacon_nodes.num_available().await;
+    let num_synced = duties_service.beacon_nodes.num_synced().await;
+    let num_total = duties_service.beacon_nodes.num_total().await;
+
+    if num_synced > 0 {
+        info!(
+            log,
+            "Connected to beacon node(s)";
+            "total" => num_total,
+            "available" => num_available,
+            "synced" => num_synced,
+        )
+    } else {
+        error!(
+            log,
+            "No synced beacon nodes";
+            "total" => num_total,
+            "available" => num_available,
+            "synced" => num_synced,
+        )
+    }
+
+    if let Some(slot) = duties_service.slot_clock.now() {
+        let epoch = slot.epoch(E::slots_per_epoch());
+
+        let total_validators = duties_service.total_validator_count();
+        let proposing_validators = duties_service.proposer_count(epoch);
+        let attesting_validators = duties_service.attester_count(epoch);
+
+        if total_validators == 0 {
+            info!(
+                log,
+                "No validators present";
+                "msg" => "see `lighthouse account validator create --help` \
+                          or the HTTP API documentation"
+            )
+        } else if total_validators == attesting_validators {
+            info!(
+                log,
+                "All validators active";
+                "proposers" => proposing_validators,
+                "active_validators" => attesting_validators,
+                "total_validators" => total_validators,
+                "epoch" => format!("{}", epoch),
+                "slot" => format!("{}", slot),
+            );
+        } else if attesting_validators > 0 {
+            info!(
+                log,
+                "Some validators active";
+                "proposers" => proposing_validators,
+                "active_validators" => attesting_validators,
+                "total_validators" => total_validators,
+                "epoch" => format!("{}", epoch),
+                "slot" => format!("{}", slot),
+            );
+        } else {
+            info!(
+                log,
+                "Awaiting activation";
+                "validators" => total_validators,
+                "epoch" => format!("{}", epoch),
+                "slot" => format!("{}", slot),
+            );
+        }
+    } else {
+        error!(log, "Unable to read slot clock");
+    }
+}
```