Rate limiting backfill sync (#3936)

## Issue Addressed

#3212 

## Proposed Changes

- Introduce a new `rate_limiting_backfill_queue` - any new inbound backfill work events get sent immediately to this FIFO queue **without any processing**
- Spawn a `backfill_scheduler` routine that pops a backfill event from the FIFO queue at specified intervals (currently halfway through a slot, i.e. 6s after slot start for 12s slots; see the sketch after this list) and sends the event to the `BeaconProcessor` via a `scheduled_backfill_work_tx` channel
- This channel gets polled last in `InboundEvents`, and any work event received is wrapped in an `InboundEvent::ScheduledBackfillWork` enum variant, which gets processed immediately or queued by the `BeaconProcessor` (existing logic applies from here)
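
To make the release timing concrete, here's a minimal sketch of the idea, assuming simplified names (`BackfillScheduler`, `next_batch`) and plain tokio timers rather than the actual slot-clock plumbing:

```rust
use std::collections::VecDeque;
use std::time::{Duration, Instant};
use tokio::time::sleep_until;

/// Sketch only: batches are queued without processing and released one at a
/// time, halfway through each slot.
struct BackfillScheduler<T> {
    queue: VecDeque<T>,       // FIFO: new inbound batches are pushed to the back
    slot_duration: Duration,  // e.g. 12s on mainnet
    next_slot_start: Instant, // in practice this comes from the slot clock
}

impl<T> BackfillScheduler<T> {
    /// Wait until the next release point (slot start + half a slot), then pop
    /// the oldest queued batch, if any.
    async fn next_batch(&mut self) -> Option<T> {
        let release_at = self.next_slot_start + self.slot_duration / 2;
        sleep_until(release_at.into()).await;
        self.next_slot_start += self.slot_duration;
        self.queue.pop_front()
    }
}
```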

Diagram comparing backfill processing with / without rate-limiting: 
https://github.com/sigp/lighthouse/issues/3212#issuecomment-1386249922

See this comment for @paulhauner's explanation and solution: https://github.com/sigp/lighthouse/issues/3212#issuecomment-1384674956

## Additional Info

I've compared this branch (with backfill processing rate-limited to 1 and 3 batches per slot) against the latest stable version. CPU usage during backfill sync is reduced by ~5-20%; more details on this page:

https://hackmd.io/@jimmygchen/SJuVpJL3j

The above testing was done on Goerli (as I don't currently have hardware for mainnet); I'd guess the differences are likely to be bigger on mainnet due to larger block sizes.

### TODOs

- [x] Experiment with processing multiple batches per slot (need to think about how to do this for different slot durations; see the sketch after this list).
- [x] Add an option to disable rate-limiting, enabled by default.
- [x] (No longer required now that we're reusing the reprocessing queue) Complete the `backfill_scheduler` task when backfill sync is completed or not required.
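
For the multiple-batches experiment, one way to generalise the schedule across slot durations (a sketch under assumed names, not necessarily how the merged code does it) is to derive evenly spaced release offsets from the slot duration:

```rust
use std::time::Duration;

/// Sketch only: spread `batches_per_slot` release points evenly across a slot
/// of any duration. For a 12s slot and 3 batches this yields 2s, 6s and 10s
/// after the slot start; for 1 batch it yields 6s, i.e. halfway through.
fn release_offsets(slot_duration: Duration, batches_per_slot: u32) -> Vec<Duration> {
    let interval = slot_duration / batches_per_slot;
    (0..batches_per_slot)
        .map(|i| interval * i + interval / 2)
        .collect()
}
```
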
Commit 2de3451011 (parent c5383e393a) by Jimmy Chen, 2023-04-03 03:02:55 +00:00
10 changed files with 380 additions and 24 deletions


@@ -61,6 +61,7 @@ use std::time::Duration;
 use std::{cmp, collections::HashSet};
 use task_executor::TaskExecutor;
 use tokio::sync::mpsc;
+use tokio::sync::mpsc::error::TrySendError;
 use types::{
     Attestation, AttesterSlashing, Hash256, LightClientFinalityUpdate, LightClientOptimisticUpdate,
     ProposerSlashing, SignedAggregateAndProof, SignedBeaconBlock, SignedBlsToExecutionChange,
@@ -77,7 +78,9 @@ mod tests;
 mod work_reprocessing_queue;
 mod worker;
 
-use crate::beacon_processor::work_reprocessing_queue::QueuedGossipBlock;
+use crate::beacon_processor::work_reprocessing_queue::{
+    QueuedBackfillBatch, QueuedGossipBlock, ReprocessQueueMessage,
+};
 pub use worker::{ChainSegmentProcessId, GossipAggregatePackage, GossipAttestationPackage};
 
 /// The maximum size of the channel for work events to the `BeaconProcessor`.
@@ -218,6 +221,7 @@ pub const GOSSIP_LIGHT_CLIENT_FINALITY_UPDATE: &str = "light_client_finality_upd
pub const GOSSIP_LIGHT_CLIENT_OPTIMISTIC_UPDATE: &str = "light_client_optimistic_update";
pub const RPC_BLOCK: &str = "rpc_block";
pub const CHAIN_SEGMENT: &str = "chain_segment";
pub const CHAIN_SEGMENT_BACKFILL: &str = "chain_segment_backfill";
pub const STATUS_PROCESSING: &str = "status_processing";
pub const BLOCKS_BY_RANGE_REQUEST: &str = "blocks_by_range_request";
pub const BLOCKS_BY_ROOTS_REQUEST: &str = "blocks_by_roots_request";
@@ -738,6 +742,9 @@ impl<T: BeaconChainTypes> std::convert::From<ReadyWork<T>> for WorkEvent<T> {
                     seen_timestamp,
                 },
             },
+            ReadyWork::BackfillSync(QueuedBackfillBatch { process_id, blocks }) => {
+                WorkEvent::chain_segment(process_id, blocks)
+            }
         }
     }
 }
@@ -893,6 +900,10 @@ impl<T: BeaconChainTypes> Work<T> {
             Work::GossipLightClientFinalityUpdate { .. } => GOSSIP_LIGHT_CLIENT_FINALITY_UPDATE,
             Work::GossipLightClientOptimisticUpdate { .. } => GOSSIP_LIGHT_CLIENT_OPTIMISTIC_UPDATE,
             Work::RpcBlock { .. } => RPC_BLOCK,
+            Work::ChainSegment {
+                process_id: ChainSegmentProcessId::BackSyncBatchId { .. },
+                ..
+            } => CHAIN_SEGMENT_BACKFILL,
             Work::ChainSegment { .. } => CHAIN_SEGMENT,
             Work::Status { .. } => STATUS_PROCESSING,
             Work::BlocksByRangeRequest { .. } => BLOCKS_BY_RANGE_REQUEST,
@@ -1054,23 +1065,23 @@ impl<T: BeaconChainTypes> BeaconProcessor<T> {
             FifoQueue::new(MAX_BLS_TO_EXECUTION_CHANGE_QUEUE_LEN);
 
         let mut lcbootstrap_queue = FifoQueue::new(MAX_LIGHT_CLIENT_BOOTSTRAP_QUEUE_LEN);
 
+        let chain = match self.beacon_chain.upgrade() {
+            Some(chain) => chain,
+            // No need to proceed any further if the beacon chain has been dropped, the client
+            // is shutting down.
+            None => return,
+        };
+
         // Channels for sending work to the re-process scheduler (`work_reprocessing_tx`) and to
         // receive them back once they are ready (`ready_work_rx`).
         let (ready_work_tx, ready_work_rx) = mpsc::channel(MAX_SCHEDULED_WORK_QUEUE_LEN);
-        let work_reprocessing_tx = {
-            if let Some(chain) = self.beacon_chain.upgrade() {
-                spawn_reprocess_scheduler(
-                    ready_work_tx,
-                    &self.executor,
-                    chain.slot_clock.clone(),
-                    self.log.clone(),
-                )
-            } else {
-                // No need to proceed any further if the beacon chain has been dropped, the client
-                // is shutting down.
-                return;
-            }
-        };
+        let work_reprocessing_tx = spawn_reprocess_scheduler(
+            ready_work_tx,
+            &self.executor,
+            chain.slot_clock.clone(),
+            self.log.clone(),
+        );
 
         let executor = self.executor.clone();
@@ -1083,12 +1094,55 @@ impl<T: BeaconChainTypes> BeaconProcessor<T> {
             reprocess_work_rx: ready_work_rx,
         };
 
+        let enable_backfill_rate_limiting = chain.config.enable_backfill_rate_limiting;
+
         loop {
             let work_event = match inbound_events.next().await {
                 Some(InboundEvent::WorkerIdle) => {
                     self.current_workers = self.current_workers.saturating_sub(1);
                     None
                 }
+                Some(InboundEvent::WorkEvent(event)) if enable_backfill_rate_limiting => {
+                    match QueuedBackfillBatch::try_from(event) {
+                        Ok(backfill_batch) => {
+                            match work_reprocessing_tx
+                                .try_send(ReprocessQueueMessage::BackfillSync(backfill_batch))
+                            {
+                                Err(e) => {
+                                    warn!(
+                                        self.log,
+                                        "Unable to queue backfill work event. Will try to process now.";
+                                        "error" => %e
+                                    );
+                                    match e {
+                                        TrySendError::Full(reprocess_queue_message)
+                                        | TrySendError::Closed(reprocess_queue_message) => {
+                                            match reprocess_queue_message {
+                                                ReprocessQueueMessage::BackfillSync(
+                                                    backfill_batch,
+                                                ) => Some(backfill_batch.into()),
+                                                other => {
+                                                    crit!(
+                                                        self.log,
+                                                        "Unexpected queue message type";
+                                                        "message_type" => other.as_ref()
+                                                    );
+                                                    // This is an unhandled exception, drop the message.
+                                                    continue;
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                                Ok(..) => {
+                                    // backfill work sent to "reprocessing" queue. Process the next event.
+                                    continue;
+                                }
+                            }
+                        }
+                        Err(event) => Some(event),
+                    }
+                }
                 Some(InboundEvent::WorkEvent(event))
                 | Some(InboundEvent::ReprocessingWork(event)) => Some(event),
                 None => {