Rate limiting backfill sync (#3936)

## Issue Addressed #3212 ## Proposed Changes - Introduce a new `rate_limiting_backfill_queue` - any new inbound backfill work event gets immediately sent to this FIFO queue **without any processing** - Spawn a `backfill_scheduler` routine that pops a backfill event from the FIFO queue at specified intervals (currently halfway through a slot, or at 6s after slot start for 12s slots) and sends the event to `BeaconProcessor` via a `scheduled_backfill_work_tx` channel - This channel gets polled last in the `InboundEvents`, and any work event received is wrapped in an `InboundEvent::ScheduledBackfillWork` enum variant, which gets processed immediately or queued by the `BeaconProcessor` (existing logic applies from here) Diagram comparing backfill processing with / without rate-limiting: https://github.com/sigp/lighthouse/issues/3212#issuecomment-1386249922 See this comment for @paulhauner's explanation and solution: https://github.com/sigp/lighthouse/issues/3212#issuecomment-1384674956 ## Additional Info I've compared this branch (with backfill processing rate limited to 1 and 3 batches per slot) against the latest stable version. The CPU usage during backfill sync is reduced by ~5% - 20%, more details on this page: https://hackmd.io/@jimmygchen/SJuVpJL3j The above testing was done on Goerli (as I don't currently have hardware for Mainnet), I'm guessing the differences are likely to be bigger on mainnet due to block size. ### TODOs - [x] Experiment with processing multiple batches per slot. (need to think about how to do this for different slot durations) - [x] Add option to disable rate-limiting, enabled by default. - [x] (No longer required now we're reusing the reprocessing queue) Complete the `backfill_scheduler` task when backfill sync is completed or not required
2026-03-15 19:02:42 +00:00 · 2023-04-03 03:02:55 +00:00
parent c5383e393a
commit 2de3451011
10 changed files with 380 additions and 24 deletions
--- a/beacon_node/network/src/beacon_processor/tests.rs
+++ b/beacon_node/network/src/beacon_processor/tests.rs
@@ -9,7 +9,7 @@ use crate::{service::NetworkMessage, sync::SyncMessage};
 use beacon_chain::test_utils::{
    AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType,
 };
-use beacon_chain::{BeaconChain, MAXIMUM_GOSSIP_CLOCK_DISPARITY};
+use beacon_chain::{BeaconChain, ChainConfig, MAXIMUM_GOSSIP_CLOCK_DISPARITY};
 use lighthouse_network::{
    discv5::enr::{CombinedKey, EnrBuilder},
    rpc::methods::{MetaData, MetaDataV2},
@@ -23,8 +23,8 @@ use std::sync::Arc;
 use std::time::Duration;
 use tokio::sync::mpsc;
 use types::{
-    Attestation, AttesterSlashing, EthSpec, MainnetEthSpec, ProposerSlashing, SignedBeaconBlock,
-    SignedVoluntaryExit, SubnetId,
+    Attestation, AttesterSlashing, Epoch, EthSpec, MainnetEthSpec, ProposerSlashing,
+    SignedBeaconBlock, SignedVoluntaryExit, SubnetId,
 };

 type E = MainnetEthSpec;
@@ -70,6 +70,10 @@ impl Drop for TestRig {

 impl TestRig {
    pub async fn new(chain_length: u64) -> Self {
+        Self::new_with_chain_config(chain_length, ChainConfig::default()).await // default config
+    }
+
+    pub async fn new_with_chain_config(chain_length: u64, chain_config: ChainConfig) -> Self {
        // This allows for testing voluntary exits without building out a massive chain.
        let mut spec = E::default_spec();
        spec.shard_committee_period = 2;
@@ -78,6 +82,7 @@ impl TestRig {
            .spec(spec)
            .deterministic_keypairs(VALIDATOR_COUNT)
            .fresh_ephemeral_store()
+            .chain_config(chain_config)
            .build();

        harness.advance_slot();
@@ -261,6 +266,14 @@ impl TestRig {
        self.beacon_processor_tx.try_send(event).unwrap();
    }

+    pub fn enqueue_backfill_batch(&self) {
+        let event = WorkEvent::chain_segment(
+            ChainSegmentProcessId::BackSyncBatchId(Epoch::default()), // back-sync (backfill) id
+            Vec::default(), // empty payload
+        );
+        self.beacon_processor_tx.try_send(event).unwrap(); // panics if the send fails
+    }
+
    pub fn enqueue_unaggregated_attestation(&self) {
        let (attestation, subnet_id) = self.attestations.first().unwrap().clone();
        self.beacon_processor_tx
@@ -873,3 +886,49 @@ async fn test_rpc_block_reprocessing() {
    // cache handle was dropped.
    assert_eq!(next_block_root, rig.head_root());
 }
+
+/// Ensures a queued backfill batch is rate-limited: held back first, then processed within a slot.
+#[tokio::test]
+async fn test_backfill_sync_processing() {
+    let mut rig = TestRig::new(SMALL_CHAIN).await;
+    // Note: verifying the exact event times in an integration test is not straightforward here
+    // (it is hard to manipulate `TestingSlotClock` because `SlotClock` is cloned in the code)
+    // and would make the test very slow, hence the timing calculation is unit tested separately
+    // in `work_reprocessing_queue`.
+    for _ in 0..1 { // NOTE(review): runs exactly once; the loop adds nothing as written
+        rig.enqueue_backfill_batch();
+        // Ensure no events show up for 100ms, i.e. the queued batch is not processed right away.
+        rig.assert_no_events_for(Duration::from_millis(100)).await;
+        // The batch should then be processed within one slot duration (the timeout used below).
+        rig.assert_event_journal_with_timeout(
+            &[CHAIN_SEGMENT_BACKFILL, WORKER_FREED, NOTHING_TO_DO],
+            rig.chain.slot_clock.slot_duration(),
+        )
+        .await;
+    }
+}
+
+/// Ensures backfill batches are processed without delay when backfill rate-limiting is disabled.
+#[tokio::test]
+async fn test_backfill_sync_processing_rate_limiting_disabled() {
+    let chain_config = ChainConfig {
+        enable_backfill_rate_limiting: false, // only this field is overridden
+        ..Default::default()
+    };
+    let mut rig = TestRig::new_with_chain_config(SMALL_CHAIN, chain_config).await;
+
+    for _ in 0..3 {
+        rig.enqueue_backfill_batch();
+    }
+
+    // Ensure all three batches are processed within 100ms (one journal entry per batch).
+    rig.assert_event_journal_with_timeout(
+        &[
+            CHAIN_SEGMENT_BACKFILL,
+            CHAIN_SEGMENT_BACKFILL,
+            CHAIN_SEGMENT_BACKFILL,
+        ],
+        Duration::from_millis(100),
+    )
+    .await;
+}