mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-03 00:31:50 +00:00
Use scoped rayon pool for backfill chain segment processing (#7924)
Part of #7866 - Continuation of #7921. In the above PR, we enabled rayon for batch KZG verification in chain segment processing. However, using the global rayon thread pool for backfill is likely to create resource contention with higher-priority beacon processor work. This PR introduces a dedicated low-priority rayon thread pool `LOW_PRIORITY_RAYON_POOL` and uses it for processing backfill chain segments. This prevents backfill KZG verification from using the global rayon thread pool and competing with high-priority beacon processor tasks for CPU resources. However, this PR by itself doesn't prevent CPU oversubscription because other tasks could still fill up the global rayon thread pool, and having an extra thread pool could make things worse. To address this we need the beacon processor to coordinate total CPU allocation across all tasks, which is covered in #7789. Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com> Co-Authored-By: Eitan Seri-Levi <eserilev@gmail.com> Co-Authored-By: Eitan Seri-Levi <eserilev@ucsc.edu>
This commit is contained in:
@@ -6,9 +6,7 @@ use beacon_chain::data_column_verification::{GossipDataColumnError, observe_goss
|
||||
use beacon_chain::fetch_blobs::{
|
||||
EngineGetBlobsOutput, FetchEngineBlobError, fetch_and_process_engine_blobs,
|
||||
};
|
||||
use beacon_chain::{
|
||||
AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, NotifyExecutionLayer,
|
||||
};
|
||||
use beacon_chain::{AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError};
|
||||
use beacon_processor::{
|
||||
BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work,
|
||||
WorkEvent as BeaconWorkEvent,
|
||||
@@ -500,33 +498,23 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
|
||||
process_id: ChainSegmentProcessId,
|
||||
blocks: Vec<RpcBlock<T::EthSpec>>,
|
||||
) -> Result<(), Error<T::EthSpec>> {
|
||||
let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. });
|
||||
debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process");
|
||||
|
||||
let processor = self.clone();
|
||||
let process_fn = async move {
|
||||
let notify_execution_layer = if processor
|
||||
.network_globals
|
||||
.sync_state
|
||||
.read()
|
||||
.is_syncing_finalized()
|
||||
{
|
||||
NotifyExecutionLayer::No
|
||||
} else {
|
||||
NotifyExecutionLayer::Yes
|
||||
};
|
||||
processor
|
||||
.process_chain_segment(process_id, blocks, notify_execution_layer)
|
||||
.await;
|
||||
};
|
||||
let process_fn = Box::pin(process_fn);
|
||||
|
||||
// Back-sync batches are dispatched with a different `Work` variant so
|
||||
// they can be rate-limited.
|
||||
let work = if is_backfill {
|
||||
Work::ChainSegmentBackfill(process_fn)
|
||||
} else {
|
||||
Work::ChainSegment(process_fn)
|
||||
let work = match process_id {
|
||||
ChainSegmentProcessId::RangeBatchId(_, _) => {
|
||||
let process_fn = async move {
|
||||
processor.process_chain_segment(process_id, blocks).await;
|
||||
};
|
||||
Work::ChainSegment(Box::pin(process_fn))
|
||||
}
|
||||
ChainSegmentProcessId::BackSyncBatchId(_) => {
|
||||
let process_fn =
|
||||
move || processor.process_chain_segment_backfill(process_id, blocks);
|
||||
Work::ChainSegmentBackfill(Box::new(process_fn))
|
||||
}
|
||||
};
|
||||
|
||||
self.try_send(BeaconWorkEvent {
|
||||
|
||||
@@ -19,9 +19,10 @@ use beacon_processor::{
|
||||
use beacon_processor::{Work, WorkEvent};
|
||||
use lighthouse_network::PeerAction;
|
||||
use lighthouse_tracing::{
|
||||
SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK,
|
||||
SPAN_PROCESS_RPC_CUSTODY_COLUMNS,
|
||||
SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS,
|
||||
SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS,
|
||||
};
|
||||
use logging::crit;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use store::KzgCommitment;
|
||||
@@ -434,27 +435,42 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
|
||||
parent = None,
|
||||
level = "debug",
|
||||
skip_all,
|
||||
fields(sync_type = ?sync_type, downloaded_blocks = downloaded_blocks.len())
|
||||
fields(process_id = ?process_id, downloaded_blocks = downloaded_blocks.len())
|
||||
)]
|
||||
pub async fn process_chain_segment(
|
||||
&self,
|
||||
sync_type: ChainSegmentProcessId,
|
||||
process_id: ChainSegmentProcessId,
|
||||
downloaded_blocks: Vec<RpcBlock<T::EthSpec>>,
|
||||
notify_execution_layer: NotifyExecutionLayer,
|
||||
) {
|
||||
let result = match sync_type {
|
||||
// this a request from the range sync
|
||||
ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => {
|
||||
let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
|
||||
let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
|
||||
let sent_blocks = downloaded_blocks.len();
|
||||
let ChainSegmentProcessId::RangeBatchId(chain_id, epoch) = process_id else {
|
||||
// This function only handles range sync requests; any other variant should _never_ happen
|
||||
crit!(
|
||||
error = "process_chain_segment called on a variant other than RangeBatchId",
|
||||
"Please notify the devs"
|
||||
);
|
||||
return;
|
||||
};
|
||||
|
||||
match self
|
||||
.process_blocks(downloaded_blocks.iter(), notify_execution_layer)
|
||||
.await
|
||||
{
|
||||
(imported_blocks, Ok(_)) => {
|
||||
debug!(
|
||||
let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
|
||||
let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
|
||||
let sent_blocks = downloaded_blocks.len();
|
||||
let notify_execution_layer = if self
|
||||
.network_globals
|
||||
.sync_state
|
||||
.read()
|
||||
.is_syncing_finalized()
|
||||
{
|
||||
NotifyExecutionLayer::No
|
||||
} else {
|
||||
NotifyExecutionLayer::Yes
|
||||
};
|
||||
|
||||
let result = match self
|
||||
.process_blocks(downloaded_blocks.iter(), notify_execution_layer)
|
||||
.await
|
||||
{
|
||||
(imported_blocks, Ok(_)) => {
|
||||
debug!(
|
||||
batch_epoch = %epoch,
|
||||
first_block_slot = start_slot,
|
||||
chain = chain_id,
|
||||
@@ -462,13 +478,13 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
|
||||
processed_blocks = sent_blocks,
|
||||
service= "sync",
|
||||
"Batch processed");
|
||||
BatchProcessResult::Success {
|
||||
sent_blocks,
|
||||
imported_blocks,
|
||||
}
|
||||
}
|
||||
(imported_blocks, Err(e)) => {
|
||||
debug!(
|
||||
BatchProcessResult::Success {
|
||||
sent_blocks,
|
||||
imported_blocks,
|
||||
}
|
||||
}
|
||||
(imported_blocks, Err(e)) => {
|
||||
debug!(
|
||||
batch_epoch = %epoch,
|
||||
first_block_slot = start_slot,
|
||||
chain = chain_id,
|
||||
@@ -477,33 +493,61 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
|
||||
error = %e.message,
|
||||
service = "sync",
|
||||
"Batch processing failed");
|
||||
match e.peer_action {
|
||||
Some(penalty) => BatchProcessResult::FaultyFailure {
|
||||
imported_blocks,
|
||||
penalty,
|
||||
},
|
||||
None => BatchProcessResult::NonFaultyFailure,
|
||||
}
|
||||
}
|
||||
match e.peer_action {
|
||||
Some(penalty) => BatchProcessResult::FaultyFailure {
|
||||
imported_blocks,
|
||||
penalty,
|
||||
},
|
||||
None => BatchProcessResult::NonFaultyFailure,
|
||||
}
|
||||
}
|
||||
// this a request from the Backfill sync
|
||||
ChainSegmentProcessId::BackSyncBatchId(epoch) => {
|
||||
let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
|
||||
let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
|
||||
let sent_blocks = downloaded_blocks.len();
|
||||
let n_blobs = downloaded_blocks
|
||||
.iter()
|
||||
.map(|wrapped| wrapped.n_blobs())
|
||||
.sum::<usize>();
|
||||
let n_data_columns = downloaded_blocks
|
||||
.iter()
|
||||
.map(|wrapped| wrapped.n_data_columns())
|
||||
.sum::<usize>();
|
||||
};
|
||||
|
||||
match self.process_backfill_blocks(downloaded_blocks) {
|
||||
(imported_blocks, Ok(_)) => {
|
||||
debug!(
|
||||
self.send_sync_message(SyncMessage::BatchProcessed {
|
||||
sync_type: process_id,
|
||||
result,
|
||||
});
|
||||
}
|
||||
|
||||
/// Attempt to import the chain segment (`blocks`) to the beacon chain, informing the sync
|
||||
/// thread if more blocks are needed to process it.
|
||||
#[instrument(
|
||||
name = SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL,
|
||||
parent = None,
|
||||
level = "debug",
|
||||
skip_all,
|
||||
fields(downloaded_blocks = downloaded_blocks.len())
|
||||
)]
|
||||
pub fn process_chain_segment_backfill(
|
||||
&self,
|
||||
process_id: ChainSegmentProcessId,
|
||||
downloaded_blocks: Vec<RpcBlock<T::EthSpec>>,
|
||||
) {
|
||||
let ChainSegmentProcessId::BackSyncBatchId(epoch) = process_id else {
|
||||
// this is a request from RangeSync, which should _never_ happen
|
||||
crit!(
|
||||
error =
|
||||
"process_chain_segment_backfill called on a variant other than BackSyncBatchId",
|
||||
"Please notify the devs"
|
||||
);
|
||||
return;
|
||||
};
|
||||
|
||||
let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64());
|
||||
let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64());
|
||||
let sent_blocks = downloaded_blocks.len();
|
||||
let n_blobs = downloaded_blocks
|
||||
.iter()
|
||||
.map(|wrapped| wrapped.n_blobs())
|
||||
.sum::<usize>();
|
||||
let n_data_columns = downloaded_blocks
|
||||
.iter()
|
||||
.map(|wrapped| wrapped.n_data_columns())
|
||||
.sum::<usize>();
|
||||
|
||||
let result = match self.process_backfill_blocks(downloaded_blocks) {
|
||||
(imported_blocks, Ok(_)) => {
|
||||
debug!(
|
||||
batch_epoch = %epoch,
|
||||
first_block_slot = start_slot,
|
||||
keep_execution_payload = !self.chain.store.get_config().prune_payloads,
|
||||
@@ -513,34 +557,35 @@ impl<T: BeaconChainTypes> NetworkBeaconProcessor<T> {
|
||||
processed_data_columns = n_data_columns,
|
||||
service= "sync",
|
||||
"Backfill batch processed");
|
||||
BatchProcessResult::Success {
|
||||
sent_blocks,
|
||||
imported_blocks,
|
||||
}
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
debug!(
|
||||
batch_epoch = %epoch,
|
||||
first_block_slot = start_slot,
|
||||
last_block_slot = end_slot,
|
||||
processed_blobs = n_blobs,
|
||||
error = %e.message,
|
||||
service = "sync",
|
||||
"Backfill batch processing failed"
|
||||
);
|
||||
match e.peer_action {
|
||||
Some(penalty) => BatchProcessResult::FaultyFailure {
|
||||
imported_blocks: 0,
|
||||
penalty,
|
||||
},
|
||||
None => BatchProcessResult::NonFaultyFailure,
|
||||
}
|
||||
}
|
||||
BatchProcessResult::Success {
|
||||
sent_blocks,
|
||||
imported_blocks,
|
||||
}
|
||||
}
|
||||
(_, Err(e)) => {
|
||||
debug!(
|
||||
batch_epoch = %epoch,
|
||||
first_block_slot = start_slot,
|
||||
last_block_slot = end_slot,
|
||||
processed_blobs = n_blobs,
|
||||
error = %e.message,
|
||||
service = "sync",
|
||||
"Backfill batch processing failed"
|
||||
);
|
||||
match e.peer_action {
|
||||
Some(penalty) => BatchProcessResult::FaultyFailure {
|
||||
imported_blocks: 0,
|
||||
penalty,
|
||||
},
|
||||
None => BatchProcessResult::NonFaultyFailure,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result });
|
||||
self.send_sync_message(SyncMessage::BatchProcessed {
|
||||
sync_type: process_id,
|
||||
result,
|
||||
});
|
||||
}
|
||||
|
||||
/// Helper function to process blocks batches which only consumes the chain and blocks to process.
|
||||
|
||||
@@ -17,6 +17,7 @@ use beacon_chain::test_utils::{
|
||||
test_spec,
|
||||
};
|
||||
use beacon_chain::{BeaconChain, WhenSlotSkipped};
|
||||
use beacon_processor::rayon_manager::RayonManager;
|
||||
use beacon_processor::{work_reprocessing_queue::*, *};
|
||||
use gossipsub::MessageAcceptance;
|
||||
use itertools::Itertools;
|
||||
@@ -266,6 +267,7 @@ impl TestRig {
|
||||
executor,
|
||||
current_workers: 0,
|
||||
config: beacon_processor_config,
|
||||
rayon_manager: RayonManager::default(),
|
||||
}
|
||||
.spawn_manager(
|
||||
beacon_processor_rx,
|
||||
@@ -458,10 +460,10 @@ impl TestRig {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pub fn enqueue_backfill_batch(&self) {
|
||||
pub fn enqueue_backfill_batch(&self, epoch: Epoch) {
|
||||
self.network_beacon_processor
|
||||
.send_chain_segment(
|
||||
ChainSegmentProcessId::BackSyncBatchId(Epoch::default()),
|
||||
ChainSegmentProcessId::BackSyncBatchId(epoch),
|
||||
Vec::default(),
|
||||
)
|
||||
.unwrap();
|
||||
@@ -606,7 +608,7 @@ impl TestRig {
|
||||
}
|
||||
|
||||
pub async fn assert_event_journal(&mut self, expected: &[&str]) {
|
||||
self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT)
|
||||
self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT, false, false)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -623,6 +625,8 @@ impl TestRig {
|
||||
.chain(std::iter::once(NOTHING_TO_DO))
|
||||
.collect::<Vec<_>>(),
|
||||
timeout,
|
||||
false,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -666,11 +670,21 @@ impl TestRig {
|
||||
&mut self,
|
||||
expected: &[&str],
|
||||
timeout: Duration,
|
||||
ignore_worker_freed: bool,
|
||||
ignore_nothing_to_do: bool,
|
||||
) {
|
||||
let mut events = Vec::with_capacity(expected.len());
|
||||
|
||||
let drain_future = async {
|
||||
while let Some(event) = self.work_journal_rx.recv().await {
|
||||
if event == WORKER_FREED && ignore_worker_freed {
|
||||
continue;
|
||||
}
|
||||
|
||||
if event == NOTHING_TO_DO && ignore_nothing_to_do {
|
||||
continue;
|
||||
}
|
||||
|
||||
events.push(event);
|
||||
|
||||
// Break as soon as we collect the desired number of events.
|
||||
@@ -1384,6 +1398,8 @@ async fn requeue_unknown_block_gossip_attestation_without_import() {
|
||||
NOTHING_TO_DO,
|
||||
],
|
||||
Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY,
|
||||
false,
|
||||
false,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1424,6 +1440,8 @@ async fn requeue_unknown_block_gossip_aggregated_attestation_without_import() {
|
||||
NOTHING_TO_DO,
|
||||
],
|
||||
Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY,
|
||||
false,
|
||||
false,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1558,8 +1576,8 @@ async fn test_backfill_sync_processing() {
|
||||
// (not straightforward to manipulate `TestingSlotClock` due to cloning of `SlotClock` in code)
|
||||
// and makes the test very slow, hence timing calculation is unit tested separately in
|
||||
// `work_reprocessing_queue`.
|
||||
for _ in 0..1 {
|
||||
rig.enqueue_backfill_batch();
|
||||
for i in 0..1 {
|
||||
rig.enqueue_backfill_batch(Epoch::new(i));
|
||||
// ensure queued batch is not processed until later
|
||||
rig.assert_no_events_for(Duration::from_millis(100)).await;
|
||||
// A new batch should be processed within a slot.
|
||||
@@ -1570,6 +1588,8 @@ async fn test_backfill_sync_processing() {
|
||||
NOTHING_TO_DO,
|
||||
],
|
||||
rig.chain.slot_clock.slot_duration(),
|
||||
false,
|
||||
false,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -1590,8 +1610,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() {
|
||||
)
|
||||
.await;
|
||||
|
||||
for _ in 0..3 {
|
||||
rig.enqueue_backfill_batch();
|
||||
for i in 0..3 {
|
||||
rig.enqueue_backfill_batch(Epoch::new(i));
|
||||
}
|
||||
|
||||
// ensure all batches are processed
|
||||
@@ -1602,6 +1622,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() {
|
||||
WorkType::ChainSegmentBackfill.into(),
|
||||
],
|
||||
Duration::from_millis(100),
|
||||
true,
|
||||
true,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user