Run reconstruction inside a scoped rayon pool (#8075)

Co-Authored-By: Jimmy Chen <jchen.tc@gmail.com>

Co-Authored-By: Eitan Seri-Levi <eserilev@gmail.com>

Co-Authored-By: Eitan Seri-Levi <eserilev@ucsc.edu>
This commit is contained in:
Eitan Seri-Levi
2025-09-23 23:37:34 -07:00
committed by GitHub
parent d80c0ff5b5
commit af274029e8
11 changed files with 123 additions and 60 deletions

3
Cargo.lock generated
View File

@@ -980,7 +980,6 @@ dependencies = [
"metrics",
"num_cpus",
"parking_lot 0.12.3",
"rayon",
"serde",
"slot_clock",
"strum",
@@ -9232,6 +9231,8 @@ dependencies = [
"async-channel 1.9.0",
"futures",
"metrics",
"num_cpus",
"rayon",
"tokio",
"tracing",
]

View File

@@ -124,7 +124,7 @@ use store::{
BlobSidecarListFromRoot, DBColumn, DatabaseBlock, Error as DBError, HotColdDB, HotStateSummary,
KeyValueStore, KeyValueStoreOp, StoreItem, StoreOp,
};
use task_executor::{ShutdownReason, TaskExecutor};
use task_executor::{RayonPoolType, ShutdownReason, TaskExecutor};
use tokio_stream::Stream;
use tracing::{Span, debug, debug_span, error, info, info_span, instrument, trace, warn};
use tree_hash::TreeHash;
@@ -3274,16 +3274,12 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
let current_span = Span::current();
let result = self
.task_executor
.spawn_blocking_handle(
move || {
let _guard = current_span.enter();
data_availability_checker.reconstruct_data_columns(&block_root)
},
"reconstruct_data_columns",
)
.ok_or(BeaconChainError::RuntimeShutdown)?
.spawn_blocking_with_rayon_async(RayonPoolType::HighPriority, move || {
let _guard = current_span.enter();
data_availability_checker.reconstruct_data_columns(&block_root)
})
.await
.map_err(BeaconChainError::TokioJoin)??;
.map_err(|_| BeaconChainError::RuntimeShutdown)??;
match result {
DataColumnReconstructionResult::Success((availability, data_columns_to_publish)) => {

View File

@@ -12,7 +12,6 @@ logging = { workspace = true }
metrics = { workspace = true }
num_cpus = { workspace = true }
parking_lot = { workspace = true }
rayon = { workspace = true }
serde = { workspace = true }
slot_clock = { workspace = true }
strum = { workspace = true }

View File

@@ -38,7 +38,6 @@
//! checks the queues to see if there are more parcels of work that can be spawned in a new worker
//! task.
use crate::rayon_manager::RayonManager;
use crate::work_reprocessing_queue::{
QueuedBackfillBatch, QueuedColumnReconstruction, QueuedGossipBlock, ReprocessQueueMessage,
};
@@ -48,7 +47,6 @@ use lighthouse_network::{MessageId, NetworkGlobals, PeerId};
use logging::TimeLatch;
use logging::crit;
use parking_lot::Mutex;
use rayon::ThreadPool;
pub use scheduler::work_reprocessing_queue;
use serde::{Deserialize, Serialize};
use slot_clock::SlotClock;
@@ -61,7 +59,7 @@ use std::sync::Arc;
use std::task::Context;
use std::time::{Duration, Instant};
use strum::IntoStaticStr;
use task_executor::TaskExecutor;
use task_executor::{RayonPoolType, TaskExecutor};
use tokio::sync::mpsc;
use tokio::sync::mpsc::error::TrySendError;
use tracing::{debug, error, trace, warn};
@@ -76,7 +74,6 @@ use work_reprocessing_queue::{
};
mod metrics;
pub mod rayon_manager;
pub mod scheduler;
/// The maximum size of the channel for work events to the `BeaconProcessor`.
@@ -810,7 +807,6 @@ pub struct BeaconProcessor<E: EthSpec> {
pub network_globals: Arc<NetworkGlobals<E>>,
pub executor: TaskExecutor,
pub current_workers: usize,
pub rayon_manager: RayonManager,
pub config: BeaconProcessorConfig,
}
@@ -1609,10 +1605,7 @@ impl<E: EthSpec> BeaconProcessor<E> {
}
Work::ChainSegmentBackfill(process_fn) => {
if self.config.enable_backfill_rate_limiting {
task_spawner.spawn_blocking_with_rayon(
self.rayon_manager.low_priority_threadpool.clone(),
process_fn,
)
task_spawner.spawn_blocking_with_rayon(RayonPoolType::LowPriority, process_fn)
} else {
// use the global rayon thread pool if backfill rate limiting is disabled.
task_spawner.spawn_blocking(process_fn)
@@ -1681,17 +1674,16 @@ impl TaskSpawner {
}
/// Spawns a blocking task on a rayon thread pool, dropping the `SendOnDrop` after task completion.
fn spawn_blocking_with_rayon<F>(self, thread_pool: Arc<ThreadPool>, task: F)
fn spawn_blocking_with_rayon<F>(self, rayon_pool_type: RayonPoolType, task: F)
where
F: FnOnce() + Send + 'static,
{
self.executor.spawn_blocking(
self.executor.spawn_blocking_with_rayon(
move || {
thread_pool.install(|| {
task();
});
task();
drop(self.send_idle_on_drop)
},
rayon_pool_type,
WORKER_TASK_NAME,
)
}

View File

@@ -1,27 +0,0 @@
use rayon::{ThreadPool, ThreadPoolBuilder};
use std::sync::Arc;
/// Divisor applied to `num_cpus::get()` to size the low-priority pool (4 => roughly 25% of CPUs).
const DEFAULT_LOW_PRIORITY_DIVISOR: usize = 4;
/// Floor for the low-priority pool's thread count, so integer division on small
/// machines never yields zero threads.
const MINIMUM_LOW_PRIORITY_THREAD_COUNT: usize = 1;
/// Holds a dedicated rayon thread pool for lower-priority, compute-intensive tasks.
///
/// Build it with `RayonManager::default()`, which sizes the pool from the CPU count.
pub struct RayonManager {
/// Smaller rayon thread pool for lower-priority, compute-intensive tasks.
/// By default ~25% of CPUs or a minimum of 1 thread.
pub low_priority_threadpool: Arc<ThreadPool>,
}
impl Default for RayonManager {
    /// Builds the low-priority pool with `num_cpus::get() / DEFAULT_LOW_PRIORITY_DIVISOR`
    /// threads, but never fewer than `MINIMUM_LOW_PRIORITY_THREAD_COUNT`.
    ///
    /// # Panics
    ///
    /// Panics if rayon fails to build the thread pool.
    fn default() -> Self {
        // Integer division rounds down (to zero below 4 CPUs), hence the floor.
        let thread_count = std::cmp::max(
            num_cpus::get() / DEFAULT_LOW_PRIORITY_DIVISOR,
            MINIMUM_LOW_PRIORITY_THREAD_COUNT,
        );

        let pool = ThreadPoolBuilder::new()
            .num_threads(thread_count)
            .build()
            .expect("failed to build low-priority rayon pool");

        Self {
            low_priority_threadpool: Arc::new(pool),
        }
    }
}

View File

@@ -17,7 +17,6 @@ use beacon_chain::{
store::{HotColdDB, ItemStore, StoreConfig},
};
use beacon_chain::{Kzg, LightClientProducerEvent};
use beacon_processor::rayon_manager::RayonManager;
use beacon_processor::{BeaconProcessor, BeaconProcessorChannels};
use beacon_processor::{BeaconProcessorConfig, BeaconProcessorQueueLengths};
use environment::RuntimeContext;
@@ -681,7 +680,6 @@ where
executor: beacon_processor_context.executor.clone(),
current_workers: 0,
config: beacon_processor_config,
rayon_manager: RayonManager::default(),
}
.spawn_manager(
beacon_processor_channels.beacon_processor_rx,

View File

@@ -5,7 +5,6 @@ use beacon_chain::{
};
use beacon_processor::{
BeaconProcessor, BeaconProcessorChannels, BeaconProcessorConfig, BeaconProcessorQueueLengths,
rayon_manager::RayonManager,
};
use directory::DEFAULT_ROOT_DIR;
use eth2::{BeaconNodeHttpClient, Timeouts};
@@ -248,7 +247,6 @@ pub async fn create_api_server_with_config<T: BeaconChainTypes>(
executor: test_runtime.task_executor.clone(),
current_workers: 0,
config: beacon_processor_config,
rayon_manager: RayonManager::default(),
}
.spawn_manager(
beacon_processor_rx,

View File

@@ -17,7 +17,6 @@ use beacon_chain::test_utils::{
test_spec,
};
use beacon_chain::{BeaconChain, WhenSlotSkipped};
use beacon_processor::rayon_manager::RayonManager;
use beacon_processor::{work_reprocessing_queue::*, *};
use gossipsub::MessageAcceptance;
use itertools::Itertools;
@@ -267,7 +266,6 @@ impl TestRig {
executor,
current_workers: 0,
config: beacon_processor_config,
rayon_manager: RayonManager::default(),
}
.spawn_manager(
beacon_processor_rx,

View File

@@ -8,6 +8,8 @@ edition = { workspace = true }
async-channel = { workspace = true }
futures = { workspace = true }
metrics = { workspace = true }
num_cpus = { workspace = true }
rayon = { workspace = true }
tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
tracing = { workspace = true }

View File

@@ -1,12 +1,15 @@
mod metrics;
mod rayon_pool_provider;
pub mod test_utils;
use futures::channel::mpsc::Sender;
use futures::prelude::*;
use std::sync::Weak;
use std::sync::{Arc, Weak};
use tokio::runtime::{Handle, Runtime};
use tracing::debug;
use crate::rayon_pool_provider::RayonPoolProvider;
pub use crate::rayon_pool_provider::RayonPoolType;
pub use tokio::task::JoinHandle;
/// Provides a reason when Lighthouse is shut down.
@@ -84,6 +87,8 @@ pub struct TaskExecutor {
// FIXME(sproul): delete?
#[allow(dead_code)]
service_name: String,
rayon_pool_provider: Arc<RayonPoolProvider>,
}
impl TaskExecutor {
@@ -105,6 +110,7 @@ impl TaskExecutor {
exit,
signal_tx,
service_name,
rayon_pool_provider: Arc::new(RayonPoolProvider::default()),
}
}
@@ -115,6 +121,7 @@ impl TaskExecutor {
exit: self.exit.clone(),
signal_tx: self.signal_tx.clone(),
service_name,
rayon_pool_provider: self.rayon_pool_provider.clone(),
}
}
@@ -226,6 +233,47 @@ impl TaskExecutor {
}
}
/// Spawns `task` as a blocking task (via `spawn_blocking`) and runs it inside the
/// rayon thread pool selected by `rayon_pool_type`.
///
/// The task is wrapped in `ThreadPool::install`, so rayon operations performed by
/// `task` execute on the selected pool rather than the global one.
///
/// Note that this method takes `self` by value, and that `task` comes *before*
/// `rayon_pool_type` in the argument list.
///
/// * `task` - the work to execute; its return value is `()`.
/// * `rayon_pool_type` - which of the executor's rayon pools to install.
/// * `name` - task name forwarded unchanged to `spawn_blocking`.
pub fn spawn_blocking_with_rayon<F>(
    self,
    task: F,
    rayon_pool_type: RayonPoolType,
    name: &'static str,
) where
    F: FnOnce() + Send + 'static,
{
    // Resolve the pool up front so the closure only captures the `Arc` handle.
    let pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type);
    self.spawn_blocking(move || pool.install(task), name)
}
/// Runs `task` on the rayon thread pool selected by `rayon_pool_type` and awaits
/// its result.
///
/// The closure is handed directly to the pool with `ThreadPool::spawn`; its return
/// value is delivered back to the caller over a tokio oneshot channel.
///
/// # Errors
///
/// Returns `RecvError` if the sending half of the channel is dropped without a
/// value having been sent, i.e. the closure never ran to completion.
///
/// NOTE(review): a panic inside `task` is not caught here. With rayon's `spawn`
/// it is routed to the pool's panic handler — confirm that is the intended
/// behaviour for callers of this method.
pub async fn spawn_blocking_with_rayon_async<F, R>(
    &self,
    rayon_pool_type: RayonPoolType,
    task: F,
) -> Result<R, tokio::sync::oneshot::error::RecvError>
where
    F: FnOnce() -> R + Send + 'static,
    R: Send + 'static,
{
    let pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type);
    let (result_tx, result_rx) = tokio::sync::oneshot::channel();

    pool.spawn(move || {
        // The receiver may already be gone (caller dropped the future); a failed
        // send is deliberately ignored.
        let _ = result_tx.send(task());
    });

    result_rx.await
}
/// Spawn a future on the tokio runtime wrapped in an `async-channel::Receiver` returning an optional
/// join handle to the future.
/// The task is cancelled when the corresponding async-channel is dropped.

View File

@@ -0,0 +1,58 @@
use rayon::{ThreadPool, ThreadPoolBuilder};
use std::sync::Arc;
/// Percentage of `num_cpus::get()` used to size the low-priority pool.
const DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE: usize = 25;
/// Percentage of `num_cpus::get()` used to size the high-priority pool.
const DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE: usize = 80;
/// Floor applied to both pool sizes, so the integer percentage arithmetic never
/// yields zero threads on small machines.
const MINIMUM_THREAD_COUNT: usize = 1;
/// Selects which of the `RayonPoolProvider`'s scoped thread pools a task runs on.
///
/// This is a plain selector with no payload, so it derives the usual value-type
/// traits and can be copied, compared and logged freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RayonPoolType {
    /// Selects the provider's high-priority pool.
    HighPriority,
    /// Selects the provider's low-priority pool.
    LowPriority,
}
/// Owns the scoped rayon thread pools, one per `RayonPoolType` variant.
///
/// Pools are stored behind `Arc` so callers can be handed cheap shared handles.
pub struct RayonPoolProvider {
/// Smaller rayon thread pool for lower-priority, compute-intensive tasks.
/// By default ~25% of CPUs or a minimum of 1 thread.
low_priority_thread_pool: Arc<ThreadPool>,
/// Larger rayon thread pool for high-priority, compute-intensive tasks.
/// By default ~80% of CPUs or a minimum of 1 thread. Critical/highest
/// priority tasks should use the global pool instead.
high_priority_thread_pool: Arc<ThreadPool>,
}
impl Default for RayonPoolProvider {
    /// Builds both pools, sizing each as a fixed percentage of `num_cpus::get()`
    /// with a floor of `MINIMUM_THREAD_COUNT` threads.
    ///
    /// # Panics
    ///
    /// Panics if rayon fails to build either thread pool.
    fn default() -> Self {
        // Integer arithmetic rounds down and can reach zero on small machines,
        // hence the floor.
        let threads_for = |cpu_percentage: usize| {
            std::cmp::max(
                num_cpus::get() * cpu_percentage / 100,
                MINIMUM_THREAD_COUNT,
            )
        };

        let low_pool = ThreadPoolBuilder::new()
            .num_threads(threads_for(DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE))
            .build()
            .expect("failed to build low-priority rayon pool");

        let high_pool = ThreadPoolBuilder::new()
            .num_threads(threads_for(DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE))
            .build()
            .expect("failed to build high-priority rayon pool");

        Self {
            low_priority_thread_pool: Arc::new(low_pool),
            high_priority_thread_pool: Arc::new(high_pool),
        }
    }
}
impl RayonPoolProvider {
    /// Returns a shared handle to the scoped thread pool for the given priority.
    ///
    /// Critical/highest priority work should use the global rayon pool instead of
    /// either pool returned here.
    pub fn get_thread_pool(&self, rayon_pool_type: RayonPoolType) -> Arc<ThreadPool> {
        let selected = match rayon_pool_type {
            RayonPoolType::HighPriority => &self.high_priority_thread_pool,
            RayonPoolType::LowPriority => &self.low_priority_thread_pool,
        };
        // Only the reference count is bumped; the pool itself is shared.
        Arc::clone(selected)
    }
}