Improve tokio task execution (#1181)

* Add logging on shutdown

* Replace tokio::spawn with handle.spawn

* Upgrade tokio

* Add a task executor

* Beacon chain tasks use task executor

* Validator client tasks use task executor

* Rename runtime_handle to executor

* Add duration histograms; minor fixes

* Cleanup

* Fix logs

* Fix tests

* Remove random file

* Get enr dependency instead of libp2p

* Address some review comments

* Libp2p takes a TaskExecutor

* Ugly fix libp2p tests

* Move TaskExecutor to own file

* Upgrade Dockerfile rust version

* Minor fixes

* Revert "Ugly fix libp2p tests"

This reverts commit 58d4bb690f.

* Pretty fix libp2p tests

* Add spawn_without_exit; change Counter to Gauge

* Tidy

* Move log from RuntimeContext to TaskExecutor

* Fix errors

* Replace histogram with int_gauge for async tasks

* Fix todo

* Fix memory leak in test by exiting all spawned tasks at the end
This commit is contained in:
Pawan Dhananjay
2020-06-04 17:18:05 +05:30
committed by GitHub
parent ce10db15da
commit 042e80570c
53 changed files with 541 additions and 361 deletions

View File

@@ -6,7 +6,7 @@ edition = "2018"
[dependencies]
clap = "2.33.0"
tokio = "0.2.20"
tokio = "0.2.21"
slog = { version = "2.5.2", features = ["max_level_trace"] }
sloggers = "1.0.0"
types = { "path" = "../../consensus/types" }
@@ -20,6 +20,9 @@ ctrlc = { version = "3.1.4", features = ["termination"] }
futures = "0.3.5"
parking_lot = "0.10.2"
slog-json = "2.3.0"
exit-future = "0.2.0"
lazy_static = "1.4.0"
lighthouse_metrics = { path = "../../common/lighthouse_metrics" }
[dev-dependencies]
beacon_node = { path = "../../beacon_node" }

View File

@@ -0,0 +1,128 @@
use crate::metrics;
use futures::prelude::*;
use slog::{debug, trace};
use tokio::runtime::Handle;
/// A wrapper over a runtime handle which can spawn async and blocking tasks.
#[derive(Clone)]
pub struct TaskExecutor {
/// The handle to the runtime on which tasks are spawned
pub(crate) handle: Handle,
/// The receiver exit future which on receiving shuts down the task
pub(crate) exit: exit_future::Exit,
pub(crate) log: slog::Logger,
}
impl TaskExecutor {
/// Create a new task executor.
///
/// Note: this function is mainly useful in tests. A `TaskExecutor` should be normally obtained from
/// a [`RuntimeContext`](struct.RuntimeContext.html)
pub fn new(handle: Handle, exit: exit_future::Exit, log: slog::Logger) -> Self {
Self { handle, exit, log }
}
/// Spawn a future on the tokio runtime wrapped in an `exit_future::Exit`. The task is canceled
/// when the corresponding exit_future `Signal` is fired/dropped.
///
/// This function generates prometheus metrics on number of tasks and task duration.
pub fn spawn(&self, task: impl Future<Output = ()> + Send + 'static, name: &'static str) {
let exit = self.exit.clone();
let log = self.log.clone();
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::ASYNC_TASKS_COUNT, &[name]) {
// Task is shutdown before it completes if `exit` receives
let int_gauge_1 = int_gauge.clone();
let future = future::select(Box::pin(task), exit).then(move |either| {
match either {
future::Either::Left(_) => trace!(log, "Async task completed"; "task" => name),
future::Either::Right(_) => {
debug!(log, "Async task shutdown, exit received"; "task" => name)
}
}
int_gauge_1.dec();
futures::future::ready(())
});
int_gauge.inc();
self.handle.spawn(future);
}
}
/// Spawn a future on the tokio runtime. This function does not wrap the task in an `exit_future::Exit`
/// like [spawn](#method.spawn).
/// The caller of this function is responsible for wrapping up the task with an `exit_future::Exit` to
/// ensure that the task gets canceled appropriately.
/// This function generates prometheus metrics on number of tasks and task duration.
///
/// This is useful in cases where the future to be spawned needs to do additional cleanup work when
/// the task is completed/canceled (e.g. writing local variables to disk) or the task is created from
/// some framework which does its own cleanup (e.g. a hyper server).
pub fn spawn_without_exit(
&self,
task: impl Future<Output = ()> + Send + 'static,
name: &'static str,
) {
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::ASYNC_TASKS_COUNT, &[name]) {
let int_gauge_1 = int_gauge.clone();
let future = task.then(move |_| {
int_gauge_1.dec();
futures::future::ready(())
});
int_gauge.inc();
self.handle.spawn(future);
}
}
/// Spawn a blocking task on a dedicated tokio thread pool wrapped in an exit future.
/// This function generates prometheus metrics on number of tasks and task duration.
pub fn spawn_blocking<F>(&self, task: F, name: &'static str)
where
F: FnOnce() -> () + Send + 'static,
{
let exit = self.exit.clone();
let log = self.log.clone();
if let Some(metric) = metrics::get_histogram(&metrics::BLOCKING_TASKS_HISTOGRAM, &[name]) {
if let Some(int_gauge) = metrics::get_int_gauge(&metrics::BLOCKING_TASKS_COUNT, &[name])
{
let int_gauge_1 = int_gauge.clone();
let timer = metric.start_timer();
let join_handle = self.handle.spawn_blocking(task);
let future = future::select(join_handle, exit).then(move |either| {
match either {
future::Either::Left(_) => {
trace!(log, "Blocking task completed"; "task" => name)
}
future::Either::Right(_) => {
debug!(log, "Blocking task shutdown, exit received"; "task" => name)
}
}
timer.observe_duration();
int_gauge_1.dec();
futures::future::ready(())
});
int_gauge.inc();
self.handle.spawn(future);
}
}
}
/// Returns the underlying runtime handle.
pub fn runtime_handle(&self) -> Handle {
self.handle.clone()
}
/// Returns a copy of the `exit_future::Exit`.
pub fn exit(&self) -> exit_future::Exit {
self.exit.clone()
}
/// Returns a reference to the logger.
pub fn log(&self) -> &slog::Logger {
&self.log
}
}

View File

@@ -10,6 +10,8 @@
use eth2_config::Eth2Config;
use eth2_testnet_config::Eth2TestnetConfig;
use futures::channel::oneshot;
pub use executor::TaskExecutor;
use slog::{info, o, Drain, Level, Logger};
use sloggers::{null::NullLoggerBuilder, Build};
use std::cell::RefCell;
@@ -17,8 +19,10 @@ use std::ffi::OsStr;
use std::fs::{rename as FsRename, OpenOptions};
use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::runtime::{Builder as RuntimeBuilder, Handle, Runtime};
use tokio::runtime::{Builder as RuntimeBuilder, Runtime};
use types::{EthSpec, InteropEthSpec, MainnetEthSpec, MinimalEthSpec};
mod executor;
mod metrics;
pub const ETH2_CONFIG_FILENAME: &str = "eth2-spec.toml";
@@ -172,10 +176,13 @@ impl<E: EthSpec> EnvironmentBuilder<E> {
/// Consumes the builder, returning an `Environment`.
pub fn build(self) -> Result<Environment<E>, String> {
let (signal, exit) = exit_future::signal();
Ok(Environment {
runtime: self
.runtime
.ok_or_else(|| "Cannot build environment without runtime".to_string())?,
signal: Some(signal),
exit,
log: self
.log
.ok_or_else(|| "Cannot build environment without log".to_string())?,
@@ -192,8 +199,7 @@ impl<E: EthSpec> EnvironmentBuilder<E> {
/// `Runtime`, instead it only has access to a `Runtime`.
#[derive(Clone)]
pub struct RuntimeContext<E: EthSpec> {
pub runtime_handle: Handle,
pub log: Logger,
pub executor: TaskExecutor,
pub eth_spec_instance: E,
pub eth2_config: Eth2Config,
}
@@ -204,8 +210,11 @@ impl<E: EthSpec> RuntimeContext<E> {
/// The generated service will have the `service_name` in all it's logs.
pub fn service_context(&self, service_name: String) -> Self {
Self {
runtime_handle: self.runtime_handle.clone(),
log: self.log.new(o!("service" => service_name)),
executor: TaskExecutor {
handle: self.executor.handle.clone(),
exit: self.executor.exit.clone(),
log: self.executor.log.new(o!("service" => service_name)),
},
eth_spec_instance: self.eth_spec_instance.clone(),
eth2_config: self.eth2_config.clone(),
}
@@ -215,12 +224,19 @@ impl<E: EthSpec> RuntimeContext<E> {
pub fn eth2_config(&self) -> &Eth2Config {
&self.eth2_config
}
/// Returns a reference to the logger for this service.
pub fn log(&self) -> &slog::Logger {
self.executor.log()
}
}
/// An environment where Lighthouse services can run. Used to start a production beacon node or
/// validator client, or to run tests that involve logging and async task execution.
pub struct Environment<E: EthSpec> {
runtime: Runtime,
signal: Option<exit_future::Signal>,
exit: exit_future::Exit,
log: Logger,
eth_spec_instance: E,
pub eth2_config: Eth2Config,
@@ -239,8 +255,11 @@ impl<E: EthSpec> Environment<E> {
/// Returns a `Context` where no "service" has been added to the logger output.
pub fn core_context(&mut self) -> RuntimeContext<E> {
RuntimeContext {
runtime_handle: self.runtime.handle().clone(),
log: self.log.clone(),
executor: TaskExecutor {
exit: self.exit.clone(),
handle: self.runtime().handle().clone(),
log: self.log.clone(),
},
eth_spec_instance: self.eth_spec_instance.clone(),
eth2_config: self.eth2_config.clone(),
}
@@ -249,8 +268,11 @@ impl<E: EthSpec> Environment<E> {
/// Returns a `Context` where the `service_name` is added to the logger output.
pub fn service_context(&mut self, service_name: String) -> RuntimeContext<E> {
RuntimeContext {
runtime_handle: self.runtime.handle().clone(),
log: self.log.new(o!("service" => service_name)),
executor: TaskExecutor {
exit: self.exit.clone(),
handle: self.runtime().handle().clone(),
log: self.log.new(o!("service" => service_name.clone())),
},
eth_spec_instance: self.eth_spec_instance.clone(),
eth2_config: self.eth2_config.clone(),
}
@@ -279,6 +301,13 @@ impl<E: EthSpec> Environment<E> {
.shutdown_timeout(std::time::Duration::from_secs(2))
}
/// Fire exit signal which shuts down all spawned services
pub fn fire_signal(&mut self) {
if let Some(signal) = self.signal.take() {
let _ = signal.fire();
}
}
/// Sets the logger (and all child loggers) to log to a file.
pub fn log_to_json_file(
&mut self,

View File

@@ -0,0 +1,21 @@
/// Handles async task metrics
use lazy_static::lazy_static;
pub use lighthouse_metrics::*;
lazy_static! {
pub static ref ASYNC_TASKS_COUNT: Result<IntGaugeVec> = try_create_int_gauge_vec(
"async_tasks_count",
"Total number of async tasks spawned using spawn",
&["async_task_count"]
);
pub static ref BLOCKING_TASKS_COUNT: Result<IntGaugeVec> = try_create_int_gauge_vec(
"blocking_tasks_count",
"Total number of async tasks spawned using spawn_blocking",
&["blocking_task_count"]
);
pub static ref BLOCKING_TASKS_HISTOGRAM: Result<HistogramVec> = try_create_histogram_vec(
"blocking_tasks_histogram",
"Time taken by blocking tasks",
&["blocking_task_hist"]
);
}