mirror of
https://github.com/sigp/lighthouse.git
synced 2026-04-18 13:28:33 +00:00
Shutdown gracefully on panic (#2596)
## Proposed Changes
* Modify the `TaskExecutor` so that it spawns a "monitor" future for each future spawned by `spawn` or `spawn_blocking`. This monitor future awaits the child future's join handle and shuts down the executor if it detects that the child panicked.
* Enable backtraces by default by setting the environment variable `RUST_BACKTRACE`.
* Spawn the `ProductionBeaconNode` on the `TaskExecutor` so that if a panic occurs during start-up it will take down the whole process. Previously we were using a raw Tokio `spawn`, but I can't see any reason not to use the executor (perhaps someone else can).
## Additional Info
I considered using [`std::panic::set_hook`](https://doc.rust-lang.org/std/panic/fn.set_hook.html) to instantiate a custom panic handler, however this doesn't allow us to send a shutdown signal because `Fn` functions can't move variables (i.e. the shutdown sender) out of their environment. This also prevents it from receiving a `Logger`. Hence I decided to leave the panic handler untouched, but with backtraces turned on by default.
I did a run through the code base with all the raw Tokio spawn functions disallowed by Clippy, and found only two instances where we bypass the `TaskExecutor`: the HTTP API and `InitializedValidators` in the VC. In both places we use `spawn_blocking` and handle the return value, so I figured that was OK for now.
In terms of performance I think the overhead should be minimal. The monitor tasks will just get parked by the executor until their child resolves.
I've checked that this covers Discv5, as the `TaskExecutor` gets injected into Discv5 here: f9bba92db3/beacon_node/src/lib.rs (L125-L126)
This commit is contained in:
@@ -32,6 +32,11 @@ fn bls_library_name() -> &'static str {
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Enable backtraces unless a RUST_BACKTRACE value has already been explicitly provided.
|
||||
if std::env::var("RUST_BACKTRACE").is_err() {
|
||||
std::env::set_var("RUST_BACKTRACE", "1");
|
||||
}
|
||||
|
||||
// Parse the CLI parameters.
|
||||
let matches = App::new("Lighthouse")
|
||||
.version(VERSION.replace("Lighthouse/", "").as_str())
|
||||
@@ -344,20 +349,23 @@ fn run<E: EthSpec>(
|
||||
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
||||
};
|
||||
|
||||
environment.runtime().spawn(async move {
|
||||
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
|
||||
crit!(log, "Failed to start beacon node"; "reason" => e);
|
||||
// Ignore the error since it always occurs during normal operation when
|
||||
// shutting down.
|
||||
let _ = executor
|
||||
.shutdown_sender()
|
||||
.try_send(ShutdownReason::Failure("Failed to start beacon node"));
|
||||
} else if shutdown_flag {
|
||||
let _ = executor.shutdown_sender().try_send(ShutdownReason::Success(
|
||||
"Beacon node immediate shutdown triggered.",
|
||||
));
|
||||
}
|
||||
});
|
||||
executor.clone().spawn(
|
||||
async move {
|
||||
if let Err(e) = ProductionBeaconNode::new(context.clone(), config).await {
|
||||
crit!(log, "Failed to start beacon node"; "reason" => e);
|
||||
// Ignore the error since it always occurs during normal operation when
|
||||
// shutting down.
|
||||
let _ = executor
|
||||
.shutdown_sender()
|
||||
.try_send(ShutdownReason::Failure("Failed to start beacon node"));
|
||||
} else if shutdown_flag {
|
||||
let _ = executor.shutdown_sender().try_send(ShutdownReason::Success(
|
||||
"Beacon node immediate shutdown triggered.",
|
||||
));
|
||||
}
|
||||
},
|
||||
"beacon_node",
|
||||
);
|
||||
}
|
||||
("validator_client", Some(matches)) => {
|
||||
let context = environment.core_context();
|
||||
@@ -374,19 +382,22 @@ fn run<E: EthSpec>(
|
||||
.map_err(|e| format!("Error serializing config: {:?}", e))?;
|
||||
};
|
||||
if !shutdown_flag {
|
||||
environment.runtime().spawn(async move {
|
||||
if let Err(e) = ProductionValidatorClient::new(context, config)
|
||||
.await
|
||||
.and_then(|mut vc| vc.start_service())
|
||||
{
|
||||
crit!(log, "Failed to start validator client"; "reason" => e);
|
||||
// Ignore the error since it always occurs during normal operation when
|
||||
// shutting down.
|
||||
let _ = executor
|
||||
.shutdown_sender()
|
||||
.try_send(ShutdownReason::Failure("Failed to start validator client"));
|
||||
}
|
||||
});
|
||||
executor.clone().spawn(
|
||||
async move {
|
||||
if let Err(e) = ProductionValidatorClient::new(context, config)
|
||||
.await
|
||||
.and_then(|mut vc| vc.start_service())
|
||||
{
|
||||
crit!(log, "Failed to start validator client"; "reason" => e);
|
||||
// Ignore the error since it always occurs during normal operation when
|
||||
// shutting down.
|
||||
let _ = executor.shutdown_sender().try_send(ShutdownReason::Failure(
|
||||
"Failed to start validator client",
|
||||
));
|
||||
}
|
||||
},
|
||||
"validator_client",
|
||||
);
|
||||
} else {
|
||||
let _ = executor.shutdown_sender().try_send(ShutdownReason::Success(
|
||||
"Validator client immediate shutdown triggered.",
|
||||
|
||||
Reference in New Issue
Block a user