From d2e3d4c6f1c75d9d4a014f5470dfb917933e7d70 Mon Sep 17 00:00:00 2001 From: Michael Sproul Date: Tue, 19 Oct 2021 00:30:40 +0000 Subject: [PATCH] Add flag to disable lock timeouts (#2714) ## Issue Addressed Mitigates #1096 ## Proposed Changes Add a flag to the beacon node called `--disable-lock-timeouts` which allows opting out of lock timeouts. The lock timeouts serve a dual purpose: 1. They prevent any single operation from hogging the lock for too long. When a timeout occurs it logs a nasty error which indicates that there's suboptimal lock use occurring, which we can then act on. 2. They allow deadlock detection. We're fairly sure there are no deadlocks left in Lighthouse anymore but the timeout locks offer a safeguard against that. However, timeouts on locks are not without downsides: They allow for the possibility of livelock, particularly on slower hardware. If lock timeouts keep failing spuriously the node can be prevented from making any progress, even if it would be able to make progress slowly without the timeout. One particularly concerning scenario which could occur would be if a DoS attack succeeded in slowing block signature verification times across the network, and all Lighthouse nodes got livelocked because they timed out repeatedly. This could also occur on just a subset of nodes (e.g. dual core VPSs or Raspberri Pis). By making the behaviour runtime configurable this PR allows us to choose the behaviour we want depending on circumstance. I suspect that long term we could make the timeout-free approach the default (#2381 moves in this direction) and just enable the timeouts on our testnet nodes for debugging purposes. This PR conservatively leaves the default as-is so we can gain some more experience before switching the default. --- beacon_node/beacon_chain/src/chain_config.rs | 3 ++ beacon_node/beacon_chain/src/lib.rs | 1 + .../beacon_chain/src/timeout_rw_lock.rs | 32 +++++++++++++++++-- beacon_node/src/cli.rs | 8 +++++ beacon_node/src/config.rs | 4 +++ beacon_node/src/lib.rs | 6 ++++ lighthouse/tests/beacon_node.rs | 15 +++++++++ 7 files changed, 67 insertions(+), 2 deletions(-) diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index 9d02003230..9fe09c9822 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -14,6 +14,8 @@ pub struct ChainConfig { pub weak_subjectivity_checkpoint: Option, /// Determine whether to reconstruct historic states, usually after a checkpoint sync. pub reconstruct_historic_states: bool, + /// Whether timeouts on `TimeoutRwLock`s are enabled or not. + pub enable_lock_timeouts: bool, } impl Default for ChainConfig { @@ -22,6 +24,7 @@ impl Default for ChainConfig { import_max_skip_slots: None, weak_subjectivity_checkpoint: None, reconstruct_historic_states: false, + enable_lock_timeouts: true, } } } diff --git a/beacon_node/beacon_chain/src/lib.rs b/beacon_node/beacon_chain/src/lib.rs index fd4cb14956..8e70e72ba0 100644 --- a/beacon_node/beacon_chain/src/lib.rs +++ b/beacon_node/beacon_chain/src/lib.rs @@ -55,4 +55,5 @@ pub use state_processing::per_block_processing::errors::{ ExitValidationError, ProposerSlashingValidationError, }; pub use store; +pub use timeout_rw_lock::TimeoutRwLock; pub use types; diff --git a/beacon_node/beacon_chain/src/timeout_rw_lock.rs b/beacon_node/beacon_chain/src/timeout_rw_lock.rs index 19d3c3d4fa..28e3f4c29c 100644 --- a/beacon_node/beacon_chain/src/timeout_rw_lock.rs +++ b/beacon_node/beacon_chain/src/timeout_rw_lock.rs @@ -1,20 +1,48 @@ use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Duration; /// A simple wrapper around `parking_lot::RwLock` that only permits read/write access with a /// time-out (i.e., no indefinitely-blocking operations). +/// +/// Timeouts can be optionally be disabled at runtime for all instances of this type by calling +/// `TimeoutRwLock::disable_timeouts()`. pub struct TimeoutRwLock(RwLock); +const TIMEOUT_LOCKS_ENABLED_DEFAULT: bool = true; +static TIMEOUT_LOCKS_ENABLED: AtomicBool = AtomicBool::new(TIMEOUT_LOCKS_ENABLED_DEFAULT); + +impl TimeoutRwLock<()> { + pub fn disable_timeouts() { + // Use the strongest `SeqCst` ordering for the write, as it should only happen once. + TIMEOUT_LOCKS_ENABLED.store(false, Ordering::SeqCst); + } +} + impl TimeoutRwLock { pub fn new(inner: T) -> Self { Self(RwLock::new(inner)) } + fn timeouts_enabled() -> bool { + // Use relaxed ordering as it's OK for a few locks to run with timeouts "accidentally", + // and we want the atomic check to be as fast as possible. + TIMEOUT_LOCKS_ENABLED.load(Ordering::Relaxed) + } + pub fn try_read_for(&self, timeout: Duration) -> Option> { - self.0.try_read_for(timeout) + if Self::timeouts_enabled() { + self.0.try_read_for(timeout) + } else { + Some(self.0.read()) + } } pub fn try_write_for(&self, timeout: Duration) -> Option> { - self.0.try_write_for(timeout) + if Self::timeouts_enabled() { + self.0.try_write_for(timeout) + } else { + Some(self.0.write()) + } } } diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index f95b94b0c3..b90410aaca 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -568,4 +568,12 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> { .value_name("PATH") .takes_value(true) ) + .arg( + Arg::with_name("disable-lock-timeouts") + .long("disable-lock-timeouts") + .help("Disable the timeouts applied to some internal locks by default. This can \ + lead to less spurious failures on slow hardware but is considered \ + experimental as it may obscure performance issues.") + .takes_value(false) + ) } diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 8369ebe053..fb0322f623 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -504,6 +504,10 @@ pub fn get_config( .extend_from_slice(&pubkeys); } + if cli_args.is_present("disable-lock-timeouts") { + client_config.chain.enable_lock_timeouts = false; + } + Ok(client_config) } diff --git a/beacon_node/src/lib.rs b/beacon_node/src/lib.rs index bdaefb0e9a..d452e3e463 100644 --- a/beacon_node/src/lib.rs +++ b/beacon_node/src/lib.rs @@ -8,6 +8,7 @@ pub use beacon_chain; use beacon_chain::store::LevelDB; use beacon_chain::{ builder::Witness, eth1_chain::CachingEth1Backend, slot_clock::SystemTimeSlotClock, + TimeoutRwLock, }; use clap::ArgMatches; pub use cli::cli_app; @@ -66,6 +67,11 @@ impl ProductionBeaconNode { let freezer_db_path = client_config.create_freezer_db_path()?; let executor = context.executor.clone(); + if !client_config.chain.enable_lock_timeouts { + info!(log, "Disabling lock timeouts globally"); + TimeoutRwLock::disable_timeouts() + } + let builder = ClientBuilder::new(context.eth_spec_instance.clone()) .runtime_context(context) .chain_spec(spec) diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index b95d8b942f..2456c8d505 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -177,6 +177,21 @@ fn max_skip_slots_flag() { .with_config(|config| assert_eq!(config.chain.import_max_skip_slots, Some(10))); } +#[test] +fn enable_lock_timeouts_default() { + CommandLineTest::new() + .run() + .with_config(|config| assert!(config.chain.enable_lock_timeouts)); +} + +#[test] +fn disable_lock_timeouts_flag() { + CommandLineTest::new() + .flag("disable-lock-timeouts", None) + .run() + .with_config(|config| assert!(!config.chain.enable_lock_timeouts)); +} + #[test] fn freezer_dir_flag() { let dir = TempDir::new().expect("Unable to create temporary directory");