mirror of
https://github.com/sigp/lighthouse.git
synced 2026-04-18 13:28:33 +00:00
Fix regression in DB write atomicity (#3931)
## Issue Addressed Fix a bug introduced by #3696. The bug is not expected to occur frequently, so releasing this PR is non-urgent. ## Proposed Changes * Add a variant to `StoreOp` that allows a raw KV operation to be passed around. * Return to using `self.store.do_atomically` rather than `self.store.hot_db.do_atomically`. This streamlines the write back into a single call and makes our auto-revert work again. * Prevent `import_block_update_shuffling_cache` from failing block import. This is an outstanding bug from before v3.4.0 which may have contributed to some random unexplained database corruption. ## Additional Info In #3696 I split the database write into two calls, one to convert the `StoreOp`s to `KeyValueStoreOp`s and one to write them. This had the unfortunate side-effect of damaging our atomicity guarantees in case of a write error. If the first call failed, we would be left with the block in fork choice but not on-disk (or the snapshot cache), which would prevent us from processing any descendant blocks. On `unstable` the first call is very unlikely to fail unless the disk is full, but on `tree-states` the conversion is more involved and a user reported database corruption after it failed in a way that should have been recoverable. Additionally, as @emhane observed, #3696 also inadvertently removed the import of the new block into the block cache. Although this seems like it could have negatively impacted performance, there are several mitigating factors: - For regular block processing we should almost always load the parent block (and state) from the snapshot cache. - We often load blinded blocks, which bypass the block cache anyway. - Metrics show no noticeable increase in the block cache miss rate with v3.4.0. However, I expect the block cache _will_ be useful again in `tree-states`, so it is restored to use by this PR.
This commit is contained in:
@@ -2714,7 +2714,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         // is so we don't have to think about lock ordering with respect to the fork choice lock.
         // There are a bunch of places where we lock both fork choice and the pubkey cache and it
         // would be difficult to check that they all lock fork choice first.
-        let mut kv_store_ops = self
+        let mut ops = self
             .validator_pubkey_cache
             .try_write_for(VALIDATOR_PUBKEY_CACHE_LOCK_TIMEOUT)
             .ok_or(Error::ValidatorPubkeyCacheLockTimeout)?
@@ -2816,9 +2816,14 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         // ---------------------------- BLOCK PROBABLY ATTESTABLE ----------------------------------
         // Most blocks are now capable of being attested to thanks to the `early_attester_cache`
         // cache above. Resume non-essential processing.
         //
+        // It is important NOT to return errors here before the database commit, because the block
+        // has already been added to fork choice and the database would be left in an inconsistent
+        // state if we returned early without committing. In other words, an error here would
+        // corrupt the node's database permanently.
+        // -----------------------------------------------------------------------------------------

-        self.import_block_update_shuffling_cache(block_root, &mut state)?;
+        self.import_block_update_shuffling_cache(block_root, &mut state);
         self.import_block_observe_attestations(
             block,
             &state,
@@ -2841,17 +2846,16 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         // If the write fails, revert fork choice to the version from disk, else we can
         // end up with blocks in fork choice that are missing from disk.
         // See https://github.com/sigp/lighthouse/issues/2028
-        let mut ops: Vec<_> = confirmed_state_roots
-            .into_iter()
-            .map(StoreOp::DeleteStateTemporaryFlag)
-            .collect();
+        ops.extend(
+            confirmed_state_roots
+                .into_iter()
+                .map(StoreOp::DeleteStateTemporaryFlag),
+        );
         ops.push(StoreOp::PutBlock(block_root, signed_block.clone()));
         ops.push(StoreOp::PutState(block.state_root(), &state));
         let txn_lock = self.store.hot_db.begin_rw_transaction();

-        kv_store_ops.extend(self.store.convert_to_kv_batch(ops)?);
-
-        if let Err(e) = self.store.hot_db.do_atomically(kv_store_ops) {
+        if let Err(e) = self.store.do_atomically(ops) {
             error!(
                 self.log,
                 "Database write failed!";
@@ -3280,13 +3284,27 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
         }
     }

-    // For the current and next epoch of this state, ensure we have the shuffling from this
-    // block in our cache.
     fn import_block_update_shuffling_cache(
         &self,
         block_root: Hash256,
         state: &mut BeaconState<T::EthSpec>,
-    ) -> Result<(), BlockError<T::EthSpec>> {
+    ) {
+        if let Err(e) = self.import_block_update_shuffling_cache_fallible(block_root, state) {
+            warn!(
+                self.log,
+                "Failed to prime shuffling cache";
+                "error" => ?e
+            );
+        }
+    }
+
+    fn import_block_update_shuffling_cache_fallible(
+        &self,
+        block_root: Hash256,
+        state: &mut BeaconState<T::EthSpec>,
+    ) -> Result<(), BlockError<T::EthSpec>> {
+        // For the current and next epoch of this state, ensure we have the shuffling from this
+        // block in our cache.
         for relative_epoch in [RelativeEpoch::Current, RelativeEpoch::Next] {
             let shuffling_id = AttestationShufflingId::new(block_root, state, relative_epoch)?;
|
||||
|
||||
Reference in New Issue
Block a user