mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-14 10:22:38 +00:00
Fix I/O atomicity issues with checkpoint sync (#2671)
## Issue Addressed

This PR addresses an issue found by @YorickDowne during testing of v2.0.0-rc.0. Due to a lack of atomic database writes on checkpoint sync start-up, it was possible for the database to get into an inconsistent state from which it couldn't recover without `--purge-db`. The core of the issue was that the store's anchor info was being stored _before_ the `PersistedBeaconChain`. If a crash occurred so that anchor info was stored but _not_ the `PersistedBeaconChain`, then on restart Lighthouse would think the database was uninitialized and attempt to compare-and-swap a `None` value, but would actually find the stale info from the previous run.

## Proposed Changes

The issue is fixed by writing the anchor info, the split point, and the `PersistedBeaconChain` atomically on start-up. Some type-hinting ugliness was required, which could possibly be cleaned up in future refactors.
This commit is contained in:
@@ -261,7 +261,6 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
|
||||
}
|
||||
|
||||
/// Prepare a signed beacon block for storage in the database.
|
||||
#[must_use]
|
||||
pub fn block_as_kv_store_op(
|
||||
&self,
|
||||
key: &Hash256,
|
||||
@@ -973,7 +972,7 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
|
||||
}
|
||||
|
||||
/// Initialise the anchor info for checkpoint sync starting from `block`.
|
||||
pub fn init_anchor_info(&self, block: BeaconBlockRef<'_, E>) -> Result<(), Error> {
|
||||
pub fn init_anchor_info(&self, block: BeaconBlockRef<'_, E>) -> Result<KeyValueStoreOp, Error> {
|
||||
let anchor_slot = block.slot();
|
||||
let slots_per_restore_point = self.config.slots_per_restore_point;
|
||||
|
||||
@@ -1003,23 +1002,36 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
|
||||
|
||||
/// Atomically update the anchor info from `prev_value` to `new_value`.
|
||||
///
|
||||
/// Return a `KeyValueStoreOp` which should be written to disk, possibly atomically with other
|
||||
/// values.
|
||||
///
|
||||
/// Return an `AnchorInfoConcurrentMutation` error if the `prev_value` provided
|
||||
/// is not correct.
|
||||
pub fn compare_and_set_anchor_info(
|
||||
&self,
|
||||
prev_value: Option<AnchorInfo>,
|
||||
new_value: Option<AnchorInfo>,
|
||||
) -> Result<(), Error> {
|
||||
) -> Result<KeyValueStoreOp, Error> {
|
||||
let mut anchor_info = self.anchor_info.write();
|
||||
if *anchor_info == prev_value {
|
||||
self.store_anchor_info(&new_value)?;
|
||||
let kv_op = self.store_anchor_info_in_batch(&new_value);
|
||||
*anchor_info = new_value;
|
||||
Ok(())
|
||||
Ok(kv_op)
|
||||
} else {
|
||||
Err(Error::AnchorInfoConcurrentMutation)
|
||||
}
|
||||
}
|
||||
|
||||
/// As for `compare_and_set_anchor_info`, but also writes the anchor to disk immediately.
|
||||
pub fn compare_and_set_anchor_info_with_write(
|
||||
&self,
|
||||
prev_value: Option<AnchorInfo>,
|
||||
new_value: Option<AnchorInfo>,
|
||||
) -> Result<(), Error> {
|
||||
let kv_store_op = self.compare_and_set_anchor_info(prev_value, new_value)?;
|
||||
self.hot_db.do_atomically(vec![kv_store_op])
|
||||
}
|
||||
|
||||
/// Load the anchor info from disk, but do not set `self.anchor_info`.
|
||||
fn load_anchor_info(&self) -> Result<Option<AnchorInfo>, Error> {
|
||||
self.hot_db.get(&ANCHOR_INFO_KEY)
|
||||
@@ -1029,13 +1041,15 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
|
||||
///
|
||||
/// The argument is intended to be `self.anchor_info`, but is passed manually to avoid issues
|
||||
/// with recursive locking.
|
||||
fn store_anchor_info(&self, anchor_info: &Option<AnchorInfo>) -> Result<(), Error> {
|
||||
fn store_anchor_info_in_batch(&self, anchor_info: &Option<AnchorInfo>) -> KeyValueStoreOp {
|
||||
if let Some(ref anchor_info) = anchor_info {
|
||||
self.hot_db.put(&ANCHOR_INFO_KEY, anchor_info)?;
|
||||
anchor_info.as_kv_store_op(ANCHOR_INFO_KEY)
|
||||
} else {
|
||||
self.hot_db.delete::<AnchorInfo>(&ANCHOR_INFO_KEY)?;
|
||||
KeyValueStoreOp::DeleteKey(get_key_for_col(
|
||||
DBColumn::BeaconMeta.into(),
|
||||
ANCHOR_INFO_KEY.as_bytes(),
|
||||
))
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// If an anchor exists, return its `anchor_slot` field.
|
||||
@@ -1103,10 +1117,9 @@ impl<E: EthSpec, Hot: ItemStore<E>, Cold: ItemStore<E>> HotColdDB<E, Hot, Cold>
|
||||
self.hot_db.get(&SPLIT_KEY)
|
||||
}
|
||||
|
||||
/// Store the split point to disk.
|
||||
pub fn store_split(&self) -> Result<(), Error> {
|
||||
self.hot_db.put_sync(&SPLIT_KEY, &*self.split.read())?;
|
||||
Ok(())
|
||||
/// Stage the split for storage to disk.
|
||||
pub fn store_split_in_batch(&self) -> KeyValueStoreOp {
|
||||
self.split.read_recursive().as_kv_store_op(SPLIT_KEY)
|
||||
}
|
||||
|
||||
/// Load the state root of a restore point.
|
||||
|
||||
@@ -81,6 +81,7 @@ pub fn get_key_for_col(column: &str, key: &[u8]) -> Vec<u8> {
|
||||
result
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub enum KeyValueStoreOp {
|
||||
PutKeyValue(Vec<u8>, Vec<u8>),
|
||||
DeleteKey(Vec<u8>),
|
||||
|
||||
@@ -189,7 +189,6 @@ impl<T: EthSpec> PartialBeaconState<T> {
|
||||
}
|
||||
|
||||
/// Prepare the partial state for storage in the KV database.
|
||||
#[must_use]
|
||||
pub fn as_kv_store_op(&self, state_root: Hash256) -> KeyValueStoreOp {
|
||||
let db_key = get_key_for_col(DBColumn::BeaconState.into(), state_root.as_bytes());
|
||||
KeyValueStoreOp::PutKeyValue(db_key, self.as_ssz_bytes())
|
||||
|
||||
@@ -131,14 +131,17 @@ where
|
||||
});
|
||||
}
|
||||
|
||||
self.compare_and_set_anchor_info(old_anchor, None)?;
|
||||
self.compare_and_set_anchor_info_with_write(old_anchor, None)?;
|
||||
|
||||
return Ok(());
|
||||
} else {
|
||||
// The lower limit has been raised, store it.
|
||||
anchor.state_lower_limit = slot;
|
||||
|
||||
self.compare_and_set_anchor_info(old_anchor, Some(anchor.clone()))?;
|
||||
self.compare_and_set_anchor_info_with_write(
|
||||
old_anchor,
|
||||
Some(anchor.clone()),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user