Implement Subnet Sampling for PeerDAS (#6410)

* Add `SAMPLES_PER_SLOT` config.

* Rename `sampling` module to `peer_sampling`

* Implement subnet sampling.

* Update lookup test.

* Merge branch 'unstable' into subnet-sampling

* Merge branch 'unstable' into subnet-sampling

# Conflicts:
#	beacon_node/beacon_chain/src/data_availability_checker.rs
#	beacon_node/http_api/src/publish_blocks.rs
#	beacon_node/lighthouse_network/src/types/globals.rs
#	beacon_node/network/src/sync/manager.rs

* Merge branch 'unstable' into subnet-sampling
This commit is contained in:
Jimmy Chen
2024-10-04 10:27:30 +10:00
committed by GitHub
parent a4a673b780
commit f3a5e256da
20 changed files with 122 additions and 80 deletions

View File

@@ -0,0 +1,659 @@
use self::request::ActiveColumnSampleRequest;
use super::network_context::{
DataColumnsByRootSingleBlockRequest, RpcResponseError, SyncNetworkContext,
};
use crate::metrics;
use beacon_chain::BeaconChainTypes;
use fnv::FnvHashMap;
use lighthouse_network::service::api_types::{
DataColumnsByRootRequester, SamplingId, SamplingRequestId, SamplingRequester,
};
use lighthouse_network::{PeerAction, PeerId};
use rand::{seq::SliceRandom, thread_rng};
use slog::{debug, error, warn};
use std::{
collections::hash_map::Entry, collections::HashMap, marker::PhantomData, sync::Arc,
time::Duration,
};
use types::{data_column_sidecar::ColumnIndex, ChainSpec, DataColumnSidecar, Hash256};
pub type SamplingResult = Result<(), SamplingError>;
type DataColumnSidecarList<E> = Vec<Arc<DataColumnSidecar<E>>>;
pub struct Sampling<T: BeaconChainTypes> {
// TODO(das): stalled sampling request are never cleaned up
requests: HashMap<SamplingRequester, ActiveSamplingRequest<T>>,
sampling_config: SamplingConfig,
log: slog::Logger,
}
impl<T: BeaconChainTypes> Sampling<T> {
pub fn new(sampling_config: SamplingConfig, log: slog::Logger) -> Self {
Self {
requests: <_>::default(),
sampling_config,
log,
}
}
#[cfg(test)]
pub fn active_sampling_requests(&self) -> Vec<Hash256> {
self.requests.values().map(|r| r.block_root).collect()
}
/// Create a new sampling request for a known block
///
/// ### Returns
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
pub fn on_new_sample_request(
&mut self,
block_root: Hash256,
cx: &mut SyncNetworkContext<T>,
) -> Option<(SamplingRequester, SamplingResult)> {
let id = SamplingRequester::ImportedBlock(block_root);
let request = match self.requests.entry(id) {
Entry::Vacant(e) => e.insert(ActiveSamplingRequest::new(
block_root,
id,
&self.sampling_config,
self.log.clone(),
&cx.chain.spec,
)),
Entry::Occupied(_) => {
// Sampling is triggered from multiple sources, duplicate sampling requests are
// likely (gossip block + gossip data column)
// TODO(das): Should track failed sampling request for some time? Otherwise there's
// a risk of a loop with multiple triggers creating the request, then failing,
// and repeat.
debug!(self.log, "Ignoring duplicate sampling request"; "id" => ?id);
return None;
}
};
debug!(self.log, "Created new sample request"; "id" => ?id);
// TOOD(das): If a node has very little peers, continue_sampling() will attempt to find enough
// to sample here, immediately failing the sampling request. There should be some grace
// period to allow the peer manager to find custody peers.
let result = request.continue_sampling(cx);
self.handle_sampling_result(result, &id)
}
/// Insert a downloaded column into an active sampling request. Then make progress on the
/// entire request.
///
/// ### Returns
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
pub fn on_sample_downloaded(
&mut self,
id: SamplingId,
peer_id: PeerId,
resp: Result<(DataColumnSidecarList<T::EthSpec>, Duration), RpcResponseError>,
cx: &mut SyncNetworkContext<T>,
) -> Option<(SamplingRequester, SamplingResult)> {
let Some(request) = self.requests.get_mut(&id.id) else {
// TOOD(das): This log can happen if the request is error'ed early and dropped
debug!(self.log, "Sample downloaded event for unknown request"; "id" => ?id);
return None;
};
let result = request.on_sample_downloaded(peer_id, id.sampling_request_id, resp, cx);
self.handle_sampling_result(result, &id.id)
}
/// Insert a downloaded column into an active sampling request. Then make progress on the
/// entire request.
///
/// ### Returns
///
/// - `Some`: Request completed, won't make more progress. Expect requester to act on the result.
/// - `None`: Request still active, requester should do no action
pub fn on_sample_verified(
&mut self,
id: SamplingId,
result: Result<(), String>,
cx: &mut SyncNetworkContext<T>,
) -> Option<(SamplingRequester, SamplingResult)> {
let Some(request) = self.requests.get_mut(&id.id) else {
// TOOD(das): This log can happen if the request is error'ed early and dropped
debug!(self.log, "Sample verified event for unknown request"; "id" => ?id);
return None;
};
let result = request.on_sample_verified(id.sampling_request_id, result, cx);
self.handle_sampling_result(result, &id.id)
}
/// Converts a result from the internal format of `ActiveSamplingRequest` (error first to use ?
/// conveniently), to an Option first format to use an `if let Some() { act on result }` pattern
/// in the sync manager.
fn handle_sampling_result(
&mut self,
result: Result<Option<()>, SamplingError>,
id: &SamplingRequester,
) -> Option<(SamplingRequester, SamplingResult)> {
let result = result.transpose();
if let Some(result) = result {
debug!(self.log, "Sampling request completed, removing"; "id" => ?id, "result" => ?result);
metrics::inc_counter_vec(
&metrics::SAMPLING_REQUEST_RESULT,
&[metrics::from_result(&result)],
);
self.requests.remove(id);
Some((*id, result))
} else {
None
}
}
}
pub struct ActiveSamplingRequest<T: BeaconChainTypes> {
block_root: Hash256,
requester_id: SamplingRequester,
column_requests: FnvHashMap<ColumnIndex, ActiveColumnSampleRequest>,
/// Mapping of column indexes for a sampling request.
column_indexes_by_sampling_request: FnvHashMap<SamplingRequestId, Vec<ColumnIndex>>,
/// Sequential ID for sampling requests.
current_sampling_request_id: SamplingRequestId,
column_shuffle: Vec<ColumnIndex>,
required_successes: Vec<usize>,
/// Logger for the `SyncNetworkContext`.
pub log: slog::Logger,
_phantom: PhantomData<T>,
}
#[derive(Debug)]
pub enum SamplingError {
SendFailed(#[allow(dead_code)] &'static str),
ProcessorUnavailable,
TooManyFailures,
BadState(#[allow(dead_code)] String),
ColumnIndexOutOfBounds,
}
/// Required success index by current failures, with p_target=5.00E-06
/// Ref: https://colab.research.google.com/drive/18uUgT2i-m3CbzQ5TyP9XFKqTn1DImUJD#scrollTo=E82ITcgB5ATh
const REQUIRED_SUCCESSES: [usize; 11] = [16, 20, 23, 26, 29, 32, 34, 37, 39, 42, 44];
#[derive(Debug, Clone)]
pub enum SamplingConfig {
Default,
#[allow(dead_code)]
Custom {
required_successes: Vec<usize>,
},
}
impl<T: BeaconChainTypes> ActiveSamplingRequest<T> {
fn new(
block_root: Hash256,
requester_id: SamplingRequester,
sampling_config: &SamplingConfig,
log: slog::Logger,
spec: &ChainSpec,
) -> Self {
// Select ahead of time the full list of to-sample columns
let mut column_shuffle =
(0..spec.number_of_columns as ColumnIndex).collect::<Vec<ColumnIndex>>();
let mut rng = thread_rng();
column_shuffle.shuffle(&mut rng);
Self {
block_root,
requester_id,
column_requests: <_>::default(),
column_indexes_by_sampling_request: <_>::default(),
current_sampling_request_id: SamplingRequestId(0),
column_shuffle,
required_successes: match sampling_config {
SamplingConfig::Default => REQUIRED_SUCCESSES.to_vec(),
SamplingConfig::Custom { required_successes } => required_successes.clone(),
},
log,
_phantom: PhantomData,
}
}
/// Insert a downloaded column into an active sampling request. Then make progress on the
/// entire request.
///
/// ### Returns
///
/// - `Err`: Sampling request has failed and will be dropped
/// - `Ok(Some)`: Sampling request has successfully completed and will be dropped
/// - `Ok(None)`: Sampling request still active
pub(crate) fn on_sample_downloaded(
&mut self,
_peer_id: PeerId,
sampling_request_id: SamplingRequestId,
resp: Result<(DataColumnSidecarList<T::EthSpec>, Duration), RpcResponseError>,
cx: &mut SyncNetworkContext<T>,
) -> Result<Option<()>, SamplingError> {
// Select columns to sample
// Create individual request per column
// Progress requests
// If request fails retry or expand search
// If all good return
let Some(column_indexes) = self
.column_indexes_by_sampling_request
.get(&sampling_request_id)
else {
error!(self.log,
"Column indexes for the sampling request ID not found";
"sampling_request_id" => ?sampling_request_id
);
return Ok(None);
};
match resp {
Ok((mut resp_data_columns, seen_timestamp)) => {
debug!(self.log,
"Sample download success";
"block_root" => %self.block_root,
"column_indexes" => ?column_indexes,
"count" => resp_data_columns.len()
);
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::SUCCESS]);
// Filter the data received in the response using the requested column indexes.
let mut data_columns = vec![];
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(self.log,
"Active column sample request not found";
"block_root" => %self.block_root,
"column_index" => column_index
);
continue;
};
let Some(data_pos) = resp_data_columns
.iter()
.position(|data| &data.index == column_index)
else {
// Peer does not have the requested data.
// TODO(das) what to do?
debug!(self.log,
"Sampling peer claims to not have the data";
"block_root" => %self.block_root,
"column_index" => column_index
);
request.on_sampling_error()?;
continue;
};
data_columns.push(resp_data_columns.swap_remove(data_pos));
}
if !resp_data_columns.is_empty() {
let resp_column_indexes = resp_data_columns
.iter()
.map(|d| d.index)
.collect::<Vec<_>>();
debug!(self.log,
"Received data that was not requested";
"block_root" => %self.block_root,
"column_indexes" => ?resp_column_indexes
);
}
// Handle the downloaded data columns.
if data_columns.is_empty() {
debug!(self.log, "Received empty response"; "block_root" => %self.block_root);
self.column_indexes_by_sampling_request
.remove(&sampling_request_id);
} else {
// Overwrite `column_indexes` with the column indexes received in the response.
let column_indexes = data_columns.iter().map(|d| d.index).collect::<Vec<_>>();
self.column_indexes_by_sampling_request
.insert(sampling_request_id, column_indexes.clone());
// Peer has data column, send to verify
let Some(beacon_processor) = cx.beacon_processor_if_enabled() else {
// If processor is not available, error the entire sampling
debug!(self.log,
"Dropping sampling";
"block" => %self.block_root,
"reason" => "beacon processor unavailable"
);
return Err(SamplingError::ProcessorUnavailable);
};
debug!(self.log,
"Sending data_column for verification";
"block" => ?self.block_root,
"column_indexes" => ?column_indexes
);
if let Err(e) = beacon_processor.send_rpc_validate_data_columns(
self.block_root,
data_columns,
seen_timestamp,
SamplingId {
id: self.requester_id,
sampling_request_id,
},
) {
// TODO(das): Beacon processor is overloaded, what should we do?
error!(self.log,
"Dropping sampling";
"block" => %self.block_root,
"reason" => e.to_string()
);
return Err(SamplingError::SendFailed("beacon processor send failure"));
}
}
}
Err(err) => {
debug!(self.log, "Sample download error";
"block_root" => %self.block_root,
"column_indexes" => ?column_indexes,
"error" => ?err
);
metrics::inc_counter_vec(&metrics::SAMPLE_DOWNLOAD_RESULT, &[metrics::FAILURE]);
// Error downloading, maybe penalize peer and retry again.
// TODO(das) with different peer or different peer?
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(self.log,
"Active column sample request not found";
"block_root" => %self.block_root,
"column_index" => column_index
);
continue;
};
request.on_sampling_error()?;
}
}
};
self.continue_sampling(cx)
}
/// Insert a column verification result into an active sampling request. Then make progress
/// on the entire request.
///
/// ### Returns
///
/// - `Err`: Sampling request has failed and will be dropped
/// - `Ok(Some)`: Sampling request has successfully completed and will be dropped
/// - `Ok(None)`: Sampling request still active
pub(crate) fn on_sample_verified(
&mut self,
sampling_request_id: SamplingRequestId,
result: Result<(), String>,
cx: &mut SyncNetworkContext<T>,
) -> Result<Option<()>, SamplingError> {
let Some(column_indexes) = self
.column_indexes_by_sampling_request
.get(&sampling_request_id)
else {
error!(self.log, "Column indexes for the sampling request ID not found"; "sampling_request_id" => ?sampling_request_id);
return Ok(None);
};
match result {
Ok(_) => {
debug!(self.log, "Sample verification success"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes);
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::SUCCESS]);
// Valid, continue_sampling will maybe consider sampling succees
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(
self.log,
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
);
continue;
};
request.on_sampling_success()?;
}
}
Err(err) => {
debug!(self.log, "Sample verification failure"; "block_root" => %self.block_root, "column_indexes" => ?column_indexes, "reason" => ?err);
metrics::inc_counter_vec(&metrics::SAMPLE_VERIFY_RESULT, &[metrics::FAILURE]);
// TODO(das): Peer sent invalid data, penalize and try again from different peer
// TODO(das): Count individual failures
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(column_index) else {
warn!(
self.log,
"Active column sample request not found"; "block_root" => %self.block_root, "column_index" => column_index
);
continue;
};
let peer_id = request.on_sampling_error()?;
cx.report_peer(
peer_id,
PeerAction::LowToleranceError,
"invalid data column",
);
}
}
}
self.continue_sampling(cx)
}
pub(crate) fn continue_sampling(
&mut self,
cx: &mut SyncNetworkContext<T>,
) -> Result<Option<()>, SamplingError> {
// First check if sampling is completed, by computing `required_successes`
let mut successes = 0;
let mut failures = 0;
let mut ongoings = 0;
for request in self.column_requests.values() {
if request.is_completed() {
successes += 1;
}
if request.is_failed() {
failures += 1;
}
if request.is_ongoing() {
ongoings += 1;
}
}
// If there are too many failures, consider the sampling failed
let Some(required_successes) = self.required_successes.get(failures) else {
return Err(SamplingError::TooManyFailures);
};
// If there are enough successes, consider the sampling complete
if successes >= *required_successes {
return Ok(Some(()));
}
// First, attempt to progress sampling by requesting more columns, so that request failures
// are accounted for below.
// Group the requested column indexes by the destination peer to batch sampling requests.
let mut column_indexes_to_request = FnvHashMap::default();
for idx in 0..*required_successes {
// Re-request columns. Note: out of bounds error should never happen, inputs are hardcoded
let column_index = *self
.column_shuffle
.get(idx)
.ok_or(SamplingError::ColumnIndexOutOfBounds)?;
let request = self
.column_requests
.entry(column_index)
.or_insert(ActiveColumnSampleRequest::new(column_index));
if request.is_ready_to_request() {
if let Some(peer_id) = request.choose_peer(cx) {
let indexes = column_indexes_to_request.entry(peer_id).or_insert(vec![]);
indexes.push(column_index);
}
}
}
// Send requests.
let mut sent_request = false;
for (peer_id, column_indexes) in column_indexes_to_request {
cx.data_column_lookup_request(
DataColumnsByRootRequester::Sampling(SamplingId {
id: self.requester_id,
sampling_request_id: self.current_sampling_request_id,
}),
peer_id,
DataColumnsByRootSingleBlockRequest {
block_root: self.block_root,
indices: column_indexes.clone(),
},
)
.map_err(SamplingError::SendFailed)?;
self.column_indexes_by_sampling_request
.insert(self.current_sampling_request_id, column_indexes.clone());
self.current_sampling_request_id.0 += 1;
sent_request = true;
// Update request status.
for column_index in column_indexes {
let Some(request) = self.column_requests.get_mut(&column_index) else {
continue;
};
request.on_start_sampling(peer_id)?;
}
}
// Make sure that sampling doesn't stall, by ensuring that this sampling request will
// receive a new event of some type. If there are no ongoing requests, and no new
// request was sent, loop to increase the required_successes until the sampling fails if
// there are no peers.
if ongoings == 0 && !sent_request {
debug!(self.log, "Sampling request stalled"; "block_root" => %self.block_root);
}
Ok(None)
}
}
mod request {
use super::SamplingError;
use crate::sync::network_context::SyncNetworkContext;
use beacon_chain::BeaconChainTypes;
use lighthouse_network::PeerId;
use rand::seq::SliceRandom;
use rand::thread_rng;
use std::collections::HashSet;
use types::data_column_sidecar::ColumnIndex;
pub(crate) struct ActiveColumnSampleRequest {
column_index: ColumnIndex,
status: Status,
// TODO(das): Should downscore peers that claim to not have the sample?
peers_dont_have: HashSet<PeerId>,
}
#[derive(Debug, Clone)]
enum Status {
NoPeers,
NotStarted,
Sampling(PeerId),
Verified,
}
impl ActiveColumnSampleRequest {
pub(crate) fn new(column_index: ColumnIndex) -> Self {
Self {
column_index,
status: Status::NotStarted,
peers_dont_have: <_>::default(),
}
}
pub(crate) fn is_completed(&self) -> bool {
match self.status {
Status::NoPeers | Status::NotStarted | Status::Sampling(_) => false,
Status::Verified => true,
}
}
pub(crate) fn is_failed(&self) -> bool {
match self.status {
Status::NotStarted | Status::Sampling(_) | Status::Verified => false,
Status::NoPeers => true,
}
}
pub(crate) fn is_ongoing(&self) -> bool {
match self.status {
Status::NotStarted | Status::NoPeers | Status::Verified => false,
Status::Sampling(_) => true,
}
}
pub(crate) fn is_ready_to_request(&self) -> bool {
match self.status {
Status::NoPeers | Status::NotStarted => true,
Status::Sampling(_) | Status::Verified => false,
}
}
pub(crate) fn choose_peer<T: BeaconChainTypes>(
&mut self,
cx: &SyncNetworkContext<T>,
) -> Option<PeerId> {
// TODO: When is a fork and only a subset of your peers know about a block, sampling should only
// be queried on the peers on that fork. Should this case be handled? How to handle it?
let mut peer_ids = cx.get_custodial_peers(self.column_index);
peer_ids.retain(|peer_id| !self.peers_dont_have.contains(peer_id));
if let Some(peer_id) = peer_ids.choose(&mut thread_rng()) {
Some(*peer_id)
} else {
self.status = Status::NoPeers;
None
}
}
pub(crate) fn on_start_sampling(&mut self, peer_id: PeerId) -> Result<(), SamplingError> {
match self.status.clone() {
Status::NoPeers | Status::NotStarted => {
self.status = Status::Sampling(peer_id);
Ok(())
}
other => Err(SamplingError::BadState(format!(
"bad state on_start_sampling expected NoPeers|NotStarted got {other:?}. column_index:{}",
self.column_index
))),
}
}
pub(crate) fn on_sampling_error(&mut self) -> Result<PeerId, SamplingError> {
match self.status.clone() {
Status::Sampling(peer_id) => {
self.peers_dont_have.insert(peer_id);
self.status = Status::NotStarted;
Ok(peer_id)
}
other => Err(SamplingError::BadState(format!(
"bad state on_sampling_error expected Sampling got {other:?}. column_index:{}",
self.column_index
))),
}
}
pub(crate) fn on_sampling_success(&mut self) -> Result<(), SamplingError> {
match &self.status {
Status::Sampling(_) => {
self.status = Status::Verified;
Ok(())
}
other => Err(SamplingError::BadState(format!(
"bad state on_sampling_success expected Sampling got {other:?}. column_index:{}",
self.column_index
))),
}
}
}
}