add retry logic to peer discovery and an expiration time for peers (#1203)

* add retry logic to peer discovery and an expiration time for peers

* Restructure discovery

* Add mac build to CI

* Always return an error for Health when not linux

* Change macos workflow

* Rename macos tests

* Update DiscoverPeers messages to pass Instants. Implement PartialEq for AttServiceMessage

* update discover peer queueing to always check existing messages and extend min_ttl as necessary

* update method name and comment

* Correct merge issues

* Add subnet id check to partialeq, fix discover peer message dups

* fix discover peer message dups

* fix discover peer message dups for real this time

Co-authored-by: Age Manning <Age@AgeManning.com>
Co-authored-by: Paul Hauner <paul@paulhauner.com>
This commit is contained in:
realbigsean
2020-06-05 00:55:03 -04:00
committed by GitHub
parent 0e37a16927
commit 036096ef61
10 changed files with 442 additions and 141 deletions

View File

@@ -25,6 +25,7 @@ use std::{
marker::PhantomData,
sync::Arc,
task::{Context, Poll},
time::Instant,
};
use types::{EnrForkId, EthSpec, SignedBeaconBlock, SubnetId};
@@ -459,9 +460,10 @@ impl<TSpec: EthSpec> Behaviour<TSpec> {
self.update_metadata();
}
/// A request to search for peers connected to a long-lived subnet.
pub fn peers_request(&mut self, subnet_id: SubnetId) {
self.discovery.peers_request(subnet_id);
/// Attempts to discover new peers for a given subnet. The `min_ttl` gives the time at which we
/// would like to retain the peers for.
pub fn discover_subnet_peers(&mut self, subnet_id: SubnetId, min_ttl: Option<Instant>) {
self.discovery.discover_subnet_peers(subnet_id, min_ttl)
}
/// Updates the local ENR's "eth2" field with the latest EnrForkId.

View File

@@ -8,7 +8,7 @@ pub use enr_ext::{CombinedKeyExt, EnrExt};
use crate::metrics;
use crate::{error, Enr, NetworkConfig, NetworkGlobals};
use discv5::{enr::NodeId, Discv5, Discv5Event};
use discv5::{enr::NodeId, Discv5, Discv5Event, QueryId};
use enr::{Eth2Enr, BITFIELD_ENR_KEY, ETH2_ENR_KEY};
use futures::prelude::*;
use libp2p::core::{connection::ConnectionId, Multiaddr, PeerId};
@@ -18,20 +18,24 @@ use libp2p::swarm::{
NetworkBehaviourAction, PollParameters, ProtocolsHandler,
};
use lru::LruCache;
use slog::{crit, debug, info, warn};
use slog::{crit, debug, info, trace, warn};
use ssz::{Decode, Encode};
use ssz_types::BitVector;
use std::{
collections::{HashSet, VecDeque},
collections::{HashMap, HashSet, VecDeque},
net::SocketAddr,
path::Path,
sync::Arc,
task::{Context, Poll},
time::Duration,
time::{Duration, Instant},
};
use tokio::time::{delay_until, Delay, Instant};
use tokio::time::{delay_until, Delay};
use types::{EnrForkId, EthSpec, SubnetId};
mod subnet_predicate;
use subnet_predicate::subnet_predicate;
/// Maximum seconds before searching for extra peers.
const MAX_TIME_BETWEEN_PEER_SEARCHES: u64 = 120;
/// Initial delay between peer searches.
@@ -41,7 +45,18 @@ const MINIMUM_PEERS_BEFORE_DELAY_INCREASE: usize = 5;
/// Local ENR storage filename.
pub const ENR_FILENAME: &str = "enr.dat";
/// Number of peers we'd like to have connected to a given long-lived subnet.
const TARGET_SUBNET_PEERS: u64 = 3;
const TARGET_SUBNET_PEERS: usize = 3;
/// Number of times to attempt a discovery request
const MAX_DISCOVERY_RETRY: u64 = 3;
/// A struct representing the information associated with a single discovery request,
/// which can be retried with multiple queries
#[derive(Clone, Debug)]
pub struct Request {
pub query_id: Option<QueryId>,
pub min_ttl: Option<Instant>,
pub retries: u64,
}
/// Lighthouse discovery behaviour. This provides peer management and discovery using the Discv5
/// libp2p protocol.
@@ -79,6 +94,9 @@ pub struct Discovery<TSpec: EthSpec> {
/// A collection of network constants that can be read from other threads.
network_globals: Arc<NetworkGlobals<TSpec>>,
/// A mapping of SubnetId that we are currently searching for to all information associated with each request.
subnet_queries: HashMap<SubnetId, Request>,
/// Logger for the discovery behaviour.
log: slog::Logger,
}
@@ -139,11 +157,12 @@ impl<TSpec: EthSpec> Discovery<TSpec> {
cached_enrs: LruCache::new(50),
banned_peers: HashSet::new(),
max_peers: config.max_peers,
peer_discovery_delay: delay_until(Instant::now()),
peer_discovery_delay: delay_until(tokio::time::Instant::now()),
past_discovery_delay: INITIAL_SEARCH_DELAY,
tcp_port: config.libp2p_port,
discovery,
network_globals,
subnet_queries: HashMap::new(),
log,
enr_dir,
})
@@ -280,57 +299,93 @@ impl<TSpec: EthSpec> Discovery<TSpec> {
}
/// A request to find peers on a given subnet.
// TODO: This logic should be improved with added sophistication in peer management
// This currently checks for currently connected peers and if we don't have
// PEERS_WANTED_BEFORE_DISCOVERY connected to a given subnet we search for more.
pub fn peers_request(&mut self, subnet_id: SubnetId) {
pub fn discover_subnet_peers(&mut self, subnet_id: SubnetId, min_ttl: Option<Instant>) {
// TODO: Extend this to an event once discovery becomes a thread managed by the peer
// manager
if let Some(min_ttl) = min_ttl {
self.network_globals
.peers
.write()
.extend_peers_on_subnet(subnet_id, min_ttl);
}
// If there is already a discovery request in process for this subnet, ignore this request,
// but update the min_ttl.
if let Some(request) = self.subnet_queries.get_mut(&subnet_id) {
// update the min_ttl if required
if let Some(min_ttl) = min_ttl {
if request.min_ttl < Some(min_ttl) {
request.min_ttl = Some(min_ttl);
}
}
return;
}
// Insert a request and start a query for the subnet
self.subnet_queries.insert(
subnet_id.clone(),
Request {
query_id: None,
min_ttl,
retries: 0,
},
);
self.run_subnet_query(subnet_id);
}
/// Runs a discovery request for a given subnet_id if one already exists.
fn run_subnet_query(&mut self, subnet_id: SubnetId) {
let mut request = match self.subnet_queries.remove(&subnet_id) {
Some(v) => v,
None => return, // request doesn't exist
};
// increment the retry count
request.retries += 1;
let peers_on_subnet = self
.network_globals
.peers
.read()
.peers_on_subnet(subnet_id)
.count() as u64;
.count();
if peers_on_subnet < TARGET_SUBNET_PEERS {
let target_peers = TARGET_SUBNET_PEERS - peers_on_subnet;
debug!(self.log, "Searching for peers for subnet";
"subnet_id" => *subnet_id,
"connected_peers_on_subnet" => peers_on_subnet,
"target_subnet_peers" => TARGET_SUBNET_PEERS,
"peers_to_find" => target_peers
);
let log_clone = self.log.clone();
let subnet_predicate = move |enr: &Enr| {
if let Some(bitfield_bytes) = enr.get(BITFIELD_ENR_KEY) {
let bitfield = match BitVector::<TSpec::SubnetBitfieldLength>::from_ssz_bytes(
bitfield_bytes,
) {
Ok(v) => v,
Err(e) => {
warn!(log_clone, "Could not decode ENR bitfield for peer"; "peer_id" => format!("{}", enr.peer_id()), "error" => format!("{:?}", e));
return false;
}
};
return bitfield.get(*subnet_id as usize).unwrap_or_else(|_| {
debug!(log_clone, "Peer found but not on desired subnet"; "peer_id" => format!("{}", enr.peer_id()));
false
});
}
false
};
// start the query
self.start_query(subnet_predicate, target_peers as usize);
} else {
debug!(self.log, "Discovery ignored";
if peers_on_subnet > TARGET_SUBNET_PEERS {
trace!(self.log, "Discovery ignored";
"reason" => "Already connected to desired peers",
"connected_peers_on_subnet" => peers_on_subnet,
"target_subnet_peers" => TARGET_SUBNET_PEERS,
);
return;
}
// remove the entry and complete the query if greater than the maximum search count
if request.retries >= MAX_DISCOVERY_RETRY {
debug!(
self.log,
"Subnet peer discovery did not find sufficient peers. Reached max retry limit"
);
return;
}
let target_peers = TARGET_SUBNET_PEERS - peers_on_subnet;
debug!(self.log, "Searching for peers for subnet";
"subnet_id" => *subnet_id,
"connected_peers_on_subnet" => peers_on_subnet,
"target_subnet_peers" => TARGET_SUBNET_PEERS,
"peers_to_find" => target_peers,
"attempt" => request.retries,
);
// start the query, and update the queries map if necessary
let subnet_predicate = subnet_predicate::<TSpec>(subnet_id, &self.log);
if let Some(query_id) = self.start_query(subnet_predicate, target_peers) {
request.query_id = Some(query_id);
} else {
// ENR is not present remove the query
return;
}
self.subnet_queries.insert(subnet_id, request);
}
/* Internal Functions */
@@ -348,7 +403,7 @@ impl<TSpec: EthSpec> Discovery<TSpec> {
/// This can optionally search for peers for a given predicate. Regardless of the predicate
/// given, this will only search for peers on the same enr_fork_id as specified in the local
/// ENR.
fn start_query<F>(&mut self, enr_predicate: F, num_nodes: usize)
fn start_query<F>(&mut self, enr_predicate: F, num_nodes: usize) -> Option<QueryId>
where
F: Fn(&Enr) -> bool + Send + 'static + Clone,
{
@@ -359,18 +414,54 @@ impl<TSpec: EthSpec> Discovery<TSpec> {
Ok(v) => v,
Err(e) => {
crit!(self.log, "Local ENR has no fork id"; "error" => e);
return;
return None;
}
};
// predicate for finding nodes with a matching fork
let eth2_fork_predicate = move |enr: &Enr| {
enr.eth2().map(|enr| enr.fork_digest) == Ok(enr_fork_id.fork_digest.clone())
};
let eth2_fork_predicate = move |enr: &Enr| enr.eth2() == Ok(enr_fork_id.clone());
let predicate = move |enr: &Enr| eth2_fork_predicate(enr) && enr_predicate(enr);
// general predicate
self.discovery
.find_enr_predicate(random_node, predicate, num_nodes);
Some(
self.discovery
.find_enr_predicate(random_node, predicate, num_nodes),
)
}
/// Peers that are found during discovery are optionally dialed.
// TODO: Shift to peer manager. As its own service, discovery should spit out discovered nodes
// and the peer manager should decide about who to connect to.
fn dial_discovered_peers(&mut self, peers: Vec<Enr>, min_ttl: Option<Instant>) {
for enr in peers {
// cache known peers
let peer_id = enr.peer_id();
self.cached_enrs.put(enr.peer_id(), enr);
// if we need more peers, attempt a connection
if self.network_globals.connected_or_dialing_peers() < self.max_peers
&& !self
.network_globals
.peers
.read()
.is_connected_or_dialing(&peer_id)
&& !self.banned_peers.contains(&peer_id)
{
debug!(self.log, "Connecting to discovered peer"; "peer_id"=> peer_id.to_string());
// TODO: Update output
// This should be updated with the peer dialing. In fact created once the peer is
// dialed
if let Some(min_ttl) = min_ttl {
self.network_globals
.peers
.write()
.update_min_ttl(&peer_id, min_ttl);
}
self.events.push_back(NetworkBehaviourAction::DialPeer {
peer_id,
condition: DialPeerCondition::Disconnected,
});
}
}
}
}
@@ -440,7 +531,8 @@ impl<TSpec: EthSpec> NetworkBehaviour for Discovery<TSpec> {
}
// Set to maximum, and update to earlier, once we get our results back.
self.peer_discovery_delay.reset(
Instant::now() + Duration::from_secs(MAX_TIME_BETWEEN_PEER_SEARCHES),
tokio::time::Instant::now()
+ Duration::from_secs(MAX_TIME_BETWEEN_PEER_SEARCHES),
);
}
Poll::Pending => break,
@@ -477,7 +569,11 @@ impl<TSpec: EthSpec> NetworkBehaviour for Discovery<TSpec> {
address,
});
}
Discv5Event::FindNodeResult { closer_peers, .. } => {
Discv5Event::FindNodeResult {
closer_peers,
query_id,
..
} => {
debug!(self.log, "Discovery query completed"; "peers_found" => closer_peers.len());
// update the time to the next query
if self.past_discovery_delay < MAX_TIME_BETWEEN_PEER_SEARCHES
@@ -486,40 +582,30 @@ impl<TSpec: EthSpec> NetworkBehaviour for Discovery<TSpec> {
{
self.past_discovery_delay *= 2;
}
let delay = std::cmp::min(
let delay = std::cmp::max(
self.past_discovery_delay,
MAX_TIME_BETWEEN_PEER_SEARCHES,
);
self.peer_discovery_delay
.reset(Instant::now() + Duration::from_secs(delay));
.reset(tokio::time::Instant::now() + Duration::from_secs(delay));
for enr in closer_peers {
// cache known peers
let peer_id = enr.peer_id();
self.cached_enrs.put(enr.peer_id(), enr);
// if we need more peers, attempt a connection
if self.network_globals.connected_or_dialing_peers()
< self.max_peers
&& !self
.network_globals
.peers
.read()
.is_connected_or_dialing(&peer_id)
&& !self.banned_peers.contains(&peer_id)
{
// TODO: Debugging only
// NOTE: The peer manager will get updated by the global swarm.
let connection_status = self
.network_globals
.peers
.read()
.connection_status(&peer_id);
debug!(self.log, "Connecting to discovered peer"; "peer_id"=> peer_id.to_string(), "status" => format!("{:?}", connection_status));
self.events.push_back(NetworkBehaviourAction::DialPeer {
peer_id,
condition: DialPeerCondition::Disconnected,
});
// if this is a subnet query, run it to completion
if let Some((subnet_id, min_ttl)) = self
.subnet_queries
.iter()
.find(|(_, request)| request.query_id == Some(query_id))
.map(|(subnet_id, request)| {
(subnet_id.clone(), request.min_ttl.clone())
})
{
debug!(self.log, "Peer subnet discovery request completed"; "peers_found" => closer_peers.len(), "subnet_id" => *subnet_id);
self.dial_discovered_peers(closer_peers, min_ttl);
self.run_subnet_query(subnet_id);
} else {
if closer_peers.is_empty() {
debug!(self.log, "Peer Discovery request yielded no results.");
} else {
self.dial_discovered_peers(closer_peers, None);
}
}
}

View File

@@ -0,0 +1,33 @@
///! The subnet predicate used for searching for a particular subnet.
use super::*;
/// Returns the predicate for a given subnet.
pub fn subnet_predicate<TSpec>(
subnet_id: SubnetId,
log: &slog::Logger,
) -> impl Fn(&Enr) -> bool + Send + 'static + Clone
where
TSpec: EthSpec,
{
let log_clone = log.clone();
move |enr: &Enr| {
if let Some(bitfield_bytes) = enr.get(BITFIELD_ENR_KEY) {
let bitfield = match BitVector::<TSpec::SubnetBitfieldLength>::from_ssz_bytes(
bitfield_bytes,
) {
Ok(v) => v,
Err(e) => {
warn!(log_clone, "Could not decode ENR bitfield for peer"; "peer_id" => format!("{}", enr.peer_id()), "error" => format!("{:?}", e));
return false;
}
};
return bitfield.get(*subnet_id as usize).unwrap_or_else(|_| {
debug!(log_clone, "Peer found but not on desired subnet"; "peer_id" => format!("{}", enr.peer_id()));
false
});
}
false
}
}

View File

@@ -31,6 +31,10 @@ pub struct PeerInfo<T: EthSpec> {
/// The ENR subnet bitfield of the peer. This may be determined after it's initial
/// connection.
pub meta_data: Option<MetaData<T>>,
/// The time we would like to retain this peer. After this time, the peer is no longer
/// necessary.
#[serde(skip)]
pub min_ttl: Option<Instant>,
}
impl<TSpec: EthSpec> Default for PeerInfo<TSpec> {
@@ -43,6 +47,7 @@ impl<TSpec: EthSpec> Default for PeerInfo<TSpec> {
listening_addresses: vec![],
sync_status: PeerSyncStatus::Unknown,
meta_data: None,
min_ttl: None,
}
}
}

View File

@@ -2,7 +2,7 @@ use super::peer_info::{PeerConnectionStatus, PeerInfo};
use super::peer_sync_status::PeerSyncStatus;
use crate::rpc::methods::MetaData;
use crate::PeerId;
use slog::{crit, debug, warn};
use slog::{crit, debug, trace, warn};
use std::collections::{hash_map::Entry, HashMap};
use std::time::Instant;
use types::{EthSpec, SubnetId};
@@ -236,6 +236,42 @@ impl<TSpec: EthSpec> PeerDB<TSpec> {
debug!(self.log, "Peer dialing in db"; "peer_id" => peer_id.to_string(), "n_dc" => self.n_dc);
}
/// Update min ttl of a peer.
pub fn update_min_ttl(&mut self, peer_id: &PeerId, min_ttl: Instant) {
let info = self.peers.entry(peer_id.clone()).or_default();
// only update if the ttl is longer
if info.min_ttl.is_none() || Some(min_ttl) > info.min_ttl {
info.min_ttl = Some(min_ttl);
let min_ttl_secs = min_ttl
.checked_duration_since(Instant::now())
.map(|duration| duration.as_secs())
.unwrap_or_else(|| 0);
debug!(self.log, "Updating the time a peer is required for"; "peer_id" => peer_id.to_string(), "future_min_ttl_secs" => min_ttl_secs);
}
}
/// Extends the ttl of all peers on the given subnet that have a shorter
/// min_ttl than what's given.
pub fn extend_peers_on_subnet(&mut self, subnet_id: SubnetId, min_ttl: Instant) {
let log = &self.log;
self.peers.iter_mut()
.filter(move |(_, info)| {
info.connection_status.is_connected() && info.on_subnet(subnet_id)
})
.for_each(|(peer_id,info)| {
if info.min_ttl.is_none() || Some(min_ttl) > info.min_ttl {
info.min_ttl = Some(min_ttl);
}
let min_ttl_secs = min_ttl
.checked_duration_since(Instant::now())
.map(|duration| duration.as_secs())
.unwrap_or_else(|| 0);
trace!(log, "Updating minimum duration a peer is required for"; "peer_id" => peer_id.to_string(), "min_ttl" => min_ttl_secs);
});
}
/// Sets a peer as connected with an ingoing connection.
pub fn connect_ingoing(&mut self, peer_id: &PeerId) {
let info = self.peers.entry(peer_id.clone()).or_default();