Errors for all RPC Requests (#5867)

* Return an error if peer has disconnected

* Report errors for rate limited requests

* Code improvement

* Bump rust version to 1.78

* Downgrade to 1.77

* Update beacon_node/lighthouse_network/src/service/mod.rs

Co-authored-by: João Oliveira <hello@jxs.pt>

* fix fmt

* Merge branch 'unstable' of https://github.com/sigp/lighthouse into rpc-peer-disconnect-error

* update lockfile
This commit is contained in:
Age Manning
2024-06-07 10:12:45 +10:00
committed by GitHub
parent 7a7fc82cbd
commit 7b48b0b4a7
13 changed files with 452 additions and 412 deletions

View File

@@ -7,7 +7,7 @@ edition = { workspace = true }
[dependencies]
discv5 = { workspace = true }
gossipsub = { workspace = true }
unsigned-varint = { version = "0.6", features = ["codec"] }
unsigned-varint = { version = "0.8", features = ["codec"] }
ssz_types = { workspace = true }
types = { workspace = true }
serde = { workspace = true }

View File

@@ -15,7 +15,7 @@ use libp2p::swarm::handler::{
FullyNegotiatedInbound, FullyNegotiatedOutbound, StreamUpgradeError, SubstreamProtocol,
};
use libp2p::swarm::Stream;
use slog::{crit, debug, trace, warn};
use slog::{crit, debug, trace};
use smallvec::SmallVec;
use std::{
collections::{hash_map::Entry, VecDeque},
@@ -414,70 +414,44 @@ where
}
// purge expired inbound substreams and send an error
loop {
match self.inbound_substreams_delay.poll_expired(cx) {
Poll::Ready(Some(Ok(inbound_id))) => {
// handle a stream timeout for various states
if let Some(info) = self.inbound_substreams.get_mut(inbound_id.get_ref()) {
// the delay has been removed
info.delay_key = None;
self.events_out.push(HandlerEvent::Err(HandlerErr::Inbound {
error: RPCError::StreamTimeout,
proto: info.protocol,
id: *inbound_id.get_ref(),
}));
if info.pending_items.back().map(|l| l.close_after()) == Some(false) {
// if the last chunk does not close the stream, append an error
info.pending_items.push_back(RPCCodedResponse::Error(
RPCResponseErrorCode::ServerError,
"Request timed out".into(),
));
}
}
}
Poll::Ready(Some(Err(e))) => {
warn!(self.log, "Inbound substream poll failed"; "error" => ?e);
// drops the peer if we cannot read the delay queue
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(
HandlerEvent::Close(RPCError::InternalError(
"Could not poll inbound stream timer",
)),
while let Poll::Ready(Some(inbound_id)) = self.inbound_substreams_delay.poll_expired(cx) {
// handle a stream timeout for various states
if let Some(info) = self.inbound_substreams.get_mut(inbound_id.get_ref()) {
// the delay has been removed
info.delay_key = None;
self.events_out.push(HandlerEvent::Err(HandlerErr::Inbound {
error: RPCError::StreamTimeout,
proto: info.protocol,
id: *inbound_id.get_ref(),
}));
if info.pending_items.back().map(|l| l.close_after()) == Some(false) {
// if the last chunk does not close the stream, append an error
info.pending_items.push_back(RPCCodedResponse::Error(
RPCResponseErrorCode::ServerError,
"Request timed out".into(),
));
}
Poll::Pending | Poll::Ready(None) => break,
}
}
// purge expired outbound substreams
loop {
match self.outbound_substreams_delay.poll_expired(cx) {
Poll::Ready(Some(Ok(outbound_id))) => {
if let Some(OutboundInfo { proto, req_id, .. }) =
self.outbound_substreams.remove(outbound_id.get_ref())
{
let outbound_err = HandlerErr::Outbound {
id: req_id,
proto,
error: RPCError::StreamTimeout,
};
// notify the user
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(
HandlerEvent::Err(outbound_err),
));
} else {
crit!(self.log, "timed out substream not in the books"; "stream_id" => outbound_id.get_ref());
}
}
Poll::Ready(Some(Err(e))) => {
warn!(self.log, "Outbound substream poll failed"; "error" => ?e);
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(
HandlerEvent::Close(RPCError::InternalError(
"Could not poll outbound stream timer",
)),
));
}
Poll::Pending | Poll::Ready(None) => break,
while let Poll::Ready(Some(outbound_id)) = self.outbound_substreams_delay.poll_expired(cx) {
if let Some(OutboundInfo { proto, req_id, .. }) =
self.outbound_substreams.remove(outbound_id.get_ref())
{
let outbound_err = HandlerErr::Outbound {
id: req_id,
proto,
error: RPCError::StreamTimeout,
};
// notify the user
return Poll::Ready(ConnectionHandlerEvent::NotifyBehaviour(HandlerEvent::Err(
outbound_err,
)));
} else {
crit!(self.log, "timed out substream not in the books"; "stream_id" => outbound_id.get_ref());
}
}

View File

@@ -10,7 +10,7 @@ use libp2p::swarm::{
handler::ConnectionHandler, CloseConnection, ConnectionId, NetworkBehaviour, NotifyHandler,
ToSwarm,
};
use libp2p::swarm::{FromSwarm, SubstreamProtocol, THandlerInEvent};
use libp2p::swarm::{ConnectionClosed, FromSwarm, SubstreamProtocol, THandlerInEvent};
use libp2p::PeerId;
use rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr};
use slog::{crit, debug, o};
@@ -283,9 +283,40 @@ where
Ok(handler)
}
fn on_swarm_event(&mut self, _event: FromSwarm) {
fn on_swarm_event(&mut self, event: FromSwarm) {
// NOTE: FromSwarm is a non exhaustive enum so updates should be based on release notes more
// than compiler feedback
// The self rate limiter holds on to requests and attempts to process them within our rate
// limits. If a peer disconnects whilst we are self-rate limiting, we want to terminate any
// pending requests and return an error response to the application.
if let FromSwarm::ConnectionClosed(ConnectionClosed {
peer_id,
remaining_established,
connection_id,
..
}) = event
{
// If there are still connections remaining, do nothing.
if remaining_established > 0 {
return;
}
// Get a list of pending requests from the self rate limiter
if let Some(limiter) = self.self_limiter.as_mut() {
for (id, proto) in limiter.peer_disconnected(peer_id) {
let error_msg = ToSwarm::GenerateEvent(RPCMessage {
peer_id,
conn_id: connection_id,
event: HandlerEvent::Err(HandlerErr::Outbound {
id,
proto,
error: RPCError::Disconnected,
}),
});
self.events.push(error_msg);
}
}
}
}
fn on_connection_handler_event(

View File

@@ -158,13 +158,39 @@ impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
entry.remove();
}
}
// NOTE: There can be entries that have been removed due to peer disconnections, we simply
// ignore these messages here.
}
/// Informs the limiter that a peer has disconnected. This removes any pending requests
/// queued for that peer and returns the `(request id, protocol)` pair of each removed
/// request so the caller can surface an error (e.g. `RPCError::Disconnected`) for them.
pub fn peer_disconnected(&mut self, peer_id: PeerId) -> Vec<(Id, Protocol)> {
    // It's not ideal to iterate this map, but the key is (PeerId, Protocol) and this map
    // should never really be large. So we iterate for simplicity
    let mut failed_requests = Vec::new();
    self.delayed_requests
        .retain(|(map_peer_id, protocol), queue| {
            if map_peer_id == &peer_id {
                // NOTE: Currently cannot remove entries from the DelayQueue, we will just let
                // them expire and ignore them.
                // Collect the ids of every queued request for this peer so they can be
                // reported as failed to the application layer.
                for message in queue {
                    failed_requests.push((message.request_id, *protocol))
                }
                // Remove the entry
                false
            } else {
                // Keep the entry
                true
            }
        });
    failed_requests
}
pub fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<BehaviourAction<Id, E>> {
// First check the requests that were self rate limited, since those might add events to
// the queue. Also do this before rate limiter pruning to avoid removing and
// immediately adding rate limiting keys.
if let Poll::Ready(Some(Ok(expired))) = self.next_peer_request.poll_expired(cx) {
if let Poll::Ready(Some(expired)) = self.next_peer_request.poll_expired(cx) {
let (peer_id, protocol) = expired.into_inner();
self.next_peer_request_ready(peer_id, protocol);
}

View File

@@ -240,7 +240,7 @@ impl futures::stream::Stream for GossipCache {
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match self.expirations.poll_expired(cx) {
Poll::Ready(Some(Ok(expired))) => {
Poll::Ready(Some(expired)) => {
let expected_key = expired.key();
let (topic, data) = expired.into_inner();
match self.topic_msgs.get_mut(&topic) {
@@ -259,7 +259,6 @@ impl futures::stream::Stream for GossipCache {
}
Poll::Ready(Some(Ok(topic)))
}
Poll::Ready(Some(Err(x))) => Poll::Ready(Some(Err(x.to_string()))),
Poll::Ready(None) => Poll::Ready(None),
Poll::Pending => Poll::Pending,
}

View File

@@ -917,12 +917,23 @@ impl<AppReqId: ReqId, E: EthSpec> Network<AppReqId, E> {
/* Eth2 RPC behaviour functions */
/// Send a request to a peer over RPC.
pub fn send_request(&mut self, peer_id: PeerId, request_id: AppReqId, request: Request) {
pub fn send_request(
&mut self,
peer_id: PeerId,
request_id: AppReqId,
request: Request,
) -> Result<(), (AppReqId, RPCError)> {
// Check if the peer is connected before sending an RPC request
if !self.swarm.is_connected(&peer_id) {
return Err((request_id, RPCError::Disconnected));
}
self.eth2_rpc_mut().send_request(
peer_id,
RequestId::Application(request_id),
request.into(),
)
);
Ok(())
}
/// Send a successful response to a peer over RPC.

View File

@@ -98,7 +98,9 @@ fn test_tcp_status_rpc() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, 10, rpc_request.clone());
sender
.send_request(peer_id, 10, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -202,7 +204,9 @@ fn test_tcp_blocks_by_range_chunked_rpc() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, request_id, rpc_request.clone());
sender
.send_request(peer_id, request_id, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -327,7 +331,9 @@ fn test_blobs_by_range_chunked_rpc() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, request_id, rpc_request.clone());
sender
.send_request(peer_id, request_id, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -435,7 +441,9 @@ fn test_tcp_blocks_by_range_over_limit() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, request_id, rpc_request.clone());
sender
.send_request(peer_id, request_id, rpc_request.clone())
.unwrap();
}
// The request will fail because the sender will refuse to send anything > MAX_RPC_SIZE
NetworkEvent::RPCFailed { id, .. } => {
@@ -528,7 +536,9 @@ fn test_tcp_blocks_by_range_chunked_rpc_terminates_correctly() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, request_id, rpc_request.clone());
sender
.send_request(peer_id, request_id, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -657,7 +667,9 @@ fn test_tcp_blocks_by_range_single_empty_rpc() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, 10, rpc_request.clone());
sender
.send_request(peer_id, 10, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -780,7 +792,9 @@ fn test_tcp_blocks_by_root_chunked_rpc() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, 6, rpc_request.clone());
sender
.send_request(peer_id, 6, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -911,7 +925,9 @@ fn test_tcp_blocks_by_root_chunked_rpc_terminates_correctly() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, 10, rpc_request.clone());
sender
.send_request(peer_id, 10, rpc_request.clone())
.unwrap();
}
NetworkEvent::ResponseReceived {
peer_id: _,
@@ -1031,7 +1047,9 @@ fn test_disconnect_triggers_rpc_error() {
NetworkEvent::PeerConnectedOutgoing(peer_id) => {
// Send a STATUS message
debug!(log, "Sending RPC");
sender.send_request(peer_id, 42, rpc_request.clone());
sender
.send_request(peer_id, 42, rpc_request.clone())
.unwrap();
}
NetworkEvent::RPCFailed { error, id: 42, .. } => match error {
RPCError::Disconnected => return,