Files
lighthouse/beacon_node/lighthouse_network/src/rpc/self_limiter.rs
Akihito Nakano 1324d3d3c4 Delayed RPC Send Using Tokens (#5923)
closes https://github.com/sigp/lighthouse/issues/5785


  The diagram below shows how the receiver (responder) behaves before and after this PR. The sections that follow describe each change in detail.

```mermaid
flowchart TD

subgraph "*** After ***"
Start2([START]) --> AA[Receive request]
AA --> COND1{Is there already an active request <br> with the same protocol?}
COND1 --> |Yes| CC[Send error response]
CC --> End2([END])
%% COND1 --> |No| COND2{Request is too large?}
%% COND2 --> |Yes| CC
COND1 --> |No| DD[Process request]
DD --> EE{Rate limit reached?}
EE --> |Yes| FF[Wait until tokens are regenerated]
FF --> EE
EE --> |No| GG[Send response]
GG --> End2
end

subgraph "*** Before ***"
Start([START]) --> A[Receive request]
A --> B{Rate limit reached <br> or <br> request is too large?}
B -->|Yes| C[Send error response]
C --> End([END])
B -->|No| E[Process request]
E --> F[Send response]
F --> End
end
```

### `Is there already an active request with the same protocol?`

This check is not performed in `Before`. It is taken from the consensus-specs PR linked below, which proposes updates to rate limiting and response timeouts.
https://github.com/ethereum/consensus-specs/pull/3767/files
> The requester MUST NOT make more than two concurrent requests with the same ID.

The spec PR only covers the requester side. In this PR, I introduced the `ActiveRequestsLimiter` on the `responder` side to prevent more than two requests from running simultaneously on the same protocol per peer. If the limiter disallows a request, the responder sends a rate-limited error and penalizes the requester.



### `Rate limit reached?` and `Wait until tokens are regenerated`

UPDATE: I moved the limiter logic to the behaviour side. https://github.com/sigp/lighthouse/pull/5923#issuecomment-2379535927

~~The rate limiter is shared between the behaviour and the handler (`Arc<Mutex<RateLimiter>>`). The handler checks the rate limit and queues the response if the limit is reached. The behaviour handles pruning.~~

~~I considered not sharing the rate limiter between the behaviour and the handler, and performing all of these either within the behaviour or handler. However, I decided against this for the following reasons:~~

- ~~Regarding performing everything within the behaviour: The behaviour is unable to recognize the response protocol when `RPC::send_response()` is called, especially when the response is `RPCCodedResponse::Error`. Therefore, the behaviour can't rate limit responses based on the response protocol.~~
- ~~Regarding performing everything within the handler: When multiple connections are established with a peer, there could be multiple handlers interacting with that peer. Thus, we cannot enforce rate limiting per peer solely within the handler. (Any ideas? 🤔 )~~
2025-04-24 03:46:16 +00:00

545 lines
20 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use super::{
config::OutboundRateLimiterConfig,
rate_limiter::{RPCRateLimiter as RateLimiter, RateLimitedErr},
BehaviourAction, Protocol, RPCSend, ReqId, RequestType, MAX_CONCURRENT_REQUESTS,
};
use crate::rpc::rate_limiter::RateLimiterItem;
use std::time::{SystemTime, UNIX_EPOCH};
use std::{
collections::{hash_map::Entry, HashMap, VecDeque},
sync::Arc,
task::{Context, Poll},
time::Duration,
};
use futures::FutureExt;
use libp2p::{swarm::NotifyHandler, PeerId};
use logging::crit;
use smallvec::SmallVec;
use tokio_util::time::DelayQueue;
use tracing::debug;
use types::{EthSpec, ForkContext};
/// A request that was rate limited or waiting on rate limited requests for the same peer and
/// protocol.
struct QueuedRequest<Id: ReqId, E: EthSpec> {
/// The outbound request that is being held back.
req: RequestType<E>,
/// The identifier that was passed in together with the request.
request_id: Id,
/// Time since the unix epoch (as returned by `timestamp_now`) at which the request entered the
/// queue. It is kept unchanged across retries and is used to measure how long the request
/// idled before being sent.
queued_at: Duration,
}
/// The number of milliseconds a request that was delayed due to the concurrent request limit
/// stays in the queue before the limiter tries to send it again.
const WAIT_TIME_DUE_TO_CONCURRENT_REQUESTS: u64 = 100;
/// Limits our own outbound requests, both by the number of concurrently active requests per peer
/// and protocol and, when configured, by a token-based rate limiter. Requests that cannot be
/// sent immediately are queued and retried once their delay expires.
#[allow(clippy::type_complexity)]
pub(crate) struct SelfRateLimiter<Id: ReqId, E: EthSpec> {
/// Active requests that are awaiting a response, counted per peer and protocol.
active_requests: HashMap<PeerId, HashMap<Protocol, usize>>,
/// Requests queued for sending per peer. These requests are stored when the self rate
/// limiter rejects them. Rate limiting is based on a Peer and Protocol basis, therefore
/// are stored in the same way.
delayed_requests: HashMap<(PeerId, Protocol), VecDeque<QueuedRequest<Id, E>>>,
/// The delay required to allow a peer's outbound request per protocol.
next_peer_request: DelayQueue<(PeerId, Protocol)>,
/// Rate limiter for our own requests. `None` when no outbound rate limiter configuration was
/// supplied, in which case only the concurrent request limit is enforced.
rate_limiter: Option<RateLimiter>,
/// Requests that are ready to be sent: the target peer, the event to emit and the time at
/// which the request was originally queued.
ready_requests: SmallVec<[(PeerId, RPCSend<Id, E>, Duration); 3]>,
}
/// Error returned when the rate limiter does not accept a request.
///
/// In both cases the limiter has kept the request in its internal queue and will retry it later.
// NOTE: this is currently not used, but might be useful for debugging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Error {
    /// There are queued requests for this same peer and protocol.
    PendingRequests,
    /// Request was tried but rate limited.
    RateLimited,
}
impl<Id: ReqId, E: EthSpec> SelfRateLimiter<Id, E> {
/// Creates a new [`SelfRateLimiter`] based on configuration values.
pub fn new(
config: Option<OutboundRateLimiterConfig>,
fork_context: Arc<ForkContext>,
) -> Result<Self, &'static str> {
debug!(?config, "Using self rate limiting params");
let rate_limiter = if let Some(c) = config {
Some(RateLimiter::new_with_config(c.0, fork_context)?)
} else {
None
};
Ok(SelfRateLimiter {
active_requests: Default::default(),
delayed_requests: Default::default(),
next_peer_request: Default::default(),
rate_limiter,
ready_requests: Default::default(),
})
}
/// Attempts to send `req` to `peer_id` right away.
///
/// Returns the [`RPCSend`] event that should be emitted when nothing prevents the request from
/// going out. Otherwise the request is stored in the delayed queue for its peer and protocol,
/// to be retried later, and an [`Error`] describing the reason is returned.
pub fn allows(
    &mut self,
    peer_id: PeerId,
    request_id: Id,
    req: RequestType<E>,
) -> Result<RPCSend<Id, E>, Error> {
    let key = (peer_id, req.versioned_protocol().protocol());

    // Requests already waiting for this peer and protocol go first: join the queue behind them.
    if let Some(waiting) = self.delayed_requests.get_mut(&key) {
        debug!(%peer_id, protocol = %req.protocol(), "Self rate limiting since there are already other requests waiting to be sent");
        waiting.push_back(QueuedRequest {
            req,
            request_id,
            queued_at: timestamp_now(),
        });
        return Err(Error::PendingRequests);
    }

    let (delayed_req, wait_time) = match Self::try_send_request(
        &mut self.active_requests,
        &mut self.rate_limiter,
        peer_id,
        request_id,
        req,
    ) {
        Ok(event) => return Ok(event),
        Err(limited) => limited,
    };

    // Schedule a retry and remember the request until then.
    self.next_peer_request.insert(key, wait_time);
    self.delayed_requests
        .entry(key)
        .or_default()
        .push_back(delayed_req);
    Err(Error::RateLimited)
}
/// Decides whether a single request may be sent now.
///
/// On success the request is counted as active for its peer and protocol and the [`RPCSend`]
/// event to emit is returned. If the request has to wait, either because too many requests of
/// the same protocol are already active for the peer or because the rate limiter says it is too
/// soon, it is handed back wrapped in a [`QueuedRequest`] together with the duration to wait.
#[allow(clippy::result_large_err)]
fn try_send_request(
    active_requests: &mut HashMap<PeerId, HashMap<Protocol, usize>>,
    rate_limiter: &mut Option<RateLimiter>,
    peer_id: PeerId,
    request_id: Id,
    req: RequestType<E>,
) -> Result<RPCSend<Id, E>, (QueuedRequest<Id, E>, Duration)> {
    // Concurrency check: how many requests of this protocol are in flight for the peer?
    let in_flight = active_requests
        .get(&peer_id)
        .and_then(|per_protocol| per_protocol.get(&req.protocol()));
    if matches!(in_flight, Some(count) if *count >= MAX_CONCURRENT_REQUESTS) {
        debug!(
            %peer_id,
            protocol = %req.protocol(),
            "Self rate limiting due to the number of concurrent requests"
        );
        let queued = QueuedRequest {
            req,
            request_id,
            queued_at: timestamp_now(),
        };
        return Err((
            queued,
            Duration::from_millis(WAIT_TIME_DUE_TO_CONCURRENT_REQUESTS),
        ));
    }

    // Token check: without a configured limiter every request passes.
    let verdict = match rate_limiter.as_mut() {
        Some(limiter) => limiter.allows(&peer_id, &req),
        None => Ok(()),
    };
    match verdict {
        Ok(()) => {}
        Err(RateLimitedErr::TooLarge) => {
            // this should never happen with default parameters. Let's just send the request.
            // Log a crit since this is a config issue.
            crit!(
                protocol = %req.versioned_protocol().protocol(),
                "Self rate limiting error for a batch that will never fit. Sending request anyway. Check configuration parameters.",
            );
        }
        Err(RateLimitedErr::TooSoon(wait_time)) => {
            debug!(protocol = %req.versioned_protocol().protocol(), wait_time_ms = wait_time.as_millis(), %peer_id, "Self rate limiting");
            let queued = QueuedRequest {
                req,
                request_id,
                queued_at: timestamp_now(),
            };
            return Err((queued, wait_time));
        }
    }

    // The request goes out: account for it as active.
    let active_count = active_requests
        .entry(peer_id)
        .or_default()
        .entry(req.protocol())
        .or_default();
    *active_count += 1;

    Ok(RPCSend::Request(request_id, req))
}
/// When a peer and protocol are allowed to send a next request, this function checks the
/// queued requests and marks as ready as many as the limiter allows.
///
/// Requests are retried in queue order. The first request that is still limited is put back at
/// the front of the queue with its original `queued_at`, a new delay is scheduled for the peer
/// and protocol, and the remaining requests stay queued behind it.
fn next_peer_request_ready(&mut self, peer_id: PeerId, protocol: Protocol) {
    let key = (peer_id, protocol);
    // NOTE: There can be entries that have been removed due to peer disconnections, we simply
    // ignore these messages here.
    let Entry::Occupied(mut entry) = self.delayed_requests.entry(key) else {
        return;
    };

    let queued_requests = entry.get_mut();
    while let Some(QueuedRequest {
        req,
        request_id,
        queued_at,
    }) = queued_requests.pop_front()
    {
        match Self::try_send_request(
            &mut self.active_requests,
            &mut self.rate_limiter,
            peer_id,
            request_id,
            req,
        ) {
            Ok(event) => self.ready_requests.push((peer_id, event, queued_at)),
            Err((rate_limited_req, wait_time)) => {
                self.next_peer_request.insert(key, wait_time);
                // The request is handed back on failure, so there is no need to clone it
                // before trying. Only `queued_at` is overridden, to prevent it from being
                // updated to the time of this retry.
                queued_requests.push_front(QueuedRequest {
                    queued_at,
                    ..rate_limited_req
                });
                // If one fails just wait for the next window that allows sending requests.
                return;
            }
        }
    }

    // The loop only finishes once every queued request has been marked as ready, so the
    // now-empty queue can be dropped.
    entry.remove();
}
/// Tells the limiter that `peer_id` has disconnected.
///
/// All active-request bookkeeping and every delayed request for the peer are dropped. The IDs
/// of the dropped delayed requests are returned together with their protocol.
pub fn peer_disconnected(&mut self, peer_id: PeerId) -> Vec<(Id, Protocol)> {
    self.active_requests.remove(&peer_id);

    // It's not ideal to iterate this map, but the key is (PeerId, Protocol) and this map
    // should never really be large. So we iterate for simplicity
    let mut failed_requests = Vec::new();
    self.delayed_requests
        .retain(|(queued_peer, protocol), queue| {
            let belongs_to_peer = *queued_peer == peer_id;
            if belongs_to_peer {
                // NOTE: Currently cannot remove entries from the DelayQueue, we will just let
                // them expire and ignore them.
                failed_requests.extend(queue.iter().map(|queued| (queued.request_id, *protocol)));
            }
            // Keep only the queues that belong to other peers.
            !belongs_to_peer
        });
    failed_requests
}
/// Informs the limiter that a response has been received.
pub fn request_completed(&mut self, peer_id: &PeerId, protocol: Protocol) {
if let Some(active_requests) = self.active_requests.get_mut(peer_id) {
if let Entry::Occupied(mut entry) = active_requests.entry(protocol) {
if *entry.get() > 1 {
*entry.get_mut() -= 1;
} else {
entry.remove();
}
}
}
}
/// Drives the limiter and returns the next request that is ready to be sent, if any.
///
/// Expired delays are processed first, then the rate limiter (if configured) is polled, and
/// finally the oldest ready request is handed out as a [`BehaviourAction::NotifyHandler`].
/// Returns [`Poll::Pending`] when no request is ready.
pub fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<BehaviourAction<Id, E>> {
    // First check the requests that were self rate limited, since those might add events to
    // the queue. Also do this before rate limiter pruning to avoid removing and
    // immediately adding rate limiting keys.
    //
    // Every expired delay is drained: a wake-up is only guaranteed to be scheduled once the
    // delay queue stops returning expired entries, so stopping after the first one could
    // leave later delays without anyone to wake this task.
    while let Poll::Ready(Some(expired)) = self.next_peer_request.poll_expired(cx) {
        let (peer_id, protocol) = expired.into_inner();
        self.next_peer_request_ready(peer_id, protocol);
    }

    // Prune the rate limiter.
    if let Some(limiter) = self.rate_limiter.as_mut() {
        let _ = limiter.poll_unpin(cx);
    }

    // Finally return any queued events. Take them from the front so that requests leave in
    // the same order in which they were marked as ready.
    if !self.ready_requests.is_empty() {
        let (peer_id, event, queued_at) = self.ready_requests.remove(0);
        metrics::observe_duration(
            &crate::metrics::OUTBOUND_REQUEST_IDLING,
            timestamp_now().saturating_sub(queued_at),
        );
        return Poll::Ready(BehaviourAction::NotifyHandler {
            peer_id,
            handler: NotifyHandler::Any,
            event,
        });
    }

    Poll::Pending
}
}
/// Returns the time elapsed since the unix epoch according to the system clock.
///
/// Falls back to a zero duration if the system clock reports a time before the epoch.
pub fn timestamp_now() -> Duration {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch,
        Err(_) => Duration::ZERO,
    }
}
#[cfg(test)]
mod tests {
use crate::rpc::config::{OutboundRateLimiterConfig, RateLimiterConfig};
use crate::rpc::rate_limiter::Quota;
use crate::rpc::self_limiter::SelfRateLimiter;
use crate::rpc::{Ping, Protocol, RPCSend, RequestType};
use crate::service::api_types::{AppRequestId, SingleLookupReqId, SyncRequestId};
use libp2p::PeerId;
use logging::create_test_tracing_subscriber;
use std::time::Duration;
use types::{EthSpec, ForkContext, Hash256, MainnetEthSpec, Slot};
/// Test that `next_peer_request_ready` correctly maintains the queue.
#[tokio::test]
async fn test_next_peer_request_ready() {
    create_test_tracing_subscriber();

    let fork_context = std::sync::Arc::new(ForkContext::new::<MainnetEthSpec>(
        Slot::new(0),
        Hash256::ZERO,
        &MainnetEthSpec::default_spec(),
    ));
    let config = OutboundRateLimiterConfig(RateLimiterConfig {
        ping_quota: Quota::n_every(1, 2),
        ..Default::default()
    });
    let mut limiter: SelfRateLimiter<AppRequestId, MainnetEthSpec> =
        SelfRateLimiter::new(Some(config), fork_context).unwrap();
    let peer_id = PeerId::random();
    let key = (peer_id, Protocol::Ping);

    // Collects the `req_id` of every request currently delayed for `key`, in queue order.
    let delayed_req_ids = |limiter: &SelfRateLimiter<AppRequestId, MainnetEthSpec>| {
        limiter
            .delayed_requests
            .get(&key)
            .unwrap()
            .iter()
            .map(|queued| match queued.request_id {
                AppRequestId::Sync(SyncRequestId::SingleBlock {
                    id: SingleLookupReqId { req_id, .. },
                }) => req_id,
                _ => unreachable!(),
            })
            .collect::<Vec<_>>()
    };

    for req_id in 1..=5u32 {
        let request_id = AppRequestId::Sync(SyncRequestId::SingleBlock {
            id: SingleLookupReqId {
                lookup_id: 0,
                req_id,
            },
        });
        let _ = limiter.allows(
            peer_id,
            request_id,
            RequestType::Ping(Ping {
                data: u64::from(req_id),
            }),
        );
    }

    // The first request went out; requests 2, 3, 4 and 5 are delayed, in that order.
    assert_eq!(delayed_req_ids(&limiter), vec![2, 3, 4, 5]);
    assert_eq!(limiter.ready_requests.len(), 0);

    // Wait until the tokens have been regenerated, then run `next_peer_request_ready`.
    tokio::time::sleep(Duration::from_secs(3)).await;
    limiter.next_peer_request_ready(peer_id, Protocol::Ping);

    // Exactly one more request became ready; requests 3, 4 and 5 keep their order in the queue.
    assert_eq!(delayed_req_ids(&limiter), vec![3, 4, 5]);
    assert_eq!(limiter.ready_requests.len(), 1);
}
/// Test that `next_peer_request_ready` correctly maintains the queue when using the self-limiter without rate limiting.
#[tokio::test]
async fn test_next_peer_request_ready_concurrent_requests() {
let fork_context = std::sync::Arc::new(ForkContext::new::<MainnetEthSpec>(
Slot::new(0),
Hash256::ZERO,
&MainnetEthSpec::default_spec(),
));
let mut limiter: SelfRateLimiter<AppRequestId, MainnetEthSpec> =
SelfRateLimiter::new(None, fork_context).unwrap();
let peer_id = PeerId::random();
for i in 1..=5u32 {
let result = limiter.allows(
peer_id,
AppRequestId::Sync(SyncRequestId::SingleBlock {
id: SingleLookupReqId {
lookup_id: i,
req_id: i,
},
}),
RequestType::Ping(Ping { data: i as u64 }),
);
// Check that the limiter allows the first two requests.
if i <= 2 {
assert!(result.is_ok());
} else {
assert!(result.is_err());
}
}
let queue = limiter
.delayed_requests
.get(&(peer_id, Protocol::Ping))
.unwrap();
assert_eq!(3, queue.len());
// The delayed requests remain even after the next_peer_request_ready call because the responses have not been received.
limiter.next_peer_request_ready(peer_id, Protocol::Ping);
let queue = limiter
.delayed_requests
.get(&(peer_id, Protocol::Ping))
.unwrap();
assert_eq!(3, queue.len());
limiter.request_completed(&peer_id, Protocol::Ping);
limiter.next_peer_request_ready(peer_id, Protocol::Ping);
let queue = limiter
.delayed_requests
.get(&(peer_id, Protocol::Ping))
.unwrap();
assert_eq!(2, queue.len());
limiter.request_completed(&peer_id, Protocol::Ping);
limiter.request_completed(&peer_id, Protocol::Ping);
limiter.next_peer_request_ready(peer_id, Protocol::Ping);
let queue = limiter.delayed_requests.get(&(peer_id, Protocol::Ping));
assert!(queue.is_none());
// Check that the three delayed requests have moved to ready_requests.
let mut it = limiter.ready_requests.iter();
for i in 3..=5u32 {
let (_peer_id, RPCSend::Request(request_id, _), _) = it.next().unwrap() else {
unreachable!()
};
assert!(matches!(
request_id,
AppRequestId::Sync(SyncRequestId::SingleBlock {
id: SingleLookupReqId { req_id, .. },
}) if *req_id == i
));
}
}
/// Test that `peer_disconnected` drops everything tracked for the disconnected peer, returns the
/// IDs of its delayed requests in queue order, and leaves other peers untouched.
#[tokio::test]
async fn test_peer_disconnected() {
    let fork_context = std::sync::Arc::new(ForkContext::new::<MainnetEthSpec>(
        Slot::new(0),
        Hash256::ZERO,
        &MainnetEthSpec::default_spec(),
    ));
    let mut limiter: SelfRateLimiter<AppRequestId, MainnetEthSpec> =
        SelfRateLimiter::new(None, fork_context).unwrap();
    let peer1 = PeerId::random();
    let peer2 = PeerId::random();

    for peer in [peer1, peer2] {
        for i in 1..=5u32 {
            let result = limiter.allows(
                peer,
                AppRequestId::Sync(SyncRequestId::SingleBlock {
                    id: SingleLookupReqId {
                        lookup_id: i,
                        req_id: i,
                    },
                }),
                RequestType::Ping(Ping { data: u64::from(i) }),
            );
            // Check that the limiter allows the first two requests and delays the rest.
            assert_eq!(result.is_ok(), i <= 2);
        }
    }

    // Both peers have active as well as delayed requests before the disconnection.
    for peer in [peer1, peer2] {
        assert!(limiter.active_requests.contains_key(&peer));
        assert!(limiter
            .delayed_requests
            .contains_key(&(peer, Protocol::Ping)));
    }

    // Check that the limiter returns the IDs of pending requests and that the IDs are ordered correctly.
    let failed_requests = limiter.peer_disconnected(peer1);
    // Exactly the three delayed requests must be reported, nothing more.
    assert_eq!(3, failed_requests.len());
    for (expected, (request_id, protocol)) in (3..=5u32).zip(failed_requests) {
        assert!(matches!(protocol, Protocol::Ping));
        assert!(matches!(
            request_id,
            AppRequestId::Sync(SyncRequestId::SingleBlock {
                id: SingleLookupReqId { req_id, .. },
            }) if req_id == expected
        ));
    }

    // Check that peer1's active and delayed requests have been removed.
    assert!(!limiter.active_requests.contains_key(&peer1));
    assert!(!limiter
        .delayed_requests
        .contains_key(&(peer1, Protocol::Ping)));

    // peer2 must not be affected by peer1's disconnection.
    assert!(limiter.active_requests.contains_key(&peer2));
    assert!(limiter
        .delayed_requests
        .contains_key(&(peer2, Protocol::Ping)));
}
}