Separate committee subscriptions queue (#3508)

## Issue Addressed

NA

## Proposed Changes

As we've seen on Prater, there seems to be a correlation between these messages

```
WARN Not enough time for a discovery search  subnet_id: ExactSubnet { subnet_id: SubnetId(19), slot: Slot(3742336) }, service: attestation_service
```

... and nodes falling 20-30 slots behind the head for short periods. These nodes are running ~20k Prater validators.

After running some metrics, I can see that the `network_recv` channel is processing ~250k `AttestationSubscribe` messages per minute. It occurred to me that perhaps the `AttestationSubscribe` messages are "washing out" the `SendRequest` and `SendResponse` messages. In this PR I separate the `AttestationSubscribe` and `SyncCommitteeSubscribe` messages into their own queue so the `tokio::select!` in the `NetworkService` can still process the other messages in the `network_recv` channel without necessarily having to clear all the subscription messages first.

~~I've also added filter to the HTTP API to prevent duplicate subscriptions going to the network service.~~

## Additional Info

- Currently being tested on Prater
This commit is contained in:
Paul Hauner
2022-08-30 05:47:31 +00:00
parent ebd0e0e2d9
commit 661307dce1
8 changed files with 231 additions and 102 deletions

View File

@@ -25,11 +25,10 @@ use beacon_chain::{
BeaconChainTypes, ProduceBlockVerification, WhenSlotSkipped,
};
pub use block_id::BlockId;
use eth2::types::ValidatorStatus;
use eth2::types::{self as api_types, EndpointVersion, ValidatorId};
use eth2::types::{self as api_types, EndpointVersion, ValidatorId, ValidatorStatus};
use lighthouse_network::{types::SyncState, EnrExt, NetworkGlobals, PeerId, PubsubMessage};
use lighthouse_version::version_with_platform;
use network::NetworkMessage;
use network::{NetworkMessage, NetworkSenders, ValidatorSubscriptionMessage};
use serde::{Deserialize, Serialize};
use slog::{crit, debug, error, info, warn, Logger};
use slot_clock::SlotClock;
@@ -42,7 +41,7 @@ use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::PathBuf;
use std::pin::Pin;
use std::sync::Arc;
use tokio::sync::mpsc::UnboundedSender;
use tokio::sync::mpsc::{Sender, UnboundedSender};
use tokio_stream::{wrappers::BroadcastStream, StreamExt};
use types::{
Attestation, AttestationData, AttesterSlashing, BeaconStateError, BlindedPayload,
@@ -93,7 +92,7 @@ pub struct TlsConfig {
pub struct Context<T: BeaconChainTypes> {
pub config: Config,
pub chain: Option<Arc<BeaconChain<T>>>,
pub network_tx: Option<UnboundedSender<NetworkMessage<T::EthSpec>>>,
pub network_senders: Option<NetworkSenders<T::EthSpec>>,
pub network_globals: Option<Arc<NetworkGlobals<T::EthSpec>>>,
pub eth1_service: Option<eth1::Service>,
pub log: Logger,
@@ -337,14 +336,35 @@ pub fn serve<T: BeaconChainTypes>(
});
// Create a `warp` filter that provides access to the network sender channel.
let inner_ctx = ctx.clone();
let network_tx_filter = warp::any()
.map(move || inner_ctx.network_tx.clone())
.and_then(|network_tx| async move {
match network_tx {
Some(network_tx) => Ok(network_tx),
let network_tx = ctx
.network_senders
.as_ref()
.map(|senders| senders.network_send());
let network_tx_filter =
warp::any()
.map(move || network_tx.clone())
.and_then(|network_tx| async move {
match network_tx {
Some(network_tx) => Ok(network_tx),
None => Err(warp_utils::reject::custom_not_found(
"The networking stack has not yet started (network_tx).".to_string(),
)),
}
});
// Create a `warp` filter that provides access to the network attestation subscription channel.
let validator_subscriptions_tx = ctx
.network_senders
.as_ref()
.map(|senders| senders.validator_subscription_send());
let validator_subscription_tx_filter = warp::any()
.map(move || validator_subscriptions_tx.clone())
.and_then(|validator_subscriptions_tx| async move {
match validator_subscriptions_tx {
Some(validator_subscriptions_tx) => Ok(validator_subscriptions_tx),
None => Err(warp_utils::reject::custom_not_found(
"The networking stack has not yet started.".to_string(),
"The networking stack has not yet started (validator_subscription_tx)."
.to_string(),
)),
}
});
@@ -2083,7 +2103,7 @@ pub fn serve<T: BeaconChainTypes>(
.to_ref()
.fork_name(&chain.spec)
.map_err(inconsistent_fork_rejection)?;
// Pose as a V2 endpoint so we return the fork `version`.
// Pose as a V2 endpoint so we return the fork `version`.
fork_versioned_response(V2, fork_name, block)
.map(|response| warp::reply::json(&response))
},
@@ -2345,7 +2365,7 @@ pub fn serve<T: BeaconChainTypes>(
.and(not_while_syncing_filter.clone())
.and(chain_filter.clone())
.and(warp::body::json())
.and(network_tx_filter.clone())
.and(network_tx_filter)
.and(log_filter.clone())
.and_then(
|chain: Arc<BeaconChain<T>>,
@@ -2370,12 +2390,14 @@ pub fn serve<T: BeaconChainTypes>(
.and(warp::path("beacon_committee_subscriptions"))
.and(warp::path::end())
.and(warp::body::json())
.and(network_tx_filter.clone())
.and(validator_subscription_tx_filter.clone())
.and(chain_filter.clone())
.and(log_filter.clone())
.and_then(
|subscriptions: Vec<api_types::BeaconCommitteeSubscription>,
network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
chain: Arc<BeaconChain<T>>| {
validator_subscription_tx: Sender<ValidatorSubscriptionMessage>,
chain: Arc<BeaconChain<T>>,
log: Logger| {
blocking_json_task(move || {
for subscription in &subscriptions {
chain
@@ -2383,7 +2405,7 @@ pub fn serve<T: BeaconChainTypes>(
.write()
.auto_register_local_validator(subscription.validator_index);
let subscription = api_types::ValidatorSubscription {
let validator_subscription = api_types::ValidatorSubscription {
validator_index: subscription.validator_index,
attestation_committee_index: subscription.committee_index,
slot: subscription.slot,
@@ -2391,12 +2413,20 @@ pub fn serve<T: BeaconChainTypes>(
is_aggregator: subscription.is_aggregator,
};
publish_network_message(
&network_tx,
NetworkMessage::AttestationSubscribe {
subscriptions: vec![subscription],
},
)?;
let message = ValidatorSubscriptionMessage::AttestationSubscribe {
subscriptions: vec![validator_subscription],
};
if let Err(e) = validator_subscription_tx.try_send(message) {
warn!(
log,
"Unable to process committee subscriptions";
"info" => "the host may be overloaded or resource-constrained",
"error" => ?e,
);
return Err(warp_utils::reject::custom_server_error(
"unable to queue subscription, host may be overloaded or shutting down".to_string(),
));
}
}
Ok(())
@@ -2581,12 +2611,15 @@ pub fn serve<T: BeaconChainTypes>(
.and(warp::path("sync_committee_subscriptions"))
.and(warp::path::end())
.and(warp::body::json())
.and(network_tx_filter)
.and(validator_subscription_tx_filter)
.and(chain_filter.clone())
.and(log_filter.clone())
.and_then(
|subscriptions: Vec<types::SyncCommitteeSubscription>,
network_tx: UnboundedSender<NetworkMessage<T::EthSpec>>,
chain: Arc<BeaconChain<T>>| {
validator_subscription_tx: Sender<ValidatorSubscriptionMessage>,
chain: Arc<BeaconChain<T>>,
log: Logger
| {
blocking_json_task(move || {
for subscription in subscriptions {
chain
@@ -2594,12 +2627,20 @@ pub fn serve<T: BeaconChainTypes>(
.write()
.auto_register_local_validator(subscription.validator_index);
publish_network_message(
&network_tx,
NetworkMessage::SyncCommitteeSubscribe {
let message = ValidatorSubscriptionMessage::SyncCommitteeSubscribe {
subscriptions: vec![subscription],
},
)?;
};
if let Err(e) = validator_subscription_tx.try_send(message) {
warn!(
log,
"Unable to process sync subscriptions";
"info" => "the host may be overloaded or resource-constrained",
"error" => ?e
);
return Err(warp_utils::reject::custom_server_error(
"unable to queue subscription, host may be overloaded or shutting down".to_string(),
));
}
}
Ok(())

View File

@@ -11,14 +11,14 @@ use lighthouse_network::{
types::{EnrAttestationBitfield, EnrSyncCommitteeBitfield, SyncState},
ConnectedPoint, Enr, NetworkGlobals, PeerId, PeerManager,
};
use network::NetworkMessage;
use network::{NetworkReceivers, NetworkSenders};
use sensitive_url::SensitiveUrl;
use slog::Logger;
use std::future::Future;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{mpsc, oneshot};
use tokio::sync::oneshot;
use types::{ChainSpec, EthSpec};
pub const TCP_PORT: u16 = 42;
@@ -30,7 +30,7 @@ pub const EXTERNAL_ADDR: &str = "/ip4/0.0.0.0/tcp/9000";
pub struct InteractiveTester<E: EthSpec> {
pub harness: BeaconChainHarness<EphemeralHarnessType<E>>,
pub client: BeaconNodeHttpClient,
pub network_rx: mpsc::UnboundedReceiver<NetworkMessage<E>>,
pub network_rx: NetworkReceivers<E>,
_server_shutdown: oneshot::Sender<()>,
}
@@ -41,7 +41,7 @@ pub struct ApiServer<E: EthSpec, SFut: Future<Output = ()>> {
pub server: SFut,
pub listening_socket: SocketAddr,
pub shutdown_tx: oneshot::Sender<()>,
pub network_rx: tokio::sync::mpsc::UnboundedReceiver<NetworkMessage<E>>,
pub network_rx: NetworkReceivers<E>,
pub local_enr: Enr,
pub external_peer_id: PeerId,
}
@@ -97,7 +97,7 @@ pub async fn create_api_server_on_port<T: BeaconChainTypes>(
log: Logger,
port: u16,
) -> ApiServer<T::EthSpec, impl Future<Output = ()>> {
let (network_tx, network_rx) = mpsc::unbounded_channel();
let (network_senders, network_receivers) = NetworkSenders::new();
// Default metadata
let meta_data = MetaData::V2(MetaDataV2 {
@@ -146,7 +146,7 @@ pub async fn create_api_server_on_port<T: BeaconChainTypes>(
spec_fork_name: None,
},
chain: Some(chain.clone()),
network_tx: Some(network_tx),
network_senders: Some(network_senders),
network_globals: Some(network_globals),
eth1_service: Some(eth1_service),
log,
@@ -163,7 +163,7 @@ pub async fn create_api_server_on_port<T: BeaconChainTypes>(
server,
listening_socket,
shutdown_tx,
network_rx,
network_rx: network_receivers,
local_enr: enr,
external_peer_id: peer_id,
}

View File

@@ -17,14 +17,14 @@ use futures::stream::{Stream, StreamExt};
use futures::FutureExt;
use http_api::{BlockId, StateId};
use lighthouse_network::{Enr, EnrExt, PeerId};
use network::NetworkMessage;
use network::NetworkReceivers;
use proto_array::ExecutionStatus;
use sensitive_url::SensitiveUrl;
use slot_clock::SlotClock;
use state_processing::per_slot_processing;
use std::convert::TryInto;
use std::sync::Arc;
use tokio::sync::{mpsc, oneshot};
use tokio::sync::oneshot;
use tokio::time::Duration;
use tree_hash::TreeHash;
use types::application_domain::ApplicationDomain;
@@ -65,7 +65,7 @@ struct ApiTester {
proposer_slashing: ProposerSlashing,
voluntary_exit: SignedVoluntaryExit,
_server_shutdown: oneshot::Sender<()>,
network_rx: mpsc::UnboundedReceiver<NetworkMessage<E>>,
network_rx: NetworkReceivers<E>,
local_enr: Enr,
external_peer_id: PeerId,
mock_builder: Option<Arc<TestingBuilder<E>>>,
@@ -899,7 +899,7 @@ impl ApiTester {
self.client.post_beacon_blocks(next_block).await.unwrap();
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"valid blocks should be sent to network"
);
@@ -913,7 +913,7 @@ impl ApiTester {
assert!(self.client.post_beacon_blocks(&next_block).await.is_err());
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"invalid blocks should be sent to network"
);
@@ -1041,7 +1041,7 @@ impl ApiTester {
.unwrap();
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"valid attestation should be sent to network"
);
@@ -1078,7 +1078,7 @@ impl ApiTester {
}
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"if some attestations are valid, we should send them to the network"
);
@@ -1108,7 +1108,7 @@ impl ApiTester {
.unwrap();
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"valid attester slashing should be sent to network"
);
@@ -1125,7 +1125,7 @@ impl ApiTester {
.unwrap_err();
assert!(
self.network_rx.recv().now_or_never().is_none(),
self.network_rx.network_recv.recv().now_or_never().is_none(),
"invalid attester slashing should not be sent to network"
);
@@ -1154,7 +1154,7 @@ impl ApiTester {
.unwrap();
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"valid proposer slashing should be sent to network"
);
@@ -1171,7 +1171,7 @@ impl ApiTester {
.unwrap_err();
assert!(
self.network_rx.recv().now_or_never().is_none(),
self.network_rx.network_recv.recv().now_or_never().is_none(),
"invalid proposer slashing should not be sent to network"
);
@@ -1200,7 +1200,7 @@ impl ApiTester {
.unwrap();
assert!(
self.network_rx.recv().await.is_some(),
self.network_rx.network_recv.recv().await.is_some(),
"valid exit should be sent to network"
);
@@ -1217,7 +1217,7 @@ impl ApiTester {
.unwrap_err();
assert!(
self.network_rx.recv().now_or_never().is_none(),
self.network_rx.network_recv.recv().now_or_never().is_none(),
"invalid exit should not be sent to network"
);
@@ -2351,7 +2351,7 @@ impl ApiTester {
.await
.unwrap();
assert!(self.network_rx.recv().await.is_some());
assert!(self.network_rx.network_recv.recv().await.is_some());
self
}
@@ -2366,7 +2366,7 @@ impl ApiTester {
.await
.unwrap_err();
assert!(self.network_rx.recv().now_or_never().is_none());
assert!(self.network_rx.network_recv.recv().now_or_never().is_none());
self
}
@@ -2385,7 +2385,11 @@ impl ApiTester {
.await
.unwrap();
self.network_rx.recv().now_or_never().unwrap();
self.network_rx
.validator_subscription_recv
.recv()
.now_or_never()
.unwrap();
self
}