deneb related logging improvements (#4859)

1. Add commitments to logs and update the `Display` implementation of `KzgCommitment` so it is truncated, similarly to the block root.

I've been finding it difficult to debug scenarios involving multiple blobs for the same `(index, block_root)`. Logging the commitment will help with this; we can match it against the commitments that exist in the block.

Example output:

```
Oct 20 21:13:36.700 DEBG Successfully verified gossip blob       commitment: 0xa3c1…1cd8, index: 0, root: 0xf31e…f9de, slot: 154568
Oct 20 21:13:36.785 DEBG Successfully verified gossip block      commitments: [0xa3c1…1cd8, 0x8655…02ff, 0x8d6a…955a, 0x84ac…3a1b, 0x9752…629b, 0xb9fc…20fb], root: 0xf31eeb732702e429e89057b15e1c0c631e8452e09e03cb1924353f536ef4f9de, slot: 154568, graffiti: teku/besu, service: beacon
```

Example output in a block with no blobs (this will show up pre-deneb):
```
426734:Oct 20 21:15:24.113 DEBG Successfully verified gossip block, commitments: [], root: 0x619db1360ba0e8d44ae2a0f2450ebca47e167191feecffcfac0e8d7b6c39623c, slot: 154577, graffiti: teku/nethermind, service: beacon, module: beacon_chain::beacon_chain:2765
```

2. Remove `strum::IntoStaticStr` from `AvailabilityCheckError`. This is because `IntoStaticStr` ends up dropping information inside the enum. So the KZG commitments in this error are dropped, making it more difficult to debug:
```
AvailabilityCheckError::KzgCommitmentMismatch {
        blob_commitment: KzgCommitment,
        block_commitment: KzgCommitment,
    },
```
which is output as just `AvailabilityCheckError`

3. Some additional misc sync logs I found useful in debugging https://github.com/sigp/lighthouse/pull/4869

4. This downgrades "Block returned for single block lookup not present" to debug because I don't think we can fix the scenario that causes this unless we can cancel in-flight RPC requests.

Co-authored-by: realbigsean <seananderson33@gmail.com>
This commit is contained in:
realbigsean
2023-10-25 15:30:17 +00:00
parent d4f26ee123
commit ba891e1fed
14 changed files with 181 additions and 31 deletions

View File

@@ -330,7 +330,9 @@ impl<T: BeaconChainTypes> BlockLookups<T> {
let Some(lookup) = self.get_single_lookup::<R>(lookup_id) else {
if response.is_some() {
warn!(
// We don't have the ability to cancel in-flight RPC requests. So this can happen
// if we started this RPC request, and later saw the block/blobs via gossip.
debug!(
self.log,
"Block returned for single block lookup not present";
"response_type" => ?response_type,

View File

@@ -30,7 +30,7 @@ impl<T: EthSpec> BlocksAndBlobsRequestInfo<T> {
}
}
pub fn into_responses(self) -> Result<Vec<RpcBlock<T>>, &'static str> {
pub fn into_responses(self) -> Result<Vec<RpcBlock<T>>, String> {
let BlocksAndBlobsRequestInfo {
accumulated_blocks,
accumulated_sidecars,
@@ -50,28 +50,28 @@ impl<T: EthSpec> BlocksAndBlobsRequestInfo<T> {
.unwrap_or(false);
pair_next_blob
} {
blob_list.push(blob_iter.next().ok_or("Missing next blob")?);
blob_list.push(blob_iter.next().ok_or("Missing next blob".to_string())?);
}
let mut blobs_buffer = vec![None; T::max_blobs_per_block()];
for blob in blob_list {
let blob_index = blob.index as usize;
let Some(blob_opt) = blobs_buffer.get_mut(blob_index) else {
return Err("Invalid blob index");
return Err("Invalid blob index".to_string());
};
if blob_opt.is_some() {
return Err("Repeat blob index");
return Err("Repeat blob index".to_string());
} else {
*blob_opt = Some(blob);
}
}
let blobs = VariableList::from(blobs_buffer.into_iter().flatten().collect::<Vec<_>>());
responses.push(RpcBlock::new(None, block, Some(blobs))?)
responses.push(RpcBlock::new(None, block, Some(blobs)).map_err(|e| format!("{e:?}"))?)
}
// if accumulated sidecars is not empty, throw an error.
if blob_iter.next().is_some() {
return Err("Received sidecars that don't pair well");
return Err("Received sidecars that don't pair well".to_string());
}
Ok(responses)

View File

@@ -1034,8 +1034,11 @@ impl<T: BeaconChainTypes> SyncManager<T> {
// inform range that the request needs to be treated as failed
// With time we will want to downgrade this log
warn!(
self.log, "Blocks and blobs request for range received invalid data";
"peer_id" => %peer_id, "batch_id" => resp.batch_id, "error" => e
self.log,
"Blocks and blobs request for range received invalid data";
"peer_id" => %peer_id,
"batch_id" => resp.batch_id,
"error" => e.clone()
);
let id = RequestId::RangeBlockAndBlobs { id };
self.network.report_peer(
@@ -1043,7 +1046,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
PeerAction::MidToleranceError,
"block_blob_faulty_batch",
);
self.inject_error(peer_id, id, RPCError::InvalidData(e.into()))
self.inject_error(peer_id, id, RPCError::InvalidData(e))
}
}
}
@@ -1091,7 +1094,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
// With time we will want to downgrade this log
warn!(
self.log, "Blocks and blobs request for backfill received invalid data";
"peer_id" => %peer_id, "batch_id" => resp.batch_id, "error" => e
"peer_id" => %peer_id, "batch_id" => resp.batch_id, "error" => e.clone()
);
let id = RequestId::BackFillBlockAndBlobs { id };
self.network.report_peer(
@@ -1099,7 +1102,7 @@ impl<T: BeaconChainTypes> SyncManager<T> {
PeerAction::MidToleranceError,
"block_blob_faulty_backfill_batch",
);
self.inject_error(peer_id, id, RPCError::InvalidData(e.into()))
self.inject_error(peer_id, id, RPCError::InvalidData(e))
}
}
}

View File

@@ -23,7 +23,7 @@ use types::{BlobSidecar, EthSpec, SignedBeaconBlock};
pub struct BlocksAndBlobsByRangeResponse<T: EthSpec> {
pub batch_id: BatchId,
pub responses: Result<Vec<RpcBlock<T>>, &'static str>,
pub responses: Result<Vec<RpcBlock<T>>, String>,
}
pub struct BlocksAndBlobsByRangeRequest<T: EthSpec> {
@@ -338,12 +338,26 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request_id: Id,
batch_type: ByRangeRequestType,
) -> Option<(ChainId, BatchId)> {
match batch_type {
let req = match batch_type {
ByRangeRequestType::BlocksAndBlobs => self
.range_blocks_and_blobs_requests
.remove(&request_id)
.map(|req| (req.chain_id, req.batch_id)),
ByRangeRequestType::Blocks => self.range_requests.remove(&request_id),
};
if let Some(req) = req {
debug!(
self.log,
"Range sync request failed";
"request_id" => request_id,
"batch_type" => ?batch_type,
"chain_id" => ?req.0,
"batch_id" => ?req.1
);
Some(req)
} else {
debug!(self.log, "Range sync request failed"; "request_id" => request_id, "batch_type" => ?batch_type);
None
}
}
@@ -352,12 +366,25 @@ impl<T: BeaconChainTypes> SyncNetworkContext<T> {
request_id: Id,
batch_type: ByRangeRequestType,
) -> Option<BatchId> {
match batch_type {
let batch_id = match batch_type {
ByRangeRequestType::BlocksAndBlobs => self
.backfill_blocks_and_blobs_requests
.remove(&request_id)
.map(|(batch_id, _info)| batch_id),
ByRangeRequestType::Blocks => self.backfill_requests.remove(&request_id),
};
if let Some(batch_id) = batch_id {
debug!(
self.log,
"Backfill sync request failed";
"request_id" => request_id,
"batch_type" => ?batch_type,
"batch_id" => ?batch_id
);
Some(batch_id)
} else {
debug!(self.log, "Backfill sync request failed"; "request_id" => request_id, "batch_type" => ?batch_type);
None
}
}

View File

@@ -827,9 +827,24 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
// sending an error /timeout) if the peer is removed from the chain for other
// reasons. Check that this block belongs to the expected peer
if !batch.is_expecting_block(peer_id, &request_id) {
debug!(
self.log,
"Batch not expecting block";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id
);
return Ok(KeepChain);
}
debug!(self.log, "Batch failed. RPC Error"; "batch_epoch" => batch_id);
debug!(
self.log,
"Batch failed. RPC Error";
"batch_epoch" => batch_id,
"batch_state" => ?batch.state(),
"peer_id" => %peer_id,
"request_id" => %request_id
);
if let Some(active_requests) = self.peers.get_mut(peer_id) {
active_requests.remove(&batch_id);
}
@@ -841,6 +856,13 @@ impl<T: BeaconChainTypes> SyncingChain<T> {
}
self.retry_batch_download(network, batch_id)
} else {
debug!(
self.log,
"Batch not found";
"batch_epoch" => batch_id,
"peer_id" => %peer_id,
"request_id" => %request_id
);
// this could be an error for an old batch, removed when the chain advances
Ok(KeepChain)
}