From 6a75f24ab13e5659a7380bc93c1a3c7fc2c7012e Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 4 Apr 2025 20:01:39 +1100 Subject: [PATCH] Fix the `getBlobs` metric and ensure it is recorded promptly to prevent miscounts (#7188) From testing conducted by Sunnyside Labs, they noticed that the "expected blobs" are quite low on bandwidth constrained nodes. This observation revealed that we don't record the `beacon_blobs_from_el_expected_total` metric at all if the EL doesn't return any response. The fetch blobs function returns without recording the metric. To fix this, I've moved `BLOBS_FROM_EL_EXPECTED_TOTAL` and `BLOBS_FROM_EL_RECEIVED_TOTAL` to as early as possible, to make the metric more accurate. --- beacon_node/beacon_chain/src/fetch_blobs.rs | 25 ++++++++------------- beacon_node/beacon_chain/src/metrics.rs | 21 ++++++++++++----- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/beacon_node/beacon_chain/src/fetch_blobs.rs b/beacon_node/beacon_chain/src/fetch_blobs.rs index ceb563ffc2..3c28ac9a44 100644 --- a/beacon_node/beacon_chain/src/fetch_blobs.rs +++ b/beacon_node/beacon_chain/src/fetch_blobs.rs @@ -13,7 +13,7 @@ use crate::observed_data_sidecars::DoNotObserve; use crate::{metrics, AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError}; use execution_layer::json_structures::BlobAndProofV1; use execution_layer::Error as ExecutionLayerError; -use metrics::{inc_counter, inc_counter_by, TryExt}; +use metrics::{inc_counter, TryExt}; use ssz_types::FixedVector; use state_processing::per_block_processing::deneb::kzg_commitment_to_versioned_hash; use std::sync::Arc; @@ -73,13 +73,20 @@ pub async fn fetch_and_process_engine_blobs( .as_ref() .ok_or(FetchEngineBlobError::ExecutionLayerMissing)?; + metrics::observe(&metrics::BLOBS_FROM_EL_EXPECTED, num_expected_blobs as f64); debug!(num_expected_blobs, "Fetching blobs from the EL"); let response = execution_layer .get_blobs(versioned_hashes) .await + .inspect_err(|_| { + inc_counter(&metrics::BLOBS_FROM_EL_ERROR_TOTAL); + }) .map_err(FetchEngineBlobError::RequestFailed)?; - if response.is_empty() || response.iter().all(|opt| opt.is_none()) { + let num_fetched_blobs = response.iter().filter(|opt| opt.is_some()).count(); + metrics::observe(&metrics::BLOBS_FROM_EL_RECEIVED, num_fetched_blobs as f64); + + if num_fetched_blobs == 0 { debug!(num_expected_blobs, "No blobs fetched from the EL"); inc_counter(&metrics::BLOBS_FROM_EL_MISS_TOTAL); return Ok(None); @@ -99,20 +106,6 @@ pub async fn fetch_and_process_engine_blobs( &chain.spec, )?; - let num_fetched_blobs = fixed_blob_sidecar_list - .iter() - .filter(|b| b.is_some()) - .count(); - - inc_counter_by( - &metrics::BLOBS_FROM_EL_EXPECTED_TOTAL, - num_expected_blobs as u64, - ); - inc_counter_by( - &metrics::BLOBS_FROM_EL_RECEIVED_TOTAL, - num_fetched_blobs as u64, - ); - // Gossip verify blobs before publishing. This prevents blobs with invalid KZG proofs from // the EL making it into the data availability checker. We do not immediately add these // blobs to the observed blobs/columns cache because we want to allow blobs/columns to arrive on gossip diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs index d1c7a2a5df..463319a1f5 100644 --- a/beacon_node/beacon_chain/src/metrics.rs +++ b/beacon_node/beacon_chain/src/metrics.rs @@ -1662,28 +1662,37 @@ pub static DATA_COLUMN_SIDECAR_GOSSIP_VERIFICATION_TIMES: LazyLock> = LazyLock::new(|| { try_create_int_counter( "beacon_blobs_from_el_hit_total", - "Number of blob batches fetched from the execution layer", + "Number of non-empty blob batches fetched from the execution layer", ) }); pub static BLOBS_FROM_EL_MISS_TOTAL: LazyLock> = LazyLock::new(|| { try_create_int_counter( "beacon_blobs_from_el_miss_total", - "Number of blob batches failed to fetch from the execution layer", + "Number of empty blob responses from the execution layer", ) }); -pub static BLOBS_FROM_EL_EXPECTED_TOTAL: LazyLock> = LazyLock::new(|| { +pub static BLOBS_FROM_EL_ERROR_TOTAL: LazyLock> = LazyLock::new(|| { try_create_int_counter( - "beacon_blobs_from_el_expected_total", + "beacon_blobs_from_el_error_total", + "Number of failed blob fetches from the execution layer", + ) +}); + +pub static BLOBS_FROM_EL_EXPECTED: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( + "beacon_blobs_from_el_expected", "Number of blobs expected from the execution layer", + Ok(vec![0.0, 3.0, 6.0, 9.0, 12.0, 18.0, 24.0, 30.0]), ) }); -pub static BLOBS_FROM_EL_RECEIVED_TOTAL: LazyLock> = LazyLock::new(|| { - try_create_int_counter( +pub static BLOBS_FROM_EL_RECEIVED: LazyLock> = LazyLock::new(|| { + try_create_histogram_with_buckets( "beacon_blobs_from_el_received_total", "Number of blobs fetched from the execution layer", + linear_buckets(0.0, 4.0, 20), ) });