From bbb903931ffb4cc93561a8b5924d70d376289f4b Mon Sep 17 00:00:00 2001
From: Michael Sproul <michael@sigmaprime.io>
Date: Mon, 30 May 2022 15:37:00 +1000
Subject: [PATCH] Cause analysis for delayed head blocks

---
 beacon_node/beacon_chain/src/beacon_chain.rs  | 56 +++++++++++++++----
 .../beacon_chain/src/block_times_cache.rs     | 14 +++++
 beacon_node/beacon_chain/src/metrics.rs       | 25 ++++++---
 common/eth2/src/types.rs                      |  1 +
 4 files changed, 79 insertions(+), 17 deletions(-)
diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs
index ce88290cd8..7763a67064 100644
--- a/beacon_node/beacon_chain/src/beacon_chain.rs
+++ b/beacon_node/beacon_chain/src/beacon_chain.rs
@@ -2751,6 +2751,13 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                             "Early attester cache insert failed";
                             "error" => ?e
                         );
+                    } else {
+                        // Success, record the block as capable of being attested to.
+                        self.block_times_cache.write().set_time_attestable(
+                            block_root,
+                            block.slot(),
+                            timestamp_now(),
+                        );
                     }
                 } else {
                     warn!(
@@ -3572,7 +3579,8 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
 
         // Determine whether the block has been set as head too late for proper attestation
         // production.
-        let late_head = block_delay_total >= self.slot_clock.unagg_attestation_production_delay();
+        let attestation_deadline = self.slot_clock.unagg_attestation_production_delay();
+        let late_head = block_delay_total >= attestation_deadline;
 
         // Do not store metrics if the block was > 4 slots old, this helps prevent noise during
         // sync.
@@ -3592,34 +3600,61 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                     .start_of(head_slot)
                     .unwrap_or_else(|| Duration::from_secs(0)),
             );
+            let observed_delay = block_delays
+                .observed
+                .unwrap_or_else(|| Duration::from_secs(0));
+            let import_delay = block_delays
+                .imported
+                .unwrap_or_else(|| Duration::from_secs(0));
+            let attestable_delay = block_delays
+                .attestable
+                .unwrap_or_else(|| Duration::from_secs(0));
+            let set_as_head_delay = block_delays
+                .set_as_head
+                .unwrap_or_else(|| Duration::from_secs(0));
 
             metrics::observe_duration(
                 &metrics::BEACON_BLOCK_OBSERVED_SLOT_START_DELAY_TIME,
-                block_delays
-                    .observed
-                    .unwrap_or_else(|| Duration::from_secs(0)),
+                observed_delay,
+            );
+            metrics::observe_duration(
+                &metrics::BEACON_BLOCK_HEAD_ATTESTABLE_DELAY_TIME,
+                attestable_delay,
             );
-
             metrics::observe_duration(
                 &metrics::BEACON_BLOCK_HEAD_IMPORTED_DELAY_TIME,
-                block_delays
-                    .set_as_head
-                    .unwrap_or_else(|| Duration::from_secs(0)),
+                set_as_head_delay,
             );
 
             // If the block was enshrined as head too late for attestations to be created for it,
             // log a debug warning and increment a metric.
-            if late_head {
-                metrics::inc_counter(&metrics::BEACON_BLOCK_HEAD_SLOT_START_DELAY_EXCEEDED_TOTAL);
+            let missed_attestation_deadline = attestable_delay >= attestation_deadline;
+            if missed_attestation_deadline {
+                let due_to_late_block = observed_delay >= attestation_deadline;
+                let due_to_processing = observed_delay + import_delay >= attestation_deadline;
+
+                let reason = if due_to_late_block {
+                    metrics::inc_counter(&metrics::BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_LATE);
+                    "late block"
+                } else if due_to_processing {
+                    metrics::inc_counter(&metrics::BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_SLOW);
+                    "slow to process"
+                } else {
+                    metrics::inc_counter(&metrics::BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_OTHER);
+                    "other"
+                };
+
                 debug!(
                     self.log,
                     "Delayed head block";
+                    "reason" => reason,
                     "block_root" => ?beacon_block_root,
                     "proposer_index" => head_proposer_index,
                     "slot" => head_slot,
                     "block_delay" => ?block_delay_total,
                     "observed_delay" => ?block_delays.observed,
                     "imported_delay" => ?block_delays.imported,
+                    "attestable_delay" => ?attestable_delay,
                     "set_as_head_delay" => ?block_delays.set_as_head,
                 );
             }
@@ -3722,6 +3757,7 @@ impl<T: BeaconChainTypes> BeaconChain<T> {
                     proposer_graffiti,
                     block_delay: block_delay_total,
                     observed_delay: block_delays.observed,
+                    attestable_delay: block_delays.attestable,
                     imported_delay: block_delays.imported,
                     set_as_head_delay: block_delays.set_as_head,
                 }));
diff --git a/beacon_node/beacon_chain/src/block_times_cache.rs b/beacon_node/beacon_chain/src/block_times_cache.rs
index 484de841de..6623932f59 100644
--- a/beacon_node/beacon_chain/src/block_times_cache.rs
+++ b/beacon_node/beacon_chain/src/block_times_cache.rs
@@ -18,6 +18,7 @@ type BlockRoot = Hash256;
 #[derive(Clone, Default)]
 pub struct Timestamps {
     pub observed: Option<Duration>,
+    pub attestable: Option<Duration>,
     pub imported: Option<Duration>,
     pub set_as_head: Option<Duration>,
 }
@@ -26,6 +27,7 @@ pub struct Timestamps {
 #[derive(Default)]
 pub struct BlockDelays {
     pub observed: Option<Duration>,
+    pub attestable: Option<Duration>,
     pub imported: Option<Duration>,
     pub set_as_head: Option<Duration>,
 }
@@ -35,6 +37,9 @@ impl BlockDelays {
         let observed = times
             .observed
             .and_then(|observed_time| observed_time.checked_sub(slot_start_time));
+        let attestable = times
+            .attestable
+            .and_then(|attestable_time| attestable_time.checked_sub(slot_start_time));
         let imported = times
             .imported
             .and_then(|imported_time| imported_time.checked_sub(times.observed?));
@@ -43,6 +48,7 @@ impl BlockDelays {
             .and_then(|set_as_head_time| set_as_head_time.checked_sub(times.imported?));
         BlockDelays {
             observed,
+            attestable,
             imported,
             set_as_head,
         }
@@ -99,6 +105,14 @@ impl BlockTimesCache {
         };
     }
 
+    pub fn set_time_attestable(&mut self, block_root: BlockRoot, slot: Slot, timestamp: Duration) {
+        let block_times = self
+            .cache
+            .entry(block_root)
+            .or_insert_with(|| BlockTimesCacheValue::new(slot));
+        block_times.timestamps.attestable = Some(timestamp);
+    }
+
     pub fn set_time_imported(&mut self, block_root: BlockRoot, slot: Slot, timestamp: Duration) {
         let block_times = self
             .cache
diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs
index efeb76e76f..f785036d45 100644
--- a/beacon_node/beacon_chain/src/metrics.rs
+++ b/beacon_node/beacon_chain/src/metrics.rs
@@ -767,7 +767,10 @@ lazy_static! {
         "Number of attester slashings seen",
         &["src", "validator"]
     );
+}
 
+// Fourth lazy-static block is used to account for macro recursion limit.
+lazy_static! {
     /*
      * Block Delay Metrics
      */
@@ -783,14 +786,25 @@ lazy_static! {
         "beacon_block_head_imported_delay_time",
         "Duration between the time the block was imported and the time when it was set as head.",
     );
+    pub static ref BEACON_BLOCK_HEAD_ATTESTABLE_DELAY_TIME: Result<Histogram> = try_create_histogram(
+        "beacon_block_head_attestable_delay_time",
+        "Duration between the start of the slot and the time at which the block could be attested to.",
+    );
     pub static ref BEACON_BLOCK_HEAD_SLOT_START_DELAY_TIME: Result<Histogram> = try_create_histogram(
         "beacon_block_head_slot_start_delay_time",
         "Duration between the start of the block's slot and the time when it was set as head.",
     );
-    pub static ref BEACON_BLOCK_HEAD_SLOT_START_DELAY_EXCEEDED_TOTAL: Result<IntCounter> = try_create_int_counter(
-        "beacon_block_head_slot_start_delay_exceeded_total",
-        "Triggered when the duration between the start of the block's slot and the current time \
-        will result in failed attestations.",
+    pub static ref BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_LATE: Result<IntCounter> = try_create_int_counter(
+        "beacon_block_head_missed_att_deadline_late",
+        "Total number of delayed head blocks that arrived late"
+    );
+    pub static ref BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_SLOW: Result<IntCounter> = try_create_int_counter(
+        "beacon_block_head_missed_att_deadline_slow",
+        "Total number of delayed head blocks that arrived on time but not processed in time"
+    );
+    pub static ref BEACON_BLOCK_HEAD_MISSED_ATT_DEADLINE_OTHER: Result<IntCounter> = try_create_int_counter(
+        "beacon_block_head_missed_att_deadline_other",
+        "Total number of delayed head blocks that were not late and not slow to process"
     );
 
     /*
@@ -801,10 +815,7 @@ lazy_static! {
             "gossip_beacon_block_skipped_slots",
             "For each gossip blocks, the number of skip slots between it and its parent"
         );
-}
 
-// Fourth lazy-static block is used to account for macro recursion limit.
-lazy_static! {
     /*
      * Sync Committee Message Verification
      */
diff --git a/common/eth2/src/types.rs b/common/eth2/src/types.rs
index 8ef3582268..e4ba43d526 100644
--- a/common/eth2/src/types.rs
+++ b/common/eth2/src/types.rs
@@ -835,6 +835,7 @@ pub struct SseLateHead {
     pub proposer_graffiti: String,
     pub block_delay: Duration,
     pub observed_delay: Option<Duration>,
+    pub attestable_delay: Option<Duration>,
     pub imported_delay: Option<Duration>,
     pub set_as_head_delay: Option<Duration>,
 }