diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE/default-issue-template.md similarity index 79% rename from .github/ISSUE_TEMPLATE.md rename to .github/ISSUE_TEMPLATE/default-issue-template.md index d73b9ff6f0..784add20f3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE/default-issue-template.md @@ -1,3 +1,12 @@ +--- +name: Default issue template +about: Use this template for all issues +title: '' +labels: '' +assignees: '' + +--- + ## Description Please provide a brief description of the issue. diff --git a/.github/mergify.yml b/.github/mergify.yml index 4ab73bcf07..0b917b2546 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -105,6 +105,10 @@ queue_rules: {{ body | get_section("## Proposed Changes", "") }} + + {% for commit in commits | unique(attribute='email_author') %} + Co-Authored-By: {{ commit.author }} <{{ commit.email_author }}> + {% endfor %} queue_conditions: - "#approved-reviews-by >= 1" - "check-success=license/cla" diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index faa2745f55..0201bf9ae3 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -102,7 +102,7 @@ jobs: with: version: nightly-ca67d15f4abd46394b324c50e21e66f306a1162d - name: Run tests in release - run: make nextest-release + run: make test-release - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -134,7 +134,7 @@ jobs: - name: Set LIBCLANG_PATH run: echo "LIBCLANG_PATH=$((gcm clang).source -replace "clang.exe")" >> $env:GITHUB_ENV - name: Run tests in release - run: make nextest-release + run: make test-release - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -225,6 +225,7 @@ jobs: TEST_FEATURES: portable CI_LOGGER_DIR: ${{ runner.temp }}/network_test_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: network_test_logs @@ -269,7 +270,7 @@ jobs: with: version: 
nightly-ca67d15f4abd46394b324c50e21e66f306a1162d - name: Run tests in debug - run: make nextest-debug + run: make test-debug - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -306,7 +307,7 @@ jobs: cache-target: release bins: cargo-nextest - name: Run consensus-spec-tests with blst and fake_crypto - run: make nextest-ef + run: make test-ef - name: Show cache stats if: env.SELF_HOSTED_RUNNERS == 'true' continue-on-error: true @@ -328,6 +329,7 @@ jobs: - name: Run a basic beacon chain sim that starts from Deneb run: cargo run --release --bin simulator basic-sim --disable-stdout-logging --log-dir ${{ runner.temp }}/basic_simulator_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: basic_simulator_logs @@ -349,6 +351,7 @@ jobs: - name: Run a beacon chain sim which tests VC fallback behaviour run: cargo run --release --bin simulator fallback-sim --disable-stdout-logging --log-dir ${{ runner.temp }}/fallback_simulator_logs - name: Upload logs + if: always() uses: actions/upload-artifact@v4 with: name: fallback_simulator_logs diff --git a/.gitignore b/.gitignore index e63e218a3b..efd7916b05 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,6 @@ perf.data* *.tar.gz /bin genesis.ssz -/clippy.toml /.cargo # IntelliJ diff --git a/CLAUDE.md b/CLAUDE.md index 53a4433747..3e9ab169f3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,25 +7,28 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co **Important**: Always branch from `unstable` and target `unstable` when creating pull requests. 
### Building and Installation + - `make install` - Build and install the main Lighthouse binary in release mode - `make install-lcli` - Build and install the `lcli` utility binary - `cargo build --release` - Standard Rust release build - `cargo build --bin lighthouse --features "gnosis,slasher-lmdb"` - Build with specific features ### Testing + - `make test` - Run the full test suite in release mode (excludes EF tests, beacon_chain, slasher, network, http_api) -- `make nextest-release` - Run tests using nextest (faster parallel test runner) +- `make test-release` - Run tests using nextest (faster parallel test runner) - `make test-beacon-chain` - Run beacon chain tests for all supported forks - `make test-slasher` - Run slasher tests with all database backend combinations - `make test-ef` - Download and run Ethereum Foundation test vectors - `make test-full` - Complete test suite including linting, EF tests, and execution engine tests -- `cargo test -p ` - Run tests for a specific package -- `cargo test -p ` - Run individual test (preferred during development iteration) +- `cargo nextest run -p ` - Run tests for a specific package +- `cargo nextest run -p ` - Run individual test (preferred during development iteration) - `FORK_NAME=electra cargo nextest run -p beacon_chain` - Run tests for specific fork **Note**: Full test suite takes ~20 minutes. When iterating, prefer running individual tests. 
-### Linting and Code Quality +### Linting and Code Quality + - `make lint` - Run Clippy linter with project-specific rules - `make lint-full` - Run comprehensive linting including tests (recommended for thorough checking) - `make cargo-fmt` - Check code formatting with rustfmt @@ -33,8 +36,9 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - `make audit` - Run security audit on dependencies ### Cross-compilation + - `make build-x86_64` - Cross-compile for x86_64 Linux -- `make build-aarch64` - Cross-compile for ARM64 Linux +- `make build-aarch64` - Cross-compile for ARM64 Linux - `make build-riscv64` - Cross-compile for RISC-V 64-bit Linux ## Architecture Overview @@ -44,13 +48,15 @@ Lighthouse is a modular Ethereum consensus client with two main components: ### Core Components **Beacon Node** (`beacon_node/`) + - Main consensus client that syncs with the Ethereum network - Contains the beacon chain state transition logic (`beacon_node/beacon_chain/`) - Handles networking, storage, and P2P communication - Provides HTTP API for validator clients and external tools - Entry point: `beacon_node/src/lib.rs` -**Validator Client** (`validator_client/`) +**Validator Client** (`validator_client/`) + - Manages validator keystores and performs validator duties - Connects to beacon nodes via HTTP API - Handles block proposals, attestations, and sync committee duties @@ -60,31 +66,37 @@ Lighthouse is a modular Ethereum consensus client with two main components: ### Key Subsystems **Consensus Types** (`consensus/types/`) + - Core Ethereum consensus data structures (BeaconState, BeaconBlock, etc.) 
- Ethereum specification implementations for different networks (mainnet, gnosis) - SSZ encoding/decoding and state transition primitives **Storage** (`beacon_node/store/`) + - Hot/cold database architecture for efficient beacon chain storage - Supports multiple backends (LevelDB, RocksDB, REDB) - Handles state pruning and historical data management **Networking** (`beacon_node/lighthouse_network/`, `beacon_node/network/`) + - Libp2p-based P2P networking stack - Gossipsub for message propagation - Discovery v5 for peer discovery - Request/response protocols for sync **Fork Choice** (`consensus/fork_choice/`, `consensus/proto_array/`) + - Implements Ethereum's fork choice algorithm (proto-array) - Manages chain reorganizations and finality **Execution Layer Integration** (`beacon_node/execution_layer/`) + - Interfaces with execution clients - Retrieves payloads from local execution layer or external block builders - Handles payload validation and builder integration **Slasher** (`slasher/`) + - Optional slashing detection service - Supports LMDB, MDBX, and REDB database backends - Can be enabled with `--slasher` flag @@ -120,6 +132,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: ## Common Review Standards ### CI/Testing Requirements + - All checks must pass before merge - Test coverage expected for significant changes - Flaky tests are actively addressed and fixed @@ -127,12 +140,14 @@ Lighthouse is a modular Ethereum consensus client with two main components: - `beacon_chain` and `http_api` tests support fork-specific testing using `FORK_NAME` env var when `beacon_chain/fork_from_env` feature is enabled ### Code Quality Standards + - Clippy warnings must be fixed promptly (multiple PRs show this pattern) - Code formatting with `cargo fmt` enforced - Must run `cargo sort` when adding dependencies - dependency order is enforced on CI - Performance considerations for hot paths ### Documentation and Context + - PRs require clear 
descriptions of what and why - Breaking changes need migration documentation - API changes require documentation updates @@ -140,6 +155,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Comments appreciated for complex logic ### Security and Safety + - Careful review of consensus-critical code paths - Error handling patterns must be comprehensive - Input validation for external data @@ -147,6 +163,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: ## Development Patterns and Best Practices ### Panics and Error Handling + - **Panics should be avoided at all costs** - Always prefer returning a `Result` or `Option` over causing a panic (e.g., prefer `array.get(1)?` over `array[1]`) - Avoid `expect` or `unwrap` at runtime - only acceptable during startup when validating CLI flags or configurations @@ -154,18 +171,22 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Use proper error handling with `Result` types and graceful error propagation ### Rayon Usage + - Avoid using the rayon global thread pool as it results in CPU oversubscription when the beacon processor has fully allocated all CPUs to workers - Use scoped rayon pools started by beacon processor for computational intensive tasks ### Locks + - Take great care to avoid deadlocks when working with fork choice locks - seek detailed review ([reference](beacon_node/beacon_chain/src/canonical_head.rs:9)) - Keep lock scopes as narrow as possible to avoid blocking fast-responding functions like the networking stack ### Async Patterns + - Avoid blocking computations in async tasks - Spawn a blocking task instead for CPU-intensive work ### Tracing + - Design spans carefully and avoid overuse of spans just to add context data to events - Avoid using spans on simple getter methods as it can result in performance overhead - Be cautious of span explosion with recursive functions @@ -173,14 +194,17 @@ Lighthouse is a modular Ethereum 
consensus client with two main components: - Avoid using `span.enter()` or `span.entered()` in async tasks ### Database + - Maintain schema continuity on `unstable` branch - Database migrations must be backward compatible ### Consensus Crate + - Use safe math methods like `saturating_xxx` or `checked_xxx` - Critical that this crate behaves deterministically and MUST not have undefined behavior ### Testing Patterns + - **Use appropriate test types for the right scenarios**: - **Unit tests** for single component edge cases and isolated logic - **Integration tests** using [`BeaconChainHarness`](beacon_node/beacon_chain/src/test_utils.rs:668) for end-to-end workflows @@ -204,6 +228,7 @@ Lighthouse is a modular Ethereum consensus client with two main components: - See [`scripts/local_testnet/README.md`](scripts/local_testnet/README.md) for setup instructions ### TODOs and Comments + - All `TODO` statements must be accompanied by a GitHub issue link - Prefer line (`//`) comments to block comments (`/* ... */`) - Use doc comments (`///`) before attributes for public items @@ -211,7 +236,9 @@ Lighthouse is a modular Ethereum consensus client with two main components: - Provide examples in doc comments for public APIs when helpful ## Logging Guidelines + Use appropriate log levels for different scenarios: + - **`crit`**: Critical issues with major impact to Lighthouse functionality - Lighthouse may not function correctly without resolving. Needs immediate attention. - **`error`**: Error cases that may have moderate impact to Lighthouse functionality. Expect to receive reports from users for this level. - **`warn`**: Unexpected code paths that don't have major impact - fully recoverable. Expect user reports if excessive warning logs occur. 
@@ -221,6 +248,7 @@ Use appropriate log levels for different scenarios: ## Code Examples ### Safe Math in Consensus Crate + ```rust // ❌ Avoid - could panic let result = a + b; @@ -234,6 +262,7 @@ let result = a.safe_add(b)?; ``` ### Panics and Error Handling + ```rust // ❌ Avoid - could panic at runtime let value = some_result.unwrap(); @@ -253,6 +282,7 @@ let item = array.get(1).expect("Array always has at least 2 elements due to vali ``` ### TODO Format + ```rust pub fn my_function(&mut self, _something: &[u8]) -> Result { // TODO: Implement proper validation here @@ -261,6 +291,7 @@ pub fn my_function(&mut self, _something: &[u8]) -> Result { ``` ### Async Task Spawning for Blocking Work + ```rust // ❌ Avoid - blocking in async context async fn some_handler() { @@ -276,6 +307,7 @@ async fn some_handler() { ``` ### Tracing Span Usage + ```rust // ❌ Avoid - span on simple getter #[instrument] @@ -291,9 +323,10 @@ async fn process_block(&self, block: Block) -> Result<(), Error> { ``` ## Build and Development Notes -- Full builds and tests take 5+ minutes - use large timeouts (300s+) for any `cargo build`, `cargo test`, or `make` commands + +- Full builds and tests take 5+ minutes - use large timeouts (300s+) for any `cargo build`, `cargo nextest`, or `make` commands - Use `cargo check` for faster iteration during development and always run after code changes +- Prefer targeted package tests (`cargo nextest run -p `) and individual tests over full test suite when debugging specific issues - Use `cargo fmt --all && make lint-fix` to format code and fix linting issues once a task is complete -- Prefer targeted package tests (`cargo test -p `) and individual tests over full test suite when debugging specific issues - Always understand the broader codebase patterns before making changes -- Minimum Supported Rust Version (MSRV) is documented in `lighthouse/Cargo.toml` - ensure Rust version meets or exceeds this requirement \ No newline at end of file +- Minimum 
Supported Rust Version (MSRV) is documented in `lighthouse/Cargo.toml` - ensure Rust version meets or exceeds this requirement diff --git a/Cargo.lock b/Cargo.lock index 1bd65e1721..31cccc6a98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -918,7 +918,7 @@ dependencies = [ [[package]] name = "beacon_node" -version = "7.1.0" +version = "8.0.0-rc.1" dependencies = [ "account_utils", "beacon_chain", @@ -936,6 +936,7 @@ dependencies = [ "hyper 1.6.0", "lighthouse_network", "monitoring_api", + "network_utils", "node_test_rig", "sensitive_url", "serde_json", @@ -945,7 +946,6 @@ dependencies = [ "task_executor", "tracing", "types", - "unused_port", ] [[package]] @@ -1165,9 +1165,9 @@ dependencies = [ [[package]] name = "blst" -version = "0.3.14" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47c79a94619fade3c0b887670333513a67ac28a6a7e653eb260bf0d4103db38d" +checksum = "dcdb4c7013139a150f9fc55d123186dbfaba0d912817466282c73ac49e71fb45" dependencies = [ "cc", "glob", @@ -1193,7 +1193,7 @@ dependencies = [ [[package]] name = "boot_node" -version = "7.1.0" +version = "8.0.0-rc.1" dependencies = [ "beacon_node", "bytes", @@ -1205,6 +1205,7 @@ dependencies = [ "lighthouse_network", "log", "logging", + "network_utils", "serde", "tokio", "tracing", @@ -1295,11 +1296,10 @@ dependencies = [ [[package]] name = "c-kzg" -version = "2.1.0" +version = "2.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7e3c397401eb76228c89561cf22f85f41c95aa799ee9d860de3ea1cbc728fc" +checksum = "e00bf4b112b07b505472dbefd19e37e53307e2bfed5a79e0cc161d58ccd0e687" dependencies = [ - "arbitrary", "blst", "cc", "glob", @@ -1809,6 +1809,16 @@ dependencies = [ "libc", ] +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + 
[[package]] name = "core-foundation-sys" version = "0.8.7" @@ -2384,9 +2394,9 @@ dependencies = [ [[package]] name = "discv5" -version = "0.9.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4b4e7798d2ff74e29cee344dc490af947ae657d6ab5273dde35d58ce06a4d71" +checksum = "a20b702c8491b3325866a4935d0b5101e49144d74540384243b6293794aad6fa" dependencies = [ "aes 0.8.4", "aes-gcm", @@ -2554,9 +2564,9 @@ dependencies = [ [[package]] name = "eip4844" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0067055675ea62c0287d520099d9a560f5ad4fd0c00956da99bbb2a68ad2bfc9" +checksum = "82ab45fc63db6bbe5c3eb7c79303b2aff7ee529c991b2111c46879d1ea38407e" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2579,9 +2589,9 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "ekzg-bls12-381" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef34382b1387ebc5acb0d509ab88401beade921af5982142778ae0c200f71edf" +checksum = "05c599a59deba6188afd9f783507e4d89efc997f0fa340a758f0d0992b322416" dependencies = [ "blst", "blstrs", @@ -2593,9 +2603,9 @@ dependencies = [ [[package]] name = "ekzg-erasure-codes" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fa58fcb3f698451a3a1ceb5f4a13ea7a4decab9f0bad63ee1690671b12b901c" +checksum = "8474a41a30ddd2b651798b1aa9ce92011207c3667186fe9044184683250109e7" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2603,15 +2613,15 @@ dependencies = [ [[package]] name = "ekzg-maybe-rayon" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce7a570aaa7eb80ea92637f7153a8cd4c20640a3043146b57590ab4ae8eb0e9" +checksum = "9cf94d1385185c1f7caef4973be49702c7d9ffdeaf832d126dbb9ed6efe09d40" [[package]] name = 
"ekzg-multi-open" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51a24896816c59dde1cf08b67480114edb9df1738b7f4f99ec51f7ce0e2dfaa0" +checksum = "e6d37456a32cf79bdbddd6685a2adec73210e2d60332370bc0e9a502b6d93beb" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2621,9 +2631,9 @@ dependencies = [ [[package]] name = "ekzg-polynomial" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6520b5210781436d42ec6cb2e3a278573f1af10707b92502f5329ec967d30018" +checksum = "704751bac85af4754bb8a14457ef24d820738062d0b6f3763534d0980b1a1e81" dependencies = [ "ekzg-bls12-381", "ekzg-maybe-rayon", @@ -2631,9 +2641,9 @@ dependencies = [ [[package]] name = "ekzg-serialization" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf1197575ca1babbd7861424e7c5339233fa8215cf8b1ee9188a2c354f34b6a" +checksum = "3cb983d9f75b2804c00246def8d52c01cf05f70c22593b8d314fbcf0cf89042b" dependencies = [ "ekzg-bls12-381", "hex", @@ -2641,9 +2651,9 @@ dependencies = [ [[package]] name = "ekzg-single-open" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f6e471860c94135d9075562a991c4456c4148efdac2bfccc64e1bf3fd074beb" +checksum = "799d5806d51e1453fa0f528d6acf4127e2a89e98312c826151ebc24ee3448ec3" dependencies = [ "ekzg-bls12-381", "ekzg-polynomial", @@ -2652,9 +2662,9 @@ dependencies = [ [[package]] name = "ekzg-trusted-setup" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1b016cc437c85ece6d54ecfe51b745516e520b388beb2b09a5196748bab21f3" +checksum = "85314d56718dc2c6dd77c3b3630f1839defcb6f47d9c20195608a0f7976095ab" dependencies = [ "ekzg-bls12-381", "ekzg-serialization", @@ -3304,6 +3314,7 @@ dependencies = [ "futures", "hex", "logging", + "network_utils", "reqwest 
0.11.27", "sensitive_url", "serde_json", @@ -3311,7 +3322,6 @@ dependencies = [ "tempfile", "tokio", "types", - "unused_port", ] [[package]] @@ -4290,6 +4300,7 @@ dependencies = [ "lru", "metrics", "network", + "network_utils", "operation_pool", "parking_lot 0.12.3", "proto_array", @@ -4324,6 +4335,7 @@ dependencies = [ "logging", "malloc_utils", "metrics", + "network_utils", "reqwest 0.11.27", "serde", "slot_clock", @@ -4642,7 +4654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdf9d64cfcf380606e64f9a0bcf493616b65331199f984151a6fa11a7b3cde38" dependencies = [ "async-io", - "core-foundation", + "core-foundation 0.9.4", "fnv", "futures", "if-addrs", @@ -5038,7 +5050,7 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "lcli" -version = "7.1.0" +version = "8.0.0-rc.1" dependencies = [ "account_utils", "beacon_chain", @@ -5058,6 +5070,7 @@ dependencies = [ "lighthouse_version", "log", "malloc_utils", + "network_utils", "rayon", "serde", "serde_json", @@ -5097,9 +5110,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.171" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" @@ -5547,7 +5560,7 @@ dependencies = [ [[package]] name = "lighthouse" -version = "7.1.0" +version = "8.0.0-rc.1" dependencies = [ "account_manager", "account_utils", @@ -5573,6 +5586,7 @@ dependencies = [ "logging", "malloc_utils", "metrics", + "network_utils", "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", @@ -5589,7 +5603,6 @@ dependencies = [ "tracing-opentelemetry", "tracing-subscriber", "types", - "unused_port", "validator_client", "validator_dir", "validator_manager", @@ -5625,6 +5638,7 @@ dependencies = [ "lru", "lru_cache", "metrics", + "network_utils", 
"parking_lot 0.12.3", "prometheus-client", "quickcheck", @@ -5640,15 +5654,12 @@ dependencies = [ "superstruct", "task_executor", "tempfile", - "tiny-keccak", "tokio", - "tokio-io-timeout", "tokio-util", "tracing", "tracing-subscriber", "types", "unsigned-varint 0.8.0", - "unused_port", ] [[package]] @@ -5687,7 +5698,6 @@ version = "0.1.0" dependencies = [ "git-version", "regex", - "target_info", ] [[package]] @@ -5865,11 +5875,11 @@ checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" [[package]] name = "matchers" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" dependencies = [ - "regex-automata 0.1.10", + "regex-automata", ] [[package]] @@ -6201,7 +6211,7 @@ dependencies = [ "openssl-probe", "openssl-sys", "schannel", - "security-framework", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] @@ -6347,6 +6357,21 @@ dependencies = [ "types", ] +[[package]] +name = "network_utils" +version = "0.1.0" +dependencies = [ + "discv5", + "hex", + "libp2p-identity", + "lru_cache", + "metrics", + "multiaddr", + "parking_lot 0.12.3", + "serde", + "tiny-keccak", +] + [[package]] name = "nix" version = "0.24.3" @@ -6425,12 +6450,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.46.0" +version = "0.50.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" dependencies = [ - "overload", - "winapi", + "windows-sys 0.52.0", ] [[package]] @@ -6754,12 +6778,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "p256" version = "0.13.2" @@ -7322,7 +7340,7 @@ dependencies = [ "rand 0.8.5", "rand_chacha 0.3.1", "rand_xorshift 0.3.0", - "regex-syntax 0.8.5", + "regex-syntax", "rusty-fork", "tempfile", "unarray", @@ -7356,7 +7374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.100", @@ -7712,17 +7730,8 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.9", - "regex-syntax 0.8.5", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax 0.6.29", + "regex-automata", + "regex-syntax", ] [[package]] @@ -7733,15 +7742,9 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.5", + "regex-syntax", ] -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" version = "0.8.5" @@ -8009,9 +8012,9 @@ dependencies = [ [[package]] name = "rust_eth_kzg" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c552fbda8be95ddcbebc9ebcb198cb9fe97e538450bcb7476ce5d9e03c499ff" +checksum = "1522b7a740cd7f5bc52ea49863618511c8de138dcdf3f8a80b15b3f764942a5b" dependencies = [ "eip4844", "ekzg-bls12-381", @@ -8147,6 +8150,7 @@ version = "0.23.23" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "47796c98c480fce5406ef69d1c76378375492c3b0a0de587be0c1d9feb12f395" dependencies = [ + "log", "once_cell", "ring", "rustls-pki-types", @@ -8155,6 +8159,18 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework 3.3.0", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" @@ -8253,6 +8269,8 @@ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "safe_arith" version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b147bb6111014916d3ef9d4c85173124a8e12193a67f6176d67244afd558d6c1" [[package]] name = "salsa20" @@ -8404,7 +8422,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ "bitflags 2.9.0", - "core-foundation", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" +dependencies = [ + "bitflags 2.9.0", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -9035,16 +9066,16 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "superstruct" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0f31f730ad9e579364950e10d6172b4a9bd04b447edf5988b066a860cc340e" +checksum = "3b986e4a629907f20a2c2a639a75bc22a8b5d99b444e0d83c395f4cb309022bf" dependencies = [ - "darling 0.13.4", - "itertools 0.10.5", + 
"darling 0.20.10", + "itertools 0.13.0", "proc-macro2", "quote", "smallvec", - "syn 1.0.109", + "syn 2.0.100", ] [[package]] @@ -9127,7 +9158,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" dependencies = [ "bitflags 1.3.2", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.5.0", ] @@ -9138,7 +9169,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ "bitflags 2.9.0", - "core-foundation", + "core-foundation 0.9.4", "system-configuration-sys 0.6.0", ] @@ -9167,6 +9198,8 @@ name = "system_health" version = "0.1.0" dependencies = [ "lighthouse_network", + "metrics", + "network_utils", "parking_lot 0.12.3", "serde", "sysinfo", @@ -9192,12 +9225,6 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "target_info" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c63f48baada5c52e65a29eef93ab4f8982681b67f9e8d29c7b05abcfec2b9ffe" - [[package]] name = "task_executor" version = "0.1.0" @@ -9205,6 +9232,8 @@ dependencies = [ "async-channel 1.9.0", "futures", "metrics", + "num_cpus", + "rayon", "tokio", "tracing", ] @@ -9461,16 +9490,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "tokio-io-timeout" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" -dependencies = [ - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-macros" version = "2.5.0" @@ -9513,6 +9532,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-rustls" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +dependencies = [ + "rustls 0.23.23", + "tokio", +] + 
[[package]] name = "tokio-stream" version = "0.1.17" @@ -9616,7 +9645,9 @@ dependencies = [ "percent-encoding", "pin-project", "prost", + "rustls-native-certs", "tokio", + "tokio-rustls 0.26.2", "tokio-stream", "tower 0.5.2", "tower-layer", @@ -9771,14 +9802,14 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.19" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" +checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" dependencies = [ "matchers", "nu-ansi-term", "once_cell", - "regex", + "regex-automata", "serde", "serde_json", "sharded-slab", @@ -10014,14 +10045,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "unused_port" -version = "0.1.0" -dependencies = [ - "lru_cache", - "parking_lot 0.12.3", -] - [[package]] name = "url" version = "2.5.4" @@ -11144,7 +11167,7 @@ dependencies = [ [[package]] name = "xdelta3" version = "0.1.5" -source = "git+http://github.com/sigp/xdelta3-rs?rev=4db64086bb02e9febb584ba93b9d16bb2ae3825a#4db64086bb02e9febb584ba93b9d16bb2ae3825a" +source = "git+https://github.com/sigp/xdelta3-rs?rev=4db64086bb02e9febb584ba93b9d16bb2ae3825a#4db64086bb02e9febb584ba93b9d16bb2ae3825a" dependencies = [ "bindgen", "cc", diff --git a/Cargo.toml b/Cargo.toml index 8588be49c0..a46dc355e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ members = [ "common/malloc_utils", "common/metrics", "common/monitoring_api", + "common/network_utils", "common/oneshot_broadcast", "common/pretty_reqwest_error", "common/sensitive_url", @@ -45,7 +46,6 @@ members = [ "common/target_check", "common/task_executor", "common/test_random_derive", - "common/unused_port", "common/validator_dir", "common/warp_utils", "common/workspace_members", @@ -56,7 +56,6 @@ members = [ 
"consensus/int_to_bytes", "consensus/merkle_proof", "consensus/proto_array", - "consensus/safe_arith", "consensus/state_processing", "consensus/swap_or_not_shuffle", "consensus/types", @@ -117,7 +116,7 @@ byteorder = "1" bytes = "1" # Turn off c-kzg's default features which include `blst/portable`. We can turn on blst's portable # feature ourselves when desired. -c-kzg = { version = "2.1.0", default-features = false } +c-kzg = { version = "2.1", default-features = false } cargo_metadata = "0.19" clap = { version = "4.5.4", features = ["derive", "cargo", "wrap_help"] } clap_utils = { path = "common/clap_utils" } @@ -134,7 +133,7 @@ deposit_contract = { path = "common/deposit_contract" } derivative = "2" directory = { path = "common/directory" } dirs = "3" -discv5 = { version = "0.9", features = ["libp2p"] } +discv5 = { version = "0.10", features = ["libp2p"] } doppelganger_service = { path = "validator_client/doppelganger_service" } either = "1.9" environment = { path = "lighthouse/environment" } @@ -194,11 +193,12 @@ mockall_double = "0.3" mockito = "1.5.0" monitoring_api = { path = "common/monitoring_api" } network = { path = "beacon_node/network" } +network_utils = { path = "common/network_utils" } node_test_rig = { path = "testing/node_test_rig" } num_cpus = "1" once_cell = "1.17.1" opentelemetry = "0.30.0" -opentelemetry-otlp = { version = "0.30.0", features = ["grpc-tonic"] } +opentelemetry-otlp = { version = "0.30.0", features = ["grpc-tonic", "tls-roots"] } opentelemetry_sdk = "0.30.0" operation_pool = { path = "beacon_node/operation_pool" } parking_lot = "0.12" @@ -223,8 +223,8 @@ reqwest = { version = "0.11", default-features = false, features = [ ring = "0.17" rpds = "0.11" rusqlite = { version = "0.28", features = ["bundled"] } -rust_eth_kzg = "0.8.0" -safe_arith = { path = "consensus/safe_arith" } +rust_eth_kzg = "0.9" +safe_arith = "0.1" sensitive_url = { path = "common/sensitive_url" } serde = { version = "1", features = ["derive"] } serde_json = "1" 
@@ -241,7 +241,7 @@ ssz_types = "0.11.0" state_processing = { path = "consensus/state_processing" } store = { path = "beacon_node/store" } strum = { version = "0.24", features = ["derive"] } -superstruct = "0.8" +superstruct = "0.10" swap_or_not_shuffle = { path = "consensus/swap_or_not_shuffle" } syn = "1" sysinfo = "0.26" @@ -265,7 +265,6 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tree_hash = "0.10.0" tree_hash_derive = "0.10.0" types = { path = "consensus/types" } -unused_port = { path = "common/unused_port" } url = "2" uuid = { version = "0.8", features = ["serde", "v4"] } validator_client = { path = "validator_client" } @@ -279,7 +278,7 @@ validator_test_rig = { path = "testing/validator_test_rig" } warp = { version = "0.3.7", default-features = false, features = ["tls"] } warp_utils = { path = "common/warp_utils" } workspace_members = { path = "common/workspace_members" } -xdelta3 = { git = "http://github.com/sigp/xdelta3-rs", rev = "4db64086bb02e9febb584ba93b9d16bb2ae3825a" } +xdelta3 = { git = "https://github.com/sigp/xdelta3-rs", rev = "4db64086bb02e9febb584ba93b9d16bb2ae3825a" } zeroize = { version = "1", features = ["zeroize_derive", "serde"] } zip = "0.6" zstd = "0.13" diff --git a/Makefile b/Makefile index 3b9d999e9a..e1e8a2db1f 100644 --- a/Makefile +++ b/Makefile @@ -144,29 +144,18 @@ build-release-tarballs: $(call tarball_release_binary,$(BUILD_PATH_RISCV64),$(RISCV64_TAG),"") + # Runs the full workspace tests in **release**, without downloading any additional # test vectors. test-release: - cargo test --workspace --release --features "$(TEST_FEATURES)" \ - --exclude ef_tests --exclude beacon_chain --exclude slasher --exclude network \ - --exclude http_api - -# Runs the full workspace tests in **release**, without downloading any additional -# test vectors, using nextest. 
-nextest-release: cargo nextest run --workspace --release --features "$(TEST_FEATURES)" \ --exclude ef_tests --exclude beacon_chain --exclude slasher --exclude network \ --exclude http_api + # Runs the full workspace tests in **debug**, without downloading any additional test # vectors. test-debug: - cargo test --workspace --features "$(TEST_FEATURES)" \ - --exclude ef_tests --exclude beacon_chain --exclude network --exclude http_api - -# Runs the full workspace tests in **debug**, without downloading any additional test -# vectors, using nextest. -nextest-debug: cargo nextest run --workspace --features "$(TEST_FEATURES)" \ --exclude ef_tests --exclude beacon_chain --exclude network --exclude http_api @@ -178,15 +167,9 @@ cargo-fmt: check-benches: cargo check --workspace --benches --features "$(TEST_FEATURES)" -# Runs only the ef-test vectors. -run-ef-tests: - rm -rf $(EF_TESTS)/.accessed_file_log.txt - cargo test --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES)" - cargo test --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES),fake_crypto" - ./$(EF_TESTS)/check_all_files_accessed.py $(EF_TESTS)/.accessed_file_log.txt $(EF_TESTS)/consensus-spec-tests -# Runs EF test vectors with nextest -nextest-run-ef-tests: +# Runs EF test vectors +run-ef-tests: rm -rf $(EF_TESTS)/.accessed_file_log.txt cargo nextest run --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES)" cargo nextest run --release -p ef_tests --features "ef_tests,$(EF_TEST_FEATURES),fake_crypto" @@ -199,11 +182,11 @@ test-beacon-chain: $(patsubst %,test-beacon-chain-%,$(FORKS_BEFORE_GLOAS)) test-beacon-chain-%: env FORK_NAME=$* cargo nextest run --release --features "fork_from_env,slasher/lmdb,$(TEST_FEATURES)" -p beacon_chain -# Run the tests in the `beacon_chain` crate for all known forks. -test-http-api: $(patsubst %,test-beacon-chain-%,$(RECENT_FORKS)) +# Run the tests in the `http_api` crate for recent forks. 
+test-http-api: $(patsubst %,test-http-api-%,$(RECENT_FORKS)) test-http-api-%: - env FORK_NAME=$* cargo nextest run --release --features "fork_from_env,slasher/lmdb,$(TEST_FEATURES)" -p http_api + env FORK_NAME=$* cargo nextest run --release --features "beacon_chain/fork_from_env" -p http_api # Run the tests in the `operation_pool` crate for all known forks. @@ -240,9 +223,6 @@ test-ef: make-ef-tests run-ef-tests # Downloads and runs the nightly EF test vectors. test-ef-nightly: make-ef-tests-nightly run-ef-tests -# Downloads and runs the EF test vectors with nextest. -nextest-ef: make-ef-tests nextest-run-ef-tests - # Runs tests checking interop between Lighthouse and execution clients. test-exec-engine: make -C $(EXECUTION_ENGINE_INTEGRATION) test @@ -276,6 +256,7 @@ lint: -D clippy::fn_to_numeric_cast_any \ -D clippy::manual_let_else \ -D clippy::large_stack_frames \ + -D clippy::disallowed_methods \ -D warnings \ -A clippy::derive_partial_eq_without_eq \ -A clippy::upper-case-acronyms \ diff --git a/beacon_node/Cargo.toml b/beacon_node/Cargo.toml index 456376e79b..8e2c598fd4 100644 --- a/beacon_node/Cargo.toml +++ b/beacon_node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "beacon_node" -version = "7.1.0" +version = "8.0.0-rc.1" authors = [ "Paul Hauner ", "Age Manning BeaconBlockStreamer { if self.check_caches == CheckCaches::Yes { match self.beacon_chain.get_block_process_status(&root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => { metrics::inc_counter(&metrics::BEACON_REQRESP_PRE_IMPORT_CACHE_HITS); Some(block) diff --git a/beacon_node/beacon_chain/src/beacon_chain.rs b/beacon_node/beacon_chain/src/beacon_chain.rs index 3a511dacd4..760e447b75 100644 --- a/beacon_node/beacon_chain/src/beacon_chain.rs +++ b/beacon_node/beacon_chain/src/beacon_chain.rs @@ -5,8 +5,9 @@ use crate::attestation_verification::{ }; use 
crate::attester_cache::{AttesterCache, AttesterCacheKey}; use crate::beacon_block_streamer::{BeaconBlockStreamer, CheckCaches}; -use crate::beacon_proposer_cache::BeaconProposerCache; -use crate::beacon_proposer_cache::compute_proposer_duties_from_head; +use crate::beacon_proposer_cache::{ + BeaconProposerCache, EpochBlockProposers, ensure_state_can_determine_proposers_for_epoch, +}; use crate::blob_verification::{GossipBlobError, GossipVerifiedBlob}; use crate::block_times_cache::BlockTimesCache; use crate::block_verification::POS_PANDA_BANNER; @@ -124,7 +125,7 @@ use store::{ BlobSidecarListFromRoot, DBColumn, DatabaseBlock, Error as DBError, HotColdDB, HotStateSummary, KeyValueStore, KeyValueStoreOp, StoreItem, StoreOp, }; -use task_executor::{ShutdownReason, TaskExecutor}; +use task_executor::{RayonPoolType, ShutdownReason, TaskExecutor}; use tokio_stream::Stream; use tracing::{Span, debug, debug_span, error, info, info_span, instrument, trace, warn}; use tree_hash::TreeHash; @@ -334,16 +335,12 @@ pub enum BlockProcessStatus { /// Block is not in any pre-import cache. Block may be in the data-base or in the fork-choice. Unknown, /// Block is currently processing but not yet validated. - NotValidated(Arc>), + NotValidated(Arc>, BlockImportSource), /// Block is fully valid, but not yet imported. It's cached in the da_checker while awaiting /// missing block components. ExecutionValidated(Arc>), } -pub struct BeaconChainMetrics { - pub reqresp_pre_import_cache_len: usize, -} - pub type LightClientProducerEvent = (Hash256, Slot, SyncAggregate); pub type BeaconForkChoice = ForkChoice< @@ -363,9 +360,6 @@ pub type BeaconStore = Arc< >, >; -/// Cache gossip verified blocks to serve over ReqResp before they are imported -type ReqRespPreImportCache = HashMap>>; - /// Represents the "Beacon Chain" component of Ethereum 2.0. Allows import of blocks and block /// operations and chooses a canonical head. 
pub struct BeaconChain { @@ -462,8 +456,6 @@ pub struct BeaconChain { pub(crate) attester_cache: Arc, /// A cache used when producing attestations whilst the head block is still being imported. pub early_attester_cache: EarlyAttesterCache, - /// Cache gossip verified blocks to serve over ReqResp before they are imported - pub reqresp_pre_import_cache: Arc>>, /// A cache used to keep track of various block timings. pub block_times_cache: Arc>, /// A cache used to track pre-finalization block roots for quick rejection. @@ -1289,18 +1281,8 @@ impl BeaconChain { /// chain. Used by sync to learn the status of a block and prevent repeated downloads / /// processing attempts. pub fn get_block_process_status(&self, block_root: &Hash256) -> BlockProcessStatus { - if let Some(block) = self - .data_availability_checker - .get_execution_valid_block(block_root) - { - return BlockProcessStatus::ExecutionValidated(block); - } - - if let Some(block) = self.reqresp_pre_import_cache.read().get(block_root) { - // A block is on the `reqresp_pre_import_cache` but NOT in the - // `data_availability_checker` only if it is actively processing. We can expect a future - // event with the result of processing - return BlockProcessStatus::NotValidated(block.clone()); + if let Some(cached_block) = self.data_availability_checker.get_cached_block(block_root) { + return cached_block; } BlockProcessStatus::Unknown @@ -1437,6 +1419,7 @@ impl BeaconChain { /// /// Returns `None` when the state is not found in the database or there is an error skipping /// to a future state. 
+ #[instrument(level = "debug", skip_all)] pub fn state_at_slot( &self, slot: Slot, @@ -3053,8 +3036,7 @@ impl BeaconChain { self.emit_sse_blob_sidecar_events(&block_root, std::iter::once(blob.as_blob())); - let r = self.check_gossip_blob_availability_and_import(blob).await; - self.remove_notified(&block_root, r) + self.check_gossip_blob_availability_and_import(blob).await } /// Cache the data columns in the processing cache, process it, then evict it from the cache if it was @@ -3091,15 +3073,13 @@ impl BeaconChain { data_columns.iter().map(|column| column.as_data_column()), ); - let r = self - .check_gossip_data_columns_availability_and_import( - slot, - block_root, - data_columns, - publish_fn, - ) - .await; - self.remove_notified(&block_root, r) + self.check_gossip_data_columns_availability_and_import( + slot, + block_root, + data_columns, + publish_fn, + ) + .await } /// Cache the blobs in the processing cache, process it, then evict it from the cache if it was @@ -3138,10 +3118,8 @@ impl BeaconChain { self.emit_sse_blob_sidecar_events(&block_root, blobs.iter().flatten().map(Arc::as_ref)); - let r = self - .check_rpc_blob_availability_and_import(slot, block_root, blobs) - .await; - self.remove_notified(&block_root, r) + self.check_rpc_blob_availability_and_import(slot, block_root, blobs) + .await } /// Process blobs retrieved from the EL and returns the `AvailabilityProcessingStatus`. 
@@ -3173,10 +3151,8 @@ impl BeaconChain { } } - let r = self - .check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output) - .await; - self.remove_notified(&block_root, r) + self.check_engine_blobs_availability_and_import(slot, block_root, engine_get_blobs_output) + .await } fn emit_sse_blob_sidecar_events<'a, I>(self: &Arc, block_root: &Hash256, blobs_iter: I) @@ -3269,10 +3245,8 @@ impl BeaconChain { custody_columns.iter().map(|column| column.as_ref()), ); - let r = self - .check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns) - .await; - self.remove_notified(&block_root, r) + self.check_rpc_custody_columns_availability_and_import(slot, block_root, custody_columns) + .await } pub async fn reconstruct_data_columns( @@ -3298,15 +3272,15 @@ impl BeaconChain { let data_availability_checker = self.data_availability_checker.clone(); + let current_span = Span::current(); let result = self .task_executor - .spawn_blocking_handle( - move || data_availability_checker.reconstruct_data_columns(&block_root), - "reconstruct_data_columns", - ) - .ok_or(BeaconChainError::RuntimeShutdown)? 
+ .spawn_blocking_with_rayon_async(RayonPoolType::HighPriority, move || { + let _guard = current_span.enter(); + data_availability_checker.reconstruct_data_columns(&block_root) + }) .await - .map_err(BeaconChainError::TokioJoin)??; + .map_err(|_| BeaconChainError::RuntimeShutdown)??; match result { DataColumnReconstructionResult::Success((availability, data_columns_to_publish)) => { @@ -3315,10 +3289,8 @@ impl BeaconChain { return Ok(None); }; - let r = self - .process_availability(slot, availability, || Ok(())) - .await; - self.remove_notified(&block_root, r) + self.process_availability(slot, availability, || Ok(())) + .await .map(|availability_processing_status| { Some((availability_processing_status, data_columns_to_publish)) }) @@ -3335,46 +3307,6 @@ impl BeaconChain { } } - /// Remove any block components from the *processing cache* if we no longer require them. If the - /// block was imported full or erred, we no longer require them. - fn remove_notified( - &self, - block_root: &Hash256, - r: Result, - ) -> Result { - let has_missing_components = - matches!(r, Ok(AvailabilityProcessingStatus::MissingComponents(_, _))); - if !has_missing_components { - self.reqresp_pre_import_cache.write().remove(block_root); - } - r - } - - /// Wraps `process_block` in logic to cache the block's commitments in the processing cache - /// and evict if the block was imported or errored. - pub async fn process_block_with_early_caching>( - self: &Arc, - block_root: Hash256, - unverified_block: B, - block_source: BlockImportSource, - notify_execution_layer: NotifyExecutionLayer, - ) -> Result { - self.reqresp_pre_import_cache - .write() - .insert(block_root, unverified_block.block_cloned()); - - let r = self - .process_block( - block_root, - unverified_block, - notify_execution_layer, - block_source, - || Ok(()), - ) - .await; - self.remove_notified(&block_root, r) - } - /// Check for known and configured invalid block roots before processing. 
pub fn check_invalid_block_roots(&self, block_root: Hash256) -> Result<(), BlockError> { if self.config.invalid_block_roots.contains(&block_root) { @@ -3406,12 +3338,6 @@ impl BeaconChain { block_source: BlockImportSource, publish_fn: impl FnOnce() -> Result<(), BlockError>, ) -> Result { - // Start the Prometheus timer. - let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); - - // Increment the Prometheus counter for block processing requests. - metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS); - let block_slot = unverified_block.block().slot(); // Set observed time if not already set. Usually this should be set by gossip or RPC, @@ -3426,6 +3352,18 @@ impl BeaconChain { ); } + self.data_availability_checker.put_pre_execution_block( + block_root, + unverified_block.block_cloned(), + block_source, + )?; + + // Start the Prometheus timer. + let _full_timer = metrics::start_timer(&metrics::BLOCK_PROCESSING_TIMES); + + // Increment the Prometheus counter for block processing requests. + metrics::inc_counter(&metrics::BLOCK_PROCESSING_REQUESTS); + // A small closure to group the verification and import errors. let chain = self.clone(); let import_block = async move { @@ -3443,7 +3381,18 @@ impl BeaconChain { .set_time_consensus_verified(block_root, block_slot, timestamp) } - let executed_block = chain.into_executed_block(execution_pending).await?; + let executed_block = chain + .into_executed_block(execution_pending) + .await + .inspect_err(|_| { + // If the block fails execution for whatever reason (e.g. engine offline), + // and we keep it in the cache, then the node will NOT perform lookup and + // reprocess this block until the block is evicted from DA checker, causing the + // chain to get stuck temporarily if the block is canonical. Therefore we remove + // it from the cache if execution fails. 
+ self.data_availability_checker + .remove_block_on_execution_error(&block_root); + })?; // Record the *additional* time it took to wait for execution layer verification. if let Some(timestamp) = self.slot_clock.now_duration() { @@ -3569,9 +3518,7 @@ impl BeaconChain { block: AvailabilityPendingExecutedBlock, ) -> Result { let slot = block.block.slot(); - let availability = self - .data_availability_checker - .put_pending_executed_block(block)?; + let availability = self.data_availability_checker.put_executed_block(block)?; self.process_availability(slot, availability, || Ok(())) .await } @@ -3815,19 +3762,6 @@ impl BeaconChain { .await?? }; - // Remove block components from da_checker AFTER completing block import. Then we can assert - // the following invariant: - // > A valid unfinalized block is either in fork-choice or da_checker. - // - // If we remove the block when it becomes available, there's some time window during - // `import_block` where the block is nowhere. Consumers of the da_checker can handle the - // extend time a block may exist in the da_checker. - // - // If `import_block` errors (only errors with internal errors), the pending components will - // be pruned on data_availability_checker maintenance as finality advances. - self.data_availability_checker - .remove_pending_components(block_root); - Ok(AvailabilityProcessingStatus::Imported(block_root)) } @@ -3901,9 +3835,16 @@ impl BeaconChain { .map_err(BeaconChainError::from)?; } + // Take an upgradable read lock on fork choice so we can check if this block has already + // been imported. We don't want to repeat work importing a block that is already imported. + let fork_choice_reader = self.canonical_head.fork_choice_upgradable_read_lock(); + if fork_choice_reader.contains_block(&block_root) { + return Err(BlockError::DuplicateFullyImported(block_root)); + } + // Take an exclusive write-lock on fork choice. 
It's very important to prevent deadlocks by // avoiding taking other locks whilst holding this lock. - let mut fork_choice = self.canonical_head.fork_choice_write_lock(); + let mut fork_choice = parking_lot::RwLockUpgradableReadGuard::upgrade(fork_choice_reader); // Do not import a block that doesn't descend from the finalized root. let signed_block = @@ -4016,7 +3957,7 @@ impl BeaconChain { // See https://github.com/sigp/lighthouse/issues/2028 let (_, signed_block, block_data) = signed_block.deconstruct(); - match self.get_blobs_or_columns_store_op(block_root, block_data) { + match self.get_blobs_or_columns_store_op(block_root, signed_block.slot(), block_data) { Ok(Some(blobs_or_columns_store_op)) => { ops.push(blobs_or_columns_store_op); } @@ -4479,6 +4420,7 @@ impl BeaconChain { } /// If configured, wait for the fork choice run at the start of the slot to complete. + #[instrument(level = "debug", skip_all)] fn wait_for_fork_choice_before_block_production( self: &Arc, slot: Slot, @@ -4541,10 +4483,15 @@ impl BeaconChain { // // Load the parent state from disk. let chain = self.clone(); + let span = Span::current(); let (state, state_root_opt) = self .task_executor .spawn_blocking_handle( - move || chain.load_state_for_block_production(slot), + move || { + let _guard = + debug_span!(parent: span, "load_state_for_block_production").entered(); + chain.load_state_for_block_production(slot) + }, "load_state_for_block_production", ) .ok_or(BlockProductionError::ShuttingDown)? @@ -4631,6 +4578,7 @@ impl BeaconChain { /// Fetch the beacon state to use for producing a block if a 1-slot proposer re-org is viable. /// /// This function will return `None` if proposer re-orgs are disabled. + #[instrument(skip_all, level = "debug")] fn get_state_for_re_org( &self, slot: Slot, @@ -4751,65 +4699,54 @@ impl BeaconChain { // Compute the proposer index. 
let head_epoch = cached_head.head_slot().epoch(T::EthSpec::slots_per_epoch()); - let shuffling_decision_root = if head_epoch == proposal_epoch { - cached_head - .snapshot - .beacon_state - .proposer_shuffling_decision_root(proposer_head)? - } else { - proposer_head - }; - let cached_proposer = self - .beacon_proposer_cache - .lock() - .get_slot::(shuffling_decision_root, proposal_slot); - let proposer_index = if let Some(proposer) = cached_proposer { - proposer.index as u64 - } else { - if head_epoch + self.config.sync_tolerance_epochs < proposal_epoch { - warn!( - msg = "this is a non-critical issue that can happen on unhealthy nodes or \ - networks.", - %proposal_epoch, - %head_epoch, - "Skipping proposer preparation" - ); + let shuffling_decision_root = cached_head + .snapshot + .beacon_state + .proposer_shuffling_decision_root_at_epoch(proposal_epoch, proposer_head, &self.spec)?; - // Don't skip the head forward more than two epochs. This avoids burdening an - // unhealthy node. - // - // Although this node might miss out on preparing for a proposal, they should still - // be able to propose. This will prioritise beacon chain health over efficient - // packing of execution blocks. - return Ok(None); + let Some(proposer_index) = self.with_proposer_cache( + shuffling_decision_root, + proposal_epoch, + |proposers| proposers.get_slot::(proposal_slot).map(|p| p.index as u64), + || { + if head_epoch + self.config.sync_tolerance_epochs < proposal_epoch { + warn!( + msg = "this is a non-critical issue that can happen on unhealthy nodes or \ + networks", + %proposal_epoch, + %head_epoch, + "Skipping proposer preparation" + ); + + // Don't skip the head forward too many epochs. This avoids burdening an + // unhealthy node. + // + // Although this node might miss out on preparing for a proposal, they should + // still be able to propose. This will prioritise beacon chain health over + // efficient packing of execution blocks. 
+ Err(Error::SkipProposerPreparation) + } else { + let head = self.canonical_head.cached_head(); + Ok(( + head.head_state_root(), + head.snapshot.beacon_state.clone(), + )) + } + }, + ).map_or_else(|e| { + match e { + Error::ProposerCacheIncorrectState { .. } => { + warn!("Head changed during proposer preparation"); + Ok(None) + } + Error::SkipProposerPreparation => { + // Warning logged for this above. + Ok(None) + } + e => Err(e) } - - let (proposers, decision_root, _, fork) = - compute_proposer_duties_from_head(proposal_epoch, self)?; - - let proposer_offset = (proposal_slot % T::EthSpec::slots_per_epoch()).as_usize(); - let proposer = *proposers - .get(proposer_offset) - .ok_or(BeaconChainError::NoProposerForSlot(proposal_slot))?; - - self.beacon_proposer_cache.lock().insert( - proposal_epoch, - decision_root, - proposers, - fork, - )?; - - // It's possible that the head changes whilst computing these duties. If so, abandon - // this routine since the change of head would have also spawned another instance of - // this routine. - // - // Exit now, after updating the cache. - if decision_root != shuffling_decision_root { - warn!("Head changed during proposer preparation"); - return Ok(None); - } - - proposer as u64 + }, |value| Ok(Some(value)))? else { + return Ok(None); }; // Get the `prev_randao` and parent block number. @@ -4969,14 +4906,19 @@ impl BeaconChain { // Only attempt a re-org if we have a proposer registered for the re-org slot. let proposing_at_re_org_slot = { - // The proposer shuffling has the same decision root as the next epoch attestation - // shuffling. We know our re-org block is not on the epoch boundary, so it has the - // same proposer shuffling as the head (but not necessarily the parent which may lie - // in the previous epoch). 
- let shuffling_decision_root = info - .head_node - .next_epoch_shuffling_id - .shuffling_decision_block; + // We know our re-org block is not on the epoch boundary, so it has the same proposer + // shuffling as the head (but not necessarily the parent which may lie in the previous + // epoch). + let shuffling_decision_root = if self + .spec + .fork_name_at_slot::(re_org_block_slot) + .fulu_enabled() + { + info.head_node.current_epoch_shuffling_id + } else { + info.head_node.next_epoch_shuffling_id + } + .shuffling_decision_block; let proposer_index = self .beacon_proposer_cache .lock() @@ -5085,6 +5027,7 @@ impl BeaconChain { /// equal to the root of `state`. Providing this value will serve as an optimization to avoid /// performing a tree hash in some scenarios. #[allow(clippy::too_many_arguments)] + #[instrument(level = "debug", skip_all)] pub async fn produce_block_on_state( self: &Arc, state: BeaconState, @@ -5104,10 +5047,13 @@ impl BeaconChain { .graffiti_calculator .get_graffiti(validator_graffiti) .await; + let span = Span::current(); let mut partial_beacon_block = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "produce_partial_beacon_block").entered(); chain.produce_partial_beacon_block( state, state_root_opt, @@ -5143,10 +5089,14 @@ impl BeaconChain { match block_contents_type { BlockProposalContentsType::Full(block_contents) => { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "complete_partial_beacon_block") + .entered(); chain.complete_partial_beacon_block( partial_beacon_block, Some(block_contents), @@ -5163,10 +5113,14 @@ impl BeaconChain { } BlockProposalContentsType::Blinded(block_contents) => { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: 
span, "complete_partial_beacon_block") + .entered(); chain.complete_partial_beacon_block( partial_beacon_block, Some(block_contents), @@ -5184,10 +5138,13 @@ impl BeaconChain { } } else { let chain = self.clone(); + let span = Span::current(); let beacon_block_response = self .task_executor .spawn_blocking_handle( move || { + let _guard = + debug_span!(parent: span, "complete_partial_beacon_block").entered(); chain.complete_partial_beacon_block( partial_beacon_block, None, @@ -5276,64 +5233,71 @@ impl BeaconChain { None }; + let slashings_and_exits_span = debug_span!("get_slashings_and_exits").entered(); let (mut proposer_slashings, mut attester_slashings, mut voluntary_exits) = self.op_pool.get_slashings_and_exits(&state, &self.spec); + drop(slashings_and_exits_span); let eth1_data = state.eth1_data().clone(); let deposits = vec![]; + let bls_changes_span = debug_span!("get_bls_to_execution_changes").entered(); let bls_to_execution_changes = self .op_pool .get_bls_to_execution_changes(&state, &self.spec); + drop(bls_changes_span); // Iterate through the naive aggregation pool and ensure all the attestations from there // are included in the operation pool. - let unagg_import_timer = - metrics::start_timer(&metrics::BLOCK_PRODUCTION_UNAGGREGATED_TIMES); - for attestation in self.naive_aggregation_pool.read().iter() { - let import = |attestation: &Attestation| { - let attesting_indices = - get_attesting_indices_from_state(&state, attestation.to_ref())?; - self.op_pool - .insert_attestation(attestation.clone(), attesting_indices) - }; - if let Err(e) = import(attestation) { - // Don't stop block production if there's an error, just create a log. 
- error!( - reason = ?e, - "Attestation did not transfer to op pool" - ); + { + let _guard = debug_span!("import_naive_aggregation_pool").entered(); + let _unagg_import_timer = + metrics::start_timer(&metrics::BLOCK_PRODUCTION_UNAGGREGATED_TIMES); + for attestation in self.naive_aggregation_pool.read().iter() { + let import = |attestation: &Attestation| { + let attesting_indices = + get_attesting_indices_from_state(&state, attestation.to_ref())?; + self.op_pool + .insert_attestation(attestation.clone(), attesting_indices) + }; + if let Err(e) = import(attestation) { + // Don't stop block production if there's an error, just create a log. + error!( + reason = ?e, + "Attestation did not transfer to op pool" + ); + } } - } - drop(unagg_import_timer); - - let attestation_packing_timer = - metrics::start_timer(&metrics::BLOCK_PRODUCTION_ATTESTATION_TIMES); - - // Epoch cache and total balance cache are required for op pool packing. - state.build_total_active_balance_cache(&self.spec)?; - initialize_epoch_cache(&mut state, &self.spec)?; - - let mut prev_filter_cache = HashMap::new(); - let prev_attestation_filter = |att: &CompactAttestationRef| { - self.filter_op_pool_attestation(&mut prev_filter_cache, att, &state) - }; - let mut curr_filter_cache = HashMap::new(); - let curr_attestation_filter = |att: &CompactAttestationRef| { - self.filter_op_pool_attestation(&mut curr_filter_cache, att, &state) }; - let mut attestations = self - .op_pool - .get_attestations( - &state, - prev_attestation_filter, - curr_attestation_filter, - &self.spec, - ) - .map_err(BlockProductionError::OpPoolError)?; - drop(attestation_packing_timer); + let mut attestations = { + let _guard = debug_span!("pack_attestations").entered(); + let _attestation_packing_timer = + metrics::start_timer(&metrics::BLOCK_PRODUCTION_ATTESTATION_TIMES); + + // Epoch cache and total balance cache are required for op pool packing. 
+ state.build_total_active_balance_cache(&self.spec)?; + initialize_epoch_cache(&mut state, &self.spec)?; + + let mut prev_filter_cache = HashMap::new(); + let prev_attestation_filter = |att: &CompactAttestationRef| { + self.filter_op_pool_attestation(&mut prev_filter_cache, att, &state) + }; + let mut curr_filter_cache = HashMap::new(); + let curr_attestation_filter = |att: &CompactAttestationRef| { + self.filter_op_pool_attestation(&mut curr_filter_cache, att, &state) + }; + + self.op_pool + .get_attestations( + &state, + prev_attestation_filter, + curr_attestation_filter, + &self.spec, + ) + .map_err(BlockProductionError::OpPoolError)? + }; // If paranoid mode is enabled re-check the signatures of every included message. // This will be a lot slower but guards against bugs in block production and can be @@ -6552,6 +6516,70 @@ impl BeaconChain { } } + pub fn with_proposer_cache + From>( + &self, + shuffling_decision_block: Hash256, + proposal_epoch: Epoch, + accessor: impl Fn(&EpochBlockProposers) -> Result, + state_provider: impl FnOnce() -> Result<(Hash256, BeaconState), E>, + ) -> Result { + let cache_entry = self + .beacon_proposer_cache + .lock() + .get_or_insert_key(proposal_epoch, shuffling_decision_block); + + // If the cache entry is not initialised, run the code to initialise it inside a OnceCell. + // This prevents duplication of work across multiple threads. + // + // If it is already initialised, then `get_or_try_init` will return immediately without + // executing the initialisation code at all. + let epoch_block_proposers = cache_entry.get_or_try_init(|| { + debug!( + ?shuffling_decision_block, + %proposal_epoch, + "Proposer shuffling cache miss" + ); + + // Fetch the state on-demand if the required epoch was missing from the cache. + // If the caller wants to not compute the state they must return an error here and then + // catch it at the call site. 
+ let (state_root, mut state) = state_provider()?; + + // Ensure the state can compute proposer duties for `epoch`. + ensure_state_can_determine_proposers_for_epoch( + &mut state, + state_root, + proposal_epoch, + &self.spec, + )?; + + // Sanity check the state. + let latest_block_root = state.get_latest_block_root(state_root); + let state_decision_block_root = state.proposer_shuffling_decision_root_at_epoch( + proposal_epoch, + latest_block_root, + &self.spec, + )?; + if state_decision_block_root != shuffling_decision_block { + return Err(Error::ProposerCacheIncorrectState { + state_decision_block_root, + requested_decision_block_root: shuffling_decision_block, + } + .into()); + } + + let proposers = state.get_beacon_proposer_indices(proposal_epoch, &self.spec)?; + Ok::<_, E>(EpochBlockProposers::new( + proposal_epoch, + state.fork(), + proposers, + )) + })?; + + // Run the accessor function on the computed epoch proposers. + accessor(epoch_block_proposers).map_err(Into::into) + } + /// Runs the `map_fn` with the committee cache for `shuffling_epoch` from the chain with head /// `head_block_root`. 
The `map_fn` will be supplied two values: /// @@ -7091,15 +7119,10 @@ impl BeaconChain { ) } - pub fn metrics(&self) -> BeaconChainMetrics { - BeaconChainMetrics { - reqresp_pre_import_cache_len: self.reqresp_pre_import_cache.read().len(), - } - } - pub(crate) fn get_blobs_or_columns_store_op( &self, block_root: Hash256, + block_slot: Slot, block_data: AvailableBlockData, ) -> Result>, String> { match block_data { @@ -7112,7 +7135,15 @@ impl BeaconChain { ); Ok(Some(StoreOp::PutBlobs(block_root, blobs))) } - AvailableBlockData::DataColumns(data_columns) => { + AvailableBlockData::DataColumns(mut data_columns) => { + let columns_to_custody = self.custody_columns_for_epoch(Some( + block_slot.epoch(T::EthSpec::slots_per_epoch()), + )); + // Supernodes need to persist all sampled custody columns + if columns_to_custody.len() != self.spec.number_of_custody_groups as usize { + data_columns + .retain(|data_column| columns_to_custody.contains(&data_column.index)); + } debug!( %block_root, count = data_columns.len(), diff --git a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs index 12970214c6..a64b4981cc 100644 --- a/beacon_node/beacon_chain/src/beacon_proposer_cache.rs +++ b/beacon_node/beacon_chain/src/beacon_proposer_cache.rs @@ -12,9 +12,9 @@ use crate::{BeaconChain, BeaconChainError, BeaconChainTypes}; use fork_choice::ExecutionStatus; use lru::LruCache; use once_cell::sync::OnceCell; +use safe_arith::SafeArith; use smallvec::SmallVec; use state_processing::state_advance::partial_state_advance; -use std::cmp::Ordering; use std::num::NonZeroUsize; use std::sync::Arc; use types::non_zero_usize::new_non_zero_usize; @@ -51,6 +51,34 @@ pub struct EpochBlockProposers { pub(crate) proposers: SmallVec<[usize; TYPICAL_SLOTS_PER_EPOCH]>, } +impl EpochBlockProposers { + pub fn new(epoch: Epoch, fork: Fork, proposers: Vec) -> Self { + Self { + epoch, + fork, + proposers: proposers.into(), + } + } + + pub fn 
get_slot(&self, slot: Slot) -> Result { + let epoch = slot.epoch(E::slots_per_epoch()); + if epoch == self.epoch { + self.proposers + .get(slot.as_usize() % E::SlotsPerEpoch::to_usize()) + .map(|&index| Proposer { + index, + fork: self.fork, + }) + .ok_or(BeaconChainError::ProposerCacheOutOfBounds { slot, epoch }) + } else { + Err(BeaconChainError::ProposerCacheWrongEpoch { + request_epoch: epoch, + cache_epoch: self.epoch, + }) + } + } +} + /// A cache to store the proposers for some epoch. /// /// See the module-level documentation for more information. @@ -76,23 +104,8 @@ impl BeaconProposerCache { ) -> Option { let epoch = slot.epoch(E::slots_per_epoch()); let key = (epoch, shuffling_decision_block); - let cache_opt = self.cache.get(&key).and_then(|cell| cell.get()); - if let Some(cache) = cache_opt { - // This `if` statement is likely unnecessary, but it feels like good practice. - if epoch == cache.epoch { - cache - .proposers - .get(slot.as_usize() % E::SlotsPerEpoch::to_usize()) - .map(|&index| Proposer { - index, - fork: cache.fork, - }) - } else { - None - } - } else { - None - } + let cache = self.cache.get(&key)?.get()?; + cache.get_slot::(slot).ok() } /// As per `Self::get_slot`, but returns all proposers in all slots for the given `epoch`. @@ -142,11 +155,7 @@ impl BeaconProposerCache { ) -> Result<(), BeaconStateError> { let key = (epoch, shuffling_decision_block); if !self.cache.contains(&key) { - let epoch_proposers = EpochBlockProposers { - epoch, - fork, - proposers: proposers.into(), - }; + let epoch_proposers = EpochBlockProposers::new(epoch, fork, proposers); self.cache .put(key, Arc::new(OnceCell::with_value(epoch_proposers))); } @@ -178,7 +187,12 @@ pub fn compute_proposer_duties_from_head( .ok_or(BeaconChainError::HeadMissingFromForkChoice(head_block_root))?; // Advance the state into the requested epoch. 
- ensure_state_is_in_epoch(&mut state, head_state_root, request_epoch, &chain.spec)?; + ensure_state_can_determine_proposers_for_epoch( + &mut state, + head_state_root, + request_epoch, + &chain.spec, + )?; let indices = state .get_beacon_proposer_indices(request_epoch, &chain.spec) @@ -186,13 +200,13 @@ pub fn compute_proposer_duties_from_head( let dependent_root = state // The only block which decides its own shuffling is the genesis block. - .proposer_shuffling_decision_root(chain.genesis_block_root) + .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) .map_err(BeaconChainError::from)?; Ok((indices, dependent_root, execution_status, state.fork())) } -/// If required, advance `state` to `target_epoch`. +/// If required, advance `state` to the epoch required to determine proposer indices in `target_epoch`. /// /// ## Details /// @@ -200,22 +214,39 @@ pub fn compute_proposer_duties_from_head( /// - No-op if `state.current_epoch() == target_epoch`. /// - It must be the case that `state.canonical_root() == state_root`, but this function will not /// check that. -pub fn ensure_state_is_in_epoch( +pub fn ensure_state_can_determine_proposers_for_epoch( state: &mut BeaconState, state_root: Hash256, target_epoch: Epoch, spec: &ChainSpec, ) -> Result<(), BeaconChainError> { - match state.current_epoch().cmp(&target_epoch) { - // Protects against an inconsistent slot clock. - Ordering::Greater => Err(BeaconStateError::SlotOutOfBounds.into()), - // The state needs to be advanced. - Ordering::Less => { + // The decision slot is the end of an epoch, so we add 1 to reach the first slot of the epoch + // at which the shuffling is determined. + let minimum_slot = spec + .proposer_shuffling_decision_slot::(target_epoch) + .safe_add(1)?; + let minimum_epoch = minimum_slot.epoch(E::slots_per_epoch()); + + // Before and after Fulu, the oldest epoch reachable from a state at epoch N is epoch N itself, + // i.e. we can never "look back". 
+ let maximum_epoch = target_epoch; + + if state.current_epoch() > maximum_epoch { + Err(BeaconStateError::SlotOutOfBounds.into()) + } else if state.current_epoch() >= minimum_epoch { + if target_epoch > state.current_epoch() { let target_slot = target_epoch.start_slot(E::slots_per_epoch()); + + // Advance the state into the same epoch as the block. Use the "partial" method since state + // roots are not important for proposer/attester shuffling. partial_state_advance(state, Some(state_root), target_slot, spec) - .map_err(BeaconChainError::from) + .map_err(BeaconChainError::from)?; } - // The state is suitable, nothing to do. - Ordering::Equal => Ok(()), + Ok(()) + } else { + // State's current epoch is less than the minimum epoch. + // Advance the state up to the minimum epoch. + partial_state_advance(state, Some(state_root), minimum_slot, spec) + .map_err(BeaconChainError::from) } } diff --git a/beacon_node/beacon_chain/src/blob_verification.rs b/beacon_node/beacon_chain/src/blob_verification.rs index 2ba20d5a82..53f2eff0ca 100644 --- a/beacon_node/beacon_chain/src/blob_verification.rs +++ b/beacon_node/beacon_chain/src/blob_verification.rs @@ -5,8 +5,7 @@ use std::sync::Arc; use crate::beacon_chain::{BeaconChain, BeaconChainTypes}; use crate::block_verification::{ - BlockSlashInfo, cheap_state_advance_to_obtain_committees, get_validator_pubkey_cache, - process_block_slash_info, + BlockSlashInfo, get_validator_pubkey_cache, process_block_slash_info, }; use crate::kzg_utils::{validate_blob, validate_blobs}; use crate::observed_data_sidecars::{ObservationStrategy, Observe}; @@ -96,7 +95,7 @@ pub enum GossipBlobError { /// ## Peer scoring /// /// We cannot process the blob without validating its parent, the peer isn't necessarily faulty. 
- BlobParentUnknown { parent_root: Hash256 }, + ParentUnknown { parent_root: Hash256 }, /// Invalid kzg commitment inclusion proof /// ## Peer scoring @@ -474,7 +473,7 @@ pub fn validate_blob_sidecar_for_gossip(proposer_shuffling_root, blob_slot); - - let (proposer_index, fork) = if let Some(proposer) = proposer_opt { - (proposer.index, proposer.fork) - } else { - debug!( - %block_root, - %blob_index, - "Proposer shuffling cache miss for blob verification" - ); - let (parent_state_root, mut parent_state) = chain - .store - .get_advanced_hot_state(block_parent_root, blob_slot, parent_block.state_root) - .map_err(|e| GossipBlobError::BeaconChainError(Box::new(e.into())))? - .ok_or_else(|| { - BeaconChainError::DBInconsistent(format!( - "Missing state for parent block {block_parent_root:?}", - )) - })?; - - let state = cheap_state_advance_to_obtain_committees::<_, GossipBlobError>( - &mut parent_state, - Some(parent_state_root), - blob_slot, - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - let proposer_index = *proposers - .get(blob_slot.as_usize() % T::EthSpec::slots_per_epoch() as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(blob_slot))?; - - // Prime the proposer shuffling cache with the newly-learned value. - chain.beacon_proposer_cache.lock().insert( - blob_epoch, - proposer_shuffling_root, - proposers, - state.fork(), - )?; - (proposer_index, state.fork()) - }; + let proposer = chain.with_proposer_cache( + proposer_shuffling_root, + blob_epoch, + |proposers| proposers.get_slot::(blob_slot), + || { + debug!( + %block_root, + index = %blob_index, + "Proposer shuffling cache miss for blob verification" + ); + chain + .store + .get_advanced_hot_state(block_parent_root, blob_slot, parent_block.state_root) + .map_err(|e| GossipBlobError::BeaconChainError(Box::new(e.into())))? 
+ .ok_or_else(|| { + GossipBlobError::BeaconChainError(Box::new(BeaconChainError::DBInconsistent( + format!("Missing state for parent block {block_parent_root:?}",), + ))) + }) + }, + )?; + let proposer_index = proposer.index; + let fork = proposer.fork; // Signature verify the signed block header. let signature_is_valid = { diff --git a/beacon_node/beacon_chain/src/block_verification.rs b/beacon_node/beacon_chain/src/block_verification.rs index 1d6e050f7e..d0ed8258e5 100644 --- a/beacon_node/beacon_chain/src/block_verification.rs +++ b/beacon_node/beacon_chain/src/block_verification.rs @@ -948,61 +948,35 @@ impl GossipVerifiedBlock { } let proposer_shuffling_decision_block = - if parent_block.slot.epoch(T::EthSpec::slots_per_epoch()) == block_epoch { - parent_block - .next_epoch_shuffling_id - .shuffling_decision_block - } else { - parent_block.root - }; + parent_block.proposer_shuffling_root_for_child_block(block_epoch, &chain.spec); // We assign to a variable instead of using `if let Some` directly to ensure we drop the // write lock before trying to acquire it again in the `else` clause. - let proposer_opt = chain - .beacon_proposer_cache - .lock() - .get_slot::(proposer_shuffling_decision_block, block.slot()); - let (expected_proposer, fork, parent, block) = if let Some(proposer) = proposer_opt { - // The proposer index was cached and we can return it without needing to load the - // parent. - (proposer.index, proposer.fork, None, block) - } else { - // The proposer index was *not* cached and we must load the parent in order to determine - // the proposer index. - let (mut parent, block) = load_parent(block, chain)?; - - debug!( - parent_root = ?parent.beacon_block_root, - parent_slot = %parent.beacon_block.slot(), - ?block_root, - block_slot = %block.slot(), - "Proposer shuffling cache miss" - ); - - // The state produced is only valid for determining proposer/attester shuffling indices. 
- let state = cheap_state_advance_to_obtain_committees::<_, BlockError>( - &mut parent.pre_state, - parent.beacon_state_root, - block.slot(), - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - let proposer_index = *proposers - .get(block.slot().as_usize() % T::EthSpec::slots_per_epoch() as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(block.slot()))?; - - // Prime the proposer shuffling cache with the newly-learned value. - chain.beacon_proposer_cache.lock().insert( - block_epoch, - proposer_shuffling_decision_block, - proposers, - state.fork(), - )?; - - (proposer_index, state.fork(), Some(parent), block) - }; + let block_slot = block.slot(); + let mut opt_parent = None; + let proposer = chain.with_proposer_cache::<_, BlockError>( + proposer_shuffling_decision_block, + block_epoch, + |proposers| proposers.get_slot::(block_slot), + || { + // The proposer index was *not* cached and we must load the parent in order to + // determine the proposer index. + let (mut parent, _) = load_parent(block.clone(), chain)?; + let parent_state_root = if let Some(state_root) = parent.beacon_state_root { + state_root + } else { + // This is potentially a little inefficient, although we are likely to need + // the state's hash eventually (if the block is valid), and we are also likely + // to already have the hash cached (if fetched from the state cache). + parent.pre_state.canonical_root()? 
+ }; + let parent_state = parent.pre_state.clone(); + opt_parent = Some(parent); + Ok((parent_state_root, parent_state)) + }, + )?; + let expected_proposer = proposer.index; + let fork = proposer.fork; let signature_is_valid = { let pubkey_cache = get_validator_pubkey_cache(chain)?; @@ -1077,7 +1051,7 @@ impl GossipVerifiedBlock { Ok(Self { block, block_root, - parent, + parent: opt_parent, consensus_context, }) } @@ -2061,7 +2035,7 @@ impl BlockBlobError for GossipDataColumnError { /// and `Cow::Borrowed(state)` will be returned. Otherwise, the state will be cloned, cheaply /// advanced and then returned as a `Cow::Owned`. The end result is that the given `state` is never /// mutated to be invalid (in fact, it is never changed beyond a simple committee cache build). -#[instrument(skip(state, spec), level = "debug")] +#[instrument(skip_all, fields(?state_root_opt, %block_slot), level = "debug")] pub fn cheap_state_advance_to_obtain_committees<'a, E: EthSpec, Err: BlockBlobError>( state: &'a mut BeaconState, state_root_opt: Option, diff --git a/beacon_node/beacon_chain/src/builder.rs b/beacon_node/beacon_chain/src/builder.rs index 5e7aa7d4f8..5564c7916f 100644 --- a/beacon_node/beacon_chain/src/builder.rs +++ b/beacon_node/beacon_chain/src/builder.rs @@ -899,6 +899,7 @@ where let genesis_time = head_snapshot.beacon_state.genesis_time(); let canonical_head = CanonicalHead::new(fork_choice, Arc::new(head_snapshot)); let shuffling_cache_size = self.chain_config.shuffling_cache_size; + let complete_blob_backfill = self.chain_config.complete_blob_backfill; // Calculate the weak subjectivity point in which to backfill blocks to. 
let genesis_backfill_slot = if self.chain_config.genesis_backfill { @@ -997,7 +998,6 @@ where validator_pubkey_cache: RwLock::new(validator_pubkey_cache), attester_cache: <_>::default(), early_attester_cache: <_>::default(), - reqresp_pre_import_cache: <_>::default(), light_client_server_cache: LightClientServerCache::new(), light_client_server_tx: self.light_client_server_tx, shutdown_sender: self @@ -1013,6 +1013,7 @@ where genesis_backfill_slot, data_availability_checker: Arc::new( DataAvailabilityChecker::new( + complete_blob_backfill, slot_clock, self.kzg.clone(), store, diff --git a/beacon_node/beacon_chain/src/canonical_head.rs b/beacon_node/beacon_chain/src/canonical_head.rs index 56d1975972..7dd4c88c51 100644 --- a/beacon_node/beacon_chain/src/canonical_head.rs +++ b/beacon_node/beacon_chain/src/canonical_head.rs @@ -47,8 +47,9 @@ use fork_choice::{ ResetPayloadStatuses, }; use itertools::process_results; +use lighthouse_tracing::SPAN_RECOMPUTE_HEAD; use logging::crit; -use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard}; +use parking_lot::{Mutex, RwLock, RwLockReadGuard, RwLockUpgradableReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; use state_processing::AllCaches; use std::sync::Arc; @@ -57,6 +58,7 @@ use store::{ Error as StoreError, KeyValueStore, KeyValueStoreOp, StoreConfig, iter::StateRootsIterator, }; use task_executor::{JoinHandle, ShutdownReason}; +use tracing::info_span; use tracing::{debug, error, info, instrument, warn}; use types::*; @@ -79,6 +81,10 @@ impl CanonicalHeadRwLock { self.0.read() } + fn upgradable_read(&self) -> RwLockUpgradableReadGuard<'_, T> { + self.0.upgradable_read() + } + fn write(&self) -> RwLockWriteGuard<'_, T> { self.0.write() } @@ -379,6 +385,7 @@ impl CanonicalHead { /// /// This function is **not safe** to be public. See the module-level documentation for more /// information about protecting from deadlocks. 
+ #[instrument(skip_all)] fn cached_head_write_lock(&self) -> RwLockWriteGuard<'_, CachedHead> { self.cached_head.write() } @@ -389,7 +396,16 @@ impl CanonicalHead { self.fork_choice.read() } + /// Access an upgradable read-lock for fork choice. + pub fn fork_choice_upgradable_read_lock( + &self, + ) -> RwLockUpgradableReadGuard<'_, BeaconForkChoice> { + let _timer = metrics::start_timer(&metrics::FORK_CHOICE_UPGRADABLE_READ_LOCK_AQUIRE_TIMES); + self.fork_choice.upgradable_read() + } + /// Access a write-lock for fork choice. + #[instrument(skip_all)] pub fn fork_choice_write_lock(&self) -> RwLockWriteGuard<'_, BeaconForkChoice> { let _timer = metrics::start_timer(&metrics::FORK_CHOICE_WRITE_LOCK_AQUIRE_TIMES); self.fork_choice.write() @@ -497,13 +513,21 @@ impl BeaconChain { /// situation can be rectified. We avoid returning an error here so that calling functions /// can't abort block import because an error is returned here. pub async fn recompute_head_at_slot(self: &Arc, current_slot: Slot) { + let span = info_span!( + SPAN_RECOMPUTE_HEAD, + slot = %current_slot + ); + metrics::inc_counter(&metrics::FORK_CHOICE_REQUESTS); let _timer = metrics::start_timer(&metrics::FORK_CHOICE_TIMES); let chain = self.clone(); match self .spawn_blocking_handle( - move || chain.recompute_head_at_slot_internal(current_slot), + move || { + let _guard = span.enter(); + chain.recompute_head_at_slot_internal(current_slot) + }, "recompute_head_internal", ) .await @@ -761,6 +785,7 @@ impl BeaconChain { } /// Perform updates to caches and other components after the canonical head has been changed. 
+ #[instrument(skip_all)] fn after_new_head( self: &Arc, old_cached_head: &CachedHead, @@ -804,7 +829,7 @@ impl BeaconChain { let head_slot = new_snapshot.beacon_state.slot(); let dependent_root = new_snapshot .beacon_state - .proposer_shuffling_decision_root(self.genesis_block_root); + .attester_shuffling_decision_root(self.genesis_block_root, RelativeEpoch::Next); let prev_dependent_root = new_snapshot .beacon_state .attester_shuffling_decision_root(self.genesis_block_root, RelativeEpoch::Current); @@ -899,6 +924,7 @@ impl BeaconChain { /// /// This function will take a write-lock on `canonical_head.fork_choice`, therefore it would be /// unwise to hold any lock on fork choice while calling this function. + #[instrument(skip_all)] fn after_finalization( self: &Arc, new_cached_head: &CachedHead, @@ -911,13 +937,6 @@ impl BeaconChain { .execution_status .is_optimistic_or_invalid(); - self.op_pool.prune_all( - &new_snapshot.beacon_block, - &new_snapshot.beacon_state, - self.epoch()?, - &self.spec, - ); - self.observed_block_producers.write().prune( new_view .finalized_checkpoint @@ -956,9 +975,9 @@ impl BeaconChain { })); } - // The store migration task requires the *state at the slot of the finalized epoch*, - // rather than the state of the latest finalized block. These two values will only - // differ when the first slot of the finalized epoch is a skip slot. + // The store migration task and op pool pruning require the *state at the first slot of the + // finalized epoch*, rather than the state of the latest finalized block. These two values + // will only differ when the first slot of the finalized epoch is a skip slot. // // Use the `StateRootsIterator` directly rather than `BeaconChain::state_root_at_slot` // to ensure we use the same state that we just set as the head. @@ -980,6 +999,23 @@ impl BeaconChain { )? 
.ok_or(Error::MissingFinalizedStateRoot(new_finalized_slot))?; + let update_cache = true; + let new_finalized_state = self + .store + .get_hot_state(&new_finalized_state_root, update_cache)? + .ok_or(Error::MissingBeaconState(new_finalized_state_root))?; + + self.op_pool.prune_all( + &new_snapshot.beacon_block, + &new_snapshot.beacon_state, + &new_finalized_state, + self.epoch()?, + &self.spec, + ); + + // We just pass the state root to the finalization thread. It should be able to reload the + // state from the state_cache near instantly anyway. We could experiment with sending the + // state over a channel in future, but it's probably no quicker. self.store_migrator.process_finalization( new_finalized_state_root.into(), new_view.finalized_checkpoint, @@ -1034,6 +1070,7 @@ impl BeaconChain { /// /// This function is called whilst holding a write-lock on the `canonical_head`. To ensure dead-lock /// safety, **do not take any other locks inside this function**. +#[instrument(skip_all)] fn check_finalized_payload_validity( chain: &BeaconChain, finalized_proto_block: &ProtoBlock, @@ -1117,6 +1154,7 @@ fn perform_debug_logging( } } +#[instrument(skip_all)] fn spawn_execution_layer_updates( chain: Arc>, forkchoice_update_params: ForkchoiceUpdateParameters, diff --git a/beacon_node/beacon_chain/src/chain_config.rs b/beacon_node/beacon_chain/src/chain_config.rs index d6be96afe9..a7defa9fa2 100644 --- a/beacon_node/beacon_chain/src/chain_config.rs +++ b/beacon_node/beacon_chain/src/chain_config.rs @@ -86,6 +86,8 @@ pub struct ChainConfig { /// If using a weak-subjectivity sync, whether we should download blocks all the way back to /// genesis. pub genesis_backfill: bool, + /// EXPERIMENTAL: backfill blobs and data columns beyond the data availability window. + pub complete_blob_backfill: bool, /// Whether to send payload attributes every slot, regardless of connected proposers. /// /// This is useful for block builders and testing. 
@@ -144,6 +146,7 @@ impl Default for ChainConfig { optimistic_finalized_sync: true, shuffling_cache_size: crate::shuffling_cache::DEFAULT_CACHE_SIZE, genesis_backfill: false, + complete_blob_backfill: false, always_prepare_payload: false, epochs_per_migration: crate::migrate::DEFAULT_EPOCHS_PER_MIGRATION, enable_light_client_server: true, diff --git a/beacon_node/beacon_chain/src/data_availability_checker.rs b/beacon_node/beacon_chain/src/data_availability_checker.rs index 2ebf765a4e..43b7d8f7ea 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker.rs @@ -7,7 +7,9 @@ use crate::block_verification_types::{ use crate::data_availability_checker::overflow_lru_cache::{ DataAvailabilityCheckerInner, ReconstructColumnsDecision, }; -use crate::{BeaconChain, BeaconChainTypes, BeaconStore, CustodyContext, metrics}; +use crate::{ + BeaconChain, BeaconChainTypes, BeaconStore, BlockProcessStatus, CustodyContext, metrics, +}; use kzg::Kzg; use slot_clock::SlotClock; use std::fmt; @@ -19,14 +21,15 @@ use task_executor::TaskExecutor; use tracing::{debug, error, instrument}; use types::blob_sidecar::{BlobIdentifier, BlobSidecar, FixedBlobSidecarList}; use types::{ - BlobSidecarList, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, Hash256, - SignedBeaconBlock, Slot, + BlobSidecarList, BlockImportSource, ChainSpec, DataColumnSidecar, DataColumnSidecarList, Epoch, + EthSpec, Hash256, SignedBeaconBlock, Slot, }; mod error; mod overflow_lru_cache; mod state_lru_cache; +use crate::data_availability_checker::error::Error; use crate::data_column_verification::{ CustodyDataColumn, GossipVerifiedDataColumn, KzgVerifiedCustodyDataColumn, KzgVerifiedDataColumn, verify_kzg_for_data_column_list, @@ -38,19 +41,18 @@ use crate::observed_data_sidecars::ObservationStrategy; pub use error::{Error as AvailabilityCheckError, ErrorCategory as AvailabilityCheckErrorCategory}; use 
types::non_zero_usize::new_non_zero_usize; -/// The LRU Cache stores `PendingComponents`, which can store up to `MAX_BLOBS_PER_BLOCK` blobs each. +/// The LRU Cache stores `PendingComponents`, which store block and its associated blob data: /// /// * Deneb blobs are 128 kb each and are stored in the form of `BlobSidecar`. /// * From Fulu (PeerDAS), blobs are erasure-coded and are 256 kb each, stored in the form of 128 `DataColumnSidecar`s. /// /// With `MAX_BLOBS_PER_BLOCK` = 48 (expected in the next year), the maximum size of data columns -/// in `PendingComponents` is ~12.29 MB. Setting this to 64 means the maximum size of the cache is -/// approximately 0.8 GB. +/// in `PendingComponents` is ~12.29 MB. Setting this to 32 means the maximum size of the cache is +/// approximately 0.4 GB. /// -/// Under normal conditions, the cache should only store the current pending block, but could -/// occasionally spike to 2-4 for various reasons e.g. components arriving late, but would very -/// rarely go above this, unless there are many concurrent forks. -pub const OVERFLOW_LRU_CAPACITY: NonZeroUsize = new_non_zero_usize(64); +/// `PendingComponents` are now never removed from the cache manually and are only removed via LRU +eviction to prevent race conditions (#7961), so we expect this cache to be full all the time. +pub const OVERFLOW_LRU_CAPACITY: NonZeroUsize = new_non_zero_usize(32); pub const STATE_LRU_CAPACITY_NON_ZERO: NonZeroUsize = new_non_zero_usize(32); pub const STATE_LRU_CAPACITY: usize = STATE_LRU_CAPACITY_NON_ZERO.get(); @@ -79,6 +81,7 @@ pub const STATE_LRU_CAPACITY: usize = STATE_LRU_CAPACITY_NON_ZERO.get(); /// proposer. Having a capacity > 1 is an optimization to prevent sync lookup from having re-fetch /// data during moments of unstable network conditions. 
pub struct DataAvailabilityChecker { + complete_blob_backfill: bool, availability_cache: Arc>, slot_clock: T::SlotClock, kzg: Arc, @@ -117,6 +120,7 @@ impl Debug for Availability { impl DataAvailabilityChecker { pub fn new( + complete_blob_backfill: bool, slot_clock: T::SlotClock, kzg: Arc, store: BeaconStore, @@ -130,6 +134,7 @@ impl DataAvailabilityChecker { spec.clone(), )?; Ok(Self { + complete_blob_backfill, availability_cache: Arc::new(inner), slot_clock, kzg, @@ -142,14 +147,12 @@ impl DataAvailabilityChecker { &self.custody_context } - /// Checks if the block root is currenlty in the availability cache awaiting import because + /// Checks if the block root is currently in the availability cache awaiting import because /// of missing components. - pub fn get_execution_valid_block( - &self, - block_root: &Hash256, - ) -> Option>> { - self.availability_cache - .get_execution_valid_block(block_root) + /// + /// Returns the cache block wrapped in a `BlockProcessStatus` enum if it exists. + pub fn get_cached_block(&self, block_root: &Hash256) -> Option> { + self.availability_cache.get_cached_block(block_root) } /// Return the set of cached blob indexes for `block_root`. Returns None if there is no block @@ -338,17 +341,30 @@ impl DataAvailabilityChecker { /// Check if we have all the blobs for a block. Returns `Availability` which has information /// about whether all components have been received or more are required. - pub fn put_pending_executed_block( + pub fn put_executed_block( &self, executed_block: AvailabilityPendingExecutedBlock, ) -> Result, AvailabilityCheckError> { - self.availability_cache - .put_pending_executed_block(executed_block) + self.availability_cache.put_executed_block(executed_block) } - pub fn remove_pending_components(&self, block_root: Hash256) { + /// Inserts a pre-execution block into the cache. + /// This does NOT override an existing executed block. 
+ pub fn put_pre_execution_block( + &self, + block_root: Hash256, + block: Arc>, + source: BlockImportSource, + ) -> Result<(), Error> { self.availability_cache - .remove_pending_components(block_root) + .put_pre_execution_block(block_root, block, source) + } + + /// Removes a pre-execution block from the cache. + /// This does NOT remove an existing executed block. + pub fn remove_block_on_execution_error(&self, block_root: &Hash256) { + self.availability_cache + .remove_pre_execution_block(block_root); } /// Verifies kzg commitments for an RpcBlock, returns a `MaybeAvailableBlock` that may @@ -524,9 +540,15 @@ impl DataAvailabilityChecker { /// The epoch at which we require a data availability check in block processing. /// `None` if the `Deneb` fork is disabled. pub fn data_availability_boundary(&self) -> Option { - let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch()); - self.spec - .min_epoch_data_availability_boundary(current_epoch) + let fork_epoch = self.spec.deneb_fork_epoch?; + + if self.complete_blob_backfill { + Some(fork_epoch) + } else { + let current_epoch = self.slot_clock.now()?.epoch(T::EthSpec::slots_per_epoch()); + self.spec + .min_epoch_data_availability_boundary(current_epoch) + } } /// Returns true if the given epoch lies within the da boundary and false otherwise. @@ -553,6 +575,7 @@ impl DataAvailabilityChecker { } } + #[instrument(skip_all, level = "debug")] pub fn reconstruct_data_columns( &self, block_root: &Hash256, @@ -589,8 +612,8 @@ impl DataAvailabilityChecker { // Check indices from cache again to make sure we don't publish components we've already received. 
let Some(existing_column_indices) = self.cached_data_column_indexes(block_root) else { - return Ok(DataColumnReconstructionResult::RecoveredColumnsNotImported( - "block already imported", + return Err(AvailabilityCheckError::Unexpected( + "block no longer exists in the data availability checker".to_string(), )); }; @@ -1081,7 +1104,15 @@ mod test { let kzg = get_kzg(&spec); let store = Arc::new(HotColdDB::open_ephemeral(<_>::default(), spec.clone()).unwrap()); let custody_context = Arc::new(CustodyContext::new(false)); - DataAvailabilityChecker::new(slot_clock, kzg, store, custody_context, spec) - .expect("should initialise data availability checker") + let complete_blob_backfill = false; + DataAvailabilityChecker::new( + complete_blob_backfill, + slot_clock, + kzg, + store, + custody_context, + spec, + ) + .expect("should initialise data availability checker") } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs index 3c1b4e8b16..42f6dbd856 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/overflow_lru_cache.rs @@ -1,6 +1,5 @@ use super::AvailableBlockData; use super::state_lru_cache::{DietAvailabilityPendingExecutedBlock, StateLRUCache}; -use crate::BeaconChainTypes; use crate::CustodyContext; use crate::beacon_chain::BeaconStore; use crate::blob_verification::KzgVerifiedBlob; @@ -9,43 +8,91 @@ use crate::block_verification_types::{ }; use crate::data_availability_checker::{Availability, AvailabilityCheckError}; use crate::data_column_verification::KzgVerifiedCustodyDataColumn; +use crate::{BeaconChainTypes, BlockProcessStatus}; use lighthouse_tracing::SPAN_PENDING_COMPONENTS; use lru::LruCache; -use parking_lot::RwLock; +use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::cmp::Ordering; use 
std::num::NonZeroUsize; use std::sync::Arc; use tracing::{Span, debug, debug_span}; +use types::beacon_block_body::KzgCommitments; use types::blob_sidecar::BlobIdentifier; use types::{ - BlobSidecar, ChainSpec, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, Epoch, EthSpec, - Hash256, RuntimeFixedVector, RuntimeVariableList, SignedBeaconBlock, + BlobSidecar, BlockImportSource, ChainSpec, ColumnIndex, DataColumnSidecar, + DataColumnSidecarList, Epoch, EthSpec, Hash256, RuntimeFixedVector, RuntimeVariableList, + SignedBeaconBlock, }; +#[derive(Clone)] +pub enum CachedBlock { + PreExecution(Arc>, BlockImportSource), + Executed(Box>), +} + +impl CachedBlock { + pub fn get_commitments(&self) -> KzgCommitments { + let block = self.as_block(); + block + .message() + .body() + .blob_kzg_commitments() + .cloned() + .unwrap_or_default() + } + + fn as_block(&self) -> &SignedBeaconBlock { + match self { + CachedBlock::PreExecution(b, _) => b, + CachedBlock::Executed(b) => b.as_block(), + } + } + + pub fn num_blobs_expected(&self) -> usize { + self.as_block() + .message() + .body() + .blob_kzg_commitments() + .map_or(0, |commitments| commitments.len()) + } +} + /// This represents the components of a partially available block /// /// The blobs are all gossip and kzg verified. /// The block has completed all verifications except the availability check. +/// +/// There are currently three distinct hardfork eras that one should take note of: +/// - Pre-Deneb: No availability requirements (Block is immediately available) +/// - Post-Deneb, Pre-PeerDAS: Blobs are needed, but columns are not for the availability check +/// - Post-PeerDAS: Columns are needed, but blobs are not for the availability check +/// +/// Note: from this, one can immediately see that `verified_blobs` and `verified_data_columns` +/// are mutually exclusive. i.e. If we are verifying columns to determine a block's availability +/// we are ignoring the `verified_blobs` field. 
pub struct PendingComponents { pub block_root: Hash256, pub verified_blobs: RuntimeFixedVector>>, pub verified_data_columns: Vec>, - pub executed_block: Option>, + pub block: Option>, pub reconstruction_started: bool, span: Span, } impl PendingComponents { - /// Returns an immutable reference to the cached block. - pub fn get_cached_block(&self) -> &Option> { - &self.executed_block - } - /// Returns an immutable reference to the fixed vector of cached blobs. pub fn get_cached_blobs(&self) -> &RuntimeFixedVector>> { &self.verified_blobs } + #[cfg(test)] + fn get_diet_block(&self) -> Option<&DietAvailabilityPendingExecutedBlock> { + self.block.as_ref().and_then(|block| match block { + CachedBlock::Executed(block) => Some(block.as_ref()), + _ => None, + }) + } + /// Returns an immutable reference to the cached data column. pub fn get_cached_data_column( &self, @@ -57,11 +104,6 @@ impl PendingComponents { .map(|d| d.clone_arc()) } - /// Returns a mutable reference to the cached block. - pub fn get_cached_block_mut(&mut self) -> &mut Option> { - &mut self.executed_block - } - /// Returns a mutable reference to the fixed vector of cached blobs. pub fn get_cached_blobs_mut(&mut self) -> &mut RuntimeFixedVector>> { &mut self.verified_blobs @@ -87,20 +129,28 @@ impl PendingComponents { .collect() } - /// Inserts a block into the cache. - pub fn insert_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { - let _guard = self.span.clone().entered(); - debug!("Block added to pending components"); - *self.get_cached_block_mut() = Some(block) + /// Inserts an executed block into the cache. + pub fn insert_executed_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { + self.block = Some(CachedBlock::Executed(Box::new(block))) + } + + /// Inserts a pre-execution block into the cache. + /// This does NOT override an existing executed block. 
+ pub fn insert_pre_execution_block( + &mut self, + block: Arc>, + source: BlockImportSource, + ) { + if self.block.is_none() { + self.block = Some(CachedBlock::PreExecution(block, source)) + } } /// Inserts a blob at a specific index in the cache. /// /// Existing blob at the index will be replaced. pub fn insert_blob_at_index(&mut self, blob_index: usize, blob: KzgVerifiedBlob) { - let _guard = self.span.clone().entered(); if let Some(b) = self.get_cached_blobs_mut().get_mut(blob_index) { - debug!(blob_index, "Blob added to pending components"); *b = Some(blob); } } @@ -123,7 +173,7 @@ impl PendingComponents { /// 1. The blob entry at the index is empty and no block exists, or /// 2. The block exists and its commitment matches the blob's commitment. pub fn merge_single_blob(&mut self, index: usize, blob: KzgVerifiedBlob) { - if let Some(cached_block) = self.get_cached_block() { + if let Some(cached_block) = &self.block { let block_commitment_opt = cached_block.get_commitments().get(index).copied(); if let Some(block_commitment) = block_commitment_opt && block_commitment == *blob.get_commitment() @@ -140,13 +190,8 @@ impl PendingComponents { &mut self, kzg_verified_data_columns: I, ) -> Result<(), AvailabilityCheckError> { - let _guard = self.span.clone().entered(); for data_column in kzg_verified_data_columns { if self.get_cached_data_column(data_column.index()).is_none() { - debug!( - column_index = data_column.index(), - "Data column added to pending components" - ); self.verified_data_columns.push(data_column); } } @@ -158,7 +203,7 @@ impl PendingComponents { /// /// Blobs that don't match the new block's commitments are evicted. 
pub fn merge_block(&mut self, block: DietAvailabilityPendingExecutedBlock) { - self.insert_block(block); + self.insert_executed_block(block); let reinsert = self.get_cached_blobs_mut().take(); self.merge_blobs(reinsert); } @@ -169,9 +214,9 @@ impl PendingComponents { /// WARNING: This function can potentially take a lot of time if the state needs to be /// reconstructed from disk. Ensure you are not holding any write locks while calling this. pub fn make_available( - &mut self, + &self, spec: &Arc, - num_expected_columns: usize, + num_expected_columns_opt: Option, recover: R, ) -> Result>, AvailabilityCheckError> where @@ -180,7 +225,7 @@ impl PendingComponents { &Span, ) -> Result, AvailabilityCheckError>, { - let Some(block) = &self.executed_block else { + let Some(CachedBlock::Executed(block)) = &self.block else { // Block not available yet return Ok(None); }; @@ -188,7 +233,7 @@ impl PendingComponents { let num_expected_blobs = block.num_blobs_expected(); let blob_data = if num_expected_blobs == 0 { Some(AvailableBlockData::NoData) - } else if spec.is_peer_das_enabled_for_epoch(block.epoch()) { + } else if let Some(num_expected_columns) = num_expected_columns_opt { let num_received_columns = self.verified_data_columns.len(); match num_received_columns.cmp(&num_expected_columns) { Ordering::Greater => { @@ -267,7 +312,7 @@ impl PendingComponents { block, import_data, payload_verification_outcome, - } = recover(block.clone(), &self.span)?; + } = recover(*block.clone(), &self.span)?; let available_block = AvailableBlock { block_root: self.block_root, @@ -295,56 +340,47 @@ impl PendingComponents { block_root, verified_blobs: RuntimeFixedVector::new(vec![None; max_len]), verified_data_columns: vec![], - executed_block: None, + block: None, reconstruction_started: false, span, } } - /// Returns the epoch of the block if it is cached, otherwise returns the epoch of the first blob. 
+ /// Returns the epoch of: + /// - The block if it is cached + /// - The first available blob + /// - The first data column + /// Otherwise, returns None pub fn epoch(&self) -> Option { - self.executed_block - .as_ref() - .map(|pending_block| pending_block.as_block().epoch()) - .or_else(|| { - for maybe_blob in self.verified_blobs.iter() { - if maybe_blob.is_some() { - return maybe_blob.as_ref().map(|kzg_verified_blob| { - kzg_verified_blob - .as_blob() - .slot() - .epoch(E::slots_per_epoch()) - }); - } - } + // Get epoch from cached block + if let Some(block) = &self.block { + return Some(block.as_block().epoch()); + } - if let Some(kzg_verified_data_column) = self.verified_data_columns.first() { - let epoch = kzg_verified_data_column.as_data_column().epoch(); - return Some(epoch); - } + // Or, get epoch from first available blob + if let Some(blob) = self.verified_blobs.iter().flatten().next() { + return Some(blob.as_blob().slot().epoch(E::slots_per_epoch())); + } - None - }) + // Or, get epoch from first data column + if let Some(data_column) = self.verified_data_columns.first() { + return Some(data_column.as_data_column().epoch()); + } + + None } - pub fn status_str( - &self, - block_epoch: Epoch, - num_expected_columns: Option, - spec: &ChainSpec, - ) -> String { - let block_count = if self.executed_block.is_some() { 1 } else { 0 }; - if spec.is_peer_das_enabled_for_epoch(block_epoch) { + pub fn status_str(&self, num_expected_columns_opt: Option) -> String { + let block_count = if self.block.is_some() { 1 } else { 0 }; + if let Some(num_expected_columns) = num_expected_columns_opt { format!( "block {} data_columns {}/{}", block_count, self.verified_data_columns.len(), num_expected_columns - .map(|c| c.to_string()) - .unwrap_or("?".into()) ) } else { - let num_expected_blobs = if let Some(block) = self.get_cached_block() { + let num_expected_blobs = if let Some(block) = &self.block { &block.num_blobs_expected().to_string() } else { "?" 
@@ -396,18 +432,19 @@ impl DataAvailabilityCheckerInner { } /// Returns true if the block root is known, without altering the LRU ordering - pub fn get_execution_valid_block( - &self, - block_root: &Hash256, - ) -> Option>> { + pub fn get_cached_block(&self, block_root: &Hash256) -> Option> { self.critical .read() .peek(block_root) .and_then(|pending_components| { - pending_components - .executed_block - .as_ref() - .map(|block| block.block_cloned()) + pending_components.block.as_ref().map(|block| match block { + CachedBlock::PreExecution(b, source) => { + BlockProcessStatus::NotValidated(b.clone(), *source) + } + CachedBlock::Executed(b) => { + BlockProcessStatus::ExecutionValidated(b.block_cloned()) + } + }) }) } @@ -477,41 +514,21 @@ impl DataAvailabilityCheckerInner { *blob_opt = Some(blob); } } + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_blobs(fixed_blobs); + Ok(()) + })?; - let mut write_lock = self.critical.write(); + pending_components.span.in_scope(|| { + debug!( + component = "blobs", + status = pending_components.status_str(None), + "Component added to data availability checker" + ); + }); - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); - - // Merge in the blobs. - pending_components.merge_blobs(fixed_blobs); - - debug!( - component = "blobs", - ?block_root, - status = pending_components.status_str(epoch, None, &self.spec), - "Component added to data availability checker" - ); - - if let Some(available_block) = pending_components.make_available( - &self.spec, - self.custody_context - .num_of_data_columns_to_sample(epoch, &self.spec), - |block, span| self.state_cache.recover_pending_executed_block(block, span), - )? 
{ - // We keep the pending components in the availability cache during block import (#5845). - write_lock.put(block_root, pending_components); - drop(write_lock); - Ok(Availability::Available(Box::new(available_block))) - } else { - write_lock.put(block_root, pending_components); - Ok(Availability::MissingComponents(block_root)) - } + self.check_availability_and_cache_components(block_root, pending_components, None) } #[allow(clippy::type_complexity)] @@ -534,49 +551,96 @@ impl DataAvailabilityCheckerInner { return Ok(Availability::MissingComponents(block_root)); }; - let mut write_lock = self.critical.write(); - - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); - - // Merge in the data columns. - pending_components.merge_data_columns(kzg_verified_data_columns)?; + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_data_columns(kzg_verified_data_columns) + })?; let num_expected_columns = self .custody_context .num_of_data_columns_to_sample(epoch, &self.spec); - debug!( - component = "data_columns", - ?block_root, - status = pending_components.status_str(epoch, Some(num_expected_columns), &self.spec), - "Component added to data availability checker" - ); - if let Some(available_block) = - pending_components.make_available(&self.spec, num_expected_columns, |block, span| { - self.state_cache.recover_pending_executed_block(block, span) - })? - { - // We keep the pending components in the availability cache during block import (#5845). 
- write_lock.put(block_root, pending_components); - drop(write_lock); + pending_components.span.in_scope(|| { + debug!( + component = "data_columns", + status = pending_components.status_str(Some(num_expected_columns)), + "Component added to data availability checker" + ); + }); + + self.check_availability_and_cache_components( + block_root, + pending_components, + Some(num_expected_columns), + ) + } + + fn check_availability_and_cache_components( + &self, + block_root: Hash256, + pending_components: MappedRwLockReadGuard<'_, PendingComponents>, + num_expected_columns_opt: Option, + ) -> Result, AvailabilityCheckError> { + if let Some(available_block) = pending_components.make_available( + &self.spec, + num_expected_columns_opt, + |block, span| self.state_cache.recover_pending_executed_block(block, span), + )? { + // Explicitly drop read lock before acquiring write lock + drop(pending_components); + if let Some(components) = self.critical.write().get_mut(&block_root) { + // Clean up span now that block is available + components.span = Span::none(); + } + + // We never remove the pending components manually to avoid race conditions. + // This ensures components remain available during and right after block import, + // preventing a race condition where a component was removed after the block was + // imported, but re-inserted immediately, causing partial pending components to be + // stored and served to peers. + // Components are only removed via LRU eviction as finality advances. Ok(Availability::Available(Box::new(available_block))) } else { - write_lock.put(block_root, pending_components); Ok(Availability::MissingComponents(block_root)) } } + /// Updates or inserts a new `PendingComponents` if it doesn't exist, and then apply the + /// `update_fn` while holding the write lock. + /// + /// Once the update is complete, the write lock is downgraded and a read guard with a + /// reference of the updated `PendingComponents` is returned. 
+ fn update_or_insert_pending_components( + &self, + block_root: Hash256, + epoch: Epoch, + update_fn: F, + ) -> Result>, AvailabilityCheckError> + where + F: FnOnce(&mut PendingComponents) -> Result<(), AvailabilityCheckError>, + { + let mut write_lock = self.critical.write(); + + { + let pending_components = write_lock.get_or_insert_mut(block_root, || { + PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) + }); + update_fn(pending_components)? + } + + RwLockReadGuard::try_map(RwLockWriteGuard::downgrade(write_lock), |cache| { + cache.peek(&block_root) + }) + .map_err(|_| { + AvailabilityCheckError::Unexpected("pending components should exist".to_string()) + }) + } + /// Check whether data column reconstruction should be attempted. /// - /// Potentially trigger reconstruction if: - /// - Our custody requirement is all columns (supernode), and we haven't got all columns - /// - We have >= 50% of columns, but not all columns + /// Potentially trigger reconstruction if all the following satisfy: + /// - Our custody requirement is more than 50% of total columns, + /// - We haven't received all required columns /// - Reconstruction hasn't been started for the block /// /// If reconstruction is required, returns `PendingComponents` which contains the @@ -591,15 +655,25 @@ impl DataAvailabilityCheckerInner { return ReconstructColumnsDecision::No("block already imported"); }; - // If we're sampling all columns, it means we must be custodying all columns. 
+ let Some(epoch) = pending_components + .verified_data_columns + .first() + .map(|c| c.as_data_column().epoch()) + else { + return ReconstructColumnsDecision::No("not enough columns"); + }; + let total_column_count = T::EthSpec::number_of_columns(); + let sampling_column_count = self + .custody_context + .num_of_data_columns_to_sample(epoch, &self.spec); let received_column_count = pending_components.verified_data_columns.len(); if pending_components.reconstruction_started { return ReconstructColumnsDecision::No("already started"); } - if received_column_count >= total_column_count { - return ReconstructColumnsDecision::No("all columns received"); + if received_column_count >= sampling_column_count { + return ReconstructColumnsDecision::No("all sampling columns received"); } if received_column_count < total_column_count / 2 { return ReconstructColumnsDecision::No("not enough columns"); @@ -619,13 +693,50 @@ impl DataAvailabilityCheckerInner { } } + /// Inserts a pre executed block into the cache. + /// - This does NOT trigger the availability check as the block still needs to be executed. + /// - This does NOT override an existing cached block to avoid overwriting an executed block. + pub fn put_pre_execution_block( + &self, + block_root: Hash256, + block: Arc>, + source: BlockImportSource, + ) -> Result<(), AvailabilityCheckError> { + let epoch = block.epoch(); + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.insert_pre_execution_block(block, source); + Ok(()) + })?; + + let num_expected_columns_opt = self.get_num_expected_columns(epoch); + + pending_components.span.in_scope(|| { + debug!( + component = "pre execution block", + status = pending_components.status_str(num_expected_columns_opt), + "Component added to data availability checker" + ); + }); + + Ok(()) + } + + /// Removes a pre-execution block from the cache. + /// This does NOT remove an existing executed block. 
+ pub fn remove_pre_execution_block(&self, block_root: &Hash256) { + // The read lock is immediately dropped so we can safely remove the block from the cache. + if let Some(BlockProcessStatus::NotValidated(_, _)) = self.get_cached_block(block_root) { + self.critical.write().pop(block_root); + } + } + /// Check if we have all the blobs for a block. If we do, return the Availability variant that /// triggers import of the block. - pub fn put_pending_executed_block( + pub fn put_executed_block( &self, executed_block: AvailabilityPendingExecutedBlock, ) -> Result, AvailabilityCheckError> { - let mut write_lock = self.critical.write(); let epoch = executed_block.as_block().epoch(); let block_root = executed_block.import_data.block_root; @@ -634,45 +745,38 @@ impl DataAvailabilityCheckerInner { .state_cache .register_pending_executed_block(executed_block); - // Grab existing entry or create a new entry. - let mut pending_components = write_lock - .pop_entry(&block_root) - .map(|(_, v)| v) - .unwrap_or_else(|| { - PendingComponents::empty(block_root, self.spec.max_blobs_per_block(epoch) as usize) - }); + let pending_components = + self.update_or_insert_pending_components(block_root, epoch, |pending_components| { + pending_components.merge_block(diet_executed_block); + Ok(()) + })?; - // Merge in the block. 
- pending_components.merge_block(diet_executed_block); + let num_expected_columns_opt = self.get_num_expected_columns(epoch); - let num_expected_columns = self - .custody_context - .num_of_data_columns_to_sample(epoch, &self.spec); - debug!( - component = "block", - ?block_root, - status = pending_components.status_str(epoch, Some(num_expected_columns), &self.spec), - "Component added to data availability checker" - ); + pending_components.span.in_scope(|| { + debug!( + component = "block", + status = pending_components.status_str(num_expected_columns_opt), + "Component added to data availability checker" + ); + }); - // Check if we have all components and entire set is consistent. - if let Some(available_block) = - pending_components.make_available(&self.spec, num_expected_columns, |block, span| { - self.state_cache.recover_pending_executed_block(block, span) - })? - { - // We keep the pending components in the availability cache during block import (#5845). - write_lock.put(block_root, pending_components); - drop(write_lock); - Ok(Availability::Available(Box::new(available_block))) - } else { - write_lock.put(block_root, pending_components); - Ok(Availability::MissingComponents(block_root)) - } + self.check_availability_and_cache_components( + block_root, + pending_components, + num_expected_columns_opt, + ) } - pub fn remove_pending_components(&self, block_root: Hash256) { - self.critical.write().pop_entry(&block_root); + fn get_num_expected_columns(&self, epoch: Epoch) -> Option { + if self.spec.is_peer_das_enabled_for_epoch(epoch) { + let num_of_column_samples = self + .custody_context + .num_of_data_columns_to_sample(epoch, &self.spec); + Some(num_of_column_samples) + } else { + None + } } /// maintain the cache @@ -948,7 +1052,7 @@ mod test { ); assert!(cache.critical.read().is_empty(), "cache should be empty"); let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); if 
blobs_expected == 0 { assert!( @@ -960,13 +1064,6 @@ mod test { 1, "cache should still have block as it hasn't been imported yet" ); - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert_eq!( - cache.critical.read().len(), - 0, - "cache should be empty now that block has been imported" - ); } else { assert!( matches!(availability, Availability::MissingComponents(_)), @@ -996,12 +1093,6 @@ mod test { assert_eq!(cache.critical.read().len(), 1); } } - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert!( - cache.critical.read().is_empty(), - "cache should be empty now that all components available" - ); let (pending_block, blobs) = availability_pending_block(&harness).await; let blobs_expected = pending_block.num_blobs_expected(); @@ -1021,10 +1112,14 @@ mod test { matches!(availability, Availability::MissingComponents(_)), "should be pending block" ); - assert_eq!(cache.critical.read().len(), 1); + assert_eq!( + cache.critical.read().len(), + 2, + "cache should have two blocks now" + ); } let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); assert!( matches!(availability, Availability::Available(_)), @@ -1032,14 +1127,8 @@ mod test { availability ); assert!( - cache.critical.read().len() == 1, - "cache should still have available block until import" - ); - // remove the blob to simulate successful import - cache.remove_pending_components(root); - assert!( - cache.critical.read().is_empty(), - "cache should be empty now that all components available" + cache.critical.read().len() == 2, + "cache should still have available block" ); } @@ -1092,7 +1181,7 @@ mod test { // put the block in the cache let availability = cache - .put_pending_executed_block(pending_block) + .put_executed_block(pending_block) .expect("should put block"); // grab the diet block from the cache for later testing @@ 
-1100,12 +1189,7 @@ mod test { .critical .read() .peek(&block_root) - .map(|pending_components| { - pending_components - .executed_block - .clone() - .expect("should exist") - }) + .and_then(|pending_components| pending_components.get_diet_block().cloned()) .expect("should exist"); pushed_diet_blocks.push_back(diet_block); @@ -1161,14 +1245,6 @@ mod test { states.last(), "recovered state should be the same as the original" ); - // the state should no longer be in the cache - assert!( - state_cache - .read() - .peek(&last_block.as_block().state_root()) - .is_none(), - "last block state should no longer be in cache" - ); } } @@ -1274,7 +1350,7 @@ mod pending_components_tests { } pub fn assert_cache_consistent(cache: PendingComponents, max_len: usize) { - if let Some(cached_block) = cache.get_cached_block() { + if let Some(cached_block) = &cache.block { let cached_block_commitments = cached_block.get_commitments(); for index in 0..max_len { let block_commitment = cached_block_commitments.get(index).copied(); @@ -1380,4 +1456,38 @@ mod pending_components_tests { assert_cache_consistent(cache, max_len); } + + #[test] + fn should_not_insert_pre_execution_block_if_executed_block_exists() { + let (pre_execution_block, blobs, random_blobs, max_len) = pre_setup(); + let (executed_block, _blobs, _random_blobs) = + setup_pending_components(pre_execution_block.clone(), blobs, random_blobs); + + let block_root = pre_execution_block.canonical_root(); + let mut pending_component = >::empty(block_root, max_len); + + let pre_execution_block = Arc::new(pre_execution_block); + pending_component + .insert_pre_execution_block(pre_execution_block.clone(), BlockImportSource::Gossip); + assert!( + matches!( + pending_component.block, + Some(CachedBlock::PreExecution(_, _)) + ), + "pre execution block inserted" + ); + + pending_component.insert_executed_block(executed_block); + assert!( + matches!(pending_component.block, Some(CachedBlock::Executed(_))), + "executed block inserted" + ); + + 
pending_component + .insert_pre_execution_block(pre_execution_block, BlockImportSource::Gossip); + assert!( + matches!(pending_component.block, Some(CachedBlock::Executed(_))), + "executed block should remain" + ); + } } diff --git a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs index e328bd9b9c..24f9237e3c 100644 --- a/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs +++ b/beacon_node/beacon_chain/src/data_availability_checker/state_lru_cache.rs @@ -10,7 +10,6 @@ use state_processing::BlockReplayer; use std::sync::Arc; use store::OnDiskConsensusContext; use tracing::{Span, debug_span, instrument}; -use types::beacon_block_body::KzgCommitments; use types::{BeaconState, BlindedPayload, ChainSpec, Epoch, EthSpec, Hash256, SignedBeaconBlock}; /// This mirrors everything in the `AvailabilityPendingExecutedBlock`, except @@ -43,15 +42,6 @@ impl DietAvailabilityPendingExecutedBlock { .map_or(0, |commitments| commitments.len()) } - pub fn get_commitments(&self) -> KzgCommitments { - self.as_block() - .message() - .body() - .blob_kzg_commitments() - .cloned() - .unwrap_or_default() - } - /// Returns the epoch corresponding to `self.slot()`. pub fn epoch(&self) -> Epoch { self.block.slot().epoch(E::slots_per_epoch()) @@ -113,8 +103,9 @@ impl StateLRUCache { diet_executed_block: DietAvailabilityPendingExecutedBlock, _span: &Span, ) -> Result, AvailabilityCheckError> { - let state = if let Some(state) = self.states.write().pop(&diet_executed_block.state_root) { - state + // Keep the state in the cache to prevent reconstruction in race conditions + let state = if let Some(state) = self.states.write().get(&diet_executed_block.state_root) { + state.clone() } else { self.reconstruct_state(&diet_executed_block)? 
}; diff --git a/beacon_node/beacon_chain/src/data_column_verification.rs b/beacon_node/beacon_chain/src/data_column_verification.rs index fb88db1300..01e79c49aa 100644 --- a/beacon_node/beacon_chain/src/data_column_verification.rs +++ b/beacon_node/beacon_chain/src/data_column_verification.rs @@ -1,7 +1,5 @@ -use crate::beacon_proposer_cache::EpochBlockProposers; use crate::block_verification::{ - BlockSlashInfo, cheap_state_advance_to_obtain_committees, get_validator_pubkey_cache, - process_block_slash_info, + BlockSlashInfo, get_validator_pubkey_cache, process_block_slash_info, }; use crate::kzg_utils::{reconstruct_data_columns, validate_data_columns}; use crate::observed_data_sidecars::{ObservationStrategy, Observe}; @@ -163,6 +161,15 @@ pub enum GossipDataColumnError { /// /// The column sidecar is invalid and the peer is faulty InconsistentProofsLength { cells_len: usize, proofs_len: usize }, + /// The number of KZG commitments exceeds the maximum number of blobs allowed for the fork. The + /// sidecar is invalid. + /// + /// ## Peer scoring + /// The column sidecar is invalid and the peer is faulty + MaxBlobsPerBlockExceeded { + max_blobs_per_block: usize, + commitments_len: usize, + }, } impl From for GossipDataColumnError { @@ -215,6 +222,40 @@ impl GossipVerifiedDataColumn ) } + /// Create a `GossipVerifiedDataColumn` from `DataColumnSidecar` for block production ONLY. + /// When publishing a block constructed locally, the EL will have already verified the cell proofs. + /// When publishing a block constructed externally, there will be no columns here. + pub fn new_for_block_publishing( + column_sidecar: Arc>, + chain: &BeaconChain, + ) -> Result { + verify_data_column_sidecar(&column_sidecar, &chain.spec)?; + + // Check if the data column is already in the DA checker cache. This happens when data columns + // are made available through the `engine_getBlobs` method. 
If it exists in the cache, we know + // it has already passed the gossip checks, even though this particular instance hasn't been + // seen / published on the gossip network yet (passed the `verify_is_unknown_sidecar` check above). + // In this case, we should accept it for gossip propagation. + verify_is_unknown_sidecar(chain, &column_sidecar)?; + + if chain + .data_availability_checker + .is_data_column_cached(&column_sidecar.block_root(), &column_sidecar) + { + // Observe this data column so we don't process it again. + if O::observe() { + observe_gossip_data_column(&column_sidecar, chain)?; + } + return Err(GossipDataColumnError::PriorKnownUnpublished); + } + + Ok(Self { + block_root: column_sidecar.block_root(), + data_column: KzgVerifiedDataColumn::from_execution_verified(column_sidecar), + _phantom: Default::default(), + }) + } + /// Create a `GossipVerifiedDataColumn` from `DataColumnSidecar` for testing ONLY. pub fn __new_for_testing(column_sidecar: Arc>) -> Self { Self { @@ -376,7 +417,7 @@ impl KzgVerifiedCustodyDataColumn { ) -> Result>, KzgError> { let all_data_columns = reconstruct_data_columns( kzg, - &partial_set_of_columns + partial_set_of_columns .iter() .map(|d| d.clone_arc()) .collect::>(), @@ -443,16 +484,16 @@ pub fn validate_data_column_sidecar_for_gossip, ) -> Result, GossipDataColumnError> { let column_slot = data_column.slot(); - verify_data_column_sidecar(&data_column)?; + verify_data_column_sidecar(&data_column, &chain.spec)?; verify_index_matches_subnet(&data_column, subnet, &chain.spec)?; verify_sidecar_not_from_future_slot(chain, column_slot)?; verify_slot_greater_than_latest_finalized_slot(chain, column_slot)?; - verify_is_first_sidecar(chain, &data_column)?; + verify_is_unknown_sidecar(chain, &data_column)?; // Check if the data column is already in the DA checker cache. This happens when data columns // are made available through the `engine_getBlobs` method. 
If it exists in the cache, we know // it has already passed the gossip checks, even though this particular instance hasn't been - // seen / published on the gossip network yet (passed the `verify_is_first_sidecar` check above). + // seen / published on the gossip network yet (passed the `verify_is_unknown_sidecar` check above). // In this case, we should accept it for gossip propagation. if chain .data_availability_checker @@ -497,6 +538,7 @@ pub fn validate_data_column_sidecar_for_gossip( data_column: &DataColumnSidecar, + spec: &ChainSpec, ) -> Result<(), GossipDataColumnError> { if data_column.index >= E::number_of_columns() as u64 { return Err(GossipDataColumnError::InvalidColumnIndex(data_column.index)); @@ -508,6 +550,14 @@ fn verify_data_column_sidecar( let cells_len = data_column.column.len(); let commitments_len = data_column.kzg_commitments.len(); let proofs_len = data_column.kzg_proofs.len(); + let max_blobs_per_block = spec.max_blobs_per_block(data_column.epoch()) as usize; + + if commitments_len > max_blobs_per_block { + return Err(GossipDataColumnError::MaxBlobsPerBlockExceeded { + max_blobs_per_block, + commitments_len, + }); + } if cells_len != commitments_len { return Err(GossipDataColumnError::InconsistentCommitmentsLength { @@ -526,22 +576,22 @@ fn verify_data_column_sidecar( Ok(()) } -// Verify that this is the first column sidecar received for the tuple: -// (block_header.slot, block_header.proposer_index, column_sidecar.index) -fn verify_is_first_sidecar( +/// Verify that `column_sidecar` is not yet known, i.e. 
this is the first time `column_sidecar` has been received for the tuple: +/// `(block_header.slot, block_header.proposer_index, column_sidecar.index)` +fn verify_is_unknown_sidecar( chain: &BeaconChain, - data_column: &DataColumnSidecar, + column_sidecar: &DataColumnSidecar, ) -> Result<(), GossipDataColumnError> { if chain .observed_column_sidecars .read() - .proposer_is_known(data_column) + .proposer_is_known(column_sidecar) .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? { return Err(GossipDataColumnError::PriorKnown { - proposer: data_column.block_proposer_index(), - slot: data_column.slot(), - index: data_column.index, + proposer: column_sidecar.block_proposer_index(), + slot: column_sidecar.slot(), + index: column_sidecar.index, }); } Ok(()) @@ -576,22 +626,21 @@ fn verify_parent_block_and_finalized_descendant( chain: &BeaconChain, ) -> Result { let fork_choice = chain.canonical_head.fork_choice_read_lock(); + let block_parent_root = data_column.block_parent_root(); + + // Do not process a column that does not descend from the finalized root. + if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { + return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); + } // We have already verified that the column is past finalization, so we can // just check fork choice for the block's parent. - let block_parent_root = data_column.block_parent_root(); let Some(parent_block) = fork_choice.get_block(&block_parent_root) else { return Err(GossipDataColumnError::ParentUnknown { parent_root: block_parent_root, }); }; - // Do not process a column that does not descend from the finalized root. - // We just loaded the parent_block, so we can be sure that it exists in fork choice. 
- if !fork_choice.is_finalized_checkpoint_or_descendant(block_parent_root) { - return Err(GossipDataColumnError::NotFinalizedDescendant { block_parent_root }); - } - Ok(parent_block) } @@ -607,65 +656,34 @@ fn verify_proposer_and_signature( let block_root = data_column.block_root(); let block_parent_root = data_column.block_parent_root(); - let proposer_shuffling_root = if parent_block.slot.epoch(slots_per_epoch) == column_epoch { - parent_block - .next_epoch_shuffling_id - .shuffling_decision_block - } else { - parent_block.root - }; + let proposer_shuffling_root = + parent_block.proposer_shuffling_root_for_child_block(column_epoch, &chain.spec); - // We lock the cache briefly to get or insert a OnceCell, then drop the lock - // before doing proposer shuffling calculation via `OnceCell::get_or_try_init`. This avoids - // holding the lock during the computation, while still ensuring the result is cached and - // initialised only once. - // - // This approach exposes the cache internals (`OnceCell` & `EpochBlockProposers`) - // as a trade-off for avoiding lock contention. - let epoch_proposers_cell = chain - .beacon_proposer_cache - .lock() - .get_or_insert_key(column_epoch, proposer_shuffling_root); - - let epoch_proposers = epoch_proposers_cell.get_or_try_init(move || { - debug!( - %block_root, - index = %column_index, - "Proposer shuffling cache miss for column verification" - ); - let (parent_state_root, mut parent_state) = chain - .store - .get_advanced_hot_state(block_parent_root, column_slot, parent_block.state_root) - .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? 
- .ok_or_else(|| { - BeaconChainError::DBInconsistent(format!( - "Missing state for parent block {block_parent_root:?}", - )) - })?; - - let state = cheap_state_advance_to_obtain_committees::<_, GossipDataColumnError>( - &mut parent_state, - Some(parent_state_root), - column_slot, - &chain.spec, - )?; - - let epoch = state.current_epoch(); - let proposers = state.get_beacon_proposer_indices(epoch, &chain.spec)?; - // Prime the proposer shuffling cache with the newly-learned value. - Ok::<_, GossipDataColumnError>(EpochBlockProposers { - epoch: column_epoch, - fork: state.fork(), - proposers: proposers.into(), - }) - })?; - - let proposer_index = *epoch_proposers - .proposers - .get(column_slot.as_usize() % slots_per_epoch as usize) - .ok_or_else(|| BeaconChainError::NoProposerForSlot(column_slot))?; - - let fork = epoch_proposers.fork; + let proposer = chain.with_proposer_cache( + proposer_shuffling_root, + column_epoch, + |proposers| proposers.get_slot::(column_slot), + || { + debug!( + %block_root, + index = %column_index, + "Proposer shuffling cache miss for column verification" + ); + chain + .store + .get_advanced_hot_state(block_parent_root, column_slot, parent_block.state_root) + .map_err(|e| GossipDataColumnError::BeaconChainError(Box::new(e.into())))? + .ok_or_else(|| { + GossipDataColumnError::BeaconChainError(Box::new( + BeaconChainError::DBInconsistent(format!( + "Missing state for parent block {block_parent_root:?}", + )), + )) + }) + }, + )?; + let proposer_index = proposer.index; + let fork = proposer.fork; // Signature verify the signed block header. 
let signature_is_valid = { @@ -782,16 +800,22 @@ pub fn observe_gossip_data_column( #[cfg(test)] mod test { use crate::data_column_verification::{ - GossipDataColumnError, validate_data_column_sidecar_for_gossip, + GossipDataColumnError, GossipVerifiedDataColumn, validate_data_column_sidecar_for_gossip, }; use crate::observed_data_sidecars::Observe; - use crate::test_utils::BeaconChainHarness; + use crate::test_utils::{ + BeaconChainHarness, EphemeralHarnessType, generate_data_column_sidecars_from_block, + }; + use eth2::types::BlobsBundle; + use execution_layer::test_utils::generate_blobs; + use std::sync::Arc; use types::{DataColumnSidecar, DataColumnSubnetId, EthSpec, ForkName, MainnetEthSpec}; type E = MainnetEthSpec; #[tokio::test] - async fn empty_data_column_sidecars_fails_validation() { + async fn test_validate_data_column_sidecar_for_gossip() { + // Setting up harness is slow, we initialise once and use it for all gossip validation tests. let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); let harness = BeaconChainHarness::builder(E::default()) .spec(spec.into()) @@ -801,6 +825,44 @@ mod test { .build(); harness.advance_slot(); + let verify_fn = |column_sidecar: DataColumnSidecar| { + let col_index = column_sidecar.index; + validate_data_column_sidecar_for_gossip::<_, Observe>( + column_sidecar.into(), + DataColumnSubnetId::from_column_index(col_index, &harness.spec), + &harness.chain, + ) + }; + empty_data_column_sidecars_fails_validation(&harness, &verify_fn).await; + data_column_sidecar_commitments_exceed_max_blobs_per_block(&harness, &verify_fn).await; + } + + #[tokio::test] + async fn test_new_for_block_publishing() { + // Setting up harness is slow, we initialise once and use it for all gossip validation tests. 
+ let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let harness = BeaconChainHarness::builder(E::default()) + .spec(spec.into()) + .deterministic_keypairs(64) + .fresh_ephemeral_store() + .mock_execution_layer() + .build(); + harness.advance_slot(); + + let verify_fn = |column_sidecar: DataColumnSidecar| { + GossipVerifiedDataColumn::<_>::new_for_block_publishing( + column_sidecar.into(), + &harness.chain, + ) + }; + empty_data_column_sidecars_fails_validation(&harness, &verify_fn).await; + data_column_sidecar_commitments_exceed_max_blobs_per_block(&harness, &verify_fn).await; + } + + async fn empty_data_column_sidecars_fails_validation( + harness: &BeaconChainHarness>, + verify_fn: &impl Fn(DataColumnSidecar) -> Result, + ) { let slot = harness.get_current_slot(); let state = harness.get_current_state(); let ((block, _blobs_opt), _state) = harness @@ -823,14 +885,47 @@ mod test { .unwrap(), }; - let result = validate_data_column_sidecar_for_gossip::<_, Observe>( - column_sidecar.into(), - DataColumnSubnetId::from_column_index(index, &harness.spec), - &harness.chain, - ); + let result = verify_fn(column_sidecar); assert!(matches!( result.err(), Some(GossipDataColumnError::UnexpectedDataColumn) )); } + + async fn data_column_sidecar_commitments_exceed_max_blobs_per_block( + harness: &BeaconChainHarness>, + verify_fn: &impl Fn(DataColumnSidecar) -> Result, + ) { + let slot = harness.get_current_slot(); + let epoch = slot.epoch(E::slots_per_epoch()); + let state = harness.get_current_state(); + let max_blobs_per_block = harness.spec.max_blobs_per_block(epoch) as usize; + let fork = harness.spec.fork_name_at_epoch(epoch); + + // Generate data column sidecar with blob count exceeding max_blobs_per_block. 
+ let blob_count = max_blobs_per_block + 1; + let BlobsBundle:: { + commitments: preloaded_commitments_single, + proofs: _, + blobs: _, + } = generate_blobs(1, fork).unwrap().0; + + let ((block, _blobs_opt), _state) = harness + .make_block_with_modifier(state, slot, |block| { + *block.body_mut().blob_kzg_commitments_mut().unwrap() = + vec![preloaded_commitments_single[0]; blob_count].into(); + }) + .await; + + let column_sidecar = generate_data_column_sidecars_from_block(&block, &harness.spec) + .into_iter() + .next() + .unwrap(); + + let result = verify_fn(Arc::try_unwrap(column_sidecar).unwrap()); + assert!(matches!( + result.err(), + Some(GossipDataColumnError::MaxBlobsPerBlockExceeded { .. }) + )); + } } diff --git a/beacon_node/beacon_chain/src/errors.rs b/beacon_node/beacon_chain/src/errors.rs index a1a0ec74f6..7b04a36fae 100644 --- a/beacon_node/beacon_chain/src/errors.rs +++ b/beacon_node/beacon_chain/src/errors.rs @@ -230,6 +230,23 @@ pub enum BeaconChainError { columns_found: usize, }, FailedToReconstructBlobs(String), + ProposerCacheIncorrectState { + state_decision_block_root: Hash256, + requested_decision_block_root: Hash256, + }, + ProposerCacheAccessorFailure { + decision_block_root: Hash256, + proposal_epoch: Epoch, + }, + ProposerCacheOutOfBounds { + slot: Slot, + epoch: Epoch, + }, + ProposerCacheWrongEpoch { + request_epoch: Epoch, + cache_epoch: Epoch, + }, + SkipProposerPreparation, } easy_from_to!(SlotProcessingError, BeaconChainError); diff --git a/beacon_node/beacon_chain/src/execution_payload.rs b/beacon_node/beacon_chain/src/execution_payload.rs index 697fee351e..f0cab06ca3 100644 --- a/beacon_node/beacon_chain/src/execution_payload.rs +++ b/beacon_node/beacon_chain/src/execution_payload.rs @@ -24,7 +24,7 @@ use state_processing::per_block_processing::{ }; use std::sync::Arc; use tokio::task::JoinHandle; -use tracing::{debug, warn}; +use tracing::{Instrument, debug, debug_span, warn}; use tree_hash::TreeHash; use 
types::payload::BlockProductionVersion; use types::*; @@ -403,8 +403,9 @@ pub fn get_execution_payload( block_production_version, ) .await - }, - "get_execution_payload", + } + .instrument(debug_span!("prepare_execution_payload")), + "prepare_execution_payload", ) .ok_or(BlockProductionError::ShuttingDown)?; @@ -503,6 +504,7 @@ where }, "prepare_execution_payload_forkchoice_update_params", ) + .instrument(debug_span!("forkchoice_update_params")) .await .map_err(|e| BlockProductionError::BeaconChain(Box::new(e)))?; diff --git a/beacon_node/beacon_chain/src/historical_blocks.rs b/beacon_node/beacon_chain/src/historical_blocks.rs index 8b9fb5e354..15e0a55cf5 100644 --- a/beacon_node/beacon_chain/src/historical_blocks.rs +++ b/beacon_node/beacon_chain/src/historical_blocks.rs @@ -140,7 +140,7 @@ impl BeaconChain { // Store the blobs or data columns too if let Some(op) = self - .get_blobs_or_columns_store_op(block_root, block_data) + .get_blobs_or_columns_store_op(block_root, block.slot(), block_data) .map_err(|e| { HistoricalBlockError::StoreError(StoreError::DBError { message: format!("get_blobs_or_columns_store_op error {e:?}"), diff --git a/beacon_node/beacon_chain/src/kzg_utils.rs b/beacon_node/beacon_chain/src/kzg_utils.rs index cde9050ed2..382775ab50 100644 --- a/beacon_node/beacon_chain/src/kzg_utils.rs +++ b/beacon_node/beacon_chain/src/kzg_utils.rs @@ -1,6 +1,6 @@ use kzg::{ Blob as KzgBlob, Bytes48, Cell as KzgCell, CellRef as KzgCellRef, CellsAndKzgProofs, - Error as KzgError, Kzg, + Error as KzgError, Kzg, KzgBlobRef, }; use rayon::prelude::*; use ssz_types::{FixedVector, VariableList}; @@ -28,9 +28,9 @@ fn ssz_blob_to_crypto_blob_boxed(blob: &Blob) -> Result(cell: &Cell) -> Result, KzgError> { let cell_bytes: &[u8] = cell.as_ref(); - Ok(cell_bytes + cell_bytes .try_into() - .expect("expected cell to have size {BYTES_PER_CELL}. 
This should be guaranteed by the `FixedVector type")) + .map_err(|e| KzgError::InconsistentArrayLength(format!("expected cell to have size BYTES_PER_CELL. This should be guaranteed by the `FixedVector` type: {e:?}"))) } /// Validate a single blob-commitment-proof triplet from a `BlobSidecar`. @@ -174,6 +174,13 @@ pub fn blobs_to_data_column_sidecars( let kzg_commitments_inclusion_proof = block.message().body().kzg_commitments_merkle_proof()?; let signed_block_header = block.signed_block_header(); + if cell_proofs.len() != blobs.len() * E::number_of_columns() { + return Err(DataColumnSidecarError::InvalidCellProofLength { + expected: blobs.len() * E::number_of_columns(), + actual: cell_proofs.len(), + }); + } + let proof_chunks = cell_proofs .chunks_exact(E::number_of_columns()) .collect::>(); @@ -183,18 +190,19 @@ pub fn blobs_to_data_column_sidecars( let blob_cells_and_proofs_vec = zipped .into_par_iter() .map(|(blob, proofs)| { - let blob = blob - .as_ref() - .try_into() - .expect("blob should have a guaranteed size due to FixedVector"); + let blob = blob.as_ref().try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "blob should have a guaranteed size due to FixedVector: {e:?}" + )) + })?; - kzg.compute_cells(blob).map(|cells| { - ( - cells, - proofs - .try_into() - .expect("proof chunks should have exactly `number_of_columns` proofs"), - ) + kzg.compute_cells(blob).and_then(|cells| { + let proofs = proofs.try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "proof chunks should have exactly `number_of_columns` proofs: {e:?}" + )) + })?; + Ok((cells, proofs)) }) }) .collect::, KzgError>>()?; @@ -213,10 +221,11 @@ pub fn compute_cells(blobs: &[&Blob], kzg: &Kzg) -> Result = blob.as_ref().try_into().map_err(|e| { + KzgError::InconsistentArrayLength(format!( + "blob should have a guaranteed size due to FixedVector: {e:?}", + )) + })?; kzg.compute_cells(blob) }) @@ -290,6 +299,8 @@ pub(crate) fn build_data_column_sidecars( /// 
/// If `blob_indices_opt` is `None`, this function attempts to reconstruct all blobs associated /// with the block. +/// This function does NOT use rayon as this is primarily used by a non critical path in HTTP API +/// and it will be slow if the node needs to reconstruct the blobs pub fn reconstruct_blobs( kzg: &Kzg, data_columns: &[Arc>], @@ -311,7 +322,7 @@ pub fn reconstruct_blobs( }; let blob_sidecars = blob_indices - .into_par_iter() + .into_iter() .map(|row_index| { let mut cells: Vec = vec![]; let mut cell_ids: Vec = vec![]; @@ -328,16 +339,26 @@ pub fn reconstruct_blobs( cell_ids.push(data_column.index); } - let (cells, _kzg_proofs) = kzg - .recover_cells_and_compute_kzg_proofs(&cell_ids, &cells) - .map_err(|e| format!("Failed to recover cells and compute KZG proofs: {e:?}"))?; + let num_cells_original_blob = E::number_of_columns() / 2; + let blob_bytes = if data_columns.len() < E::number_of_columns() { + let (recovered_cells, _kzg_proofs) = kzg + .recover_cells_and_compute_kzg_proofs(&cell_ids, &cells) + .map_err(|e| { + format!("Failed to recover cells and compute KZG proofs: {e:?}") + })?; - let num_cells_original_blob = cells.len() / 2; - let blob_bytes = cells - .into_iter() - .take(num_cells_original_blob) - .flat_map(|cell| cell.into_iter()) - .collect(); + recovered_cells + .into_iter() + .take(num_cells_original_blob) + .flat_map(|cell| cell.into_iter()) + .collect() + } else { + cells + .into_iter() + .take(num_cells_original_blob) + .flat_map(|cell| (*cell).into_iter()) + .collect() + }; let blob = Blob::::new(blob_bytes).map_err(|e| format!("{e:?}"))?; let kzg_proof = KzgProof::empty(); @@ -363,14 +384,18 @@ pub fn reconstruct_blobs( /// Reconstruct all data columns from a subset of data column sidecars (requires at least 50%). 
pub fn reconstruct_data_columns( kzg: &Kzg, - data_columns: &[Arc>], + mut data_columns: Vec>>, spec: &ChainSpec, ) -> Result, KzgError> { + // Sort data columns by index to ensure ascending order for KZG operations + data_columns.sort_unstable_by_key(|dc| dc.index); + let first_data_column = data_columns .first() .ok_or(KzgError::InconsistentArrayLength( "data_columns should have at least one element".to_string(), ))?; + let num_of_blobs = first_data_column.kzg_commitments.len(); let blob_cells_and_proofs_vec = @@ -379,7 +404,7 @@ pub fn reconstruct_data_columns( .map(|row_index| { let mut cells: Vec = vec![]; let mut cell_ids: Vec = vec![]; - for data_column in data_columns { + for data_column in &data_columns { let cell = data_column.column.get(row_index).ok_or( KzgError::InconsistentArrayLength(format!( "Missing data column at row index {row_index}" @@ -431,6 +456,7 @@ mod test { test_build_data_columns_empty(&kzg, &spec); test_build_data_columns(&kzg, &spec); test_reconstruct_data_columns(&kzg, &spec); + test_reconstruct_data_columns_unordered(&kzg, &spec); test_reconstruct_blobs_from_data_columns(&kzg, &spec); test_validate_data_columns(&kzg, &spec); } @@ -503,7 +529,7 @@ mod test { #[track_caller] fn test_reconstruct_data_columns(kzg: &Kzg, spec: &ChainSpec) { - let num_of_blobs = 6; + let num_of_blobs = 2; let (signed_block, blobs, proofs) = create_test_fulu_block_and_blobs::(num_of_blobs, spec); let blob_refs = blobs.iter().collect::>(); @@ -514,7 +540,7 @@ mod test { // Now reconstruct let reconstructed_columns = reconstruct_data_columns( kzg, - &column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2], + column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2].to_vec(), spec, ) .unwrap(); @@ -524,6 +550,27 @@ mod test { } } + #[track_caller] + fn test_reconstruct_data_columns_unordered(kzg: &Kzg, spec: &ChainSpec) { + let num_of_blobs = 2; + let (signed_block, blobs, proofs) = + create_test_fulu_block_and_blobs::(num_of_blobs, spec); + let 
blob_refs = blobs.iter().collect::>(); + let column_sidecars = + blobs_to_data_column_sidecars(&blob_refs, proofs.to_vec(), &signed_block, kzg, spec) + .unwrap(); + + // Test reconstruction with columns in reverse order (non-ascending) + let mut subset_columns: Vec<_> = + column_sidecars.iter().as_slice()[0..column_sidecars.len() / 2].to_vec(); + subset_columns.reverse(); // This would fail without proper sorting in reconstruct_data_columns + let reconstructed_columns = reconstruct_data_columns(kzg, subset_columns, spec).unwrap(); + + for i in 0..E::number_of_columns() { + assert_eq!(reconstructed_columns.get(i), column_sidecars.get(i), "{i}"); + } + } + #[track_caller] fn test_reconstruct_blobs_from_data_columns(kzg: &Kzg, spec: &ChainSpec) { let num_of_blobs = 6; diff --git a/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs b/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs index 0d5a5425d5..fe62b8ef90 100644 --- a/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs +++ b/beacon_node/beacon_chain/src/light_client_finality_update_verification.rs @@ -116,7 +116,13 @@ impl VerifiedLightClientFinalityUpdate { // Verify that the gossiped finality update is the same as the locally constructed one. if latest_finality_update != rcv_finality_update { let signature_slot = latest_finality_update.signature_slot(); + if signature_slot != rcv_finality_update.signature_slot() { + // The locally constructed finality update is not up to date, probably + // because the node has fallen behind and needs to sync. 
+ if rcv_finality_update.signature_slot() > signature_slot { + return Err(Error::Ignore); + } return Err(Error::MismatchedSignatureSlot { local: signature_slot, observed: rcv_finality_update.signature_slot(), diff --git a/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs b/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs index 4da6913443..b59390ea0c 100644 --- a/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs +++ b/beacon_node/beacon_chain/src/light_client_optimistic_update_verification.rs @@ -118,6 +118,11 @@ impl VerifiedLightClientOptimisticUpdate { if latest_optimistic_update != rcv_optimistic_update { let signature_slot = latest_optimistic_update.signature_slot(); if signature_slot != rcv_optimistic_update.signature_slot() { + // The locally constructed optimistic update is not up to date, probably + // because the node has fallen behind and needs to sync. + if rcv_optimistic_update.signature_slot() > signature_slot { + return Err(Error::Ignore); + } return Err(Error::MismatchedSignatureSlot { local: signature_slot, observed: rcv_optimistic_update.signature_slot(), diff --git a/beacon_node/beacon_chain/src/metrics.rs b/beacon_node/beacon_chain/src/metrics.rs index 1b57bad104..0d34ffdcd1 100644 --- a/beacon_node/beacon_chain/src/metrics.rs +++ b/beacon_node/beacon_chain/src/metrics.rs @@ -458,12 +458,6 @@ pub static BEACON_EARLY_ATTESTER_CACHE_HITS: LazyLock> = Lazy ) }); -pub static BEACON_REQRESP_PRE_IMPORT_CACHE_SIZE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "beacon_reqresp_pre_import_cache_size", - "Current count of items of the reqresp pre import cache", - ) -}); pub static BEACON_REQRESP_PRE_IMPORT_CACHE_HITS: LazyLock> = LazyLock::new(|| { try_create_int_counter( @@ -578,6 +572,14 @@ pub static FORK_CHOICE_READ_LOCK_AQUIRE_TIMES: LazyLock> = Laz exponential_buckets(1e-4, 4.0, 7), ) }); +pub static FORK_CHOICE_UPGRADABLE_READ_LOCK_AQUIRE_TIMES: LazyLock> = + 
LazyLock::new(|| { + try_create_histogram_with_buckets( + "beacon_fork_choice_upgradable_read_lock_aquire_seconds", + "Time taken to aquire the fork-choice upgradable read lock", + exponential_buckets(1e-4, 4.0, 7), + ) + }); pub static FORK_CHOICE_WRITE_LOCK_AQUIRE_TIMES: LazyLock> = LazyLock::new(|| { try_create_histogram_with_buckets( "beacon_fork_choice_write_lock_aquire_seconds", @@ -1957,7 +1959,6 @@ pub fn scrape_for_metrics(beacon_chain: &BeaconChain) { } let attestation_stats = beacon_chain.op_pool.attestation_stats(); - let chain_metrics = beacon_chain.metrics(); // Kept duplicated for backwards compatibility set_gauge_by_usize( @@ -1965,11 +1966,6 @@ pub fn scrape_for_metrics(beacon_chain: &BeaconChain) { beacon_chain.store.state_cache_len(), ); - set_gauge_by_usize( - &BEACON_REQRESP_PRE_IMPORT_CACHE_SIZE, - chain_metrics.reqresp_pre_import_cache_len, - ); - let da_checker_metrics = beacon_chain.data_availability_checker.metrics(); set_gauge_by_usize( &DATA_AVAILABILITY_OVERFLOW_MEMORY_BLOCK_CACHE_SIZE, diff --git a/beacon_node/beacon_chain/src/state_advance_timer.rs b/beacon_node/beacon_chain/src/state_advance_timer.rs index 27c2c7c0a1..87348cb01b 100644 --- a/beacon_node/beacon_chain/src/state_advance_timer.rs +++ b/beacon_node/beacon_chain/src/state_advance_timer.rs @@ -33,7 +33,7 @@ use types::{AttestationShufflingId, BeaconStateError, EthSpec, Hash256, Relative /// /// This avoids doing unnecessary work whilst the node is syncing or has perhaps been put to sleep /// for some period of time. -const MAX_ADVANCE_DISTANCE: u64 = 4; +const MAX_ADVANCE_DISTANCE: u64 = 256; /// Similarly for fork choice: avoid the fork choice lookahead during sync. 
/// @@ -49,17 +49,7 @@ enum Error { HeadMissingFromSnapshotCache(#[allow(dead_code)] Hash256), BeaconState(#[allow(dead_code)] BeaconStateError), Store(#[allow(dead_code)] store::Error), - MaxDistanceExceeded { - current_slot: Slot, - head_slot: Slot, - }, - StateAlreadyAdvanced { - block_root: Hash256, - }, - BadStateSlot { - _state_slot: Slot, - _block_slot: Slot, - }, + MaxDistanceExceeded { current_slot: Slot, head_slot: Slot }, } impl From for Error { @@ -180,9 +170,6 @@ async fn state_advance_timer( error = ?e, "Failed to advance head state" ), - Err(Error::StateAlreadyAdvanced { block_root }) => { - debug!(?block_root, "State already advanced on slot") - } Err(Error::MaxDistanceExceeded { current_slot, head_slot, @@ -295,25 +282,6 @@ fn advance_head(beacon_chain: &Arc>) -> Resu .get_advanced_hot_state(head_block_root, current_slot, head_block_state_root)? .ok_or(Error::HeadMissingFromSnapshotCache(head_block_root))?; - // Protect against advancing a state more than a single slot. - // - // Advancing more than one slot without storing the intermediate state would corrupt the - // database. Future works might store intermediate states inside this function. 
- match state.slot().cmp(&state.latest_block_header().slot) { - std::cmp::Ordering::Equal => (), - std::cmp::Ordering::Greater => { - return Err(Error::StateAlreadyAdvanced { - block_root: head_block_root, - }); - } - std::cmp::Ordering::Less => { - return Err(Error::BadStateSlot { - _block_slot: state.latest_block_header().slot, - _state_slot: state.slot(), - }); - } - } - let initial_slot = state.slot(); let initial_epoch = state.current_epoch(); diff --git a/beacon_node/beacon_chain/src/test_utils.rs b/beacon_node/beacon_chain/src/test_utils.rs index 4737d07f2e..bfd0484d91 100644 --- a/beacon_node/beacon_chain/src/test_utils.rs +++ b/beacon_node/beacon_chain/src/test_utils.rs @@ -715,7 +715,10 @@ where pub fn set_mock_builder( &mut self, beacon_url: SensitiveUrl, - ) -> impl futures::Future + 'static { + strict_registrations: bool, + apply_operations: bool, + broadcast_to_bn: bool, + ) -> impl futures::Future + use { let mock_el = self .mock_execution_layer .as_ref() @@ -727,6 +730,9 @@ where let (mock_builder, (addr, mock_builder_server)) = MockBuilder::new_for_testing( mock_el_url, beacon_url, + strict_registrations, + apply_operations, + broadcast_to_bn, self.spec.clone(), self.runtime.task_executor.clone(), ); @@ -903,8 +909,65 @@ where state: BeaconState, slot: Slot, ) -> (SignedBlindedBeaconBlock, BeaconState) { - let (unblinded, new_state) = self.make_block(state, slot).await; - ((*unblinded.0).clone().into(), new_state) + self.make_blinded_block_with_modifier(state, slot, |_| {}) + .await + } + + pub async fn make_blinded_block_with_modifier( + &self, + mut state: BeaconState, + slot: Slot, + block_modifier: impl FnOnce(&mut BlindedBeaconBlock), + ) -> (SignedBlindedBeaconBlock, BeaconState) { + assert_ne!(slot, 0, "can't produce a block at slot 0"); + assert!(slot >= state.slot()); + + complete_state_advance(&mut state, None, slot, &self.spec) + .expect("should be able to advance state to slot"); + + state.build_caches(&self.spec).expect("should build 
caches"); + + let proposer_index = state.get_beacon_proposer_index(slot, &self.spec).unwrap(); + + // If we produce two blocks for the same slot, they hash up to the same value and + // BeaconChain errors out with `DuplicateFullyImported`. Vary the graffiti so that we produce + // different blocks each time. + let graffiti = Graffiti::from(self.rng.lock().random::<[u8; 32]>()); + + let randao_reveal = self.sign_randao_reveal(&state, proposer_index, slot); + + // Always use the builder, so that we produce a "real" blinded payload. + let builder_boost_factor = Some(u64::MAX); + + let BeaconBlockResponseWrapper::Blinded(block_response) = self + .chain + .produce_block_on_state( + state, + None, + slot, + randao_reveal, + Some(graffiti), + ProduceBlockVerification::VerifyRandao, + builder_boost_factor, + BlockProductionVersion::V3, + ) + .await + .unwrap() + else { + panic!("Should always be a blinded payload response"); + }; + + let mut block = block_response.block; + block_modifier(&mut block); + + let signed_block = block.sign( + &self.validator_keypairs[proposer_index].sk, + &block_response.state.fork(), + block_response.state.genesis_validators_root(), + &self.spec, + ); + + (signed_block, block_response.state) } /// Returns a newly created block, signed by the proposer for the given slot. @@ -3298,7 +3361,7 @@ pub fn generate_rand_block_and_data_columns( } /// Generate data column sidecars from pre-computed cells and proofs. -fn generate_data_column_sidecars_from_block( +pub fn generate_data_column_sidecars_from_block( block: &SignedBeaconBlock, spec: &ChainSpec, ) -> DataColumnSidecarList { diff --git a/beacon_node/beacon_chain/src/validator_custody.rs b/beacon_node/beacon_chain/src/validator_custody.rs index 1c89624f9d..3ab76828c9 100644 --- a/beacon_node/beacon_chain/src/validator_custody.rs +++ b/beacon_node/beacon_chain/src/validator_custody.rs @@ -130,7 +130,7 @@ pub struct CustodyContext { /// and enr values. 
validator_custody_count: AtomicU64, /// Is the node run as a supernode based on current cli parameters. - pub current_is_supernode: bool, + current_is_supernode: bool, /// The persisted value for `is_supernode` based on the previous run of this node. /// /// Note: We require this value because if a user restarts the node with a higher cli custody @@ -307,6 +307,14 @@ impl CustodyContext { .expect("should compute node sampling size from valid chain spec") } + /// Returns whether the node should attempt reconstruction at a given epoch. + pub fn should_attempt_reconstruction(&self, epoch: Epoch, spec: &ChainSpec) -> bool { + let min_columns_for_reconstruction = E::number_of_columns() / 2; + // performing reconstruction is not necessary if sampling column count is exactly 50%, + // because the node doesn't need the remaining columns. + self.num_of_data_columns_to_sample(epoch, spec) > min_columns_for_reconstruction + } + /// Returns the ordered list of column indices that should be sampled for data availability checking at the given epoch. /// /// # Parameters diff --git a/beacon_node/beacon_chain/src/validator_monitor.rs b/beacon_node/beacon_chain/src/validator_monitor.rs index 23f1a7d430..00c30e5ab1 100644 --- a/beacon_node/beacon_chain/src/validator_monitor.rs +++ b/beacon_node/beacon_chain/src/validator_monitor.rs @@ -497,7 +497,7 @@ impl ValidatorMonitor { }); // Add missed non-finalized blocks for the monitored validators - self.add_validators_missed_blocks(state); + self.add_validators_missed_blocks(state, spec); self.process_unaggregated_attestations(state, spec); // Update metrics for individual validators. 
@@ -588,7 +588,7 @@ impl ValidatorMonitor { } /// Add missed non-finalized blocks for the monitored validators - fn add_validators_missed_blocks(&mut self, state: &BeaconState) { + fn add_validators_missed_blocks(&mut self, state: &BeaconState, spec: &ChainSpec) { // Define range variables let current_slot = state.slot(); let current_epoch = current_slot.epoch(E::slots_per_epoch()); @@ -616,8 +616,8 @@ impl ValidatorMonitor { if block_root == prev_block_root { let slot_epoch = slot.epoch(E::slots_per_epoch()); - if let Ok(shuffling_decision_block) = - state.proposer_shuffling_decision_root_at_epoch(slot_epoch, *block_root) + if let Ok(shuffling_decision_block) = state + .proposer_shuffling_decision_root_at_epoch(slot_epoch, *block_root, spec) { // Update the cache if it has not yet been initialised, or if it is // initialised for a prior epoch. This is an optimisation to avoid bouncing diff --git a/beacon_node/beacon_chain/tests/block_verification.rs b/beacon_node/beacon_chain/tests/block_verification.rs index 58ca4a032e..47f5be02cb 100644 --- a/beacon_node/beacon_chain/tests/block_verification.rs +++ b/beacon_node/beacon_chain/tests/block_verification.rs @@ -42,7 +42,10 @@ enum DataSidecars { } async fn get_chain_segment() -> (Vec>, Vec>>) { - let harness = get_harness(VALIDATOR_COUNT); + // The assumption that you can re-import a block based on what you have in your DB + // is no longer true, as fullnodes stores less than what they sample. + // We use a supernode here to build a chain segment. 
+ let harness = get_harness(VALIDATOR_COUNT, true); harness .extend_chain( @@ -101,7 +104,10 @@ async fn get_chain_segment() -> (Vec>, Vec BeaconChainHarness> { +fn get_harness( + validator_count: usize, + supernode: bool, +) -> BeaconChainHarness> { let harness = BeaconChainHarness::builder(MainnetEthSpec) .default_spec() .chain_config(ChainConfig { @@ -109,6 +115,7 @@ fn get_harness(validator_count: usize) -> BeaconChainHarness( #[tokio::test] async fn chain_segment_full_segment() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -290,7 +297,7 @@ async fn chain_segment_full_segment() { #[tokio::test] async fn chain_segment_varying_chunk_size() { for chunk_size in &[1, 2, 3, 5, 31, 32, 33, 42] { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let blocks: Vec> = chain_segment_blocks(&chain_segment, &chain_segment_blobs) .into_iter() @@ -322,7 +329,7 @@ async fn chain_segment_varying_chunk_size() { #[tokio::test] async fn chain_segment_non_linear_parent_roots() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness @@ -379,7 +386,7 @@ async fn chain_segment_non_linear_parent_roots() { #[tokio::test] async fn chain_segment_non_linear_slots() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; harness .chain @@ -521,7 +528,7 @@ async fn assert_invalid_signature( async fn get_invalid_sigs_harness( chain_segment: &[BeaconSnapshot], ) -> BeaconChainHarness> { - let harness = 
get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); harness .chain .slot_clock @@ -979,7 +986,7 @@ fn unwrap_err(result: Result) -> U { #[tokio::test] async fn block_gossip_verification() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let (chain_segment, chain_segment_blobs) = get_chain_segment().await; let block_index = CHAIN_SEGMENT_LENGTH - 2; @@ -1382,7 +1389,7 @@ async fn verify_block_for_gossip_slashing_detection() { #[tokio::test] async fn verify_block_for_gossip_doppelganger_detection() { - let harness = get_harness(VALIDATOR_COUNT); + let harness = get_harness(VALIDATOR_COUNT, false); let state = harness.get_current_state(); let ((block, _), _) = harness.make_block(state.clone(), Slot::new(1)).await; @@ -1730,6 +1737,8 @@ async fn add_altair_block_to_base_chain() { )); } +// This is a regression test for this bug: +// https://github.com/sigp/lighthouse/issues/4332#issuecomment-1565092279 #[tokio::test] async fn import_duplicate_block_unrealized_justification() { let spec = MainnetEthSpec::default_spec(); @@ -1791,7 +1800,7 @@ async fn import_duplicate_block_unrealized_justification() { .await .unwrap(); - // Unrealized justification should NOT have updated. + // The store's global unrealized justification should update immediately and match the block. let unrealized_justification = { let fc = chain.canonical_head.fork_choice_read_lock(); assert_eq!(fc.justified_checkpoint().epoch, 0); @@ -1808,9 +1817,12 @@ async fn import_duplicate_block_unrealized_justification() { }; // Import the second verified block, simulating a block processed via RPC. - import_execution_pending_block(chain.clone(), verified_block2) - .await - .unwrap(); + assert_eq!( + import_execution_pending_block(chain.clone(), verified_block2) + .await + .unwrap_err(), + format!("DuplicateFullyImported({block_root})") + ); // Unrealized justification should still be updated. 
let fc3 = chain.canonical_head.fork_choice_read_lock(); diff --git a/beacon_node/beacon_chain/tests/store_tests.rs b/beacon_node/beacon_chain/tests/store_tests.rs index 62c08adc21..0b7004781f 100644 --- a/beacon_node/beacon_chain/tests/store_tests.rs +++ b/beacon_node/beacon_chain/tests/store_tests.rs @@ -1191,6 +1191,316 @@ fn check_shuffling_compatible( } } +/// These tests check the consistency of: +/// +/// - ProtoBlock::proposer_shuffling_root_for_child_block, and +/// - BeaconState::proposer_shuffling_decision_root{_at_epoch} +async fn proposer_shuffling_root_consistency_test( + spec: ChainSpec, + parent_slot: u64, + child_slot: u64, +) { + let child_slot = Slot::new(child_slot); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + // Build chain out to parent block. + let initial_slots: Vec = (1..=parent_slot).map(Into::into).collect(); + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, parent_root, _) = harness + .add_attested_blocks_at_slots(state, state_root, &initial_slots, &all_validators) + .await; + + // Add the child block. + let (state, state_root) = harness.get_current_state_and_root(); + let all_validators = harness.get_all_validators(); + let (_, _, child_root, child_block_state) = harness + .add_attested_blocks_at_slots(state, state_root, &[child_slot], &all_validators) + .await; + + let child_block_epoch = child_slot.epoch(E::slots_per_epoch()); + + // Load parent block from fork choice. 
+ let fc_parent = harness + .chain + .canonical_head + .fork_choice_read_lock() + .get_block(&parent_root.into()) + .unwrap(); + + // The proposer shuffling decision root computed using fork choice should equal the root + // computed from the child state. + let decision_root = fc_parent.proposer_shuffling_root_for_child_block(child_block_epoch, spec); + + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root(child_root.into(), spec) + .unwrap() + ); + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root_at_epoch(child_block_epoch, child_root.into(), spec) + .unwrap() + ); + + // The passed block root argument should be irrelevant for all blocks except the genesis block. + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root(Hash256::ZERO, spec) + .unwrap() + ); + assert_eq!( + decision_root, + child_block_state + .proposer_shuffling_decision_root_at_epoch(child_block_epoch, Hash256::ZERO, spec) + .unwrap() + ); +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_same_epoch() { + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 39).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_next_epoch() { + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 47).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_two_epochs() { + let spec = test_spec::(); + proposer_shuffling_root_consistency_test(spec, 32, 55).await; +} + +#[tokio::test] +async fn proposer_shuffling_root_consistency_at_fork_boundary() { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + spec.fulu_fork_epoch = Some(Epoch::new(4)); + + // Parent block in epoch prior to Fulu fork epoch, child block in Fulu fork epoch. 
+ proposer_shuffling_root_consistency_test( + spec.clone(), + 3 * E::slots_per_epoch(), + 4 * E::slots_per_epoch(), + ) + .await; + + // Parent block and child block in Fulu fork epoch. + proposer_shuffling_root_consistency_test( + spec.clone(), + 4 * E::slots_per_epoch(), + 4 * E::slots_per_epoch() + 1, + ) + .await; + + // Parent block in Fulu fork epoch and child block in epoch after. + proposer_shuffling_root_consistency_test( + spec.clone(), + 4 * E::slots_per_epoch(), + 5 * E::slots_per_epoch(), + ) + .await; + + // Parent block in epoch prior and child block in epoch after. + proposer_shuffling_root_consistency_test( + spec, + 3 * E::slots_per_epoch(), + 5 * E::slots_per_epoch(), + ) + .await; +} + +#[tokio::test] +async fn proposer_shuffling_changing_with_lookahead() { + let initial_blocks = E::slots_per_epoch() * 4 - 1; + + let spec = ForkName::Fulu.make_genesis_spec(E::default_spec()); + let db_path = tempdir().unwrap(); + let store = get_store_generic(&db_path, Default::default(), spec.clone()); + let validators_keypairs = + types::test_utils::generate_deterministic_keypairs(LOW_VALIDATOR_COUNT); + let harness = TestHarness::builder(MinimalEthSpec) + .spec(spec.into()) + .keypairs(validators_keypairs) + .fresh_disk_store(store) + .mock_execution_layer() + .build(); + let spec = &harness.chain.spec; + + // Start with some blocks, finishing with one slot before a new epoch. + harness.advance_slot(); + harness + .extend_chain( + initial_blocks as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let pre_deposit_state = harness.get_current_state(); + assert_eq!(pre_deposit_state.slot(), initial_blocks); + let topup_block_slot = Slot::new(initial_blocks + 1); + let validator_to_topup_index = 1; + let validator_to_topup = pre_deposit_state + .get_validator(validator_to_topup_index) + .unwrap() + .clone(); + + // Craft a block with a deposit request and consolidation. 
+ // XXX: This is a really nasty way to do this, but we need better test facilities in + // MockExecutionLayer to address this. + let deposit_request: DepositRequest = DepositRequest { + index: pre_deposit_state.eth1_deposit_index(), + pubkey: validator_to_topup.pubkey, + withdrawal_credentials: validator_to_topup.withdrawal_credentials, + amount: 63_000_000_000, + signature: SignatureBytes::empty(), + }; + + let consolidation_request: ConsolidationRequest = ConsolidationRequest { + source_address: validator_to_topup + .get_execution_withdrawal_address(spec) + .unwrap(), + source_pubkey: validator_to_topup.pubkey, + target_pubkey: validator_to_topup.pubkey, + }; + + let execution_requests = ExecutionRequests:: { + deposits: VariableList::new(vec![deposit_request]).unwrap(), + withdrawals: vec![].into(), + consolidations: VariableList::new(vec![consolidation_request]).unwrap(), + }; + + let mut block = Box::pin(harness.make_block_with_modifier( + pre_deposit_state.clone(), + topup_block_slot, + |block| *block.body_mut().execution_requests_mut().unwrap() = execution_requests, + )) + .await + .0; + + let Err(BlockError::StateRootMismatch { + local: true_state_root, + .. + }) = harness + .process_block(topup_block_slot, block.0.canonical_root(), block.clone()) + .await + else { + panic!("state root should not match due to pending deposits changes/etc"); + }; + let mut new_block = block.0.message_fulu().unwrap().clone(); + new_block.state_root = true_state_root; + block.0 = Arc::new(harness.sign_beacon_block(new_block.into(), &pre_deposit_state)); + + harness + .process_block(topup_block_slot, block.0.canonical_root(), block.clone()) + .await + .unwrap(); + + // Advance two epochs to finalize the deposit and process it. + // Start with just a single epoch advance so we can grab the state one epoch prior to where + // we end up. 
+ harness.advance_slot(); + harness + .extend_chain( + E::slots_per_epoch() as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Grab the epoch start state. This is the state from which the proposers at the next epoch were + // computed. + let prev_epoch_state = harness.get_current_state(); + assert_eq!(prev_epoch_state.slot() % E::slots_per_epoch(), 0); + + // The deposit should be pending. + let pending_deposits = prev_epoch_state.pending_deposits().unwrap(); + assert_eq!(pending_deposits.len(), 1, "{pending_deposits:?}"); + + // Advance the 2nd epoch to finalize the deposit and process it. + harness.advance_slot(); + harness + .extend_chain( + E::slots_per_epoch() as usize, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + let current_epoch_state = harness.get_current_state(); + assert_eq!(current_epoch_state.slot() % E::slots_per_epoch(), 0); + + // Deposit is processed! + let pending_deposits = current_epoch_state.pending_deposits().unwrap(); + assert_eq!(pending_deposits.len(), 0, "{pending_deposits:?}"); + + let validator = current_epoch_state + .get_validator(validator_to_topup_index) + .unwrap(); + assert!(validator.has_compounding_withdrawal_credential(spec)); + assert_eq!(validator.effective_balance, 95_000_000_000); + + // The shuffling for the current epoch from `prev_epoch_state` should match the shuffling + // for the current epoch from `current_epoch_state` because we should be correctly using the + // stored lookahead. + let current_epoch = current_epoch_state.current_epoch(); + let proposer_shuffling = prev_epoch_state + .get_beacon_proposer_indices(current_epoch, spec) + .unwrap(); + + assert_eq!( + proposer_shuffling, + current_epoch_state + .get_beacon_proposer_indices(current_epoch, spec) + .unwrap() + ); + + // If we bypass the safety checks in `get_proposer_indices`, we should see that the shuffling + // differs due to the effective balance change. 
+ let unsafe_get_proposer_indices = |state: &BeaconState, epoch| -> Vec { + let indices = state.get_active_validator_indices(epoch, spec).unwrap(); + let preimage = state.get_seed(epoch, Domain::BeaconProposer, spec).unwrap(); + epoch + .slot_iter(E::slots_per_epoch()) + .map(|slot| { + let mut preimage = preimage.to_vec(); + preimage.append(&mut int_to_bytes::int_to_bytes8(slot.as_u64())); + let seed = ethereum_hashing::hash(&preimage); + state.compute_proposer_index(&indices, &seed, spec).unwrap() + }) + .collect() + }; + + // The unsafe function is correct when used with lookahead. + assert_eq!( + unsafe_get_proposer_indices(&prev_epoch_state, current_epoch), + proposer_shuffling + ); + + // Computing the shuffling for current epoch without lookahead is WRONG. + assert_ne!( + unsafe_get_proposer_indices(¤t_epoch_state, current_epoch), + proposer_shuffling, + ); +} + // Ensure blocks from abandoned forks are pruned from the Hot DB #[tokio::test] async fn prunes_abandoned_fork_between_two_finalized_checkpoints() { @@ -2425,6 +2735,14 @@ async fn weak_subjectivity_sync_test( .rng(Box::new(StdRng::seed_from_u64(42))) .build() .expect("should build"); + beacon_chain + .data_availability_checker + .custody_context() + .init_ordered_data_columns_from_custody_groups( + (0..spec.number_of_custody_groups).collect(), + &spec, + ) + .unwrap(); let beacon_chain = Arc::new(beacon_chain); let wss_block_root = wss_block.canonical_root(); @@ -3827,6 +4145,88 @@ async fn replay_from_split_state() { assert_eq!(state.slot(), split.slot); } +/// Test that regular nodes filter and store only custody columns when processing blocks with data columns. 
+#[tokio::test] +async fn test_custody_column_filtering_regular_node() { + // Skip test if PeerDAS is not scheduled + if !test_spec::().is_peer_das_scheduled() { + return; + } + + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + let harness = get_harness(store.clone(), LOW_VALIDATOR_COUNT); + + // Generate a block with data columns + harness.execution_block_generator().set_min_blob_count(1); + let current_slot = harness.get_current_slot(); + let block_root = harness + .extend_chain( + 1, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Get custody columns for this epoch - regular nodes only store a subset + let expected_custody_columns: HashSet<_> = harness + .chain + .custody_columns_for_epoch(Some(current_slot.epoch(E::slots_per_epoch()))) + .iter() + .copied() + .collect(); + + // Check what actually got stored in the database + let stored_column_indices: HashSet<_> = store + .get_data_column_keys(block_root) + .expect("should get stored column keys") + .into_iter() + .collect(); + + assert_eq!( + stored_column_indices, expected_custody_columns, + "Regular node should only store custody columns" + ); +} + +/// Test that supernodes store all data columns when processing blocks with data columns. 
+#[tokio::test] +async fn test_custody_column_filtering_supernode() { + // Skip test if PeerDAS is not scheduled + if !test_spec::().is_peer_das_scheduled() { + return; + } + + let db_path = tempdir().unwrap(); + let store = get_store(&db_path); + let harness = get_harness_import_all_data_columns(store.clone(), LOW_VALIDATOR_COUNT); + + // Generate a block with data columns + harness.execution_block_generator().set_min_blob_count(1); + let block_root = harness + .extend_chain( + 1, + BlockStrategy::OnCanonicalHead, + AttestationStrategy::AllValidators, + ) + .await; + + // Supernodes are expected to store all data columns + let expected_custody_columns: HashSet<_> = (0..E::number_of_columns() as u64).collect(); + + // Check what actually got stored in the database + let stored_column_indices: HashSet<_> = store + .get_data_column_keys(block_root) + .expect("should get stored column keys") + .into_iter() + .collect(); + + assert_eq!( + stored_column_indices, expected_custody_columns, + "Supernode should store all custody columns" + ); +} + /// Checks that two chains are the same, for the purpose of these tests. /// /// Several fields that are hard/impossible to check are ignored (e.g., the store). diff --git a/beacon_node/beacon_chain/tests/validator_monitor.rs b/beacon_node/beacon_chain/tests/validator_monitor.rs index 4e2554d3d8..95732abeb5 100644 --- a/beacon_node/beacon_chain/tests/validator_monitor.rs +++ b/beacon_node/beacon_chain/tests/validator_monitor.rs @@ -3,7 +3,7 @@ use beacon_chain::test_utils::{ }; use beacon_chain::validator_monitor::{MISSED_BLOCK_LAG_SLOTS, ValidatorMonitorConfig}; use std::sync::LazyLock; -use types::{Epoch, EthSpec, Keypair, MainnetEthSpec, PublicKeyBytes, Slot}; +use types::{Epoch, EthSpec, Hash256, Keypair, MainnetEthSpec, PublicKeyBytes, Slot}; // Should ideally be divisible by 3. 
pub const VALIDATOR_COUNT: usize = 48; @@ -74,7 +74,7 @@ async fn missed_blocks_across_epochs() { .get_hot_state(state_roots_by_slot[&start_slot]) .unwrap(); let decision_root = state - .proposer_shuffling_decision_root(genesis_block_root) + .proposer_shuffling_decision_root(genesis_block_root, &harness.chain.spec) .unwrap(); proposer_shuffling_cache .insert( @@ -152,7 +152,7 @@ async fn missed_blocks_basic() { .unwrap(); let mut missed_block_proposer = validator_indexes[slot_in_epoch.as_usize()]; let mut proposer_shuffling_decision_root = _state - .proposer_shuffling_decision_root(duplicate_block_root) + .proposer_shuffling_decision_root(duplicate_block_root, &harness1.chain.spec) .unwrap(); let beacon_proposer_cache = harness1 @@ -235,17 +235,20 @@ async fn missed_blocks_basic() { // Let's fill the cache with the proposers for the current epoch // and push the duplicate_block_root to the block_roots vector assert_eq!( - beacon_proposer_cache.lock().insert( - epoch, - duplicate_block_root, - validator_indexes.clone(), - _state2.fork() - ), + _state2.set_block_root(prev_slot, duplicate_block_root), Ok(()) ); + let decision_block_root = _state2 + .proposer_shuffling_decision_root_at_epoch(epoch, Hash256::ZERO, &harness2.chain.spec) + .unwrap(); assert_eq!( - _state2.set_block_root(prev_slot, duplicate_block_root), + beacon_proposer_cache.lock().insert( + epoch, + decision_block_root, + validator_indexes.clone(), + _state2.fork() + ), Ok(()) ); @@ -326,7 +329,11 @@ async fn missed_blocks_basic() { .unwrap(); missed_block_proposer = validator_indexes[slot_in_epoch.as_usize()]; proposer_shuffling_decision_root = _state3 - .proposer_shuffling_decision_root_at_epoch(epoch, duplicate_block_root) + .proposer_shuffling_decision_root_at_epoch( + epoch, + duplicate_block_root, + &harness1.chain.spec, + ) .unwrap(); let beacon_proposer_cache = harness3 diff --git a/beacon_node/beacon_processor/src/lib.rs b/beacon_node/beacon_processor/src/lib.rs index ab9ab045f4..28ed0cca91 
100644 --- a/beacon_node/beacon_processor/src/lib.rs +++ b/beacon_node/beacon_processor/src/lib.rs @@ -59,7 +59,7 @@ use std::sync::Arc; use std::task::Context; use std::time::{Duration, Instant}; use strum::IntoStaticStr; -use task_executor::TaskExecutor; +use task_executor::{RayonPoolType, TaskExecutor}; use tokio::sync::mpsc; use tokio::sync::mpsc::error::TrySendError; use tracing::{debug, error, trace, warn}; @@ -181,7 +181,7 @@ impl BeaconProcessorQueueLengths { // We don't request more than `PARENT_DEPTH_TOLERANCE` (32) lookups, so we can limit // this queue size. With 48 max blobs per block, each column sidecar list could be up to 12MB. rpc_custody_column_queue: 64, - column_reconstruction_queue: 64, + column_reconstruction_queue: 1, chain_segment_queue: 64, backfill_chain_segment: 64, gossip_block_queue: 1024, @@ -603,7 +603,7 @@ pub enum Work { process_fn: BlockingFn, }, ChainSegment(AsyncFn), - ChainSegmentBackfill(AsyncFn), + ChainSegmentBackfill(BlockingFn), Status(BlockingFn), BlocksByRangeRequest(AsyncFn), BlocksByRootsRequest(AsyncFn), @@ -867,7 +867,7 @@ impl BeaconProcessor { let mut rpc_blob_queue = FifoQueue::new(queue_lengths.rpc_blob_queue); let mut rpc_custody_column_queue = FifoQueue::new(queue_lengths.rpc_custody_column_queue); let mut column_reconstruction_queue = - FifoQueue::new(queue_lengths.column_reconstruction_queue); + LifoQueue::new(queue_lengths.column_reconstruction_queue); let mut chain_segment_queue = FifoQueue::new(queue_lengths.chain_segment_queue); let mut backfill_chain_segment = FifoQueue::new(queue_lengths.backfill_chain_segment); let mut gossip_block_queue = FifoQueue::new(queue_lengths.gossip_block_queue); @@ -1354,9 +1354,7 @@ impl BeaconProcessor { Work::RpcCustodyColumn { .. } => { rpc_custody_column_queue.push(work, work_id) } - Work::ColumnReconstruction(_) => { - column_reconstruction_queue.push(work, work_id) - } + Work::ColumnReconstruction(_) => column_reconstruction_queue.push(work), Work::ChainSegment { .. 
} => chain_segment_queue.push(work, work_id), Work::ChainSegmentBackfill { .. } => { backfill_chain_segment.push(work, work_id) @@ -1605,7 +1603,14 @@ impl BeaconProcessor { Work::BlocksByRangeRequest(work) | Work::BlocksByRootsRequest(work) => { task_spawner.spawn_async(work) } - Work::ChainSegmentBackfill(process_fn) => task_spawner.spawn_async(process_fn), + Work::ChainSegmentBackfill(process_fn) => { + if self.config.enable_backfill_rate_limiting { + task_spawner.spawn_blocking_with_rayon(RayonPoolType::LowPriority, process_fn) + } else { + // use the global rayon thread pool if backfill rate limiting is disabled. + task_spawner.spawn_blocking(process_fn) + } + } Work::ApiRequestP0(process_fn) | Work::ApiRequestP1(process_fn) => match process_fn { BlockingOrAsync::Blocking(process_fn) => task_spawner.spawn_blocking(process_fn), BlockingOrAsync::Async(process_fn) => task_spawner.spawn_async(process_fn), @@ -1667,6 +1672,21 @@ impl TaskSpawner { WORKER_TASK_NAME, ) } + + /// Spawns a blocking task on a rayon thread pool, dropping the `SendOnDrop` after task completion. + fn spawn_blocking_with_rayon(self, rayon_pool_type: RayonPoolType, task: F) + where + F: FnOnce() + Send + 'static, + { + self.executor.spawn_blocking_with_rayon( + move || { + task(); + drop(self.send_idle_on_drop) + }, + rayon_pool_type, + WORKER_TASK_NAME, + ) + } } /// This struct will send a message on `self.tx` when it is dropped. 
An error will be logged diff --git a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs index 032f14ce3d..8c33cf5869 100644 --- a/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs +++ b/beacon_node/beacon_processor/src/scheduler/work_reprocessing_queue.rs @@ -37,7 +37,9 @@ const TASK_NAME: &str = "beacon_processor_reprocess_queue"; const GOSSIP_BLOCKS: &str = "gossip_blocks"; const RPC_BLOCKS: &str = "rpc_blocks"; const ATTESTATIONS: &str = "attestations"; +const ATTESTATIONS_PER_ROOT: &str = "attestations_per_root"; const LIGHT_CLIENT_UPDATES: &str = "lc_updates"; +const LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT: &str = "lc_updates_per_parent_root"; /// Queue blocks for re-processing with an `ADDITIONAL_QUEUED_BLOCK_DELAY` after the slot starts. /// This is to account for any slight drift in the system clock. @@ -82,6 +84,9 @@ pub const BACKFILL_SCHEDULE_IN_SLOT: [(u32, u32); 3] = [ (4, 5), ]; +/// Trigger reconstruction if we are this many seconds into the current slot +pub const RECONSTRUCTION_DEADLINE: Duration = Duration::from_millis(3000); + /// Messages that the scheduler can receive. #[derive(AsRefStr)] pub enum ReprocessQueueMessage { @@ -168,10 +173,11 @@ pub struct IgnoredRpcBlock { } /// A backfill batch work that has been queued for processing later. 
-pub struct QueuedBackfillBatch(pub AsyncFn); +pub struct QueuedBackfillBatch(pub BlockingFn); pub struct QueuedColumnReconstruction { pub block_root: Hash256, + pub slot: Slot, pub process_fn: AsyncFn, } @@ -749,16 +755,26 @@ impl ReprocessQueue { } } InboundEvent::Msg(DelayColumnReconstruction(request)) => { + let mut reconstruction_delay = QUEUED_RECONSTRUCTION_DELAY; + if let Some(seconds_from_current_slot) = + self.slot_clock.seconds_from_current_slot_start() + && let Some(current_slot) = self.slot_clock.now() + && seconds_from_current_slot >= RECONSTRUCTION_DEADLINE + && current_slot == request.slot + { + // If we are at least `RECONSTRUCTION_DEADLINE` seconds into the current slot, + // and the reconstruction request is for the current slot, process reconstruction immediately. + reconstruction_delay = Duration::from_secs(0); + } match self.queued_column_reconstructions.entry(request.block_root) { Entry::Occupied(key) => { - // Push back the reattempted reconstruction self.column_reconstructions_delay_queue - .reset(key.get(), QUEUED_RECONSTRUCTION_DELAY) + .reset(key.get(), reconstruction_delay); } Entry::Vacant(vacant) => { let delay_key = self .column_reconstructions_delay_queue - .insert(request, QUEUED_RECONSTRUCTION_DELAY); + .insert(request, reconstruction_delay); vacant.insert(delay_key); } } @@ -815,10 +831,19 @@ impl ReprocessQueue { ); } - if let Some(queued_atts) = self.awaiting_attestations_per_root.get_mut(&root) - && let Some(index) = queued_atts.iter().position(|&id| id == queued_id) + if let Entry::Occupied(mut queued_atts) = + self.awaiting_attestations_per_root.entry(root) + && let Some(index) = + queued_atts.get().iter().position(|&id| id == queued_id) { - queued_atts.swap_remove(index); + let queued_atts_mut = queued_atts.get_mut(); + queued_atts_mut.swap_remove(index); + + // If the vec is empty after this attestation's removal, we need to delete + // the entry to prevent bloating the hashmap indefinitely. 
+ if queued_atts_mut.is_empty() { + queued_atts.remove_entry(); + } } } } @@ -839,13 +864,19 @@ impl ReprocessQueue { error!("Failed to send scheduled light client optimistic update"); } - if let Some(queued_lc_updates) = self - .awaiting_lc_updates_per_parent_root - .get_mut(&parent_root) - && let Some(index) = - queued_lc_updates.iter().position(|&id| id == queued_id) + if let Entry::Occupied(mut queued_lc_updates) = + self.awaiting_lc_updates_per_parent_root.entry(parent_root) + && let Some(index) = queued_lc_updates + .get() + .iter() + .position(|&id| id == queued_id) { - queued_lc_updates.swap_remove(index); + let queued_lc_updates_mut = queued_lc_updates.get_mut(); + queued_lc_updates_mut.swap_remove(index); + + if queued_lc_updates_mut.is_empty() { + queued_lc_updates.remove_entry(); + } } } } @@ -915,11 +946,21 @@ impl ReprocessQueue { &[ATTESTATIONS], self.attestations_delay_queue.len() as i64, ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[ATTESTATIONS_PER_ROOT], + self.awaiting_attestations_per_root.len() as i64, + ); metrics::set_gauge_vec( &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, &[LIGHT_CLIENT_UPDATES], self.lc_updates_delay_queue.len() as i64, ); + metrics::set_gauge_vec( + &metrics::BEACON_PROCESSOR_REPROCESSING_QUEUE_TOTAL, + &[LIGHT_CLIENT_UPDATES_PER_PARENT_ROOT], + self.awaiting_lc_updates_per_parent_root.len() as i64, + ); } fn recompute_next_backfill_batch_event(&mut self) { @@ -965,6 +1006,7 @@ impl ReprocessQueue { #[cfg(test)] mod tests { use super::*; + use crate::BeaconProcessorConfig; use logging::create_test_tracing_subscriber; use slot_clock::{ManualSlotClock, TestingSlotClock}; use std::ops::Add; @@ -1042,7 +1084,7 @@ mod tests { // Now queue a backfill sync batch. 
work_reprocessing_tx .try_send(ReprocessQueueMessage::BackfillSync(QueuedBackfillBatch( - Box::pin(async {}), + Box::new(|| {}), ))) .unwrap(); tokio::task::yield_now().await; @@ -1087,4 +1129,97 @@ mod tests { Duration::from_secs(slot_duration), ) } + + fn test_queue() -> ReprocessQueue { + create_test_tracing_subscriber(); + + let config = BeaconProcessorConfig::default(); + let (ready_work_tx, _) = mpsc::channel::(config.max_scheduled_work_queue_len); + let (_, reprocess_work_rx) = + mpsc::channel::(config.max_scheduled_work_queue_len); + let slot_clock = Arc::new(testing_slot_clock(12)); + + ReprocessQueue::new(ready_work_tx, reprocess_work_rx, slot_clock) + } + + // This is a regression test for a memory leak in `awaiting_attestations_per_root`. + // See: https://github.com/sigp/lighthouse/pull/8065 + #[tokio::test] + async fn prune_awaiting_attestations_per_root() { + create_test_tracing_subscriber(); + + let mut queue = test_queue(); + + // Pause time so it only advances manually + tokio::time::pause(); + + let beacon_block_root = Hash256::repeat_byte(0xaf); + + // Insert an attestation. + let att = ReprocessQueueMessage::UnknownBlockUnaggregate(QueuedUnaggregate { + beacon_block_root, + process_fn: Box::new(|| {}), + }); + + // Process the event to enter it into the delay queue. + queue.handle_message(InboundEvent::Msg(att)); + + // Check that it is queued. + assert_eq!(queue.awaiting_attestations_per_root.len(), 1); + assert!( + queue + .awaiting_attestations_per_root + .contains_key(&beacon_block_root) + ); + + // Advance time to expire the attestation. + advance_time(&queue.slot_clock, 2 * QUEUED_ATTESTATION_DELAY).await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyAttestation(_))); + queue.handle_message(ready_msg); + + // The entry for the block root should be gone. 
+ assert!(queue.awaiting_attestations_per_root.is_empty()); + } + + // This is a regression test for a memory leak in `awaiting_lc_updates_per_parent_root`. + // See: https://github.com/sigp/lighthouse/pull/8065 + #[tokio::test] + async fn prune_awaiting_lc_updates_per_parent_root() { + create_test_tracing_subscriber(); + + let mut queue = test_queue(); + + // Pause time so it only advances manually + tokio::time::pause(); + + let parent_root = Hash256::repeat_byte(0xaf); + + // Insert a light client update. + let msg = + ReprocessQueueMessage::UnknownLightClientOptimisticUpdate(QueuedLightClientUpdate { + parent_root, + process_fn: Box::new(|| {}), + }); + + // Process the event to enter it into the delay queue. + queue.handle_message(InboundEvent::Msg(msg)); + + // Check that it is queued. + assert_eq!(queue.awaiting_lc_updates_per_parent_root.len(), 1); + assert!( + queue + .awaiting_lc_updates_per_parent_root + .contains_key(&parent_root) + ); + + // Advance time to expire the update. + advance_time(&queue.slot_clock, 2 * QUEUED_LIGHT_CLIENT_UPDATE_DELAY).await; + let ready_msg = queue.next().await.unwrap(); + assert!(matches!(ready_msg, InboundEvent::ReadyLightClientUpdate(_))); + queue.handle_message(ready_msg); + + // The entry for the parent root should be gone.
+ assert!(queue.awaiting_lc_updates_per_parent_root.is_empty()); + } } diff --git a/beacon_node/builder_client/src/lib.rs b/beacon_node/builder_client/src/lib.rs index 0c3fdca907..2c83e34755 100644 --- a/beacon_node/builder_client/src/lib.rs +++ b/beacon_node/builder_client/src/lib.rs @@ -8,7 +8,7 @@ use eth2::types::{ use eth2::types::{FullPayloadContents, SignedBlindedBeaconBlock}; use eth2::{ CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, JSON_CONTENT_TYPE_HEADER, - SSZ_CONTENT_TYPE_HEADER, StatusCode, ok_or_error, + SSZ_CONTENT_TYPE_HEADER, StatusCode, ok_or_error, success_or_error, }; use reqwest::header::{ACCEPT, HeaderMap, HeaderValue}; use reqwest::{IntoUrl, Response}; @@ -249,7 +249,7 @@ impl BuilderHttpClient { .send() .await .map_err(Error::from)?; - ok_or_error(response).await + success_or_error(response).await } async fn post_with_raw_response( @@ -270,7 +270,7 @@ impl BuilderHttpClient { .send() .await .map_err(Error::from)?; - ok_or_error(response).await + success_or_error(response).await } /// `POST /eth/v1/builder/validators` diff --git a/beacon_node/client/src/builder.rs b/beacon_node/client/src/builder.rs index d984d5fedc..02c042bf28 100644 --- a/beacon_node/client/src/builder.rs +++ b/beacon_node/client/src/builder.rs @@ -412,7 +412,7 @@ where let blobs = if block.message().body().has_blobs() { debug!("Downloading finalized blobs"); if let Some(response) = remote - .get_blobs::(BlockId::Root(block_root), None, &spec) + .get_blob_sidecars::(BlockId::Root(block_root), None, &spec) .await .map_err(|e| format!("Error fetching finalized blobs from remote: {e:?}"))? 
{ diff --git a/beacon_node/client/src/compute_light_client_updates.rs b/beacon_node/client/src/compute_light_client_updates.rs index 44c3475bfe..0ef35588df 100644 --- a/beacon_node/client/src/compute_light_client_updates.rs +++ b/beacon_node/client/src/compute_light_client_updates.rs @@ -3,7 +3,7 @@ use beacon_processor::work_reprocessing_queue::ReprocessQueueMessage; use beacon_processor::{BeaconProcessorSend, Work, WorkEvent}; use futures::StreamExt; use futures::channel::mpsc::Receiver; -use tracing::error; +use tracing::{debug, error}; // Each `LightClientProducerEvent` is ~200 bytes. With the light_client server producing only recent // updates it is okay to drop some events in case of overloading. In normal network conditions @@ -27,7 +27,7 @@ pub async fn compute_light_client_updates( chain .recompute_and_cache_light_client_updates(event) .unwrap_or_else(|e| { - error!("error computing light_client updates {:?}", e); + debug!("error computing light_client updates {:?}", e); }); let msg = ReprocessQueueMessage::NewLightClientOptimisticUpdate { parent_root }; diff --git a/beacon_node/client/src/notifier.rs b/beacon_node/client/src/notifier.rs index 1e58c210da..c83cdad7e0 100644 --- a/beacon_node/client/src/notifier.rs +++ b/beacon_node/client/src/notifier.rs @@ -9,8 +9,8 @@ use execution_layer::{ EngineCapabilities, http::{ ENGINE_FORKCHOICE_UPDATED_V2, ENGINE_FORKCHOICE_UPDATED_V3, ENGINE_GET_PAYLOAD_V2, - ENGINE_GET_PAYLOAD_V3, ENGINE_GET_PAYLOAD_V4, ENGINE_NEW_PAYLOAD_V2, ENGINE_NEW_PAYLOAD_V3, - ENGINE_NEW_PAYLOAD_V4, + ENGINE_GET_PAYLOAD_V3, ENGINE_GET_PAYLOAD_V4, ENGINE_GET_PAYLOAD_V5, ENGINE_NEW_PAYLOAD_V2, + ENGINE_NEW_PAYLOAD_V3, ENGINE_NEW_PAYLOAD_V4, }, }; use lighthouse_network::{NetworkGlobals, types::SyncState}; @@ -524,18 +524,16 @@ fn methods_required_for_fork( } } ForkName::Fulu => { - // TODO(fulu) switch to v5 when the EL is ready - if !capabilities.get_payload_v4 { - missing_methods.push(ENGINE_GET_PAYLOAD_V4); + if 
!capabilities.get_payload_v5 { + missing_methods.push(ENGINE_GET_PAYLOAD_V5); } if !capabilities.new_payload_v4 { missing_methods.push(ENGINE_NEW_PAYLOAD_V4); } } ForkName::Gloas => { - // TODO(gloas) switch to v5/v6 when the EL is ready - if !capabilities.get_payload_v4 { - missing_methods.push(ENGINE_GET_PAYLOAD_V4); + if !capabilities.get_payload_v5 { + missing_methods.push(ENGINE_GET_PAYLOAD_V5); } if !capabilities.new_payload_v4 { missing_methods.push(ENGINE_NEW_PAYLOAD_V4); diff --git a/beacon_node/execution_layer/src/lib.rs b/beacon_node/execution_layer/src/lib.rs index e67da468d2..1983db57eb 100644 --- a/beacon_node/execution_layer/src/lib.rs +++ b/beacon_node/execution_layer/src/lib.rs @@ -43,7 +43,7 @@ use tokio::{ time::sleep, }; use tokio_stream::wrappers::WatchStream; -use tracing::{debug, error, info, warn}; +use tracing::{Instrument, debug, debug_span, error, info, instrument, warn}; use tree_hash::TreeHash; use types::beacon_block_body::KzgCommitments; use types::builder_bid::BuilderBid; @@ -844,6 +844,7 @@ impl ExecutionLayer { } /// Returns the fee-recipient address that should be used to build a block + #[instrument(level = "debug", skip_all)] pub async fn get_suggested_fee_recipient(&self, proposer_index: u64) -> Address { if let Some(preparation_data_entry) = self.proposer_preparation_data().await.get(&proposer_index) @@ -868,6 +869,7 @@ impl ExecutionLayer { } } + #[instrument(level = "debug", skip_all)] pub async fn get_proposer_gas_limit(&self, proposer_index: u64) -> Option { self.proposer_preparation_data() .await @@ -884,6 +886,7 @@ impl ExecutionLayer { /// /// The result will be returned from the first node that returns successfully. No more nodes /// will be contacted. 
+ #[instrument(level = "debug", skip_all)] pub async fn get_payload( &self, payload_parameters: PayloadParameters<'_>, @@ -989,6 +992,7 @@ impl ExecutionLayer { timed_future(metrics::GET_BLINDED_PAYLOAD_BUILDER, async { builder .get_builder_header::(slot, parent_hash, pubkey) + .instrument(debug_span!("get_builder_header")) .await }), timed_future(metrics::GET_BLINDED_PAYLOAD_LOCAL, async { @@ -1230,6 +1234,7 @@ impl ExecutionLayer { .await } + #[instrument(level = "debug", skip_all)] async fn get_full_payload_with( &self, payload_parameters: PayloadParameters<'_>, @@ -1905,9 +1910,19 @@ impl ExecutionLayer { ) -> Result, Error> { debug!(?block_root, "Sending block to builder"); if spec.is_fulu_scheduled() { - self.post_builder_blinded_blocks_v2(block_root, block) + let resp = self + .post_builder_blinded_blocks_v2(block_root, block) .await - .map(|()| SubmitBlindedBlockResponse::V2) + .map(|()| SubmitBlindedBlockResponse::V2); + // Fallback to v1 if v2 fails because the relay doesn't support it. + // Note: we should remove the fallback post fulu when all relays have support for v2. 
+ if resp.is_err() { + self.post_builder_blinded_blocks_v1(block_root, block) + .await + .map(|full_payload| SubmitBlindedBlockResponse::V1(Box::new(full_payload))) + } else { + resp + } } else { self.post_builder_blinded_blocks_v1(block_root, block) .await @@ -2023,7 +2038,9 @@ impl ExecutionLayer { relay_response_ms = duration.as_millis(), ?block_root, "Successfully submitted blinded block to the builder" - ) + ); + + Ok(()) } Err(e) => { metrics::inc_counter_vec( @@ -2036,11 +2053,10 @@ impl ExecutionLayer { relay_response_ms = duration.as_millis(), ?block_root, "Failed to submit blinded block to the builder" - ) + ); + Err(e) } } - - Ok(()) } else { Err(Error::NoPayloadBuilder) } diff --git a/beacon_node/execution_layer/src/test_utils/mock_builder.rs b/beacon_node/execution_layer/src/test_utils/mock_builder.rs index 5121551545..64e492518e 100644 --- a/beacon_node/execution_layer/src/test_utils/mock_builder.rs +++ b/beacon_node/execution_layer/src/test_utils/mock_builder.rs @@ -3,8 +3,8 @@ use crate::{Config, ExecutionLayer, PayloadAttributes, PayloadParameters}; use bytes::Bytes; use eth2::types::PublishBlockRequest; use eth2::types::{ - BlobsBundle, BlockId, BroadcastValidation, EventKind, EventTopic, FullPayloadContents, - ProposerData, StateId, ValidatorId, + BlobsBundle, BlockId, BroadcastValidation, EndpointVersion, EventKind, EventTopic, + FullPayloadContents, ProposerData, StateId, ValidatorId, }; use eth2::{ BeaconNodeHttpClient, CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER, @@ -307,6 +307,10 @@ pub struct MockBuilder { payload_id_cache: Arc>>, /// If set to `true`, sets the bid returned by `get_header` to Uint256::MAX max_bid: bool, + /// Broadcast the full block with payload to the attached beacon node (simulating the relay). + /// + /// Turning this off is useful for testing. 
+ broadcast_to_bn: bool, /// A cache that stores the proposers index for a given epoch proposers_cache: Arc>>>, } @@ -315,6 +319,9 @@ impl MockBuilder { pub fn new_for_testing( mock_el_url: SensitiveUrl, beacon_url: SensitiveUrl, + validate_pubkey: bool, + apply_operations: bool, + broadcast_to_bn: bool, spec: Arc, executor: TaskExecutor, ) -> (Self, (SocketAddr, impl Future)) { @@ -332,12 +339,15 @@ impl MockBuilder { let el = ExecutionLayer::from_config(config, executor.clone()).unwrap(); + let max_bid = false; + let builder = MockBuilder::new( el, BeaconNodeHttpClient::new(beacon_url, Timeouts::set_all(Duration::from_secs(1))), - true, - true, - false, + validate_pubkey, + apply_operations, + broadcast_to_bn, + max_bid, spec, None, ); @@ -353,6 +363,7 @@ impl MockBuilder { beacon_client: BeaconNodeHttpClient, validate_pubkey: bool, apply_operations: bool, + broadcast_to_bn: bool, max_bid: bool, spec: Arc, sk: Option<&[u8]>, @@ -382,6 +393,7 @@ impl MockBuilder { proposers_cache: Arc::new(RwLock::new(HashMap::new())), apply_operations, max_bid, + broadcast_to_bn, genesis_time: None, } } @@ -462,14 +474,20 @@ impl MockBuilder { return Err("invalid fork".to_string()); } }; + let block_hash = block + .message() + .body() + .execution_payload() + .unwrap() + .block_hash(); info!( - block_hash = %root, + execution_payload_root = %root, + ?block_hash, "Submitting blinded beacon block to builder" ); - let payload = self - .el - .get_payload_by_root(&root) - .ok_or_else(|| "missing payload for tx root".to_string())?; + let payload = self.el.get_payload_by_root(&root).ok_or_else(|| { + format!("missing payload for root: {root:?}, block_hash: {block_hash:?}",) + })?; let (payload, blobs) = payload.deconstruct(); let full_block = block @@ -478,16 +496,28 @@ impl MockBuilder { debug!( txs_count = payload.transactions().len(), blob_count = blobs.as_ref().map(|b| b.commitments.len()), - "Got full payload, sending to local beacon node for propagation" + "Got full payload" ); - 
let publish_block_request = PublishBlockRequest::new( - Arc::new(full_block), - blobs.clone().map(|b| (b.proofs, b.blobs)), - ); - self.beacon_client - .post_beacon_blocks_v2(&publish_block_request, Some(BroadcastValidation::Gossip)) - .await - .map_err(|e| format!("Failed to post blinded block {:?}", e))?; + if self.broadcast_to_bn { + debug!( + block_hash = ?payload.block_hash(), + "Broadcasting builder block to BN" + ); + let publish_block_request = PublishBlockRequest::new( + Arc::new(full_block), + blobs.clone().map(|b| (b.proofs, b.blobs)), + ); + self.beacon_client + .post_beacon_blocks_v2( + &publish_block_request, + Some(BroadcastValidation::ConsensusAndEquivocation), + ) + .await + .map_err(|e| { + // XXX: this should really be a 400 but warp makes that annoyingly difficult + format!("Failed to post blinded block {e:?}") + })?; + } Ok(FullPayloadContents::new(payload, blobs)) } @@ -518,16 +548,29 @@ impl MockBuilder { info!("Got payload params"); let fork = self.fork_name_at_slot(slot); + let payload_response_type = self .el - .get_full_payload_caching(PayloadParameters { - parent_hash: payload_parameters.parent_hash, - parent_gas_limit: payload_parameters.parent_gas_limit, - proposer_gas_limit: payload_parameters.proposer_gas_limit, - payload_attributes: &payload_parameters.payload_attributes, - forkchoice_update_params: &payload_parameters.forkchoice_update_params, - current_fork: payload_parameters.current_fork, - }) + .get_full_payload_with( + PayloadParameters { + parent_hash: payload_parameters.parent_hash, + parent_gas_limit: payload_parameters.parent_gas_limit, + proposer_gas_limit: payload_parameters.proposer_gas_limit, + payload_attributes: &payload_parameters.payload_attributes, + forkchoice_update_params: &payload_parameters.forkchoice_update_params, + current_fork: payload_parameters.current_fork, + }, + // If apply_operations is set, do NOT cache the payload at this point, we are about + // to mutate it and it would be incorrect to cache the 
unmutated payload. + // + // This is a flaw in apply_operations generally, if you want the mock builder to + // actually return payloads then this option should be turned off. + if self.apply_operations { + |_, _| None + } else { + ExecutionLayer::cache_payload + }, + ) .await .map_err(|e| format!("couldn't get payload {:?}", e))?; @@ -926,11 +969,21 @@ pub fn serve( let inner_ctx = builder.clone(); let ctx_filter = warp::any().map(move || inner_ctx.clone()); - let prefix = warp::path("eth") + let prefix_v1 = warp::path("eth") .and(warp::path("v1")) .and(warp::path("builder")); - let validators = prefix + let prefix_either = warp::path("eth") + .and( + warp::path::param::().or_else(|_| async move { + Err(warp::reject::custom(Custom( + "Invalid EndpointVersion".to_string(), + ))) + }), + ) + .and(warp::path("builder")); + + let validators = prefix_v1 .and(warp::path("validators")) .and(warp::body::json()) .and(warp::path::end()) @@ -942,61 +995,89 @@ pub fn serve( .register_validators(registrations) .await .map_err(|e| warp::reject::custom(Custom(e)))?; - Ok::<_, Rejection>(warp::reply()) - }, - ) - .boxed(); - - let blinded_block_ssz = prefix - .and(warp::path("blinded_blocks")) - .and(warp::body::bytes()) - .and(warp::header::header::(CONSENSUS_VERSION_HEADER)) - .and(warp::path::end()) - .and(ctx_filter.clone()) - .and_then( - |block_bytes: Bytes, fork_name: ForkName, builder: MockBuilder| async move { - let block = - SignedBlindedBeaconBlock::::from_ssz_bytes_by_fork(&block_bytes, fork_name) - .map_err(|e| warp::reject::custom(Custom(format!("{:?}", e))))?; - let payload = builder - .submit_blinded_block(block) - .await - .map_err(|e| warp::reject::custom(Custom(e)))?; - - Ok::<_, warp::reject::Rejection>( - warp::http::Response::builder() - .status(200) - .body(payload.as_ssz_bytes()) - .map(add_ssz_content_type_header) - .map(|res| add_consensus_version_header(res, fork_name)) - .unwrap(), - ) + Ok::<_, Rejection>(warp::reply().into_response()) }, ); - let 
blinded_block = - prefix + let blinded_block_ssz = + prefix_either .and(warp::path("blinded_blocks")) - .and(warp::body::json()) + .and(warp::body::bytes()) .and(warp::header::header::(CONSENSUS_VERSION_HEADER)) .and(warp::path::end()) .and(ctx_filter.clone()) .and_then( - |block: SignedBlindedBeaconBlock, + |endpoint_version, + block_bytes: Bytes, fork_name: ForkName, builder: MockBuilder| async move { + if endpoint_version != EndpointVersion(1) + && endpoint_version != EndpointVersion(2) + { + return Err(warp::reject::custom(Custom(format!( + "Unsupported version: {endpoint_version}" + )))); + } + let block = SignedBlindedBeaconBlock::::from_ssz_bytes_by_fork( + &block_bytes, + fork_name, + ) + .map_err(|e| warp::reject::custom(Custom(format!("{:?}", e))))?; let payload = builder .submit_blinded_block(block) .await .map_err(|e| warp::reject::custom(Custom(e)))?; - let resp: ForkVersionedResponse<_> = ForkVersionedResponse { - version: fork_name, - metadata: Default::default(), - data: payload, - }; - let json_payload = serde_json::to_string(&resp) - .map_err(|_| reject("coudn't serialize response"))?; + if endpoint_version == EndpointVersion(1) { + Ok::<_, warp::reject::Rejection>( + warp::http::Response::builder() + .status(200) + .body(payload.as_ssz_bytes()) + .map(add_ssz_content_type_header) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap(), + ) + } else { + Ok(warp::http::Response::builder() + .status(202) + .body(&[] as &'static [u8]) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap()) + } + }, + ); + + let blinded_block = prefix_either + .and(warp::path("blinded_blocks")) + .and(warp::body::json()) + .and(warp::header::header::(CONSENSUS_VERSION_HEADER)) + .and(warp::path::end()) + .and(ctx_filter.clone()) + .and_then( + |endpoint_version, + block: SignedBlindedBeaconBlock, + fork_name: ForkName, + builder: MockBuilder| async move { + if endpoint_version != EndpointVersion(1) && endpoint_version != 
EndpointVersion(2) + { + return Err(warp::reject::custom(Custom(format!( + "Unsupported version: {endpoint_version}" + )))); + } + let payload = builder + .submit_blinded_block(block) + .await + .map_err(|e| warp::reject::custom(Custom(e)))?; + let resp: ForkVersionedResponse<_> = ForkVersionedResponse { + version: fork_name, + metadata: Default::default(), + data: payload, + }; + + let json_payload = serde_json::to_string(&resp) + .map_err(|_| reject("coudn't serialize response"))?; + + if endpoint_version == EndpointVersion(1) { Ok::<_, warp::reject::Rejection>( warp::http::Response::builder() .status(200) @@ -1004,16 +1085,24 @@ pub fn serve( serde_json::to_string(&json_payload) .map_err(|_| reject("invalid JSON"))?, ) + .map(|res| add_consensus_version_header(res, fork_name)) .unwrap(), ) - }, - ); + } else { + Ok(warp::http::Response::builder() + .status(202) + .body("".to_string()) + .map(|res| add_consensus_version_header(res, fork_name)) + .unwrap()) + } + }, + ); - let status = prefix + let status = prefix_v1 .and(warp::path("status")) - .then(|| async { warp::reply() }); + .then(|| async { warp::reply().into_response() }); - let header = prefix + let header = prefix_v1 .and(warp::path("header")) .and(warp::path::param::().or_else(|_| async { Err(reject("Invalid slot")) })) .and( diff --git a/beacon_node/http_api/Cargo.toml b/beacon_node/http_api/Cargo.toml index 2061df3762..7dd0d0223f 100644 --- a/beacon_node/http_api/Cargo.toml +++ b/beacon_node/http_api/Cargo.toml @@ -26,6 +26,7 @@ logging = { workspace = true } lru = { workspace = true } metrics = { workspace = true } network = { workspace = true } +network_utils = { workspace = true } operation_pool = { workspace = true } parking_lot = { workspace = true } proto_array = { workspace = true } diff --git a/beacon_node/http_api/src/block_id.rs b/beacon_node/http_api/src/block_id.rs index e527e466f6..778067c32b 100644 --- a/beacon_node/http_api/src/block_id.rs +++ b/beacon_node/http_api/src/block_id.rs @@ 
-2,15 +2,16 @@ use crate::version::inconsistent_fork_rejection; use crate::{ExecutionOptimistic, state_id::checkpoint_slot_and_execution_optimistic}; use beacon_chain::kzg_utils::reconstruct_blobs; use beacon_chain::{BeaconChain, BeaconChainError, BeaconChainTypes, WhenSlotSkipped}; -use eth2::types::BlobIndicesQuery; use eth2::types::BlockId as CoreBlockId; use eth2::types::DataColumnIndicesQuery; +use eth2::types::{BlobIndicesQuery, BlobWrapper, BlobsVersionedHashesQuery}; use std::fmt; use std::str::FromStr; use std::sync::Arc; use types::{ BlobSidecarList, DataColumnSidecarList, EthSpec, FixedBytesExtended, ForkName, Hash256, - SignedBeaconBlock, SignedBlindedBeaconBlock, Slot, + SignedBeaconBlock, SignedBlindedBeaconBlock, Slot, UnversionedResponse, + beacon_response::ExecutionOptimisticFinalizedMetadata, }; use warp::Rejection; @@ -352,6 +353,68 @@ impl BlockId { Ok((block, blob_sidecar_list, execution_optimistic, finalized)) } + #[allow(clippy::type_complexity)] + pub fn get_blobs_by_versioned_hashes( + &self, + query: BlobsVersionedHashesQuery, + chain: &BeaconChain, + ) -> Result< + UnversionedResponse>, ExecutionOptimisticFinalizedMetadata>, + warp::Rejection, + > { + let (root, execution_optimistic, finalized) = self.root(chain)?; + let block = BlockId::blinded_block_by_root(&root, chain)?.ok_or_else(|| { + warp_utils::reject::custom_not_found(format!("beacon block with root {}", root)) + })?; + + // Error if the block is pre-Deneb and lacks blobs. 
+ let blob_kzg_commitments = block.message().body().blob_kzg_commitments().map_err(|_| { + warp_utils::reject::custom_bad_request( + "block is pre-Deneb and has no blobs".to_string(), + ) + })?; + + let blob_indices_opt = query.versioned_hashes.map(|versioned_hashes| { + versioned_hashes + .iter() + .flat_map(|versioned_hash| { + blob_kzg_commitments.iter().position(|commitment| { + let computed_hash = commitment.calculate_versioned_hash(); + computed_hash == *versioned_hash + }) + }) + .map(|index| index as u64) + .collect::>() + }); + + let max_blobs_per_block = chain.spec.max_blobs_per_block(block.epoch()) as usize; + let blob_sidecar_list = if !blob_kzg_commitments.is_empty() { + if chain.spec.is_peer_das_enabled_for_epoch(block.epoch()) { + Self::get_blobs_from_data_columns(chain, root, blob_indices_opt, &block)? + } else { + Self::get_blobs(chain, root, blob_indices_opt, max_blobs_per_block)? + } + } else { + BlobSidecarList::new(vec![], max_blobs_per_block) + .map_err(|e| warp_utils::reject::custom_server_error(format!("{:?}", e)))? 
+ }; + + let blobs = blob_sidecar_list + .into_iter() + .map(|sidecar| BlobWrapper:: { + blob: sidecar.blob.clone(), + }) + .collect(); + + Ok(UnversionedResponse { + metadata: ExecutionOptimisticFinalizedMetadata { + execution_optimistic: Some(execution_optimistic), + finalized: Some(finalized), + }, + data: blobs, + }) + } + fn get_blobs( chain: &BeaconChain, root: Hash256, @@ -369,9 +432,9 @@ impl BlockId { let blob_sidecar_list_filtered = match indices { Some(vec) => { - let list: Vec<_> = blob_sidecar_list + let list: Vec<_> = vec .into_iter() - .filter(|blob_sidecar| vec.contains(&blob_sidecar.index)) + .flat_map(|index| blob_sidecar_list.get(index as usize).cloned()) .collect(); BlobSidecarList::new(list, max_blobs_per_block) diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index 515c262b19..7f6c97a0f8 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -54,10 +54,11 @@ use eth2::types::{ use eth2::{CONSENSUS_VERSION_HEADER, CONTENT_TYPE_HEADER, SSZ_CONTENT_TYPE_HEADER}; use health_metrics::observe::Observe; use lighthouse_network::rpc::methods::MetaData; -use lighthouse_network::{Enr, EnrExt, NetworkGlobals, PeerId, PubsubMessage, types::SyncState}; +use lighthouse_network::{Enr, NetworkGlobals, PeerId, PubsubMessage, types::SyncState}; use lighthouse_version::version_with_platform; use logging::{SSELoggingComponents, crit}; use network::{NetworkMessage, NetworkSenders, ValidatorSubscriptionMessage}; +use network_utils::enr_ext::EnrExt; use operation_pool::ReceivedPreCapella; use parking_lot::RwLock; pub use publish_blocks::{ @@ -213,6 +214,7 @@ pub fn prometheus_metrics() -> warp::filters::log::Log warp::filters::log::Log( .and(warp::query::()) .and(warp::path::end()) .and(warp_utils::json::json()) + .and(consensus_version_header_filter) .and(task_spawner_filter.clone()) .and(chain_filter.clone()) .and(network_tx_filter.clone()) .then( move |validation_level: 
api_types::BroadcastValidationQuery, - blinded_block: Arc>, + blinded_block_json: serde_json::Value, + consensus_version: ForkName, task_spawner: TaskSpawner, chain: Arc>, network_tx: UnboundedSender>| { task_spawner.spawn_async_with_rejection(Priority::P0, async move { + let blinded_block = + SignedBlindedBeaconBlock::::context_deserialize( + &blinded_block_json, + consensus_version, + ) + .map(Arc::new) + .map_err(|e| { + warp_utils::reject::custom_bad_request(format!("invalid JSON: {e:?}")) + })?; publish_blocks::publish_blinded_block( blinded_block, chain, @@ -1888,7 +1898,7 @@ pub fn serve( */ // GET beacon/blob_sidecars/{block_id} - let get_blobs = eth_v1 + let get_blob_sidecars = eth_v1 .and(warp::path("beacon")) .and(warp::path("blob_sidecars")) .and(block_id_or_err) @@ -1938,6 +1948,52 @@ pub fn serve( }, ); + // GET beacon/blobs/{block_id} + let get_blobs = eth_v1 + .and(warp::path("beacon")) + .and(warp::path("blobs")) + .and(block_id_or_err) + .and(warp::path::end()) + .and(multi_key_query::()) + .and(task_spawner_filter.clone()) + .and(chain_filter.clone()) + .and(warp::header::optional::("accept")) + .then( + |block_id: BlockId, + version_hashes_res: Result, + task_spawner: TaskSpawner, + chain: Arc>, + accept_header: Option| { + task_spawner.blocking_response_task(Priority::P1, move || { + let versioned_hashes = version_hashes_res?; + let response = + block_id.get_blobs_by_versioned_hashes(versioned_hashes, &chain)?; + + match accept_header { + Some(api_types::Accept::Ssz) => Response::builder() + .status(200) + .body(response.data.as_ssz_bytes().into()) + .map(|res: Response| add_ssz_content_type_header(res)) + .map_err(|e| { + warp_utils::reject::custom_server_error(format!( + "failed to create response: {}", + e + )) + }), + _ => { + let res = execution_optimistic_finalized_beacon_response( + ResponseIncludesVersion::No, + response.metadata.execution_optimistic.unwrap_or(false), + response.metadata.finalized.unwrap_or(false), + response.data, + 
)?; + Ok(warp::reply::json(&res).into_response()) + } + } + }) + }, + ); + /* * beacon/pool */ @@ -4785,6 +4841,7 @@ pub fn serve( .uor(get_beacon_block_attestations) .uor(get_beacon_blinded_block) .uor(get_beacon_block_root) + .uor(get_blob_sidecars) .uor(get_blobs) .uor(get_beacon_pool_attestations) .uor(get_beacon_pool_attester_slashings) diff --git a/beacon_node/http_api/src/produce_block.rs b/beacon_node/http_api/src/produce_block.rs index 932fb00179..367e09969b 100644 --- a/beacon_node/http_api/src/produce_block.rs +++ b/beacon_node/http_api/src/produce_block.rs @@ -10,8 +10,10 @@ use beacon_chain::{ BeaconBlockResponseWrapper, BeaconChain, BeaconChainTypes, ProduceBlockVerification, }; use eth2::types::{self as api_types, ProduceBlockV3Metadata, SkipRandaoVerification}; +use lighthouse_tracing::{SPAN_PRODUCE_BLOCK_V2, SPAN_PRODUCE_BLOCK_V3}; use ssz::Encode; use std::sync::Arc; +use tracing::instrument; use types::{payload::BlockProductionVersion, *}; use warp::{ Reply, @@ -40,6 +42,11 @@ pub fn get_randao_verification( Ok(randao_verification) } +#[instrument( + name = SPAN_PRODUCE_BLOCK_V3, + skip_all, + fields(%slot) +)] pub async fn produce_block_v3( accept_header: Option, chain: Arc>, @@ -155,6 +162,11 @@ pub async fn produce_blinded_block_v2( build_response_v2(chain, block_response_type, accept_header) } +#[instrument( + name = SPAN_PRODUCE_BLOCK_V2, + skip_all, + fields(%slot) +)] pub async fn produce_block_v2( accept_header: Option, chain: Arc>, diff --git a/beacon_node/http_api/src/proposer_duties.rs b/beacon_node/http_api/src/proposer_duties.rs index 3705c399bd..ceac60cbad 100644 --- a/beacon_node/http_api/src/proposer_duties.rs +++ b/beacon_node/http_api/src/proposer_duties.rs @@ -3,12 +3,13 @@ use crate::state_id::StateId; use beacon_chain::{ BeaconChain, BeaconChainError, BeaconChainTypes, - beacon_proposer_cache::{compute_proposer_duties_from_head, ensure_state_is_in_epoch}, + beacon_proposer_cache::{ + compute_proposer_duties_from_head, 
ensure_state_can_determine_proposers_for_epoch, + }, }; use eth2::types::{self as api_types}; use safe_arith::SafeArith; use slot_clock::SlotClock; -use std::cmp::Ordering; use tracing::debug; use types::{Epoch, EthSpec, Hash256, Slot}; @@ -105,36 +106,29 @@ fn try_proposer_duties_from_cache( let head_decision_root = head .snapshot .beacon_state - .proposer_shuffling_decision_root(head_block_root) + .proposer_shuffling_decision_root(head_block_root, &chain.spec) .map_err(warp_utils::reject::beacon_state_error)?; let execution_optimistic = chain .is_optimistic_or_invalid_head_block(head_block) .map_err(warp_utils::reject::unhandled_error)?; - let dependent_root = match head_epoch.cmp(&request_epoch) { - // head_epoch == request_epoch - Ordering::Equal => head_decision_root, - // head_epoch < request_epoch - Ordering::Less => head_block_root, - // head_epoch > request_epoch - Ordering::Greater => { - return Err(warp_utils::reject::custom_server_error(format!( - "head epoch {} is later than request epoch {}", - head_epoch, request_epoch - ))); - } - }; + // This code path can't handle requests for past epochs. + if head_epoch > request_epoch { + return Err(warp_utils::reject::custom_server_error(format!( + "head epoch {head_epoch} is later than request epoch {request_epoch}", + ))); + } chain .beacon_proposer_cache .lock() - .get_epoch::(dependent_root, request_epoch) + .get_epoch::(head_decision_root, request_epoch) .cloned() .map(|indices| { convert_to_api_response( chain, request_epoch, - dependent_root, + head_decision_root, execution_optimistic, indices.to_vec(), ) @@ -204,18 +198,19 @@ fn compute_historic_proposer_duties( } }; - let (state, execution_optimistic) = - if let Some((state_root, mut state, execution_optimistic)) = state_opt { - // If we've loaded the head state it might be from a previous epoch, ensure it's in a - // suitable epoch. 
- ensure_state_is_in_epoch(&mut state, state_root, epoch, &chain.spec) - .map_err(warp_utils::reject::unhandled_error)?; - (state, execution_optimistic) - } else { - let (state, execution_optimistic, _finalized) = - StateId::from_slot(epoch.start_slot(T::EthSpec::slots_per_epoch())).state(chain)?; - (state, execution_optimistic) - }; + let (state, execution_optimistic) = if let Some((state_root, mut state, execution_optimistic)) = + state_opt + { + // If we've loaded the head state it might be from a previous epoch, ensure it's in a + // suitable epoch. + ensure_state_can_determine_proposers_for_epoch(&mut state, state_root, epoch, &chain.spec) + .map_err(warp_utils::reject::unhandled_error)?; + (state, execution_optimistic) + } else { + let (state, execution_optimistic, _finalized) = + StateId::from_slot(epoch.start_slot(T::EthSpec::slots_per_epoch())).state(chain)?; + (state, execution_optimistic) + }; // Ensure the state lookup was correct. if state.current_epoch() != epoch { @@ -234,7 +229,7 @@ fn compute_historic_proposer_duties( // We can supply the genesis block root as the block root since we know that the only block that // decides its own root is the genesis block. 
let dependent_root = state - .proposer_shuffling_decision_root(chain.genesis_block_root) + .proposer_shuffling_decision_root(chain.genesis_block_root, &chain.spec) .map_err(BeaconChainError::from) .map_err(warp_utils::reject::unhandled_error)?; diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs index f797e3f300..05a4a4b7a4 100644 --- a/beacon_node/http_api/src/publish_blocks.rs +++ b/beacon_node/http_api/src/publish_blocks.rs @@ -3,7 +3,7 @@ use std::future::Future; use beacon_chain::blob_verification::{GossipBlobError, GossipVerifiedBlob}; use beacon_chain::block_verification_types::{AsBlock, RpcBlock}; -use beacon_chain::data_column_verification::{GossipDataColumnError, GossipVerifiedDataColumn}; +use beacon_chain::data_column_verification::GossipVerifiedDataColumn; use beacon_chain::validator_monitor::{get_block_delay_ms, timestamp_now}; use beacon_chain::{ AvailabilityProcessingStatus, BeaconChain, BeaconChainError, BeaconChainTypes, BlockError, @@ -216,7 +216,7 @@ pub async fn publish_block>( } } - if gossip_verified_columns.iter().map(Option::is_some).count() > 0 { + if !gossip_verified_columns.is_empty() { if let Some(data_column_publishing_delay) = data_column_publishing_delay_for_testing { // Subtract block publishing delay if it is also used. // Note: if `data_column_publishing_delay` is less than `block_publishing_delay`, it @@ -240,7 +240,6 @@ pub async fn publish_block>( let sampling_columns_indices = chain.sampling_columns_for_epoch(epoch); let sampling_columns = gossip_verified_columns .into_iter() - .flatten() .filter(|data_column| sampling_columns_indices.contains(&data_column.index())) .collect::>(); @@ -348,7 +347,7 @@ pub async fn publish_block>( type BuildDataSidecarTaskResult = Result< ( Vec>>, - Vec>>, + Vec>, ), Rejection, >; @@ -382,7 +381,7 @@ fn spawn_build_data_sidecar_task( } else { // Post PeerDAS: construct data columns. 
let gossip_verified_data_columns = - build_gossip_verified_data_columns(&chain, &block, blobs, kzg_proofs)?; + build_data_columns(&chain, &block, blobs, kzg_proofs)?; Ok((vec![], gossip_verified_data_columns)) } }, @@ -397,66 +396,33 @@ }) } -fn build_gossip_verified_data_columns( +/// Build data columns as wrapped `GossipVerifiedDataColumn`s. +/// There is no need to actually perform gossip verification on columns that a block producer +/// is publishing. In the locally constructed case, cell proof verification happens in the EL. +/// In the externally constructed case, there won't be any columns here. +fn build_data_columns( chain: &BeaconChain, block: &SignedBeaconBlock>, blobs: BlobsList, kzg_cell_proofs: KzgProofs, -) -> Result>>, Rejection> { +) -> Result>, Rejection> { let slot = block.slot(); let data_column_sidecars = build_blob_data_column_sidecars(chain, block, blobs, kzg_cell_proofs).map_err(|e| { error!( error = ?e, %slot, - "Invalid data column - not publishing block" + "Invalid data column - not publishing data columns" ); warp_utils::reject::custom_bad_request(format!("{e:?}")) })?; - let slot = block.slot(); let gossip_verified_data_columns = data_column_sidecars .into_iter() - .map(|data_column_sidecar| { - let column_index = data_column_sidecar.index; - let subnet = DataColumnSubnetId::from_column_index(column_index, &chain.spec); - let gossip_verified_column = - GossipVerifiedDataColumn::new(data_column_sidecar, subnet, chain); - - match gossip_verified_column { - Ok(blob) => Ok(Some(blob)), - Err(GossipDataColumnError::PriorKnown { proposer, .. }) => { - // Log the error but do not abort publication, we may need to publish the block - // or some of the other data columns if the block & data columns are only - // partially published by the other publisher.
- debug!( - column_index, - %slot, - proposer, - "Data column for publication already known" - ); - Ok(None) - } - Err(GossipDataColumnError::PriorKnownUnpublished) => { - debug!( - column_index, - %slot, - "Data column for publication already known via the EL" - ); - Ok(None) - } - Err(e) => { - error!( - column_index, - %slot, - error = ?e, - "Data column for publication is gossip-invalid" - ); - Err(warp_utils::reject::custom_bad_request(format!("{e:?}"))) - } - } + .filter_map(|data_column_sidecar| { + GossipVerifiedDataColumn::new_for_block_publishing(data_column_sidecar, chain).ok() }) - .collect::, Rejection>>()?; + .collect::>(); Ok(gossip_verified_data_columns) } @@ -533,13 +499,12 @@ fn publish_blob_sidecars( fn publish_column_sidecars( sender_clone: &UnboundedSender>, - data_column_sidecars: &[Option>], + data_column_sidecars: &[GossipVerifiedDataColumn], chain: &BeaconChain, ) -> Result<(), BlockError> { let malicious_withhold_count = chain.config.malicious_withhold_count; let mut data_column_sidecars = data_column_sidecars .iter() - .flatten() .map(|d| d.clone_data_column()) .collect::>(); if malicious_withhold_count > 0 { diff --git a/beacon_node/http_api/src/test_utils.rs b/beacon_node/http_api/src/test_utils.rs index 90f2fd2d95..fe9e0dff70 100644 --- a/beacon_node/http_api/src/test_utils.rs +++ b/beacon_node/http_api/src/test_utils.rs @@ -60,8 +60,15 @@ type Mutator = BoxedMutator, MemoryStore>; impl InteractiveTester { pub async fn new(spec: Option, validator_count: usize) -> Self { - Self::new_with_initializer_and_mutator(spec, validator_count, None, None, Config::default()) - .await + Self::new_with_initializer_and_mutator( + spec, + validator_count, + None, + None, + Config::default(), + true, + ) + .await } pub async fn new_with_initializer_and_mutator( @@ -70,6 +77,7 @@ impl InteractiveTester { initializer: Option>, mutator: Option>, config: Config, + use_mock_builder: bool, ) -> Self { let mut harness_builder = 
BeaconChainHarness::builder(E::default()) .spec_or_default(spec.map(Arc::new)) @@ -91,7 +99,7 @@ impl InteractiveTester { harness_builder = harness_builder.initial_mutator(mutator); } - let harness = harness_builder.build(); + let mut harness = harness_builder.build(); let ApiServer { ctx, @@ -103,22 +111,47 @@ impl InteractiveTester { tokio::spawn(server); - // Override the default timeout to 2s to timeouts on CI, as CI seems to require longer - // to process. The 1s timeouts for other tasks have been working for a long time, so we'll - // keep it as it is, as it may help identify a performance regression. + // Late-initialize the mock builder now that the mock execution node and beacon API ports + // have been allocated. + let beacon_api_ip = listening_socket.ip(); + let beacon_api_port = listening_socket.port(); + let beacon_url = + SensitiveUrl::parse(format!("http://{beacon_api_ip}:{beacon_api_port}").as_str()) + .unwrap(); + + // We disable apply_operations because it breaks the mock builder's ability to return + // payloads. + let apply_operations = false; + + // We disable strict registration checks too, because it makes HTTP tests less fiddly to + // write. + let strict_registrations = false; + + // Broadcast to the BN only if Fulu is scheduled. In the broadcast validation tests we want + // to infer things from the builder return code, and pre-Fulu it's simpler to let the BN + // handle broadcast and return detailed codes. Post-Fulu the builder doesn't return the + // block at all, so we *need* the builder to do the broadcast and return a 400 if the block + // is invalid.
+ let broadcast_to_bn = ctx.chain.as_ref().unwrap().spec.is_fulu_scheduled(); + + if use_mock_builder { + let mock_builder_server = harness.set_mock_builder( + beacon_url.clone(), + strict_registrations, + apply_operations, + broadcast_to_bn, + ); + + tokio::spawn(mock_builder_server); + } + + // Use 5s timeouts on CI, as there are several sources of artificial slowness, including + // mock-builder. let timeouts = Timeouts { - default: Duration::from_secs(2), - ..Timeouts::set_all(Duration::from_secs(1)) + default: Duration::from_secs(5), + ..Timeouts::set_all(Duration::from_secs(5)) }; - let client = BeaconNodeHttpClient::new( - SensitiveUrl::parse(&format!( - "http://{}:{}", - listening_socket.ip(), - listening_socket.port() - )) - .unwrap(), - timeouts, - ); + let client = BeaconNodeHttpClient::new(beacon_url.clone(), timeouts); Self { ctx, diff --git a/beacon_node/http_api/tests/broadcast_validation_tests.rs b/beacon_node/http_api/tests/broadcast_validation_tests.rs index c125ae035b..9427f6fdf3 100644 --- a/beacon_node/http_api/tests/broadcast_validation_tests.rs +++ b/beacon_node/http_api/tests/broadcast_validation_tests.rs @@ -1,9 +1,9 @@ use beacon_chain::test_utils::test_spec; use beacon_chain::{ - GossipVerifiedBlock, IntoGossipVerifiedBlock, + GossipVerifiedBlock, IntoGossipVerifiedBlock, WhenSlotSkipped, test_utils::{AttestationStrategy, BlockStrategy}, }; -use eth2::reqwest::StatusCode; +use eth2::reqwest::{Response, StatusCode}; use eth2::types::{BroadcastValidation, PublishBlockRequest}; use http_api::test_utils::InteractiveTester; use http_api::{Config, ProvenancedBlock, publish_blinded_block, publish_block, reconstruct_block}; @@ -74,7 +74,7 @@ pub async fn gossip_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -85,7 +85,18 @@ pub async fn gossip_invalid() { /* mandated by Beacon API spec */
assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + let pre_finalized_block_root = Hash256::zero(); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is valid from a gossip perspective is accepted when using `broadcast_validation=gossip`. @@ -123,15 +134,11 @@ pub async fn gossip_partial_pass() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; - assert!(response.is_err()); - - let error_response = response.unwrap_err(); - - assert_eq!(error_response.status(), Some(StatusCode::ACCEPTED)); + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); } // This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=gossip`. 
@@ -164,7 +171,7 @@ pub async fn gossip_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), blobs), @@ -215,7 +222,7 @@ pub async fn gossip_full_pass_ssz() { let (block_contents_tuple, _) = tester.harness.make_block(state_a, slot_b).await; let block_contents = block_contents_tuple.into(); - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_contents, validation_level) .await; @@ -264,7 +271,7 @@ pub async fn consensus_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -274,7 +281,19 @@ pub async fn consensus_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + + let pre_finalized_block_root = Hash256::zero(); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus`. 
@@ -304,13 +323,17 @@ pub async fn consensus_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::ZERO; let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -320,7 +343,14 @@ pub async fn consensus_gossip() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, \ + local: {correct_state_root:?} }}", + Hash256::ZERO + ), + ); } /// This test checks that a block that is valid from both a gossip and consensus perspective, but nonetheless equivocates, is accepted when using `broadcast_validation=consensus`. 
@@ -424,7 +454,7 @@ pub async fn consensus_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), blobs), @@ -478,7 +508,7 @@ pub async fn equivocation_invalid() { }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -488,7 +518,19 @@ pub async fn equivocation_invalid() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + + let pre_finalized_block_root = Hash256::zero(); + let expected_error_msg = if tester.harness.spec.is_fulu_scheduled() { + format!( + "BAD_REQUEST: NotFinalizedDescendant {{ block_parent_root: {pre_finalized_block_root:?} }}" + ) + } else { + // Since Deneb, the invalidity of the blobs will be detected prior to the invalidity of the + // block. + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}") + }; + + assert_server_message_error(error_response, expected_error_msg); } /// This test checks that a block that is valid from both a gossip and consensus perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. 
@@ -554,7 +596,7 @@ pub async fn equivocation_consensus_early_equivocation() { ); /* submit `block_b` which should induce equivocation */ - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block_b.clone(), blobs_b), @@ -597,14 +639,18 @@ pub async fn equivocation_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&PublishBlockRequest::new(block, blobs), validation_level) .await; @@ -614,7 +660,13 @@ pub async fn equivocation_gossip() { /* mandated by Beacon API spec */ assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, local: {correct_state_root} }}", + Hash256::zero() + ), + ); } /// This test checks that a block that is valid from both a gossip and consensus perspective but @@ -725,7 +777,7 @@ pub async fn equivocation_full_pass() { let state_a = tester.harness.get_current_state(); let ((block, blobs), _) = tester.harness.make_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( 
&PublishBlockRequest::new(block.clone(), blobs), @@ -770,28 +822,43 @@ pub async fn blinded_gossip_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); *b.parent_root_mut() = Hash256::zero(); }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); - + let pre_finalized_block_root = Hash256::zero(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } -/// This test checks that a block that is valid from a gossip perspective is accepted when using `broadcast_validation=gossip`. +/// Process a blinded block that is invalid, but valid on gossip. 
+/// +/// Due to the checks conducted by the "relay" (mock-builder) when `broadcast_to_bn` is set (post +/// Fulu), we can't always assert that we get a 202 status for this block -- post Fulu the relay +/// detects it as invalid and the BN returns an error. #[tokio::test(flavor = "multi_thread", worker_threads = 2)] pub async fn blinded_gossip_partial_pass() { /* this test targets gossip-level validation */ @@ -819,22 +886,27 @@ pub async fn blinded_gossip_partial_pass() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero() }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; - assert!(response.is_err()); - - let error_response = response.unwrap_err(); - - assert_eq!(error_response.status(), Some(StatusCode::ACCEPTED)); + if tester.harness.spec.is_fulu_scheduled() { + let error_response = response.unwrap_err(); + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); + } } // This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=gossip`. 
@@ -866,12 +938,13 @@ pub async fn blinded_gossip_full_pass() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::OK); assert!( tester .harness @@ -910,12 +983,13 @@ pub async fn blinded_gossip_full_pass_ssz() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2_ssz(&blinded_block, validation_level) .await; assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::OK); assert!( tester .harness @@ -933,7 +1007,7 @@ pub async fn blinded_consensus_invalid() { // Validator count needs to be at least 32 or proposer boost gets set to 0 when computing // `validator_count // 32`. let validator_count = 64; - let num_initial: u64 = 31; + let num_initial: u64 = 256; let tester = InteractiveTester::::new(None, validator_count).await; // Create some chain depth. 
@@ -952,25 +1026,48 @@ pub async fn blinded_consensus_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let finalized_slot = chain_state_before + .finalized_checkpoint() + .epoch + .start_slot(E::slots_per_epoch()); + assert_ne!(finalized_slot, 0); + let pre_finalized_block_root = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .chain + .block_root_at_slot(finalized_slot - 1, WhenSlotSkipped::Prev) + .unwrap() + .unwrap(); + + let (blinded_block, _) = tester + .harness + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); - *b.parent_root_mut() = Hash256::zero(); + *b.parent_root_mut() = pre_finalized_block_root; }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } /// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus`. 
@@ -1000,23 +1097,44 @@ pub async fn blinded_consensus_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); + let state_a = tester.harness.get_current_state(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_blinded_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; + assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, \ + local: {correct_state_root} }}", + Hash256::ZERO + ), + ); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective is accepted when using `broadcast_validation=consensus`. 
@@ -1049,7 +1167,7 @@ pub async fn blinded_consensus_full_pass() { let state_a = tester.harness.get_current_state(); let (blinded_block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; @@ -1073,7 +1191,7 @@ pub async fn blinded_equivocation_invalid() { // Validator count needs to be at least 32 or proposer boost gets set to 0 when computing // `validator_count // 32`. let validator_count = 64; - let num_initial: u64 = 31; + let num_initial: u64 = 256; let tester = InteractiveTester::::new(None, validator_count).await; // Create some chain depth. @@ -1092,25 +1210,47 @@ pub async fn blinded_equivocation_invalid() { tester.harness.advance_slot(); - let (block_contents_tuple, _) = tester + let finalized_slot = chain_state_before + .finalized_checkpoint() + .epoch + .start_slot(E::slots_per_epoch()); + assert_ne!(finalized_slot, 0); + let pre_finalized_block_root = tester .harness - .make_block_with_modifier(chain_state_before, slot, |b| { + .chain + .block_root_at_slot(finalized_slot - 1, WhenSlotSkipped::Prev) + .unwrap() + .unwrap(); + + let (blinded_block, _) = tester + .harness + .make_blinded_block_with_modifier(chain_state_before, slot, |b| { *b.state_root_mut() = Hash256::zero(); - *b.parent_root_mut() = Hash256::zero(); + *b.parent_root_mut() = pre_finalized_block_root; }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: 
NotFinalizedDescendant { block_parent_root: 0x0000000000000000000000000000000000000000000000000000000000000000 }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!("BAD_REQUEST: ParentUnknown {{ parent_root: {pre_finalized_block_root:?} }}"), + ); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. @@ -1160,13 +1300,11 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { assert_ne!(block_a.state_root(), block_b.state_root()); /* submit `block_a` as valid */ - assert!( - tester - .client - .post_beacon_blinded_blocks_v2(&block_a, validation_level) - .await - .is_ok() - ); + tester + .client + .post_beacon_blinded_blocks_v2(&block_a, validation_level) + .await + .unwrap(); assert!( tester .harness @@ -1175,7 +1313,7 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { ); /* submit `block_b` which should induce equivocation */ - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&block_b, validation_level) .await; @@ -1183,8 +1321,15 @@ pub async fn blinded_equivocation_consensus_early_equivocation() { let error_response: eth2::Error = response.err().unwrap(); - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - assert_server_message_error(error_response, "BAD_REQUEST: Slashable".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR) + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error(error_response, "BAD_REQUEST: Slashable".to_string()); + } } 
/// This test checks that a block that is only valid from a gossip perspective is rejected when using `broadcast_validation=consensus_and_equivocation`. @@ -1215,24 +1360,42 @@ pub async fn blinded_equivocation_gossip() { let slot_a = Slot::new(num_initial); let slot_b = slot_a + 1; + let mut correct_state_root = Hash256::zero(); let state_a = tester.harness.get_current_state(); - let (block_contents_tuple, _) = tester + let (blinded_block, _) = tester .harness - .make_block_with_modifier(state_a, slot_b, |b| *b.state_root_mut() = Hash256::zero()) + .make_blinded_block_with_modifier(state_a, slot_b, |b| { + *correct_state_root = *b.state_root(); + *b.state_root_mut() = Hash256::zero() + }) .await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client - .post_beacon_blinded_blocks_v2(&block_contents_tuple.0.clone_as_blinded(), validation_level) + .post_beacon_blinded_blocks_v2(&blinded_block, validation_level) .await; - assert!(response.is_err()); + assert!(response.is_err()); let error_response: eth2::Error = response.err().unwrap(); /* mandated by Beacon API spec */ - assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); - - assert_server_message_error(error_response, "BAD_REQUEST: Invalid block: StateRootMismatch { block: 0x0000000000000000000000000000000000000000000000000000000000000000, local: 0x253405be9aa159bce7b276b8e1d3849c743e673118dfafe8c7d07c203ae0d80d }".to_string()); + if tester.harness.spec.is_fulu_scheduled() { + // XXX: this should be a 400 but is a 500 due to the mock-builder being janky + assert_eq!( + error_response.status(), + Some(StatusCode::INTERNAL_SERVER_ERROR), + "{error_response:?}" + ); + } else { + assert_eq!(error_response.status(), Some(StatusCode::BAD_REQUEST)); + assert_server_message_error( + error_response, + format!( + "BAD_REQUEST: Invalid block: StateRootMismatch {{ block: {}, local: {correct_state_root} }}", + Hash256::zero() + ), + ); + } } /// This test checks that a block 
that is valid from both a gossip and @@ -1287,54 +1450,58 @@ pub async fn blinded_equivocation_consensus_late_equivocation() { ); assert_ne!(block_a.state_root(), block_b.state_root()); - let unblinded_block_a = reconstruct_block( - tester.harness.chain.clone(), - block_a.canonical_root(), - Arc::new(block_a), - ) - .await - .expect("failed to reconstruct block") - .expect("block expected"); + // From fulu builders never send back a full payload, hence further checks in this test + // are not possible + if !tester.harness.spec.is_fulu_scheduled() { + let unblinded_block_a = reconstruct_block( + tester.harness.chain.clone(), + block_a.canonical_root(), + Arc::new(block_a), + ) + .await + .expect("failed to reconstruct block") + .expect("block expected"); - let unblinded_block_b = reconstruct_block( - tester.harness.chain.clone(), - block_b.canonical_root(), - block_b.clone(), - ) - .await - .expect("failed to reconstruct block") - .expect("block expected"); + let unblinded_block_b = reconstruct_block( + tester.harness.chain.clone(), + block_b.canonical_root(), + block_b.clone(), + ) + .await + .expect("failed to reconstruct block") + .expect("block expected"); - let inner_block_a = match unblinded_block_a { - ProvenancedBlock::Local(a, _, _) => a, - ProvenancedBlock::Builder(a, _, _) => a, - }; - let inner_block_b = match unblinded_block_b { - ProvenancedBlock::Local(b, _, _) => b, - ProvenancedBlock::Builder(b, _, _) => b, - }; + let inner_block_a = match unblinded_block_a { + ProvenancedBlock::Local(a, _, _) => a, + ProvenancedBlock::Builder(a, _, _) => a, + }; + let inner_block_b = match unblinded_block_b { + ProvenancedBlock::Local(b, _, _) => b, + ProvenancedBlock::Builder(b, _, _) => b, + }; - let gossip_block_b = GossipVerifiedBlock::new(inner_block_b, &tester.harness.chain); - assert!(gossip_block_b.is_ok()); - let gossip_block_a = GossipVerifiedBlock::new(inner_block_a, &tester.harness.chain); - assert!(gossip_block_a.is_err()); + let gossip_block_b = 
GossipVerifiedBlock::new(inner_block_b, &tester.harness.chain); + assert!(gossip_block_b.is_ok()); + let gossip_block_a = GossipVerifiedBlock::new(inner_block_a, &tester.harness.chain); + assert!(gossip_block_a.is_err()); - let channel = tokio::sync::mpsc::unbounded_channel(); + let channel = tokio::sync::mpsc::unbounded_channel(); - let publication_result = publish_blinded_block( - block_b, - tester.harness.chain, - &channel.0, - validation_level, - StatusCode::ACCEPTED, - ) - .await; + let publication_result = publish_blinded_block( + block_b, + tester.harness.chain, + &channel.0, + validation_level, + StatusCode::ACCEPTED, + ) + .await; - assert!(publication_result.is_err()); + assert!(publication_result.is_err()); - let publication_error: Rejection = publication_result.unwrap_err(); + let publication_error: Rejection = publication_result.unwrap_err(); - assert!(publication_error.find::().is_some()); + assert!(publication_error.find::().is_some()); + } } /// This test checks that a block that is valid from both a gossip and consensus perspective (and does not equivocate) is accepted when using `broadcast_validation=consensus_and_equivocation`. @@ -1368,7 +1535,7 @@ pub async fn blinded_equivocation_full_pass() { let state_a = tester.harness.get_current_state(); let (block, _) = tester.harness.make_blinded_block(state_a, slot_b).await; - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blinded_blocks_v2(&block, validation_level) .await; @@ -1434,7 +1601,7 @@ pub async fn block_seen_on_gossip_without_blobs_or_columns() { ); // Post the block *and* blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some(blobs)), @@ -1522,7 +1689,7 @@ pub async fn block_seen_on_gossip_with_some_blobs_or_columns() { ); // Post the block *and* all blobs to the HTTP API. 
- let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some(blobs)), @@ -1597,7 +1764,7 @@ pub async fn blobs_or_columns_seen_on_gossip_without_block() { ); // Post the block *and* all blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block.clone(), Some((kzg_proofs, blobs))), @@ -1672,7 +1839,7 @@ async fn blobs_or_columns_seen_on_gossip_without_block_and_no_http_blobs_or_colu ); // Post just the block to the HTTP API (blob lists are empty). - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new( @@ -1750,7 +1917,7 @@ async fn slashable_blobs_or_columns_seen_on_gossip_cause_failure() { ); // Post block A *and* all its blobs to the HTTP API. - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz( &PublishBlockRequest::new(block_a.clone(), Some((kzg_proofs_a, blobs_a))), @@ -1788,6 +1955,7 @@ pub async fn duplicate_block_status_code() { duplicate_block_status_code, ..Config::default() }, + true, ) .await; @@ -1812,7 +1980,7 @@ pub async fn duplicate_block_status_code() { // Post the block blobs to the HTTP API once. let block_request = PublishBlockRequest::new(block.clone(), Some((kzg_proofs, blobs))); - let response: Result<(), eth2::Error> = tester + let response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_request, validation_level) .await; @@ -1827,7 +1995,7 @@ pub async fn duplicate_block_status_code() { ); // Post again. 
- let duplicate_response: Result<(), eth2::Error> = tester + let duplicate_response: Result = tester .client .post_beacon_blocks_v2_ssz(&block_request, validation_level) .await; diff --git a/beacon_node/http_api/tests/fork_tests.rs b/beacon_node/http_api/tests/fork_tests.rs index 880e206777..62a3461276 100644 --- a/beacon_node/http_api/tests/fork_tests.rs +++ b/beacon_node/http_api/tests/fork_tests.rs @@ -425,6 +425,7 @@ async fn bls_to_execution_changes_update_all_around_capella_fork() { })), None, Default::default(), + true, ) .await; let harness = &tester.harness; diff --git a/beacon_node/http_api/tests/interactive_tests.rs b/beacon_node/http_api/tests/interactive_tests.rs index 1e55bfb7b3..1398d8c72f 100644 --- a/beacon_node/http_api/tests/interactive_tests.rs +++ b/beacon_node/http_api/tests/interactive_tests.rs @@ -73,6 +73,7 @@ async fn state_by_root_pruned_from_fork_choice() { })), None, Default::default(), + false, ) .await; @@ -429,6 +430,7 @@ pub async fn proposer_boost_re_org_test( ) })), Default::default(), + false, ) .await; let harness = &tester.harness; @@ -666,6 +668,7 @@ pub async fn proposer_boost_re_org_test( // Check the fork choice updates that were sent. 
let forkchoice_updates = forkchoice_updates.lock(); + let block_a_exec_hash = block_a .0 .message() diff --git a/beacon_node/http_api/tests/tests.rs b/beacon_node/http_api/tests/tests.rs index 92abbd84c7..9c18a7c1e8 100644 --- a/beacon_node/http_api/tests/tests.rs +++ b/beacon_node/http_api/tests/tests.rs @@ -1,14 +1,16 @@ use beacon_chain::test_utils::RelativeSyncCommittee; use beacon_chain::{ BeaconChain, ChainConfig, StateSkipConfig, WhenSlotSkipped, - test_utils::{AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType}, + test_utils::{ + AttestationStrategy, BeaconChainHarness, BlockStrategy, EphemeralHarnessType, test_spec, + }, }; use eth2::{ BeaconNodeHttpClient, Error, Error::ServerMessage, StatusCode, Timeouts, mixin::{RequestAccept, ResponseForkName, ResponseOptional}, - reqwest::RequestBuilder, + reqwest::{RequestBuilder, Response}, types::{ BlockId as CoreBlockId, ForkChoiceNode, ProduceBlockV3Response, StateId as CoreStateId, *, }, @@ -24,8 +26,9 @@ use http_api::{ BlockId, StateId, test_utils::{ApiServer, create_api_server}, }; -use lighthouse_network::{Enr, EnrExt, PeerId, types::SyncState}; +use lighthouse_network::{Enr, PeerId, types::SyncState}; use network::NetworkReceivers; +use network_utils::enr_ext::EnrExt; use operation_pool::attestation_storage::CheckpointKey; use proto_array::ExecutionStatus; use sensitive_url::SensitiveUrl; @@ -87,6 +90,7 @@ struct ApiTester { struct ApiTesterConfig { spec: ChainSpec, retain_historic_states: bool, + import_all_data_columns: bool, } impl Default for ApiTesterConfig { @@ -96,6 +100,7 @@ impl Default for ApiTesterConfig { Self { spec, retain_historic_states: false, + import_all_data_columns: false, } } } @@ -113,15 +118,11 @@ impl ApiTester { Self::new_from_config(ApiTesterConfig::default()).await } - pub async fn new_with_hard_forks(altair: bool, bellatrix: bool) -> Self { - let mut config = ApiTesterConfig::default(); - // Set whether the chain has undergone each hard fork. 
- if altair { - config.spec.altair_fork_epoch = Some(Epoch::new(0)); - } - if bellatrix { - config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); - } + pub async fn new_with_hard_forks() -> Self { + let config = ApiTesterConfig { + spec: test_spec::(), + ..Default::default() + }; Self::new_from_config(config).await } @@ -138,6 +139,7 @@ impl ApiTester { .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() + .import_all_data_columns(config.import_all_data_columns) .build(); harness @@ -291,7 +293,19 @@ impl ApiTester { let beacon_api_port = listening_socket.port(); let beacon_url = SensitiveUrl::parse(format!("http://127.0.0.1:{beacon_api_port}").as_str()).unwrap(); - let mock_builder_server = harness.set_mock_builder(beacon_url.clone()); + + // Be strict with validator registrations, but don't bother applying operations, that flag + // is only used by mock-builder tests. + let strict_registrations = true; + let apply_operations = true; + let broadcast_to_bn = true; + + let mock_builder_server = harness.set_mock_builder( + beacon_url.clone(), + strict_registrations, + apply_operations, + broadcast_to_bn, + ); // Start the mock builder service prior to building the chain out. 
harness @@ -334,6 +348,7 @@ impl ApiTester { .deterministic_keypairs(VALIDATOR_COUNT) .deterministic_withdrawal_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() + .mock_execution_layer() .build(), ); @@ -419,7 +434,7 @@ impl ApiTester { } pub async fn new_mev_tester() -> Self { - let tester = Self::new_with_hard_forks(true, true) + let tester = Self::new_with_hard_forks() .await .test_post_validator_register_validator() .await; @@ -429,10 +444,7 @@ impl ApiTester { } pub async fn new_mev_tester_default_payload_value() -> Self { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); let tester = Self::new_from_config(config) @@ -1539,7 +1551,10 @@ impl ApiTester { pub async fn test_post_beacon_blocks_valid(mut self) -> Self { let next_block = self.next_block.clone(); - self.client.post_beacon_blocks(&next_block).await.unwrap(); + self.client + .post_beacon_blocks_v2(&next_block, None) + .await + .unwrap(); assert!( self.network_rx.network_recv.recv().await.is_some(), @@ -1553,7 +1568,7 @@ impl ApiTester { let next_block = &self.next_block; self.client - .post_beacon_blocks_ssz(next_block) + .post_beacon_blocks_v2_ssz(next_block, None) .await .unwrap(); @@ -1578,12 +1593,14 @@ impl ApiTester { .await .0; - assert!( - self.client - .post_beacon_blocks(&PublishBlockRequest::from(block)) - .await - .is_err() - ); + let response: Result = self + .client + .post_beacon_blocks_v2(&PublishBlockRequest::from(block), None) + .await; + + assert!(response.is_ok()); + + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); assert!( self.network_rx.network_recv.recv().await.is_some(), @@ -1606,13 +1623,13 @@ impl ApiTester { .await .0; - assert!( - self.client - .post_beacon_blocks_ssz(&PublishBlockRequest::from(block)) - .await - .is_err() - ); + let response: Result = 
self + .client + .post_beacon_blocks_v2(&PublishBlockRequest::from(block), None) + .await; + assert!(response.is_ok()); + assert_eq!(response.unwrap().status(), StatusCode::ACCEPTED); assert!( self.network_rx.network_recv.recv().await.is_some(), "gossip valid blocks should be sent to network" @@ -1634,7 +1651,7 @@ impl ApiTester { assert!( self.client - .post_beacon_blocks(&block_contents) + .post_beacon_blocks_v2(&block_contents, None) .await .is_ok() ); @@ -1644,45 +1661,25 @@ impl ApiTester { // Test all the POST methods in sequence, they should all behave the same. let responses = vec![ - self.client - .post_beacon_blocks(&block_contents) - .await - .unwrap_err(), self.client .post_beacon_blocks_v2(&block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blocks_ssz(&block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blocks_v2_ssz(&block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blinded_blocks(&blinded_block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blinded_blocks_v2(&blinded_block_contents, None) .await - .unwrap_err(), - self.client - .post_beacon_blinded_blocks_ssz(&blinded_block_contents) - .await - .unwrap_err(), + .unwrap(), self.client .post_beacon_blinded_blocks_v2_ssz(&blinded_block_contents, None) .await - .unwrap_err(), + .unwrap(), ]; for (i, response) in responses.into_iter().enumerate() { - assert_eq!( - response.status().unwrap(), - StatusCode::ACCEPTED, - "response {i}" - ); + assert_eq!(response.status(), StatusCode::ACCEPTED, "response {i}"); } self @@ -1861,7 +1858,7 @@ impl ApiTester { }; let result = match self .client - .get_blobs::( + .get_blob_sidecars::( CoreBlockId::Root(block_root), blob_indices.as_deref(), &self.chain.spec, @@ -1882,6 +1879,77 @@ impl ApiTester { self } + pub async fn test_get_blobs(self, versioned_hashes: bool) -> Self { + let block_id = BlockId(CoreBlockId::Finalized); + let (block_root, _, _) = 
block_id.root(&self.chain).unwrap(); + let (block, _, _) = block_id.full_block(&self.chain).await.unwrap(); + let num_blobs = block.num_expected_blobs(); + + let versioned_hashes: Option> = if versioned_hashes { + Some( + block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .iter() + .map(|commitment| commitment.calculate_versioned_hash()) + .collect(), + ) + } else { + None + }; + + let result = match self + .client + .get_blobs::(CoreBlockId::Root(block_root), versioned_hashes.as_deref()) + .await + { + Ok(response) => response.unwrap().into_data(), + Err(e) => panic!("query failed incorrectly: {e:?}"), + }; + + assert_eq!( + result.len(), + versioned_hashes.map_or(num_blobs, |versioned_hashes| versioned_hashes.len()) + ); + + self + } + + pub async fn test_get_blobs_post_fulu_full_node(self, versioned_hashes: bool) -> Self { + let block_id = BlockId(CoreBlockId::Finalized); + let (block_root, _, _) = block_id.root(&self.chain).unwrap(); + let (block, _, _) = block_id.full_block(&self.chain).await.unwrap(); + + let versioned_hashes: Option> = if versioned_hashes { + Some( + block + .message() + .body() + .blob_kzg_commitments() + .unwrap() + .iter() + .map(|commitment| commitment.calculate_versioned_hash()) + .collect(), + ) + } else { + None + }; + + match self + .client + .get_blobs::(CoreBlockId::Root(block_root), versioned_hashes.as_deref()) + .await + { + Ok(result) => panic!("Full node are unable to return blobs post-Fulu: {result:?}"), + // Post-Fulu, full nodes don't store blobs and return error 500 + Err(e) => assert_eq!(e.status().unwrap(), 500), + }; + + self + } + /// Test fetching of blob sidecars that are not available in the database due to pruning. /// /// If `zero_blobs` is false, test a block with >0 blobs, which should be unavailable. 
@@ -1921,7 +1989,7 @@ impl ApiTester { match self .client - .get_blobs::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) + .get_blob_sidecars::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) .await { Ok(result) => { @@ -1959,7 +2027,7 @@ impl ApiTester { match self .client - .get_blobs::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) + .get_blob_sidecars::(CoreBlockId::Slot(test_slot), None, &self.chain.spec) .await { Ok(result) => panic!("queries for pre-Deneb slots should fail. got: {result:?}"), @@ -3405,7 +3473,7 @@ impl ApiTester { PublishBlockRequest::try_from(Arc::new(signed_block.clone())).unwrap(); self.client - .post_beacon_blocks(&signed_block_contents) + .post_beacon_blocks_v2(&signed_block_contents, None) .await .unwrap(); @@ -3470,7 +3538,7 @@ impl ApiTester { block_contents.sign(&sk, &fork, genesis_validators_root, &self.chain.spec); self.client - .post_beacon_blocks_ssz(&signed_block_contents) + .post_beacon_blocks_v2_ssz(&signed_block_contents, None) .await .unwrap(); @@ -3588,7 +3656,7 @@ impl ApiTester { block_contents.sign(&sk, &fork, genesis_validators_root, &self.chain.spec); self.client - .post_beacon_blocks_ssz(&signed_block_contents) + .post_beacon_blocks_v2_ssz(&signed_block_contents, None) .await .unwrap(); @@ -6394,7 +6462,7 @@ impl ApiTester { }); self.client - .post_beacon_blocks(&self.next_block) + .post_beacon_blocks_v2(&self.next_block, None) .await .unwrap(); @@ -6439,7 +6507,7 @@ impl ApiTester { self.harness.advance_slot(); self.client - .post_beacon_blocks(&self.reorg_block) + .post_beacon_blocks_v2(&self.reorg_block, None) .await .unwrap(); @@ -6661,7 +6729,7 @@ impl ApiTester { }); self.client - .post_beacon_blocks(&self.next_block) + .post_beacon_blocks_v2(&self.next_block, None) .await .unwrap(); @@ -7707,10 +7775,7 @@ async fn builder_payload_chosen_by_profit_v3() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn builder_works_post_capella() { - let mut config = ApiTesterConfig { - 
retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7727,10 +7792,7 @@ async fn builder_works_post_capella() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn builder_works_post_deneb() { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7748,10 +7810,7 @@ async fn builder_works_post_deneb() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn get_blob_sidecars() { - let mut config = ApiTesterConfig { - retain_historic_states: false, - spec: E::default_spec(), - }; + let mut config = ApiTesterConfig::default(); config.spec.altair_fork_epoch = Some(Epoch::new(0)); config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); config.spec.capella_fork_epoch = Some(Epoch::new(0)); @@ -7764,6 +7823,53 @@ async fn get_blob_sidecars() { .test_get_blob_sidecars(false) .await .test_get_blob_sidecars(true) + .await + .test_get_blobs(false) + .await + .test_get_blobs(true) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_blobs_post_fulu_supernode() { + let mut config = ApiTesterConfig { + retain_historic_states: false, + spec: E::default_spec(), + // For supernode, we import all data columns + import_all_data_columns: true, + }; + config.spec.altair_fork_epoch = Some(Epoch::new(0)); + config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); + config.spec.capella_fork_epoch = Some(Epoch::new(0)); + config.spec.deneb_fork_epoch = Some(Epoch::new(0)); + config.spec.electra_fork_epoch = Some(Epoch::new(0)); + 
config.spec.fulu_fork_epoch = Some(Epoch::new(0)); + + ApiTester::new_from_config(config) + .await + // We can call the same get_blobs function in this test + // because the function will call get_blobs_by_versioned_hashes which handles peerDAS post-Fulu + .test_get_blobs(false) + .await + .test_get_blobs(true) + .await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn get_blobs_post_fulu_full_node() { + let mut config = ApiTesterConfig::default(); + config.spec.altair_fork_epoch = Some(Epoch::new(0)); + config.spec.bellatrix_fork_epoch = Some(Epoch::new(0)); + config.spec.capella_fork_epoch = Some(Epoch::new(0)); + config.spec.deneb_fork_epoch = Some(Epoch::new(0)); + config.spec.electra_fork_epoch = Some(Epoch::new(0)); + config.spec.fulu_fork_epoch = Some(Epoch::new(0)); + + ApiTester::new_from_config(config) + .await + .test_get_blobs_post_fulu_full_node(false) + .await + .test_get_blobs_post_fulu_full_node(true) .await; } @@ -7829,7 +7935,7 @@ async fn lighthouse_endpoints() { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn optimistic_responses() { - ApiTester::new_with_hard_forks(true, true) + ApiTester::new_with_hard_forks() .await .test_check_optimistic_responses() .await; diff --git a/beacon_node/http_metrics/Cargo.toml b/beacon_node/http_metrics/Cargo.toml index e12053ac43..b74c04a4cb 100644 --- a/beacon_node/http_metrics/Cargo.toml +++ b/beacon_node/http_metrics/Cargo.toml @@ -13,6 +13,7 @@ lighthouse_version = { workspace = true } logging = { workspace = true } malloc_utils = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } serde = { workspace = true } slot_clock = { workspace = true } store = { workspace = true } diff --git a/beacon_node/http_metrics/src/metrics.rs b/beacon_node/http_metrics/src/metrics.rs index dbb0707a90..c19fa8fd3b 100644 --- a/beacon_node/http_metrics/src/metrics.rs +++ b/beacon_node/http_metrics/src/metrics.rs @@ -37,7 +37,7 @@ pub fn 
gather_prometheus_metrics( store::scrape_for_metrics(db_path, freezer_db_path); } - lighthouse_network::scrape_discovery_metrics(); + network_utils::discovery_metrics::scrape_discovery_metrics(); health_metrics::metrics::scrape_health_metrics(); diff --git a/beacon_node/lighthouse_network/Cargo.toml b/beacon_node/lighthouse_network/Cargo.toml index 0b2ca9e818..7e69f6770b 100644 --- a/beacon_node/lighthouse_network/Cargo.toml +++ b/beacon_node/lighthouse_network/Cargo.toml @@ -31,6 +31,7 @@ logging = { workspace = true } lru = { workspace = true } lru_cache = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } parking_lot = { workspace = true } prometheus-client = "0.23.0" rand = { workspace = true } @@ -43,15 +44,12 @@ ssz_types = { workspace = true } strum = { workspace = true } superstruct = { workspace = true } task_executor = { workspace = true } -tiny-keccak = "2" tokio = { workspace = true } -tokio-io-timeout = "1" tokio-util = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } types = { workspace = true } unsigned-varint = { version = "0.8", features = ["codec"] } -unused_port = { workspace = true } [dependencies.libp2p] version = "0.56" diff --git a/beacon_node/lighthouse_network/src/config.rs b/beacon_node/lighthouse_network/src/config.rs index 23d545798f..89c6c58d4f 100644 --- a/beacon_node/lighthouse_network/src/config.rs +++ b/beacon_node/lighthouse_network/src/config.rs @@ -1,4 +1,3 @@ -use crate::listen_addr::{ListenAddr, ListenAddress}; use crate::peer_manager::config::DEFAULT_TARGET_PEERS; use crate::rpc::config::{InboundRateLimiterConfig, OutboundRateLimiterConfig}; use crate::types::GossipKind; @@ -8,6 +7,7 @@ use directory::{ }; use libp2p::Multiaddr; use local_ip_address::local_ipv6; +use network_utils::listen_addr::{ListenAddr, ListenAddress}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::net::{Ipv4Addr, Ipv6Addr}; diff --git 
a/beacon_node/lighthouse_network/src/discovery/enr.rs b/beacon_node/lighthouse_network/src/discovery/enr.rs index bb3a32daf2..bb9ff299c5 100644 --- a/beacon_node/lighthouse_network/src/discovery/enr.rs +++ b/beacon_node/lighthouse_network/src/discovery/enr.rs @@ -3,13 +3,13 @@ pub use discv5::enr::CombinedKey; use super::ENR_FILENAME; -use super::enr_ext::CombinedKeyExt; -use super::enr_ext::{EnrExt, QUIC_ENR_KEY, QUIC6_ENR_KEY}; use crate::NetworkConfig; use crate::types::{Enr, EnrAttestationBitfield, EnrSyncCommitteeBitfield}; use alloy_rlp::bytes::Bytes; use libp2p::identity::Keypair; use lighthouse_version::{client_name, version}; +use network_utils::enr_ext::CombinedKeyExt; +use network_utils::enr_ext::{EnrExt, QUIC_ENR_KEY, QUIC6_ENR_KEY}; use ssz::{Decode, Encode}; use ssz_types::BitVector; use std::fs::File; diff --git a/beacon_node/lighthouse_network/src/discovery/mod.rs b/beacon_node/lighthouse_network/src/discovery/mod.rs index 2d47153809..49de62546d 100644 --- a/beacon_node/lighthouse_network/src/discovery/mod.rs +++ b/beacon_node/lighthouse_network/src/discovery/mod.rs @@ -4,7 +4,6 @@ //! queries and manages access to the discovery routing table. 
pub(crate) mod enr; -pub mod enr_ext; // Allow external use of the lighthouse ENR builder use crate::service::TARGET_SUBNET_PEERS; @@ -12,8 +11,8 @@ use crate::{ClearDialError, metrics}; use crate::{Enr, NetworkConfig, NetworkGlobals, Subnet, SubnetDiscovery}; use discv5::{Discv5, enr::NodeId}; pub use enr::{CombinedKey, Eth2Enr, build_enr, load_enr_from_disk, use_or_load_enr}; -pub use enr_ext::{CombinedKeyExt, EnrExt, peer_id_to_node_id}; pub use libp2p::identity::{Keypair, PublicKey}; +use network_utils::enr_ext::{CombinedKeyExt, EnrExt, peer_id_to_node_id}; use alloy_rlp::bytes::Bytes; use enr::{ATTESTATION_BITFIELD_ENR_KEY, ETH2_ENR_KEY, SYNC_COMMITTEE_BITFIELD_ENR_KEY}; @@ -33,6 +32,7 @@ pub use libp2p::{ }; use logging::crit; use lru::LruCache; +use network_utils::discovery_metrics; use ssz::Encode; use std::num::NonZeroUsize; use std::{ @@ -687,7 +687,10 @@ impl Discovery { min_ttl, retries, }); - metrics::set_gauge(&metrics::DISCOVERY_QUEUE, self.queued_queries.len() as i64); + metrics::set_gauge( + &discovery_metrics::DISCOVERY_QUEUE, + self.queued_queries.len() as i64, + ); } } @@ -722,7 +725,10 @@ impl Discovery { } } // Update the queue metric - metrics::set_gauge(&metrics::DISCOVERY_QUEUE, self.queued_queries.len() as i64); + metrics::set_gauge( + &discovery_metrics::DISCOVERY_QUEUE, + self.queued_queries.len() as i64, + ); processed } @@ -1223,7 +1229,7 @@ impl Discovery { #[cfg(test)] mod tests { use super::*; - use crate::rpc::methods::{MetaData, MetaDataV2}; + use crate::rpc::methods::{MetaData, MetaDataV3}; use libp2p::identity::secp256k1; use types::{BitVector, MinimalEthSpec, SubnetId}; @@ -1233,7 +1239,7 @@ mod tests { let spec = Arc::new(ChainSpec::default()); let keypair = secp256k1::Keypair::generate(); let mut config = NetworkConfig::default(); - config.set_listening_addr(crate::ListenAddress::unused_v4_ports()); + config.set_listening_addr(network_utils::listen_addr::ListenAddress::unused_v4_ports()); let config = Arc::new(config); let 
enr_key: CombinedKey = CombinedKey::from_secp256k1(&keypair); let next_fork_digest = [0; 4]; @@ -1248,10 +1254,11 @@ mod tests { .unwrap(); let globals = NetworkGlobals::new( enr, - MetaData::V2(MetaDataV2 { + MetaData::V3(MetaDataV3 { seq_number: 0, attnets: Default::default(), syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }), vec![], false, diff --git a/beacon_node/lighthouse_network/src/lib.rs b/beacon_node/lighthouse_network/src/lib.rs index 5c4a458650..b6be9b5222 100644 --- a/beacon_node/lighthouse_network/src/lib.rs +++ b/beacon_node/lighthouse_network/src/lib.rs @@ -6,14 +6,12 @@ mod config; pub mod service; pub mod discovery; -pub mod listen_addr; pub mod metrics; pub mod peer_manager; pub mod rpc; pub mod types; use libp2p::swarm::DialError; -pub use listen_addr::*; use serde::{Deserialize, Deserializer, Serialize, Serializer, de}; use std::str::FromStr; @@ -107,13 +105,12 @@ pub use crate::types::{ pub use prometheus_client; pub use config::Config as NetworkConfig; -pub use discovery::{CombinedKeyExt, EnrExt, Eth2Enr}; +pub use discovery::Eth2Enr; pub use discv5; pub use gossipsub::{IdentTopic, MessageAcceptance, MessageId, Topic, TopicHash}; pub use libp2p; pub use libp2p::{Multiaddr, multiaddr}; pub use libp2p::{PeerId, Swarm, core::ConnectedPoint}; -pub use metrics::scrape_discovery_metrics; pub use peer_manager::{ ConnectionDirection, PeerConnectionStatus, PeerInfo, PeerManager, SyncInfo, SyncStatus, peerdb::PeerDB, diff --git a/beacon_node/lighthouse_network/src/metrics.rs b/beacon_node/lighthouse_network/src/metrics.rs index da986f2884..623d43a727 100644 --- a/beacon_node/lighthouse_network/src/metrics.rs +++ b/beacon_node/lighthouse_network/src/metrics.rs @@ -1,14 +1,6 @@ pub use metrics::*; use std::sync::LazyLock; -pub static NAT_OPEN: LazyLock> = LazyLock::new(|| { - try_create_int_gauge_vec( - "nat_open", - "An estimate indicating if the local node is reachable from external nodes", - &["protocol"], - ) -}); - 
pub static ADDRESS_UPDATE_COUNT: LazyLock> = LazyLock::new(|| { try_create_int_counter( "libp2p_address_update_total", @@ -53,31 +45,6 @@ pub static PEER_DISCONNECT_EVENT_COUNT: LazyLock> = LazyLock: "Count of libp2p peer disconnect events", ) }); -pub static DISCOVERY_BYTES: LazyLock> = LazyLock::new(|| { - try_create_int_gauge_vec( - "discovery_bytes", - "The number of bytes sent and received in discovery", - &["direction"], - ) -}); -pub static DISCOVERY_QUEUE: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "discovery_queue_size", - "The number of discovery queries awaiting execution", - ) -}); -pub static DISCOVERY_REQS: LazyLock> = LazyLock::new(|| { - try_create_float_gauge( - "discovery_requests", - "The number of unsolicited discovery requests per second", - ) -}); -pub static DISCOVERY_SESSIONS: LazyLock> = LazyLock::new(|| { - try_create_int_gauge( - "discovery_sessions", - "The number of active discovery sessions with peers", - ) -}); pub static DISCOVERY_NO_USEFUL_ENRS: LazyLock> = LazyLock::new(|| { try_create_int_counter( "discovery_no_useful_enrs_found", @@ -219,14 +186,3 @@ pub static RESPONSE_IDLING: LazyLock> = LazyLock::new(|| { "The time our response remained idle in the response limiter", ) }); - -pub fn scrape_discovery_metrics() { - let metrics = - discv5::metrics::Metrics::from(discv5::Discv5::::raw_metrics()); - set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second); - set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64); - set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64); - set_gauge_vec(&DISCOVERY_BYTES, &["outbound"], metrics.bytes_sent as i64); - set_gauge_vec(&NAT_OPEN, &["discv5_ipv4"], metrics.ipv4_contactable as i64); - set_gauge_vec(&NAT_OPEN, &["discv5_ipv6"], metrics.ipv6_contactable as i64); -} diff --git a/beacon_node/lighthouse_network/src/peer_manager/mod.rs b/beacon_node/lighthouse_network/src/peer_manager/mod.rs index 93515ed5f6..ad16bb0421 100644 --- 
a/beacon_node/lighthouse_network/src/peer_manager/mod.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/mod.rs @@ -1,7 +1,5 @@ //! Implementation of Lighthouse's peer management system. -use crate::discovery::enr_ext::EnrExt; -use crate::discovery::peer_id_to_node_id; use crate::rpc::{GoodbyeReason, MetaData, Protocol, RPCError, RpcErrorResponse}; use crate::service::TARGET_SUBNET_PEERS; use crate::{Gossipsub, NetworkGlobals, PeerId, Subnet, SubnetDiscovery, metrics}; @@ -17,7 +15,7 @@ use std::{ time::{Duration, Instant}, }; use tracing::{debug, error, trace, warn}; -use types::{DataColumnSubnetId, EthSpec, SyncSubnetId}; +use types::{DataColumnSubnetId, EthSpec, SubnetId, SyncSubnetId}; pub use libp2p::core::Multiaddr; pub use libp2p::identity::Keypair; @@ -25,10 +23,11 @@ pub use libp2p::identity::Keypair; pub mod peerdb; use crate::peer_manager::peerdb::client::ClientKind; +use crate::types::GossipKind; use libp2p::multiaddr; -pub use peerdb::peer_info::{ - ConnectionDirection, PeerConnectionStatus, PeerConnectionStatus::*, PeerInfo, -}; +use network_utils::discovery_metrics; +use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; +pub use peerdb::peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; use peerdb::score::{PeerAction, ReportSource}; pub use peerdb::sync_status::{SyncInfo, SyncStatus}; use std::collections::{HashMap, HashSet, hash_map::Entry}; @@ -38,6 +37,14 @@ use types::data_column_custody_group::{ CustodyIndex, compute_subnets_from_custody_group, get_custody_groups, }; +/// Unified peer subnet information structure for pruning logic. +struct PeerSubnetInfo { + info: PeerInfo, + attestation_subnets: HashSet, + sync_committees: HashSet, + custody_subnets: HashSet, +} + pub mod config; mod network_behaviour; @@ -52,6 +59,8 @@ pub const PEER_RECONNECTION_TIMEOUT: Duration = Duration::from_secs(600); /// lower our peer count below this number. Instead we favour a non-uniform distribution of subnet /// peers. 
pub const MIN_SYNC_COMMITTEE_PEERS: u64 = 2; +/// Avoid pruning sampling peers if subnet peer count is below this number. +pub const MIN_SAMPLING_COLUMN_SUBNET_PEERS: u64 = 2; /// A fraction of `PeerManager::target_peers` that we allow to connect to us in excess of /// `PeerManager::target_peers`. For clarity, if `PeerManager::target_peers` is 50 and /// PEER_EXCESS_FACTOR = 0.1 we allow 10% more nodes, i.e 55. @@ -161,7 +170,7 @@ impl PeerManager { } = cfg; // Set up the peer manager heartbeat interval - let heartbeat = tokio::time::interval(tokio::time::Duration::from_secs(HEARTBEAT_INTERVAL)); + let heartbeat = tokio::time::interval(Duration::from_secs(HEARTBEAT_INTERVAL)); // Compute subnets for all custody groups let subnets_by_custody_group = if network_globals.spec.is_peer_das_scheduled() { @@ -729,7 +738,16 @@ impl PeerManager { } } else { // we have no meta-data for this peer, update - debug!(%peer_id, new_seq_no = meta_data.seq_number(), "Obtained peer's metadata"); + let cgc = meta_data + .custody_group_count() + .map(|&count| count.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + debug!( + %peer_id, + new_seq_no = meta_data.seq_number(), + cgc, + "Obtained peer's metadata" + ); } let known_custody_group_count = peer_info @@ -949,6 +967,43 @@ impl PeerManager { } } + /// Run discovery query for additional custody peers if we fall below `MIN_SAMPLING_COLUMN_SUBNET_PEERS`. 
+ fn maintain_custody_peers(&mut self) { + let subnets_to_discover: Vec = self + .network_globals + .sampling_subnets() + .iter() + .filter_map(|custody_subnet| { + if self + .network_globals + .peers + .read() + .has_good_peers_in_custody_subnet( + custody_subnet, + MIN_SAMPLING_COLUMN_SUBNET_PEERS as usize, + ) + { + None + } else { + Some(SubnetDiscovery { + subnet: Subnet::DataColumn(*custody_subnet), + min_ttl: None, + }) + } + }) + .collect(); + + // request the subnet query from discovery + if !subnets_to_discover.is_empty() { + debug!( + subnets = ?subnets_to_discover.iter().map(|s| s.subnet).collect::>(), + "Making subnet queries for maintaining custody peers" + ); + self.events + .push(PeerManagerEvent::DiscoverSubnetPeers(subnets_to_discover)); + } + } + fn maintain_trusted_peers(&mut self) { let trusted_peers = self.trusted_peers.clone(); for trusted_peer in trusted_peers { @@ -991,9 +1046,204 @@ impl PeerManager { } } + /// Build unified peer subnet information from connected peers. + /// + /// This creates a unified structure containing all subnet information for each peer, + /// excluding trusted peers and peers already marked for pruning. 
+ fn build_peer_subnet_info( + &self, + peers_to_prune: &HashSet, + ) -> HashMap> { + let mut peer_subnet_info: HashMap> = HashMap::new(); + + for (peer_id, info) in self.network_globals.peers.read().connected_peers() { + // Ignore peers we trust or that we are already pruning + if info.is_trusted() || peers_to_prune.contains(peer_id) { + continue; + } + + let mut peer_info = PeerSubnetInfo { + info: info.clone(), + attestation_subnets: HashSet::new(), + sync_committees: HashSet::new(), + custody_subnets: HashSet::new(), + }; + + // Populate subnet information from long-lived subnets + for subnet in info.long_lived_subnets() { + match subnet { + Subnet::Attestation(subnet_id) => { + peer_info.attestation_subnets.insert(subnet_id); + } + Subnet::SyncCommittee(id) => { + peer_info.sync_committees.insert(id); + } + Subnet::DataColumn(id) => { + peer_info.custody_subnets.insert(id); + } + } + } + + peer_subnet_info.insert(*peer_id, peer_info); + } + + peer_subnet_info + } + + /// Build reverse lookup from custody subnets to peer lists. + fn build_custody_subnet_lookup( + peer_subnet_info: &HashMap>, + ) -> HashMap> { + let mut custody_subnet_to_peers: HashMap> = HashMap::new(); + + for (peer_id, peer_info) in peer_subnet_info { + for &custody_subnet in &peer_info.custody_subnets { + custody_subnet_to_peers + .entry(custody_subnet) + .or_default() + .push(*peer_id); + } + } + + custody_subnet_to_peers + } + + /// Determine if a peer should be protected from pruning based on various criteria. + /// + /// Protection criteria: + /// - Outbound peers: don't prune if it would drop below target outbound peer count + /// - Data column sampling: ≤ MIN_SAMPLING_COLUMN_SUBNET_PEERS (2) peers per subnet + /// - Sync committees: ≤ MIN_SYNC_COMMITTEE_PEERS (2) peers per committee + /// - Attestation subnets: protect peers on the scarcest attestation subnets + /// + /// Returns true if the peer should be protected (not pruned). 
+ fn should_protect_peer( + &self, + candidate_info: &PeerSubnetInfo, + sampling_subnets: &HashSet, + custody_subnet_to_peers: &HashMap>, + peer_subnet_info: &HashMap>, + connected_outbound_peer_count: usize, + outbound_peers_pruned: usize, + ) -> bool { + // Ensure we don't remove too many outbound peers + if candidate_info.info.is_outbound_only() + && self.target_outbound_peers() + >= connected_outbound_peer_count.saturating_sub(outbound_peers_pruned) + { + return true; + } + + // Check data column sampling subnets + // If the peer exists in a sampling subnet that is less than or equal to MIN_SAMPLING_COLUMN_SUBNET_PEERS, we keep it + let should_protect_sampling = candidate_info + .custody_subnets + .iter() + .filter(|subnet| sampling_subnets.contains(subnet)) + .any(|subnet| { + let count = custody_subnet_to_peers + .get(subnet) + .map(|peers| peers.len()) + .unwrap_or(0); + count <= MIN_SAMPLING_COLUMN_SUBNET_PEERS as usize + }); + + if should_protect_sampling { + return true; + } + + // Check sync committee protection + let should_protect_sync = candidate_info.sync_committees.iter().any(|sync_committee| { + let count = peer_subnet_info + .values() + .filter(|p| p.sync_committees.contains(sync_committee)) + .count(); + count <= MIN_SYNC_COMMITTEE_PEERS as usize + }); + + if should_protect_sync { + return true; + } + + // Check attestation subnet to avoid pruning from subnets with the lowest peer count + let attestation_subnet_counts: HashMap = peer_subnet_info + .values() + .flat_map(|p| &p.attestation_subnets) + .fold(HashMap::new(), |mut acc, &subnet| { + *acc.entry(subnet).or_insert(0) += 1; + acc + }); + + if let Some(&least_dense_size) = attestation_subnet_counts.values().min() { + let is_on_least_dense = candidate_info + .attestation_subnets + .iter() + .any(|subnet| attestation_subnet_counts.get(subnet) == Some(&least_dense_size)); + + if is_on_least_dense { + return true; + } + } + + false + } + + /// Find the best candidate for removal from the densest 
custody subnet. + /// + /// Returns the PeerId of the candidate to remove, or None if no suitable candidate found. + fn find_prune_candidate( + &self, + column_subnet: DataColumnSubnetId, + column_subnet_to_peers: &HashMap>, + peer_subnet_info: &HashMap>, + sampling_subnets: &HashSet, + connected_outbound_peer_count: usize, + outbound_peers_pruned: usize, + ) -> Option { + let peers_on_subnet_clone = column_subnet_to_peers.get(&column_subnet)?.clone(); + + // Create a sorted list of peers prioritized for removal + let mut sorted_peers = peers_on_subnet_clone; + sorted_peers.shuffle(&mut rand::rng()); + sorted_peers.sort_by_key(|peer_id| { + if let Some(peer_info) = peer_subnet_info.get(peer_id) { + ( + peer_info.info.custody_subnet_count(), + peer_info.info.is_synced_or_advanced(), + ) + } else { + (0, false) + } + }); + + // Try and find a candidate peer to remove from the subnet + for candidate_peer in &sorted_peers { + let Some(candidate_info) = peer_subnet_info.get(candidate_peer) else { + continue; + }; + + // Check if this peer should be protected + if self.should_protect_peer( + candidate_info, + sampling_subnets, + column_subnet_to_peers, + peer_subnet_info, + connected_outbound_peer_count, + outbound_peers_pruned, + ) { + continue; + } + + // Found a suitable candidate + return Some(*candidate_peer); + } + + None + } + /// Remove excess peers back down to our target values. /// This prioritises peers with a good score and uniform distribution of peers across - /// subnets. + /// data column subnets. /// /// The logic for the peer pruning is as follows: /// @@ -1023,9 +1273,12 @@ impl PeerManager { /// Prune peers in the following order: /// 1. Remove worst scoring peers /// 2. Remove peers that are not subscribed to a subnet (they have less value) - /// 3. Remove peers that we have many on any particular subnet - /// 4. Randomly remove peers if all the above are satisfied - /// + /// 3. 
Remove peers that we have many on any particular subnet, with some exceptions + /// - Don't remove peers needed for data column sampling (≥ MIN_SAMPLING_COLUMN_SUBNET_PEERS) + /// - Don't remove peers needed for sync committees (>=MIN_SYNC_COMMITTEE_PEERS) + /// - Don't remove peers from the lowest density attestation subnets + /// 4. Randomly remove peers if all the above are satisfied until we reach `target_peers`, or + /// until we can't prune any more peers due to the above constraints. fn prune_excess_peers(&mut self) { // The current number of connected peers. let connected_peer_count = self.network_globals.connected_peers(); @@ -1035,7 +1288,7 @@ impl PeerManager { } // Keep a list of peers we are pruning. - let mut peers_to_prune = std::collections::HashSet::new(); + let mut peers_to_prune = HashSet::new(); let connected_outbound_peer_count = self.network_globals.connected_outbound_only_peers(); // Keep track of the number of outbound peers we are pruning. @@ -1087,146 +1340,57 @@ impl PeerManager { prune_peers!(|info: &PeerInfo| { !info.has_long_lived_subnet() }); } - // 3. and 4. Remove peers that are too grouped on any given subnet. If all subnets are + // 3. and 4. Remove peers that are too grouped on any given data column subnet. If all subnets are // uniformly distributed, remove random peers. if peers_to_prune.len() < connected_peer_count.saturating_sub(self.target_peers) { - // Of our connected peers, build a map from subnet_id -> Vec<(PeerId, PeerInfo)> - let mut subnet_to_peer: HashMap)>> = HashMap::new(); - // These variables are used to track if a peer is in a long-lived sync-committee as we - // may wish to retain this peer over others when pruning. 
- let mut sync_committee_peer_count: HashMap = HashMap::new(); - let mut peer_to_sync_committee: HashMap< - PeerId, - std::collections::HashSet, - > = HashMap::new(); + let sampling_subnets = self.network_globals.sampling_subnets(); + let mut peer_subnet_info = self.build_peer_subnet_info(&peers_to_prune); + let mut custody_subnet_to_peers = Self::build_custody_subnet_lookup(&peer_subnet_info); - for (peer_id, info) in self.network_globals.peers.read().connected_peers() { - // Ignore peers we trust or that we are already pruning - if info.is_trusted() || peers_to_prune.contains(peer_id) { - continue; - } - - // Count based on long-lived subnets not short-lived subnets - // NOTE: There are only 4 sync committees. These are likely to be denser than the - // subnets, so our priority here to make the subnet peer count uniform, ignoring - // the dense sync committees. - for subnet in info.long_lived_subnets() { - match subnet { - Subnet::Attestation(_) => { - subnet_to_peer - .entry(subnet) - .or_default() - .push((*peer_id, info.clone())); - } - Subnet::SyncCommittee(id) => { - *sync_committee_peer_count.entry(id).or_default() += 1; - peer_to_sync_committee - .entry(*peer_id) - .or_default() - .insert(id); - } - // TODO(das) to be implemented. We're not pruning data column peers yet - // because data column topics are subscribed as core topics until we - // implement recomputing data column subnets. - Subnet::DataColumn(_) => {} - } - } - } - - // Add to the peers to prune mapping + // Attempt to prune peers to `target_peers`, or until we run out of peers to prune. 
while peers_to_prune.len() < connected_peer_count.saturating_sub(self.target_peers) { - if let Some((_, peers_on_subnet)) = subnet_to_peer - .iter_mut() + let custody_subnet_with_most_peers = custody_subnet_to_peers + .iter() + .filter(|(_, peers)| !peers.is_empty()) .max_by_key(|(_, peers)| peers.len()) - { - // and the subnet still contains peers - if !peers_on_subnet.is_empty() { - // Order the peers by the number of subnets they are long-lived - // subscribed too, shuffle equal peers. - peers_on_subnet.shuffle(&mut rand::rng()); - peers_on_subnet.sort_by_key(|(_, info)| info.long_lived_subnet_count()); + .map(|(subnet_id, _)| *subnet_id); - // Try and find a candidate peer to remove from the subnet. - // We ignore peers that would put us below our target outbound peers - // and we currently ignore peers that would put us below our - // sync-committee threshold, if we can avoid it. - - let mut removed_peer_index = None; - for (index, (candidate_peer, info)) in peers_on_subnet.iter().enumerate() { - // Ensure we don't remove too many outbound peers - if info.is_outbound_only() - && self.target_outbound_peers() - >= connected_outbound_peer_count - .saturating_sub(outbound_peers_pruned) - { - // Restart the main loop with the outbound peer removed from - // the list. This will lower the peers per subnet count and - // potentially a new subnet may be chosen to remove peers. This - // can occur recursively until we have no peers left to choose - // from. 
- continue; - } - - // Check the sync committee - if let Some(subnets) = peer_to_sync_committee.get(candidate_peer) { - // The peer is subscribed to some long-lived sync-committees - // Of all the subnets this peer is subscribed too, the minimum - // peer count of all of them is min_subnet_count - if let Some(min_subnet_count) = subnets - .iter() - .filter_map(|v| sync_committee_peer_count.get(v).copied()) - .min() - { - // If the minimum count is our target or lower, we - // shouldn't remove this peer, because it drops us lower - // than our target - if min_subnet_count <= MIN_SYNC_COMMITTEE_PEERS { - // Do not drop this peer in this pruning interval - continue; - } - } - } - - if info.is_outbound_only() { - outbound_peers_pruned += 1; - } - // This peer is suitable to be pruned - removed_peer_index = Some(index); - break; + if let Some(densest_subnet) = custody_subnet_with_most_peers { + // If we have successfully found a candidate peer to prune, prune it, + // otherwise all peers on this subnet should not be removed due to our + // outbound limit or min_subnet_count. In this case, we remove all + // peers from the pruning logic and try another subnet. + if let Some(candidate_peer) = self.find_prune_candidate( + densest_subnet, + &custody_subnet_to_peers, + &peer_subnet_info, + &sampling_subnets, + connected_outbound_peer_count, + outbound_peers_pruned, + ) { + // Update outbound peer count if needed + if let Some(candidate_info) = peer_subnet_info.get(&candidate_peer) + && candidate_info.info.is_outbound_only() + { + outbound_peers_pruned += 1; } - // If we have successfully found a candidate peer to prune, prune it, - // otherwise all peers on this subnet should not be removed due to our - // outbound limit or min_subnet_count. In this case, we remove all - // peers from the pruning logic and try another subnet. 
- if let Some(index) = removed_peer_index { - let (candidate_peer, _) = peers_on_subnet.remove(index); - // Remove pruned peers from other subnet counts - for subnet_peers in subnet_to_peer.values_mut() { - subnet_peers.retain(|(peer_id, _)| peer_id != &candidate_peer); - } - // Remove pruned peers from all sync-committee counts - if let Some(known_sync_committes) = - peer_to_sync_committee.get(&candidate_peer) - { - for sync_committee in known_sync_committes { - if let Some(sync_committee_count) = - sync_committee_peer_count.get_mut(sync_committee) - { - *sync_committee_count = - sync_committee_count.saturating_sub(1); - } - } - } - peers_to_prune.insert(candidate_peer); - } else { - peers_on_subnet.clear(); + // Remove the candidate peer from the maps, so we don't account for them + // when finding the next prune candidate. + for subnet_peers in custody_subnet_to_peers.values_mut() { + subnet_peers.retain(|peer_id| peer_id != &candidate_peer); } - continue; + peer_subnet_info.remove(&candidate_peer); + + peers_to_prune.insert(candidate_peer); + } else if let Some(peers) = custody_subnet_to_peers.get_mut(&densest_subnet) { + // If we can't find a prune candidate in this subnet, remove peers in this subnet + peers.clear() } + } else { + // If there are no peers left to prune, exit. + break; } - // If there are no peers left to prune exit. - break; } } @@ -1271,6 +1435,17 @@ impl PeerManager { // Update peer score metrics; self.update_peer_score_metrics(); + // Maintain minimum count for custody peers if we are subscribed to any data column topics (i.e. PeerDAS activated) + let peerdas_enabled = self + .network_globals + .gossipsub_subscriptions + .read() + .iter() + .any(|topic| matches!(topic.kind(), &GossipKind::DataColumnSidecar(_))); + if peerdas_enabled { + self.maintain_custody_peers(); + } + // Maintain minimum count for sync committee peers. 
self.maintain_sync_committee_peers(); @@ -1420,16 +1595,16 @@ impl PeerManager { // Set ipv4 nat_open metric flag if threshold of peercount is met, unset if below threshold if inbound_ipv4_peers_connected >= LIBP2P_NAT_OPEN_THRESHOLD { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv4"], 1); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv4"], 1); } else { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv4"], 0); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv4"], 0); } // Set ipv6 nat_open metric flag if threshold of peercount is met, unset if below threshold if inbound_ipv6_peers_connected >= LIBP2P_NAT_OPEN_THRESHOLD { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv6"], 1); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"], 1); } else { - metrics::set_gauge_vec(&metrics::NAT_OPEN, &["libp2p_ipv6"], 0); + metrics::set_gauge_vec(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"], 0); } // PEERS_CONNECTED @@ -1561,6 +1736,22 @@ mod tests { PeerManager::new(config, Arc::new(globals)).unwrap() } + fn empty_synced_status() -> SyncStatus { + SyncStatus::Synced { + info: empty_sync_info(), + } + } + + fn empty_sync_info() -> SyncInfo { + SyncInfo { + head_slot: Default::default(), + head_root: Default::default(), + finalized_epoch: Default::default(), + finalized_root: Default::default(), + earliest_available_slot: None, + } + } + #[tokio::test] async fn test_peer_manager_disconnects_correctly_during_heartbeat() { // Create 6 peers to connect to with a target of 3. @@ -1805,6 +1996,7 @@ mod tests { /// a priority over all else. async fn test_peer_manager_remove_non_subnet_peers_when_all_healthy() { let mut peer_manager = build_peer_manager(3).await; + let spec = peer_manager.network_globals.spec.clone(); // Create 5 peers to connect to. 
let peer0 = PeerId::random(); @@ -1828,10 +2020,11 @@ mod tests { // Have some of the peers be on a long-lived subnet let mut attnets = crate::types::EnrAttestationBitfield::::new(); attnets.set(1, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1839,7 +2032,7 @@ mod tests { .write() .peer_info_mut(&peer0) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + .set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1848,10 +2041,11 @@ mod tests { let mut attnets = crate::types::EnrAttestationBitfield::::new(); attnets.set(10, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets: Default::default(), + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1859,7 +2053,7 @@ mod tests { .write() .peer_info_mut(&peer2) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + .set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1868,10 +2062,11 @@ mod tests { let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); syncnets.set(3, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { + let metadata = MetaDataV3 { seq_number: 0, attnets: Default::default(), syncnets, + custody_group_count: spec.custody_requirement, }; peer_manager .network_globals @@ -1879,7 +2074,7 @@ mod tests { .write() .peer_info_mut(&peer4) .unwrap() - .set_meta_data(MetaData::V2(metadata)); + .set_meta_data(MetaData::V3(metadata)); peer_manager .network_globals .peers @@ -1893,7 +2088,7 @@ mod tests { assert_eq!(peer_manager.network_globals.connected_or_dialing_peers(), 3); // Check that we removed the peers that were not subscribed to any subnet - let mut peers_should_have_removed = std::collections::HashSet::new(); + let mut peers_should_have_removed = 
HashSet::new(); peers_should_have_removed.insert(peer1); peers_should_have_removed.insert(peer3); for (peer, _) in peer_manager @@ -1954,12 +2149,14 @@ mod tests { } #[tokio::test] - /// Test the pruning logic to remove grouped subnet peers - async fn test_peer_manager_prune_grouped_subnet_peers() { + /// Test the pruning logic to remove grouped data column subnet peers + async fn test_peer_manager_prune_grouped_data_column_subnet_peers() { let target = 9; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); - // Create 5 peers to connect to. + // Create 20 peers to connect to. let mut peers = Vec::new(); for x in 0..20 { // Make 20 peers and group peers as: @@ -1972,25 +2169,18 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); - attnets.set(subnet as usize, true).unwrap(); - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets: Default::default(), - }; + { + let mut peers_db = peer_manager.network_globals.peers.write(); + let peer_info = peers_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(HashSet::from([DataColumnSubnetId::new(subnet)])); + peer_info.update_sync_status(empty_synced_status()); + } + peer_manager .network_globals .peers .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); - peer_manager - .network_globals - .peers - .write() - .add_subscription(&peer, Subnet::Attestation(subnet.into())); + .add_subscription(&peer, Subnet::DataColumn(subnet.into())); println!("{},{},{}", x, subnet, peer); peers.push(peer); } @@ -2062,7 +2252,7 @@ mod tests { /// most peers and have the least subscribed long-lived subnets. 
And peer 0 because it has no /// long-lived subnet. #[tokio::test] - async fn test_peer_manager_prune_subnet_peers_most_subscribed() { + async fn test_peer_manager_prune_data_column_subnet_peers_most_subscribed() { let target = 3; let mut peer_manager = build_peer_manager(target).await; @@ -2073,43 +2263,27 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); - - match x { - 0 => {} - 1 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - attnets.set(3, true).unwrap(); - } - 2 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - } - 3 => { - attnets.set(3, true).unwrap(); - } - 4 => { - attnets.set(1, true).unwrap(); - } - 5 => { - attnets.set(2, true).unwrap(); - } + let custody_subnets = match x { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(2), + DataColumnSubnetId::new(3), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]), + 3 => HashSet::from([DataColumnSubnetId::new(3)]), + 4 => HashSet::from([DataColumnSubnetId::new(1)]), + 5 => HashSet::from([DataColumnSubnetId::new(2)]), _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets: Default::default(), - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager .network_globals .peers @@ -2153,22 +2327,24 @@ mod tests { assert!(!connected_peers.contains(&peers[5])); } - /// Test the pruning logic to prioritise peers with the most 
subnets, but not at the expense of - /// removing our few sync-committee subnets. + /// Test the pruning logic to prioritise peers with the most data column subnets, but not at + /// the expense of removing our few sync-committee subnets. /// /// Create 6 peers. /// Peer0: None - /// Peer1 : Subnet 1,2,3, - /// Peer2 : Subnet 1,2, - /// Peer3 : Subnet 3 - /// Peer4 : Subnet 1,2, Sync-committee-1 - /// Peer5 : Subnet 1,2, Sync-committee-2 + /// Peer1 : Column subnet 1,2,3, + /// Peer2 : Column subnet 1,2, + /// Peer3 : Column subnet 3 + /// Peer4 : Column subnet 1,2, Sync-committee-1 + /// Peer5 : Column subnet 1,2, Sync-committee-2 /// /// Prune 3 peers: Should be Peer0, Peer1 and Peer2 because (4 and 5 are on a sync-committee) #[tokio::test] async fn test_peer_manager_prune_subnet_peers_sync_committee() { let target = 3; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); // Create 6 peers to connect to. 
let mut peers = Vec::new(); @@ -2177,48 +2353,40 @@ mod tests { peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); - - match x { - 0 => {} - 1 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - attnets.set(3, true).unwrap(); - } - 2 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); - } - 3 => { - attnets.set(3, true).unwrap(); - } + let custody_subnets = match x { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(2), + DataColumnSubnetId::new(3), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]), + 3 => HashSet::from([DataColumnSubnetId::new(3)]), 4 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]) } 5 => { - attnets.set(1, true).unwrap(); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]) } _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_meta_data(MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, // unused in this test, as pruning logic uses `custody_subnets` + })); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets, - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager 
.network_globals .peers @@ -2262,10 +2430,111 @@ mod tests { assert!(!connected_peers.contains(&peers[2])); } + /// Test that custody subnet peer count below the `MIN_SAMPLING_COLUMN_SUBNET_PEERS`(2) + /// threshold are protected from pruning. + /// + /// Create 8 peers. + /// Peer0: None (can be pruned) + /// Peer1: Subnet 1,4,5 + /// Peer2: Subnet 1,4 + /// Peer3: Subnet 2 + /// Peer4: Subnet 2 + /// Peer5: Subnet 1 (can be pruned) + /// Peer6: Subnet 3 + /// Peer7: Subnet 5 (can be pruned) + /// + /// Sampling subnets: 1, 2 + /// + /// Prune 3 peers: Should be Peer0, Peer 5 and Peer 7 because + /// - Peer 0 because it has no long-lived subnet. + /// - Peer 5 is on the subnet with the most peers and have the least subscribed long-lived subnets. + /// - Peer 7 because it's on a non-sampling subnet and have the least subscribed long-lived subnets. + #[tokio::test] + async fn test_peer_manager_protect_sampling_subnet_peers_below_threshold() { + let target = 5; + let mut peer_manager = build_peer_manager(target).await; + + *peer_manager.network_globals.sampling_subnets.write() = + HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(2)]); + + // Create 8 peers to connect to. 
+ let mut peers = Vec::new(); + for peer_idx in 0..8 { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + // Have some of the peers be on a long-lived subnet + let custody_subnets = match peer_idx { + 0 => HashSet::new(), + 1 => HashSet::from([ + DataColumnSubnetId::new(1), + DataColumnSubnetId::new(4), + DataColumnSubnetId::new(5), + ]), + 2 => HashSet::from([DataColumnSubnetId::new(1), DataColumnSubnetId::new(4)]), + 3 => HashSet::from([DataColumnSubnetId::new(2)]), + 4 => HashSet::from([DataColumnSubnetId::new(2)]), + 5 => HashSet::from([DataColumnSubnetId::new(1)]), + 6 => HashSet::from([DataColumnSubnetId::new(3)]), + 7 => HashSet::from([DataColumnSubnetId::new(5)]), + _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); + } + + let long_lived_subnets = peer_manager + .network_globals + .peers + .read() + .peer_info(&peer) + .unwrap() + .long_lived_subnets(); + for subnet in long_lived_subnets { + println!("Subnet: {:?}", subnet); + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, subnet); + } + println!("{},{}", peer_idx, peer); + peers.push(peer); + } + + // Perform the heartbeat. + peer_manager.heartbeat(); + + // Tests that when we are over the target peer limit, after disconnecting an unhealthy peer, + // the number of connected peers updates and we will not remove too many peers. 
+ assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + // Check that we removed peers 0, 5 and 7 + let connected_peers: std::collections::HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + println!("Connected peers: {:?}", connected_peers); + assert!(!connected_peers.contains(&peers[0])); + assert!(!connected_peers.contains(&peers[5])); + assert!(!connected_peers.contains(&peers[7])); + } + /// This test is for reproducing the issue: /// https://github.com/sigp/lighthouse/pull/3236#issue-1256432659 /// - /// Whether the issue happens depends on `subnet_to_peer` (HashMap), since HashMap doesn't + /// Whether the issue happens depends on `custody_subnet_to_peers` (HashMap), since HashMap doesn't /// guarantee a particular order of iteration. So we repeat the test case to try to reproduce /// the issue. #[tokio::test] @@ -2275,41 +2544,42 @@ mod tests { } } - /// Test the pruning logic to prioritize peers with the most subnets. This test specifies + /// Test the pruning logic to prioritize peers with the most column subnets. This test specifies /// the connection direction for the peers. /// Either Peer 4 or 5 is expected to be removed in this test case. /// /// Create 8 peers. 
- /// Peer0 (out) : Subnet 1, Sync-committee-1 - /// Peer1 (out) : Subnet 1, Sync-committee-1 - /// Peer2 (out) : Subnet 2, Sync-committee-2 - /// Peer3 (out) : Subnet 2, Sync-committee-2 - /// Peer4 (out) : Subnet 3 - /// Peer5 (out) : Subnet 3 - /// Peer6 (in) : Subnet 4 - /// Peer7 (in) : Subnet 5 + /// Peer0 (out) : Column subnet 1, Sync-committee-1 + /// Peer1 (out) : Column subnet 1, Sync-committee-1 + /// Peer2 (out) : Column subnet 2, Sync-committee-2 + /// Peer3 (out) : Column subnet 2, Sync-committee-2 + /// Peer4 (out) : Column subnet 3 + /// Peer5 (out) : Column subnet 3 + /// Peer6 (in) : Column subnet 4 + /// Peer7 (in) : Column subnet 5 async fn test_peer_manager_prune_based_on_subnet_count() { let target = 7; let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); // Create 8 peers to connect to. let mut peers = Vec::new(); - for x in 0..8 { + for peer_idx in 0..8 { let peer = PeerId::random(); // Have some of the peers be on a long-lived subnet - let mut attnets = crate::types::EnrAttestationBitfield::::new(); let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); - match x { + let custody_subnets = match peer_idx { 0 => { peer_manager.inject_connect_outgoing( &peer, "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(1, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1)]) } 1 => { peer_manager.inject_connect_outgoing( @@ -2317,8 +2587,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(1, true).unwrap(); syncnets.set(1, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(1)]) } 2 => { peer_manager.inject_connect_outgoing( @@ -2326,8 +2596,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + 
HashSet::from([DataColumnSubnetId::new(2)]) } 3 => { peer_manager.inject_connect_outgoing( @@ -2335,8 +2605,8 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(2, true).unwrap(); syncnets.set(2, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(2)]) } 4 => { peer_manager.inject_connect_outgoing( @@ -2344,7 +2614,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(3, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(3)]) } 5 => { peer_manager.inject_connect_outgoing( @@ -2352,7 +2622,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(3, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(3)]) } 6 => { peer_manager.inject_connect_ingoing( @@ -2360,7 +2630,7 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(4, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(4)]) } 7 => { peer_manager.inject_connect_ingoing( @@ -2368,23 +2638,26 @@ mod tests { "/ip4/0.0.0.0".parse().unwrap(), None, ); - attnets.set(5, true).unwrap(); + HashSet::from([DataColumnSubnetId::new(5)]) } _ => unreachable!(), + }; + + let metadata = MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, // unused in this test, as pruning logic uses `custody_subnets` + }; + + { + let mut peer_db_write = peer_manager.network_globals.peers.write(); + let peer_info = peer_db_write.peer_info_mut(&peer).unwrap(); + peer_info.set_meta_data(MetaData::V3(metadata)); + peer_info.set_custody_subnets(custody_subnets); + peer_info.update_sync_status(empty_synced_status()); } - let metadata = crate::rpc::MetaDataV2 { - seq_number: 0, - attnets, - syncnets, - }; - peer_manager - .network_globals - .peers - .write() - .peer_info_mut(&peer) - .unwrap() - .set_meta_data(MetaData::V2(metadata)); let long_lived_subnets = peer_manager .network_globals .peers @@ -2392,7 +2665,7 @@ mod tests { .peer_info(&peer) .unwrap() .long_lived_subnets(); - println!("{},{}", x, peer); + 
println!("{},{}", peer_idx, peer); for subnet in long_lived_subnets { println!("Subnet: {:?}", subnet); peer_manager @@ -2428,17 +2701,285 @@ mod tests { assert!(connected_peers.contains(&peers[7])); } + /// Test that peers with the sparsest attestation subnets are protected from pruning. + /// + /// Create 7 peers: + /// - 4 on attnet 0 + /// - 1 on attnet 1 (least dense) + /// - 2 on attnet 2 + /// + /// Prune 3 peers: 2 peers from subnet 0 and 1 from either subnet 0 or 2, BUT never from attnet 1. + #[tokio::test] + async fn test_peer_manager_not_prune_sparsest_attestation_subnet() { + let target = 4; + let mut peer_manager = build_peer_manager(target).await; + let spec = peer_manager.network_globals.spec.clone(); + let mut peers = Vec::new(); + + let subnet_assignments = [0, 0, 0, 0, 1, 2, 2]; + + for &subnet in subnet_assignments.iter() { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let mut attnets = crate::types::EnrAttestationBitfield::::new(); + attnets.set(subnet, true).unwrap(); + + let metadata = MetaDataV3 { + seq_number: 0, + attnets, + syncnets: Default::default(), + custody_group_count: spec.custody_requirement, + }; + peer_manager + .network_globals + .peers + .write() + .peer_info_mut(&peer) + .unwrap() + .set_meta_data(MetaData::V3(metadata)); + + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, Subnet::Attestation((subnet as u64).into())); + + peers.push(peer); + } + + peer_manager.heartbeat(); + + // Check attestation subnet to avoid pruning from subnets with lowest peer count: + // Peer 4 (on least dense subnet 1) should be protected + // Should preferentially remove from subnet 0 (most dense) rather than subnet 1 (least dense) + let connected_peers: HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + // Peer 4 (on least dense attestation subnet 1) should be kept + 
assert!(connected_peers.contains(&peers[4])); + + // Attestation subnet uniformity should protect peers on least dense subnets + // Count peers on subnet 1 (least dense) + let subnet_1_count = peers + .iter() + .filter(|&peer| connected_peers.contains(peer)) + .filter(|&peer| { + peer_manager + .network_globals + .peers + .read() + .peer_info(peer) + .unwrap() + .long_lived_subnets() + .iter() + .any(|subnet| matches!(subnet, Subnet::Attestation(id) if id == &1u64.into())) + }) + .count(); + + assert!(subnet_1_count > 0, "Least dense subnet should be protected"); + } + + /// Test the pruning logic prioritizes synced and advanced peers over behind/unknown peers. + /// + /// Create 6 peers with different sync statuses: + /// Peer0: Behind + /// Peer1: Unknown + /// Peer2: Synced + /// Peer3: Advanced + /// Peer4: Synced + /// Peer5: Unknown + /// + /// Target: 3 peers. Should prune peers 0, 1, 5 (behind/unknown) and keep 2, 3, 4 (synced/advanced). + #[tokio::test] + async fn test_peer_manager_prune_should_prioritize_synced_advanced_peers() { + let target = 3; + let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering with this test. 
+ *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); + + let mut peers = Vec::new(); + let current_peer_count = 6; + for i in 0..current_peer_count { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let sync_status = match i { + 0 => SyncStatus::Behind { + info: empty_sync_info(), + }, + 1 | 5 => SyncStatus::Unknown, + 2 | 4 => SyncStatus::Synced { + info: empty_sync_info(), + }, + 3 => SyncStatus::Advanced { + info: empty_sync_info(), + }, + _ => unreachable!(), + }; + + { + let mut peer_db = peer_manager.network_globals.peers.write(); + let peer_info = peer_db.peer_info_mut(&peer).unwrap(); + peer_info.update_sync_status(sync_status); + // make sure all the peers have some long live subnets that are not protected + peer_info.set_custody_subnets(HashSet::from([DataColumnSubnetId::new(2)])) + } + + let long_lived_subnets = peer_manager + .network_globals + .peers + .read() + .peer_info(&peer) + .unwrap() + .long_lived_subnets(); + for subnet in long_lived_subnets { + println!("Subnet: {:?}", subnet); + peer_manager + .network_globals + .peers + .write() + .add_subscription(&peer, subnet); + } + + peers.push(peer); + } + + // Perform the heartbeat to trigger pruning + peer_manager.heartbeat(); + + // Should have exactly target number of peers + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + let connected_peers: std::collections::HashSet<_> = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + // Count how many synced/advanced peers are kept vs behind/unknown peers + let synced_advanced_kept = [&peers[2], &peers[3], &peers[4]] + .iter() + .filter(|peer| connected_peers.contains(peer)) + .count(); + + let behind_unknown_kept = [&peers[0], &peers[1], &peers[5]] + .iter() + .filter(|peer| connected_peers.contains(peer)) + .count(); + + assert_eq!(synced_advanced_kept, 
target); + assert_eq!(behind_unknown_kept, 0); + } + + /// Test that `peer_subnet_info` is properly cleaned up during pruning iterations. + /// + /// Without proper cleanup, stale peer data affects protection logic for sync committees and we + /// may end up pruning more than expected. + #[tokio::test] + async fn test_peer_manager_prune_mixed_custody_subnet_protection() { + let target = 6; + let mut peer_manager = build_peer_manager(target).await; + // Override sampling subnets to prevent sampling peer protection from interfering. + *peer_manager.network_globals.sampling_subnets.write() = HashSet::new(); + + // Create 12 peers: + // * 4 on custody subnet 0, all on sync committee 0 subnet as well (should only prune up to 2 peers) + // * 3 on subnet 1 + // * 2 on subnet 2 + // * 3 scattered. + let mut peers = Vec::new(); + for i in 0..12 { + let peer = PeerId::random(); + peer_manager.inject_connect_ingoing(&peer, "/ip4/0.0.0.0".parse().unwrap(), None); + + let custody_subnet = match i { + ..4 => 0, + 4..7 => 1, + 7..9 => 2, + _ => i - 6, + }; + let on_sync_committee = i < 4; + + { + let mut peers_db = peer_manager.network_globals.peers.write(); + let peer_info = peers_db.peer_info_mut(&peer).unwrap(); + peer_info + .set_custody_subnets(HashSet::from([DataColumnSubnetId::new(custody_subnet)])); + peer_info.update_sync_status(empty_synced_status()); + + if on_sync_committee { + let mut syncnets = crate::types::EnrSyncCommitteeBitfield::::new(); + syncnets.set(0, true).unwrap(); + peer_info.set_meta_data(MetaData::V3(MetaDataV3 { + seq_number: 0, + attnets: Default::default(), + syncnets, + custody_group_count: 0, + })); + } + + for subnet in peer_info.long_lived_subnets() { + peers_db.add_subscription(&peer, subnet); + } + + peers.push(peer); + } + } + + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + 12 + ); + + peer_manager.heartbeat(); + + assert_eq!( + peer_manager.network_globals.connected_or_dialing_peers(), + target + ); + + let 
connected_peers: HashSet = peer_manager + .network_globals + .peers + .read() + .connected_or_dialing_peers() + .cloned() + .collect(); + + // only 2 peers should be pruned from the 4 peers in subnet 0. + let remaining_sync_peers = connected_peers + .iter() + .filter(|peer| peers[0..4].contains(peer)) + .count(); + assert_eq!( + remaining_sync_peers, 2, + "Sync committee protection should preserve exactly MIN_SYNC_COMMITTEE_PEERS (2)" + ); + } + // Test properties PeerManager should have using randomly generated input. #[cfg(test)] mod property_based_tests { use crate::peer_manager::config::DEFAULT_TARGET_PEERS; use crate::peer_manager::tests::build_peer_manager_with_trusted_peers; - use crate::rpc::MetaData; + use crate::rpc::{MetaData, MetaDataV3}; use libp2p::PeerId; use quickcheck::{Arbitrary, Gen, TestResult}; use quickcheck_macros::quickcheck; + use std::collections::HashSet; use tokio::runtime::Runtime; - use types::Unsigned; + use types::{DataColumnSubnetId, Unsigned}; use types::{EthSpec, MainnetEthSpec as E}; #[derive(Clone, Debug)] @@ -2450,6 +2991,7 @@ mod tests { score: f64, trusted: bool, gossipsub_score: f64, + custody_subnets: HashSet, } impl Arbitrary for PeerCondition { @@ -2472,6 +3014,17 @@ mod tests { bitfield }; + let spec = E::default_spec(); + let custody_subnets = { + let total_subnet_count = spec.data_column_sidecar_subnet_count; + let custody_subnet_count = u64::arbitrary(g) % (total_subnet_count + 1); // 0 to 128 + (spec.custody_requirement..total_subnet_count) + .filter(|_| bool::arbitrary(g)) + .map(DataColumnSubnetId::new) + .take(custody_subnet_count as usize) + .collect() + }; + PeerCondition { peer_id: PeerId::random(), outgoing: bool::arbitrary(g), @@ -2480,6 +3033,7 @@ mod tests { score: f64::arbitrary(g), trusted: bool::arbitrary(g), gossipsub_score: f64::arbitrary(g), + custody_subnets, } } } @@ -2487,6 +3041,7 @@ mod tests { #[quickcheck] fn prune_excess_peers(peer_conditions: Vec) -> TestResult { let target_peer_count = 
DEFAULT_TARGET_PEERS; + let spec = E::default_spec(); if peer_conditions.len() < target_peer_count { return TestResult::discard(); } @@ -2533,17 +3088,22 @@ mod tests { syncnets.set(i, *value).unwrap(); } - let metadata = crate::rpc::MetaDataV2 { + let subnets_per_custody_group = + spec.data_column_sidecar_subnet_count / spec.number_of_custody_groups; + let metadata = MetaDataV3 { seq_number: 0, attnets, syncnets, + custody_group_count: condition.custody_subnets.len() as u64 + / subnets_per_custody_group, }; let mut peer_db = peer_manager.network_globals.peers.write(); let peer_info = peer_db.peer_info_mut(&condition.peer_id).unwrap(); - peer_info.set_meta_data(MetaData::V2(metadata)); + peer_info.set_meta_data(MetaData::V3(metadata)); peer_info.set_gossipsub_score(condition.gossipsub_score); peer_info.add_to_score(condition.score); + peer_info.set_custody_subnets(condition.custody_subnets.clone()); for subnet in peer_info.long_lived_subnets() { peer_db.add_subscription(&condition.peer_id, subnet); @@ -2589,4 +3149,60 @@ mod tests { }) } } + + #[tokio::test] + async fn test_custody_peer_logic_only_runs_when_peerdas_enabled() { + use crate::types::{GossipEncoding, GossipTopic}; + + let mut peer_manager = build_peer_manager(5).await; + + // Set up sampling subnets so maintain_custody_peers would have work to do + *peer_manager.network_globals.sampling_subnets.write() = std::collections::HashSet::from([ + DataColumnSubnetId::new(0), + DataColumnSubnetId::new(1), + ]); + + // Test 1: No data column subscriptions - custody peer logic should NOT run + peer_manager.heartbeat(); + + // Should be no new DiscoverSubnetPeers events since PeerDAS is not enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + discovery_events.is_empty(), + "Should not generate discovery events when PeerDAS is disabled, but found: {:?}", + discovery_events + ); + + // Test 2: 
Add data column subscription - custody peer logic should run + let data_column_topic = GossipTopic::new( + GossipKind::DataColumnSidecar(DataColumnSubnetId::new(0)), + GossipEncoding::SSZSnappy, + [0, 0, 0, 0], // fork_digest + ); + peer_manager + .network_globals + .gossipsub_subscriptions + .write() + .insert(data_column_topic); + + // Clear any existing events to isolate the test + peer_manager.events.clear(); + + peer_manager.heartbeat(); + + // Should now have DiscoverSubnetPeers events since PeerDAS is enabled + let discovery_events: Vec<_> = peer_manager + .events + .iter() + .filter(|event| matches!(event, PeerManagerEvent::DiscoverSubnetPeers(_))) + .collect(); + assert!( + !discovery_events.is_empty(), + "Should generate discovery events when PeerDAS is enabled, but found no discovery events" + ); + } } diff --git a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs index 43d9b90d8d..729dbd193b 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/network_behaviour.rs @@ -12,11 +12,12 @@ use libp2p::swarm::behaviour::{ConnectionClosed, ConnectionEstablished, DialFail use libp2p::swarm::dial_opts::{DialOpts, PeerCondition}; use libp2p::swarm::dummy::ConnectionHandler; use libp2p::swarm::{ConnectionDenied, ConnectionId, NetworkBehaviour, ToSwarm}; -pub use metrics::{NAT_OPEN, set_gauge_vec}; +use metrics::set_gauge_vec; +use network_utils::discovery_metrics::NAT_OPEN; +use network_utils::enr_ext::EnrExt; use tracing::{debug, error, trace}; use types::EthSpec; -use crate::discovery::enr_ext::EnrExt; use crate::types::SyncState; use crate::{ClearDialError, metrics}; diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs index 974b41230e..0ccad8d042 100644 --- 
a/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb.rs @@ -1,10 +1,9 @@ +use crate::discovery::CombinedKey; use crate::discovery::enr::PEERDAS_CUSTODY_GROUP_COUNT_ENR_KEY; -use crate::discovery::{CombinedKey, peer_id_to_node_id}; -use crate::{ - Enr, EnrExt, Gossipsub, PeerId, SyncInfo, metrics, multiaddr::Multiaddr, types::Subnet, -}; +use crate::{Enr, Gossipsub, PeerId, SyncInfo, metrics, multiaddr::Multiaddr, types::Subnet}; use itertools::Itertools; use logging::crit; +use network_utils::enr_ext::{EnrExt, peer_id_to_node_id}; use peer_info::{ConnectionDirection, PeerConnectionStatus, PeerInfo}; use score::{PeerAction, ReportSource, Score, ScoreState}; use std::net::IpAddr; @@ -300,6 +299,7 @@ impl PeerDB { .filter(move |(_, info)| { // We check both the metadata and gossipsub data as we only want to count long-lived subscribed peers info.is_connected() + && info.is_synced_or_advanced() && info.on_subnet_metadata(&subnet) && info.on_subnet_gossipsub(&subnet) && info.is_good_gossipsub_peer() @@ -318,40 +318,69 @@ impl PeerDB { .filter(move |(_, info)| { // The custody_subnets hashset can be populated via enr or metadata let is_custody_subnet_peer = info.is_assigned_to_custody_subnet(&subnet); - info.is_connected() && info.is_good_gossipsub_peer() && is_custody_subnet_peer + info.is_connected() + && info.is_good_gossipsub_peer() + && is_custody_subnet_peer + && info.is_synced_or_advanced() }) .map(|(peer_id, _)| peer_id) } - /// Returns an iterator of all peers that are supposed to be custodying - /// the given subnet id. - pub fn good_range_sync_custody_subnet_peers( + /// Checks if there is at least one good peer for each specified custody subnet for the given epoch. + /// A "good" peer is one that is both connected and synced (or advanced) for the specified epoch. 
+ pub fn has_good_custody_range_sync_peer( &self, - subnet: DataColumnSubnetId, - ) -> impl Iterator { - self.peers - .iter() - .filter(move |(_, info)| { - // The custody_subnets hashset can be populated via enr or metadata - info.is_connected() && info.is_assigned_to_custody_subnet(&subnet) - }) - .map(|(peer_id, _)| peer_id) - } - - /// Returns `true` if the given peer is assigned to the given subnet. - /// else returns `false` - /// - /// Returns `false` if peer doesn't exist in peerdb. - pub fn is_good_range_sync_custody_subnet_peer( - &self, - subnet: DataColumnSubnetId, - peer: &PeerId, + subnets: &HashSet, + epoch: Epoch, ) -> bool { - if let Some(info) = self.peers.get(peer) { - info.is_connected() && info.is_assigned_to_custody_subnet(&subnet) - } else { - false + let mut remaining_subnets = subnets.clone(); + + let good_sync_peers_for_epoch = self.peers.values().filter(|&info| { + info.is_connected() + && match info.sync_status() { + SyncStatus::Synced { info } | SyncStatus::Advanced { info } => { + info.has_slot(epoch.end_slot(E::slots_per_epoch())) + } + SyncStatus::IrrelevantPeer + | SyncStatus::Behind { .. } + | SyncStatus::Unknown => false, + } + }); + + for info in good_sync_peers_for_epoch { + for subnet in info.custody_subnets_iter() { + if remaining_subnets.remove(subnet) && remaining_subnets.is_empty() { + return true; + } + } } + + false + } + + /// Checks if there are sufficient good peers for a single custody subnet. + /// A "good" peer is one that is both connected and synced (or advanced). 
+ pub fn has_good_peers_in_custody_subnet( + &self, + subnet: &DataColumnSubnetId, + target_peers: usize, + ) -> bool { + let mut peer_count = 0usize; + for info in self + .peers + .values() + .filter(|info| info.is_connected() && info.is_synced_or_advanced()) + { + if info.is_assigned_to_custody_subnet(subnet) { + peer_count += 1; + } + + if peer_count >= target_peers { + return true; + } + } + + false } /// Gives the ids of all known disconnected peers. diff --git a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs index e643fca30f..c289cb9a69 100644 --- a/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs +++ b/beacon_node/lighthouse_network/src/peer_manager/peerdb/peer_info.rs @@ -174,19 +174,6 @@ impl PeerInfo { self.subnets.iter() } - /// Returns the number of long lived subnets a peer is subscribed to. - // NOTE: This currently excludes sync committee subnets - pub fn long_lived_subnet_count(&self) -> usize { - if let Some(meta_data) = self.meta_data.as_ref() { - return meta_data.attnets().num_set_bits(); - } else if let Some(enr) = self.enr.as_ref() - && let Ok(attnets) = enr.attestation_bitfield::() - { - return attnets.num_set_bits(); - } - 0 - } - /// Returns an iterator over the long-lived subnets if it has any. pub fn long_lived_subnets(&self) -> Vec { let mut long_lived_subnets = Vec::new(); @@ -222,6 +209,13 @@ impl PeerInfo { } } } + + long_lived_subnets.extend( + self.custody_subnets + .iter() + .map(|&id| Subnet::DataColumn(id)), + ); + long_lived_subnets } @@ -240,6 +234,11 @@ impl PeerInfo { self.custody_subnets.iter() } + /// Returns the number of custody subnets this peer is assigned to. + pub fn custody_subnet_count(&self) -> usize { + self.custody_subnets.len() + } + /// Returns true if the peer is connected to a long-lived subnet. 
pub fn has_long_lived_subnet(&self) -> bool { // Check the meta_data @@ -262,6 +261,17 @@ impl PeerInfo { { return true; } + + // Check if the peer has custody subnets populated and the peer is subscribed to any of + // its custody subnets + let subscribed_to_any_custody_subnets = self + .custody_subnets + .iter() + .any(|subnet_id| self.subnets.contains(&Subnet::DataColumn(*subnet_id))); + if subscribed_to_any_custody_subnets { + return true; + } + false } @@ -318,6 +328,14 @@ impl PeerInfo { ) } + /// Checks if the peer is synced or advanced. + pub fn is_synced_or_advanced(&self) -> bool { + matches!( + self.sync_status, + SyncStatus::Synced { .. } | SyncStatus::Advanced { .. } + ) + } + /// Checks if the status is connected. pub fn is_dialing(&self) -> bool { matches!(self.connection_status, PeerConnectionStatus::Dialing { .. }) @@ -645,3 +663,50 @@ impl From for PeerState { } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::Subnet; + use types::{DataColumnSubnetId, MainnetEthSpec}; + + type E = MainnetEthSpec; + + fn create_test_peer_info() -> PeerInfo { + PeerInfo::default() + } + + #[test] + fn test_has_long_lived_subnet_empty_custody_subnets() { + let peer_info = create_test_peer_info(); + // peer has no custody subnets or subscribed to any subnets hence return false + assert!(!peer_info.has_long_lived_subnet()); + } + + #[test] + fn test_has_long_lived_subnet_empty_subnets_with_custody_subnets() { + let mut peer_info = create_test_peer_info(); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(1)); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(2)); + // Peer has custody subnets but isn't subscribed to any hence return false + assert!(!peer_info.has_long_lived_subnet()); + } + + #[test] + fn test_has_long_lived_subnet_subscribed_to_custody_subnets() { + let mut peer_info = create_test_peer_info(); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(1)); + 
peer_info.custody_subnets.insert(DataColumnSubnetId::new(2)); + peer_info.custody_subnets.insert(DataColumnSubnetId::new(3)); + + peer_info + .subnets + .insert(Subnet::DataColumn(DataColumnSubnetId::new(1))); + peer_info + .subnets + .insert(Subnet::DataColumn(DataColumnSubnetId::new(2))); + // Missing DataColumnSubnetId::new(3) - but peer is subscribed to some custody subnets + // Peer is subscribed to any custody subnets - return true + assert!(peer_info.has_long_lived_subnet()); + } +} diff --git a/beacon_node/lighthouse_network/src/rpc/handler.rs b/beacon_node/lighthouse_network/src/rpc/handler.rs index 972d45cdfe..720895bbe7 100644 --- a/beacon_node/lighthouse_network/src/rpc/handler.rs +++ b/beacon_node/lighthouse_network/src/rpc/handler.rs @@ -39,6 +39,9 @@ const SHUTDOWN_TIMEOUT_SECS: u64 = 15; /// Maximum number of simultaneous inbound substreams we keep for this peer. const MAX_INBOUND_SUBSTREAMS: usize = 32; +/// Timeout that will be used for inbound and outbound responses. +const RESP_TIMEOUT: Duration = Duration::from_secs(10); + /// Identifier of inbound and outbound substreams from the handler's perspective. #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] pub struct SubstreamId(usize); @@ -140,9 +143,6 @@ where /// Waker, to be sure the handler gets polled when needed. waker: Option, - - /// Timeout that will be used for inbound and outbound responses. - resp_timeout: Duration, } enum HandlerState { @@ -224,7 +224,6 @@ where pub fn new( listen_protocol: SubstreamProtocol, ()>, fork_context: Arc, - resp_timeout: Duration, peer_id: PeerId, connection_id: ConnectionId, ) -> Self { @@ -246,7 +245,6 @@ where outbound_io_error_retries: 0, fork_context, waker: None, - resp_timeout, } } @@ -542,8 +540,7 @@ where // If this substream has not ended, we reset the timer. // Each chunk is allowed RESPONSE_TIMEOUT to be sent. 
if let Some(ref delay_key) = info.delay_key { - self.inbound_substreams_delay - .reset(delay_key, self.resp_timeout); + self.inbound_substreams_delay.reset(delay_key, RESP_TIMEOUT); } // The stream may be currently idle. Attempt to process more @@ -712,7 +709,7 @@ where }; substream_entry.max_remaining_chunks = Some(max_remaining_chunks); self.outbound_substreams_delay - .reset(delay_key, self.resp_timeout); + .reset(delay_key, RESP_TIMEOUT); } } @@ -960,7 +957,7 @@ where // Store the stream and tag the output. let delay_key = self .inbound_substreams_delay - .insert(self.current_inbound_substream_id, self.resp_timeout); + .insert(self.current_inbound_substream_id, RESP_TIMEOUT); let awaiting_stream = InboundState::Idle(substream); self.inbound_substreams.insert( self.current_inbound_substream_id, @@ -1036,7 +1033,7 @@ where // new outbound request. Store the stream and tag the output. let delay_key = self .outbound_substreams_delay - .insert(self.current_outbound_substream_id, self.resp_timeout); + .insert(self.current_outbound_substream_id, RESP_TIMEOUT); let awaiting_stream = OutboundSubstreamState::RequestPendingResponse { substream: Box::new(substream), request, diff --git a/beacon_node/lighthouse_network/src/rpc/mod.rs b/beacon_node/lighthouse_network/src/rpc/mod.rs index 5e8e55891c..7c43018af8 100644 --- a/beacon_node/lighthouse_network/src/rpc/mod.rs +++ b/beacon_node/lighthouse_network/src/rpc/mod.rs @@ -16,7 +16,6 @@ use std::collections::HashMap; use std::marker::PhantomData; use std::sync::Arc; use std::task::{Context, Poll}; -use std::time::Duration; use tracing::{debug, trace}; use types::{EthSpec, ForkContext}; @@ -143,12 +142,6 @@ pub struct RPCMessage { type BehaviourAction = ToSwarm, RPCSend>; -pub struct NetworkParams { - pub max_payload_size: usize, - pub ttfb_timeout: Duration, - pub resp_timeout: Duration, -} - /// Implements the libp2p `NetworkBehaviour` trait and therefore manages network-level /// logic. 
pub struct RPC { @@ -162,8 +155,6 @@ pub struct RPC { events: Vec>, fork_context: Arc, enable_light_client_server: bool, - /// Networking constant values - network_params: NetworkParams, /// A sequential counter indicating when data gets modified. seq_number: u64, } @@ -174,7 +165,6 @@ impl RPC { enable_light_client_server: bool, inbound_rate_limiter_config: Option, outbound_rate_limiter_config: Option, - network_params: NetworkParams, seq_number: u64, ) -> Self { let response_limiter = inbound_rate_limiter_config.map(|config| { @@ -194,7 +184,6 @@ impl RPC { events: Vec::new(), fork_context, enable_light_client_server, - network_params, seq_number, } } @@ -331,18 +320,11 @@ where max_rpc_size: self.fork_context.spec.max_payload_size as usize, enable_light_client_server: self.enable_light_client_server, phantom: PhantomData, - ttfb_timeout: self.network_params.ttfb_timeout, }, (), ); - let handler = RPCHandler::new( - protocol, - self.fork_context.clone(), - self.network_params.resp_timeout, - peer_id, - connection_id, - ); + let handler = RPCHandler::new(protocol, self.fork_context.clone(), peer_id, connection_id); Ok(handler) } @@ -361,18 +343,11 @@ where max_rpc_size: self.fork_context.spec.max_payload_size as usize, enable_light_client_server: self.enable_light_client_server, phantom: PhantomData, - ttfb_timeout: self.network_params.ttfb_timeout, }, (), ); - let handler = RPCHandler::new( - protocol, - self.fork_context.clone(), - self.network_params.resp_timeout, - peer_id, - connection_id, - ); + let handler = RPCHandler::new(protocol, self.fork_context.clone(), peer_id, connection_id); Ok(handler) } diff --git a/beacon_node/lighthouse_network/src/rpc/protocol.rs b/beacon_node/lighthouse_network/src/rpc/protocol.rs index 6529ff5f92..228a74f08c 100644 --- a/beacon_node/lighthouse_network/src/rpc/protocol.rs +++ b/beacon_node/lighthouse_network/src/rpc/protocol.rs @@ -11,7 +11,6 @@ use std::marker::PhantomData; use std::sync::{Arc, LazyLock}; use 
std::time::Duration; use strum::{AsRefStr, Display, EnumString, IntoStaticStr}; -use tokio_io_timeout::TimeoutStream; use tokio_util::{ codec::Framed, compat::{Compat, FuturesAsyncReadCompatExt}, @@ -425,7 +424,6 @@ pub struct RPCProtocol { pub max_rpc_size: usize, pub enable_light_client_server: bool, pub phantom: PhantomData, - pub ttfb_timeout: Duration, } impl UpgradeInfo for RPCProtocol { @@ -652,7 +650,7 @@ pub fn rpc_data_column_limits( pub type InboundOutput = (RequestType, InboundFramed); pub type InboundFramed = - Framed>>>, SSZSnappyInboundCodec>; + Framed>>, SSZSnappyInboundCodec>; impl InboundUpgrade for RPCProtocol where @@ -676,10 +674,7 @@ where ), }; - let mut timed_socket = TimeoutStream::new(socket); - timed_socket.set_read_timeout(Some(self.ttfb_timeout)); - - let socket = Framed::new(Box::pin(timed_socket), codec); + let socket = Framed::new(Box::pin(socket), codec); // MetaData requests should be empty, return the stream match versioned_protocol { diff --git a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs index 65cd1c2e61..8b364f506c 100644 --- a/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs +++ b/beacon_node/lighthouse_network/src/rpc/rate_limiter.rs @@ -382,16 +382,41 @@ impl RPCRateLimiter { pub fn prune(&mut self) { let time_since_start = self.init_time.elapsed(); - self.ping_rl.prune(time_since_start); - self.status_rl.prune(time_since_start); - self.metadata_rl.prune(time_since_start); - self.goodbye_rl.prune(time_since_start); - self.bbrange_rl.prune(time_since_start); - self.bbroots_rl.prune(time_since_start); - self.blbrange_rl.prune(time_since_start); - self.blbroot_rl.prune(time_since_start); - self.dcbrange_rl.prune(time_since_start); - self.dcbroot_rl.prune(time_since_start); + + let Self { + prune_interval: _, + init_time: _, + goodbye_rl, + ping_rl, + metadata_rl, + status_rl, + bbrange_rl, + bbroots_rl, + blbrange_rl, + blbroot_rl, + dcbroot_rl, + 
dcbrange_rl, + lc_bootstrap_rl, + lc_optimistic_update_rl, + lc_finality_update_rl, + lc_updates_by_range_rl, + fork_context: _, + } = self; + + goodbye_rl.prune(time_since_start); + ping_rl.prune(time_since_start); + metadata_rl.prune(time_since_start); + status_rl.prune(time_since_start); + bbrange_rl.prune(time_since_start); + bbroots_rl.prune(time_since_start); + blbrange_rl.prune(time_since_start); + blbroot_rl.prune(time_since_start); + dcbrange_rl.prune(time_since_start); + dcbroot_rl.prune(time_since_start); + lc_bootstrap_rl.prune(time_since_start); + lc_optimistic_update_rl.prune(time_since_start); + lc_finality_update_rl.prune(time_since_start); + lc_updates_by_range_rl.prune(time_since_start); } } diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs index eebc2f0200..ea2c53a07f 100644 --- a/beacon_node/lighthouse_network/src/service/mod.rs +++ b/beacon_node/lighthouse_network/src/service/mod.rs @@ -1,5 +1,4 @@ use self::gossip_cache::GossipCache; -use crate::EnrExt; use crate::Eth2Enr; use crate::config::{GossipsubConfigParams, NetworkLoad, gossipsub_config}; use crate::discovery::{ @@ -12,8 +11,8 @@ use crate::peer_manager::{ use crate::peer_manager::{MIN_OUTBOUND_ONLY_FACTOR, PEER_EXCESS_FACTOR, PRIORITY_PEER_EXCESS}; use crate::rpc::methods::MetadataRequest; use crate::rpc::{ - GoodbyeReason, HandlerErr, InboundRequestId, NetworkParams, Protocol, RPC, RPCError, - RPCMessage, RPCReceived, RequestType, ResponseTermination, RpcResponse, RpcSuccessResponse, + GoodbyeReason, HandlerErr, InboundRequestId, Protocol, RPC, RPCError, RPCMessage, RPCReceived, + RequestType, ResponseTermination, RpcResponse, RpcSuccessResponse, }; use crate::types::{ GossipEncoding, GossipKind, GossipTopic, SnappyTransform, Subnet, SubnetDiscovery, @@ -33,6 +32,7 @@ use libp2p::swarm::{NetworkBehaviour, Swarm, SwarmEvent}; use libp2p::upnp::tokio::Behaviour as Upnp; use libp2p::{PeerId, SwarmBuilder, identify}; use 
logging::crit; +use network_utils::enr_ext::EnrExt; use std::num::{NonZeroU8, NonZeroUsize}; use std::path::PathBuf; use std::pin::Pin; @@ -367,17 +367,11 @@ impl Network { (gossipsub, update_gossipsub_scores) }; - let network_params = NetworkParams { - max_payload_size: ctx.chain_spec.max_payload_size as usize, - ttfb_timeout: ctx.chain_spec.ttfb_timeout(), - resp_timeout: ctx.chain_spec.resp_timeout(), - }; let eth2_rpc = RPC::new( ctx.fork_context.clone(), config.enable_light_client_server, config.inbound_rate_limiter_config.clone(), config.outbound_rate_limiter_config.clone(), - network_params, seq_number, ); diff --git a/beacon_node/lighthouse_network/src/types/globals.rs b/beacon_node/lighthouse_network/src/types/globals.rs index bcb4758386..b8c34f8392 100644 --- a/beacon_node/lighthouse_network/src/types/globals.rs +++ b/beacon_node/lighthouse_network/src/types/globals.rs @@ -3,7 +3,8 @@ use super::TopicConfig; use crate::peer_manager::peerdb::PeerDB; use crate::rpc::{MetaData, MetaDataV3}; use crate::types::{BackFillState, SyncState}; -use crate::{Client, Enr, EnrExt, GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use crate::{Client, Enr, GossipTopic, Multiaddr, NetworkConfig, PeerId}; +use network_utils::enr_ext::EnrExt; use parking_lot::RwLock; use std::collections::HashSet; use std::sync::Arc; @@ -250,7 +251,7 @@ impl NetworkGlobals { config: Arc, spec: Arc, ) -> NetworkGlobals { - use crate::CombinedKeyExt; + use network_utils::enr_ext::CombinedKeyExt; let keypair = libp2p::identity::secp256k1::Keypair::generate(); let enr_key: discv5::enr::CombinedKey = discv5::enr::CombinedKey::from_secp256k1(&keypair); let enr = discv5::enr::Enr::builder().build(&enr_key).unwrap(); diff --git a/beacon_node/lighthouse_network/tests/common.rs b/beacon_node/lighthouse_network/tests/common.rs index 6b111cfdc1..8a3047692f 100644 --- a/beacon_node/lighthouse_network/tests/common.rs +++ b/beacon_node/lighthouse_network/tests/common.rs @@ -1,9 +1,9 @@ #![cfg(test)] use 
lighthouse_network::Enr; -use lighthouse_network::EnrExt; use lighthouse_network::Multiaddr; use lighthouse_network::service::Network as LibP2PService; use lighthouse_network::{NetworkConfig, NetworkEvent}; +use network_utils::enr_ext::EnrExt; use std::sync::Arc; use std::sync::Weak; use tokio::runtime::Runtime; diff --git a/beacon_node/lighthouse_tracing/src/lib.rs b/beacon_node/lighthouse_tracing/src/lib.rs index ffbad1364c..18a9874252 100644 --- a/beacon_node/lighthouse_tracing/src/lib.rs +++ b/beacon_node/lighthouse_tracing/src/lib.rs @@ -3,7 +3,9 @@ //! TODO: These span identifiers will be used to implement selective tracing export (to be implemented), //! where only the listed root spans and their descendants will be exported to the tracing backend. -/// Root span name for publish_block +/// Root span names for block production and publishing +pub const SPAN_PRODUCE_BLOCK_V2: &str = "produce_block_v2"; +pub const SPAN_PRODUCE_BLOCK_V3: &str = "produce_block_v3"; pub const SPAN_PUBLISH_BLOCK: &str = "publish_block"; /// Data Availability checker span identifiers @@ -17,11 +19,17 @@ pub const SPAN_PROCESS_GOSSIP_BLOCK: &str = "process_gossip_block"; /// Sync methods root spans pub const SPAN_SYNCING_CHAIN: &str = "syncing_chain"; pub const SPAN_OUTGOING_RANGE_REQUEST: &str = "outgoing_range_request"; +pub const SPAN_SINGLE_BLOCK_LOOKUP: &str = "single_block_lookup"; +pub const SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST: &str = "outgoing_block_by_root_request"; pub const SPAN_OUTGOING_CUSTODY_REQUEST: &str = "outgoing_custody_request"; pub const SPAN_PROCESS_RPC_BLOCK: &str = "process_rpc_block"; pub const SPAN_PROCESS_RPC_BLOBS: &str = "process_rpc_blobs"; pub const SPAN_PROCESS_RPC_CUSTODY_COLUMNS: &str = "process_rpc_custody_columns"; pub const SPAN_PROCESS_CHAIN_SEGMENT: &str = "process_chain_segment"; +pub const SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL: &str = "process_chain_segment_backfill"; + +/// Fork choice root spans +pub const SPAN_RECOMPUTE_HEAD: &str = 
"recompute_head_at_slot"; /// RPC methods root spans pub const SPAN_HANDLE_BLOCKS_BY_RANGE_REQUEST: &str = "handle_blocks_by_range_request"; @@ -40,17 +48,21 @@ pub const SPAN_HANDLE_LIGHT_CLIENT_FINALITY_UPDATE: &str = "handle_light_client_ /// Only these spans and their descendants will be processed to reduce noise from /// uninstrumented code paths. New root spans must be added to this list to be traced. pub const LH_BN_ROOT_SPAN_NAMES: &[&str] = &[ - SPAN_SYNCING_CHAIN, + SPAN_PRODUCE_BLOCK_V2, + SPAN_PRODUCE_BLOCK_V3, + SPAN_PUBLISH_BLOCK, SPAN_PENDING_COMPONENTS, SPAN_PROCESS_GOSSIP_DATA_COLUMN, SPAN_PROCESS_GOSSIP_BLOB, SPAN_PROCESS_GOSSIP_BLOCK, + SPAN_SYNCING_CHAIN, SPAN_OUTGOING_RANGE_REQUEST, - SPAN_OUTGOING_CUSTODY_REQUEST, + SPAN_SINGLE_BLOCK_LOOKUP, SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, SPAN_PROCESS_CHAIN_SEGMENT, + SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_HANDLE_BLOCKS_BY_RANGE_REQUEST, SPAN_HANDLE_BLOBS_BY_RANGE_REQUEST, SPAN_HANDLE_DATA_COLUMNS_BY_RANGE_REQUEST, diff --git a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs index a53e76402e..fa6b5fd243 100644 --- a/beacon_node/network/src/network_beacon_processor/gossip_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/gossip_methods.rs @@ -34,7 +34,6 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use store::hot_cold_store::HotColdDBError; -use tokio::sync::mpsc::error::TrySendError; use tracing::{Instrument, Span, debug, error, info, instrument, trace, warn}; use types::{ Attestation, AttestationData, AttestationRef, AttesterSlashing, BlobSidecar, DataColumnSidecar, @@ -610,7 +609,7 @@ impl NetworkBeaconProcessor { parent = None, level = "debug", skip_all, - fields(slot = ?column_sidecar.slot(), block_root = ?column_sidecar.block_root(), index = column_sidecar.index), + fields(slot = 
%column_sidecar.slot(), block_root = ?column_sidecar.block_root(), index = column_sidecar.index), )] pub async fn process_gossip_data_column_sidecar( self: &Arc, @@ -709,6 +708,7 @@ impl NetworkBeaconProcessor { | GossipDataColumnError::InvalidKzgProof { .. } | GossipDataColumnError::UnexpectedDataColumn | GossipDataColumnError::InvalidColumnIndex(_) + | GossipDataColumnError::MaxBlobsPerBlockExceeded { .. } | GossipDataColumnError::InconsistentCommitmentsLength { .. } | GossipDataColumnError::InconsistentProofsLength { .. } | GossipDataColumnError::NotFinalizedDescendant { .. } => { @@ -840,7 +840,7 @@ impl NetworkBeaconProcessor { } Err(err) => { match err { - GossipBlobError::BlobParentUnknown { parent_root } => { + GossipBlobError::ParentUnknown { parent_root } => { debug!( action = "requesting parent", block_root = %root, @@ -1054,35 +1054,43 @@ impl NetworkBeaconProcessor { "Processed data column, waiting for other components" ); - // Instead of triggering reconstruction immediately, schedule it to be run. If - // another column arrives it either completes availability or pushes - // reconstruction back a bit. - let cloned_self = Arc::clone(self); - let block_root = *block_root; - let send_result = self.beacon_processor_send.try_send(WorkEvent { - drop_during_sync: false, - work: Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction( - QueuedColumnReconstruction { - block_root, - process_fn: Box::pin(async move { - cloned_self - .attempt_data_column_reconstruction(block_root, true) - .await; - }), - }, - )), - }); - if let Err(TrySendError::Full(WorkEvent { - work: - Work::Reprocess(ReprocessQueueMessage::DelayColumnReconstruction( - reconstruction, - )), - .. - })) = send_result + if self + .chain + .data_availability_checker + .custody_context() + .should_attempt_reconstruction( + slot.epoch(T::EthSpec::slots_per_epoch()), + &self.chain.spec, + ) { - warn!("Unable to send reconstruction to reprocessing"); - // Execute it immediately instead. 
- reconstruction.process_fn.await; + // Instead of triggering reconstruction immediately, schedule it to be run. If + // another column arrives, it either completes availability or pushes + // reconstruction back a bit. + let cloned_self = Arc::clone(self); + let block_root = *block_root; + + if self + .beacon_processor_send + .try_send(WorkEvent { + drop_during_sync: false, + work: Work::Reprocess( + ReprocessQueueMessage::DelayColumnReconstruction( + QueuedColumnReconstruction { + block_root, + slot: *slot, + process_fn: Box::pin(async move { + cloned_self + .attempt_data_column_reconstruction(block_root) + .await; + }), + }, + ), + ), + }) + .is_err() + { + warn!("Unable to send reconstruction to reprocessing"); + } } } }, @@ -1493,11 +1501,12 @@ impl NetworkBeaconProcessor { let result = self .chain - .process_block_with_early_caching( + .process_block( block_root, verified_block, - BlockImportSource::Gossip, NotifyExecutionLayer::Yes, + BlockImportSource::Gossip, + || Ok(()), ) .await; register_process_result_metrics(&result, metrics::BlockSource::Gossip, "block"); diff --git a/beacon_node/network/src/network_beacon_processor/mod.rs b/beacon_node/network/src/network_beacon_processor/mod.rs index 73349cd431..85ccde1d59 100644 --- a/beacon_node/network/src/network_beacon_processor/mod.rs +++ b/beacon_node/network/src/network_beacon_processor/mod.rs @@ -6,9 +6,7 @@ use beacon_chain::data_column_verification::{GossipDataColumnError, observe_goss use beacon_chain::fetch_blobs::{ EngineGetBlobsOutput, FetchEngineBlobError, fetch_and_process_engine_blobs, }; -use beacon_chain::{ - AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError, NotifyExecutionLayer, -}; +use beacon_chain::{AvailabilityProcessingStatus, BeaconChain, BeaconChainTypes, BlockError}; use beacon_processor::{ BeaconProcessorSend, DuplicateCache, GossipAggregatePackage, GossipAttestationPackage, Work, WorkEvent as BeaconWorkEvent, @@ -28,7 +26,7 @@ use std::sync::Arc; use 
std::time::Duration; use task_executor::TaskExecutor; use tokio::sync::mpsc::{self, error::TrySendError}; -use tracing::{debug, error, trace, warn}; +use tracing::{debug, error, instrument, trace, warn}; use types::*; pub use sync_methods::ChainSegmentProcessId; @@ -500,33 +498,23 @@ impl NetworkBeaconProcessor { process_id: ChainSegmentProcessId, blocks: Vec>, ) -> Result<(), Error> { - let is_backfill = matches!(&process_id, ChainSegmentProcessId::BackSyncBatchId { .. }); debug!(blocks = blocks.len(), id = ?process_id, "Batch sending for process"); - let processor = self.clone(); - let process_fn = async move { - let notify_execution_layer = if processor - .network_globals - .sync_state - .read() - .is_syncing_finalized() - { - NotifyExecutionLayer::No - } else { - NotifyExecutionLayer::Yes - }; - processor - .process_chain_segment(process_id, blocks, notify_execution_layer) - .await; - }; - let process_fn = Box::pin(process_fn); // Back-sync batches are dispatched with a different `Work` variant so // they can be rate-limited. - let work = if is_backfill { - Work::ChainSegmentBackfill(process_fn) - } else { - Work::ChainSegment(process_fn) + let work = match process_id { + ChainSegmentProcessId::RangeBatchId(_, _) => { + let process_fn = async move { + processor.process_chain_segment(process_id, blocks).await; + }; + Work::ChainSegment(Box::pin(process_fn)) + } + ChainSegmentProcessId::BackSyncBatchId(_) => { + let process_fn = + move || processor.process_chain_segment_backfill(process_id, blocks); + Work::ChainSegmentBackfill(Box::new(process_fn)) + } }; self.try_send(BeaconWorkEvent { @@ -825,36 +813,15 @@ impl NetworkBeaconProcessor { } } - /// Attempt to reconstruct all data columns if the following conditions satisfies: - /// - Our custody requirement is all columns - /// - We have >= 50% of columns, but not all columns - /// - /// Returns `Some(AvailabilityProcessingStatus)` if reconstruction is successfully performed, - /// otherwise returns `None`. 
- /// - /// The `publish_columns` parameter controls whether reconstructed columns should be published - /// to the gossip network. - async fn attempt_data_column_reconstruction( - self: &Arc, - block_root: Hash256, - publish_columns: bool, - ) -> Option { - // Only supernodes attempt reconstruction - if !self - .chain - .data_availability_checker - .custody_context() - .current_is_supernode - { - return None; - } - + /// Attempts to reconstruct all data columns if the conditions checked in + /// [`DataAvailabilityCheckerInner::check_and_set_reconstruction_started`] are satisfied. + #[instrument(level = "debug", skip_all, fields(?block_root))] + async fn attempt_data_column_reconstruction(self: &Arc, block_root: Hash256) { let result = self.chain.reconstruct_data_columns(block_root).await; + match result { Ok(Some((availability_processing_status, data_columns_to_publish))) => { - if publish_columns { - self.publish_data_columns_gradually(data_columns_to_publish, block_root); - } + self.publish_data_columns_gradually(data_columns_to_publish, block_root); match &availability_processing_status { AvailabilityProcessingStatus::Imported(hash) => { debug!( @@ -867,21 +834,18 @@ impl NetworkBeaconProcessor { AvailabilityProcessingStatus::MissingComponents(_, _) => { debug!( result = "imported all custody columns", - block_hash = %block_root, + %block_root, "Block components still missing block after reconstruction" ); } } - - Some(availability_processing_status) } Ok(None) => { // reason is tracked via the `KZG_DATA_COLUMN_RECONSTRUCTION_INCOMPLETE_TOTAL` metric trace!( - block_hash = %block_root, + %block_root, "Reconstruction not required for block" ); - None } Err(e) => { error!( @@ -889,7 +853,6 @@ impl NetworkBeaconProcessor { error = ?e, "Error during data column reconstruction" ); - None } } } @@ -978,6 +941,7 @@ impl NetworkBeaconProcessor { /// by some nodes on the network as soon as possible. 
Our hope is that some columns arrive from /// other nodes in the meantime, obviating the need for us to publish them. If no other /// publisher exists for a column, it will eventually get published here. + #[instrument(level="debug", skip_all, fields(?block_root, data_column_count=data_columns_to_publish.len()))] fn publish_data_columns_gradually( self: &Arc, mut data_columns_to_publish: DataColumnSidecarList, diff --git a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs index 9ddba86b81..58e02ffe00 100644 --- a/beacon_node/network/src/network_beacon_processor/rpc_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/rpc_methods.rs @@ -437,12 +437,12 @@ impl NetworkBeaconProcessor { } } Err(e) => { - // TODO(das): lower log level when feature is stabilized - error!( + // The node is expected to be able to serve these columns, but it fails to retrieve them. + warn!( block_root = ?data_column_ids_by_root.block_root, %peer_id, error = ?e, - "Error getting data column" + "Error getting data column for by root request " ); return Err((RpcErrorResponse::ServerError, "Error getting data column")); } diff --git a/beacon_node/network/src/network_beacon_processor/sync_methods.rs b/beacon_node/network/src/network_beacon_processor/sync_methods.rs index f24495cc54..1d99540c29 100644 --- a/beacon_node/network/src/network_beacon_processor/sync_methods.rs +++ b/beacon_node/network/src/network_beacon_processor/sync_methods.rs @@ -19,9 +19,10 @@ use beacon_processor::{ use beacon_processor::{Work, WorkEvent}; use lighthouse_network::PeerAction; use lighthouse_tracing::{ - SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_RPC_BLOBS, SPAN_PROCESS_RPC_BLOCK, - SPAN_PROCESS_RPC_CUSTODY_COLUMNS, + SPAN_PROCESS_CHAIN_SEGMENT, SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, SPAN_PROCESS_RPC_BLOBS, + SPAN_PROCESS_RPC_BLOCK, SPAN_PROCESS_RPC_CUSTODY_COLUMNS, }; +use logging::crit; use std::sync::Arc; use 
std::time::Duration; use store::KzgCommitment; @@ -167,11 +168,12 @@ impl NetworkBeaconProcessor { let signed_beacon_block = block.block_cloned(); let result = self .chain - .process_block_with_early_caching( + .process_block( block_root, block, - BlockImportSource::Lookup, NotifyExecutionLayer::Yes, + BlockImportSource::Lookup, + || Ok(()), ) .await; register_process_result_metrics(&result, metrics::BlockSource::Rpc, "block"); @@ -331,14 +333,8 @@ impl NetworkBeaconProcessor { "Blobs have already been imported" ); } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - %slot, - "Error when importing rpc blobs" - ); - } + // Errors are handled and logged in `block_lookups` + Err(_) => {} } // Sync handles these results @@ -383,7 +379,7 @@ impl NetworkBeaconProcessor { "RPC custody data columns received" ); - let mut result = self + let result = self .chain .process_rpc_custody_columns(custody_columns) .await; @@ -404,17 +400,6 @@ impl NetworkBeaconProcessor { block_hash = %block_root, "Missing components over rpc" ); - // Attempt reconstruction here before notifying sync, to avoid sending out more requests - // that we may no longer need. - // We don't publish columns reconstructed from rpc columns to the gossip network, - // as these are likely historic columns. 
- let publish_columns = false; - if let Some(availability) = self - .attempt_data_column_reconstruction(block_root, publish_columns) - .await - { - result = Ok(availability) - } } }, Err(BlockError::DuplicateFullyImported(_)) => { @@ -423,13 +408,8 @@ impl NetworkBeaconProcessor { "Custody columns have already been imported" ); } - Err(e) => { - warn!( - error = ?e, - block_hash = %block_root, - "Error when importing rpc custody columns" - ); - } + // Errors are handled and logged in `block_lookups` + Err(_) => {} } self.send_sync_message(SyncMessage::BlockComponentProcessed { @@ -445,27 +425,42 @@ impl NetworkBeaconProcessor { parent = None, level = "debug", skip_all, - fields(sync_type = ?sync_type, downloaded_blocks = downloaded_blocks.len()) + fields(process_id = ?process_id, downloaded_blocks = downloaded_blocks.len()) )] pub async fn process_chain_segment( &self, - sync_type: ChainSegmentProcessId, + process_id: ChainSegmentProcessId, downloaded_blocks: Vec>, - notify_execution_layer: NotifyExecutionLayer, ) { - let result = match sync_type { - // this a request from the range sync - ChainSegmentProcessId::RangeBatchId(chain_id, epoch) => { - let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); - let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); - let sent_blocks = downloaded_blocks.len(); + let ChainSegmentProcessId::RangeBatchId(chain_id, epoch) = process_id else { + // This is a request from range sync, this should _never_ happen + crit!( + error = "process_chain_segment called on a variant other than RangeBatchId", + "Please notify the devs" + ); + return; + }; - match self - .process_blocks(downloaded_blocks.iter(), notify_execution_layer) - .await - { - (imported_blocks, Ok(_)) => { - debug!( + let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); + let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); + let sent_blocks = downloaded_blocks.len(); + let notify_execution_layer = if self + 
.network_globals + .sync_state + .read() + .is_syncing_finalized() + { + NotifyExecutionLayer::No + } else { + NotifyExecutionLayer::Yes + }; + + let result = match self + .process_blocks(downloaded_blocks.iter(), notify_execution_layer) + .await + { + (imported_blocks, Ok(_)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, chain = chain_id, @@ -473,13 +468,13 @@ impl NetworkBeaconProcessor { processed_blocks = sent_blocks, service= "sync", "Batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } - } - (imported_blocks, Err(e)) => { - debug!( + BatchProcessResult::Success { + sent_blocks, + imported_blocks, + } + } + (imported_blocks, Err(e)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, chain = chain_id, @@ -488,33 +483,61 @@ impl NetworkBeaconProcessor { error = %e.message, service = "sync", "Batch processing failed"); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks, - penalty, - }, - None => BatchProcessResult::NonFaultyFailure, - } - } + match e.peer_action { + Some(penalty) => BatchProcessResult::FaultyFailure { + imported_blocks, + penalty, + }, + None => BatchProcessResult::NonFaultyFailure, } } - // this a request from the Backfill sync - ChainSegmentProcessId::BackSyncBatchId(epoch) => { - let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); - let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); - let sent_blocks = downloaded_blocks.len(); - let n_blobs = downloaded_blocks - .iter() - .map(|wrapped| wrapped.n_blobs()) - .sum::(); - let n_data_columns = downloaded_blocks - .iter() - .map(|wrapped| wrapped.n_data_columns()) - .sum::(); + }; - match self.process_backfill_blocks(downloaded_blocks) { - (imported_blocks, Ok(_)) => { - debug!( + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: process_id, + result, + }); + } + + /// Attempt to import the chain segment (`blocks`) to the beacon 
chain, informing the sync + /// thread if more blocks are needed to process it. + #[instrument( + name = SPAN_PROCESS_CHAIN_SEGMENT_BACKFILL, + parent = None, + level = "debug", + skip_all, + fields(downloaded_blocks = downloaded_blocks.len()) + )] + pub fn process_chain_segment_backfill( + &self, + process_id: ChainSegmentProcessId, + downloaded_blocks: Vec>, + ) { + let ChainSegmentProcessId::BackSyncBatchId(epoch) = process_id else { + // this a request from RangeSync, this should _never_ happen + crit!( + error = + "process_chain_segment_backfill called on a variant other than BackSyncBatchId", + "Please notify the devs" + ); + return; + }; + + let start_slot = downloaded_blocks.first().map(|b| b.slot().as_u64()); + let end_slot = downloaded_blocks.last().map(|b| b.slot().as_u64()); + let sent_blocks = downloaded_blocks.len(); + let n_blobs = downloaded_blocks + .iter() + .map(|wrapped| wrapped.n_blobs()) + .sum::(); + let n_data_columns = downloaded_blocks + .iter() + .map(|wrapped| wrapped.n_data_columns()) + .sum::(); + + let result = match self.process_backfill_blocks(downloaded_blocks) { + (imported_blocks, Ok(_)) => { + debug!( batch_epoch = %epoch, first_block_slot = start_slot, keep_execution_payload = !self.chain.store.get_config().prune_payloads, @@ -524,34 +547,35 @@ impl NetworkBeaconProcessor { processed_data_columns = n_data_columns, service= "sync", "Backfill batch processed"); - BatchProcessResult::Success { - sent_blocks, - imported_blocks, - } - } - (_, Err(e)) => { - debug!( - batch_epoch = %epoch, - first_block_slot = start_slot, - last_block_slot = end_slot, - processed_blobs = n_blobs, - error = %e.message, - service = "sync", - "Backfill batch processing failed" - ); - match e.peer_action { - Some(penalty) => BatchProcessResult::FaultyFailure { - imported_blocks: 0, - penalty, - }, - None => BatchProcessResult::NonFaultyFailure, - } - } + BatchProcessResult::Success { + sent_blocks, + imported_blocks, + } + } + (_, Err(e)) => { + debug!( 
+ batch_epoch = %epoch, + first_block_slot = start_slot, + last_block_slot = end_slot, + processed_blobs = n_blobs, + error = %e.message, + service = "sync", + "Backfill batch processing failed" + ); + match e.peer_action { + Some(penalty) => BatchProcessResult::FaultyFailure { + imported_blocks: 0, + penalty, + }, + None => BatchProcessResult::NonFaultyFailure, } } }; - self.send_sync_message(SyncMessage::BatchProcessed { sync_type, result }); + self.send_sync_message(SyncMessage::BatchProcessed { + sync_type: process_id, + result, + }); } /// Helper function to process blocks batches which only consumes the chain and blocks to process. diff --git a/beacon_node/network/src/network_beacon_processor/tests.rs b/beacon_node/network/src/network_beacon_processor/tests.rs index 2027a525e6..4137c974bf 100644 --- a/beacon_node/network/src/network_beacon_processor/tests.rs +++ b/beacon_node/network/src/network_beacon_processor/tests.rs @@ -94,12 +94,20 @@ impl TestRig { // This allows for testing voluntary exits without building out a massive chain. let mut spec = test_spec::(); spec.shard_committee_period = 2; - Self::new_parametric(chain_length, BeaconProcessorConfig::default(), spec).await + Self::new_parametric(chain_length, BeaconProcessorConfig::default(), false, spec).await + } + + pub async fn new_supernode(chain_length: u64) -> Self { + // This allows for testing voluntary exits without building out a massive chain. 
+ let mut spec = test_spec::(); + spec.shard_committee_period = 2; + Self::new_parametric(chain_length, BeaconProcessorConfig::default(), true, spec).await } pub async fn new_parametric( chain_length: u64, beacon_processor_config: BeaconProcessorConfig, + import_data_columns: bool, spec: ChainSpec, ) -> Self { let spec = Arc::new(spec); @@ -108,6 +116,7 @@ impl TestRig { .deterministic_keypairs(VALIDATOR_COUNT) .fresh_ephemeral_store() .mock_execution_layer() + .import_all_data_columns(import_data_columns) .chain_config(<_>::default()) .build(); @@ -449,10 +458,10 @@ impl TestRig { .unwrap(); } - pub fn enqueue_backfill_batch(&self) { + pub fn enqueue_backfill_batch(&self, epoch: Epoch) { self.network_beacon_processor .send_chain_segment( - ChainSegmentProcessId::BackSyncBatchId(Epoch::default()), + ChainSegmentProcessId::BackSyncBatchId(epoch), Vec::default(), ) .unwrap(); @@ -597,10 +606,46 @@ impl TestRig { } pub async fn assert_event_journal(&mut self, expected: &[&str]) { - self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT) + self.assert_event_journal_with_timeout(expected, STANDARD_TIMEOUT, false, false) .await } + pub async fn assert_event_journal_completes_with_timeout( + &mut self, + expected: &[WorkType], + timeout: Duration, + ) { + self.assert_event_journal_with_timeout( + &expected + .iter() + .map(Into::<&'static str>::into) + .chain(std::iter::once(WORKER_FREED)) + .chain(std::iter::once(NOTHING_TO_DO)) + .collect::>(), + timeout, + false, + false, + ) + .await + } + + pub async fn assert_event_journal_does_not_complete_with_timeout( + &mut self, + expected: &[WorkType], + timeout: Duration, + ) { + self.assert_not_in_event_journal_with_timeout( + &expected + .iter() + .map(Into::<&'static str>::into) + .chain(std::iter::once(WORKER_FREED)) + .chain(std::iter::once(NOTHING_TO_DO)) + .collect::>(), + timeout, + ) + .await + } + pub async fn assert_event_journal_completes(&mut self, expected: &[WorkType]) { self.assert_event_journal( 
&expected @@ -623,11 +668,21 @@ impl TestRig { &mut self, expected: &[&str], timeout: Duration, + ignore_worker_freed: bool, + ignore_nothing_to_do: bool, ) { let mut events = Vec::with_capacity(expected.len()); let drain_future = async { while let Some(event) = self.work_journal_rx.recv().await { + if event == WORKER_FREED && ignore_worker_freed { + continue; + } + + if event == NOTHING_TO_DO && ignore_nothing_to_do { + continue; + } + events.push(event); // Break as soon as we collect the desired number of events. @@ -651,6 +706,37 @@ impl TestRig { assert_eq!(events, expected); } + /// Assert that the `BeaconProcessor` event journal is not as `expected`. + pub async fn assert_not_in_event_journal_with_timeout( + &mut self, + expected: &[&str], + timeout: Duration, + ) { + let mut events = Vec::with_capacity(expected.len()); + + let drain_future = async { + while let Some(event) = self.work_journal_rx.recv().await { + events.push(event); + + // Break as soon as we collect the desired number of events. + if events.len() >= expected.len() { + break; + } + } + }; + + // Panic if we don't time out. + tokio::select! { + _ = tokio::time::sleep(timeout) => {}, + _ = drain_future => panic!( + "Got events before timeout. Expected no events but got {:?}", + events + ), + } + + assert_ne!(events, expected); + } + /// Listen for network messages and collect them for a specified duration or until reaching a count. /// /// Returns None if no messages were received, or Some(Vec) containing the received messages. @@ -743,6 +829,159 @@ fn junk_message_id() -> MessageId { MessageId::new(&[]) } +// Test that column reconstruction is delayed for columns that arrive +// at the beginning of the slot. 
+#[tokio::test] +async fn data_column_reconstruction_at_slot_start() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + if num_data_columns > 0 { + // Reconstruction is delayed by 100ms, we should not be able to complete + // reconstruction up to this point + rig.assert_event_journal_does_not_complete_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(100), + ) + .await; + + // We've waited at least 150ms, reconstruction can now be triggered + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(200), + ) + .await; + } +} + +// Test that column reconstruction happens immediately for columns that arrive at the +// reconstruction deadline. 
+#[tokio::test] +async fn data_column_reconstruction_at_deadline() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + // We push the slot clock to 3 seconds into the slot, this is the deadline to trigger reconstruction. + rig.chain + .slot_clock + .set_current_time(slot_start + Duration::from_secs(3)); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + // Since we're at the reconstruction deadline, reconstruction should be triggered immediately + if num_data_columns > 0 { + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(50), + ) + .await; + } +} + +// Test the column reconstruction is delayed for columns that arrive for a previous slot. +#[tokio::test] +async fn data_column_reconstruction_at_next_slot() { + if test_spec::().fulu_fork_epoch.is_none() { + return; + }; + + let mut rig = TestRig::new_supernode(SMALL_CHAIN).await; + + let slot_start = rig + .chain + .slot_clock + .start_of(rig.next_block.slot()) + .unwrap(); + + rig.chain + .slot_clock + .set_current_time(slot_start - rig.chain.spec.maximum_gossip_clock_disparity()); + + assert_eq!( + rig.chain.slot().unwrap(), + rig.next_block.slot() - 1, + "chain should be at the correct slot" + ); + + // We push the slot clock to the next slot. 
+ rig.chain + .slot_clock + .set_current_time(slot_start + Duration::from_secs(12)); + + let num_data_columns = rig.next_data_columns.as_ref().map(|c| c.len()).unwrap_or(0); + for i in 0..num_data_columns { + rig.enqueue_gossip_data_columns(i); + rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) + .await; + } + + if num_data_columns > 0 { + // Since we are in the next slot reconstruction for the previous slot should be delayed again + rig.assert_event_journal_does_not_complete_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(100), + ) + .await; + + // We've waited at least 150ms, reconstruction can now be triggered + rig.assert_event_journal_completes_with_timeout( + &[WorkType::ColumnReconstruction], + Duration::from_millis(200), + ) + .await; + } +} + /// Blocks that arrive early should be queued for later processing. #[tokio::test] async fn import_gossip_block_acceptably_early() { @@ -782,10 +1021,6 @@ async fn import_gossip_block_acceptably_early() { rig.assert_event_journal_completes(&[WorkType::GossipDataColumnSidecar]) .await; } - if num_data_columns > 0 { - rig.assert_event_journal_completes(&[WorkType::ColumnReconstruction]) - .await; - } // Note: this section of the code is a bit race-y. 
We're assuming that we can set the slot clock // and check the head in the time between the block arrived early and when its due for @@ -1161,6 +1396,8 @@ async fn requeue_unknown_block_gossip_attestation_without_import() { NOTHING_TO_DO, ], Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY, + false, + false, ) .await; @@ -1201,6 +1438,8 @@ async fn requeue_unknown_block_gossip_aggregated_attestation_without_import() { NOTHING_TO_DO, ], Duration::from_secs(1) + QUEUED_ATTESTATION_DELAY, + false, + false, ) .await; @@ -1335,8 +1574,8 @@ async fn test_backfill_sync_processing() { // (not straight forward to manipulate `TestingSlotClock` due to cloning of `SlotClock` in code) // and makes the test very slow, hence timing calculation is unit tested separately in // `work_reprocessing_queue`. - for _ in 0..1 { - rig.enqueue_backfill_batch(); + for i in 0..1 { + rig.enqueue_backfill_batch(Epoch::new(i)); // ensure queued batch is not processed until later rig.assert_no_events_for(Duration::from_millis(100)).await; // A new batch should be processed within a slot. 
@@ -1347,6 +1586,8 @@ async fn test_backfill_sync_processing() { NOTHING_TO_DO, ], rig.chain.slot_clock.slot_duration(), + false, + false, ) .await; } @@ -1359,11 +1600,16 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { enable_backfill_rate_limiting: false, ..Default::default() }; - let mut rig = - TestRig::new_parametric(SMALL_CHAIN, beacon_processor_config, test_spec::()).await; + let mut rig = TestRig::new_parametric( + SMALL_CHAIN, + beacon_processor_config, + false, + test_spec::(), + ) + .await; - for _ in 0..3 { - rig.enqueue_backfill_batch(); + for i in 0..3 { + rig.enqueue_backfill_batch(Epoch::new(i)); } // ensure all batches are processed @@ -1374,6 +1620,8 @@ async fn test_backfill_sync_processing_rate_limiting_disabled() { WorkType::ChainSegmentBackfill.into(), ], Duration::from_millis(100), + true, + true, ) .await; } diff --git a/beacon_node/network/src/service.rs b/beacon_node/network/src/service.rs index c97206ea87..4bd649ba82 100644 --- a/beacon_node/network/src/service.rs +++ b/beacon_node/network/src/service.rs @@ -840,6 +840,7 @@ impl NetworkService { new_fork = ?new_fork_name, "Transitioned to new fork" ); + new_fork_name.fork_ascii(); } fork_context.update_current_fork(*new_fork_name, new_fork_digest, current_epoch); diff --git a/beacon_node/network/src/subnet_service/attestation_subnets.rs b/beacon_node/network/src/subnet_service/attestation_subnets.rs deleted file mode 100644 index 0da27c6a21..0000000000 --- a/beacon_node/network/src/subnet_service/attestation_subnets.rs +++ /dev/null @@ -1,681 +0,0 @@ -//! This service keeps track of which shard subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to shard subnets, requests peer discoveries and -//! determines whether attestations should be aggregated and/or passed to the beacon node. 
- -use super::SubnetServiceMessage; -use std::collections::HashSet; -use std::collections::{HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::{HashMapDelay, HashSetDelay}; -use futures::prelude::*; -use lighthouse_network::{discv5::enr::NodeId, NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use tracing::{debug, error, info, trace, warn}; -use types::{Attestation, EthSpec, Slot, SubnetId, ValidatorSubscription}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -pub(crate) const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; -/// The fraction of a slot that we subscribe to a subnet before the required slot. -/// -/// Currently a whole slot ahead. -const ADVANCE_SUBSCRIBE_SLOT_FRACTION: u32 = 1; - -/// The number of slots after an aggregator duty where we remove the entry from -/// `aggregate_validators_on_subnet` delay map. -const UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY: u32 = 2; - -#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] -pub(crate) enum SubscriptionKind { - /// Long lived subscriptions. - /// - /// These have a longer duration and are advertised in our ENR. - LongLived, - /// Short lived subscriptions. - /// - /// Subscribing to these subnets has a short duration and we don't advertise it in our ENR. - ShortLived, -} - -/// A particular subnet at a given slot. -#[derive(PartialEq, Eq, Hash, Clone, Debug, Copy)] -pub struct ExactSubnet { - /// The `SubnetId` associated with this subnet. - pub subnet_id: SubnetId, - /// The `Slot` associated with this subnet. - pub slot: Slot, -} - -pub struct AttestationService { - /// Queued events to return to the driving service. 
- events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// Subnets we are currently subscribed to as short lived subscriptions. - /// - /// Once they expire, we unsubscribe from these. - /// We subscribe to subnets when we are an aggregator for an exact subnet. - short_lived_subscriptions: HashMapDelay, - - /// Subnets we are currently subscribed to as long lived subscriptions. - /// - /// We advertise these in our ENR. When these expire, the subnet is removed from our ENR. - /// These are required of all beacon nodes. The exact number is determined by the chain - /// specification. - long_lived_subscriptions: HashSet, - - /// Short lived subscriptions that need to be executed in the future. - scheduled_short_lived_subscriptions: HashSetDelay, - - /// A collection timeouts to track the existence of aggregate validator subscriptions at an - /// `ExactSubnet`. - aggregate_validators_on_subnet: Option>, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Our Discv5 node_id. - node_id: NodeId, - - /// Future used to manage subscribing and unsubscribing from long lived subnets. - next_long_lived_subscription_event: Pin>, - - /// Whether this node is a block proposer-only node. - proposer_only: bool, -} - -impl AttestationService { - /* Public functions */ - - /// Establish the service based on the passed configuration. 
- pub fn new(beacon_chain: Arc>, node_id: NodeId, config: &NetworkConfig) -> Self { - let slot_duration = beacon_chain.slot_clock.slot_duration(); - - if config.subscribe_all_subnets { - info!("Subscribing to all subnets"); - } else { - info!( - subnets_per_node = beacon_chain.spec.subnets_per_node, - subscription_duration_in_epochs = beacon_chain.spec.epochs_per_subnet_subscription, - "Deterministic long lived subnets enabled" - ); - } - - let track_validators = !config.import_all_attestations; - let aggregate_validators_on_subnet = - track_validators.then(|| HashSetDelay::new(slot_duration)); - let mut service = AttestationService { - events: VecDeque::with_capacity(10), - beacon_chain, - short_lived_subscriptions: HashMapDelay::new(slot_duration), - long_lived_subscriptions: HashSet::default(), - scheduled_short_lived_subscriptions: HashSetDelay::default(), - aggregate_validators_on_subnet, - waker: None, - discovery_disabled: config.disable_discovery, - subscribe_all_subnets: config.subscribe_all_subnets, - node_id, - next_long_lived_subscription_event: { - // Set a dummy sleep. Calculating the current subnet subscriptions will update this - // value with a smarter timing - Box::pin(tokio::time::sleep(Duration::from_secs(1))) - }, - proposer_only: config.proposer_only, - }; - - // If we are not subscribed to all subnets, handle the deterministic set of subnets - if !config.subscribe_all_subnets { - service.recompute_long_lived_subnets(); - } - - service - } - - /// Return count of all currently subscribed subnets (long-lived **and** short-lived). - #[cfg(test)] - pub fn subscription_count(&self) -> usize { - if self.subscribe_all_subnets { - self.beacon_chain.spec.attestation_subnet_count as usize - } else { - let count = self - .short_lived_subscriptions - .keys() - .chain(self.long_lived_subscriptions.iter()) - .collect::>() - .len(); - count - } - } - - /// Returns whether we are subscribed to a subnet for testing purposes. 
- #[cfg(test)] - pub(crate) fn is_subscribed( - &self, - subnet_id: &SubnetId, - subscription_kind: SubscriptionKind, - ) -> bool { - match subscription_kind { - SubscriptionKind::LongLived => self.long_lived_subscriptions.contains(subnet_id), - SubscriptionKind::ShortLived => self.short_lived_subscriptions.contains_key(subnet_id), - } - } - - #[cfg(test)] - pub(crate) fn long_lived_subscriptions(&self) -> &HashSet { - &self.long_lived_subscriptions - } - - /// Processes a list of validator subscriptions. - /// - /// This will: - /// - Register new validators as being known. - /// - Search for peers for required subnets. - /// - Request subscriptions for subnets on specific slots when required. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: impl Iterator, - ) -> Result<(), String> { - // If the node is in a proposer-only state, we ignore all subnet subscriptions. - if self.proposer_only { - return Ok(()); - } - - // Maps each subnet_id subscription to it's highest slot - let mut subnets_to_discover: HashMap = HashMap::new(); - - // Registers the validator with the attestation service. - for subscription in subscriptions { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_REQUESTS); - - trace!(?subscription, "Validator subscription"); - - // Compute the subnet that is associated with this subscription - let subnet_id = match SubnetId::compute_subnet::( - subscription.slot, - subscription.attestation_committee_index, - subscription.committee_count_at_slot, - &self.beacon_chain.spec, - ) { - Ok(subnet_id) => subnet_id, - Err(e) => { - warn!( - error = ?e, - "Failed to compute subnet id for validator subscription" - ); - continue; - } - }; - // Ensure each subnet_id inserted into the map has the highest slot as it's value. 
- // Higher slot corresponds to higher min_ttl in the `SubnetDiscovery` entry. - if let Some(slot) = subnets_to_discover.get(&subnet_id) { - if subscription.slot > *slot { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - } else if !self.discovery_disabled { - subnets_to_discover.insert(subnet_id, subscription.slot); - } - - let exact_subnet = ExactSubnet { - subnet_id, - slot: subscription.slot, - }; - - // Determine if the validator is an aggregator. If so, we subscribe to the subnet and - // if successful add the validator to a mapping of known aggregators for that exact - // subnet. - - if subscription.is_aggregator { - metrics::inc_counter(&metrics::SUBNET_SUBSCRIPTION_AGGREGATOR_REQUESTS); - if let Err(e) = self.subscribe_to_short_lived_subnet(exact_subnet) { - warn!(error = e, "Subscription to subnet error"); - } else { - trace!(?exact_subnet, "Subscribed to subnet for aggregator duties"); - } - } - } - - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. - if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request( - subnets_to_discover - .into_iter() - .map(|(subnet_id, slot)| ExactSubnet { subnet_id, slot }), - ) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - Ok(()) - } - - fn recompute_long_lived_subnets(&mut self) { - // Ensure the next computation is scheduled even if assigning subnets fails. - let next_subscription_event = self - .recompute_long_lived_subnets_inner() - .unwrap_or_else(|_| self.beacon_chain.slot_clock.slot_duration()); - - debug!("Recomputing deterministic long lived subnets"); - self.next_long_lived_subscription_event = - Box::pin(tokio::time::sleep(next_subscription_event)); - - if let Some(waker) = self.waker.as_ref() { - waker.wake_by_ref(); - } - } - - /// Gets the long lived subnets the node should be subscribed to during the current epoch and - /// the remaining duration for which they remain valid. 
- fn recompute_long_lived_subnets_inner(&mut self) -> Result { - let current_epoch = self.beacon_chain.epoch().map_err(|e| { - if !self - .beacon_chain - .slot_clock - .is_prior_to_genesis() - .unwrap_or(false) - { - error!(err = ?e,"Failed to get the current epoch from clock") - } - })?; - - let (subnets, next_subscription_epoch) = SubnetId::compute_subnets_for_epoch::( - self.node_id.raw(), - current_epoch, - &self.beacon_chain.spec, - ) - .map_err(|e| error!(err = e, "Could not compute subnets for current epoch"))?; - - let next_subscription_slot = - next_subscription_epoch.start_slot(T::EthSpec::slots_per_epoch()); - let next_subscription_event = self - .beacon_chain - .slot_clock - .duration_to_slot(next_subscription_slot) - .ok_or_else(|| { - error!("Failed to compute duration to next to long lived subscription event") - })?; - - self.update_long_lived_subnets(subnets.collect()); - - Ok(next_subscription_event) - } - - /// Updates the long lived subnets. - /// - /// New subnets are registered as subscribed, removed subnets as unsubscribed and the Enr - /// updated accordingly. - fn update_long_lived_subnets(&mut self, mut subnets: HashSet) { - info!(subnets = ?subnets.iter().collect::>(),"Subscribing to long-lived subnets"); - for subnet in &subnets { - // Add the events for those subnets that are new as long lived subscriptions. - if !self.long_lived_subscriptions.contains(subnet) { - // Check if this subnet is new and send the subscription event if needed. 
- if !self.short_lived_subscriptions.contains_key(subnet) { - debug!( - ?subnet, - subscription_kind = ?SubscriptionKind::LongLived, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - *subnet, - ))); - } - self.queue_event(SubnetServiceMessage::EnrAdd(Subnet::Attestation(*subnet))); - if !self.discovery_disabled { - self.queue_event(SubnetServiceMessage::DiscoverPeers(vec![SubnetDiscovery { - subnet: Subnet::Attestation(*subnet), - min_ttl: None, - }])) - } - } - } - - // Update the long_lived_subnets set and check for subnets that are being removed - std::mem::swap(&mut self.long_lived_subscriptions, &mut subnets); - for subnet in subnets { - if !self.long_lived_subscriptions.contains(&subnet) { - self.handle_removed_subnet(subnet, SubscriptionKind::LongLived); - } - } - } - - /// Checks if we have subscribed aggregate validators for the subnet. If not, checks the gossip - /// verification, re-propagates and returns false. - pub fn should_process_attestation( - &self, - subnet: SubnetId, - attestation: &Attestation, - ) -> bool { - // Proposer-only mode does not need to process attestations - if self.proposer_only { - return false; - } - self.aggregate_validators_on_subnet - .as_ref() - .map(|tracked_vals| { - tracked_vals.contains_key(&ExactSubnet { - subnet_id: subnet, - slot: attestation.data().slot, - }) - }) - .unwrap_or(true) - } - - /* Internal private functions */ - - /// Adds an event to the event queue and notifies that this service is ready to be polled - /// again. - fn queue_event(&mut self, ev: SubnetServiceMessage) { - self.events.push_back(ev); - if let Some(waker) = &self.waker { - waker.wake_by_ref() - } - } - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. 
- fn discover_peers_request( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - // Check if there is enough time to perform a discovery lookup. - if exact_subnet.slot - >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) - { - // Send out an event to start looking for peers. - // Require the peer for an additional slot to ensure we keep the peer for the - // duration of the subscription. - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(exact_subnet.slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::Attestation(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. - warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.queue_event(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - // Subscribes to the subnet if it should be done immediately, or schedules it if required. - fn subscribe_to_short_lived_subnet( - &mut self, - ExactSubnet { subnet_id, slot }: ExactSubnet, - ) -> Result<(), &'static str> { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // The short time we schedule the subscription before it's actually required. This - // ensures we are subscribed on time, and allows consecutive subscriptions to the same - // subnet to overlap, reducing subnet churn. - let advance_subscription_duration = slot_duration / ADVANCE_SUBSCRIBE_SLOT_FRACTION; - // The time to the required slot. 
- let time_to_subscription_slot = self - .beacon_chain - .slot_clock - .duration_to_slot(slot) - .unwrap_or_default(); // If this is a past slot we will just get a 0 duration. - - // Calculate how long before we need to subscribe to the subnet. - let time_to_subscription_start = - time_to_subscription_slot.saturating_sub(advance_subscription_duration); - - // The time after a duty slot where we no longer need it in the `aggregate_validators_on_subnet` - // delay map. - let time_to_unsubscribe = - time_to_subscription_slot + UNSUBSCRIBE_AFTER_AGGREGATOR_DUTY * slot_duration; - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - tracked_vals.insert_at(ExactSubnet { subnet_id, slot }, time_to_unsubscribe); - } - - // If the subscription should be done in the future, schedule it. Otherwise subscribe - // immediately. - if time_to_subscription_start.is_zero() { - // This is a current or past slot, we subscribe immediately. - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1)?; - } else { - // This is a future slot, schedule subscribing. - trace!(subnet = ?subnet_id, ?time_to_subscription_start,"Scheduling subnet subscription"); - self.scheduled_short_lived_subscriptions - .insert_at(ExactSubnet { subnet_id, slot }, time_to_subscription_start); - } - - Ok(()) - } - - /* A collection of functions that handle the various timeouts */ - - /// Registers a subnet as subscribed. - /// - /// Checks that the time in which the subscription would end is not in the past. If we are - /// already subscribed, extends the timeout if necessary. If this is a new subscription, we send - /// out the appropriate events. - /// - /// On determinist long lived subnets, this is only used for short lived subscriptions. - fn subscribe_to_short_lived_subnet_immediately( - &mut self, - subnet_id: SubnetId, - end_slot: Slot, - ) -> Result<(), &'static str> { - if self.subscribe_all_subnets { - // Case not handled by this service. 
- return Ok(()); - } - - let time_to_subscription_end = self - .beacon_chain - .slot_clock - .duration_to_slot(end_slot) - .unwrap_or_default(); - - // First check this is worth doing. - if time_to_subscription_end.is_zero() { - return Err("Time when subscription would end has already passed."); - } - - let subscription_kind = SubscriptionKind::ShortLived; - - // We need to check and add a subscription for the right kind, regardless of the presence - // of the subnet as a subscription of the other kind. This is mainly since long lived - // subscriptions can be removed at any time when a validator goes offline. - - let (subscriptions, already_subscribed_as_other_kind) = ( - &mut self.short_lived_subscriptions, - self.long_lived_subscriptions.contains(&subnet_id), - ); - - match subscriptions.get(&subnet_id) { - Some(current_end_slot) => { - // We are already subscribed. Check if we need to extend the subscription. - if &end_slot > current_end_slot { - trace!( - subnet = ?subnet_id, - prev_end_slot = %current_end_slot, - new_end_slot = %end_slot, - ?subscription_kind, - "Extending subscription to subnet" - ); - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - } - } - None => { - // This is a new subscription. Add with the corresponding timeout and send the - // notification. - subscriptions.insert_at(subnet_id, end_slot, time_to_subscription_end); - - // Inform of the subscription. - if !already_subscribed_as_other_kind { - debug!( - subnet = ?subnet_id, - %end_slot, - ?subscription_kind, - "Subscribing to subnet" - ); - self.queue_event(SubnetServiceMessage::Subscribe(Subnet::Attestation( - subnet_id, - ))); - } - } - } - - Ok(()) - } - - // Unsubscribes from a subnet that was removed if it does not continue to exist as a - // subscription of the other kind. For long lived subscriptions, it also removes the - // advertisement from our ENR. 
- fn handle_removed_subnet(&mut self, subnet_id: SubnetId, subscription_kind: SubscriptionKind) { - let exists_in_other_subscriptions = match subscription_kind { - SubscriptionKind::LongLived => self.short_lived_subscriptions.contains_key(&subnet_id), - SubscriptionKind::ShortLived => self.long_lived_subscriptions.contains(&subnet_id), - }; - - if !exists_in_other_subscriptions { - // Subscription no longer exists as short lived or long lived. - debug!( - subnet = ?subnet_id, - ?subscription_kind, - "Unsubscribing from subnet" - ); - self.queue_event(SubnetServiceMessage::Unsubscribe(Subnet::Attestation( - subnet_id, - ))); - } - - if subscription_kind == SubscriptionKind::LongLived { - // Remove from our ENR even if we remain subscribed in other way. - self.queue_event(SubnetServiceMessage::EnrRemove(Subnet::Attestation( - subnet_id, - ))); - } - } -} - -impl Stream for AttestationService { - type Item = SubnetServiceMessage; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // Update the waker if needed. - if let Some(waker) = &self.waker { - if !waker.will_wake(cx.waker()) { - self.waker = Some(cx.waker().clone()); - } - } else { - self.waker = Some(cx.waker().clone()); - } - - // Send out any generated events. - if let Some(event) = self.events.pop_front() { - return Poll::Ready(Some(event)); - } - - // If we aren't subscribed to all subnets, handle the deterministic long-lived subnets - if !self.subscribe_all_subnets { - match self.next_long_lived_subscription_event.as_mut().poll(cx) { - Poll::Ready(_) => { - self.recompute_long_lived_subnets(); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Pending => {} - } - } - - // Process scheduled subscriptions that might be ready, since those can extend a soon to - // expire subscription. 
- match self.scheduled_short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(ExactSubnet { subnet_id, slot }))) => { - if let Err(e) = - self.subscribe_to_short_lived_subnet_immediately(subnet_id, slot + 1) - { - debug!(subnet = ?subnet_id, err = e,"Failed to subscribe to short lived subnet"); - } - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!( - error = e, - "Failed to check for scheduled subnet subscriptions" - ); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Finally process any expired subscriptions. - match self.short_lived_subscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok((subnet_id, _end_slot)))) => { - self.handle_removed_subnet(subnet_id, SubscriptionKind::ShortLived); - // We re-wake the task as there could be other subscriptions to process - self.waker - .as_ref() - .expect("Waker has been set") - .wake_by_ref(); - } - Poll::Ready(Some(Err(e))) => { - error!(error = e, "Failed to check for subnet unsubscription times"); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // Poll to remove entries on expiration, no need to act on expiration events. - if let Some(tracked_vals) = self.aggregate_validators_on_subnet.as_mut() { - if let Poll::Ready(Some(Err(e))) = tracked_vals.poll_next_unpin(cx) { - error!( - error = e, - "Failed to check for aggregate validator on subnet expirations" - ); - } - } - - Poll::Pending - } -} diff --git a/beacon_node/network/src/subnet_service/sync_subnets.rs b/beacon_node/network/src/subnet_service/sync_subnets.rs deleted file mode 100644 index 6b3834e195..0000000000 --- a/beacon_node/network/src/subnet_service/sync_subnets.rs +++ /dev/null @@ -1,345 +0,0 @@ -//! This service keeps track of which sync committee subnet the beacon node should be subscribed to at any -//! given time. It schedules subscriptions to sync committee subnets and requests peer discoveries. 
- -use std::collections::{hash_map::Entry, HashMap, VecDeque}; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; -use std::time::Duration; - -use futures::prelude::*; -use tracing::{debug, error, trace, warn}; - -use super::SubnetServiceMessage; -use beacon_chain::{BeaconChain, BeaconChainTypes}; -use delay_map::HashSetDelay; -use lighthouse_network::{NetworkConfig, Subnet, SubnetDiscovery}; -use slot_clock::SlotClock; -use types::{Epoch, EthSpec, SyncCommitteeSubscription, SyncSubnetId}; - -use crate::metrics; - -/// The minimum number of slots ahead that we attempt to discover peers for a subscription. If the -/// slot is less than this number, skip the peer discovery process. -/// Subnet discovery query takes at most 30 secs, 2 slots take 24s. -const MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD: u64 = 2; - -/// A particular subnet at a given slot. -#[derive(PartialEq, Eq, Hash, Clone, Debug)] -pub struct ExactSubnet { - /// The `SyncSubnetId` associated with this subnet. - pub subnet_id: SyncSubnetId, - /// The epoch until which we need to stay subscribed to the subnet. - pub until_epoch: Epoch, -} -pub struct SyncCommitteeService { - /// Queued events to return to the driving service. - events: VecDeque, - - /// A reference to the beacon chain to process received attestations. - pub(crate) beacon_chain: Arc>, - - /// The collection of all currently subscribed subnets. - subscriptions: HashMap, - - /// A collection of timeouts for when to unsubscribe from a subnet. - unsubscriptions: HashSetDelay, - - /// The waker for the current thread. - waker: Option, - - /// The discovery mechanism of lighthouse is disabled. - discovery_disabled: bool, - - /// We are always subscribed to all subnets. - subscribe_all_subnets: bool, - - /// Whether this node is a block proposer-only node. 
- proposer_only: bool, -} - -impl SyncCommitteeService { - /* Public functions */ - - pub fn new(beacon_chain: Arc>, config: &NetworkConfig) -> Self { - let spec = &beacon_chain.spec; - let epoch_duration_secs = - beacon_chain.slot_clock.slot_duration().as_secs() * T::EthSpec::slots_per_epoch(); - let default_timeout = - epoch_duration_secs.saturating_mul(spec.epochs_per_sync_committee_period.as_u64()); - - SyncCommitteeService { - events: VecDeque::with_capacity(10), - beacon_chain, - subscriptions: HashMap::new(), - unsubscriptions: HashSetDelay::new(Duration::from_secs(default_timeout)), - waker: None, - subscribe_all_subnets: config.subscribe_all_subnets, - discovery_disabled: config.disable_discovery, - proposer_only: config.proposer_only, - } - } - - /// Return count of all currently subscribed subnets. - #[cfg(test)] - pub fn subscription_count(&self) -> usize { - use types::consts::altair::SYNC_COMMITTEE_SUBNET_COUNT; - if self.subscribe_all_subnets { - SYNC_COMMITTEE_SUBNET_COUNT as usize - } else { - self.subscriptions.len() - } - } - - /// Processes a list of sync committee subscriptions. - /// - /// This will: - /// - Search for peers for required subnets. - /// - Request subscriptions required subnets. - /// - Build the timeouts for each of these events. - /// - /// This returns a result simply for the ergonomics of using ?. The result can be - /// safely dropped. - pub fn validator_subscriptions( - &mut self, - subscriptions: Vec, - ) -> Result<(), String> { - // A proposer-only node does not subscribe to any sync-committees - if self.proposer_only { - return Ok(()); - } - - let mut subnets_to_discover = Vec::new(); - for subscription in subscriptions { - metrics::inc_counter(&metrics::SYNC_COMMITTEE_SUBSCRIPTION_REQUESTS); - //NOTE: We assume all subscriptions have been verified before reaching this service - - // Registers the validator with the subnet service. - // This will subscribe to long-lived random subnets if required. 
- trace!(?subscription, "Sync committee subscription"); - - let subnet_ids = match SyncSubnetId::compute_subnets_for_sync_committee::( - &subscription.sync_committee_indices, - ) { - Ok(subnet_ids) => subnet_ids, - Err(e) => { - warn!( - error = ?e, - validator_index = subscription.validator_index, - "Failed to compute subnet id for sync committee subscription" - ); - continue; - } - }; - - for subnet_id in subnet_ids { - let exact_subnet = ExactSubnet { - subnet_id, - until_epoch: subscription.until_epoch, - }; - subnets_to_discover.push(exact_subnet.clone()); - if let Err(e) = self.subscribe_to_subnet(exact_subnet.clone()) { - warn!( - error = e, - validator_index = subscription.validator_index, - "Subscription to sync subnet error" - ); - } else { - trace!( - ?exact_subnet, - validator_index = subscription.validator_index, - "Subscribed to subnet for sync committee duties" - ); - } - } - } - // If the discovery mechanism isn't disabled, attempt to set up a peer discovery for the - // required subnets. - if !self.discovery_disabled { - if let Err(e) = self.discover_peers_request(subnets_to_discover.iter()) { - warn!(error = e, "Discovery lookup request error"); - }; - } - - // pre-emptively wake the thread to check for new events - if let Some(waker) = &self.waker { - waker.wake_by_ref(); - } - Ok(()) - } - - /* Internal private functions */ - - /// Checks if there are currently queued discovery requests and the time required to make the - /// request. - /// - /// If there is sufficient time, queues a peer discovery request for all the required subnets. 
- fn discover_peers_request<'a>( - &mut self, - exact_subnets: impl Iterator, - ) -> Result<(), &'static str> { - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - - let discovery_subnets: Vec = exact_subnets - .filter_map(|exact_subnet| { - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // check if there is enough time to perform a discovery lookup - if until_slot >= current_slot.saturating_add(MIN_PEER_DISCOVERY_SLOT_LOOK_AHEAD) { - // if the slot is more than epoch away, add an event to start looking for peers - // add one slot to ensure we keep the peer for the subscription slot - let min_ttl = self - .beacon_chain - .slot_clock - .duration_to_slot(until_slot + 1) - .map(|duration| std::time::Instant::now() + duration); - Some(SubnetDiscovery { - subnet: Subnet::SyncCommittee(exact_subnet.subnet_id), - min_ttl, - }) - } else { - // We may want to check the global PeerInfo to see estimated timeouts for each - // peer before they can be removed. - warn!( - subnet_id = ?exact_subnet, - "Not enough time for a discovery search" - ); - None - } - }) - .collect(); - - if !discovery_subnets.is_empty() { - self.events - .push_back(SubnetServiceMessage::DiscoverPeers(discovery_subnets)); - } - Ok(()) - } - - /// Adds a subscription event and an associated unsubscription event if required. - fn subscribe_to_subnet(&mut self, exact_subnet: ExactSubnet) -> Result<(), &'static str> { - // Return if we have subscribed to all subnets - if self.subscribe_all_subnets { - return Ok(()); - } - - // Return if we already have a subscription for exact_subnet - if self.subscriptions.get(&exact_subnet.subnet_id) == Some(&exact_subnet.until_epoch) { - return Ok(()); - } - - // Return if we already have subscription set to expire later than the current request. 
- if let Some(until_epoch) = self.subscriptions.get(&exact_subnet.subnet_id) { - if *until_epoch >= exact_subnet.until_epoch { - return Ok(()); - } - } - - // initialise timing variables - let current_slot = self - .beacon_chain - .slot_clock - .now() - .ok_or("Could not get the current slot")?; - - let slots_per_epoch = T::EthSpec::slots_per_epoch(); - let until_slot = exact_subnet.until_epoch.end_slot(slots_per_epoch); - // Calculate the duration to the unsubscription event. - let expected_end_subscription_duration = if current_slot >= until_slot { - warn!( - %current_slot, - ?exact_subnet, - "Sync committee subscription is past expiration" - ); - return Ok(()); - } else { - let slot_duration = self.beacon_chain.slot_clock.slot_duration(); - - // the duration until we no longer need this subscription. We assume a single slot is - // sufficient. - self.beacon_chain - .slot_clock - .duration_to_slot(until_slot) - .ok_or("Unable to determine duration to unsubscription slot")? - + slot_duration - }; - - if let Entry::Vacant(e) = self.subscriptions.entry(exact_subnet.subnet_id) { - // We are not currently subscribed and have no waiting subscription, create one - debug!(subnet = *exact_subnet.subnet_id, until_epoch = ?exact_subnet.until_epoch, "Subscribing to subnet"); - e.insert(exact_subnet.until_epoch); - self.events - .push_back(SubnetServiceMessage::Subscribe(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add the subnet to the ENR bitfield - self.events - .push_back(SubnetServiceMessage::EnrAdd(Subnet::SyncCommittee( - exact_subnet.subnet_id, - ))); - - // add an unsubscription event to remove ourselves from the subnet once completed - self.unsubscriptions - .insert_at(exact_subnet.subnet_id, expected_end_subscription_duration); - } else { - // We are already subscribed, extend the unsubscription duration - self.unsubscriptions - .update_timeout(&exact_subnet.subnet_id, expected_end_subscription_duration); - } - - Ok(()) - } - - /// A queued 
unsubscription is ready. - fn handle_unsubscriptions(&mut self, subnet_id: SyncSubnetId) { - debug!(subnet = *subnet_id, "Unsubscribing from subnet"); - - self.subscriptions.remove(&subnet_id); - self.events - .push_back(SubnetServiceMessage::Unsubscribe(Subnet::SyncCommittee( - subnet_id, - ))); - - self.events - .push_back(SubnetServiceMessage::EnrRemove(Subnet::SyncCommittee( - subnet_id, - ))); - } -} - -impl Stream for SyncCommitteeService { - type Item = SubnetServiceMessage; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - // update the waker if needed - if let Some(waker) = &self.waker { - if !waker.will_wake(cx.waker()) { - self.waker = Some(cx.waker().clone()); - } - } else { - self.waker = Some(cx.waker().clone()); - } - - // process any un-subscription events - match self.unsubscriptions.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(exact_subnet))) => self.handle_unsubscriptions(exact_subnet), - Poll::Ready(Some(Err(e))) => { - error!(error = e, "Failed to check for subnet unsubscription times"); - } - Poll::Ready(None) | Poll::Pending => {} - } - - // process any generated events - if let Some(event) = self.events.pop_front() { - return Poll::Ready(Some(event)); - } - - Poll::Pending - } -} diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs index 2f5eb3f689..d5a4e9b73a 100644 --- a/beacon_node/network/src/sync/backfill_sync/mod.rs +++ b/beacon_node/network/src/sync/backfill_sync/mod.rs @@ -687,11 +687,12 @@ impl BackFillSync { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. 
+ BatchState::AwaitingDownload => return Ok(ProcessResult::Successful), + BatchState::Failed | BatchState::Processing(_) => { // these are all inconsistent states: // - Failed -> non recoverable batch. Chain should have been removed - // - AwaitingDownload -> A recoverable failed batch should have been - // re-requested. // - Processing -> `self.current_processing_batch` is None self.fail_sync(BackFillError::InvalidSyncState(String::from( "Invalid expected batch state", @@ -790,7 +791,8 @@ impl BackFillSync { } } BatchState::Downloading(..) => {} - BatchState::Failed | BatchState::Poisoned | BatchState::AwaitingDownload => { + BatchState::AwaitingDownload => return, + BatchState::Failed | BatchState::Poisoned => { crit!("batch indicates inconsistent chain state while advancing chain") } BatchState::AwaitingProcessing(..) => {} @@ -1120,13 +1122,12 @@ impl BackFillSync { .sampling_subnets() .iter() .all(|subnet_id| { - let peer_count = network + let min_peer_count = 1; + network .network_globals() .peers .read() - .good_range_sync_custody_subnet_peers(*subnet_id) - .count(); - peer_count > 0 + .has_good_peers_in_custody_subnet(subnet_id, min_peer_count) }) } else { true diff --git a/beacon_node/network/src/sync/block_lookups/mod.rs b/beacon_node/network/src/sync/block_lookups/mod.rs index e9f24697ac..f8ffd298ca 100644 --- a/beacon_node/network/src/sync/block_lookups/mod.rs +++ b/beacon_node/network/src/sync/block_lookups/mod.rs @@ -59,7 +59,7 @@ mod single_block_lookup; /// reaches the maximum depth it will force trigger range sync. 
pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE; -const FAILED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; +const IGNORED_CHAINS_CACHE_EXPIRY_SECONDS: u64 = 60; pub const SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS: u8 = 4; /// Maximum time we allow a lookup to exist before assuming it is stuck and will never make @@ -110,8 +110,10 @@ enum Action { } pub struct BlockLookups { - /// A cache of failed chain lookups to prevent duplicate searches. - failed_chains: LRUTimeCache, + /// A cache of block roots that must be ignored for some time to prevent useless searches. For + /// example if a chain is too long, its lookup chain is dropped, and range sync is expected to + /// eventually sync those blocks + ignored_chains: LRUTimeCache, // TODO: Why not index lookups by block_root? single_block_lookups: FnvHashMap>, @@ -128,21 +130,21 @@ pub(crate) type BlockLookupSummary = (Id, Hash256, Option, Vec) impl BlockLookups { pub fn new() -> Self { Self { - failed_chains: LRUTimeCache::new(Duration::from_secs( - FAILED_CHAINS_CACHE_EXPIRY_SECONDS, + ignored_chains: LRUTimeCache::new(Duration::from_secs( + IGNORED_CHAINS_CACHE_EXPIRY_SECONDS, )), single_block_lookups: Default::default(), } } #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.failed_chains.insert(block_root); + pub(crate) fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.ignored_chains.insert(block_root); } #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.failed_chains.keys().cloned().collect() + pub(crate) fn get_ignored_chains(&mut self) -> Vec { + self.ignored_chains.keys().cloned().collect() } #[cfg(test)] @@ -184,7 +186,7 @@ impl BlockLookups { self.search_parent_of_child(parent_root, block_root, &[peer_id], cx); // Only create the child lookup if the parent exists if parent_lookup_exists { - // `search_parent_of_child` ensures that parent root is not a failed chain + // `search_parent_of_child` ensures that the parent 
lookup exists so we can safely wait for it self.new_current_lookup( block_root, Some(block_component), @@ -244,8 +246,8 @@ impl BlockLookups { debug!(block_root = ?block_root_to_search, "Parent lookup chain too long"); // Searching for this parent would extend a parent chain over the max - // Insert the tip only to failed chains - self.failed_chains.insert(parent_chain.tip); + // Insert the tip only to chains to ignore + self.ignored_chains.insert(parent_chain.tip); // Note: Drop only the chain that's too long until it merges with another chain // that's not too long. Consider this attack: there's a chain of valid unknown @@ -330,12 +332,9 @@ impl BlockLookups { peers: &[PeerId], cx: &mut SyncNetworkContext, ) -> bool { - // If this block or it's parent is part of a known failed chain, ignore it. - if self.failed_chains.contains(&block_root) { - debug!(?block_root, "Block is from a past failed chain. Dropping"); - for peer_id in peers { - cx.report_peer(*peer_id, PeerAction::MidToleranceError, "failed_chain"); - } + // If this block or it's parent is part of a known ignored chain, ignore it. + if self.ignored_chains.contains(&block_root) { + debug!(?block_root, "Dropping lookup for block marked ignored"); return false; } @@ -384,6 +383,7 @@ impl BlockLookups { // If we know that this lookup has unknown parent (is awaiting a parent lookup to resolve), // signal here to hold processing downloaded data. 
let mut lookup = SingleBlockLookup::new(block_root, peers, cx.next_id(), awaiting_parent); + let _guard = lookup.span.clone().entered(); // Add block components to the new request if let Some(block_component) = block_component { diff --git a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs index 30947cf1f0..8fb3248a87 100644 --- a/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs +++ b/beacon_node/network/src/sync/block_lookups/single_block_lookup.rs @@ -7,6 +7,7 @@ use crate::sync::network_context::{ use beacon_chain::{BeaconChainTypes, BlockProcessStatus}; use derivative::Derivative; use lighthouse_network::service::api_types::Id; +use lighthouse_tracing::SPAN_SINGLE_BLOCK_LOOKUP; use parking_lot::RwLock; use std::collections::HashSet; use std::fmt::Debug; @@ -14,6 +15,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use store::Hash256; use strum::IntoStaticStr; +use tracing::{Span, debug_span}; use types::blob_sidecar::FixedBlobSidecarList; use types::{DataColumnSidecarList, EthSpec, SignedBeaconBlock, Slot}; @@ -70,6 +72,7 @@ pub struct SingleBlockLookup { block_root: Hash256, awaiting_parent: Option, created: Instant, + pub(crate) span: Span, } #[derive(Debug)] @@ -89,6 +92,12 @@ impl SingleBlockLookup { id: Id, awaiting_parent: Option, ) -> Self { + let lookup_span = debug_span!( + SPAN_SINGLE_BLOCK_LOOKUP, + block_root = %requested_block_root, + id = id, + ); + Self { id, block_request_state: BlockRequestState::new(requested_block_root), @@ -97,6 +106,7 @@ impl SingleBlockLookup { block_root: requested_block_root, awaiting_parent, created: Instant::now(), + span: lookup_span, } } @@ -192,6 +202,7 @@ impl SingleBlockLookup { &mut self, cx: &mut SyncNetworkContext, ) -> Result { + let _guard = self.span.clone().entered(); // TODO: Check what's necessary to download, specially for blobs self.continue_request::>(cx, 0)?; @@ -208,7 +219,7 @@ impl 
SingleBlockLookup { // can assert that this is the correct value of `blob_kzg_commitments_count`. match cx.chain.get_block_process_status(&self.block_root) { BlockProcessStatus::Unknown => None, - BlockProcessStatus::NotValidated(block) + BlockProcessStatus::NotValidated(block, _) | BlockProcessStatus::ExecutionValidated(block) => Some(block.clone()), } }) { @@ -257,6 +268,7 @@ impl SingleBlockLookup { // that can make progress so it must be dropped. Consider the lookup completed. // This case can happen if we receive the components from gossip during a retry. if self.all_components_processed() { + self.span = Span::none(); Ok(LookupResult::Completed) } else { Ok(LookupResult::Pending) diff --git a/beacon_node/network/src/sync/manager.rs b/beacon_node/network/src/sync/manager.rs index 448e784ab6..d7ba028054 100644 --- a/beacon_node/network/src/sync/manager.rs +++ b/beacon_node/network/src/sync/manager.rs @@ -328,13 +328,13 @@ impl SyncManager { } #[cfg(test)] - pub(crate) fn get_failed_chains(&mut self) -> Vec { - self.block_lookups.get_failed_chains() + pub(crate) fn get_ignored_chains(&mut self) -> Vec { + self.block_lookups.get_ignored_chains() } #[cfg(test)] - pub(crate) fn insert_failed_chain(&mut self, block_root: Hash256) { - self.block_lookups.insert_failed_chain(block_root); + pub(crate) fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.block_lookups.insert_ignored_chain(block_root); } #[cfg(test)] diff --git a/beacon_node/network/src/sync/network_context.rs b/beacon_node/network/src/sync/network_context.rs index 07462a01fe..ac2991c147 100644 --- a/beacon_node/network/src/sync/network_context.rs +++ b/beacon_node/network/src/sync/network_context.rs @@ -29,7 +29,7 @@ use lighthouse_network::service::api_types::{ DataColumnsByRootRequester, Id, SingleLookupReqId, SyncRequestId, }; use lighthouse_network::{Client, NetworkGlobals, PeerAction, PeerId, ReportSource}; -use lighthouse_tracing::SPAN_OUTGOING_RANGE_REQUEST; +use 
lighthouse_tracing::{SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST, SPAN_OUTGOING_RANGE_REQUEST}; use parking_lot::RwLock; pub use requests::LookupVerifyError; use requests::{ @@ -49,8 +49,8 @@ use tokio::sync::mpsc; use tracing::{Span, debug, debug_span, error, warn}; use types::blob_sidecar::FixedBlobSidecarList; use types::{ - BlobSidecar, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, ForkContext, - Hash256, SignedBeaconBlock, Slot, + BlobSidecar, BlockImportSource, ColumnIndex, DataColumnSidecar, DataColumnSidecarList, EthSpec, + ForkContext, Hash256, SignedBeaconBlock, Slot, }; pub mod custody; @@ -835,14 +835,26 @@ impl SyncNetworkContext { match self.chain.get_block_process_status(&block_root) { // Unknown block, continue request to download BlockProcessStatus::Unknown => {} - // Block is known are currently processing, expect a future event with the result of - // processing. - BlockProcessStatus::NotValidated { .. } => { - // Lookup sync event safety: If the block is currently in the processing cache, we - // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will - // make progress on this lookup - return Ok(LookupRequestResult::Pending("block in processing cache")); - } + // Block is known and currently processing. Imports from gossip and HTTP API insert the + // block in the da_cache. However, HTTP API is unable to notify sync when it completes + // block import. Returning `Pending` here will result in stuck lookups if the block is + // importing from sync. 
+ BlockProcessStatus::NotValidated(_, source) => match source { + BlockImportSource::Gossip => { + // Lookup sync event safety: If the block is currently in the processing cache, we + // are guaranteed to receive a `SyncMessage::GossipBlockProcessResult` that will + // make progress on this lookup + return Ok(LookupRequestResult::Pending("block in processing cache")); + } + BlockImportSource::Lookup + | BlockImportSource::RangeSync + | BlockImportSource::HttpApi => { + // Lookup, RangeSync or HttpApi block import don't emit the GossipBlockProcessResult + // event. If a lookup happens to be created during block import from one of + // those sources just import the block twice. Otherwise the lookup will get + // stuck. Double imports are fine, they just waste resources. + } + }, // Block is fully validated. If it's not yet imported it's waiting for missing block // components. Consider this request completed and do nothing. BlockProcessStatus::ExecutionValidated { .. } => { @@ -886,6 +898,11 @@ impl SyncNetworkContext { "Sync RPC request sent" ); + let request_span = debug_span!( + parent: Span::current(), + SPAN_OUTGOING_BLOCK_BY_ROOT_REQUEST, + %block_root, + ); self.blocks_by_root_requests.insert( id, peer_id, @@ -893,8 +910,7 @@ impl SyncNetworkContext { // block and the peer must have it. 
true, BlocksByRootRequestItems::new(request), - // Not implemented - Span::none(), + request_span, ); Ok(LookupRequestResult::RequestSent(id.req_id)) diff --git a/beacon_node/network/src/sync/network_context/custody.rs b/beacon_node/network/src/sync/network_context/custody.rs index d973e83cea..71e002cc42 100644 --- a/beacon_node/network/src/sync/network_context/custody.rs +++ b/beacon_node/network/src/sync/network_context/custody.rs @@ -7,19 +7,17 @@ use fnv::FnvHashMap; use lighthouse_network::PeerId; use lighthouse_network::service::api_types::{CustodyId, DataColumnsByRootRequester}; use lighthouse_tracing::SPAN_OUTGOING_CUSTODY_REQUEST; -use lru_cache::LRUTimeCache; use parking_lot::RwLock; -use rand::Rng; use std::collections::HashSet; +use std::hash::{BuildHasher, RandomState}; use std::time::{Duration, Instant}; use std::{collections::HashMap, marker::PhantomData, sync::Arc}; -use tracing::{Span, debug, debug_span, field, warn}; +use tracing::{Span, debug, debug_span, warn}; use types::{DataColumnSidecar, Hash256, data_column_sidecar::ColumnIndex}; use types::{DataColumnSidecarList, EthSpec}; use super::{LookupRequestResult, PeerGroup, RpcResponseResult, SyncNetworkContext}; -const FAILED_PEERS_CACHE_EXPIRY_SECONDS: u64 = 5; const MAX_STALE_NO_PEERS_DURATION: Duration = Duration::from_secs(30); pub struct ActiveCustodyRequest { @@ -30,9 +28,7 @@ pub struct ActiveCustodyRequest { /// Active requests for 1 or more columns each active_batch_columns_requests: FnvHashMap, - /// Peers that have recently failed to successfully respond to a columns by root request. - /// Having a LRUTimeCache allows this request to not have to track disconnecting peers. - failed_peers: LRUTimeCache, + peer_attempts: HashMap, /// Set of peers that claim to have imported this block and their custody columns lookup_peers: Arc>>, /// Span for tracing the lifetime of this request. 
@@ -71,7 +67,11 @@ impl ActiveCustodyRequest { column_indices: &[ColumnIndex], lookup_peers: Arc>>, ) -> Self { - let span = debug_span!(parent: None, SPAN_OUTGOING_CUSTODY_REQUEST, %block_root); + let span = debug_span!( + parent: Span::current(), + SPAN_OUTGOING_CUSTODY_REQUEST, + %block_root, + ); Self { block_root, custody_id, @@ -81,7 +81,7 @@ impl ActiveCustodyRequest { .map(|index| (*index, ColumnRequest::new())), ), active_batch_columns_requests: <_>::default(), - failed_peers: LRUTimeCache::new(Duration::from_secs(FAILED_PEERS_CACHE_EXPIRY_SECONDS)), + peer_attempts: HashMap::new(), lookup_peers, span, _phantom: PhantomData, @@ -170,13 +170,6 @@ impl ActiveCustodyRequest { ?missing_column_indexes, "Custody column peer claims to not have some data" ); - - batch_request.span.record( - "missing_column_indexes", - field::debug(missing_column_indexes), - ); - - self.failed_peers.insert(peer_id); } } Err(err) => { @@ -195,13 +188,6 @@ impl ActiveCustodyRequest { .ok_or(Error::BadState("unknown column_index".to_owned()))? 
.on_download_error_and_mark_failure(req_id)?; } - - batch_request.span.record( - "missing_column_indexes", - field::debug(&batch_request.indices), - ); - - self.failed_peers.insert(peer_id); } }; @@ -238,52 +224,29 @@ impl ActiveCustodyRequest { let active_request_count_by_peer = cx.active_request_count_by_peer(); let mut columns_to_request_by_peer = HashMap::>::new(); let lookup_peers = self.lookup_peers.read(); + // Create deterministic hasher per request to ensure consistent peer ordering within + // this request (avoiding fragmentation) while varying selection across different requests + let random_state = RandomState::new(); - // Need to: - // - track how many active requests a peer has for load balancing - // - which peers have failures to attempt others - // - which peer returned what to have PeerGroup attributability - - for (column_index, request) in self.column_requests.iter_mut() { + for (column_index, request) in self.column_requests.iter() { if let Some(wait_duration) = request.is_awaiting_download() { + // Note: an empty response is considered a successful response, so we may end up + // retrying many more times than `MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS`. if request.download_failures > MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS { return Err(Error::TooManyFailures); } - // TODO(das): When is a fork and only a subset of your peers know about a block, we should - // only query the peers on that fork. Should this case be handled? How to handle it? - let custodial_peers = cx.get_custodial_peers(*column_index); + let peer_to_request = self.select_column_peer( + cx, + &active_request_count_by_peer, + &lookup_peers, + *column_index, + &random_state, + ); - // We draw from the total set of peers, but prioritize those peers who we have - // received an attestation / status / block message claiming to have imported the - // lookup. 
The frequency of those messages is low, so drawing only from lookup_peers - // could cause many lookups to take much longer or fail as they don't have enough - // custody peers on a given column - let mut priorized_peers = custodial_peers - .iter() - .map(|peer| { - ( - // Prioritize peers that claim to know have imported this block - if lookup_peers.contains(peer) { 0 } else { 1 }, - // De-prioritize peers that have failed to successfully respond to - // requests recently - self.failed_peers.contains(peer), - // Prefer peers with fewer requests to load balance across peers. - // We batch requests to the same peer, so count existence in the - // `columns_to_request_by_peer` as a single 1 request. - active_request_count_by_peer.get(peer).copied().unwrap_or(0) - + columns_to_request_by_peer.get(peer).map(|_| 1).unwrap_or(0), - // Random factor to break ties, otherwise the PeerID breaks ties - rand::rng().random::(), - *peer, - ) - }) - .collect::>(); - priorized_peers.sort_unstable(); - - if let Some((_, _, _, _, peer_id)) = priorized_peers.first() { + if let Some(peer_id) = peer_to_request { columns_to_request_by_peer - .entry(*peer_id) + .entry(peer_id) .or_default() .push(*column_index); } else if wait_duration > MAX_STALE_NO_PEERS_DURATION { @@ -298,6 +261,23 @@ impl ActiveCustodyRequest { } } + let peer_requests = columns_to_request_by_peer.len(); + if peer_requests > 0 { + let columns_requested_count = columns_to_request_by_peer + .values() + .map(|v| v.len()) + .sum::(); + debug!( + lookup_peers = lookup_peers.len(), + "Requesting {} columns from {} peers", columns_requested_count, peer_requests, + ); + } else { + debug!( + lookup_peers = lookup_peers.len(), + "No column peers found for look up", + ); + } + for (peer_id, indices) in columns_to_request_by_peer.into_iter() { let request_result = cx .data_column_lookup_request( @@ -317,8 +297,14 @@ impl ActiveCustodyRequest { match request_result { LookupRequestResult::RequestSent(req_id) => { + 
*self.peer_attempts.entry(peer_id).or_insert(0) += 1; + let client = cx.network_globals().client(&peer_id).kind; - let batch_columns_req_span = debug_span!("batch_columns_req", %peer_id, %client, missing_column_indexes = tracing::field::Empty); + let batch_columns_req_span = debug_span!( + "batch_columns_req", + %peer_id, + %client, + ); let _guard = batch_columns_req_span.clone().entered(); for column_index in &indices { let column_request = self @@ -345,11 +331,54 @@ impl ActiveCustodyRequest { Ok(None) } + + fn select_column_peer( + &self, + cx: &mut SyncNetworkContext, + active_request_count_by_peer: &HashMap, + lookup_peers: &HashSet, + column_index: ColumnIndex, + random_state: &RandomState, + ) -> Option { + // We draw from the total set of peers, but prioritize those peers who we have + // received an attestation or a block from (`lookup_peers`), as the `lookup_peers` may take + // time to build up and we are likely to not find any column peers initially. + let custodial_peers = cx.get_custodial_peers(column_index); + let mut prioritized_peers = custodial_peers + .iter() + .filter(|peer| { + // Exclude peers that we have already made too many attempts to. + self.peer_attempts.get(peer).copied().unwrap_or(0) <= MAX_CUSTODY_PEER_ATTEMPTS + }) + .map(|peer| { + ( + // Prioritize peers that claim to know have imported this block + if lookup_peers.contains(peer) { 0 } else { 1 }, + // De-prioritize peers that we have already attempted to download from + self.peer_attempts.get(peer).copied().unwrap_or(0), + // Prefer peers with fewer requests to load balance across peers. + active_request_count_by_peer.get(peer).copied().unwrap_or(0), + // The hash ensures consistent peer ordering within this request + // to avoid fragmentation while varying selection across different requests. 
+ random_state.hash_one(peer), + *peer, + ) + }) + .collect::>(); + prioritized_peers.sort_unstable(); + + prioritized_peers + .first() + .map(|(_, _, _, _, peer_id)| *peer_id) + } } /// TODO(das): this attempt count is nested into the existing lookup request count. const MAX_CUSTODY_COLUMN_DOWNLOAD_ATTEMPTS: usize = 3; +/// Max number of attempts to request custody columns from a single peer. +const MAX_CUSTODY_PEER_ATTEMPTS: usize = 3; + struct ColumnRequest { status: Status, download_failures: usize, diff --git a/beacon_node/network/src/sync/range_sync/batch.rs b/beacon_node/network/src/sync/range_sync/batch.rs index 1f51613996..31e6594139 100644 --- a/beacon_node/network/src/sync/range_sync/batch.rs +++ b/beacon_node/network/src/sync/range_sync/batch.rs @@ -1,4 +1,5 @@ use beacon_chain::block_verification_types::RpcBlock; +use derivative::Derivative; use lighthouse_network::PeerId; use lighthouse_network::rpc::methods::BlocksByRangeRequest; use lighthouse_network::service::api_types::Id; @@ -96,7 +97,8 @@ pub enum BatchProcessingResult { NonFaultyFailure, } -#[derive(Debug)] +#[derive(Derivative)] +#[derivative(Debug)] /// A segment of a chain. pub struct BatchInfo { /// Start slot of the batch. @@ -114,6 +116,7 @@ pub struct BatchInfo { /// Whether this batch contains all blocks or all blocks and blobs. 
batch_type: ByRangeRequestType, /// Pin the generic + #[derivative(Debug = "ignore")] marker: std::marker::PhantomData, } diff --git a/beacon_node/network/src/sync/range_sync/chain.rs b/beacon_node/network/src/sync/range_sync/chain.rs index 96319f2efa..3b816c0922 100644 --- a/beacon_node/network/src/sync/range_sync/chain.rs +++ b/beacon_node/network/src/sync/range_sync/chain.rs @@ -131,8 +131,14 @@ impl SyncingChain { name = SPAN_SYNCING_CHAIN, parent = None, level="debug", - skip(id), - fields(chain_id = %id) + skip_all, + fields( + chain_id = %id, + start_epoch = %start_epoch, + target_head_slot = %target_head_slot, + target_head_root = %target_head_root, + chain_type = ?chain_type, + ) )] pub fn new( id: Id, @@ -350,7 +356,10 @@ impl SyncingChain { return Ok(KeepChain); } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Processing(_) | BatchState::AwaitingDownload | BatchState::Failed => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Processing(_) | BatchState::Failed => { // these are all inconsistent states: // - Processing -> `self.current_processing_batch` is None // - Failed -> non recoverable batch. For an optimistic batch, it should @@ -384,7 +393,10 @@ impl SyncingChain { // Batch is not ready, nothing to process } BatchState::Poisoned => unreachable!("Poisoned batch"), - BatchState::Failed | BatchState::AwaitingDownload | BatchState::Processing(_) => { + // Batches can be in `AwaitingDownload` state if there weren't good data column subnet + // peers to send the request to. + BatchState::AwaitingDownload => return Ok(KeepChain), + BatchState::Failed | BatchState::Processing(_) => { // these are all inconsistent states: // - Failed -> non recoverable batch. 
Chain should have been removed // - AwaitingDownload -> A recoverable failed batch should have been @@ -582,8 +594,8 @@ impl SyncingChain { BatchProcessResult::NonFaultyFailure => { batch.processing_completed(BatchProcessingResult::NonFaultyFailure)?; - // Simply re-download the batch. - self.send_batch(network, batch_id) + // Simply re-download all batches in `AwaitingDownload` state. + self.attempt_send_awaiting_download_batches(network, "non-faulty-failure") } } } @@ -717,6 +729,7 @@ impl SyncingChain { previous_start = %old_start, new_start = %self.start_epoch, processing_target = %self.processing_target, + id=%self.id, "Chain advanced" ); } @@ -753,7 +766,6 @@ impl SyncingChain { } // this is our robust `processing_target`. All previous batches must be awaiting // validation - let mut redownload_queue = Vec::new(); for (id, batch) in self.batches.range_mut(..batch_id) { if let BatchOperationOutcome::Failed { blacklist } = batch.validation_failed()? { @@ -763,18 +775,14 @@ impl SyncingChain { failing_batch: *id, }); } - redownload_queue.push(*id); } // no batch maxed out it process attempts, so now the chain's volatile progress must be // reset self.processing_target = self.start_epoch; - for id in redownload_queue { - self.send_batch(network, id)?; - } - // finally, re-request the failed batch. - self.send_batch(network, batch_id) + // finally, re-request the failed batch and all other batches in `AwaitingDownload` state. 
+ self.attempt_send_awaiting_download_batches(network, "handle_invalid_batch") } pub fn stop_syncing(&mut self) { @@ -810,6 +818,9 @@ impl SyncingChain { // advance the chain to the new validating epoch self.advance_chain(network, validating_epoch); + // attempt to download any batches stuck in the `AwaitingDownload` state because of + // a lack of peers earlier + self.attempt_send_awaiting_download_batches(network, "start_syncing")?; if self.optimistic_start.is_none() && optimistic_epoch > self.processing_target && !self.attempted_optimistic_starts.contains(&optimistic_epoch) @@ -939,6 +950,41 @@ impl SyncingChain { } } + /// Attempts to send all batches that are in `AwaitingDownload` state. + /// + /// Batches might get stuck in `AwaitingDownload` post peerdas because of lack of peers + /// in required subnets. We need to progress them if peers are available at a later point. + pub fn attempt_send_awaiting_download_batches( + &mut self, + network: &mut SyncNetworkContext, + src: &str, + ) -> ProcessingResult { + // Collect all batches in AwaitingDownload state and see if they can be sent + let awaiting_downloads: Vec<_> = self + .batches + .iter() + .filter(|(_, batch)| matches!(batch.state(), BatchState::AwaitingDownload)) + .map(|(batch_id, _)| batch_id) + .copied() + .collect(); + debug!( + ?awaiting_downloads, + src, "Attempting to send batches awaiting downlaod" + ); + + for batch_id in awaiting_downloads { + if self.good_peers_on_sampling_subnets(batch_id, network) { + self.send_batch(network, batch_id)?; + } else { + debug!( + src = "attempt_send_awaiting_download_batches", + "Waiting for peers to be available on sampling column subnets" + ); + } + } + Ok(KeepChain) + } + /// Requests the batch assigned to the given id from a given peer. 
pub fn send_batch( &mut self, @@ -1089,14 +1135,16 @@ impl SyncingChain { if !matches!(self.state, ChainSyncingState::Syncing) { return Ok(KeepChain); } - // find the next pending batch and request it from the peer // check if we have the batch for our optimistic start. If not, request it first. // We wait for this batch before requesting any other batches. if let Some(epoch) = self.optimistic_start { if !self.good_peers_on_sampling_subnets(epoch, network) { - debug!("Waiting for peers to be available on sampling column subnets"); + debug!( + src = "request_batches_optimistic", + "Waiting for peers to be available on sampling column subnets" + ); return Ok(KeepChain); } @@ -1105,6 +1153,8 @@ impl SyncingChain { let optimistic_batch = BatchInfo::new(&epoch, EPOCHS_PER_BATCH, batch_type); entry.insert(optimistic_batch); self.send_batch(network, epoch)?; + } else { + self.attempt_send_awaiting_download_batches(network, "request_batches_optimistic")?; } return Ok(KeepChain); } @@ -1132,21 +1182,12 @@ impl SyncingChain { ) -> bool { if network.chain.spec.is_peer_das_enabled_for_epoch(epoch) { // Require peers on all sampling column subnets before sending batches + let sampling_subnets = network.network_globals().sampling_subnets(); network .network_globals() - .sampling_subnets() - .iter() - .all(|subnet_id| { - let peer_db = network.network_globals().peers.read(); - let peer_count = self - .peers - .iter() - .filter(|peer| { - peer_db.is_good_range_sync_custody_subnet_peer(*subnet_id, peer) - }) - .count(); - peer_count > 0 - }) + .peers + .read() + .has_good_custody_range_sync_peer(&sampling_subnets, epoch) } else { true } @@ -1188,7 +1229,10 @@ impl SyncingChain { // block and data column requests are currently coupled. This can be removed once we find a // way to decouple the requests and do retries individually, see issue #6258. 
if !self.good_peers_on_sampling_subnets(self.to_be_downloaded, network) { - debug!("Waiting for peers to be available on custody column subnets"); + debug!( + src = "include_next_batch", + "Waiting for peers to be available on custody column subnets" + ); return None; } diff --git a/beacon_node/network/src/sync/tests/lookups.rs b/beacon_node/network/src/sync/tests/lookups.rs index b5bc10851d..fc64186175 100644 --- a/beacon_node/network/src/sync/tests/lookups.rs +++ b/beacon_node/network/src/sync/tests/lookups.rs @@ -41,8 +41,8 @@ use slot_clock::{SlotClock, TestingSlotClock}; use tokio::sync::mpsc; use tracing::info; use types::{ - BeaconState, BeaconStateBase, BlobSidecar, DataColumnSidecar, EthSpec, ForkContext, ForkName, - Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, + BeaconState, BeaconStateBase, BlobSidecar, BlockImportSource, DataColumnSidecar, EthSpec, + ForkContext, ForkName, Hash256, MinimalEthSpec as E, SignedBeaconBlock, Slot, data_column_sidecar::ColumnIndex, test_utils::{SeedableRng, TestRandom, XorShiftRng}, }; @@ -285,21 +285,21 @@ impl TestRig { ); } - fn insert_failed_chain(&mut self, block_root: Hash256) { - self.sync_manager.insert_failed_chain(block_root); + fn insert_ignored_chain(&mut self, block_root: Hash256) { + self.sync_manager.insert_ignored_chain(block_root); } - fn assert_not_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); - if failed_chains.contains(&chain_hash) { - panic!("failed chains contain {chain_hash:?}: {failed_chains:?}"); + fn assert_not_ignored_chain(&mut self, chain_hash: Hash256) { + let chains = self.sync_manager.get_ignored_chains(); + if chains.contains(&chain_hash) { + panic!("ignored chains contain {chain_hash:?}: {chains:?}"); } } - fn assert_failed_chain(&mut self, chain_hash: Hash256) { - let failed_chains = self.sync_manager.get_failed_chains(); - if !failed_chains.contains(&chain_hash) { - panic!("expected failed chains to contain 
{chain_hash:?}: {failed_chains:?}"); + fn assert_ignored_chain(&mut self, chain_hash: Hash256) { + let chains = self.sync_manager.get_ignored_chains(); + if !chains.contains(&chain_hash) { + panic!("expected ignored chains to contain {chain_hash:?}: {chains:?}"); } } @@ -1021,11 +1021,6 @@ impl TestRig { self.log(&format!("Found expected penalty {penalty_msg}")); } - pub fn expect_single_penalty(&mut self, peer_id: PeerId, expect_penalty_msg: &'static str) { - self.expect_penalty(peer_id, expect_penalty_msg); - self.expect_no_penalty_for(peer_id); - } - pub fn block_with_parent_and_blobs( &mut self, parent_root: Hash256, @@ -1084,7 +1079,7 @@ impl TestRig { .harness .chain .data_availability_checker - .put_pending_executed_block(executed_block) + .put_executed_block(executed_block) .unwrap() { Availability::Available(_) => panic!("block removed from da_checker, available"), @@ -1114,20 +1109,19 @@ impl TestRig { }; } - fn insert_block_to_processing_cache(&mut self, block: Arc>) { + fn insert_block_to_availability_cache(&mut self, block: Arc>) { self.harness .chain - .reqresp_pre_import_cache - .write() - .insert(block.canonical_root(), block); + .data_availability_checker + .put_pre_execution_block(block.canonical_root(), block, BlockImportSource::Gossip) + .unwrap(); } fn simulate_block_gossip_processing_becomes_invalid(&mut self, block_root: Hash256) { self.harness .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); + .data_availability_checker + .remove_block_on_execution_error(&block_root); self.send_sync_message(SyncMessage::GossipBlockProcessResult { block_root, @@ -1140,11 +1134,6 @@ impl TestRig { block: Arc>, ) { let block_root = block.canonical_root(); - self.harness - .chain - .reqresp_pre_import_cache - .write() - .remove(&block_root); self.insert_block_to_da_checker(block); @@ -1461,7 +1450,7 @@ fn test_parent_lookup_too_many_download_attempts_no_blacklist() { // Trigger the request rig.trigger_unknown_parent_block(peer_id, 
block.into()); for i in 1..=PARENT_FAIL_TOLERANCE { - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); let id = rig.expect_block_parent_request(parent_root); if i % 2 != 0 { // The request fails. It should be tried again. @@ -1474,8 +1463,8 @@ fn test_parent_lookup_too_many_download_attempts_no_blacklist() { } } - rig.assert_not_failed_chain(block_root); - rig.assert_not_failed_chain(parent.canonical_root()); + rig.assert_not_ignored_chain(block_root); + rig.assert_not_ignored_chain(parent.canonical_root()); rig.expect_no_active_lookups_empty_network(); } @@ -1500,7 +1489,7 @@ fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { for _ in 0..PROCESSING_FAILURES { let id = rig.expect_block_parent_request(parent_root); // Blobs are only requested in the previous first iteration as this test only retries blocks - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); // send the right parent but fail processing rig.parent_lookup_block_response(id, peer_id, Some(parent.clone().into())); rig.parent_block_processed(block_root, BlockError::BlockSlotLimitReached.into()); @@ -1508,7 +1497,7 @@ fn test_parent_lookup_too_many_processing_attempts_must_blacklist() { rig.expect_penalty(peer_id, "lookup_block_processing_failure"); } - rig.assert_not_failed_chain(block_root); + rig.assert_not_ignored_chain(block_root); rig.expect_no_active_lookups_empty_network(); } @@ -1551,12 +1540,14 @@ fn test_parent_lookup_too_deep_grow_ancestor() { ); // Should not penalize peer, but network is not clear because of the blocks_by_range requests rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(chain_hash); + rig.assert_ignored_chain(chain_hash); } // Regression test for https://github.com/sigp/lighthouse/pull/7118 +// 8042 UPDATE: block was previously added to the failed_chains cache, now it's inserted into the +// ignored chains cache. 
The regression test still applies as the chaild lookup is not created #[test] -fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { +fn test_child_lookup_not_created_for_ignored_chain_parent_after_processing() { // GIVEN: A parent chain longer than PARENT_DEPTH_TOLERANCE. let mut rig = TestRig::test_setup(); let mut blocks = rig.rand_blockchain(PARENT_DEPTH_TOLERANCE + 1); @@ -1586,8 +1577,8 @@ fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { } // At this point, the chain should have been deemed too deep and pruned. - // The tip root should have been inserted into failed chains. - rig.assert_failed_chain(tip_root); + // The tip root should have been inserted into ignored chains. + rig.assert_ignored_chain(tip_root); rig.expect_no_penalty_for(peer_id); // WHEN: Trigger the extending block that points to the tip. @@ -1604,10 +1595,10 @@ fn test_child_lookup_not_created_for_failed_chain_parent_after_processing() { }), ); - // THEN: The extending block should not create a lookup because the tip was inserted into failed chains. + // THEN: The extending block should not create a lookup because the tip was inserted into + // ignored chains. rig.expect_no_active_lookups(); - // AND: The peer should be penalized for extending a failed chain. 
- rig.expect_single_penalty(peer_id, "failed_chain"); + rig.expect_no_penalty_for(peer_id); rig.expect_empty_network(); } @@ -1646,7 +1637,7 @@ fn test_parent_lookup_too_deep_grow_tip() { ); // Should not penalize peer, but network is not clear because of the blocks_by_range requests rig.expect_no_penalty_for(peer_id); - rig.assert_failed_chain(tip.canonical_root()); + rig.assert_ignored_chain(tip.canonical_root()); } #[test] @@ -1699,15 +1690,14 @@ fn test_lookup_add_peers_to_parent() { } #[test] -fn test_skip_creating_failed_parent_lookup() { +fn test_skip_creating_ignored_parent_lookup() { let mut rig = TestRig::test_setup(); let (_, block, parent_root, _) = rig.rand_block_and_parent(); let peer_id = rig.new_connected_peer(); - rig.insert_failed_chain(parent_root); + rig.insert_ignored_chain(parent_root); rig.trigger_unknown_parent_block(peer_id, block.into()); - // Expect single penalty for peer, despite dropping two lookups - rig.expect_single_penalty(peer_id, "failed_chain"); - // Both current and parent lookup should be rejected + rig.expect_no_penalty_for(peer_id); + // Both current and parent lookup should not be created rig.expect_no_active_lookups(); } @@ -1845,7 +1835,7 @@ fn block_in_processing_cache_becomes_invalid() { let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); let block_root = block.canonical_root(); let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); + r.insert_block_to_availability_cache(block.clone().into()); r.trigger_unknown_block_from_attestation(block_root, peer_id); // Should trigger blob request let id = r.expect_blob_lookup_request(block_root); @@ -1871,7 +1861,7 @@ fn block_in_processing_cache_becomes_valid_imported() { let (block, blobs) = r.rand_block_and_blobs(NumBlobs::Number(1)); let block_root = block.canonical_root(); let peer_id = r.new_connected_peer(); - r.insert_block_to_processing_cache(block.clone().into()); + 
r.insert_block_to_availability_cache(block.clone().into()); r.trigger_unknown_block_from_attestation(block_root, peer_id); // Should trigger blob request let id = r.expect_blob_lookup_request(block_root); diff --git a/beacon_node/operation_pool/src/lib.rs b/beacon_node/operation_pool/src/lib.rs index dd01f568fa..24e2cfbbb5 100644 --- a/beacon_node/operation_pool/src/lib.rs +++ b/beacon_node/operation_pool/src/lib.rs @@ -457,32 +457,35 @@ impl OperationPool { .collect() } - /// Prune proposer slashings for validators which are exited in the finalized epoch. - pub fn prune_proposer_slashings(&self, head_state: &BeaconState) { + /// Prune proposer slashings for validators which are already slashed or exited in the finalized + /// epoch. + pub fn prune_proposer_slashings(&self, finalized_state: &BeaconState) { prune_validator_hash_map( &mut self.proposer_slashings.write(), - |_, validator| validator.exit_epoch <= head_state.finalized_checkpoint().epoch, - head_state, + |_, validator| { + validator.slashed || validator.exit_epoch <= finalized_state.current_epoch() + }, + finalized_state, ); } /// Prune attester slashings for all slashed or withdrawn validators, or attestations on another /// fork. - pub fn prune_attester_slashings(&self, head_state: &BeaconState) { + pub fn prune_attester_slashings(&self, finalized_state: &BeaconState) { self.attester_slashings.write().retain(|slashing| { // Check that the attestation's signature is still valid wrt the fork version. - let signature_ok = slashing.signature_is_still_valid(&head_state.fork()); + // We might be a bit slower to detect signature staleness by using the finalized state + // here, but we filter when proposing anyway, so in the worst case we just keep some + // stuff around until we finalize. + let signature_ok = slashing.signature_is_still_valid(&finalized_state.fork()); // Slashings that don't slash any validators can also be dropped. 
let slashing_ok = get_slashable_indices_modular( - head_state, + finalized_state, slashing.as_inner().to_ref(), |_, validator| { - // Declare that a validator is still slashable if they have not exited prior - // to the finalized epoch. - // - // We cannot check the `slashed` field since the `head` is not finalized and - // a fork could un-slash someone. - validator.exit_epoch > head_state.finalized_checkpoint().epoch + // Declare that a validator is still slashable if they have not been slashed in + // the finalized state, and have not exited at the finalized epoch. + !validator.slashed && validator.exit_epoch > finalized_state.current_epoch() }, ) .is_ok_and(|indices| !indices.is_empty()); @@ -531,17 +534,12 @@ impl OperationPool { ) } - /// Prune if validator has already exited at or before the finalized checkpoint of the head. - pub fn prune_voluntary_exits(&self, head_state: &BeaconState) { + /// Prune if validator has already exited in the finalized state. + pub fn prune_voluntary_exits(&self, finalized_state: &BeaconState, spec: &ChainSpec) { prune_validator_hash_map( &mut self.voluntary_exits.write(), - // This condition is slightly too loose, since there will be some finalized exits that - // are missed here. - // - // We choose simplicity over the gain of pruning more exits since they are small and - // should not be seen frequently. 
- |_, validator| validator.exit_epoch <= head_state.finalized_checkpoint().epoch, - head_state, + |_, validator| validator.exit_epoch != spec.far_future_epoch, + finalized_state, ); } @@ -642,14 +640,15 @@ impl OperationPool { &self, head_block: &SignedBeaconBlock, head_state: &BeaconState, + finalized_state: &BeaconState, current_epoch: Epoch, spec: &ChainSpec, ) { self.prune_attestations(current_epoch); self.prune_sync_contributions(head_state.slot()); - self.prune_proposer_slashings(head_state); - self.prune_attester_slashings(head_state); - self.prune_voluntary_exits(head_state); + self.prune_proposer_slashings(finalized_state); + self.prune_attester_slashings(finalized_state); + self.prune_voluntary_exits(finalized_state, spec); self.prune_bls_to_execution_changes(head_block, head_state, spec); } @@ -758,14 +757,14 @@ where fn prune_validator_hash_map( map: &mut HashMap>, prune_if: F, - head_state: &BeaconState, + state: &BeaconState, ) where F: Fn(u64, &Validator) -> bool, T: VerifyOperation, { map.retain(|&validator_index, op| { - op.signature_is_still_valid(&head_state.fork()) - && head_state + op.signature_is_still_valid(&state.fork()) + && state .validators() .get(validator_index as usize) .is_none_or(|validator| !prune_if(validator_index, validator)) diff --git a/beacon_node/src/cli.rs b/beacon_node/src/cli.rs index 386eb721a0..2e3b3fde4b 100644 --- a/beacon_node/src/cli.rs +++ b/beacon_node/src/cli.rs @@ -47,16 +47,17 @@ pub fn cli_app() -> Command { * Network parameters. */ .arg( - Arg::new("subscribe-all-data-column-subnets") - .long("subscribe-all-data-column-subnets") + Arg::new("supernode") + .long("supernode") + .alias("subscribe-all-data-column-subnets") .action(ArgAction::SetTrue) .help_heading(FLAG_HEADER) - .help("Subscribe to all data column subnets and participate in data custody for \ - all columns. This will also advertise the beacon node as being long-lived \ - subscribed to all data column subnets. 
\ - NOTE: this is an experimental flag and may change any time without notice!") + .help("Run as a voluntary supernode. This node will subscribe to all data column \ + subnets, custody all data columns, and perform reconstruction and cross-seeding. \ + This requires significantly more bandwidth, storage, and computation requirements but \ + the node will have direct access to all blobs via the beacon API and it \ + helps network resilience by serving all data columns to syncing peers.") .display_order(0) - .hide(true) ) .arg( // TODO(das): remove this before PeerDAS release @@ -401,6 +402,16 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) + .arg( + Arg::new("complete-blob-backfill") + .long("complete-blob-backfill") + .help("Download all blobs back to the Deneb fork epoch. This will likely result in \ + the node banning most of its peers.") + .action(ArgAction::SetTrue) + .help_heading(FLAG_HEADER) + .display_order(0) + .hide(true) + ) .arg( Arg::new("enable-private-discovery") .long("enable-private-discovery") @@ -688,38 +699,6 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) - - /* - * Eth1 Integration - */ - .arg( - Arg::new("eth1-purge-cache") - .long("eth1-purge-cache") - .value_name("PURGE-CACHE") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - .help_heading(FLAG_HEADER) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("eth1-blocks-per-log-query") - .long("eth1-blocks-per-log-query") - .value_name("BLOCKS") - .help("DEPRECATED") - .action(ArgAction::Set) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("eth1-cache-follow-distance") - .long("eth1-cache-follow-distance") - .value_name("BLOCKS") - .help("DEPRECATED") - .action(ArgAction::Set) - .display_order(0) - .hide(true) - ) .arg( Arg::new("slots-per-restore-point") .long("slots-per-restore-point") @@ -769,7 +748,7 @@ pub fn cli_app() -> Command { .long("block-cache-size") .value_name("SIZE") .help("Specifies how many 
blocks the database should cache in memory") - .default_value("5") + .default_value("0") .action(ArgAction::Set) .display_order(0) ) @@ -1487,16 +1466,6 @@ pub fn cli_app() -> Command { .help_heading(FLAG_HEADER) .display_order(0) ) - .arg( - Arg::new("disable-deposit-contract-sync") - .long("disable-deposit-contract-sync") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - .help_heading(FLAG_HEADER) - .conflicts_with("staking") - .display_order(0) - .hide(true) - ) .arg( Arg::new("disable-optimistic-finalized-sync") .long("disable-optimistic-finalized-sync") @@ -1507,15 +1476,6 @@ pub fn cli_app() -> Command { Lighthouse and only passed to the EL if initial verification fails.") .display_order(0) ) - .arg( - Arg::new("light-client-server") - .long("light-client-server") - .help("DEPRECATED") - .action(ArgAction::SetTrue) - - .help_heading(FLAG_HEADER) - .display_order(0) - ) .arg( Arg::new("disable-light-client-server") .long("disable-light-client-server") diff --git a/beacon_node/src/config.rs b/beacon_node/src/config.rs index 7e4b77e9aa..c2599ec0cd 100644 --- a/beacon_node/src/config.rs +++ b/beacon_node/src/config.rs @@ -13,8 +13,8 @@ use directory::{DEFAULT_BEACON_NODE_DIR, DEFAULT_NETWORK_DIR, DEFAULT_ROOT_DIR}; use environment::RuntimeContext; use execution_layer::DEFAULT_JWT_FILE; use http_api::TlsConfig; -use lighthouse_network::ListenAddress; use lighthouse_network::{Enr, Multiaddr, NetworkConfig, PeerIdSerialized, multiaddr::Protocol}; +use network_utils::listen_addr::ListenAddress; use sensitive_url::SensitiveUrl; use std::collections::HashSet; use std::fmt::Debug; @@ -170,13 +170,6 @@ pub fn get_config( parse_required(cli_args, "http-duplicate-block-status")?; } - if cli_args.get_flag("light-client-server") { - warn!( - "The --light-client-server flag is deprecated. 
The light client server is enabled \ - by default" - ); - } - if cli_args.get_flag("disable-light-client-server") { client_config.chain.enable_light_client_server = false; } @@ -262,24 +255,6 @@ pub fn get_config( client_config.http_metrics.allocator_metrics_enabled = false; } - /* - * Deprecated Eth1 flags (can be removed in the next minor release after v7.1.0) - */ - if cli_args - .get_one::("eth1-blocks-per-log-query") - .is_some() - { - warn!("The eth1-blocks-per-log-query flag is deprecated"); - } - - if cli_args.get_flag("eth1-purge-cache") { - warn!("The eth1-purge-cache flag is deprecated"); - } - - if clap_utils::parse_optional::(cli_args, "eth1-cache-follow-distance")?.is_some() { - warn!("The eth1-cache-follow-distance flag is deprecated"); - } - // `--execution-endpoint` is required now. let endpoints: String = clap_utils::parse_required(cli_args, "execution-endpoint")?; let mut el_config = execution_layer::Config::default(); @@ -773,10 +748,6 @@ pub fn get_config( } } - if cli_args.get_flag("disable-deposit-contract-sync") { - warn!("The disable-deposit-contract-sync flag is deprecated"); - } - client_config.chain.prepare_payload_lookahead = clap_utils::parse_optional(cli_args, "prepare-payload-lookahead")? .map(Duration::from_millis) @@ -825,6 +796,14 @@ pub fn get_config( client_config.chain.genesis_backfill = true; } + client_config.chain.complete_blob_backfill = cli_args.get_flag("complete-blob-backfill"); + + // Ensure `prune_blobs` is false whenever complete-blob-backfill is set. This overrides any + // setting of `--prune-blobs true` applied earlier in flag parsing. 
+ if client_config.chain.complete_blob_backfill { + client_config.store.prune_blobs = false; + } + // Backfill sync rate-limiting client_config.beacon_processor.enable_backfill_rate_limiting = !cli_args.get_flag("disable-backfill-rate-limiting"); @@ -1011,7 +990,7 @@ pub fn parse_listening_addresses(cli_args: &ArgMatches) -> Result Result Result Result Result Result, Cold: ItemStore> { /// The hot database also contains all blocks. pub hot_db: Hot, /// LRU cache of deserialized blocks and blobs. Updated whenever a block or blob is loaded. - block_cache: Mutex>, + block_cache: Option>>, /// Cache of beacon states. /// /// LOCK ORDERING: this lock must always be locked *after* the `split` if both are required. @@ -229,7 +229,9 @@ impl HotColdDB, MemoryStore> { cold_db: MemoryStore::open(), blobs_db: MemoryStore::open(), hot_db: MemoryStore::open(), - block_cache: Mutex::new(BlockCache::new(config.block_cache_size)), + block_cache: NonZeroUsize::new(config.block_cache_size) + .map(BlockCache::new) + .map(Mutex::new), state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, @@ -281,7 +283,9 @@ impl HotColdDB, BeaconNodeBackend> { blobs_db: BeaconNodeBackend::open(&config, blobs_db_path)?, cold_db: BeaconNodeBackend::open(&config, cold_path)?, hot_db, - block_cache: Mutex::new(BlockCache::new(config.block_cache_size)), + block_cache: NonZeroUsize::new(config.block_cache_size) + .map(BlockCache::new) + .map(Mutex::new), state_cache: Mutex::new(StateCache::new( config.state_cache_size, config.state_cache_headroom, @@ -488,14 +492,17 @@ impl, Cold: ItemStore> HotColdDB pub fn register_metrics(&self) { let hsc_metrics = self.historic_state_cache.lock().metrics(); - metrics::set_gauge( - &metrics::STORE_BEACON_BLOCK_CACHE_SIZE, - self.block_cache.lock().block_cache.len() as i64, - ); - metrics::set_gauge( - &metrics::STORE_BEACON_BLOB_CACHE_SIZE, - self.block_cache.lock().blob_cache.len() as i64, - ); + if let Some(block_cache) = 
&self.block_cache { + let cache = block_cache.lock(); + metrics::set_gauge( + &metrics::STORE_BEACON_BLOCK_CACHE_SIZE, + cache.block_cache.len() as i64, + ); + metrics::set_gauge( + &metrics::STORE_BEACON_BLOB_CACHE_SIZE, + cache.blob_cache.len() as i64, + ); + } let state_cache = self.state_cache.lock(); metrics::set_gauge( &metrics::STORE_BEACON_STATE_CACHE_SIZE, @@ -553,7 +560,9 @@ impl, Cold: ItemStore> HotColdDB let block = self.block_as_kv_store_ops(block_root, block, &mut ops)?; self.hot_db.do_atomically(ops)?; // Update cache. - self.block_cache.lock().put_block(*block_root, block); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().put_block(*block_root, block)); Ok(()) } @@ -605,7 +614,9 @@ impl, Cold: ItemStore> HotColdDB metrics::inc_counter(&metrics::BEACON_BLOCK_GET_COUNT); // Check the cache. - if let Some(block) = self.block_cache.lock().get_block(block_root) { + if let Some(cache) = &self.block_cache + && let Some(block) = cache.lock().get_block(block_root) + { metrics::inc_counter(&metrics::BEACON_BLOCK_CACHE_HIT_COUNT); return Ok(Some(DatabaseBlock::Full(block.clone()))); } @@ -630,8 +641,8 @@ impl, Cold: ItemStore> HotColdDB // Add to cache. self.block_cache - .lock() - .put_block(*block_root, full_block.clone()); + .as_ref() + .inspect(|cache| cache.lock().put_block(*block_root, full_block.clone())); DatabaseBlock::Full(full_block) } else if !self.config.prune_payloads { @@ -656,6 +667,7 @@ impl, Cold: ItemStore> HotColdDB } /// Fetch a full block with execution payload from the store. + #[instrument(skip_all)] pub fn get_full_block( &self, block_root: &Hash256, @@ -901,7 +913,9 @@ impl, Cold: ItemStore> HotColdDB /// Delete a block from the store and the block cache. 
pub fn delete_block(&self, block_root: &Hash256) -> Result<(), Error> { - self.block_cache.lock().delete(block_root); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().delete(block_root)); self.hot_db .key_delete(DBColumn::BeaconBlock, block_root.as_slice())?; self.hot_db @@ -916,7 +930,9 @@ impl, Cold: ItemStore> HotColdDB block_root.as_slice(), &blobs.as_ssz_bytes(), )?; - self.block_cache.lock().put_blobs(*block_root, blobs); + self.block_cache + .as_ref() + .inspect(|cache| cache.lock().put_blobs(*block_root, blobs)); Ok(()) } @@ -944,9 +960,11 @@ impl, Cold: ItemStore> HotColdDB self.blobs_db .put(&DATA_COLUMN_CUSTODY_INFO_KEY, &data_column_custody_info)?; - self.block_cache - .lock() - .put_data_column_custody_info(Some(data_column_custody_info)); + self.block_cache.as_ref().inspect(|cache| { + cache + .lock() + .put_data_column_custody_info(Some(data_column_custody_info)) + }); Ok(()) } @@ -963,8 +981,8 @@ impl, Cold: ItemStore> HotColdDB &data_column.as_ssz_bytes(), )?; self.block_cache - .lock() - .put_data_column(*block_root, data_column); + .as_ref() + .inspect(|cache| cache.lock().put_data_column(*block_root, data_column)); } Ok(()) } @@ -1040,7 +1058,7 @@ impl, Cold: ItemStore> HotColdDB /// - `result_state_root == state.canonical_root()` /// - `state.slot() <= max_slot` /// - `state.get_latest_block_root(result_state_root) == block_root` - #[instrument(skip(self, max_slot), level = "debug")] + #[instrument(skip_all, fields(?block_root, %max_slot, ?state_root), level = "debug")] pub fn get_advanced_hot_state( &self, block_root: Hash256, @@ -1112,7 +1130,7 @@ impl, Cold: ItemStore> HotColdDB /// If this function returns `Some(state)` then that `state` will always have /// `latest_block_header` matching `block_root` but may not be advanced all the way through to /// `max_slot`. 
- #[instrument(skip(self), level = "debug")] + #[instrument(skip_all, fields(?block_root, %max_slot), level = "debug")] pub fn get_advanced_hot_state_from_cache( &self, block_root: Hash256, @@ -1398,7 +1416,7 @@ impl, Cold: ItemStore> HotColdDB // Update database whilst holding a lock on cache, to ensure that the cache updates // atomically with the database. - let mut guard = self.block_cache.lock(); + let guard = self.block_cache.as_ref().map(|cache| cache.lock()); let blob_cache_ops = blobs_ops.clone(); // Try to execute blobs store ops. @@ -1445,56 +1463,67 @@ impl, Cold: ItemStore> HotColdDB return Err(e); } - for op in hot_db_cache_ops { + // Delete from the state cache. + for op in &hot_db_cache_ops { match op { - StoreOp::PutBlock(block_root, block) => { - guard.put_block(block_root, (*block).clone()); - } - - StoreOp::PutBlobs(_, _) => (), - - StoreOp::PutDataColumns(_, _) => (), - - StoreOp::PutState(_, _) => (), - - StoreOp::PutStateSummary(_, _) => (), - StoreOp::DeleteBlock(block_root) => { - guard.delete_block(&block_root); - self.state_cache.lock().delete_block_states(&block_root); + self.state_cache.lock().delete_block_states(block_root); } - StoreOp::DeleteState(state_root, _) => { - self.state_cache.lock().delete_state(&state_root) + self.state_cache.lock().delete_state(state_root) } - - StoreOp::DeleteBlobs(_) => (), - - StoreOp::DeleteDataColumns(_, _) => (), - - StoreOp::DeleteExecutionPayload(_) => (), - - StoreOp::DeleteSyncCommitteeBranch(_) => (), - - StoreOp::KeyValueOp(_) => (), - } - } - - for op in blob_cache_ops { - match op { - StoreOp::PutBlobs(block_root, blobs) => { - guard.put_blobs(block_root, blobs); - } - - StoreOp::DeleteBlobs(block_root) => { - guard.delete_blobs(&block_root); - } - _ => (), } } - drop(guard); + // If the block cache is enabled, also delete from the block cache. 
+ if let Some(mut guard) = guard { + for op in hot_db_cache_ops { + match op { + StoreOp::PutBlock(block_root, block) => { + guard.put_block(block_root, (*block).clone()); + } + + StoreOp::PutBlobs(_, _) => (), + + StoreOp::PutDataColumns(_, _) => (), + + StoreOp::PutState(_, _) => (), + + StoreOp::PutStateSummary(_, _) => (), + + StoreOp::DeleteBlock(block_root) => { + guard.delete_block(&block_root); + } + + StoreOp::DeleteState(_, _) => (), + + StoreOp::DeleteBlobs(_) => (), + + StoreOp::DeleteDataColumns(_, _) => (), + + StoreOp::DeleteExecutionPayload(_) => (), + + StoreOp::DeleteSyncCommitteeBranch(_) => (), + + StoreOp::KeyValueOp(_) => (), + } + } + + for op in blob_cache_ops { + match op { + StoreOp::PutBlobs(block_root, blobs) => { + guard.put_blobs(block_root, blobs); + } + + StoreOp::DeleteBlobs(block_root) => { + guard.delete_blobs(&block_root); + } + + _ => (), + } + } + } Ok(()) } @@ -2424,21 +2453,23 @@ impl, Cold: ItemStore> HotColdDB /// If custody info doesn't exist in the cache, /// try to fetch from the DB and prime the cache. 
pub fn get_data_column_custody_info(&self) -> Result, Error> { - let Some(data_column_custody_info) = self.block_cache.lock().get_data_column_custody_info() - else { - let data_column_custody_info = self - .blobs_db - .get::(&DATA_COLUMN_CUSTODY_INFO_KEY)?; + if let Some(cache) = &self.block_cache + && let Some(data_column_custody_info) = cache.lock().get_data_column_custody_info() + { + return Ok(Some(data_column_custody_info)); + } + let data_column_custody_info = self + .blobs_db + .get::(&DATA_COLUMN_CUSTODY_INFO_KEY)?; - // Update the cache - self.block_cache + // Update the cache + self.block_cache.as_ref().inspect(|cache| { + cache .lock() - .put_data_column_custody_info(data_column_custody_info.clone()); + .put_data_column_custody_info(data_column_custody_info.clone()) + }); - return Ok(data_column_custody_info); - }; - - Ok(Some(data_column_custody_info)) + Ok(data_column_custody_info) } /// Fetch all columns for a given block from the store. @@ -2459,9 +2490,13 @@ impl, Cold: ItemStore> HotColdDB /// Fetch blobs for a given block from the store. pub fn get_blobs(&self, block_root: &Hash256) -> Result, Error> { // Check the cache. - if let Some(blobs) = self.block_cache.lock().get_blobs(block_root) { + if let Some(blobs) = self + .block_cache + .as_ref() + .and_then(|cache| cache.lock().get_blobs(block_root).cloned()) + { metrics::inc_counter(&metrics::BEACON_BLOBS_CACHE_HIT_COUNT); - return Ok(blobs.clone().into()); + return Ok(blobs.into()); } match self @@ -2480,8 +2515,8 @@ impl, Cold: ItemStore> HotColdDB { let blobs = BlobSidecarList::new(blobs, max_blobs_per_block as usize)?; self.block_cache - .lock() - .put_blobs(*block_root, blobs.clone()); + .as_ref() + .inspect(|cache| cache.lock().put_blobs(*block_root, blobs.clone())); Ok(BlobSidecarListFromRoot::Blobs(blobs)) } else { @@ -2514,8 +2549,8 @@ impl, Cold: ItemStore> HotColdDB // Check the cache. 
if let Some(data_column) = self .block_cache - .lock() - .get_data_column(block_root, column_index) + .as_ref() + .and_then(|cache| cache.lock().get_data_column(block_root, column_index)) { metrics::inc_counter(&metrics::BEACON_DATA_COLUMNS_CACHE_HIT_COUNT); return Ok(Some(data_column)); @@ -2527,9 +2562,11 @@ impl, Cold: ItemStore> HotColdDB )? { Some(ref data_column_bytes) => { let data_column = Arc::new(DataColumnSidecar::from_ssz_bytes(data_column_bytes)?); - self.block_cache - .lock() - .put_data_column(*block_root, data_column.clone()); + self.block_cache.as_ref().inspect(|cache| { + cache + .lock() + .put_data_column(*block_root, data_column.clone()) + }); Ok(Some(data_column)) } None => Ok(None), @@ -3263,11 +3300,11 @@ impl, Cold: ItemStore> HotColdDB } // Remove deleted blobs from the cache. - let mut block_cache = self.block_cache.lock(); - for block_root in removed_block_roots { - block_cache.delete_blobs(&block_root); + if let Some(mut block_cache) = self.block_cache.as_ref().map(|cache| cache.lock()) { + for block_root in removed_block_roots { + block_cache.delete_blobs(&block_root); + } } - drop(block_cache); let new_blob_info = BlobInfo { oldest_blob_slot: Some(end_slot + 1), diff --git a/beacon_node/store/src/state_cache.rs b/beacon_node/store/src/state_cache.rs index 05930c7b71..4b0d1ee016 100644 --- a/beacon_node/store/src/state_cache.rs +++ b/beacon_node/store/src/state_cache.rs @@ -299,7 +299,7 @@ impl StateCache { None } - #[instrument(skip(self), level = "debug")] + #[instrument(skip_all, fields(?block_root, %slot), level = "debug")] pub fn get_by_block_root( &mut self, block_root: Hash256, diff --git a/book/src/advanced_database_migrations.md b/book/src/advanced_database_migrations.md index e29397619c..3552a90b0e 100644 --- a/book/src/advanced_database_migrations.md +++ b/book/src/advanced_database_migrations.md @@ -17,6 +17,7 @@ validator client or the slasher**. | Lighthouse version | Release date | Schema version | Downgrade available? 
| |--------------------|--------------|----------------|----------------------| +| v8.0.0-rc.0 | Sep 2025 | v28 | yes before Fulu | | v7.1.0 | Jul 2025 | v26 | yes | | v7.0.0 | Apr 2025 | v22 | no | | v6.0.0 | Nov 2024 | v22 | no | @@ -207,6 +208,7 @@ Here are the steps to prune historic states: | Lighthouse version | Release date | Schema version | Downgrade available? | |--------------------|--------------|----------------|-------------------------------------| +| v8.0.0-rc.0 | Sep 2025 | v28 | yes before Fulu | | v7.1.0 | Jul 2025 | v26 | yes | | v7.0.0 | Apr 2025 | v22 | no | | v6.0.0 | Nov 2024 | v22 | no | diff --git a/book/src/contributing_setup.md b/book/src/contributing_setup.md index 7143c8f0fb..b817faad87 100644 --- a/book/src/contributing_setup.md +++ b/book/src/contributing_setup.md @@ -26,7 +26,7 @@ you can run them locally and avoid CI failures: - `$ make cargo-fmt`: (fast) runs a Rust code formatting check. - `$ make lint`: (fast) runs a Rust code linter. -- `$ make test`: (medium) runs unit tests across the whole project. +- `$ make test`: (medium) runs unit tests across the whole project using nextest. - `$ make test-ef`: (medium) runs the Ethereum Foundation test vectors. - `$ make test-full`: (slow) runs the full test suite (including all previous commands). This is approximately everything @@ -36,88 +36,39 @@ _The lighthouse test suite is quite extensive, running the whole suite may take ## Testing -As with most other Rust projects, Lighthouse uses `cargo test` for unit and -integration tests. For example, to test the `ssz` crate run: +Lighthouse uses `cargo nextest` for unit and integration tests. Nextest provides better parallelization and is used by CI. For example, to test the `safe_arith` crate run: ```bash -$ cd consensus/ssz -$ cargo test - Finished test [unoptimized + debuginfo] target(s) in 7.69s - Running unittests (target/debug/deps/ssz-61fc26760142b3c4) - -running 27 tests -test decode::impls::tests::awkward_fixed_length_portion ... 
ok -test decode::impls::tests::invalid_h256 ... ok - -test encode::tests::test_encode_length ... ok -test encode::impls::tests::vec_of_vec_of_u8 ... ok -test encode::tests::test_encode_length_above_max_debug_panics - should panic ... ok - -test result: ok. 27 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Running tests/tests.rs (target/debug/deps/tests-f8fb1f9ccb197bf4) - -running 20 tests -test round_trip::bool ... ok -test round_trip::first_offset_skips_byte ... ok -test round_trip::fixed_len_excess_bytes ... ok - -test round_trip::vec_u16 ... ok -test round_trip::vec_of_vec_u16 ... ok - -test result: ok. 20 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Doc-tests ssz - -running 3 tests -test src/decode.rs - decode::SszDecoder (line 258) ... ok -test src/encode.rs - encode::SszEncoder (line 57) ... ok -test src/lib.rs - (line 10) ... ok - -test result: ok. 3 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.15s$ cargo test -p eth2_ssz +$ cd consensus/safe_arith +$ cargo nextest run + Finished test [unoptimized + debuginfo] target(s) in 0.43s + ------------ + Nextest run ID: 01234567-89ab-cdef-0123-456789abcdef + Starting 8 tests across 1 binary + PASS [ 0.001s] safe_arith tests::test_safe_add_u64 + PASS [ 0.001s] safe_arith tests::test_safe_mul_u64 + + ------------ + Summary [ 0.012s] 8 tests run: 8 passed, 0 skipped ``` -Alternatively, since `lighthouse` is a cargo workspace you can use `-p eth2_ssz` where -`eth2_ssz` is the package name as defined `/consensus/ssz/Cargo.toml` +Alternatively, since `lighthouse` is a cargo workspace you can use `-p safe_arith` where +`safe_arith` is the package name as defined in `/consensus/safe_arith/Cargo.toml`: ```bash -$ head -2 consensus/ssz/Cargo.toml +$ head -2 consensus/safe_arith/Cargo.toml [package] -name = "eth2_ssz" -$ cargo test -p eth2_ssz - Finished test [unoptimized + debuginfo] target(s) in 7.69s - Running unittests 
(target/debug/deps/ssz-61fc26760142b3c4) - -running 27 tests -test decode::impls::tests::awkward_fixed_length_portion ... ok -test decode::impls::tests::invalid_h256 ... ok - -test encode::tests::test_encode_length ... ok -test encode::impls::tests::vec_of_vec_of_u8 ... ok -test encode::tests::test_encode_length_above_max_debug_panics - should panic ... ok - -test result: ok. 27 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Running tests/tests.rs (target/debug/deps/tests-f8fb1f9ccb197bf4) - -running 20 tests -test round_trip::bool ... ok -test round_trip::first_offset_skips_byte ... ok -test round_trip::fixed_len_excess_bytes ... ok - -test round_trip::vec_u16 ... ok -test round_trip::vec_of_vec_u16 ... ok - -test result: ok. 20 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s - - Doc-tests ssz - -running 3 tests -test src/decode.rs - decode::SszDecoder (line 258) ... ok -test src/encode.rs - encode::SszEncoder (line 57) ... ok -test src/lib.rs - (line 10) ... ok - -test result: ok. 3 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.15s$ cargo test -p eth2_ssz +name = "safe_arith" +$ cargo nextest run -p safe_arith + Finished test [unoptimized + debuginfo] target(s) in 0.43s + ------------ + Nextest run ID: 01234567-89ab-cdef-0123-456789abcdef + Starting 8 tests across 1 binary + PASS [ 0.001s] safe_arith tests::test_safe_add_u64 + PASS [ 0.001s] safe_arith tests::test_safe_mul_u64 + + ------------ + Summary [ 0.012s] 8 tests run: 8 passed, 0 skipped ``` ### test_logger @@ -129,7 +80,7 @@ testing the logs are displayed. This can be very helpful while debugging tests. 
Example: ``` -$ cargo test -p beacon_chain validator_pubkey_cache::test::basic_operation --features 'logging/test_logger' +$ cargo nextest run -p beacon_chain -E 'test(validator_pubkey_cache::test::basic_operation)' --features 'logging/test_logger' Finished test [unoptimized + debuginfo] target(s) in 0.20s Running unittests (target/debug/deps/beacon_chain-975363824f1143bc) diff --git a/book/src/help_bn.md b/book/src/help_bn.md index ea02b39bee..6680202a27 100644 --- a/book/src/help_bn.md +++ b/book/src/help_bn.md @@ -22,7 +22,7 @@ Options: Data directory for the blobs database. --block-cache-size Specifies how many blocks the database should cache in memory - [default: 5] + [default: 0] --boot-nodes One or more comma-delimited base64-encoded ENR's to bootstrap the p2p network. Multiaddr is also supported. @@ -513,8 +513,6 @@ Flags: subscriptions. This will only import attestations from already-subscribed subnets, use with --subscribe-all-subnets to ensure all attestations are received for import. - --light-client-server - DEPRECATED --log-color [] Enables/Disables colors for logs in terminal. Set it to false to disable colors. [default: true] [possible values: true, false] @@ -571,6 +569,13 @@ Flags: Subscribe to all subnets regardless of validator count. This will also advertise the beacon node as being long-lived subscribed to all subnets. + --supernode + Run as a voluntary supernode. This node will subscribe to all data + column subnets, custody all data columns, and perform reconstruction + and cross-seeding. This requires significantly more bandwidth, + storage, and computation requirements but the node will have direct + access to all blobs via the beacon API and it helps network resilience + by serving all data columns to syncing peers. --validator-monitor-auto Enables the automatic detection and monitoring of validators connected to the HTTP API and using the subnet subscription endpoint. 
This diff --git a/boot_node/Cargo.toml b/boot_node/Cargo.toml index 07513d6ab2..aedd57dd4b 100644 --- a/boot_node/Cargo.toml +++ b/boot_node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "boot_node" -version = "7.1.0" +version = "8.0.0-rc.1" authors = ["Sigma Prime "] edition = { workspace = true } @@ -15,6 +15,7 @@ hex = { workspace = true } lighthouse_network = { workspace = true } log = { workspace = true } logging = { workspace = true } +network_utils = { workspace = true } serde = { workspace = true } tokio = { workspace = true } tracing = { workspace = true } diff --git a/boot_node/src/config.rs b/boot_node/src/config.rs index 1cb4e34381..fb0daf5264 100644 --- a/boot_node/src/config.rs +++ b/boot_node/src/config.rs @@ -4,10 +4,11 @@ use clap::ArgMatches; use eth2_network_config::Eth2NetworkConfig; use lighthouse_network::discv5::{self, Enr, enr::CombinedKey}; use lighthouse_network::{ - CombinedKeyExt, NetworkConfig, + NetworkConfig, discovery::{load_enr_from_disk, use_or_load_enr}, load_private_key, }; +use network_utils::enr_ext::CombinedKeyExt; use serde::{Deserialize, Serialize}; use ssz::Encode; use std::net::{SocketAddrV4, SocketAddrV6}; diff --git a/boot_node/src/server.rs b/boot_node/src/server.rs index 5bd4ef10a4..fce734bd70 100644 --- a/boot_node/src/server.rs +++ b/boot_node/src/server.rs @@ -5,9 +5,10 @@ use crate::config::BootNodeConfigSerialization; use clap::ArgMatches; use eth2_network_config::Eth2NetworkConfig; use lighthouse_network::{ - EnrExt, Eth2Enr, + Eth2Enr, discv5::{self, Discv5, enr::NodeId}, }; +use network_utils::enr_ext::EnrExt; use tracing::{info, warn}; use types::EthSpec; diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000000..dabcbe8bf5 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,7 @@ +# Disallow preliminary slashing checks, +disallowed-methods = [ + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_block_proposal", reason = "not safe for slashing checks", replacement = 
"slashing_protection::slashing_database::SlashingDatabase::check_and_insert_block_proposal" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_block_signing_root", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_block_signing_root" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_attestation", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_attestation" }, + { path = "slashing_protection::slashing_database::SlashingDatabase::preliminary_check_attestation_signing_root", reason = "not safe for slashing checks", replacement = "slashing_protection::slashing_database::SlashingDatabase::check_and_insert_attestation_signing_root" }, +] diff --git a/common/eth2/src/lib.rs b/common/eth2/src/lib.rs index 9709b0631f..0423794d0d 100644 --- a/common/eth2/src/lib.rs +++ b/common/eth2/src/lib.rs @@ -55,11 +55,13 @@ pub const JSON_CONTENT_TYPE_HEADER: &str = "application/json"; const HTTP_ATTESTATION_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_ATTESTER_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_ATTESTATION_SUBSCRIPTIONS_TIMEOUT_QUOTIENT: u32 = 24; +const HTTP_ATTESTATION_AGGREGATOR_TIMEOUT_QUOTIENT: u32 = 24; // For DVT involving middleware only const HTTP_LIVENESS_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_PROPOSAL_TIMEOUT_QUOTIENT: u32 = 2; const HTTP_PROPOSER_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_SYNC_COMMITTEE_CONTRIBUTION_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_SYNC_DUTIES_TIMEOUT_QUOTIENT: u32 = 4; +const HTTP_SYNC_AGGREGATOR_TIMEOUT_QUOTIENT: u32 = 24; // For DVT involving middleware only const HTTP_GET_BEACON_BLOCK_SSZ_TIMEOUT_QUOTIENT: u32 = 4; const HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT: u32 = 4; const HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT: u32 = 4; @@ -150,11 +152,13 @@ pub struct Timeouts { pub attestation: Duration, pub 
attester_duties: Duration, pub attestation_subscriptions: Duration, + pub attestation_aggregators: Duration, pub liveness: Duration, pub proposal: Duration, pub proposer_duties: Duration, pub sync_committee_contribution: Duration, pub sync_duties: Duration, + pub sync_aggregators: Duration, pub get_beacon_blocks_ssz: Duration, pub get_debug_beacon_states: Duration, pub get_deposit_snapshot: Duration, @@ -168,11 +172,13 @@ impl Timeouts { attestation: timeout, attester_duties: timeout, attestation_subscriptions: timeout, + attestation_aggregators: timeout, liveness: timeout, proposal: timeout, proposer_duties: timeout, sync_committee_contribution: timeout, sync_duties: timeout, + sync_aggregators: timeout, get_beacon_blocks_ssz: timeout, get_debug_beacon_states: timeout, get_deposit_snapshot: timeout, @@ -187,12 +193,14 @@ impl Timeouts { attester_duties: base_timeout / HTTP_ATTESTER_DUTIES_TIMEOUT_QUOTIENT, attestation_subscriptions: base_timeout / HTTP_ATTESTATION_SUBSCRIPTIONS_TIMEOUT_QUOTIENT, + attestation_aggregators: base_timeout / HTTP_ATTESTATION_AGGREGATOR_TIMEOUT_QUOTIENT, liveness: base_timeout / HTTP_LIVENESS_TIMEOUT_QUOTIENT, proposal: base_timeout / HTTP_PROPOSAL_TIMEOUT_QUOTIENT, proposer_duties: base_timeout / HTTP_PROPOSER_DUTIES_TIMEOUT_QUOTIENT, sync_committee_contribution: base_timeout / HTTP_SYNC_COMMITTEE_CONTRIBUTION_TIMEOUT_QUOTIENT, sync_duties: base_timeout / HTTP_SYNC_DUTIES_TIMEOUT_QUOTIENT, + sync_aggregators: base_timeout / HTTP_SYNC_AGGREGATOR_TIMEOUT_QUOTIENT, get_beacon_blocks_ssz: base_timeout / HTTP_GET_BEACON_BLOCK_SSZ_TIMEOUT_QUOTIENT, get_debug_beacon_states: base_timeout / HTTP_GET_DEBUG_BEACON_STATE_QUOTIENT, get_deposit_snapshot: base_timeout / HTTP_GET_DEPOSIT_SNAPSHOT_QUOTIENT, @@ -490,7 +498,7 @@ impl BeaconNodeHttpClient { .post(url) .timeout(timeout.unwrap_or(self.timeouts.default)); let response = builder.json(body).send().await?; - ok_or_error(response).await + success_or_error(response).await } /// Generic POST 
function supporting arbitrary responses and timeouts. @@ -510,7 +518,7 @@ impl BeaconNodeHttpClient { .json(body) .send() .await?; - ok_or_error(response).await + success_or_error(response).await } /// Generic POST function that includes octet-stream content type header. @@ -527,7 +535,7 @@ impl BeaconNodeHttpClient { HeaderValue::from_static("application/octet-stream"), ); let response = builder.headers(headers).json(body).send().await?; - ok_or_error(response).await + success_or_error(response).await } /// Generic POST function supporting arbitrary responses and timeouts. @@ -552,7 +560,7 @@ impl BeaconNodeHttpClient { HeaderValue::from_static("application/octet-stream"), ); let response = builder.headers(headers).body(body).send().await?; - ok_or_error(response).await + success_or_error(response).await } /// `GET beacon/genesis` @@ -1249,16 +1257,17 @@ impl BeaconNodeHttpClient { &self, block_contents: &PublishBlockRequest, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version( - self.post_beacon_blocks_v2_path(validation_level)?, - block_contents, - Some(self.timeouts.proposal), - block_contents.signed_block().message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version( + self.post_beacon_blocks_v2_path(validation_level)?, + block_contents, + Some(self.timeouts.proposal), + block_contents.signed_block().message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blocks` @@ -1266,16 +1275,17 @@ impl BeaconNodeHttpClient { &self, block_contents: &PublishBlockRequest, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version_and_ssz_body( - self.post_beacon_blocks_v2_path(validation_level)?, - block_contents.as_ssz_bytes(), - Some(self.timeouts.proposal), - block_contents.signed_block().message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + 
.post_generic_with_consensus_version_and_ssz_body( + self.post_beacon_blocks_v2_path(validation_level)?, + block_contents.as_ssz_bytes(), + Some(self.timeouts.proposal), + block_contents.signed_block().message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blinded_blocks` @@ -1283,16 +1293,17 @@ impl BeaconNodeHttpClient { &self, signed_block: &SignedBlindedBeaconBlock, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version( - self.post_beacon_blinded_blocks_v2_path(validation_level)?, - signed_block, - Some(self.timeouts.proposal), - signed_block.message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version( + self.post_beacon_blinded_blocks_v2_path(validation_level)?, + signed_block, + Some(self.timeouts.proposal), + signed_block.message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// `POST v2/beacon/blinded_blocks` @@ -1300,16 +1311,17 @@ impl BeaconNodeHttpClient { &self, signed_block: &SignedBlindedBeaconBlock, validation_level: Option, - ) -> Result<(), Error> { - self.post_generic_with_consensus_version_and_ssz_body( - self.post_beacon_blinded_blocks_v2_path(validation_level)?, - signed_block.as_ssz_bytes(), - Some(self.timeouts.proposal), - signed_block.message().body().fork_name(), - ) - .await?; + ) -> Result { + let response = self + .post_generic_with_consensus_version_and_ssz_body( + self.post_beacon_blinded_blocks_v2_path(validation_level)?, + signed_block.as_ssz_bytes(), + Some(self.timeouts.proposal), + signed_block.message().body().fork_name(), + ) + .await?; - Ok(()) + Ok(response) } /// Path for `v2/beacon/blocks` @@ -1324,7 +1336,7 @@ impl BeaconNodeHttpClient { } /// Path for `v1/beacon/blob_sidecars/{block_id}` - pub fn get_blobs_path(&self, block_id: BlockId) -> Result { + pub fn get_blob_sidecars_path(&self, block_id: BlockId) -> Result { let mut path = self.eth_path(V1)?; 
path.path_segments_mut() .map_err(|()| Error::InvalidUrl(self.server.clone()))? @@ -1334,6 +1346,17 @@ impl BeaconNodeHttpClient { Ok(path) } + /// Path for `v1/beacon/blobs/{blob_id}` + pub fn get_blobs_path(&self, block_id: BlockId) -> Result { + let mut path = self.eth_path(V1)?; + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("beacon") + .push("blobs") + .push(&block_id.to_string()); + Ok(path) + } + /// Path for `v1/beacon/blinded_blocks/{block_id}` pub fn get_beacon_blinded_blocks_path(&self, block_id: BlockId) -> Result { let mut path = self.eth_path(V1)?; @@ -1362,13 +1385,13 @@ impl BeaconNodeHttpClient { /// `GET v1/beacon/blob_sidecars/{block_id}` /// /// Returns `Ok(None)` on a 404 error. - pub async fn get_blobs( + pub async fn get_blob_sidecars( &self, block_id: BlockId, indices: Option<&[u64]>, spec: &ChainSpec, ) -> Result>>, Error> { - let mut path = self.get_blobs_path(block_id)?; + let mut path = self.get_blob_sidecars_path(block_id)?; if let Some(indices) = indices { let indices_string = indices .iter() @@ -1388,6 +1411,31 @@ impl BeaconNodeHttpClient { .map(|opt| opt.map(BeaconResponse::ForkVersioned)) } + /// `GET v1/beacon/blobs/{block_id}` + /// + /// Returns `Ok(None)` on a 404 error. + pub async fn get_blobs( + &self, + block_id: BlockId, + versioned_hashes: Option<&[Hash256]>, + ) -> Result>>>, Error> + { + let mut path = self.get_blobs_path(block_id)?; + if let Some(hashes) = versioned_hashes { + let hashes_string = hashes + .iter() + .map(|hash| hash.to_string()) + .collect::>() + .join(","); + path.query_pairs_mut() + .append_pair("versioned_hashes", &hashes_string); + } + + self.get_opt(path) + .await + .map(|opt| opt.map(BeaconResponse::Unversioned)) + } + /// `GET v1/beacon/blinded_blocks/{block_id}` /// /// Returns `Ok(None)` on a 404 error. 
@@ -2841,6 +2889,42 @@ impl BeaconNodeHttpClient { ) .await } + + /// `POST validator/beacon_committee_selections` + pub async fn post_validator_beacon_committee_selections( + &self, + selections: &[BeaconCommitteeSelection], + ) -> Result>, Error> { + let mut path = self.eth_path(V1)?; + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("validator") + .push("beacon_committee_selections"); + + self.post_with_timeout_and_response( + path, + &selections, + self.timeouts.attestation_aggregators, + ) + .await + } + + /// `POST validator/sync_committee_selections` + pub async fn post_validator_sync_committee_selections( + &self, + selections: &[SyncCommitteeSelection], + ) -> Result>, Error> { + let mut path = self.eth_path(V1)?; + + path.path_segments_mut() + .map_err(|()| Error::InvalidUrl(self.server.clone()))? + .push("validator") + .push("sync_committee_selections"); + + self.post_with_timeout_and_response(path, &selections, self.timeouts.sync_aggregators) + .await + } } /// Returns `Ok(response)` if the response is a `200 OK` response. Otherwise, creates an @@ -2859,3 +2943,20 @@ pub async fn ok_or_error(response: Response) -> Result { Err(Error::StatusCode(status)) } } + +/// Returns `Ok(response)` if the response is a success (2xx) response. Otherwise, creates an +/// appropriate error message. 
+pub async fn success_or_error(response: Response) -> Result { + let status = response.status(); + + if status.is_success() { + Ok(response) + } else if let Ok(message) = response.json().await { + match message { + ResponseError::Message(message) => Err(Error::ServerMessage(message)), + ResponseError::Indexed(indexed) => Err(Error::ServerIndexedMessage(indexed)), + } + } else { + Err(Error::StatusCode(status)) + } +} diff --git a/common/eth2/src/types.rs b/common/eth2/src/types.rs index 169551e35b..8f553b57d9 100644 --- a/common/eth2/src/types.rs +++ b/common/eth2/src/types.rs @@ -716,6 +716,13 @@ pub struct BlobIndicesQuery { pub indices: Option>, } +#[derive(Clone, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct BlobsVersionedHashesQuery { + #[serde(default, deserialize_with = "option_query_vec")] + pub versioned_hashes: Option>, +} + #[derive(Clone, Deserialize)] #[serde(deny_unknown_fields)] pub struct DataColumnIndicesQuery { @@ -967,6 +974,23 @@ pub struct PeerCount { pub disconnecting: u64, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct BeaconCommitteeSelection { + #[serde(with = "serde_utils::quoted_u64")] + pub validator_index: u64, + pub slot: Slot, + pub selection_proof: Signature, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SyncCommitteeSelection { + #[serde(with = "serde_utils::quoted_u64")] + pub validator_index: u64, + pub slot: Slot, + #[serde(with = "serde_utils::quoted_u64")] + pub subcommittee_index: u64, + pub selection_proof: Signature, +} // --------- Server Sent Event Types ----------- #[derive(PartialEq, Debug, Serialize, Deserialize, Clone)] @@ -2300,6 +2324,14 @@ pub struct StandardAttestationRewards { pub total_rewards: Vec, } +#[derive(Debug, Clone, Serialize, Deserialize, Encode, Decode)] +#[serde(bound = "E: EthSpec")] +#[serde(transparent)] +pub struct BlobWrapper { + #[serde(with = "ssz_types::serde_utils::hex_fixed_vec")] + pub blob: Blob, +} + #[cfg(test)] mod 
test { use std::fmt::Debug; diff --git a/common/eth2_network_config/built_in_network_configs/holesky/config.yaml b/common/eth2_network_config/built_in_network_configs/holesky/config.yaml index ab5f0f3bde..b1e9faea1d 100644 --- a/common/eth2_network_config/built_in_network_configs/holesky/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/holesky/config.yaml @@ -38,7 +38,7 @@ ELECTRA_FORK_VERSION: 0x06017000 ELECTRA_FORK_EPOCH: 115968 # Fulu FULU_FORK_VERSION: 0x07017000 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 165120 # Gloas GLOAS_FORK_VERSION: 0x08017000 GLOAS_FORK_EPOCH: 18446744073709551615 @@ -47,6 +47,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 1200 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 14 # 2**8 (= 256) epochs ~27 hours @@ -55,6 +57,18 @@ MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS +SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle # --------------------------------------------------------------- @@ -141,13 +155,30 @@ MAX_BLOBS_PER_BLOCK_ELECTRA: 9 MAX_REQUEST_BLOB_SIDECARS_ELECTRA: 1152 # Fulu +# 2**7 (= 128) groups NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars 
VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 166400 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 167936 + MAX_BLOBS_PER_BLOCK: 21 + # Gloas \ No newline at end of file diff --git a/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml b/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml index 01322974c8..256957e119 100644 --- a/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/hoodi/config.yaml @@ -42,7 +42,7 @@ ELECTRA_FORK_EPOCH: 2048 # Fulu FULU_FORK_VERSION: 0x70000910 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 50688 # Gloas GLOAS_FORK_VERSION: 0x80000910 @@ -53,6 +53,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 12000 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 12 # 2**8 (= 256) epochs ~27 hours @@ -61,6 +63,18 @@ MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS +SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle # --------------------------------------------------------------- @@ -154,15 +168,33 @@ WHISK_EPOCHS_PER_SHUFFLING_PHASE: 256 WHISK_PROPOSER_SELECTION_GAP: 2 # Fulu +# 2**7 (= 128) groups 
NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 + +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 52480 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 54016 + MAX_BLOBS_PER_BLOCK: 21 + # Gloas # EIP7732 diff --git a/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml b/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml index 9802e409fb..b1a01933d7 100644 --- a/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml +++ b/common/eth2_network_config/built_in_network_configs/sepolia/config.yaml @@ -42,7 +42,7 @@ ELECTRA_FORK_EPOCH: 222464 # Fulu FULU_FORK_VERSION: 0x90000075 -FULU_FORK_EPOCH: 18446744073709551615 +FULU_FORK_EPOCH: 272640 # Gloas GLOAS_FORK_VERSION: 0x90000076 @@ -52,6 +52,8 @@ GLOAS_FORK_EPOCH: 18446744073709551615 # --------------------------------------------------------------- # 12 seconds SECONDS_PER_SLOT: 12 +# 12000 milliseconds +SLOT_DURATION_MS: 12000 # 14 (estimate from Eth1 mainnet) SECONDS_PER_ETH1_BLOCK: 14 # 2**8 (= 256) epochs ~27 hours @@ -60,6 +62,18 @@ MIN_VALIDATOR_WITHDRAWABILITY_DELAY: 256 SHARD_COMMITTEE_PERIOD: 256 # 2**11 (= 2,048) Eth1 blocks ~8 hours ETH1_FOLLOW_DISTANCE: 2048 +# 1667 basis points, ~17% of SLOT_DURATION_MS +PROPOSER_REORG_CUTOFF_BPS: 1667 +# 3333 basis points, ~33% of SLOT_DURATION_MS +ATTESTATION_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +AGGREGATE_DUE_BPS: 6667 + +# Altair +# 3333 basis points, ~33% of SLOT_DURATION_MS 
+SYNC_MESSAGE_DUE_BPS: 3333 +# 6667 basis points, ~67% of SLOT_DURATION_MS +CONTRIBUTION_DUE_BPS: 6667 # Validator cycle @@ -147,13 +161,31 @@ MAX_BLOBS_PER_BLOCK_ELECTRA: 9 MAX_REQUEST_BLOB_SIDECARS_ELECTRA: 1152 # Fulu +# 2**7 (= 128) groups NUMBER_OF_CUSTODY_GROUPS: 128 +# 2**7 (= 128) subnets DATA_COLUMN_SIDECAR_SUBNET_COUNT: 128 +# MAX_REQUEST_BLOCKS_DENEB * NUMBER_OF_COLUMNS (= 128 * 128) sidecars MAX_REQUEST_DATA_COLUMN_SIDECARS: 16384 +# 2**3 (= 8) samples SAMPLES_PER_SLOT: 8 +# 2**2 (= 4) sidecars CUSTODY_REQUIREMENT: 4 +# 2**3 (= 8) sidecars VALIDATOR_CUSTODY_REQUIREMENT: 8 +# 2**5 * 10**9 (= 32,000,000,000) Gwei BALANCE_PER_ADDITIONAL_CUSTODY_GROUP: 32000000000 +# 2**12 (= 4,096) epochs MIN_EPOCHS_FOR_DATA_COLUMN_SIDECARS_REQUESTS: 4096 + +# Blob Scheduling +# --------------------------------------------------------------- + +BLOB_SCHEDULE: + - EPOCH: 274176 + MAX_BLOBS_PER_BLOCK: 15 + - EPOCH: 275712 + MAX_BLOBS_PER_BLOCK: 21 + # Gloas \ No newline at end of file diff --git a/common/lighthouse_version/Cargo.toml b/common/lighthouse_version/Cargo.toml index cb4a43e407..b7e669ed94 100644 --- a/common/lighthouse_version/Cargo.toml +++ b/common/lighthouse_version/Cargo.toml @@ -7,7 +7,6 @@ edition = { workspace = true } [dependencies] git-version = "0.3.4" -target_info = "0.1.0" [dev-dependencies] regex = { workspace = true } diff --git a/common/lighthouse_version/src/lib.rs b/common/lighthouse_version/src/lib.rs index 238efd591a..a3f0ca404f 100644 --- a/common/lighthouse_version/src/lib.rs +++ b/common/lighthouse_version/src/lib.rs @@ -1,5 +1,5 @@ use git_version::git_version; -use target_info::Target; +use std::env::consts; /// Returns the current version of this build of Lighthouse. 
/// @@ -17,8 +17,8 @@ pub const VERSION: &str = git_version!( // NOTE: using --match instead of --exclude for compatibility with old Git "--match=thiswillnevermatchlol" ], - prefix = "Lighthouse/v7.1.0-", - fallback = "Lighthouse/v7.1.0" + prefix = "Lighthouse/v8.0.0-rc.1-", + fallback = "Lighthouse/v8.0.0-rc.1" ); /// Returns the first eight characters of the latest commit hash for this build. @@ -45,7 +45,7 @@ pub const COMMIT_PREFIX: &str = git_version!( /// /// `Lighthouse/v1.5.1-67da032+/x86_64-linux` pub fn version_with_platform() -> String { - format!("{}/{}-{}", VERSION, Target::arch(), Target::os()) + format!("{}/{}-{}", VERSION, consts::ARCH, consts::OS) } /// Returns semantic versioning information only. @@ -54,7 +54,7 @@ pub fn version_with_platform() -> String { /// /// `1.5.1` pub fn version() -> &'static str { - "7.1.0" + "8.0.0-rc.1" } /// Returns the name of the current client running. diff --git a/common/logging/src/lib.rs b/common/logging/src/lib.rs index 6722381dba..8ef3436b06 100644 --- a/common/logging/src/lib.rs +++ b/common/logging/src/lib.rs @@ -1,5 +1,3 @@ -use metrics::{IntCounter, Result as MetricsResult, try_create_int_counter}; -use std::sync::LazyLock; use std::time::{Duration, Instant}; use tracing_subscriber::EnvFilter; @@ -23,15 +21,6 @@ pub use utils::build_workspace_filter; /// The minimum interval between log messages indicating that a queue is full. 
const LOG_DEBOUNCE_INTERVAL: Duration = Duration::from_secs(30); -pub static INFOS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("info_total", "Count of infos logged")); -pub static WARNS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("warn_total", "Count of warns logged")); -pub static ERRORS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("error_total", "Count of errors logged")); -pub static CRITS_TOTAL: LazyLock> = - LazyLock::new(|| try_create_int_counter("crit_total", "Count of crits logged")); - /// Provides de-bounce functionality for logging. #[derive(Default)] pub struct TimeLatch(Option); diff --git a/common/logging/src/tracing_logging_layer.rs b/common/logging/src/tracing_logging_layer.rs index 27841cb7d8..e631d272b7 100644 --- a/common/logging/src/tracing_logging_layer.rs +++ b/common/logging/src/tracing_logging_layer.rs @@ -1,4 +1,5 @@ use crate::utils::is_ascii_control; +use std::collections::HashSet; use chrono::prelude::*; use serde_json::{Map, Value}; @@ -80,12 +81,11 @@ where event.record(&mut visitor); let mut span_data = Vec::new(); - if let Some(scope) = ctx.event_scope(event) { - for span in scope.from_root() { - if let Some(data) = span.extensions().get::() { - span_data.extend(data.fields.clone()); - } - } + if let Some(mut scope) = ctx.event_scope(event) + && let Some(span) = scope.next() + && let Some(data) = span.extensions().get::() + { + span_data.extend(data.fields.clone()); } // Remove ascii control codes from message. @@ -262,6 +262,12 @@ fn build_log_json( let module_field = format!("{}:{}", module_path, line_number); log_map.insert("module".to_string(), Value::String(module_field)); + // Avoid adding duplicate fields; prefer event fields when duplicates exist. 
+ for (key, val) in span_fields { + let parsed_span_val = parse_field(val); + log_map.insert(key.clone(), parsed_span_val); + } + for (key, val) in visitor.fields.clone().into_iter() { let cleaned_value = if val.starts_with('\"') && val.ends_with('\"') && val.len() >= 2 { &val[1..val.len() - 1] @@ -273,11 +279,6 @@ fn build_log_json( log_map.insert(key, parsed_val); } - for (key, val) in span_fields { - let parsed_span_val = parse_field(val); - log_map.insert(key.clone(), parsed_span_val); - } - let json_obj = Value::Object(log_map); let output = format!("{}\n", json_obj); @@ -300,23 +301,6 @@ fn build_log_text( let bold_start = "\x1b[1m"; let bold_end = "\x1b[0m"; - let mut formatted_spans = String::new(); - for (i, (field_name, field_value)) in span_fields.iter().rev().enumerate() { - if use_color { - formatted_spans.push_str(&format!( - "{}{}{}: {}", - bold_start, field_name, bold_end, field_value - )); - } else { - formatted_spans.push_str(&format!("{}: {}", field_name, field_value)); - } - - // Check if this is not the last span. - if i != span_fields.len() - 1 { - formatted_spans.push_str(", "); - } - } - let pad = if plain_level_str.len() < ALIGNED_LEVEL_WIDTH { " " } else { @@ -352,24 +336,26 @@ fn build_log_text( message_content.clone() }; - let mut formatted_fields = String::new(); - for (i, (field_name, field_value)) in visitor.fields.iter().enumerate() { - if i > 0 { - formatted_fields.push_str(", "); - } - if use_color { - formatted_fields.push_str(&format!( - "{}{}{}: {}", - bold_start, field_name, bold_end, field_value - )); - } else { - formatted_fields.push_str(&format!("{}: {}", field_name, field_value)); - } - // Check if this is the last field and that we are also adding spans. - if i == visitor.fields.len() - 1 && !span_fields.is_empty() { - formatted_fields.push(','); - } - } + // Avoid adding duplicate fields; prefer event fields when duplicates exist. 
+ let mut added_field_names = HashSet::new(); + let formatted_fields = visitor + .fields + .iter() + .chain(span_fields.iter()) + .filter_map(|(field_name, field_value)| { + if added_field_names.insert(field_name) { + let formatted_field = if use_color { + format!("{}{}{}: {}", bold_start, field_name, bold_end, field_value) + } else { + format!("{}: {}", field_name, field_value) + }; + Some(formatted_field) + } else { + None + } + }) + .collect::>() + .join(", "); let full_message = if !formatted_fields.is_empty() { format!("{} {}", padded_message, formatted_fields) @@ -379,14 +365,11 @@ fn build_log_text( let message = if !location.is_empty() { format!( - "{} {} {} {} {}\n", - timestamp, level_str, location, full_message, formatted_spans + "{} {} {} {}\n", + timestamp, level_str, location, full_message ) } else { - format!( - "{} {} {} {}\n", - timestamp, level_str, full_message, formatted_spans - ) + format!("{} {} {}\n", timestamp, level_str, full_message) }; if let Err(e) = writer.write_all(message.as_bytes()) { @@ -437,7 +420,7 @@ mod tests { fn test_build_log_text_single_log_field() { let log_fields = vec![("field_name".to_string(), "field_value".to_string())]; let span_fields = vec![]; - let expected = "Jan 1 08:00:00.000 INFO test message field_name: field_value \n"; + let expected = "Jan 1 08:00:00.000 INFO test message field_name: field_value\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -448,7 +431,7 @@ mod tests { ("field_name2".to_string(), "field_value2".to_string()), ]; let span_fields = vec![]; - let expected = "Jan 1 08:00:00.000 INFO test message field_name1: field_value1, field_name2: field_value2 \n"; + let expected = "Jan 1 08:00:00.000 INFO test message field_name1: field_value1, field_name2: field_value2\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -470,7 +453,7 @@ mod tests { "span_field_name".to_string(), "span_field_value".to_string(), )]; - let expected = "Jan 1 08:00:00.000 INFO test message 
span_field_name: span_field_value\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name: span_field_value\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -487,7 +470,7 @@ mod tests { "span_field_value2".to_string(), ), ]; - let expected = "Jan 1 08:00:00.000 INFO test message span_field_name2: span_field_value2, span_field_name1: span_field_value1\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1: span_field_value1, span_field_name2: span_field_value2\n"; test_build_log_text(log_fields, span_fields, expected); } @@ -504,7 +487,35 @@ mod tests { "span_field_value1-2".to_string(), ), ]; - let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1-2: span_field_value1-2, span_field_name1-1: span_field_value1-1\n"; + let expected = "Jan 1 08:00:00.000 INFO test message span_field_name1-1: span_field_value1-1, span_field_name1-2: span_field_value1-2\n"; + test_build_log_text(log_fields, span_fields, expected); + } + + #[test] + fn test_build_log_text_no_duplicate_log_span_fields() { + let log_fields = vec![ + ("field_name_1".to_string(), "field_value_1".to_string()), + ("field_name_2".to_string(), "field_value_2".to_string()), + ]; + let span_fields = vec![ + ("field_name_1".to_string(), "field_value_1".to_string()), + ("field_name_3".to_string(), "field_value_3".to_string()), + ]; + let expected = "Jan 1 08:00:00.000 INFO test message field_name_1: field_value_1, field_name_2: field_value_2, field_name_3: field_value_3\n"; + test_build_log_text(log_fields, span_fields, expected); + } + + #[test] + fn test_build_log_text_duplicate_fields_prefer_log_fields() { + let log_fields = vec![ + ("field_name_1".to_string(), "field_value_1_log".to_string()), + ("field_name_2".to_string(), "field_value_2".to_string()), + ]; + let span_fields = vec![ + ("field_name_1".to_string(), "field_value_1_span".to_string()), + ("field_name_3".to_string(), "field_value_3".to_string()), + ]; + let expected = "Jan 1 
08:00:00.000 INFO test message field_name_1: field_value_1_log, field_name_2: field_value_2, field_name_3: field_value_3\n"; test_build_log_text(log_fields, span_fields, expected); } diff --git a/common/network_utils/Cargo.toml b/common/network_utils/Cargo.toml new file mode 100644 index 0000000000..5206249e6f --- /dev/null +++ b/common/network_utils/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "network_utils" +version = "0.1.0" +edition = { workspace = true } + +[dependencies] +discv5 = { workspace = true } +libp2p-identity = "0.2" +lru_cache = { workspace = true } +metrics = { workspace = true } +multiaddr = "0.18.2" +parking_lot = { workspace = true } +serde = { workspace = true } +tiny-keccak = { version = "2", features = ["keccak"] } + +[dev-dependencies] +hex = { workspace = true } diff --git a/common/network_utils/src/discovery_metrics.rs b/common/network_utils/src/discovery_metrics.rs new file mode 100644 index 0000000000..26a9e8a45f --- /dev/null +++ b/common/network_utils/src/discovery_metrics.rs @@ -0,0 +1,45 @@ +use metrics::*; +use std::sync::LazyLock; + +pub static NAT_OPEN: LazyLock> = LazyLock::new(|| { + try_create_int_gauge_vec( + "nat_open", + "An estimate indicating if the local node is reachable from external nodes", + &["protocol"], + ) +}); +pub static DISCOVERY_BYTES: LazyLock> = LazyLock::new(|| { + try_create_int_gauge_vec( + "discovery_bytes", + "The number of bytes sent and received in discovery", + &["direction"], + ) +}); +pub static DISCOVERY_QUEUE: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "discovery_queue_size", + "The number of discovery queries awaiting execution", + ) +}); +pub static DISCOVERY_REQS: LazyLock> = LazyLock::new(|| { + try_create_float_gauge( + "discovery_requests", + "The number of unsolicited discovery requests per second", + ) +}); +pub static DISCOVERY_SESSIONS: LazyLock> = LazyLock::new(|| { + try_create_int_gauge( + "discovery_sessions", + "The number of active discovery sessions with peers", 
+ ) +}); + +pub fn scrape_discovery_metrics() { + let metrics = discv5::metrics::Metrics::from(discv5::Discv5::raw_metrics()); + set_float_gauge(&DISCOVERY_REQS, metrics.unsolicited_requests_per_second); + set_gauge(&DISCOVERY_SESSIONS, metrics.active_sessions as i64); + set_gauge_vec(&DISCOVERY_BYTES, &["inbound"], metrics.bytes_recv as i64); + set_gauge_vec(&DISCOVERY_BYTES, &["outbound"], metrics.bytes_sent as i64); + set_gauge_vec(&NAT_OPEN, &["discv5_ipv4"], metrics.ipv4_contactable as i64); + set_gauge_vec(&NAT_OPEN, &["discv5_ipv6"], metrics.ipv6_contactable as i64); +} diff --git a/beacon_node/lighthouse_network/src/discovery/enr_ext.rs b/common/network_utils/src/enr_ext.rs similarity index 98% rename from beacon_node/lighthouse_network/src/discovery/enr_ext.rs rename to common/network_utils/src/enr_ext.rs index 1d065ebf4a..627dd15559 100644 --- a/beacon_node/lighthouse_network/src/discovery/enr_ext.rs +++ b/common/network_utils/src/enr_ext.rs @@ -1,11 +1,12 @@ //! ENR extension trait to support libp2p integration. 
-use crate::{Enr, Multiaddr, PeerId}; use discv5::enr::{CombinedKey, CombinedPublicKey}; -use libp2p::core::multiaddr::Protocol; -use libp2p::identity::{KeyType, Keypair, PublicKey, ed25519, secp256k1}; +use libp2p_identity::{KeyType, Keypair, PublicKey, ed25519, secp256k1}; +use multiaddr::{Multiaddr, PeerId, Protocol}; use tiny_keccak::{Hasher, Keccak}; +type Enr = discv5::enr::Enr; + pub const QUIC_ENR_KEY: &str = "quic"; pub const QUIC6_ENR_KEY: &str = "quic6"; diff --git a/common/network_utils/src/lib.rs b/common/network_utils/src/lib.rs new file mode 100644 index 0000000000..c3d6ee1e0c --- /dev/null +++ b/common/network_utils/src/lib.rs @@ -0,0 +1,4 @@ +pub mod discovery_metrics; +pub mod enr_ext; +pub mod listen_addr; +pub mod unused_port; diff --git a/beacon_node/lighthouse_network/src/listen_addr.rs b/common/network_utils/src/listen_addr.rs similarity index 86% rename from beacon_node/lighthouse_network/src/listen_addr.rs rename to common/network_utils/src/listen_addr.rs index 85232c0b35..bdd94b3414 100644 --- a/beacon_node/lighthouse_network/src/listen_addr.rs +++ b/common/network_utils/src/listen_addr.rs @@ -1,6 +1,6 @@ use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr}; -use libp2p::{Multiaddr, multiaddr::Protocol}; +use multiaddr::{Multiaddr, Protocol}; use serde::{Deserialize, Serialize}; /// A listening address composed by an Ip, an UDP port and a TCP port. 
@@ -84,23 +84,21 @@ impl ListenAddress { .chain(v6_tcp_multiaddr) } - #[cfg(test)] pub fn unused_v4_ports() -> Self { ListenAddress::V4(ListenAddr { addr: Ipv4Addr::UNSPECIFIED, - disc_port: unused_port::unused_udp4_port().unwrap(), - quic_port: unused_port::unused_udp4_port().unwrap(), - tcp_port: unused_port::unused_tcp4_port().unwrap(), + disc_port: crate::unused_port::unused_udp4_port().unwrap(), + quic_port: crate::unused_port::unused_udp4_port().unwrap(), + tcp_port: crate::unused_port::unused_tcp4_port().unwrap(), }) } - #[cfg(test)] pub fn unused_v6_ports() -> Self { ListenAddress::V6(ListenAddr { addr: Ipv6Addr::UNSPECIFIED, - disc_port: unused_port::unused_udp6_port().unwrap(), - quic_port: unused_port::unused_udp6_port().unwrap(), - tcp_port: unused_port::unused_tcp6_port().unwrap(), + disc_port: crate::unused_port::unused_udp6_port().unwrap(), + quic_port: crate::unused_port::unused_udp6_port().unwrap(), + tcp_port: crate::unused_port::unused_tcp6_port().unwrap(), }) } } diff --git a/common/unused_port/src/lib.rs b/common/network_utils/src/unused_port.rs similarity index 100% rename from common/unused_port/src/lib.rs rename to common/network_utils/src/unused_port.rs diff --git a/common/system_health/Cargo.toml b/common/system_health/Cargo.toml index 034683f72e..2cafc42d6e 100644 --- a/common/system_health/Cargo.toml +++ b/common/system_health/Cargo.toml @@ -5,6 +5,8 @@ edition = { workspace = true } [dependencies] lighthouse_network = { workspace = true } +metrics = { workspace = true } +network_utils = { workspace = true } parking_lot = { workspace = true } serde = { workspace = true } sysinfo = { workspace = true } diff --git a/common/system_health/src/lib.rs b/common/system_health/src/lib.rs index 31b222c540..b61bdec486 100644 --- a/common/system_health/src/lib.rs +++ b/common/system_health/src/lib.rs @@ -1,4 +1,5 @@ use lighthouse_network::{NetworkGlobals, types::SyncState}; +use network_utils::discovery_metrics; use parking_lot::RwLock; use 
serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; @@ -219,33 +220,21 @@ impl NatState { /// Observes if NAT traversal is possible. pub fn observe_nat() -> NatState { - let discv5_ipv4 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["discv5_ipv4"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let discv5_ipv4 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["discv5_ipv4"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let discv5_ipv6 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["discv5_ipv6"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let discv5_ipv6 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["discv5_ipv6"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let libp2p_ipv4 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["libp2p_ipv4"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let libp2p_ipv4 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["libp2p_ipv4"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); - let libp2p_ipv6 = lighthouse_network::metrics::get_int_gauge( - &lighthouse_network::metrics::NAT_OPEN, - &["libp2p_ipv6"], - ) - .map(|g| g.get() == 1) - .unwrap_or_default(); + let libp2p_ipv6 = metrics::get_int_gauge(&discovery_metrics::NAT_OPEN, &["libp2p_ipv6"]) + .map(|g| g.get() == 1) + .unwrap_or_default(); NatState { discv5_ipv4, diff --git a/common/task_executor/Cargo.toml b/common/task_executor/Cargo.toml index d4faf1e4b8..92a4fc4b59 100644 --- a/common/task_executor/Cargo.toml +++ b/common/task_executor/Cargo.toml @@ -8,6 +8,8 @@ edition = { workspace = true } async-channel = { workspace = true } futures = { workspace = true } metrics = { workspace = true } +num_cpus = { workspace = true } +rayon = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } tracing = { workspace = true } diff --git 
a/common/task_executor/src/lib.rs b/common/task_executor/src/lib.rs index 5f0c822b03..0b8e9f8eba 100644 --- a/common/task_executor/src/lib.rs +++ b/common/task_executor/src/lib.rs @@ -1,12 +1,15 @@ mod metrics; +mod rayon_pool_provider; pub mod test_utils; use futures::channel::mpsc::Sender; use futures::prelude::*; -use std::sync::Weak; +use std::sync::{Arc, Weak}; use tokio::runtime::{Handle, Runtime}; use tracing::debug; +use crate::rayon_pool_provider::RayonPoolProvider; +pub use crate::rayon_pool_provider::RayonPoolType; pub use tokio::task::JoinHandle; /// Provides a reason when Lighthouse is shut down. @@ -84,6 +87,8 @@ pub struct TaskExecutor { // FIXME(sproul): delete? #[allow(dead_code)] service_name: String, + + rayon_pool_provider: Arc, } impl TaskExecutor { @@ -105,6 +110,7 @@ impl TaskExecutor { exit, signal_tx, service_name, + rayon_pool_provider: Arc::new(RayonPoolProvider::default()), } } @@ -115,6 +121,7 @@ impl TaskExecutor { exit: self.exit.clone(), signal_tx: self.signal_tx.clone(), service_name, + rayon_pool_provider: self.rayon_pool_provider.clone(), } } @@ -226,6 +233,47 @@ impl TaskExecutor { } } + /// Spawns a blocking task on a dedicated tokio thread pool and installs a rayon context within it. + pub fn spawn_blocking_with_rayon( + self, + task: F, + rayon_pool_type: RayonPoolType, + name: &'static str, + ) where + F: FnOnce() + Send + 'static, + { + let thread_pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type); + self.spawn_blocking( + move || { + thread_pool.install(|| { + task(); + }); + }, + name, + ) + } + + /// Spawns a blocking computation on a rayon thread pool and awaits the result. 
+ pub async fn spawn_blocking_with_rayon_async( + &self, + rayon_pool_type: RayonPoolType, + task: F, + ) -> Result + where + F: FnOnce() -> R + Send + 'static, + R: Send + 'static, + { + let thread_pool = self.rayon_pool_provider.get_thread_pool(rayon_pool_type); + let (tx, rx) = tokio::sync::oneshot::channel(); + + thread_pool.spawn(move || { + let result = task(); + let _ = tx.send(result); + }); + + rx.await + } + /// Spawn a future on the tokio runtime wrapped in an `async-channel::Receiver` returning an optional /// join handle to the future. /// The task is cancelled when the corresponding async-channel is dropped. diff --git a/common/task_executor/src/rayon_pool_provider.rs b/common/task_executor/src/rayon_pool_provider.rs new file mode 100644 index 0000000000..8e12f7eaa4 --- /dev/null +++ b/common/task_executor/src/rayon_pool_provider.rs @@ -0,0 +1,58 @@ +use rayon::{ThreadPool, ThreadPoolBuilder}; +use std::sync::Arc; + +const DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE: usize = 25; +const DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE: usize = 80; +const MINIMUM_THREAD_COUNT: usize = 1; + +pub enum RayonPoolType { + HighPriority, + LowPriority, +} + +pub struct RayonPoolProvider { + /// Smaller rayon thread pool for lower-priority, compute-intensive tasks. + /// By default ~25% of CPUs or a minimum of 1 thread. + low_priority_thread_pool: Arc, + /// Larger rayon thread pool for high-priority, compute-intensive tasks. + /// By default ~80% of CPUs or a minimum of 1 thread. Critical/highest + /// priority tasks should use the global pool instead. 
+ high_priority_thread_pool: Arc, +} + +impl Default for RayonPoolProvider { + fn default() -> Self { + let low_prio_threads = + (num_cpus::get() * DEFAULT_LOW_PRIORITY_CPU_PERCENTAGE / 100).max(MINIMUM_THREAD_COUNT); + let low_priority_thread_pool = Arc::new( + ThreadPoolBuilder::new() + .num_threads(low_prio_threads) + .build() + .expect("failed to build low-priority rayon pool"), + ); + + let high_prio_threads = (num_cpus::get() * DEFAULT_HIGH_PRIORITY_CPU_PERCENTAGE / 100) + .max(MINIMUM_THREAD_COUNT); + let high_priority_thread_pool = Arc::new( + ThreadPoolBuilder::new() + .num_threads(high_prio_threads) + .build() + .expect("failed to build high-priority rayon pool"), + ); + Self { + low_priority_thread_pool, + high_priority_thread_pool, + } + } +} + +impl RayonPoolProvider { + /// Get a scoped thread pool by priority level. + /// For critical/highest priority tasks, use the global pool instead. + pub fn get_thread_pool(&self, rayon_pool_type: RayonPoolType) -> Arc { + match rayon_pool_type { + RayonPoolType::HighPriority => self.high_priority_thread_pool.clone(), + RayonPoolType::LowPriority => self.low_priority_thread_pool.clone(), + } + } +} diff --git a/common/unused_port/Cargo.toml b/common/unused_port/Cargo.toml deleted file mode 100644 index 2d771cd600..0000000000 --- a/common/unused_port/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "unused_port" -version = "0.1.0" -edition = { workspace = true } -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -lru_cache = { workspace = true } -parking_lot = { workspace = true } diff --git a/consensus/fork_choice/src/fork_choice.rs b/consensus/fork_choice/src/fork_choice.rs index 19f294d439..fe1f5fba9e 100644 --- a/consensus/fork_choice/src/fork_choice.rs +++ b/consensus/fork_choice/src/fork_choice.rs @@ -523,6 +523,7 @@ where /// /// You *must* call `get_head` for the proposal slot prior to calling this function and pass /// in the 
result of `get_head` as `canonical_head`. + #[instrument(level = "debug", skip_all)] pub fn get_proposer_head( &self, current_slot: Slot, diff --git a/consensus/proto_array/src/proto_array_fork_choice.rs b/consensus/proto_array/src/proto_array_fork_choice.rs index 4b31dc60bd..dea853d245 100644 --- a/consensus/proto_array/src/proto_array_fork_choice.rs +++ b/consensus/proto_array/src/proto_array_fork_choice.rs @@ -160,6 +160,56 @@ pub struct Block { pub unrealized_finalized_checkpoint: Option, } +impl Block { + /// Compute the proposer shuffling decision root of a child block in `child_block_epoch`. + /// + /// This function assumes that `child_block_epoch >= self.epoch`. It is the responsibility of + /// the caller to check this condition, or else incorrect results will be produced. + pub fn proposer_shuffling_root_for_child_block( + &self, + child_block_epoch: Epoch, + spec: &ChainSpec, + ) -> Hash256 { + let block_epoch = self.current_epoch_shuffling_id.shuffling_epoch; + + // For child blocks in the Fulu fork epoch itself, we want to use the old logic. There is no + // lookahead in the first Fulu epoch. So we check whether Fulu is enabled at + // `child_block_epoch - 1`, i.e. whether `child_block_epoch > fulu_fork_epoch`. + if !spec + .fork_name_at_epoch(child_block_epoch.saturating_sub(1_u64)) + .fulu_enabled() + { + // Prior to Fulu the proposer shuffling decision root for the current epoch is the same + // as the attestation shuffling for the *next* epoch, i.e. it is determined at the start + // of the current epoch. + if block_epoch == child_block_epoch { + self.next_epoch_shuffling_id.shuffling_decision_block + } else { + // Otherwise, the child block epoch is greater, so its decision root is its parent + // root itself (this block's root). 
+ self.root + } + } else { + // After Fulu the proposer shuffling is determined with lookahead, so if the block + // lies in the same epoch as its parent, its decision root is the same as the + // parent's current epoch attester shuffling + // + // i.e. the block from the end of epoch N - 2. + if child_block_epoch == block_epoch { + self.current_epoch_shuffling_id.shuffling_decision_block + } else if child_block_epoch == block_epoch + 1 { + // If the block is the next epoch, then it instead shares its decision root with + // the parent's *next epoch* attester shuffling. + self.next_epoch_shuffling_id.shuffling_decision_block + } else { + // The child block lies in the future beyond the lookahead, at the point where this + // block (its parent) will be the decision block. + self.root + } + } + } +} + /// A Vec-wrapper which will grow to match any request. /// /// E.g., a `get` or `insert` to an out-of-bounds element will cause the Vec to grow (using diff --git a/consensus/safe_arith/Cargo.toml b/consensus/safe_arith/Cargo.toml deleted file mode 100644 index 9ac9fe28d3..0000000000 --- a/consensus/safe_arith/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "safe_arith" -version = "0.1.0" -authors = ["Michael Sproul "] -edition = { workspace = true } -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] diff --git a/consensus/safe_arith/src/iter.rs b/consensus/safe_arith/src/iter.rs deleted file mode 100644 index d5ee51b588..0000000000 --- a/consensus/safe_arith/src/iter.rs +++ /dev/null @@ -1,70 +0,0 @@ -use crate::{Result, SafeArith}; - -/// Extension trait for iterators, providing a safe replacement for `sum`. 
-pub trait SafeArithIter { - fn safe_sum(self) -> Result; -} - -impl SafeArithIter for I -where - I: Iterator + Sized, - T: SafeArith, -{ - fn safe_sum(mut self) -> Result { - self.try_fold(T::ZERO, |acc, x| acc.safe_add(x)) - } -} - -#[cfg(test)] -mod test { - use super::*; - use crate::ArithError; - - #[test] - fn empty_sum() { - let v: Vec = vec![]; - assert_eq!(v.into_iter().safe_sum(), Ok(0)); - } - - #[test] - fn unsigned_sum_small() { - let arr = [400u64, 401, 402, 403, 404, 405, 406]; - assert_eq!( - arr.iter().copied().safe_sum().unwrap(), - arr.iter().copied().sum() - ); - } - - #[test] - fn unsigned_sum_overflow() { - let v = vec![u64::MAX, 1]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_small() { - let v = vec![-1i64, -2i64, -3i64, 3, 2, 1]; - assert_eq!(v.into_iter().safe_sum(), Ok(0)); - } - - #[test] - fn signed_sum_overflow_above() { - let v = vec![1, 2, 3, 4, i16::MAX, 0, 1, 2, 3]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_overflow_below() { - let v = vec![i16::MIN, -1]; - assert_eq!(v.into_iter().safe_sum(), Err(ArithError::Overflow)); - } - - #[test] - fn signed_sum_almost_overflow() { - let arr = [i64::MIN, 1, -1i64, i64::MAX, i64::MAX, 1]; - assert_eq!( - arr.iter().copied().safe_sum().unwrap(), - arr.iter().copied().sum() - ); - } -} diff --git a/consensus/safe_arith/src/lib.rs b/consensus/safe_arith/src/lib.rs deleted file mode 100644 index aa397c0603..0000000000 --- a/consensus/safe_arith/src/lib.rs +++ /dev/null @@ -1,166 +0,0 @@ -//! Library for safe arithmetic on integers, avoiding overflow and division by zero. -mod iter; - -pub use iter::SafeArithIter; - -/// Error representing the failure of an arithmetic operation. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum ArithError { - Overflow, - DivisionByZero, -} - -pub type Result = std::result::Result; - -macro_rules! 
assign_method { - ($name:ident, $op:ident, $doc_op:expr) => { - assign_method!($name, $op, Self, $doc_op); - }; - ($name:ident, $op:ident, $rhs_ty:ty, $doc_op:expr) => { - #[doc = "Safe variant of `"] - #[doc = $doc_op] - #[doc = "`."] - #[inline] - fn $name(&mut self, other: $rhs_ty) -> Result<()> { - *self = self.$op(other)?; - Ok(()) - } - }; -} - -/// Trait providing safe arithmetic operations for built-in types. -pub trait SafeArith: Sized + Copy { - const ZERO: Self; - const ONE: Self; - - /// Safe variant of `+` that guards against overflow. - fn safe_add(&self, other: Rhs) -> Result; - - /// Safe variant of `-` that guards against overflow. - fn safe_sub(&self, other: Rhs) -> Result; - - /// Safe variant of `*` that guards against overflow. - fn safe_mul(&self, other: Rhs) -> Result; - - /// Safe variant of `/` that guards against division by 0. - fn safe_div(&self, other: Rhs) -> Result; - - /// Safe variant of `%` that guards against division by 0. - fn safe_rem(&self, other: Rhs) -> Result; - - /// Safe variant of `<<` that guards against overflow. - fn safe_shl(&self, other: u32) -> Result; - - /// Safe variant of `>>` that guards against overflow. - fn safe_shr(&self, other: u32) -> Result; - - assign_method!(safe_add_assign, safe_add, Rhs, "+="); - assign_method!(safe_sub_assign, safe_sub, Rhs, "-="); - assign_method!(safe_mul_assign, safe_mul, Rhs, "*="); - assign_method!(safe_div_assign, safe_div, Rhs, "/="); - assign_method!(safe_rem_assign, safe_rem, Rhs, "%="); - assign_method!(safe_shl_assign, safe_shl, u32, "<<="); - assign_method!(safe_shr_assign, safe_shr, u32, ">>="); -} - -macro_rules! 
impl_safe_arith { - ($typ:ty) => { - impl SafeArith for $typ { - const ZERO: Self = 0; - const ONE: Self = 1; - - #[inline] - fn safe_add(&self, other: Self) -> Result { - self.checked_add(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_sub(&self, other: Self) -> Result { - self.checked_sub(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_mul(&self, other: Self) -> Result { - self.checked_mul(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_div(&self, other: Self) -> Result { - self.checked_div(other).ok_or(ArithError::DivisionByZero) - } - - #[inline] - fn safe_rem(&self, other: Self) -> Result { - self.checked_rem(other).ok_or(ArithError::DivisionByZero) - } - - #[inline] - fn safe_shl(&self, other: u32) -> Result { - self.checked_shl(other).ok_or(ArithError::Overflow) - } - - #[inline] - fn safe_shr(&self, other: u32) -> Result { - self.checked_shr(other).ok_or(ArithError::Overflow) - } - } - }; -} - -impl_safe_arith!(u8); -impl_safe_arith!(u16); -impl_safe_arith!(u32); -impl_safe_arith!(u64); -impl_safe_arith!(usize); -impl_safe_arith!(i8); -impl_safe_arith!(i16); -impl_safe_arith!(i32); -impl_safe_arith!(i64); -impl_safe_arith!(isize); - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn basic() { - let x = 10u32; - let y = 11; - assert_eq!(x.safe_add(y), Ok(x + y)); - assert_eq!(y.safe_sub(x), Ok(y - x)); - assert_eq!(x.safe_mul(y), Ok(x * y)); - assert_eq!(x.safe_div(y), Ok(x / y)); - assert_eq!(x.safe_rem(y), Ok(x % y)); - - assert_eq!(x.safe_shl(1), Ok(x << 1)); - assert_eq!(x.safe_shr(1), Ok(x >> 1)); - } - - #[test] - fn mutate() { - let mut x = 0u8; - x.safe_add_assign(2).unwrap(); - assert_eq!(x, 2); - x.safe_sub_assign(1).unwrap(); - assert_eq!(x, 1); - x.safe_shl_assign(1).unwrap(); - assert_eq!(x, 2); - x.safe_mul_assign(3).unwrap(); - assert_eq!(x, 6); - x.safe_div_assign(4).unwrap(); - assert_eq!(x, 1); - x.safe_shr_assign(1).unwrap(); - assert_eq!(x, 0); - } - - #[test] - fn errors() { - 
assert!(u32::MAX.safe_add(1).is_err()); - assert!(u32::MIN.safe_sub(1).is_err()); - assert!(u32::MAX.safe_mul(2).is_err()); - assert!(u32::MAX.safe_div(0).is_err()); - assert!(u32::MAX.safe_rem(0).is_err()); - assert!(u32::MAX.safe_shl(32).is_err()); - assert!(u32::MAX.safe_shr(32).is_err()); - } -} diff --git a/consensus/state_processing/src/all_caches.rs b/consensus/state_processing/src/all_caches.rs index e49eb395c4..0381bb820f 100644 --- a/consensus/state_processing/src/all_caches.rs +++ b/consensus/state_processing/src/all_caches.rs @@ -1,8 +1,7 @@ use crate::common::update_progressive_balances_cache::initialize_progressive_balances_cache; use crate::epoch_cache::initialize_epoch_cache; -use types::{ - BeaconState, ChainSpec, EpochCacheError, EthSpec, FixedBytesExtended, Hash256, RelativeEpoch, -}; +use tracing::instrument; +use types::{BeaconState, ChainSpec, EpochCacheError, EthSpec, Hash256, RelativeEpoch}; /// Mixin trait for the beacon state that provides operations on *all* caches. 
/// @@ -23,6 +22,7 @@ pub trait AllCaches { } impl AllCaches for BeaconState { + #[instrument(skip_all)] fn build_all_caches(&mut self, spec: &ChainSpec) -> Result<(), EpochCacheError> { self.build_caches(spec)?; initialize_epoch_cache(self, spec)?; @@ -32,8 +32,7 @@ impl AllCaches for BeaconState { fn all_caches_built(&self) -> bool { let current_epoch = self.current_epoch(); - let Ok(epoch_cache_decision_block_root) = - self.proposer_shuffling_decision_root(Hash256::zero()) + let Ok(epoch_cache_decision_block_root) = self.epoch_cache_decision_root(Hash256::ZERO) else { return false; }; diff --git a/consensus/state_processing/src/epoch_cache.rs b/consensus/state_processing/src/epoch_cache.rs index 6654c6a7ef..86db037446 100644 --- a/consensus/state_processing/src/epoch_cache.rs +++ b/consensus/state_processing/src/epoch_cache.rs @@ -123,7 +123,7 @@ pub fn is_epoch_cache_initialized( let current_epoch = state.current_epoch(); let epoch_cache: &EpochCache = state.epoch_cache(); let decision_block_root = state - .proposer_shuffling_decision_root(Hash256::zero()) + .epoch_cache_decision_root(Hash256::zero()) .map_err(EpochCacheError::BeaconState)?; Ok(epoch_cache @@ -146,7 +146,7 @@ pub fn initialize_epoch_cache( let current_epoch = state.current_epoch(); let next_epoch = state.next_epoch().map_err(EpochCacheError::BeaconState)?; let decision_block_root = state - .proposer_shuffling_decision_root(Hash256::zero()) + .epoch_cache_decision_root(Hash256::zero()) .map_err(EpochCacheError::BeaconState)?; state.build_total_active_balance_cache(spec)?; diff --git a/consensus/state_processing/src/per_slot_processing.rs b/consensus/state_processing/src/per_slot_processing.rs index 04b1e8148f..8695054e1e 100644 --- a/consensus/state_processing/src/per_slot_processing.rs +++ b/consensus/state_processing/src/per_slot_processing.rs @@ -26,7 +26,7 @@ impl From for Error { /// If the root of the supplied `state` is known, then it can be passed as `state_root`. 
If /// `state_root` is `None`, the root of `state` will be computed using a cached tree hash. /// Providing the `state_root` makes this function several orders of magnitude faster. -#[instrument(skip_all)] +#[instrument(level = "debug", skip_all)] pub fn per_slot_processing( state: &mut BeaconState, state_root: Option, diff --git a/consensus/state_processing/src/upgrade/fulu.rs b/consensus/state_processing/src/upgrade/fulu.rs index 6b038ad73a..c2aced7047 100644 --- a/consensus/state_processing/src/upgrade/fulu.rs +++ b/consensus/state_processing/src/upgrade/fulu.rs @@ -33,9 +33,7 @@ fn initialize_proposer_lookahead( ); } - Vector::new(lookahead).map_err(|e| { - Error::PleaseNotifyTheDevs(format!("Failed to initialize proposer lookahead: {:?}", e)) - }) + Vector::new(lookahead).map_err(|e| e.into()) } pub fn upgrade_state_to_fulu( diff --git a/consensus/types/src/beacon_block.rs b/consensus/types/src/beacon_block.rs index bd47418ad6..9bffedec22 100644 --- a/consensus/types/src/beacon_block.rs +++ b/consensus/types/src/beacon_block.rs @@ -869,6 +869,7 @@ impl<'de, E: EthSpec, Payload: AbstractExecPayload> ContextDeserialize<'de, F } } +#[derive(Clone, Copy)] pub enum BlockImportSource { Gossip, Lookup, diff --git a/consensus/types/src/beacon_block_body.rs b/consensus/types/src/beacon_block_body.rs index d6c6149b4d..3069ffbcd1 100644 --- a/consensus/types/src/beacon_block_body.rs +++ b/consensus/types/src/beacon_block_body.rs @@ -171,7 +171,7 @@ impl<'a, E: EthSpec, Payload: AbstractExecPayload> BeaconBlockBodyRef<'a, E, } } - pub(crate) fn body_merkle_leaves(&self) -> Vec { + pub fn body_merkle_leaves(&self) -> Vec { let mut leaves = vec![]; match self { Self::Base(body) => { @@ -322,29 +322,17 @@ impl<'a, E: EthSpec, Payload: AbstractExecPayload> BeaconBlockBodyRef<'a, E, } pub fn attestations_len(&self) -> usize { - match self { - Self::Base(body) => body.attestations.len(), - Self::Altair(body) => body.attestations.len(), - Self::Bellatrix(body) => 
body.attestations.len(), - Self::Capella(body) => body.attestations.len(), - Self::Deneb(body) => body.attestations.len(), - Self::Electra(body) => body.attestations.len(), - Self::Fulu(body) => body.attestations.len(), - Self::Gloas(body) => body.attestations.len(), - } + map_beacon_block_body_ref!(&'a _, self, |inner, cons| { + cons(inner); + inner.attestations.len() + }) } pub fn attester_slashings_len(&self) -> usize { - match self { - Self::Base(body) => body.attester_slashings.len(), - Self::Altair(body) => body.attester_slashings.len(), - Self::Bellatrix(body) => body.attester_slashings.len(), - Self::Capella(body) => body.attester_slashings.len(), - Self::Deneb(body) => body.attester_slashings.len(), - Self::Electra(body) => body.attester_slashings.len(), - Self::Fulu(body) => body.attester_slashings.len(), - Self::Gloas(body) => body.attester_slashings.len(), - } + map_beacon_block_body_ref!(&'a _, self, |inner, cons| { + cons(inner); + inner.attester_slashings.len() + }) } pub fn attestations(&self) -> Box> + 'a> { diff --git a/consensus/types/src/beacon_response.rs b/consensus/types/src/beacon_response.rs index 2e45854364..fc59fc9432 100644 --- a/consensus/types/src/beacon_response.rs +++ b/consensus/types/src/beacon_response.rs @@ -25,6 +25,7 @@ pub struct ForkVersionedResponse { /// `Deserialize`. 
#[derive(Debug, PartialEq, Clone, Serialize)] pub struct UnversionedResponse { + #[serde(flatten)] pub metadata: M, pub data: T, } @@ -195,9 +196,10 @@ impl From> for BeaconResponse { #[cfg(test)] mod fork_version_response_tests { + use crate::beacon_response::ExecutionOptimisticFinalizedMetadata; use crate::{ ExecutionPayload, ExecutionPayloadBellatrix, ForkName, ForkVersionedResponse, - MainnetEthSpec, + MainnetEthSpec, UnversionedResponse, }; use serde_json::json; @@ -236,4 +238,24 @@ mod fork_version_response_tests { assert!(result.is_err()); } + + // The following test should only pass by having the attribute #[serde(flatten)] on the metadata + #[test] + fn unversioned_response_serialize_dezerialize_round_trip_test() { + // Create an UnversionedResponse with some data + let data = UnversionedResponse { + metadata: ExecutionOptimisticFinalizedMetadata { + execution_optimistic: Some(false), + finalized: Some(false), + }, + data: "some_test_data".to_string(), + }; + + let serialized = serde_json::to_string(&data); + + let deserialized = + serde_json::from_str(&serialized.unwrap()).expect("Failed to deserialize"); + + assert_eq!(data, deserialized); + } } diff --git a/consensus/types/src/beacon_state.rs b/consensus/types/src/beacon_state.rs index 17d56e4644..2d1eab2e1e 100644 --- a/consensus/types/src/beacon_state.rs +++ b/consensus/types/src/beacon_state.rs @@ -173,7 +173,21 @@ pub enum Error { AggregatorNotInCommittee { aggregator_index: u64, }, - PleaseNotifyTheDevs(String), + ComputeProposerIndicesPastEpoch { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ComputeProposerIndicesInsufficientLookahead { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ComputeProposerIndicesExcessiveLookahead { + current_epoch: Epoch, + request_epoch: Epoch, + }, + ProposerLookaheadOutOfBounds { + i: usize, + }, } /// Control whether an epoch-indexed field can be indexed at the next epoch or not. 
@@ -573,6 +587,7 @@ where #[compare_fields(as_iter)] #[test_random(default)] #[superstruct(only(Fulu, Gloas))] + #[serde(with = "ssz_types::serde_utils::quoted_u64_fixed_vec")] pub proposer_lookahead: Vector, // Gloas @@ -906,8 +921,9 @@ impl BeaconState { &self, epoch: Epoch, block_root: Hash256, + spec: &ChainSpec, ) -> Result { - let decision_slot = self.proposer_shuffling_decision_slot(epoch); + let decision_slot = spec.proposer_shuffling_decision_slot::(epoch); if self.slot() <= decision_slot { Ok(block_root) } else { @@ -922,19 +938,18 @@ impl BeaconState { /// /// The `block_root` covers the one-off scenario where the genesis block decides its own /// shuffling. It should be set to the latest block applied to `self` or the genesis block root. - pub fn proposer_shuffling_decision_root(&self, block_root: Hash256) -> Result { - let decision_slot = self.proposer_shuffling_decision_slot(self.current_epoch()); - if self.slot() == decision_slot { - Ok(block_root) - } else { - self.get_block_root(decision_slot).copied() - } + pub fn proposer_shuffling_decision_root( + &self, + block_root: Hash256, + spec: &ChainSpec, + ) -> Result { + self.proposer_shuffling_decision_root_at_epoch(self.current_epoch(), block_root, spec) } - /// Returns the slot at which the proposer shuffling was decided. The block root at this slot - /// can be used to key the proposer shuffling for the given epoch. - fn proposer_shuffling_decision_slot(&self, epoch: Epoch) -> Slot { - epoch.start_slot(E::slots_per_epoch()).saturating_sub(1_u64) + pub fn epoch_cache_decision_root(&self, block_root: Hash256) -> Result { + // Epoch cache decision root for the current epoch (N) is the block root at the end of epoch + // N - 1. This is the same as the root that determines the next epoch attester shuffling. + self.attester_shuffling_decision_root(block_root, RelativeEpoch::Next) } /// Returns the block root which decided the attester shuffling for the given `relative_epoch`. 
@@ -1018,6 +1033,45 @@ impl BeaconState { indices: &[usize], spec: &ChainSpec, ) -> Result, Error> { + // Regardless of fork, we never support computing proposer indices for past epochs. + let current_epoch = self.current_epoch(); + if epoch < current_epoch { + return Err(Error::ComputeProposerIndicesPastEpoch { + current_epoch, + request_epoch: epoch, + }); + } + + if spec.fork_name_at_epoch(epoch).fulu_enabled() { + // Post-Fulu we must never compute proposer indices using insufficient lookahead. This + // would be very dangerous as it would lead to conflicts between the *true* proposer as + // defined by `self.proposer_lookahead` and the output of this function. + // With MIN_SEED_LOOKAHEAD=1 (common config), this is equivalent to checking that the + // requested epoch is not the current epoch. + // + // We do not run this check if this function is called from `upgrade_to_fulu`, + // which runs *after* the slot is incremented, and needs to compute the proposer + // shuffling for the epoch that was just transitioned into. + if self.fork_name_unchecked().fulu_enabled() + && epoch < current_epoch.safe_add(spec.min_seed_lookahead)? + { + return Err(Error::ComputeProposerIndicesInsufficientLookahead { + current_epoch, + request_epoch: epoch, + }); + } + } else { + // Pre-Fulu the situation is reversed, we *should not* compute proposer indices using + // too much lookahead. To do so would make us vulnerable to changes in the proposer + // indices caused by effective balance changes. + if epoch >= current_epoch.safe_add(spec.min_seed_lookahead)? 
{ + return Err(Error::ComputeProposerIndicesExcessiveLookahead { + current_epoch, + request_epoch: epoch, + }); + } + } + epoch .slot_iter(E::slots_per_epoch()) .map(|slot| { @@ -1164,10 +1218,7 @@ impl BeaconState { let index = slot.as_usize().safe_rem(E::slots_per_epoch() as usize)?; proposer_lookahead .get(index) - .ok_or(Error::PleaseNotifyTheDevs(format!( - "Proposer lookahead out of bounds: {} for slot: {}", - index, slot - ))) + .ok_or(Error::ProposerLookaheadOutOfBounds { i: index }) .map(|index| *index as usize) } else { // Pre-Fulu @@ -1186,6 +1237,25 @@ impl BeaconState { epoch: Epoch, spec: &ChainSpec, ) -> Result, Error> { + // This isn't in the spec, but we remove the footgun that is requesting the current epoch + // for a Fulu state. + if let Ok(proposer_lookahead) = self.proposer_lookahead() + && epoch >= self.current_epoch() + && epoch <= self.next_epoch()? + { + let slots_per_epoch = E::slots_per_epoch() as usize; + let start_offset = if epoch == self.current_epoch() { + 0 + } else { + slots_per_epoch + }; + return Ok(proposer_lookahead + .iter_from(start_offset)? + .take(slots_per_epoch) + .map(|x| *x as usize) + .collect()); + } + // Not using the cached validator indices since they are shuffled. let indices = self.get_active_validator_indices(epoch, spec)?; @@ -2665,6 +2735,12 @@ impl BeaconState { } } +impl ForkVersionDecode for BeaconState { + fn from_ssz_bytes_by_fork(bytes: &[u8], fork_name: ForkName) -> Result { + Ok(map_fork_name!(fork_name, Self, <_>::from_ssz_bytes(bytes)?)) + } +} + impl BeaconState { /// The number of fields of the `BeaconState` rounded up to the nearest power of two. 
/// @@ -2793,7 +2869,7 @@ impl BeaconState { Ok(proof) } - fn generate_proof( + pub fn generate_proof( &self, field_index: usize, leaves: &[Hash256], @@ -2808,7 +2884,7 @@ impl BeaconState { Ok(proof) } - fn get_beacon_state_leaves(&self) -> Vec { + pub fn get_beacon_state_leaves(&self) -> Vec { let mut leaves = vec![]; #[allow(clippy::arithmetic_side_effects)] match self { diff --git a/consensus/types/src/chain_spec.rs b/consensus/types/src/chain_spec.rs index 92cccf2b6f..364dc6c731 100644 --- a/consensus/types/src/chain_spec.rs +++ b/consensus/types/src/chain_spec.rs @@ -234,7 +234,7 @@ pub struct ChainSpec { pub ttfb_timeout: u64, pub resp_timeout: u64, pub attestation_propagation_slot_range: u64, - pub maximum_gossip_clock_disparity_millis: u64, + pub maximum_gossip_clock_disparity: u64, pub message_domain_invalid_snappy: [u8; 4], pub message_domain_valid_snappy: [u8; 4], pub subnets_per_node: u8, @@ -679,7 +679,7 @@ impl ChainSpec { } pub fn maximum_gossip_clock_disparity(&self) -> Duration { - Duration::from_millis(self.maximum_gossip_clock_disparity_millis) + Duration::from_millis(self.maximum_gossip_clock_disparity) } pub fn ttfb_timeout(&self) -> Duration { @@ -874,6 +874,34 @@ impl ChainSpec { ) } + /// Returns the slot at which the proposer shuffling was decided. + /// + /// The block root at this slot can be used to key the proposer shuffling for the given epoch. + pub fn proposer_shuffling_decision_slot(&self, epoch: Epoch) -> Slot { + // At the Fulu fork epoch itself, the shuffling is computed "the old way" with no lookahead. + // Therefore for `epoch == fulu_fork_epoch` we must take the `else` branch. Checking if Fulu + // is enabled at `epoch - 1` accomplishes this neatly. + if self + .fork_name_at_epoch(epoch.saturating_sub(1_u64)) + .fulu_enabled() + { + // Post-Fulu the proposer shuffling decision slot for epoch N is the slot at the end + // of epoch N - 2 (note: min_seed_lookahead=1 in all current configs). 
+ epoch + .saturating_sub(self.min_seed_lookahead) + .start_slot(E::slots_per_epoch()) + .saturating_sub(1_u64) + } else { + // Pre-Fulu the proposer shuffling decision slot for epoch N is the slot at the end of + // epoch N - 1 (note: +1 -1 for min_seed_lookahead=1 in all current configs). + epoch + .saturating_add(Epoch::new(1)) + .saturating_sub(self.min_seed_lookahead) + .start_slot(E::slots_per_epoch()) + .saturating_sub(1_u64) + } + } + /// Returns a `ChainSpec` compatible with the Ethereum Foundation specification. pub fn mainnet() -> Self { Self { @@ -1098,7 +1126,7 @@ impl ChainSpec { attestation_propagation_slot_range: default_attestation_propagation_slot_range(), attestation_subnet_count: 64, subnets_per_node: 2, - maximum_gossip_clock_disparity_millis: default_maximum_gossip_clock_disparity_millis(), + maximum_gossip_clock_disparity: default_maximum_gossip_clock_disparity(), target_aggregators_per_committee: 16, max_payload_size: default_max_payload_size(), min_epochs_for_block_requests: default_min_epochs_for_block_requests(), @@ -1449,7 +1477,7 @@ impl ChainSpec { attestation_propagation_slot_range: default_attestation_propagation_slot_range(), attestation_subnet_count: 64, subnets_per_node: 4, // Make this larger than usual to avoid network damage - maximum_gossip_clock_disparity_millis: default_maximum_gossip_clock_disparity_millis(), + maximum_gossip_clock_disparity: default_maximum_gossip_clock_disparity(), target_aggregators_per_committee: 16, max_payload_size: default_max_payload_size(), min_epochs_for_block_requests: 33024, @@ -1776,9 +1804,9 @@ pub struct Config { #[serde(default = "default_attestation_propagation_slot_range")] #[serde(with = "serde_utils::quoted_u64")] attestation_propagation_slot_range: u64, - #[serde(default = "default_maximum_gossip_clock_disparity_millis")] + #[serde(default = "default_maximum_gossip_clock_disparity")] #[serde(with = "serde_utils::quoted_u64")] - maximum_gossip_clock_disparity_millis: u64, + 
maximum_gossip_clock_disparity: u64, #[serde(default = "default_message_domain_invalid_snappy")] #[serde(with = "serde_utils::bytes_4_hex")] message_domain_invalid_snappy: [u8; 4], @@ -2000,7 +2028,7 @@ const fn default_attestation_propagation_slot_range() -> u64 { 32 } -const fn default_maximum_gossip_clock_disparity_millis() -> u64 { +const fn default_maximum_gossip_clock_disparity() -> u64 { 500 } @@ -2222,7 +2250,7 @@ impl Config { ttfb_timeout: spec.ttfb_timeout, resp_timeout: spec.resp_timeout, attestation_propagation_slot_range: spec.attestation_propagation_slot_range, - maximum_gossip_clock_disparity_millis: spec.maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity: spec.maximum_gossip_clock_disparity, message_domain_invalid_snappy: spec.message_domain_invalid_snappy, message_domain_valid_snappy: spec.message_domain_valid_snappy, max_request_blocks_deneb: spec.max_request_blocks_deneb, @@ -2312,7 +2340,7 @@ impl Config { message_domain_valid_snappy, max_request_blocks, attestation_propagation_slot_range, - maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity, max_request_blocks_deneb, max_request_blob_sidecars, max_request_data_column_sidecars, @@ -2390,7 +2418,7 @@ impl Config { attestation_subnet_prefix_bits, max_request_blocks, attestation_propagation_slot_range, - maximum_gossip_clock_disparity_millis, + maximum_gossip_clock_disparity, max_request_blocks_deneb, max_request_blob_sidecars, max_request_data_column_sidecars, @@ -3019,4 +3047,32 @@ mod yaml_tests { spec.min_epoch_data_availability_boundary(current_epoch) ); } + + #[test] + fn proposer_shuffling_decision_root_around_epoch_boundary() { + type E = MainnetEthSpec; + let fulu_fork_epoch = 5; + let spec = { + let mut spec = ForkName::Electra.make_genesis_spec(E::default_spec()); + spec.fulu_fork_epoch = Some(Epoch::new(fulu_fork_epoch)); + Arc::new(spec) + }; + + // For epochs prior to AND including the Fulu fork epoch, the decision slot is the end + // of the 
previous epoch (i.e. only 1 slot lookahead). + for epoch in (0..=fulu_fork_epoch).map(Epoch::new) { + assert_eq!( + spec.proposer_shuffling_decision_slot::(epoch), + epoch.start_slot(E::slots_per_epoch()) - 1 + ); + } + + // For epochs after Fulu, the decision slot is the end of the epoch two epochs prior. + for epoch in ((fulu_fork_epoch + 1)..(fulu_fork_epoch + 10)).map(Epoch::new) { + assert_eq!( + spec.proposer_shuffling_decision_slot::(epoch), + (epoch - 1).start_slot(E::slots_per_epoch()) - 1 + ); + } + } } diff --git a/consensus/types/src/data_column_sidecar.rs b/consensus/types/src/data_column_sidecar.rs index 57f7a88e19..2272b1695c 100644 --- a/consensus/types/src/data_column_sidecar.rs +++ b/consensus/types/src/data_column_sidecar.rs @@ -143,6 +143,7 @@ pub enum DataColumnSidecarError { PreDeneb, SszError(SszError), BuildSidecarFailed(String), + InvalidCellProofLength { expected: usize, actual: usize }, } impl From for DataColumnSidecarError { diff --git a/consensus/types/src/data_column_subnet_id.rs b/consensus/types/src/data_column_subnet_id.rs index 125a77fc1e..4061cb4fdb 100644 --- a/consensus/types/src/data_column_subnet_id.rs +++ b/consensus/types/src/data_column_subnet_id.rs @@ -1,13 +1,15 @@ //! Identifies each data column subnet by an integer identifier. 
use crate::ChainSpec; use crate::data_column_sidecar::ColumnIndex; +use derivative::Derivative; use safe_arith::{ArithError, SafeArith}; use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; use std::ops::{Deref, DerefMut}; #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Clone, Copy, Derivative, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derivative(Debug = "transparent")] #[serde(transparent)] pub struct DataColumnSubnetId(#[serde(with = "serde_utils::quoted_u64")] u64); diff --git a/consensus/types/src/epoch_cache.rs b/consensus/types/src/epoch_cache.rs index ef91c20d75..9956cb400a 100644 --- a/consensus/types/src/epoch_cache.rs +++ b/consensus/types/src/epoch_cache.rs @@ -5,9 +5,13 @@ use std::sync::Arc; /// Cache of values which are uniquely determined at the start of an epoch. /// /// The values are fixed with respect to the last block of the _prior_ epoch, which we refer -/// to as the "decision block". This cache is very similar to the `BeaconProposerCache` in that -/// beacon proposers are determined at exactly the same time as the values in this cache, so -/// the keys for the two caches are identical. +/// to as the "decision block". +/// +/// Prior to Fulu this cache was similar to the `BeaconProposerCache` in that beacon proposers were +/// determined at exactly the same time as the values in this cache, so the keys for the two caches +/// were identical. +/// +/// Post-Fulu, we use a different key (the proposers have more lookahead). 
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[derive(Debug, PartialEq, Eq, Clone, Default)] pub struct EpochCache { diff --git a/consensus/types/src/fork_name.rs b/consensus/types/src/fork_name.rs index f12b14ff6e..338e2b1e75 100644 --- a/consensus/types/src/fork_name.rs +++ b/consensus/types/src/fork_name.rs @@ -51,7 +51,7 @@ impl ForkName { /// This fork serves as the baseline for many tests, and the goal /// is to ensure features are passing on this fork. pub fn latest_stable() -> ForkName { - ForkName::Electra + ForkName::Fulu } /// Set the activation slots in the given `ChainSpec` so that the fork named by `self` @@ -201,6 +201,46 @@ impl ForkName { pub fn gloas_enabled(self) -> bool { self >= ForkName::Gloas } + + pub fn fork_ascii(self) { + if self == ForkName::Fulu { + println!( + r#" + ╔═══════════════════════════════════════╗ + ║ ║ + ║ TO FULU, MOAR BLOBS TO ETHEREUM ║ + ║ ║ + ║ III DECEMBER MMXXV ║ + ║ ║ + ╚═══════════════════════════════════════╝ + + ============================================================================= + |||| |||| + |---------------------------------------------------------------------------| + |___-----___-----___-----___-----___-----___-----___-----___-----___-----___| + / _ \===/ _ \ / _ \===/ _ \ / _ \===/ _ \ / _ \===/ _ \ + ( (.\ oOo /.) ) ( (.\ oOo /.) ) ( (.\ oOo /.) ) ( (.\ oOo /.) ) + \__/=====\__/ \__/=====\__/ \__/=====\__/ \__/=====\__/ + ||||||| ||||||| ||||||| ||||||| + ||||||| ||||||| \\/), ||||||| ||||||| + ||||||| ||||||| ,'.' /, ||||||| ||||||| + ||||||| ||||||| (_)- / /, ||||||| ||||||| + ||||||| ||||||| /\_/ |__..--, * ||||||| ||||||| + ||||||| ||||||| (\___/\ \ \ / ).' ||||||| ||||||| + ||||||| ||||||| \____/ / (_ // ||||||| ||||||| + ||||||| ||||||| \\_ ,'--'\_( ||||||| ||||||| + (oOoOo) (oOoOo) )_)_/ )_/ )_) (oOoOo) (oOoOo) + J%%%%%L J%%%%%L (_(_.'(_.'(_.' 
J%%%%%L J%%%%%L + ZZZZZZZZZ ZZZZZZZZZ ZZZZZZZZZ ZZZZZZZZZ + =========================================================================== + |_________________________________________________________________________| + |___________________________________________________________________________| + |_____________________________________________________________________________| + |_______________________________________________________________________________| + "# + ); + } + } } /// Map a fork name into a fork-versioned superstruct type like `BeaconBlock`. diff --git a/consensus/types/src/preset.rs b/consensus/types/src/preset.rs index c31183192f..ab54c0345f 100644 --- a/consensus/types/src/preset.rs +++ b/consensus/types/src/preset.rs @@ -208,6 +208,8 @@ pub struct DenebPreset { #[serde(with = "serde_utils::quoted_u64")] pub max_blob_commitments_per_block: u64, #[serde(with = "serde_utils::quoted_u64")] + pub kzg_commitment_inclusion_proof_depth: u64, + #[serde(with = "serde_utils::quoted_u64")] pub field_elements_per_blob: u64, } @@ -215,6 +217,7 @@ impl DenebPreset { pub fn from_chain_spec(_spec: &ChainSpec) -> Self { Self { max_blob_commitments_per_block: E::max_blob_commitments_per_block() as u64, + kzg_commitment_inclusion_proof_depth: E::KzgCommitmentInclusionProofDepth::to_u64(), field_elements_per_blob: E::field_elements_per_blob() as u64, } } diff --git a/consensus/types/src/runtime_fixed_vector.rs b/consensus/types/src/runtime_fixed_vector.rs index 2b08b7bf70..f562322a3d 100644 --- a/consensus/types/src/runtime_fixed_vector.rs +++ b/consensus/types/src/runtime_fixed_vector.rs @@ -2,12 +2,21 @@ //! //! The length of the list cannot be changed once it is set. 
-#[derive(Clone, Debug)] +use std::fmt; +use std::fmt::Debug; + +#[derive(Clone)] pub struct RuntimeFixedVector { vec: Vec, len: usize, } +impl Debug for RuntimeFixedVector { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} (len={})", self.vec, self.len) + } +} + impl RuntimeFixedVector { pub fn new(vec: Vec) -> Self { let len = vec.len(); diff --git a/consensus/types/src/runtime_var_list.rs b/consensus/types/src/runtime_var_list.rs index dcb98538b7..d57c65b1b7 100644 --- a/consensus/types/src/runtime_var_list.rs +++ b/consensus/types/src/runtime_var_list.rs @@ -4,6 +4,8 @@ use serde::de::Error as DeError; use serde::{Deserialize, Deserializer, Serialize}; use ssz::Decode; use ssz_types::Error; +use std::fmt; +use std::fmt::Debug; use std::ops::{Deref, Index, IndexMut}; use std::slice::SliceIndex; use tree_hash::{Hash256, MerkleHasher, PackedEncoding, TreeHash, TreeHashType}; @@ -42,7 +44,7 @@ use tree_hash::{Hash256, MerkleHasher, PackedEncoding, TreeHash, TreeHashType}; /// assert!(long.push(6).is_err()); /// /// ``` -#[derive(Debug, Clone, Serialize, Deserialize, Derivative)] +#[derive(Clone, Serialize, Deserialize, Derivative)] #[derivative(PartialEq, Eq, Hash(bound = "T: std::hash::Hash"))] #[serde(transparent)] pub struct RuntimeVariableList { @@ -51,6 +53,12 @@ pub struct RuntimeVariableList { max_len: usize, } +impl Debug for RuntimeVariableList { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} (max_len={})", self.vec, self.max_len) + } +} + impl RuntimeVariableList { /// Returns `Ok` if the given `vec` equals the fixed length of `Self`. Otherwise returns /// `Err(OutOfBounds { .. })`. 
diff --git a/consensus/types/src/selection_proof.rs b/consensus/types/src/selection_proof.rs index e471457c25..aa8c0c5658 100644 --- a/consensus/types/src/selection_proof.rs +++ b/consensus/types/src/selection_proof.rs @@ -3,11 +3,13 @@ use crate::{ }; use ethereum_hashing::hash; use safe_arith::{ArithError, SafeArith}; +use serde::{Deserialize, Serialize}; use ssz::Encode; use std::cmp; #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -#[derive(PartialEq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[serde(transparent)] pub struct SelectionProof(Signature); impl SelectionProof { diff --git a/consensus/types/src/sync_selection_proof.rs b/consensus/types/src/sync_selection_proof.rs index 6387212d94..b1e9e8186f 100644 --- a/consensus/types/src/sync_selection_proof.rs +++ b/consensus/types/src/sync_selection_proof.rs @@ -7,12 +7,14 @@ use crate::{ }; use ethereum_hashing::hash; use safe_arith::{ArithError, SafeArith}; +use serde::{Deserialize, Serialize}; use ssz::Encode; use ssz_types::typenum::Unsigned; use std::cmp; #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] -#[derive(PartialEq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone, Serialize, Deserialize)] +#[serde(transparent)] pub struct SyncSelectionProof(Signature); impl SyncSelectionProof { diff --git a/crypto/kzg/src/lib.rs b/crypto/kzg/src/lib.rs index 1b8d46100f..0fe95b7723 100644 --- a/crypto/kzg/src/lib.rs +++ b/crypto/kzg/src/lib.rs @@ -23,7 +23,7 @@ pub use rust_eth_kzg::{ constants::{BYTES_PER_CELL, CELLS_PER_EXT_BLOB}, Cell, CellIndex as CellID, CellRef, TrustedSetup as PeerDASTrustedSetup, }; -use tracing::instrument; +use tracing::{instrument, Span}; /// Disables the fixed-base multi-scalar multiplication optimization for computing /// cell KZG proofs, because `rust-eth-kzg` already handles the precomputation. 
@@ -269,6 +269,7 @@ impl Kzg { .push((cell, *proof, *commitment)); } + let span = Span::current(); column_groups .into_par_iter() .map(|(column_index, column_data)| { @@ -286,6 +287,7 @@ impl Kzg { // This is safe from span explosion as we have at most 128 chunks, // i.e. the number of column indices. let _span = tracing::debug_span!( + parent: span.clone(), "verify_cell_proof_chunk", cells = cells.len(), column_index, diff --git a/lcli/Cargo.toml b/lcli/Cargo.toml index b962fa3b81..e5ed7a8926 100644 --- a/lcli/Cargo.toml +++ b/lcli/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "lcli" description = "Lighthouse CLI (modeled after zcli)" -version = "7.1.0" +version = "8.0.0-rc.1" authors = ["Paul Hauner "] edition = { workspace = true } @@ -30,6 +30,7 @@ hex = { workspace = true } lighthouse_network = { workspace = true } lighthouse_version = { workspace = true } log = { workspace = true } +network_utils = { workspace = true } rayon = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/lcli/src/generate_bootnode_enr.rs b/lcli/src/generate_bootnode_enr.rs index 6fe13d17c3..ddd36e7e7a 100644 --- a/lcli/src/generate_bootnode_enr.rs +++ b/lcli/src/generate_bootnode_enr.rs @@ -1,9 +1,10 @@ use clap::ArgMatches; use lighthouse_network::{ NETWORK_KEY_FILENAME, NetworkConfig, - discovery::{CombinedKey, CombinedKeyExt, ENR_FILENAME, build_enr}, + discovery::{CombinedKey, ENR_FILENAME, build_enr}, libp2p::identity::secp256k1, }; +use network_utils::enr_ext::CombinedKeyExt; use std::io::Write; use std::path::PathBuf; use std::{fs, net::Ipv4Addr}; diff --git a/lcli/src/http_sync.rs b/lcli/src/http_sync.rs index 2e36eadf23..6f7dcdb595 100644 --- a/lcli/src/http_sync.rs +++ b/lcli/src/http_sync.rs @@ -124,7 +124,7 @@ async fn get_block_from_source( .unwrap() .unwrap(); let blobs_from_source = source - .get_blobs::(block_id, None, spec) + .get_blob_sidecars::(block_id, None, spec) .await .unwrap() .unwrap() diff --git 
a/lighthouse/Cargo.toml b/lighthouse/Cargo.toml index 849d30bcf2..ef680c9b96 100644 --- a/lighthouse/Cargo.toml +++ b/lighthouse/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lighthouse" -version = "7.1.0" +version = "8.0.0-rc.1" authors = ["Sigma Prime "] edition = { workspace = true } autotests = false @@ -57,6 +57,7 @@ lighthouse_tracing = { workspace = true } lighthouse_version = { workspace = true } logging = { workspace = true } metrics = { workspace = true } +network_utils = { workspace = true } opentelemetry = { workspace = true } opentelemetry-otlp = { workspace = true } opentelemetry_sdk = { workspace = true } @@ -70,7 +71,6 @@ tracing = { workspace = true } tracing-opentelemetry = { workspace = true } tracing-subscriber = { workspace = true } types = { workspace = true } -unused_port = { workspace = true } validator_client = { workspace = true } validator_manager = { path = "../validator_manager" } diff --git a/lighthouse/src/main.rs b/lighthouse/src/main.rs index 3b0f7c3376..c93016a0f5 100644 --- a/lighthouse/src/main.rs +++ b/lighthouse/src/main.rs @@ -20,14 +20,15 @@ use lighthouse_version::VERSION; use logging::{MetricsLayer, build_workspace_filter, crit}; use malloc_utils::configure_memory_allocator; use opentelemetry::trace::TracerProvider; -use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::tonic_types::transport::ClientTlsConfig; +use opentelemetry_otlp::{WithExportConfig, WithTonicConfig}; use std::backtrace::Backtrace; use std::io::IsTerminal; use std::path::PathBuf; use std::process::exit; use std::sync::LazyLock; use task_executor::ShutdownReason; -use tracing::{Level, info, warn}; +use tracing::{Level, info}; use tracing_subscriber::{Layer, filter::EnvFilter, layer::SubscriberExt, util::SubscriberInitExt}; use types::{EthSpec, EthSpecId}; use validator_client::ProductionValidatorClient; @@ -125,16 +126,6 @@ fn main() { .global(true) .display_order(0), ) - .arg( - Arg::new("logfile") - .long("logfile") - .value_name("PATH") - 
.help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .hide(true) - .display_order(0) - ) .arg( Arg::new("logfile-dir") .long("logfile-dir") @@ -384,48 +375,6 @@ fn main() { .global(true) .display_order(0) ) - .arg( - Arg::new("terminal-total-difficulty-override") - .long("terminal-total-difficulty-override") - .value_name("INTEGER") - .help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("terminal-block-hash-override") - .long("terminal-block-hash-override") - .value_name("TERMINAL_BLOCK_HASH") - .help("DEPRECATED") - .requires("terminal-block-hash-epoch-override") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("terminal-block-hash-epoch-override") - .long("terminal-block-hash-epoch-override") - .value_name("EPOCH") - .help("DEPRECATED") - .requires("terminal-block-hash-override") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) - .arg( - Arg::new("safe-slots-to-import-optimistically") - .long("safe-slots-to-import-optimistically") - .value_name("INTEGER") - .help("DEPRECATED") - .action(ArgAction::Set) - .global(true) - .display_order(0) - .hide(true) - ) .arg( Arg::new("genesis-state-url") .long("genesis-state-url") @@ -712,6 +661,7 @@ fn run( let telemetry_layer = environment.runtime().block_on(async { let exporter = opentelemetry_otlp::SpanExporter::builder() .with_tonic() + .with_tls_config(ClientTlsConfig::new().with_native_roots()) .with_endpoint(telemetry_collector_url) .build() .map_err(|e| format!("Failed to create OTLP exporter: {:?}", e))?; @@ -778,11 +728,6 @@ fn run( // Allow Prometheus access to the version and commit of the Lighthouse build. metrics::expose_lighthouse_version(); - // DEPRECATED: can be removed in v7.2.0/v8.0.0. 
- if clap_utils::parse_optional::(matches, "logfile")?.is_some() { - warn!("The --logfile flag is deprecated and replaced by --logfile-dir"); - } - #[cfg(all(feature = "modern", target_arch = "x86_64"))] if !std::is_x86_feature_detected!("adx") { tracing::warn!( @@ -791,20 +736,6 @@ fn run( ); } - // Warn for DEPRECATED global flags. This code should be removed when we finish deleting these - // flags. - let deprecated_flags = [ - "terminal-total-difficulty-override", - "terminal-block-hash-override", - "terminal-block-hash-epoch-override", - "safe-slots-to-import-optimistically", - ]; - for flag in deprecated_flags { - if matches.get_one::(flag).is_some() { - warn!("The {} flag is deprecated and does nothing", flag); - } - } - // Note: the current code technically allows for starting a beacon node _and_ a validator // client at the same time. // diff --git a/lighthouse/tests/beacon_node.rs b/lighthouse/tests/beacon_node.rs index 38fd54d29d..5a057d7d7f 100644 --- a/lighthouse/tests/beacon_node.rs +++ b/lighthouse/tests/beacon_node.rs @@ -10,6 +10,9 @@ use beacon_node::{ }; use beacon_processor::BeaconProcessorConfig; use lighthouse_network::PeerId; +use network_utils::unused_port::{ + unused_tcp4_port, unused_tcp6_port, unused_udp4_port, unused_udp6_port, +}; use std::fs::File; use std::io::{Read, Write}; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; @@ -22,7 +25,6 @@ use std::time::Duration; use tempfile::TempDir; use types::non_zero_usize::new_non_zero_usize; use types::{Address, Checkpoint, Epoch, Hash256, MainnetEthSpec}; -use unused_port::{unused_tcp4_port, unused_tcp6_port, unused_udp4_port, unused_udp6_port}; const DEFAULT_EXECUTION_ENDPOINT: &str = "http://localhost:8551/"; const DEFAULT_EXECUTION_JWT_SECRET_KEY: &str = @@ -390,27 +392,35 @@ fn genesis_backfill_with_historic_flag() { .with_config(|config| assert!(config.chain.genesis_backfill)); } -// Tests for Eth1 flags. 
-// DEPRECATED but should not crash #[test] -fn eth1_blocks_per_log_query_flag() { +fn complete_blob_backfill_default() { CommandLineTest::new() - .flag("eth1-blocks-per-log-query", Some("500")) - .run_with_zero_port(); + .run_with_zero_port() + .with_config(|config| assert!(!config.chain.complete_blob_backfill)); } -// DEPRECATED but should not crash + #[test] -fn eth1_purge_cache_flag() { +fn complete_blob_backfill_flag() { CommandLineTest::new() - .flag("eth1-purge-cache", None) - .run_with_zero_port(); + .flag("complete-blob-backfill", None) + .run_with_zero_port() + .with_config(|config| { + assert!(config.chain.complete_blob_backfill); + assert!(!config.store.prune_blobs); + }); } -// DEPRECATED but should not crash + +// Even if `--prune-blobs true` is provided, `--complete-blob-backfill` should override it to false. #[test] -fn eth1_cache_follow_distance_manual() { +fn complete_blob_backfill_and_prune_blobs_true() { CommandLineTest::new() - .flag("eth1-cache-follow-distance", Some("128")) - .run_with_zero_port(); + .flag("complete-blob-backfill", None) + .flag("prune-blobs", Some("true")) + .run_with_zero_port() + .with_config(|config| { + assert!(config.chain.complete_blob_backfill); + assert!(!config.store.prune_blobs); + }); } // Tests for Bellatrix flags. @@ -748,31 +758,6 @@ fn jwt_optional_flags() { fn jwt_optional_alias_flags() { run_jwt_optional_flags_test("jwt-secrets", "jwt-id", "jwt-version"); } -// DEPRECATED. This flag is deprecated but should not cause a crash. -#[test] -fn terminal_total_difficulty_override_flag() { - CommandLineTest::new() - .flag("terminal-total-difficulty-override", Some("1337424242")) - .run_with_zero_port(); -} -// DEPRECATED. This flag is deprecated but should not cause a crash. 
-#[test] -fn terminal_block_hash_and_activation_epoch_override_flags() { - CommandLineTest::new() - .flag("terminal-block-hash-epoch-override", Some("1337")) - .flag( - "terminal-block-hash-override", - Some("0x4242424242424242424242424242424242424242424242424242424242424242"), - ) - .run_with_zero_port(); -} -// DEPRECATED. This flag is deprecated but should not cause a crash. -#[test] -fn safe_slots_to_import_optimistically_flag() { - CommandLineTest::new() - .flag("safe-slots-to-import-optimistically", Some("421337")) - .run_with_zero_port(); -} // Tests for Network flags. #[test] @@ -800,6 +785,19 @@ fn network_subscribe_all_data_column_subnets_flag() { .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); } #[test] +fn network_supernode_flag() { + CommandLineTest::new() + .flag("supernode", None) + .run_with_zero_port() + .with_config(|config| assert!(config.network.subscribe_all_data_column_subnets)); +} +#[test] +fn network_subscribe_all_data_column_subnets_default() { + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| assert!(!config.network.subscribe_all_data_column_subnets)); +} +#[test] fn blob_publication_batches() { CommandLineTest::new() .flag("blob-publication-batches", Some("3")) @@ -1806,12 +1804,25 @@ fn slots_per_restore_point_flag() { .run_with_zero_port(); } +#[test] +fn block_cache_size_default() { + CommandLineTest::new() + .run_with_zero_port() + .with_config(|config| assert_eq!(config.store.block_cache_size, 0)); +} #[test] fn block_cache_size_flag() { CommandLineTest::new() .flag("block-cache-size", Some("4")) .run_with_zero_port() - .with_config(|config| assert_eq!(config.store.block_cache_size, new_non_zero_usize(4))); + .with_config(|config| assert_eq!(config.store.block_cache_size, 4)); +} +#[test] +fn block_cache_size_zero() { + CommandLineTest::new() + .flag("block-cache-size", Some("0")) + .run_with_zero_port() + .with_config(|config| assert_eq!(config.store.block_cache_size, 0)); } 
#[test] fn state_cache_size_default() { @@ -2464,42 +2475,6 @@ fn logfile_format_flag() { ) }); } -// DEPRECATED but should not crash. -#[test] -fn deprecated_logfile() { - CommandLineTest::new() - .flag("logfile", Some("test.txt")) - .run_with_zero_port(); -} - -// DEPRECATED but should not crash. -#[test] -fn sync_eth1_chain_disable_deposit_contract_sync_flag() { - let dir = TempDir::new().expect("Unable to create temporary directory"); - CommandLineTest::new_with_no_execution_endpoint() - .flag("disable-deposit-contract-sync", None) - .flag("execution-endpoints", Some("http://localhost:8551/")) - .flag( - "execution-jwt", - dir.path().join("jwt-file").as_os_str().to_str(), - ) - .run_with_zero_port(); -} - -#[test] -#[should_panic] -fn disable_deposit_contract_sync_conflicts_with_staking() { - let dir = TempDir::new().expect("Unable to create temporary directory"); - CommandLineTest::new_with_no_execution_endpoint() - .flag("disable-deposit-contract-sync", None) - .flag("staking", None) - .flag("execution-endpoints", Some("http://localhost:8551/")) - .flag( - "execution-jwt", - dir.path().join("jwt-file").as_os_str().to_str(), - ) - .run_with_zero_port(); -} #[test] fn light_client_server_default() { @@ -2514,7 +2489,6 @@ fn light_client_server_default() { #[test] fn light_client_server_enabled() { CommandLineTest::new() - .flag("light-client-server", None) .run_with_zero_port() .with_config(|config| { assert!(config.network.enable_light_client_server); diff --git a/lighthouse/tests/boot_node.rs b/lighthouse/tests/boot_node.rs index bd1cd7574e..38111ca0ef 100644 --- a/lighthouse/tests/boot_node.rs +++ b/lighthouse/tests/boot_node.rs @@ -3,8 +3,8 @@ use boot_node::config::BootNodeConfigSerialization; use crate::exec::{CommandLineTestExec, CompletedTest}; use clap::ArgMatches; use clap_utils::get_eth2_network_config; -use lighthouse_network::Enr; -use lighthouse_network::discovery::ENR_FILENAME; +use lighthouse_network::{Enr, discovery::ENR_FILENAME}; +use 
network_utils::unused_port::unused_udp4_port; use std::fs::File; use std::io::Write; use std::net::Ipv4Addr; @@ -12,7 +12,6 @@ use std::path::{Path, PathBuf}; use std::process::Command; use std::str::FromStr; use tempfile::TempDir; -use unused_port::unused_udp4_port; const IP_ADDRESS: &str = "192.168.2.108"; diff --git a/scripts/print_release_diffs.py b/scripts/print_release_diffs.py new file mode 100644 index 0000000000..d910b1be5b --- /dev/null +++ b/scripts/print_release_diffs.py @@ -0,0 +1,72 @@ +""" +Summarise pull requests between two Lighthouse releases. + +Usage: + export GITHUB_TOKEN=your_token + python -m pip install requests==2.32.4 + python print_release_diffs.py --base v7.0.1 --head release-v7.1.0 + +Shows commit SHA, PR number, 'backwards-incompat' label status, and PR title. +""" + +import requests +import re +import argparse +import os + +GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") +if not GITHUB_TOKEN: + raise SystemExit("Error: Please set the GITHUB_TOKEN environment variable.") + +parser = argparse.ArgumentParser(description="Summarise PRs between two Lighthouse versions.") +parser.add_argument("--base", required=True, help="Base tag or branch (older release)") +parser.add_argument("--head", required=True, help="Head tag or branch (newer release)") +args = parser.parse_args() + +BASE = args.base +HEAD = args.head +OWNER = 'sigp' +REPO = 'lighthouse' + +HEADERS = { + 'Authorization': f'token {GITHUB_TOKEN}', + 'Accept': 'application/vnd.github+json' +} + +def get_commits_between(base, head): + url = f'https://api.github.com/repos/{OWNER}/{REPO}/compare/{base}...{head}' + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + return response.json()['commits'] + +def has_backwards_incompat_label(pr_number): + url = f'https://api.github.com/repos/{OWNER}/{REPO}/issues/{pr_number}' + response = requests.get(url, headers=HEADERS) + if response.status_code != 200: + raise Exception(f"Failed to fetch PR #{pr_number}") + labels = 
response.json().get('labels', []) + return any(label['name'] == 'backwards-incompat' for label in labels) + +def main(): + commits = get_commits_between(BASE, HEAD) + print(" # Commit SHA PR Number Has backwards-incompat Label PR Title") + print("--- ------------ ----------- ------------------------------ --------------------------------------------") + + for i, commit in enumerate(commits, 1): + sha = commit['sha'][:12] + message = commit['commit']['message'] + pr_match = re.search(r"\(#(\d+)\)", message) + + if not pr_match: + print(f"{i:<3} {sha} {'-':<11} {'-':<30} [NO PR MATCH]: {message.splitlines()[0]}") + continue + + pr_number = int(pr_match.group(1)) + try: + has_label = has_backwards_incompat_label(pr_number) + print(f"{i:<3} {sha} {pr_number:<11} {str(has_label):<30} {message.splitlines()[0]}") + except Exception as e: + print(f"{i:<3} {sha} {pr_number:<11} {'ERROR':<30} [ERROR FETCHING PR]: {e}") + +if __name__ == '__main__': + main() diff --git a/scripts/tests/checkpoint-sync-config-devnet.yaml b/scripts/tests/checkpoint-sync-config-devnet.yaml index f1b96dc9e5..2392011ed3 100644 --- a/scripts/tests/checkpoint-sync-config-devnet.yaml +++ b/scripts/tests/checkpoint-sync-config-devnet.yaml @@ -4,11 +4,15 @@ participants: cl_image: lighthouse:local el_type: geth el_image: ethpandaops/geth:master + cl_extra_params: + - --disable-backfill-rate-limiting supernode: true - cl_type: lighthouse cl_image: lighthouse:local el_type: geth el_image: ethpandaops/geth:master + cl_extra_params: + - --disable-backfill-rate-limiting supernode: false checkpoint_sync_enabled: true diff --git a/scripts/tests/checkpoint-sync.sh b/scripts/tests/checkpoint-sync.sh index a170d1e94d..df03da042e 100755 --- a/scripts/tests/checkpoint-sync.sh +++ b/scripts/tests/checkpoint-sync.sh @@ -15,7 +15,7 @@ CONFIG=${2:-$SCRIPT_DIR/checkpoint-sync-config-sepolia.yaml} # Interval for polling the /lighthouse/syncing endpoint for sync status POLL_INTERVAL_SECS=5 # Target number of slots to 
backfill to complete this test. -TARGET_BACKFILL_SLOTS=1024 +TARGET_BACKFILL_SLOTS=256 # Timeout for this test, if the node(s) fail to backfill `TARGET_BACKFILL_SLOTS` slots, fail the test. TIMEOUT_MINS=10 TIMEOUT_SECS=$((TIMEOUT_MINS * 60)) diff --git a/testing/ef_tests/Makefile b/testing/ef_tests/Makefile index 0c6fd50dfd..da8640d681 100644 --- a/testing/ef_tests/Makefile +++ b/testing/ef_tests/Makefile @@ -1,6 +1,6 @@ # To download/extract nightly tests, run: # CONSENSUS_SPECS_TEST_VERSION=nightly make -CONSENSUS_SPECS_TEST_VERSION ?= v1.6.0-alpha.5 +CONSENSUS_SPECS_TEST_VERSION ?= v1.6.0-alpha.6 REPO_NAME := consensus-spec-tests OUTPUT_DIR := ./$(REPO_NAME) diff --git a/testing/ef_tests/check_all_files_accessed.py b/testing/ef_tests/check_all_files_accessed.py index 821287ce25..41e3c4bff7 100755 --- a/testing/ef_tests/check_all_files_accessed.py +++ b/testing/ef_tests/check_all_files_accessed.py @@ -59,6 +59,9 @@ excluded_paths = [ "tests/.*/.*/epoch_processing/.*/post_epoch.ssz_snappy", # Ignore gloas tests for now "tests/.*/gloas/.*", + # Ignore KZG tests that target internal kzg library functions + "tests/.*/compute_verify_cell_kzg_proof_batch_challenge/.*", + "tests/.*/compute_challenge/.*", ] diff --git a/testing/ef_tests/src/cases/fork.rs b/testing/ef_tests/src/cases/fork.rs index 78d802c228..54efb9f9ce 100644 --- a/testing/ef_tests/src/cases/fork.rs +++ b/testing/ef_tests/src/cases/fork.rs @@ -60,7 +60,7 @@ impl Case for ForkTest { fn result(&self, _case_index: usize, fork_name: ForkName) -> Result<(), Error> { let mut result_state = self.pre.clone(); let mut expected = Some(self.post.clone()); - let spec = &E::default_spec(); + let spec = &fork_name.make_genesis_spec(E::default_spec()); let mut result = match fork_name { ForkName::Base => panic!("phase0 not supported"), diff --git a/testing/execution_engine_integration/Cargo.toml b/testing/execution_engine_integration/Cargo.toml index 07d8d98f1d..eef13cfc73 100644 --- 
a/testing/execution_engine_integration/Cargo.toml +++ b/testing/execution_engine_integration/Cargo.toml @@ -18,6 +18,7 @@ fork_choice = { workspace = true } futures = { workspace = true } hex = { workspace = true } logging = { workspace = true } +network_utils = { workspace = true } reqwest = { workspace = true } sensitive_url = { workspace = true } serde_json = { workspace = true } @@ -25,4 +26,3 @@ task_executor = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } types = { workspace = true } -unused_port = { workspace = true } diff --git a/testing/execution_engine_integration/src/execution_engine.rs b/testing/execution_engine_integration/src/execution_engine.rs index 61a50b0405..ed4ee4682f 100644 --- a/testing/execution_engine_integration/src/execution_engine.rs +++ b/testing/execution_engine_integration/src/execution_engine.rs @@ -1,10 +1,10 @@ use ethers_providers::{Http, Provider}; use execution_layer::DEFAULT_JWT_FILE; +use network_utils::unused_port::unused_tcp4_port; use sensitive_url::SensitiveUrl; use std::path::PathBuf; use std::process::Child; use tempfile::TempDir; -use unused_port::unused_tcp4_port; pub const KEYSTORE_PASSWORD: &str = "testpwd"; pub const ACCOUNT1: &str = "7b8C3a386C0eea54693fFB0DA17373ffC9228139"; diff --git a/testing/execution_engine_integration/src/geth.rs b/testing/execution_engine_integration/src/geth.rs index 91d6c7fd57..4b62e68e94 100644 --- a/testing/execution_engine_integration/src/geth.rs +++ b/testing/execution_engine_integration/src/geth.rs @@ -1,11 +1,11 @@ use crate::build_utils; use crate::execution_engine::GenericExecutionEngine; use crate::genesis_json::geth_genesis_json; +use network_utils::unused_port::unused_tcp4_port; use std::path::{Path, PathBuf}; use std::process::{Child, Command, Output}; use std::{env, fs}; use tempfile::TempDir; -use unused_port::unused_tcp4_port; const GETH_BRANCH: &str = "master"; const GETH_REPO_URL: &str = "https://github.com/ethereum/go-ethereum"; diff 
--git a/testing/execution_engine_integration/src/nethermind.rs b/testing/execution_engine_integration/src/nethermind.rs index c3b8651789..6a336161bd 100644 --- a/testing/execution_engine_integration/src/nethermind.rs +++ b/testing/execution_engine_integration/src/nethermind.rs @@ -1,12 +1,12 @@ use crate::build_utils; use crate::execution_engine::GenericExecutionEngine; use crate::genesis_json::nethermind_genesis_json; +use network_utils::unused_port::unused_tcp4_port; use std::env; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Child, Command, Output}; use tempfile::TempDir; -use unused_port::unused_tcp4_port; /// We've pinned the Nethermind version since our method of using the `master` branch to /// find the latest tag isn't working. It appears Nethermind don't always tag on `master`. diff --git a/testing/simulator/src/checks.rs b/testing/simulator/src/checks.rs index 1368c495cd..1240785121 100644 --- a/testing/simulator/src/checks.rs +++ b/testing/simulator/src/checks.rs @@ -424,7 +424,7 @@ pub async fn verify_full_blob_production_up_to( // the `verify_full_block_production_up_to` function. if block.is_some() { remote_node - .get_blobs::(BlockId::Slot(Slot::new(slot)), None, &E::default_spec()) + .get_blobs::(BlockId::Slot(Slot::new(slot)), None) .await .map_err(|e| format!("Failed to get blobs at slot {slot:?}: {e:?}"))? .ok_or_else(|| format!("No blobs available at slot {slot:?}"))?; diff --git a/validator_client/slashing_protection/src/slashing_database.rs b/validator_client/slashing_protection/src/slashing_database.rs index 9cecdaa8a5..7d8947a584 100644 --- a/validator_client/slashing_protection/src/slashing_database.rs +++ b/validator_client/slashing_protection/src/slashing_database.rs @@ -599,6 +599,40 @@ impl SlashingDatabase { Ok(safe) } + /// Check whether a block would be safe to sign if we were to sign it now. 
+ /// + /// The database is not modified, and therefore multiple threads reading the database might get + /// the same result. Therefore: + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF A BLOCK IS SAFE TO SIGN! + pub fn preliminary_check_block_proposal( + &self, + validator_pubkey: &PublicKeyBytes, + block_header: &BeaconBlockHeader, + domain: Hash256, + ) -> Result { + #[allow(clippy::disallowed_methods)] + self.preliminary_check_block_signing_root( + validator_pubkey, + block_header.slot, + block_header.signing_root(domain).into(), + ) + } + + /// As for `preliminary_check_block_proposal` but without requiring the whole `BeaconBlockHeader`. + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF A BLOCK IS SAFE TO SIGN! + pub fn preliminary_check_block_signing_root( + &self, + validator_pubkey: &PublicKeyBytes, + slot: Slot, + signing_root: SigningRoot, + ) -> Result { + let mut conn = self.conn_pool.get()?; + let txn = conn.transaction_with_behavior(TransactionBehavior::Exclusive)?; + self.check_block_proposal(&txn, validator_pubkey, slot, signing_root) + } + /// Check an attestation for slash safety, and if it is safe, record it in the database. /// /// The checking and inserting happen atomically and exclusively. We enforce exclusivity @@ -670,6 +704,49 @@ impl SlashingDatabase { Ok(safe) } + /// Check whether an attestation would be safe to sign if we were to sign it now. + /// + /// The database is not modified, and therefore multiple threads reading the database might get + /// the same result. Therefore: + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF AN ATTESTATION IS SAFE TO SIGN! 
+ pub fn preliminary_check_attestation( + &self, + validator_pubkey: &PublicKeyBytes, + attestation: &AttestationData, + domain: Hash256, + ) -> Result { + let attestation_signing_root = attestation.signing_root(domain).into(); + #[allow(clippy::disallowed_methods)] + self.preliminary_check_attestation_signing_root( + validator_pubkey, + attestation.source.epoch, + attestation.target.epoch, + attestation_signing_root, + ) + } + + /// As for `preliminary_check_attestation` but without requiring the whole `AttestationData`. + /// + /// DO NOT USE THIS FUNCTION TO DECIDE IF AN ATTESTATION IS SAFE TO SIGN! + pub fn preliminary_check_attestation_signing_root( + &self, + validator_pubkey: &PublicKeyBytes, + att_source_epoch: Epoch, + att_target_epoch: Epoch, + att_signing_root: SigningRoot, + ) -> Result { + let mut conn = self.conn_pool.get()?; + let txn = conn.transaction_with_behavior(TransactionBehavior::Exclusive)?; + self.check_attestation( + &txn, + validator_pubkey, + att_source_epoch, + att_target_epoch, + att_signing_root, + ) + } + /// Import slashing protection from another client in the interchange format. /// /// This function will atomically import the entire interchange, failing if *any* diff --git a/validator_client/src/lib.rs b/validator_client/src/lib.rs index 5b396ccaf5..71bdde10b0 100644 --- a/validator_client/src/lib.rs +++ b/validator_client/src/lib.rs @@ -2,6 +2,7 @@ pub mod cli; pub mod config; use crate::cli::ValidatorClient; +use crate::duties_service::SelectionProofConfig; pub use config::Config; use initialized_validators::InitializedValidators; use metrics::set_gauge; @@ -55,6 +56,22 @@ const WAITING_FOR_GENESIS_POLL_TIME: Duration = Duration::from_secs(12); const DOPPELGANGER_SERVICE_NAME: &str = "doppelganger"; +/// Compute attestation selection proofs this many slots before they are required. +/// +/// At start-up selection proofs will be computed with less lookahead out of necessity. 
+const SELECTION_PROOF_SLOT_LOOKAHEAD: u64 = 8; + +/// The attestation selection proof lookahead for those running with the --distributed flag. +const SELECTION_PROOF_SLOT_LOOKAHEAD_DVT: u64 = 1; + +/// Fraction of a slot at which attestation selection proof signing should happen (2 means half way). +const SELECTION_PROOF_SCHEDULE_DENOM: u32 = 2; + +/// Number of epochs in advance to compute sync selection proofs when not in `distributed` mode. +pub const AGGREGATION_PRE_COMPUTE_EPOCHS: u64 = 2; +/// Number of slots in advance to compute sync selection proofs when in `distributed` mode. +pub const AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED: u64 = 1; + type ValidatorStore = LighthouseValidatorStore; #[derive(Clone)] @@ -407,6 +424,41 @@ impl ProductionValidatorClient { validator_store.prune_slashing_protection_db(slot.epoch(E::slots_per_epoch()), true); } + // Define a config to be pass to duties_service. + // The defined config here defaults to using selections_endpoint and parallel_sign (i.e., distributed mode) + // Other DVT applications, e.g., Anchor can pass in different configs to suit different needs. 
+ let attestation_selection_proof_config = if config.distributed { + SelectionProofConfig { + lookahead_slot: SELECTION_PROOF_SLOT_LOOKAHEAD_DVT, + computation_offset: slot_clock.slot_duration() / SELECTION_PROOF_SCHEDULE_DENOM, + selections_endpoint: true, + parallel_sign: true, + } + } else { + SelectionProofConfig { + lookahead_slot: SELECTION_PROOF_SLOT_LOOKAHEAD, + computation_offset: slot_clock.slot_duration() / SELECTION_PROOF_SCHEDULE_DENOM, + selections_endpoint: false, + parallel_sign: false, + } + }; + + let sync_selection_proof_config = if config.distributed { + SelectionProofConfig { + lookahead_slot: AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED, + computation_offset: Duration::default(), + selections_endpoint: true, + parallel_sign: true, + } + } else { + SelectionProofConfig { + lookahead_slot: E::slots_per_epoch() * AGGREGATION_PRE_COMPUTE_EPOCHS, + computation_offset: Duration::default(), + selections_endpoint: false, + parallel_sign: false, + } + }; + let duties_service = Arc::new( DutiesServiceBuilder::new() .slot_clock(slot_clock.clone()) @@ -415,7 +467,8 @@ impl ProductionValidatorClient { .spec(context.eth2_config.spec.clone()) .executor(context.executor.clone()) .enable_high_validator_count_metrics(config.enable_high_validator_count_metrics) - .distributed(config.distributed) + .attestation_selection_proof_config(attestation_selection_proof_config) + .sync_selection_proof_config(sync_selection_proof_config) .disable_attesting(config.disable_attesting) .build()?, ); diff --git a/validator_client/validator_services/src/block_service.rs b/validator_client/validator_services/src/block_service.rs index 834df67e8a..c111b1f22e 100644 --- a/validator_client/validator_services/src/block_service.rs +++ b/validator_client/validator_services/src/block_service.rs @@ -497,6 +497,7 @@ impl BlockService { beacon_node .post_beacon_blocks_v2_ssz(signed_block, None) .await + .map(|_| ()) .or_else(|e| { handle_block_post_error(e, 
signed_block.signed_block().message().slot()) })? @@ -506,10 +507,12 @@ impl BlockService { &validator_metrics::BLOCK_SERVICE_TIMES, &[validator_metrics::BLINDED_BEACON_BLOCK_HTTP_POST], ); + beacon_node .post_beacon_blinded_blocks_v2_ssz(signed_block, None) .await - .or_else(|e| handle_block_post_error(e, signed_block.message().slot()))? + .map(|_| ()) + .or_else(|e| handle_block_post_error(e, signed_block.message().slot()))?; } } Ok::<_, BlockError>(()) diff --git a/validator_client/validator_services/src/duties_service.rs b/validator_client/validator_services/src/duties_service.rs index 009537bc43..7569d3946a 100644 --- a/validator_client/validator_services/src/duties_service.rs +++ b/validator_client/validator_services/src/duties_service.rs @@ -11,10 +11,14 @@ use crate::sync::SyncDutiesMap; use crate::sync::poll_sync_committee_duties; use beacon_node_fallback::{ApiTopic, BeaconNodeFallback}; use eth2::types::{ - AttesterData, BeaconCommitteeSubscription, DutiesResponse, ProposerData, StateId, ValidatorId, + AttesterData, BeaconCommitteeSelection, BeaconCommitteeSubscription, DutiesResponse, + ProposerData, StateId, ValidatorId, }; -use futures::{StreamExt, stream}; -use parking_lot::RwLock; +use futures::{ + StreamExt, + stream::{self, FuturesUnordered}, +}; +use parking_lot::{RwLock, RwLockWriteGuard}; use safe_arith::{ArithError, SafeArith}; use slot_clock::SlotClock; use std::cmp::min; @@ -32,17 +36,6 @@ use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, Validato /// Only retain `HISTORICAL_DUTIES_EPOCHS` duties prior to the current epoch. const HISTORICAL_DUTIES_EPOCHS: u64 = 2; -/// Compute attestation selection proofs this many slots before they are required. -/// -/// At start-up selection proofs will be computed with less lookahead out of necessity. -const SELECTION_PROOF_SLOT_LOOKAHEAD: u64 = 8; - -/// The attestation selection proof lookahead for those running with the --distributed flag. 
-const SELECTION_PROOF_SLOT_LOOKAHEAD_DVT: u64 = 1; - -/// Fraction of a slot at which selection proof signing should happen (2 means half way). -const SELECTION_PROOF_SCHEDULE_DENOM: u32 = 2; - /// Minimum number of validators for which we auto-enable per-validator metrics. /// For validators greater than this value, we need to manually set the `enable-per-validator-metrics` /// flag in the cli to enable collection of per validator metrics. @@ -121,18 +114,97 @@ pub struct SubscriptionSlots { duty_slot: Slot, } +#[derive(Copy, Clone, Debug)] +pub struct SelectionProofConfig { + pub lookahead_slot: u64, + /// The seconds to compute the selection proof before a slot. + pub computation_offset: Duration, + /// Whether to call the selections endpoint, true for DVT with middleware. + pub selections_endpoint: bool, + /// Whether to sign the selection proof in parallel, true in distributed mode. + pub parallel_sign: bool, +} + +/// The default config for selection proofs covers the non-DVT case. +impl Default for SelectionProofConfig { + fn default() -> Self { + Self { + lookahead_slot: 0, + computation_offset: Duration::default(), + selections_endpoint: false, + parallel_sign: false, + } + } +} + /// Create a selection proof for `duty`. /// /// Return `Ok(None)` if the attesting validator is not an aggregator. 
-async fn make_selection_proof( +async fn make_selection_proof( duty: &AttesterData, validator_store: &S, spec: &ChainSpec, + beacon_nodes: &Arc>, + config: &SelectionProofConfig, ) -> Result, Error> { - let selection_proof = validator_store - .produce_selection_proof(duty.pubkey, duty.slot) - .await - .map_err(Error::FailedToProduceSelectionProof)?; + let selection_proof = if config.selections_endpoint { + let beacon_committee_selection = BeaconCommitteeSelection { + validator_index: duty.validator_index, + slot: duty.slot, + // This is partial selection proof + selection_proof: validator_store + .produce_selection_proof(duty.pubkey, duty.slot) + .await + .map_err(Error::FailedToProduceSelectionProof)? + .into(), + }; + // Call the endpoint /eth/v1/validator/beacon_committee_selections + // by sending the BeaconCommitteeSelection that contains partial selection proof + // The middleware should return BeaconCommitteeSelection that contains full selection proof + let middleware_response = beacon_nodes + .first_success(|beacon_node| { + let selection_data = beacon_committee_selection.clone(); + debug!( + "validator_index" = duty.validator_index, + "slot" = %duty.slot, + "partial selection proof" = ?beacon_committee_selection.selection_proof, + "Sending selection to middleware" + ); + async move { + beacon_node + .post_validator_beacon_committee_selections(&[selection_data]) + .await + } + }) + .await; + + let response_data = middleware_response + .map_err(|e| { + Error::FailedToProduceSelectionProof(ValidatorStoreError::Middleware(e.to_string())) + })? 
+ .data + .pop() + .ok_or_else(|| { + Error::FailedToProduceSelectionProof(ValidatorStoreError::Middleware(format!( + "attestation selection proof - empty response for validator {}", + duty.validator_index + ))) + })?; + + debug!( + "validator_index" = response_data.validator_index, + "slot" = %response_data.slot, + // The selection proof from middleware response will be a full selection proof + "full selection proof" = ?response_data.selection_proof, + "Received selection from middleware" + ); + SelectionProof::from(response_data.selection_proof) + } else { + validator_store + .produce_selection_proof(duty.pubkey, duty.slot) + .await + .map_err(Error::FailedToProduceSelectionProof)? + }; selection_proof .is_aggregator(duty.committee_length as usize, spec) @@ -217,8 +289,10 @@ pub struct DutiesServiceBuilder { spec: Option>, //// Whether we permit large validator counts in the metrics. enable_high_validator_count_metrics: bool, - /// If this validator is running in distributed mode. - distributed: bool, + /// Create attestation selection proof config + attestation_selection_proof_config: SelectionProofConfig, + /// Create sync selection proof config + sync_selection_proof_config: SelectionProofConfig, disable_attesting: bool, } @@ -237,7 +311,8 @@ impl DutiesServiceBuilder { executor: None, spec: None, enable_high_validator_count_metrics: false, - distributed: false, + attestation_selection_proof_config: SelectionProofConfig::default(), + sync_selection_proof_config: SelectionProofConfig::default(), disable_attesting: false, } } @@ -275,8 +350,19 @@ impl DutiesServiceBuilder { self } - pub fn distributed(mut self, distributed: bool) -> Self { - self.distributed = distributed; + pub fn attestation_selection_proof_config( + mut self, + attestation_selection_proof_config: SelectionProofConfig, + ) -> Self { + self.attestation_selection_proof_config = attestation_selection_proof_config; + self + } + + pub fn sync_selection_proof_config( + mut self, + 
sync_selection_proof_config: SelectionProofConfig, + ) -> Self { + self.sync_selection_proof_config = sync_selection_proof_config; self } @@ -289,7 +375,7 @@ impl DutiesServiceBuilder { Ok(DutiesService { attesters: Default::default(), proposers: Default::default(), - sync_duties: SyncDutiesMap::new(self.distributed), + sync_duties: SyncDutiesMap::new(self.sync_selection_proof_config), validator_store: self .validator_store .ok_or("Cannot build DutiesService without validator_store")?, @@ -305,7 +391,7 @@ impl DutiesServiceBuilder { .ok_or("Cannot build DutiesService without executor")?, spec: self.spec.ok_or("Cannot build DutiesService without spec")?, enable_high_validator_count_metrics: self.enable_high_validator_count_metrics, - distributed: self.distributed, + selection_proof_config: self.attestation_selection_proof_config, disable_attesting: self.disable_attesting, }) } @@ -332,10 +418,10 @@ pub struct DutiesService { pub executor: TaskExecutor, /// The current chain spec. pub spec: Arc, - //// Whether we permit large validator counts in the metrics. + /// Whether we permit large validator counts in the metrics. pub enable_high_validator_count_metrics: bool, - /// If this validator is running in distributed mode. - pub distributed: bool, + /// Pass the config for distributed or non-distributed mode. + pub selection_proof_config: SelectionProofConfig, pub disable_attesting: bool, } @@ -1119,6 +1205,75 @@ async fn post_validator_duties_attester( + attesters: &mut RwLockWriteGuard, + result: Result<(AttesterData, Option), Error>, + dependent_root: Hash256, + current_slot: Slot, +) -> bool { + let (duty, selection_proof) = match result { + Ok(duty_and_proof) => duty_and_proof, + Err(Error::FailedToProduceSelectionProof(ValidatorStoreError::UnknownPubkey(pubkey))) => { + // A pubkey can be missing when a validator was recently removed via the API. 
+ warn!( + info = "A validator may have recently been removed from this VC", + ?pubkey, + "Missing pubkey for duty and proof" + ); + // Do not abort the entire batch for a single failure. + // return true means continue processing duties. + return true; + } + Err(e) => { + error!( + error = ?e, + msg = "may impair attestation duties", + "Failed to produce duty and proof" + ); + return true; + } + }; + + let attester_map = attesters.entry(duty.pubkey).or_default(); + let epoch = duty.slot.epoch(S::E::slots_per_epoch()); + match attester_map.entry(epoch) { + hash_map::Entry::Occupied(mut entry) => { + // No need to update duties for which no proof was computed. + let Some(selection_proof) = selection_proof else { + return true; + }; + + let (existing_dependent_root, existing_duty) = entry.get_mut(); + + if *existing_dependent_root == dependent_root { + // Replace existing proof. + existing_duty.selection_proof = Some(selection_proof); + true + } else { + // Our selection proofs are no longer relevant due to a reorg, abandon this entire background process. + debug!( + reason = "re-org", + "Stopping selection proof background task" + ); + false + } + } + + hash_map::Entry::Vacant(entry) => { + // This probably shouldn't happen, but we have enough info to fill in the entry so we may as well. + let subscription_slots = SubscriptionSlots::new(duty.slot, current_slot); + let duty_and_proof = DutyAndProof { + duty, + selection_proof, + subscription_slots, + }; + entry.insert((dependent_root, duty_and_proof)); + true + } + } +} + /// Compute the attestation selection proofs for the `duties` and add them to the `attesters` map. /// /// Duties are computed in batches each slot. 
If a re-org is detected then the process will @@ -1138,26 +1293,33 @@ async fn fill_in_selection_proofs(); @@ -1170,87 +1332,69 @@ async fn fill_in_selection_proofs>() - .await; + // In distributed case, we want to send all partial selection proofs to the middleware to determine aggregation duties, + // as the middleware will need to have a threshold of partial selection proofs to be able to return the full selection proof + // Thus, sign selection proofs in parallel in distributed case; Otherwise, sign them serially in non-distributed (normal) case + if duties_service.selection_proof_config.parallel_sign { + let mut duty_and_proof_results = relevant_duties + .into_values() + .flatten() + .map(|duty| async { + let opt_selection_proof = make_selection_proof( + &duty, + duties_service.validator_store.as_ref(), + &duties_service.spec, + &duties_service.beacon_nodes, + &duties_service.selection_proof_config, + ) + .await?; + Ok((duty, opt_selection_proof)) + }) + .collect::>(); - // Add to attesters store. - let mut attesters = duties_service.attesters.write(); - for result in duty_and_proof_results { - let (duty, selection_proof) = match result { - Ok(duty_and_proof) => duty_and_proof, - Err(Error::FailedToProduceSelectionProof( - ValidatorStoreError::UnknownPubkey(pubkey), - )) => { - // A pubkey can be missing when a validator was recently - // removed via the API. - warn!( - info = "a validator may have recently been removed from this VC", - ?pubkey, - "Missing pubkey for duty and proof" - ); - // Do not abort the entire batch for a single failure. - continue; - } - Err(e) => { - error!( - error = ?e, - msg = "may impair attestation duties", - "Failed to produce duty and proof" - ); - // Do not abort the entire batch for a single failure. 
- continue; - } - }; - - let attester_map = attesters.entry(duty.pubkey).or_default(); - let epoch = duty.slot.epoch(S::E::slots_per_epoch()); - match attester_map.entry(epoch) { - hash_map::Entry::Occupied(mut entry) => { - // No need to update duties for which no proof was computed. - let Some(selection_proof) = selection_proof else { - continue; - }; - - let (existing_dependent_root, existing_duty) = entry.get_mut(); - - if *existing_dependent_root == dependent_root { - // Replace existing proof. - existing_duty.selection_proof = Some(selection_proof); - } else { - // Our selection proofs are no longer relevant due to a reorg, abandon - // this entire background process. - debug!( - reason = "re-org", - "Stopping selection proof background task" - ); - return; - } - } - hash_map::Entry::Vacant(entry) => { - // This probably shouldn't happen, but we have enough info to fill in the - // entry so we may as well. - let subscription_slots = SubscriptionSlots::new(duty.slot, current_slot); - let duty_and_proof = DutyAndProof { - duty, - selection_proof, - subscription_slots, - }; - entry.insert((dependent_root, duty_and_proof)); + while let Some(result) = duty_and_proof_results.next().await { + let mut attesters = duties_service.attesters.write(); + // if process_duty_and_proof returns false, exit the loop + if !process_duty_and_proof::( + &mut attesters, + result, + dependent_root, + current_slot, + ) { + return; } } - } - drop(attesters); + } else { + // In normal (non-distributed case), sign selection proofs serially + let duty_and_proof_results = stream::iter(relevant_duties.into_values().flatten()) + .then(|duty| async { + let opt_selection_proof = make_selection_proof( + &duty, + duties_service.validator_store.as_ref(), + &duties_service.spec, + &duties_service.beacon_nodes, + &duties_service.selection_proof_config, + ) + .await?; + Ok((duty, opt_selection_proof)) + }) + .collect::>() + .await; + + // Add to attesters store. 
+ let mut attesters = duties_service.attesters.write(); + for result in duty_and_proof_results { + if !process_duty_and_proof::( + &mut attesters, + result, + dependent_root, + current_slot, + ) { + return; + } + } + drop(attesters); + }; let time_taken_ms = Duration::from_secs_f64(timer.map_or(0.0, |t| t.stop_and_record())).as_millis(); diff --git a/validator_client/validator_services/src/sync.rs b/validator_client/validator_services/src/sync.rs index 328308d514..77032ed15b 100644 --- a/validator_client/validator_services/src/sync.rs +++ b/validator_client/validator_services/src/sync.rs @@ -1,19 +1,16 @@ -use crate::duties_service::{DutiesService, Error}; +use crate::duties_service::{DutiesService, Error, SelectionProofConfig}; +use eth2::types::SyncCommitteeSelection; use futures::future::join_all; +use futures::stream::{FuturesUnordered, StreamExt}; use logging::crit; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard, RwLockWriteGuard}; use slot_clock::SlotClock; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use types::{ChainSpec, EthSpec, PublicKeyBytes, Slot, SyncDuty, SyncSelectionProof, SyncSubnetId}; use validator_store::{DoppelgangerStatus, Error as ValidatorStoreError, ValidatorStore}; -/// Number of epochs in advance to compute selection proofs when not in `distributed` mode. -pub const AGGREGATION_PRE_COMPUTE_EPOCHS: u64 = 2; -/// Number of slots in advance to compute selection proofs when in `distributed` mode. -pub const AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED: u64 = 1; - /// Top-level data-structure containing sync duty information. /// /// This data is structured as a series of nested `HashMap`s wrapped in `RwLock`s. Fine-grained @@ -30,7 +27,7 @@ pub struct SyncDutiesMap { /// Map from sync committee period to duties for members of that sync committee. 
committees: RwLock>, /// Whether we are in `distributed` mode and using reduced lookahead for aggregate pre-compute. - distributed: bool, + pub selection_proof_config: SelectionProofConfig, } /// Duties for a single sync committee period. @@ -79,10 +76,10 @@ pub struct SlotDuties { } impl SyncDutiesMap { - pub fn new(distributed: bool) -> Self { + pub fn new(selection_proof_config: SelectionProofConfig) -> Self { Self { committees: RwLock::new(HashMap::new()), - distributed, + selection_proof_config, } } @@ -99,15 +96,6 @@ impl SyncDutiesMap { }) } - /// Number of slots in advance to compute selection proofs - fn aggregation_pre_compute_slots(&self) -> u64 { - if self.distributed { - AGGREGATION_PRE_COMPUTE_SLOTS_DISTRIBUTED - } else { - E::slots_per_epoch() * AGGREGATION_PRE_COMPUTE_EPOCHS - } - } - /// Prepare for pre-computation of selection proofs for `committee_period`. /// /// Return the slot up to which proofs should be pre-computed, as well as a vec of @@ -123,7 +111,7 @@ impl SyncDutiesMap { current_slot, first_slot_of_period::(committee_period, spec), ); - let pre_compute_lookahead_slots = self.aggregation_pre_compute_slots::(); + let pre_compute_lookahead_slots = self.selection_proof_config.lookahead_slot; let pre_compute_slot = std::cmp::min( current_slot + pre_compute_lookahead_slots, last_slot_of_period::(committee_period, spec), @@ -377,7 +365,7 @@ pub async fn poll_sync_committee_duties(); + let aggregate_pre_compute_lookahead_slots = sync_duties.selection_proof_config.lookahead_slot; if (current_slot + aggregate_pre_compute_lookahead_slots) .epoch(S::E::slots_per_epoch()) .sync_committee_period(spec)? 
@@ -498,6 +486,114 @@ pub async fn poll_sync_committee_duties_for_period( + duties_service: &Arc>, + duty: &SyncDuty, + proof_slot: Slot, + subnet_id: SyncSubnetId, +) -> Option { + let sync_selection_proof = duties_service + .validator_store + .produce_sync_selection_proof(&duty.pubkey, proof_slot, subnet_id) + .await; + + let selection_proof = match sync_selection_proof { + Ok(proof) => proof, + Err(ValidatorStoreError::UnknownPubkey(pubkey)) => { + // A pubkey can be missing when a validator was recently removed via the API + debug!( + ?pubkey, + "slot" = %proof_slot, + "Missing pubkey for sync selection proof"); + return None; + } + Err(e) => { + warn!( + "error" = ?e, + "pubkey" = ?duty.pubkey, + "slot" = %proof_slot, + "Unable to sign selection proof" + ); + return None; + } + }; + + // In DVT with middleware, when we want to call the selections endpoint + if duties_service + .sync_duties + .selection_proof_config + .selections_endpoint + { + debug!( + "validator_index" = duty.validator_index, + "slot" = %proof_slot, + "subcommittee_index" = *subnet_id, + // This is partial selection proof + "partial selection proof" = ?selection_proof, + "Sending sync selection to middleware" + ); + + let sync_committee_selection = SyncCommitteeSelection { + validator_index: duty.validator_index, + slot: proof_slot, + subcommittee_index: *subnet_id, + selection_proof: selection_proof.clone().into(), + }; + + // Call the endpoint /eth/v1/validator/sync_committee_selections + // by sending the SyncCommitteeSelection that contains partial sync selection proof + // The middleware should return SyncCommitteeSelection that contains full sync selection proof + let middleware_response = duties_service + .beacon_nodes + .first_success(|beacon_node| { + let selection_data = sync_committee_selection.clone(); + async move { + beacon_node + .post_validator_sync_committee_selections(&[selection_data]) + .await + } + }) + .await; + + match middleware_response { + Ok(mut response) => { + 
let Some(response_data) = response.data.pop() else { + error!( + validator_index = duty.validator_index, + slot = %proof_slot, + "Empty response from sync selection middleware", + ); + return None; + }; + debug!( + "validator_index" = response_data.validator_index, + "slot" = %response_data.slot, + "subcommittee_index" = response_data.subcommittee_index, + // The selection proof from middleware response will be a full selection proof + "full selection proof" = ?response_data.selection_proof, + "Received sync selection from middleware" + ); + + // Convert the response to a SyncSelectionProof + let full_selection_proof = SyncSelectionProof::from(response_data.selection_proof); + Some(full_selection_proof) + } + Err(e) => { + error!( + "error" = %e, + %proof_slot, + "Failed to get sync selection proofs from middleware" + ); + None + } + } + } else { + // In non-distributed mode, the selection_proof is already a full selection proof + Some(selection_proof) + } +} + pub async fn fill_in_aggregation_proofs( duties_service: Arc>, pre_compute_duties: &[(Slot, SyncDuty)], @@ -505,131 +601,193 @@ pub async fn fill_in_aggregation_proofs() { - Ok(subnet_ids) => subnet_ids, - Err(e) => { - crit!( - error = ?e, - "Arithmetic error computing subnet IDs" - ); - continue; - } - }; - - // Create futures to produce proofs. - let duties_service_ref = &duties_service; - let futures = subnet_ids.iter().map(|subnet_id| async move { - // Construct proof for prior slot. - let proof_slot = slot - 1; - - let proof = match duties_service_ref - .validator_store - .produce_sync_selection_proof(&duty.pubkey, proof_slot, *subnet_id) - .await - { - Ok(proof) => proof, - Err(ValidatorStoreError::UnknownPubkey(pubkey)) => { - // A pubkey can be missing when a validator was recently - // removed via the API. 
- debug!( - ?pubkey, - pubkey = ?duty.pubkey, - slot = %proof_slot, - "Missing pubkey for sync selection proof" - ); - return None; - } + for (_, duty) in pre_compute_duties { + let subnet_ids = match duty.subnet_ids::() { + Ok(subnet_ids) => subnet_ids, Err(e) => { - warn!( - error = ?e, - pubkey = ?duty.pubkey, - slot = %proof_slot, - "Unable to sign selection proof" + crit!( + "error" = ?e, + "Arithmetic error computing subnet IDs" ); - return None; + continue; } }; + // Construct proof for prior slot. + let proof_slot = slot - 1; + + // Calling the make_sync_selection_proof will return a full selection proof + for &subnet_id in &subnet_ids { + let duties_service = duties_service.clone(); + futures_unordered.push(async move { + let result = + make_sync_selection_proof(&duties_service, duty, proof_slot, subnet_id) + .await; + + result.map(|proof| (duty.validator_index, proof_slot, subnet_id, proof)) + }); + } + } + + while let Some(result) = futures_unordered.next().await { + let Some((validator_index, proof_slot, subnet_id, proof)) = result else { + continue; + }; + let sync_map = duties_service.sync_duties.committees.read(); + let Some(committee_duties) = sync_map.get(&sync_committee_period) else { + debug!("period" = sync_committee_period, "Missing sync duties"); + continue; + }; + + let validators = committee_duties.validators.read(); + + // Check if the validator is an aggregator match proof.is_aggregator::() { Ok(true) => { - debug!( - validator_index = duty.validator_index, - slot = %proof_slot, - %subnet_id, - "Validator is sync aggregator" - ); - Some(((proof_slot, *subnet_id), proof)) + if let Some(Some(duty)) = validators.get(&validator_index) { + debug!( + validator_index, + "slot" = %proof_slot, + "subcommittee_index" = *subnet_id, + // log full selection proof for debugging + "full selection proof" = ?proof, + "Validator is sync aggregator" + ); + + // Store the proof + duty.aggregation_duties + .proofs + .write() + .insert((proof_slot, subnet_id), 
proof); + } } - Ok(false) => None, + Ok(false) => {} // Not an aggregator Err(e) => { warn!( - pubkey = ?duty.pubkey, - slot = %proof_slot, - error = ?e, + validator_index, + %slot, + "error" = ?e, "Error determining is_aggregator" ); - None } } - }); + } + } else { + // For non-distributed mode + debug!( + period = sync_committee_period, + %current_slot, + %pre_compute_slot, + "Calculating sync selection proofs" + ); - // Execute all the futures in parallel, collecting any successful results. - let proofs = join_all(futures) - .await - .into_iter() - .flatten() - .collect::>(); + let mut validator_proofs = vec![]; + for (validator_start_slot, duty) in pre_compute_duties { + // Proofs are already known at this slot for this validator. + if slot < *validator_start_slot { + continue; + } - validator_proofs.push((duty.validator_index, proofs)); - } + let subnet_ids = match duty.subnet_ids::() { + Ok(subnet_ids) => subnet_ids, + Err(e) => { + crit!( + error = ?e, + "Arithmetic error computing subnet IDs" + ); + continue; + } + }; - // Add to global storage (we add regularly so the proofs can be used ASAP). - let sync_map = duties_service.sync_duties.committees.read(); - let Some(committee_duties) = sync_map.get(&sync_committee_period) else { - debug!(period = sync_committee_period, "Missing sync duties"); - continue; - }; - let validators = committee_duties.validators.read(); - let num_validators_updated = validator_proofs.len(); + // Create futures to produce proofs. + let duties_service_ref = &duties_service; + let futures = subnet_ids.iter().map(|subnet_id| async move { + // Construct proof for prior slot. 
+ let proof_slot = slot - 1; - for (validator_index, proofs) in validator_proofs { - if let Some(Some(duty)) = validators.get(&validator_index) { - duty.aggregation_duties.proofs.write().extend(proofs); - } else { + let proof = + make_sync_selection_proof(duties_service_ref, duty, proof_slot, *subnet_id) + .await; + + match proof { + Some(proof) => match proof.is_aggregator::() { + Ok(true) => { + debug!( + validator_index = duty.validator_index, + slot = %proof_slot, + %subnet_id, + "Validator is sync aggregator" + ); + Some(((proof_slot, *subnet_id), proof)) + } + Ok(false) => None, + Err(e) => { + warn!( + pubkey = ?duty.pubkey, + slot = %proof_slot, + error = ?e, + "Error determining is_aggregator" + ); + None + } + }, + + None => None, + } + }); + + // Execute all the futures in parallel, collecting any successful results. + let proofs = join_all(futures) + .await + .into_iter() + .flatten() + .collect::>(); + + validator_proofs.push((duty.validator_index, proofs)); + } + + // Add to global storage (we add regularly so the proofs can be used ASAP). 
+ let sync_map = duties_service.sync_duties.committees.read(); + let Some(committee_duties) = sync_map.get(&sync_committee_period) else { + debug!(period = sync_committee_period, "Missing sync duties"); + continue; + }; + let validators = committee_duties.validators.read(); + let num_validators_updated = validator_proofs.len(); + + for (validator_index, proofs) in validator_proofs { + if let Some(Some(duty)) = validators.get(&validator_index) { + duty.aggregation_duties.proofs.write().extend(proofs); + } else { + debug!( + validator_index, + period = sync_committee_period, + "Missing sync duty to update" + ); + } + } + + if num_validators_updated > 0 { debug!( - validator_index, - period = sync_committee_period, - "Missing sync duty to update" + %slot, + updated_validators = num_validators_updated, + "Finished computing sync selection proofs" ); } } - - if num_validators_updated > 0 { - debug!( - %slot, - updated_validators = num_validators_updated, - "Finished computing sync selection proofs" - ); - } } } diff --git a/validator_client/validator_store/src/lib.rs b/validator_client/validator_store/src/lib.rs index c3b551c249..6fd2e27064 100644 --- a/validator_client/validator_store/src/lib.rs +++ b/validator_client/validator_store/src/lib.rs @@ -21,6 +21,7 @@ pub enum Error { GreaterThanCurrentEpoch { epoch: Epoch, current_epoch: Epoch }, UnableToSignAttestation(AttestationError), SpecificError(T), + Middleware(String), } impl From for Error { diff --git a/wordlist.txt b/wordlist.txt index 0391af78cb..58c4cf6db1 100644 --- a/wordlist.txt +++ b/wordlist.txt @@ -39,6 +39,7 @@ EthStaker Exercism Extractable FFG +Fulu Geth GiB Gitcoin @@ -187,6 +188,7 @@ namespace natively nd ness +nextest nginx nitty oom