diff --git a/.github/workflows/local-testnet.yml b/.github/workflows/local-testnet.yml
index 7bd8b40d76..5cffb4e2fd 100644
--- a/.github/workflows/local-testnet.yml
+++ b/.github/workflows/local-testnet.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Build Docker image
         run: |
-          docker build --build-arg FEATURES=portable -t lighthouse:local .
+          docker build --build-arg FEATURES=portable,spec-minimal -t lighthouse:local .
           docker save lighthouse:local -o lighthouse-docker.tar
 
       - name: Upload Docker image artifact
@@ -213,6 +213,49 @@ jobs:
             scripts/local_testnet/logs
           retention-days: 3
 
+  # Test syncing from genesis on a local testnet. Aims to cover forward syncing both short and long distances.
+  genesis-sync-test:
+    name: genesis-sync-test-${{ matrix.fork }}-${{ matrix.offline_secs }}s
+    runs-on: ubuntu-latest
+    needs: dockerfile-ubuntu
+    if: contains(github.event.pull_request.labels.*.name, 'syncing')
+    strategy:
+      matrix:
+        fork: [electra, fulu]
+        offline_secs: [120, 300]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Kurtosis
+        run: |
+          echo "deb [trusted=yes] https://apt.fury.io/kurtosis-tech/ /" | sudo tee /etc/apt/sources.list.d/kurtosis.list
+          sudo apt update
+          sudo apt install -y kurtosis-cli
+          kurtosis analytics disable
+
+      - name: Download Docker image artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: lighthouse-docker
+          path: .
+
+      - name: Load Docker image
+        run: docker load -i lighthouse-docker.tar
+
+      - name: Run the genesis sync test script
+        run: |
+          ./genesis-sync.sh "sync-${{ matrix.fork }}-${{ matrix.offline_secs }}s" "genesis-sync-config-${{ matrix.fork }}.yaml" "${{ matrix.fork }}" "${{ matrix.offline_secs }}"
+        working-directory: scripts/tests
+
+      - name: Upload logs artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: logs-genesis-sync-${{ matrix.fork }}-${{ matrix.offline_secs }}s
+          path: |
+            scripts/local_testnet/logs
+          retention-days: 3
+
   # This job succeeds ONLY IF all others succeed. It is used by the merge queue to determine whether
   # a PR is safe to merge. New jobs should be added here.
   local-testnet-success:
@@ -228,5 +271,5 @@ jobs:
       - uses: actions/checkout@v4
       - name: Check that success job is dependent on all others
         run: |
-          exclude_jobs='checkpoint-sync-test'
+          exclude_jobs='checkpoint-sync-test|genesis-sync-test'
           ./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success "$exclude_jobs"
diff --git a/scripts/local_testnet/start_local_testnet.sh b/scripts/local_testnet/start_local_testnet.sh
index 8e8859ca0e..442e6fd98d 100755
--- a/scripts/local_testnet/start_local_testnet.sh
+++ b/scripts/local_testnet/start_local_testnet.sh
@@ -81,7 +81,7 @@ fi
 if [ "$BUILD_IMAGE" = true ]; then
     echo "Building Lighthouse Docker image."
     ROOT_DIR="$SCRIPT_DIR/../.."
-    docker build --build-arg FEATURES=portable -f $ROOT_DIR/Dockerfile -t $LH_IMAGE_NAME $ROOT_DIR
+    docker build --build-arg FEATURES=portable,spec-minimal -f $ROOT_DIR/Dockerfile -t $LH_IMAGE_NAME $ROOT_DIR
 else
     echo "Not rebuilding Lighthouse Docker image."
 fi
diff --git a/scripts/tests/genesis-sync-config-electra.yaml b/scripts/tests/genesis-sync-config-electra.yaml
new file mode 100644
index 0000000000..153f754c94
--- /dev/null
+++ b/scripts/tests/genesis-sync-config-electra.yaml
@@ -0,0 +1,22 @@
+# Kurtosis config file for testing sync on a local devnet.
+participants:
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    count: 2
+  # nodes without validators, used for testing sync.
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    supernode: true  # no supernode in Electra; kept for future-proofing
+    validator_count: 0
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    supernode: false
+    validator_count: 0
+network_params:
+  seconds_per_slot: 6
+  electra_fork_epoch: 0
+  preset: "minimal"
+additional_services:
+  - tx_fuzz
+  - spamoor
+global_log_level: debug
diff --git a/scripts/tests/genesis-sync-config-fulu.yaml b/scripts/tests/genesis-sync-config-fulu.yaml
new file mode 100644
index 0000000000..ccdc09c0d3
--- /dev/null
+++ b/scripts/tests/genesis-sync-config-fulu.yaml
@@ -0,0 +1,22 @@
+# Kurtosis config file for testing sync on a local devnet.
+participants:
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    count: 2
+  # nodes without validators, used for testing sync.
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    supernode: true
+    validator_count: 0
+  - cl_type: lighthouse
+    cl_image: lighthouse:local
+    supernode: false
+    validator_count: 0
+network_params:
+  seconds_per_slot: 6
+  fulu_fork_epoch: 0
+  preset: "minimal"
+additional_services:
+  - tx_fuzz
+  - spamoor
+global_log_level: debug
diff --git a/scripts/tests/genesis-sync.sh b/scripts/tests/genesis-sync.sh
new file mode 100755
index 0000000000..39628c9e73
--- /dev/null
+++ b/scripts/tests/genesis-sync.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/env bash
+#
+# Genesis sync test on a local network.
+#
+# Start a local testnet, shut down non-validator nodes for a period, then restart them
+# and monitor their sync progress from genesis to head.
+#
+# Usage: genesis-sync.sh [ENCLAVE_NAME] [CONFIG] [FORK_TYPE] [OFFLINE_DURATION_SECS]
+
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+ENCLAVE_NAME=${1:-genesis-sync-testnet}
+CONFIG=${2:-$SCRIPT_DIR/genesis-sync-config-electra.yaml}
+FORK_TYPE=${3:-electra} # electra or fulu
+OFFLINE_DURATION_SECS=${4:-120} # stopped duration of non validating nodes
+
+# Test configuration
+# ------------------------------------------------------
+# Interval for polling the /lighthouse/syncing endpoint for sync status
+# Reduce the polling time so that some progress can be seen
+POLL_INTERVAL_SECS=0.5
+# Upper bound for one status request, so a hung connection cannot outlast the test timeout.
+CURL_MAX_TIME_SECS=5
+# Timeout for this test, if the nodes fail to sync, fail the test.
+TIMEOUT_MINS=5
+TIMEOUT_SECS=$((TIMEOUT_MINS * 60))
+# ------------------------------------------------------
+
+echo "Starting genesis sync test with:"
+echo " Fork: $FORK_TYPE"
+echo " Offline duration: ${OFFLINE_DURATION_SECS}s"
+
+# Polls a node's sync status and prints it.
+#   $1 - key into node_urls ("supernode" or "fullnode")
+# Calls mark_node_complete once the node reports "Synced".
+poll_node() {
+    local node_type=$1
+    local url=${node_urls[$node_type]}
+    local response sync_state status fields
+
+    # --max-time bounds the request; without it an unresponsive node blocks the timeout check.
+    response=$(curl -s --max-time "$CURL_MAX_TIME_SECS" "${url}/lighthouse/syncing" 2>/dev/null)
+
+    if [ -z "$response" ] || [ "$response" = "null" ]; then
+        echo "${node_type} status: No response or null response"
+        return
+    fi
+
+    # Print syncing status
+    sync_state=$(echo "$response" | jq -r 'if (.data | type) == "object" then "object" else "string" end' 2>/dev/null)
+
+    if [ "$sync_state" = "object" ]; then
+        status=$(echo "$response" | jq -r '.data | keys[0] // "Unknown"')
+        # Pass the state name to jq as data instead of splicing it into the filter text.
+        fields=$(echo "$response" | jq -r --arg state "$status" '.data[$state] | to_entries | map("\(.key): \(.value)") | join(", ")')
+        echo "${node_type} status: ${status}, ${fields}"
+    else
+        status=$(echo "$response" | jq -r '.data' 2>/dev/null)
+        echo "${node_type} status: ${status:-Unknown}"
+
+        # The test is complete when the node is synced
+        if [ "$status" = "Synced" ]; then
+            mark_node_complete "$node_type"
+        fi
+    fi
+}
+
+# Marks a node as complete and records the time; repeat calls for the same node are no-ops.
+#   $1 - key into node_completed / node_complete_time
+mark_node_complete() {
+    local node_type=$1
+    if [ "${node_completed[$node_type]}" = false ]; then
+        node_completed[$node_type]=true
+        node_complete_time[$node_type]=$(date +%s)
+        echo "${node_type} completed sync in $((node_complete_time[$node_type] - sync_start_time)) seconds"
+    fi
+}
+
+# Runs stop_local_testnet.sh for the enclave, then exits the script.
+#   $1 - exit code to terminate with
+exit_and_dump_logs() {
+    local exit_code=$1
+    echo "Shutting down..."
+    "$SCRIPT_DIR/../local_testnet/stop_local_testnet.sh" "$ENCLAVE_NAME"
+    echo "Test completed with exit code $exit_code."
+    exit "$exit_code"
+}
+
+# Start the nodes
+if ! "$SCRIPT_DIR/../local_testnet/start_local_testnet.sh" -e "$ENCLAVE_NAME" -b false -n "$CONFIG"; then
+    echo "Failed to start local testnet"
+    exit_and_dump_logs 1
+fi
+
+# Wait for 10s before stopping non-validating nodes
+sleep 10
+
+# These are non validating nodes
+supernode="cl-3-lighthouse-geth"
+fullnode="cl-4-lighthouse-geth"
+
+# Stop the non-validator nodes. Abort if a stop fails: the node would never fall behind
+# and the test would pass without exercising sync.
+for service in "$supernode" "$fullnode"; do
+    if ! kurtosis service stop "$ENCLAVE_NAME" "$service"; then
+        echo "Failed to stop ${service}"
+        exit_and_dump_logs 1
+    fi
+done
+
+echo "Non-validator nodes stopped. Waiting ${OFFLINE_DURATION_SECS} seconds..."
+
+# Display the time every 10s when the nodes are stopped
+remaining_time=$OFFLINE_DURATION_SECS
+while [ "$remaining_time" -gt 0 ]; do
+    # Sleep at most 10s so a duration that is not a multiple of 10 is honoured exactly.
+    step=$((remaining_time < 10 ? remaining_time : 10))
+    sleep "$step"
+    remaining_time=$((remaining_time - step))
+    echo "Nodes are stopped for $((OFFLINE_DURATION_SECS - remaining_time))s, ${remaining_time}s remains..."
+done
+
+echo "Resuming non-validator nodes..."
+
+# Resume the non validating nodes
+for service in "$supernode" "$fullnode"; do
+    if ! kurtosis service start "$ENCLAVE_NAME" "$service"; then
+        echo "Failed to start ${service}"
+        exit_and_dump_logs 1
+    fi
+done
+
+# The time at which syncing starts after the node was stopped
+sync_start_time=$(date +%s)
+
+# Get beacon API URLs for non validating nodes for query
+supernode_url=$(kurtosis port print "$ENCLAVE_NAME" "$supernode" http)
+fullnode_url=$(kurtosis port print "$ENCLAVE_NAME" "$fullnode" http)
+
+# Initialize statuses
+declare -A node_completed
+declare -A node_complete_time
+declare -A node_urls
+
+node_urls["supernode"]="$supernode_url"
+node_urls["fullnode"]="$fullnode_url"
+node_completed["supernode"]=false
+node_completed["fullnode"]=false
+
+echo "Polling sync status until nodes are synced or timeout of ${TIMEOUT_MINS} mins"
+
+while [ "${node_completed[supernode]}" = false ] || [ "${node_completed[fullnode]}" = false ]; do
+    current_time=$(date +%s)
+    elapsed=$((current_time - sync_start_time))
+
+    if [ "$elapsed" -ge "$TIMEOUT_SECS" ]; then
+        echo "ERROR: Nodes timed out syncing after ${TIMEOUT_MINS} minutes. Exiting."
+        exit_and_dump_logs 1
+    fi
+
+    # Poll each node that hasn't completed yet
+    for node in "supernode" "fullnode"; do
+        if [ "${node_completed[$node]}" = false ]; then
+            poll_node "$node"
+        fi
+    done
+
+    sleep "$POLL_INTERVAL_SECS"
+done
+
+echo "Genesis sync test complete! Both supernode and fullnode have synced successfully."
+echo "Supernode time: $((node_complete_time[supernode] - sync_start_time)) seconds"
+echo "Fullnode time: $((node_complete_time[fullnode] - sync_start_time)) seconds"
+exit_and_dump_logs 0