mirror of
https://github.com/sigp/lighthouse.git
synced 2026-03-03 00:31:50 +00:00
Add e2e sync tests to CI (#7530)
This PR adds the following sync tests to the CI workflow - triggered when a PR is labeled `syncing` - to ensure we have some e2e coverage of basic sync scenarios: - [x] checkpoint sync to a live network (covers range and backfill sync for the _current_ fork) - [x] checkpoint sync to a running devnet (covers range and backfill sync for the _next_ fork) It seems to work fine on GitHub-hosted runners, but if performance becomes an issue we could switch to self-hosted runners for the Sepolia sync test. (Standard runners have 4 CPUs and 16 GB RAM - I think that _should_ be enough for the Sepolia / devnet networks.) The following tests have been **removed** from this PR and moved to a separate issue (#7550): - [x] genesis sync on a local devnet (covers current and next fork) - [x] brief shutdown and restart (covers lookup sync) - [x] longer shutdown and restart (covers range sync) I'm hoping to keep the maintenance effort for these e2e tests to a minimum - hopefully, longer term, we could have some generic e2e tests that work for all clients, so that the maintenance effort can be spread across teams. ### Latest test run: https://github.com/sigp/lighthouse/actions/runs/15411744248 ### Results: <img width="687" alt="image" src="https://github.com/user-attachments/assets/c7178291-7b39-4f3b-a339-d3715eb16081" /> <img width="693" alt="image" src="https://github.com/user-attachments/assets/a8fc3520-296c-4baf-ae1e-1e887e660a3c" /> #### Logs are available as artifacts: <img width="629" alt="image" src="https://github.com/user-attachments/assets/3c0e1cd7-9c94-4d0c-be62-5e45179ab8f3" />
This commit is contained in:
49
.github/workflows/local-testnet.yml
vendored
49
.github/workflows/local-testnet.yml
vendored
@@ -67,6 +67,7 @@ jobs:
|
|||||||
working-directory: scripts/local_testnet
|
working-directory: scripts/local_testnet
|
||||||
|
|
||||||
- name: Upload logs artifact
|
- name: Upload logs artifact
|
||||||
|
if: always()
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: logs-local-testnet
|
name: logs-local-testnet
|
||||||
@@ -125,6 +126,7 @@ jobs:
|
|||||||
working-directory: scripts/tests
|
working-directory: scripts/tests
|
||||||
|
|
||||||
- name: Upload logs artifact
|
- name: Upload logs artifact
|
||||||
|
if: always()
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: logs-doppelganger-protection-success
|
name: logs-doppelganger-protection-success
|
||||||
@@ -160,6 +162,7 @@ jobs:
|
|||||||
working-directory: scripts/tests
|
working-directory: scripts/tests
|
||||||
|
|
||||||
- name: Upload logs artifact
|
- name: Upload logs artifact
|
||||||
|
if: always()
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: logs-doppelganger-protection-failure
|
name: logs-doppelganger-protection-failure
|
||||||
@@ -167,6 +170,48 @@ jobs:
|
|||||||
scripts/local_testnet/logs
|
scripts/local_testnet/logs
|
||||||
retention-days: 3
|
retention-days: 3
|
||||||
|
|
||||||
|
# Checkpoint-sync e2e tests: one matrix leg syncs to a live network (current fork), the other to a
# running devnet (usually the next scheduled fork).
checkpoint-sync-test:
  name: checkpoint-sync-test-${{ matrix.network }}
  needs: dockerfile-ubuntu
  runs-on: ubuntu-latest
  # Opt-in: the job only runs for pull requests that carry the `syncing` label.
  if: contains(github.event.pull_request.labels.*.name, 'syncing')
  continue-on-error: true
  strategy:
    matrix:
      network:
        - sepolia
        - devnet
  steps:
    - uses: actions/checkout@v4

    - name: Install Kurtosis
      run: |
        echo "deb [trusted=yes] https://apt.fury.io/kurtosis-tech/ /" | sudo tee /etc/apt/sources.list.d/kurtosis.list
        sudo apt update
        sudo apt install -y kurtosis-cli
        kurtosis analytics disable

    # The image tarball is fetched as the `lighthouse-docker` artifact and loaded into the local daemon.
    - name: Download Docker image artifact
      uses: actions/download-artifact@v4
      with:
        name: lighthouse-docker
        path: .

    - name: Load Docker image
      run: docker load -i lighthouse-docker.tar

    # Arguments: enclave name, then the per-network Kurtosis config file.
    - name: Run the checkpoint sync test script
      working-directory: scripts/tests
      run: |
        ./checkpoint-sync.sh "sync-${{ matrix.network }}" "checkpoint-sync-config-${{ matrix.network }}.yaml"

    # `always()` keeps the logs available when the sync test step fails.
    - name: Upload logs artifact
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: logs-checkpoint-sync-${{ matrix.network }}
        path: scripts/local_testnet/logs
        retention-days: 3
|
||||||
|
|
||||||
# This job succeeds ONLY IF all others succeed. It is used by the merge queue to determine whether
|
# This job succeeds ONLY IF all others succeed. It is used by the merge queue to determine whether
|
||||||
# a PR is safe to merge. New jobs should be added here.
|
# a PR is safe to merge. New jobs should be added here.
|
||||||
@@ -182,4 +227,6 @@ jobs:
|
|||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- name: Check that success job is dependent on all others
|
- name: Check that success job is dependent on all others
|
||||||
run: ./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success
|
run: |
|
||||||
|
exclude_jobs='checkpoint-sync-test'
|
||||||
|
./scripts/ci/check-success-job.sh ./.github/workflows/local-testnet.yml local-testnet-success "$exclude_jobs"
|
||||||
|
|||||||
@@ -5,8 +5,13 @@ set -euf -o pipefail
|
|||||||
|
|
||||||
# Inputs:
#   $1 - path to the workflow YAML file to inspect
#   $2 - name of the "success" job that must list every other job under `needs`
#   $3 - (optional) extended regex of job names to leave out of the completeness check
YAML=$1
SUCCESS_JOB=$2
EXCLUDE_JOBS_REGEX=${3:-}

# Reads job names on stdin and drops those matching EXCLUDE_JOBS_REGEX.
# With no regex supplied, stdin is passed through unchanged.
filter_excluded_jobs() {
  if [ -n "$EXCLUDE_JOBS_REGEX" ]; then
    # grep exits 1 when every line was filtered out, which is a valid outcome here.
    # Any other non-zero status (e.g. an invalid regex) is a real error and must propagate.
    grep -Ev "$EXCLUDE_JOBS_REGEX" || [ $? -eq 1 ]
  else
    cat
  fi
}

# Every job defined in the workflow, minus the success job itself and any excluded jobs.
# This is the only writer of all_jobs.txt: the pre-change unfiltered pipeline is not kept,
# because running it afterwards would overwrite the filtered list.
yq '... comments="" | .jobs | map(. | key) | .[]' < "$YAML" |
  grep -v "$SUCCESS_JOB" |
  filter_excluded_jobs |
  sort > all_jobs.txt

# Every job the success job declares under `needs`.
yq "... comments=\"\" | .jobs.$SUCCESS_JOB.needs[]" < "$YAML" | grep -v "$SUCCESS_JOB" | sort > dep_jobs.txt

# The two sorted lists must be identical; any difference means a job is missing from `needs`
# (or `needs` names a job that does not exist / is excluded).
diff all_jobs.txt dep_jobs.txt || (echo "COMPLETENESS CHECK FAILED" && exit 1)
rm all_jobs.txt dep_jobs.txt
|
rm all_jobs.txt dep_jobs.txt
|
||||||
|
|||||||
16
scripts/tests/checkpoint-sync-config-devnet.yaml
Normal file
16
scripts/tests/checkpoint-sync-config-devnet.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Kurtosis (`ethereum-package`) arguments for checkpoint syncing against a running devnet
# supported by ethPandaOps.
participants:
  # Two Lighthouse participants on the `lighthouse:local` image: one supernode, one non-supernode.
  - cl_type: lighthouse
    cl_image: 'lighthouse:local'
    supernode: true
  - cl_type: lighthouse
    cl_image: 'lighthouse:local'
    supernode: false

checkpoint_sync_enabled: true
checkpoint_sync_url: 'https://checkpoint-sync.fusaka-devnet-0.ethpandaops.io'

global_log_level: debug

network_params:
  network: fusaka-devnet-0
|
||||||
16
scripts/tests/checkpoint-sync-config-sepolia.yaml
Normal file
16
scripts/tests/checkpoint-sync-config-sepolia.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Kurtosis (`ethereum-package`) arguments for checkpoint syncing against a live network (Sepolia).
participants:
  # Two Lighthouse participants on the `lighthouse:local` image: one supernode, one non-supernode.
  - cl_type: lighthouse
    cl_image: 'lighthouse:local'
    supernode: true
  - cl_type: lighthouse
    cl_image: 'lighthouse:local'
    supernode: false

checkpoint_sync_enabled: true
checkpoint_sync_url: 'https://checkpoint-sync.sepolia.ethpandaops.io'

global_log_level: debug

network_params:
  network: sepolia
|
||||||
127
scripts/tests/checkpoint-sync.sh
Executable file
127
scripts/tests/checkpoint-sync.sh
Executable file
@@ -0,0 +1,127 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
#
|
||||||
|
# Checkpoint sync to a live network.
|
||||||
|
#
|
||||||
|
# Start with checkpoint sync and let the node(s) sync to head and perform backfill for a specified number of slots.
|
||||||
|
# This test ensures we cover all sync components (range, lookup, backfill) and measures sync speed
|
||||||
|
# to detect any performance regressions.
|
||||||
|
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Optional positional arguments:
#   $1 - Kurtosis enclave name
#   $2 - Kurtosis config file (defaults to the Sepolia config next to this script)
ENCLAVE_NAME="${1:-sync-testnet}"
CONFIG="${2:-$SCRIPT_DIR/checkpoint-sync-config-sepolia.yaml}"

# Test configuration
# ------------------------------------------------------
# Seconds to wait between polls of the /lighthouse/syncing endpoint.
POLL_INTERVAL_SECS=5
# Number of backfilled slots each node must report for the test to pass.
TARGET_BACKFILL_SLOTS=1024
# The test fails if the node(s) have not backfilled `TARGET_BACKFILL_SLOTS` slots within this many minutes.
TIMEOUT_MINS=10
TIMEOUT_SECS=$((60 * TIMEOUT_MINS))
# ------------------------------------------------------
|
||||||
|
# ------------------------------------------------------
|
||||||
|
|
||||||
|
# Polls one node's `/lighthouse/syncing` endpoint, prints its sync status, and marks the node
# complete once it reports `BackFillSyncing` with at least `TARGET_BACKFILL_SLOTS` slots completed.
#
# Arguments:
#   $1 - node key ("supernode" or "fullnode") used to index `node_urls` and `node_completed`
poll_node() {
  local node_type=$1
  local url=${node_urls[$node_type]}
  # Per-poll scratch values; kept local so one node's poll cannot leak into the next.
  local response sync_state status fields completed

  # Cap the request time so a hung connection cannot hold the polling loop past the test timeout.
  response=$(curl -s --max-time 10 "${url}/lighthouse/syncing")

  if [ -z "$response" ] || [ "$response" = "null" ]; then
    echo "${node_type} status: No response or null response"
    return
  fi

  # Print syncing status.
  # `.data` is either an object keyed by the state name (state with fields) or a bare value.
  sync_state=$(echo "$response" | jq -r 'if (.data | type) == "object" then "object" else "string" end' 2>/dev/null)

  if [ "$sync_state" = "object" ]; then
    status=$(echo "$response" | jq -r '.data | keys[0] // "Unknown"')
    # Hand the state name to jq as a variable instead of splicing it into the filter text.
    fields=$(echo "$response" | jq -r --arg state "$status" '.data[$state] | to_entries | map("\(.key): \(.value)") | join(", ")')
    echo "${node_type} status: ${status}, ${fields}"
  else
    status=$(echo "$response" | jq -r '.data' 2>/dev/null)
    echo "${node_type} status: ${status:-Unknown}"
  fi

  # Check for completion criteria
  if [ "$status" = "BackFillSyncing" ]; then
    completed=$(echo "$response" | jq -r '.data.BackFillSyncing.completed // 0')
    # Compare only when jq produced a plain non-negative integer; anything else means "not done yet".
    if [[ "$completed" =~ ^[0-9]+$ ]] && [ "$completed" -ge "$TARGET_BACKFILL_SLOTS" ]; then
      mark_node_complete "$node_type"
    fi
  fi
  # For other states (Synced, SyncingFinalized, SyncingHead, SyncTransition, Stalled, Unknown),
  # we continue polling.
  # NOTE: there is a bug where Lighthouse briefly switches to "Synced" before completing backfilling.
  # We ignore this state as it's unlikely a node is fully synced without going through backfilling
  # `TARGET_BACKFILL_SLOTS` slots (only possible on a new network).
}
|
||||||
|
|
||||||
|
# Flags a node as finished, records the completion timestamp, and prints how long the backfill
# took measured from `start_time`. Only the first call for a given node has any effect.
#
# Arguments:
#   $1 - node key used to index `node_completed` and `node_complete_time`
mark_node_complete() {
  local node_type=$1

  # Already flagged: keep the first recorded completion time.
  [ "${node_completed[$node_type]}" = false ] || return 0

  node_completed[$node_type]=true
  node_complete_time[$node_type]=$(date +%s)
  echo "${node_type} completed backfill in $((node_complete_time[$node_type] - start_time)) seconds"
}
|
||||||
|
|
||||||
|
# Stops the testnet enclave through `stop_local_testnet.sh`, then exits the script.
# NOTE(review): presumably `stop_local_testnet.sh` is what dumps the logs - confirm.
#
# Arguments:
#   $1 - exit code the script terminates with
exit_and_dump_logs() {
  local exit_code=$1
  echo "Shutting down..."
  # Quoted so a checkout path or enclave name containing spaces is passed as a single word.
  "$SCRIPT_DIR/../local_testnet/stop_local_testnet.sh" "$ENCLAVE_NAME"
  echo "Test completed with exit code $exit_code."
  exit "$exit_code"
}
|
||||||
|
|
||||||
|
# Start the nodes.
# Expansions are quoted so paths or names containing spaces survive word splitting, and the
# command is tested directly instead of inspecting `$?` afterwards.
if ! "$SCRIPT_DIR/../local_testnet/start_local_testnet.sh" -e "$ENCLAVE_NAME" -b false -n "$CONFIG"; then
  echo "Failed to start local testnet"
  exit_and_dump_logs 1
fi
|
||||||
|
|
||||||
|
# Reference point for the timeout and for the per-node completion times.
start_time=$(date +%s)

# Get all beacon API URLs
supernode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-1-lighthouse-geth http)
fullnode_url=$(kurtosis port print "$ENCLAVE_NAME" cl-2-lighthouse-geth http)

# Without both URLs every poll would fail until the timeout expires, so stop right away.
if [ -z "$supernode_url" ] || [ -z "$fullnode_url" ]; then
  echo "Failed to resolve beacon API URLs for enclave ${ENCLAVE_NAME}"
  exit_and_dump_logs 1
fi

# Initialize statuses: per-node state, keyed by node type ("supernode" / "fullnode").
declare -A node_completed
declare -A node_complete_time
declare -A node_urls

node_urls["supernode"]="$supernode_url"
node_urls["fullnode"]="$fullnode_url"
node_completed["supernode"]=false
node_completed["fullnode"]=false
|
||||||
|
|
||||||
|
echo "Polling sync status until backfill reaches ${TARGET_BACKFILL_SLOTS} slots or timeout of ${TIMEOUT_MINS} mins"

# Keep polling until both nodes have been marked complete.
until [ "${node_completed[supernode]}" != false ] && [ "${node_completed[fullnode]}" != false ]; do
  # Give up once the time budget measured from `start_time` is spent.
  now=$(date +%s)
  if [ "$((now - start_time))" -ge "$TIMEOUT_SECS" ]; then
    echo "ERROR: Nodes timed out syncing after ${TIMEOUT_MINS} minutes. Exiting."
    exit_and_dump_logs 1
  fi

  # Nodes that have already finished are not polled again.
  for node in supernode fullnode; do
    [ "${node_completed[$node]}" = false ] && poll_node "$node"
  done

  sleep "$POLL_INTERVAL_SECS"
done
|
||||||
|
|
||||||
|
# Report how long each node took (relative to `start_time`), then shut down with a success code.
supernode_secs=$((node_complete_time[supernode] - start_time))
fullnode_secs=$((node_complete_time[fullnode] - start_time))

echo "Sync test complete! Both supernode and fullnode have synced to HEAD and backfilled ${TARGET_BACKFILL_SLOTS} slots."
echo "Supernode time: ${supernode_secs} seconds"
echo "Fullnode time: ${fullnode_secs} seconds"
exit_and_dump_logs 0
|
||||||
Reference in New Issue
Block a user