diff --git a/beacon_node/rest_api/src/metrics.rs b/beacon_node/rest_api/src/metrics.rs
index 7e7afa1121..b367a21be5 100644
--- a/beacon_node/rest_api/src/metrics.rs
+++ b/beacon_node/rest_api/src/metrics.rs
@@ -3,6 +3,7 @@ use crate::{ApiError, ApiResult};
 use beacon_chain::{BeaconChain, BeaconChainTypes};
 use hyper::{Body, Request};
 use lighthouse_metrics::{Encoder, TextEncoder};
+use rest_types::Health;
 use std::path::PathBuf;
 use std::sync::Arc;
 
@@ -36,6 +37,39 @@ lazy_static! {
         "http_server_validator_duties_get_request_duration_seconds",
         "Time taken to respond to GET /validator/duties"
     );
+    // Process and system health gauges; `get_prometheus` refreshes them from `Health::observe`.
+    pub static ref PROCESS_NUM_THREADS: Result<IntGauge> = try_create_int_gauge(
+        "process_num_threads",
+        "Number of threads used by the current process"
+    );
+    pub static ref PROCESS_RES_MEM: Result<IntGauge> = try_create_int_gauge(
+        "process_resident_memory_bytes",
+        "Resident memory used by the current process"
+    );
+    pub static ref PROCESS_VIRT_MEM: Result<IntGauge> = try_create_int_gauge(
+        "process_virtual_memory_bytes",
+        "Virtual memory used by the current process"
+    );
+    pub static ref SYSTEM_VIRT_MEM_TOTAL: Result<IntGauge> =
+        try_create_int_gauge("system_virt_mem_total_bytes", "Total system virtual memory");
+    pub static ref SYSTEM_VIRT_MEM_AVAILABLE: Result<IntGauge> = try_create_int_gauge(
+        "system_virt_mem_available_bytes",
+        "Available system virtual memory"
+    );
+    pub static ref SYSTEM_VIRT_MEM_USED: Result<IntGauge> =
+        try_create_int_gauge("system_virt_mem_used_bytes", "Used system virtual memory");
+    pub static ref SYSTEM_VIRT_MEM_FREE: Result<IntGauge> =
+        try_create_int_gauge("system_virt_mem_free_bytes", "Free system virtual memory");
+    pub static ref SYSTEM_VIRT_MEM_PERCENTAGE: Result<Gauge> = try_create_float_gauge(
+        "system_virt_mem_percentage",
+        "Percentage of used virtual memory"
+    );
+    pub static ref SYSTEM_LOADAVG_1: Result<Gauge> =
+        try_create_float_gauge("system_loadavg_1", "Loadavg over 1 minute");
+    pub static ref SYSTEM_LOADAVG_5: Result<Gauge> =
+        try_create_float_gauge("system_loadavg_5", "Loadavg over 5 minutes");
+    pub static ref SYSTEM_LOADAVG_15: Result<Gauge> =
+        try_create_float_gauge("system_loadavg_15", "Loadavg over 15 minutes");
 }
 
 /// Returns the full set of Prometheus metrics for the Beacon Node application.
@@ -72,6 +106,28 @@ pub fn get_prometheus(
     store::scrape_for_metrics(&db_path, &freezer_db_path);
     beacon_chain::scrape_for_metrics(&beacon_chain);
 
+    // Health observation is best-effort: when it fails, the gauges below are left untouched
+    // and the scrape still succeeds.
+    if let Ok(health) = Health::observe() {
+        set_gauge(&PROCESS_NUM_THREADS, health.pid_num_threads as i64);
+        set_gauge(&PROCESS_RES_MEM, health.pid_mem_resident_set_size as i64);
+        set_gauge(&PROCESS_VIRT_MEM, health.pid_mem_virtual_memory_size as i64);
+        set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64);
+        set_gauge(
+            &SYSTEM_VIRT_MEM_AVAILABLE,
+            health.sys_virt_mem_available as i64,
+        );
+        set_gauge(&SYSTEM_VIRT_MEM_USED, health.sys_virt_mem_used as i64);
+        set_gauge(&SYSTEM_VIRT_MEM_FREE, health.sys_virt_mem_free as i64);
+        set_float_gauge(
+            &SYSTEM_VIRT_MEM_PERCENTAGE,
+            health.sys_virt_mem_percent as f64,
+        );
+        set_float_gauge(&SYSTEM_LOADAVG_1, health.sys_loadavg_1);
+        set_float_gauge(&SYSTEM_LOADAVG_5, health.sys_loadavg_5);
+        set_float_gauge(&SYSTEM_LOADAVG_15, health.sys_loadavg_15);
+    }
+
     encoder
         .encode(&lighthouse_metrics::gather(), &mut buffer)
         .unwrap();
diff --git a/beacon_node/rest_api/src/node.rs b/beacon_node/rest_api/src/node.rs
index ffd07f8f10..b8b6a4fd2c 100644
--- a/beacon_node/rest_api/src/node.rs
+++ b/beacon_node/rest_api/src/node.rs
@@ -1,8 +1,8 @@
 use crate::response_builder::ResponseBuilder;
-use crate::ApiResult;
+use crate::{ApiError, ApiResult};
 use eth2_libp2p::{types::SyncState, NetworkGlobals};
 use hyper::{Body, Request};
-use rest_types::{SyncingResponse, SyncingStatus};
+use rest_types::{Health, SyncingResponse, SyncingStatus};
 use std::sync::Arc;
 use types::{EthSpec, Slot};
 use version;
@@ -41,3 +41,12 @@ pub fn syncing(
         sync_status,
     })
 }
+
+/// HTTP handler to return the health of the beacon node process and its host.
+///
+/// Responds with `ApiError::ServerError` if the health statistics cannot be observed.
+pub fn get_health(req: Request<Body>) -> ApiResult {
+    let health = Health::observe().map_err(ApiError::ServerError)?;
+
+    ResponseBuilder::new(&req)?.body_no_ssz(&health)
+}
diff --git a/beacon_node/rest_api/src/router.rs b/beacon_node/rest_api/src/router.rs
index 6c7924cce3..7220927152 100644
--- a/beacon_node/rest_api/src/router.rs
+++ b/beacon_node/rest_api/src/router.rs
@@ -33,6 +33,7 @@ pub async fn route(
     let log = local_log.clone();
     let request_result = match (req.method(), path.as_ref()) {
         // Methods for Client
+        (&Method::GET, "/node/health") => node::get_health(req),
         (&Method::GET, "/node/version") => node::get_version(req),
         (&Method::GET, "/node/syncing") => {
             // inform the current slot, or set to 0
diff --git a/beacon_node/rest_api/tests/test.rs b/beacon_node/rest_api/tests/test.rs
index de83d90d64..a38d62a449 100644
--- a/beacon_node/rest_api/tests/test.rs
+++ b/beacon_node/rest_api/tests/test.rs
@@ -1252,3 +1252,18 @@ mod validator_attestation {
         );
     }
 }
+
+// `Health::observe` is backed by Linux-only process statistics (`procinfo` reads `/proc`),
+// so the endpoint can only be exercised there.
+#[cfg(target_os = "linux")]
+#[test]
+fn get_health() {
+    let mut env = build_env();
+
+    let node = build_node(&mut env, testing_client_config());
+    let remote_node = node.remote_node().expect("should produce remote node");
+
+    env.runtime()
+        .block_on(remote_node.http.node().get_health())
+        .unwrap();
+}
diff --git a/book/src/http/node.md b/book/src/http/node.md
index 3c32c6c1cc..801ec7d0e8 100644
--- a/book/src/http/node.md
+++ b/book/src/http/node.md
@@ -55,3 +55,36 @@ Typical Responses | 200
     }
 }
 ```
+
+## `/node/health`
+
+Requests information about the health of the beacon node.
+
+### HTTP Specification
+
+| Property | Specification |
+| --- |--- |
+Path | `/node/health`
+Method | GET
+JSON Encoding | Object
+Query Parameters | None
+Typical Responses | 200
+
+### Example Response
+
+```json
+{
+    "pid": 96160,
+    "pid_num_threads": 30,
+    "pid_mem_resident_set_size": 55476224,
+    "pid_mem_virtual_memory_size": 2081382400,
+    "sys_virt_mem_total": 16721076224,
+    "sys_virt_mem_available": 7423197184,
+    "sys_virt_mem_used": 8450183168,
+    "sys_virt_mem_free": 3496345600,
+    "sys_virt_mem_percent": 55.605743,
+    "sys_loadavg_1": 1.56,
+    "sys_loadavg_5": 2.61,
+    "sys_loadavg_15": 2.43
+}
+```
diff --git a/common/remote_beacon_node/src/lib.rs b/common/remote_beacon_node/src/lib.rs
index 612a7c01a3..c8b628205b 100644
--- a/common/remote_beacon_node/src/lib.rs
+++ b/common/remote_beacon_node/src/lib.rs
@@ -19,7 +19,7 @@ use url::Url;
 pub use operation_pool::PersistedOperationPool;
 pub use proto_array_fork_choice::core::ProtoArray;
 pub use rest_types::{
-    CanonicalHeadResponse, Committee, HeadBeaconBlock, IndividualVotesRequest,
+    CanonicalHeadResponse, Committee, HeadBeaconBlock, Health, IndividualVotesRequest,
     IndividualVotesResponse, SyncingResponse, ValidatorDutiesRequest, ValidatorDutyBytes,
     ValidatorRequest, ValidatorResponse, ValidatorSubscription,
 };
@@ -612,6 +612,13 @@ impl Node {
         client.json_get(url, vec![]).await
     }
 
+    /// Fetches the `Health` reported by the remote node.
+    pub async fn get_health(&self) -> Result<Health, Error> {
+        let client = self.0.clone();
+        let url = self.url("health")?;
+        client.json_get(url, vec![]).await
+    }
+
     pub async fn syncing_status(&self) -> Result<SyncingResponse, Error> {
         let client = self.0.clone();
         let url = self.url("syncing")?;
diff --git a/common/rest_types/Cargo.toml b/common/rest_types/Cargo.toml
index 5874601cc5..31126aa205 100644
--- a/common/rest_types/Cargo.toml
+++ b/common/rest_types/Cargo.toml
@@ -14,3 +14,9 @@ state_processing = { path = "../../consensus/state_processing" }
 bls = { path = "../../crypto/bls" }
 serde = { version = "1.0.110", features = ["derive"] }
 rayon = "1.3.0"
+
+# `procinfo` reads `/proc` and is Linux-only, so the health dependencies are only pulled in
+# on Linux targets.
+[target.'cfg(target_os = "linux")'.dependencies]
+psutil = "3.1.0"
+procinfo = "0.4.2"
diff --git a/common/rest_types/src/lib.rs b/common/rest_types/src/lib.rs
index 2c834f6e71..79a66e0341 100644
--- a/common/rest_types/src/lib.rs
+++ b/common/rest_types/src/lib.rs
@@ -18,4 +18,4 @@ pub use validator::{
 
 pub use consensus::{IndividualVote, IndividualVotesRequest, IndividualVotesResponse};
 
-pub use node::{SyncingResponse, SyncingStatus};
+pub use node::{Health, SyncingResponse, SyncingStatus};
diff --git a/common/rest_types/src/node.rs b/common/rest_types/src/node.rs
index ecacacc1ce..9246c13d92 100644
--- a/common/rest_types/src/node.rs
+++ b/common/rest_types/src/node.rs
@@ -1,4 +1,6 @@
 //! Collection of types for the /node HTTP
+#[cfg(target_os = "linux")]
+use {procinfo::pid, psutil::process::Process};
 use serde::{Deserialize, Serialize};
 use ssz_derive::{Decode, Encode};
 use types::Slot;
@@ -30,3 +32,79 @@ pub struct SyncingResponse {
     /// The current sync status.
     pub sync_status: SyncingStatus,
 }
+
+/// Reports on the health of the Lighthouse instance.
+#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
+pub struct Health {
+    /// The pid of this process.
+    pub pid: u32,
+    /// The number of threads used by this pid.
+    pub pid_num_threads: i32,
+    /// The total resident memory used by this pid.
+    pub pid_mem_resident_set_size: u64,
+    /// The total virtual memory used by this pid.
+    pub pid_mem_virtual_memory_size: u64,
+    /// Total virtual memory on the system.
+    pub sys_virt_mem_total: u64,
+    /// Total virtual memory available for new processes.
+    pub sys_virt_mem_available: u64,
+    /// Total virtual memory used on the system.
+    pub sys_virt_mem_used: u64,
+    /// Total virtual memory not used on the system.
+    pub sys_virt_mem_free: u64,
+    /// Percentage of virtual memory used on the system.
+    pub sys_virt_mem_percent: f32,
+    /// System load average over 1 minute.
+    pub sys_loadavg_1: f64,
+    /// System load average over 5 minutes.
+    pub sys_loadavg_5: f64,
+    /// System load average over 15 minutes.
+    pub sys_loadavg_15: f64,
+}
+
+impl Health {
+    /// Observes the current process and host and returns the resulting `Health`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a message naming the first statistic that could not be read.
+    #[cfg(target_os = "linux")]
+    pub fn observe() -> Result<Self, String> {
+        let process =
+            Process::current().map_err(|e| format!("Unable to get current process: {:?}", e))?;
+
+        let process_mem = process
+            .memory_info()
+            .map_err(|e| format!("Unable to get process memory info: {:?}", e))?;
+
+        // The thread count is taken from `procinfo`'s view of this process' `/proc` stat entry.
+        let stat = pid::stat_self().map_err(|e| format!("Unable to get stat: {:?}", e))?;
+
+        let vm = psutil::memory::virtual_memory()
+            .map_err(|e| format!("Unable to get virtual memory: {:?}", e))?;
+        let loadavg =
+            psutil::host::loadavg().map_err(|e| format!("Unable to get loadavg: {:?}", e))?;
+
+        Ok(Self {
+            pid: process.pid().into(),
+            pid_num_threads: stat.num_threads,
+            pid_mem_resident_set_size: process_mem.rss().into(),
+            pid_mem_virtual_memory_size: process_mem.vms().into(),
+            sys_virt_mem_total: vm.total().into(),
+            sys_virt_mem_available: vm.available().into(),
+            sys_virt_mem_used: vm.used().into(),
+            sys_virt_mem_free: vm.free().into(),
+            sys_virt_mem_percent: vm.percent().into(),
+            sys_loadavg_1: loadavg.one.into(),
+            sys_loadavg_5: loadavg.five.into(),
+            sys_loadavg_15: loadavg.fifteen.into(),
+        })
+    }
+
+    /// Fallback for non-Linux targets, where the `procinfo` and `psutil` dependencies are not
+    /// built: always returns an error instead of breaking the build.
+    #[cfg(not(target_os = "linux"))]
+    pub fn observe() -> Result<Self, String> {
+        Err("Health is only available on Linux".into())
+    }
+}