Monitoring service api (#2251)

## Issue Addressed

N/A

## Proposed Changes

Adds a client-side API for collecting system and process metrics and pushing them to a monitoring service.
2026-06-01 13:47:16 +00:00 · 2021-05-26 05:58:41 +00:00
parent 55aada006f
commit fdaeec631b
30 changed files with 1108 additions and 65 deletions
--- a/common/warp_utils/src/metrics.rs
+++ b/common/warp_utils/src/metrics.rs
@@ -1,4 +1,4 @@
-use eth2::lighthouse::Health;
+use eth2::lighthouse::{ProcessHealth, SystemHealth};
 use lighthouse_metrics::*;

 lazy_static::lazy_static! {
@@ -14,6 +14,10 @@ lazy_static::lazy_static! {
        "process_virtual_memory_bytes",
        "Virtual memory used by the current process"
    );
+    pub static ref PROCESS_SECONDS: Result<IntGauge> = try_create_int_gauge(
+        "process_cpu_seconds_total",
+        "Total cpu time taken by the current process"
+    ); // Set by `scrape_process_health_metrics` from `health.pid_process_seconds_total`.
    pub static ref SYSTEM_VIRT_MEM_TOTAL: Result<IntGauge> =
        try_create_int_gauge("system_virt_mem_total_bytes", "Total system virtual memory");
    pub static ref SYSTEM_VIRT_MEM_AVAILABLE: Result<IntGauge> = try_create_int_gauge(
@@ -24,6 +28,10 @@ lazy_static::lazy_static! {
        try_create_int_gauge("system_virt_mem_used_bytes", "Used system virtual memory");
    pub static ref SYSTEM_VIRT_MEM_FREE: Result<IntGauge> =
        try_create_int_gauge("system_virt_mem_free_bytes", "Free system virtual memory");
+    /// Gauge exported as `system_virt_mem_cached_bytes`.
+    ///
+    /// The help text previously read "Used system virtual memory", a copy of the
+    /// `system_virt_mem_used_bytes` description; it now describes the cached figure.
+    // NOTE(review): no `set_gauge` call for this gauge is visible in this diff — confirm it is populated.
+    pub static ref SYSTEM_VIRT_MEM_CACHED: Result<IntGauge> =
+        try_create_int_gauge("system_virt_mem_cached_bytes", "Cached system virtual memory");
+    /// Gauge exported as `system_virt_mem_buffer_bytes`.
+    ///
+    /// The help text previously read "Free system virtual memory", a copy of the
+    /// `system_virt_mem_free_bytes` description; it now describes the buffers figure.
+    // NOTE(review): no `set_gauge` call for this gauge is visible in this diff — confirm it is populated.
+    pub static ref SYSTEM_VIRT_MEM_BUFFERS: Result<IntGauge> =
+        try_create_int_gauge("system_virt_mem_buffer_bytes", "System virtual memory used by buffers");
    pub static ref SYSTEM_VIRT_MEM_PERCENTAGE: Result<Gauge> = try_create_float_gauge(
        "system_virt_mem_percentage",
        "Percentage of used virtual memory"
@@ -34,15 +42,62 @@ lazy_static::lazy_static! {
        try_create_float_gauge("system_loadavg_5", "Loadavg over 5 minutes");
    pub static ref SYSTEM_LOADAVG_15: Result<Gauge> =
        try_create_float_gauge("system_loadavg_15", "Loadavg over 15 minutes");
+
+    pub static ref CPU_CORES: Result<IntGauge> = // set from `health.cpu_cores`
+        try_create_int_gauge("cpu_cores", "Number of physical cpu cores");
+    pub static ref CPU_THREADS: Result<IntGauge> = // set from `health.cpu_threads`
+        try_create_int_gauge("cpu_threads", "Number of logical cpu cores");
+
+    pub static ref CPU_SYSTEM_SECONDS_TOTAL: Result<IntGauge> = // set from `health.system_seconds_total`
+        try_create_int_gauge("cpu_system_seconds_total", "Total time spent in kernel mode");
+    pub static ref CPU_USER_SECONDS_TOTAL: Result<IntGauge> = // set from `health.user_seconds_total`
+        try_create_int_gauge("cpu_user_seconds_total", "Total time spent in user mode");
+    pub static ref CPU_IOWAIT_SECONDS_TOTAL: Result<IntGauge> = // set from `health.iowait_seconds_total`
+        try_create_int_gauge("cpu_iowait_seconds_total", "Total time spent waiting for io");
+    pub static ref CPU_IDLE_SECONDS_TOTAL: Result<IntGauge> = // set from `health.idle_seconds_total`
+        try_create_int_gauge("cpu_idle_seconds_total", "Total time spent idle");
+
+    pub static ref DISK_BYTES_TOTAL: Result<IntGauge> = // set from `health.disk_node_bytes_total`
+        try_create_int_gauge("disk_node_bytes_total", "Total capacity of disk");
+
+    pub static ref DISK_BYTES_FREE: Result<IntGauge> = // set from `health.disk_node_bytes_free`
+        try_create_int_gauge("disk_node_bytes_free", "Free space in disk");
+
+    pub static ref DISK_READS: Result<IntGauge> = // set from `health.disk_node_reads_total`
+        try_create_int_gauge("disk_node_reads_total", "Number of disk reads");
+
+    pub static ref DISK_WRITES: Result<IntGauge> = // set from `health.disk_node_writes_total`
+        try_create_int_gauge("disk_node_writes_total", "Number of disk writes");
+
+    pub static ref NETWORK_BYTES_RECEIVED: Result<IntGauge> = // set from `health.network_node_bytes_total_received`
+        try_create_int_gauge("network_node_bytes_total_received", "Total bytes received over all network interfaces");
+    pub static ref NETWORK_BYTES_SENT: Result<IntGauge> = // set from `health.network_node_bytes_total_transmit`
+        try_create_int_gauge("network_node_bytes_total_transmit", "Total bytes sent over all network interfaces");
+
+    pub static ref BOOT_TIME: Result<IntGauge> = // NOTE(review): no `set_gauge(&BOOT_TIME, ..)` is visible in this diff — confirm it is populated
+        try_create_int_gauge("misc_node_boot_ts_seconds", "Boot time as unix epoch timestamp");
 }

 pub fn scrape_health_metrics() { // Refreshes the process-level gauges, then the system-level gauges.
+    scrape_process_health_metrics(); // PROCESS_* gauges
+    scrape_system_health_metrics(); // SYSTEM_*, CPU_*, DISK_* and NETWORK_* gauges
+}
+
+pub fn scrape_process_health_metrics() { // Updates the PROCESS_* gauges from a fresh `ProcessHealth` observation.
    // This will silently fail if we are unable to observe the health. This is desired behaviour
    // since we don't support `Health` for all platforms.
-    if let Ok(health) = Health::observe() {
+    if let Ok(health) = ProcessHealth::observe() { // on `Err` no gauge is touched; `ProcessHealth` replaces the former `Health` here
        set_gauge(&PROCESS_NUM_THREADS, health.pid_num_threads as i64);
        set_gauge(&PROCESS_RES_MEM, health.pid_mem_resident_set_size as i64);
        set_gauge(&PROCESS_VIRT_MEM, health.pid_mem_virtual_memory_size as i64);
+        set_gauge(&PROCESS_SECONDS, health.pid_process_seconds_total as i64);
+    }
+}
+
+pub fn scrape_system_health_metrics() {
+    // This will silently fail if we are unable to observe the health. This is desired behaviour
+    // since we don't support `SystemHealth` for all platforms.
+    if let Ok(health) = SystemHealth::observe() {
        set_gauge(&SYSTEM_VIRT_MEM_TOTAL, health.sys_virt_mem_total as i64);
        set_gauge(
            &SYSTEM_VIRT_MEM_AVAILABLE,
@@ -57,5 +112,34 @@ pub fn scrape_health_metrics() {
        set_float_gauge(&SYSTEM_LOADAVG_1, health.sys_loadavg_1);
        set_float_gauge(&SYSTEM_LOADAVG_5, health.sys_loadavg_5);
        set_float_gauge(&SYSTEM_LOADAVG_15, health.sys_loadavg_15);
+
+        set_gauge(&CPU_CORES, health.cpu_cores as i64);
+        set_gauge(&CPU_THREADS, health.cpu_threads as i64);
+
+        set_gauge(
+            &CPU_SYSTEM_SECONDS_TOTAL,
+            health.system_seconds_total as i64,
+        );
+        set_gauge(&CPU_USER_SECONDS_TOTAL, health.user_seconds_total as i64);
+        set_gauge(
+            &CPU_IOWAIT_SECONDS_TOTAL,
+            health.iowait_seconds_total as i64,
+        );
+        set_gauge(&CPU_IDLE_SECONDS_TOTAL, health.idle_seconds_total as i64);
+
+        set_gauge(&DISK_BYTES_TOTAL, health.disk_node_bytes_total as i64);
+
+        set_gauge(&DISK_BYTES_FREE, health.disk_node_bytes_free as i64);
+        set_gauge(&DISK_READS, health.disk_node_reads_total as i64);
+        set_gauge(&DISK_WRITES, health.disk_node_writes_total as i64);
+
+        set_gauge(
+            &NETWORK_BYTES_RECEIVED,
+            health.network_node_bytes_total_received as i64,
+        );
+        set_gauge(
+            &NETWORK_BYTES_SENT,
+            health.network_node_bytes_total_transmit as i64,
+        );
    }
 }