Add PHP Health Metrics (PHM) with /proc-based worker collection (#145)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 56532c4..d92b69c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -115,7 +115,10 @@
- name: Setup php-fpm for Linux
if: matrix.os == 'ubuntu-24.04'
run: |
- sudo apt-get update
+ sudo apt-get update
+ sudo apt-get install -y software-properties-common
+ sudo add-apt-repository -y ppa:ondrej/php
+ sudo apt-get update
sudo apt-get install -y php${{ matrix.flag.php_version }}-fpm
sudo ln -sf /usr/sbin/php-fpm${{ matrix.flag.php_version }} /usr/sbin/php-fpm
@@ -171,8 +174,18 @@
# Build mixture for cargo test.
- name: Docker compose
run: |
- docker compose up -d --wait
- docker compose ps
+ for i in 1 2 3; do
+ if docker compose up -d --wait && docker compose ps; then
+ break
+ fi
+ echo "docker compose up failed (attempt ${i}/3), retrying in 10s..."
+ docker compose down --remove-orphans 2>/dev/null || true
+ if [ "${i}" -lt 3 ]; then
+ sleep 10
+ else
+ exit 1
+ fi
+ done
# Try cargo test.
- name: Cargo test
diff --git a/Cargo.lock b/Cargo.lock
index 8d20824..5f6ac84 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2456,7 +2456,7 @@
[[package]]
name = "skywalking-php"
-version = "1.1.0"
+version = "1.2.0"
dependencies = [
"anyhow",
"axum 0.8.4",
@@ -2487,7 +2487,7 @@
[[package]]
name = "skywalking-php-worker"
-version = "1.1.0"
+version = "1.2.0"
dependencies = [
"anyhow",
"bincode",
diff --git a/Cargo.toml b/Cargo.toml
index 87cf48d..b5a6c0a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@
]
[workspace.package]
-version = "1.1.0"
+version = "1.2.0"
authors = ["Apache Software Foundation", "jmjoy <jmjoy@apache.org>", "Yanlong He <heyanlong@apache.org>"]
edition = "2024"
rust-version = "1.85"
diff --git a/README.md b/README.md
index a256adb..60c8415 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
<img src="http://skywalking.apache.org/assets/logo.svg" alt="Sky Walking logo" height="90px" align="right" />
-**SkyWalking PHP** The PHP Agent for Apache SkyWalking, which provides the native tracing abilities for PHP project.
+**SkyWalking PHP** The PHP Agent for Apache SkyWalking, which provides native tracing and PHP Health Metrics (PHM)
+runtime reporting for PHP projects.
**SkyWalking** an APM(application performance monitor) system, especially designed for
microservices, cloud native and container-based (Docker, Kubernetes, Mesos) architectures.
diff --git a/docs/en/configuration/ini-settings.md b/docs/en/configuration/ini-settings.md
index 214677e..c07daf5 100644
--- a/docs/en/configuration/ini-settings.md
+++ b/docs/en/configuration/ini-settings.md
@@ -27,3 +27,5 @@
| skywalking_agent.instance_name | Instance name. You can set `${HOSTNAME}`, refer to [Example #1](https://www.php.net/manual/en/install.fpm.configuration.php) | |
| skywalking_agent.standalone_socket_path | Unix domain socket file path of standalone skywalking php worker. Only available when `reporter_type` is `standalone`. | |
| skywalking_agent.psr_logging_level | The log level reported to SkyWalking, based on PSR-3, one of `Off`, `Debug`, `Info`, Notice`, Warning`, Error`, Critical`, Alert`, Emergency`. | Off |
+| skywalking_agent.metrics_enable | Enable PHP Health Metrics (PHM) meter reporting via native MeterReportService. **Linux only** (requires `/proc`). Default **On** on Linux when the agent is active; default **Off** on macOS/Windows. Set to `Off` to disable on Linux. Reports six process meters: CPU utilization, memory used/peak, virtual memory, thread count, and open FD count. See [PHP agent README](../setup/service-agent/php-agent/README.md#php-health-metrics-phm). | On (Linux); Off (other) |
+| skywalking_agent.metrics_report_period | PHM meter collection and report interval in seconds. Process meters are sampled by the forked reporter worker via `/proc` (parent PHP process PID). Values below 1 are treated as 1. **Linux only.** | 30 |
diff --git a/docs/en/setup/service-agent/php-agent/README.md b/docs/en/setup/service-agent/php-agent/README.md
index 5464177..cc04f11 100644
--- a/docs/en/setup/service-agent/php-agent/README.md
+++ b/docs/en/setup/service-agent/php-agent/README.md
@@ -113,6 +113,48 @@
> Enabling it by default will cause extra meaningless consumption when skywalking agent is not
> needed (such as simply executing a php script).
+### PHP Health Metrics (PHM)
+
+> **Platform:** PHM process meters are **Linux only**. The forked reporter worker reads the
+> parent PHP process via `/proc` (`/proc/{pid}/status`, `stat`, and `fd`). They are not available
+> on macOS or Windows. Trace and other agent features are unchanged.
+
+When `reporter_type` is `grpc` or `kafka`, the forked reporter worker boots
+`skywalking::metrics::Metricer` in `start_worker`, alongside heartbeat reporting. A background
+collector samples `/proc` for the parent PHP process (`getppid()`), updates Gauges, and `Metricer`
+reports meter data to OAP through the same path as traces and logs. PHM does not run when
+`reporter_type = standalone`.
+
+PHM reports PHP runtime meters through the native Meter protocol (MeterReportService), without
+requiring HTTP traffic, similar to Python PVM and Ruby runtime meters.
+**PHM is enabled by default on Linux** when the agent is active (`skywalking_agent.enable = On`).
+To disable it or tune the interval, use `php.ini`:
+
+```ini
+; Disable PHM if not needed (default is On on Linux).
+; skywalking_agent.metrics_enable = Off
+
+; Report interval in seconds (default 30).
+skywalking_agent.metrics_report_period = 30
+```
+
+PHM reports six process meters (aligned with OAP `php-runtime.yaml` and Horizon UI widgets):
+
+| Agent meter name | OAP / UI expression | Source |
+| --- | --- | --- |
+| `instance_php_process_cpu_utilization` | `meter_instance_php_process_cpu_utilization` | `/proc/{pid}/stat` utime+stime delta |
+| `instance_php_memory_used_mb` | `meter_instance_php_memory_used_mb` | `/proc/{pid}/status` VmRSS |
+| `instance_php_memory_peak_mb` | `meter_instance_php_memory_peak_mb` | `/proc/{pid}/status` VmHWM |
+| `instance_php_virtual_memory_mb` | `meter_instance_php_virtual_memory_mb` | `/proc/{pid}/status` VmSize |
+| `instance_php_thread_count` | `meter_instance_php_thread_count` | `/proc/{pid}/status` Threads |
+| `instance_php_open_fd_count` | `meter_instance_php_open_fd_count` | `/proc/{pid}/fd` count |
+
+On the OAP side, activate the `php-runtime` entry in
+`agent-analyzer.default.meterAnalyzerActiveFiles`. Horizon UI shows the widgets on the **General
+Service → Instance** dashboard when data is available.
+
+See [INI Settings](../../../configuration/ini-settings.md) for all PHM options.
+
## Run
Start `php-fpm` server:
diff --git a/src/lib.rs b/src/lib.rs
index 1c6736f..c813268 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -118,6 +118,14 @@
/// `Info`, Notice`, Warning`, Error`, Critical`, Alert`, Emergency`.
const SKYWALKING_AGENT_PSR_LOGGING_LEVEL: &str = "skywalking_agent.psr_logging_level";
+/// Whether to report PHP Health Metrics (PHM) via the native meter protocol.
+/// Defaults to **On** on Linux, where the meters are sampled from `/proc`,
+/// and to **Off** on every other platform.
+const SKYWALKING_AGENT_METRICS_ENABLE: &str = "skywalking_agent.metrics_enable";
+
+/// PHM report period in seconds (default 30). The worker samples `/proc` once
+/// per period and clamps values below 1 to 1 second.
+const SKYWALKING_AGENT_METRICS_REPORT_PERIOD: &str = "skywalking_agent.metrics_report_period";
+
#[php_get_module]
pub fn get_module() -> Module {
let mut module = Module::new(
@@ -214,6 +222,15 @@
"".to_string(),
Policy::System,
);
+ #[cfg(target_os = "linux")]
+ module.add_ini(SKYWALKING_AGENT_METRICS_ENABLE, true, Policy::System);
+ #[cfg(not(target_os = "linux"))]
+ module.add_ini(SKYWALKING_AGENT_METRICS_ENABLE, false, Policy::System);
+ module.add_ini(
+ SKYWALKING_AGENT_METRICS_REPORT_PERIOD,
+ 30i64,
+ Policy::System,
+ );
// Hooks.
module.on_module_init(module::init);
diff --git a/src/module.rs b/src/module.rs
index c9eea05..be80743 100644
--- a/src/module.rs
+++ b/src/module.rs
@@ -167,6 +167,12 @@
.into()
});
+/// Lazily read value of the `skywalking_agent.metrics_enable` INI setting.
+pub static METRICS_ENABLE: Lazy<bool> =
+ Lazy::new(|| ini_get::<bool>(SKYWALKING_AGENT_METRICS_ENABLE));
+
+/// Lazily read value of the `skywalking_agent.metrics_report_period` INI
+/// setting, in seconds.
+pub static METRICS_REPORT_PERIOD: Lazy<i64> =
+ Lazy::new(|| ini_get::<i64>(SKYWALKING_AGENT_METRICS_REPORT_PERIOD));
+
pub fn init() {
if !is_enable() {
return;
@@ -193,6 +199,8 @@
Lazy::force(&KAFKA_PRODUCER_CONFIG);
Lazy::force(&INJECT_CONTEXT);
Lazy::force(&PSR_LOGGING_LEVEL);
+ Lazy::force(&METRICS_ENABLE);
+ Lazy::force(&METRICS_REPORT_PERIOD);
if let Err(err) = try_init_logger() {
eprintln!("skywalking_agent: initialize logger failed: {}", err);
@@ -228,7 +236,6 @@
return;
}
- // Initialize Agent worker.
init_worker();
let reporter = Arc::new(Reporter::new(&*SOCKET_FILE_PATH));
@@ -239,7 +246,11 @@
reporter.clone(),
));
- logger::set_global_logger(Logger::new(&*SERVICE_NAME, &*SERVICE_INSTANCE, reporter));
+ logger::set_global_logger(Logger::new(
+ &*SERVICE_NAME,
+ &*SERVICE_INSTANCE,
+ reporter.clone(),
+ ));
// Hook functions.
register_execute_functions();
diff --git a/src/worker.rs b/src/worker.rs
index c01f7c5..99036e8 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -14,9 +14,10 @@
// limitations under the License.
use crate::module::{
- AUTHENTICATION, ENABLE_TLS, HEARTBEAT_PERIOD, PROPERTIES_REPORT_PERIOD_FACTOR, REPORTER_TYPE,
- SERVER_ADDR, SERVICE_INSTANCE, SERVICE_NAME, SOCKET_FILE_PATH, SSL_CERT_CHAIN_PATH,
- SSL_KEY_PATH, SSL_TRUSTED_CA_PATH, WORKER_THREADS, is_standalone_reporter_type,
+ AUTHENTICATION, ENABLE_TLS, HEARTBEAT_PERIOD, METRICS_ENABLE, METRICS_REPORT_PERIOD,
+ PROPERTIES_REPORT_PERIOD_FACTOR, REPORTER_TYPE, SERVER_ADDR, SERVICE_INSTANCE, SERVICE_NAME,
+ SOCKET_FILE_PATH, SSL_CERT_CHAIN_PATH, SSL_KEY_PATH, SSL_TRUSTED_CA_PATH, WORKER_THREADS,
+ is_standalone_reporter_type,
};
#[cfg(feature = "kafka-reporter")]
use crate::module::{KAFKA_BOOTSTRAP_SERVERS, KAFKA_PRODUCER_CONFIG};
@@ -24,6 +25,7 @@
use skywalking_php_worker::reporter::KafkaReporterConfiguration;
use skywalking_php_worker::{
HeartBeatConfiguration, WorkerConfiguration, new_tokio_runtime,
+ phm::PhmConfiguration,
reporter::{GrpcReporterConfiguration, ReporterConfiguration},
start_worker,
};
@@ -78,6 +80,7 @@
heartbeat_period: *HEARTBEAT_PERIOD,
properties_report_period_factor: *PROPERTIES_REPORT_PERIOD_FACTOR,
}),
+ phm: phm_configuration(),
reporter_config,
};
@@ -106,3 +109,20 @@
worker_threads as usize
}
}
+
+/// Builds the PHM settings handed to the worker on Linux.
+///
+/// Yields `None` when `skywalking_agent.metrics_enable` is off; otherwise the
+/// service identity together with the configured report period.
+#[cfg(target_os = "linux")]
+fn phm_configuration() -> Option<PhmConfiguration> {
+    (*METRICS_ENABLE).then(|| PhmConfiguration {
+        service_name: SERVICE_NAME.clone(),
+        service_instance: SERVICE_INSTANCE.clone(),
+        report_period_secs: *METRICS_REPORT_PERIOD,
+    })
+}
+
+/// PHM is never configured on non-Linux targets.
+#[cfg(not(target_os = "linux"))]
+fn phm_configuration() -> Option<PhmConfiguration> {
+    None
+}
diff --git a/tests/common/mod.rs b/tests/common/mod.rs
index 8083027..db2d95e 100644
--- a/tests/common/mod.rs
+++ b/tests/common/mod.rs
@@ -23,17 +23,10 @@
routing::any,
};
use futures_util::future::join_all;
-use libc::{SIGTERM, kill, pid_t};
+use libc::{SIGKILL, SIGTERM, kill, pid_t};
use once_cell::sync::Lazy;
use std::{
- env,
- fs::File,
- io::{self, Cursor},
- net::SocketAddr,
- process::{ExitStatus, Stdio},
- sync::Arc,
- thread,
- time::Duration,
+ env, fs::File, io::Cursor, net::SocketAddr, process::Stdio, sync::Arc, thread, time::Duration,
};
use tokio::{
net::TcpStream,
@@ -112,16 +105,13 @@
fixture.http_server_1_handle.abort();
fixture.http_server_2_handle.abort();
- let results = join_all([
- kill_command(fixture.php_fpm_1_child),
- kill_command(fixture.php_fpm_2_child),
- kill_command(fixture.php_swoole_1_child),
- kill_command(fixture.php_swoole_2_child),
+ join_all([
+ stop_child(fixture.php_fpm_1_child),
+ stop_child(fixture.php_fpm_2_child),
+ stop_child(fixture.php_swoole_1_child),
+ stop_child(fixture.php_swoole_2_child),
])
.await;
- for result in results {
- assert!(result.unwrap().success());
- }
}
fn setup_logging() {
@@ -319,8 +309,22 @@
"-d",
"skywalking_agent.psr_logging_level=Warning",
];
+ let mut args: Vec<String> = args.iter().map(|s| (*s).to_string()).collect();
+ if index == 1 {
+ args.extend([
+ "-d".to_owned(),
+ "skywalking_agent.metrics_enable=On".to_owned(),
+ "-d".to_owned(),
+ "skywalking_agent.metrics_report_period=5".to_owned(),
+ ]);
+ } else {
+ args.extend([
+ "-d".to_owned(),
+ "skywalking_agent.metrics_enable=Off".to_owned(),
+ ]);
+ }
info!(cmd = args.join(" "), "start command");
- let child = Command::new(args[0])
+ let child = Command::new(&args[0])
.args(&args[1..])
.stdin(Stdio::null())
.stdout(File::create("/tmp/fpm-skywalking-stdout.log").unwrap())
@@ -364,6 +368,8 @@
"skywalking_agent.enable_zend_observer={}",
*ENABLE_ZEND_OBSERVER
),
+ "-d",
+ "skywalking_agent.metrics_enable=Off",
&format!("tests/php/swoole/main.{}.php", index),
];
info!(cmd = args.join(" "), "start command");
@@ -378,11 +384,21 @@
child
}
-async fn kill_command(mut child: Child) -> io::Result<ExitStatus> {
- if let Some(id) = child.id() {
- unsafe {
- kill(id as pid_t, SIGTERM);
+async fn stop_child(mut child: Child) {
+ let Some(id) = child.id() else {
+ let _ = child.wait().await;
+ return;
+ };
+ unsafe {
+ kill(id as pid_t, SIGTERM);
+ }
+ match tokio::time::timeout(Duration::from_secs(5), child.wait()).await {
+ Ok(Ok(_)) | Ok(Err(_)) => {}
+ Err(_) => {
+ unsafe {
+ kill(id as pid_t, SIGKILL);
+ }
+ let _ = child.wait().await;
}
}
- child.wait().await
}
diff --git a/tests/data/expected_context.yaml b/tests/data/expected_context.yaml
index d67e37b..40fbc20 100644
--- a/tests/data/expected_context.yaml
+++ b/tests/data/expected_context.yaml
@@ -2059,3 +2059,31 @@
- {key: bar, value: 'false'}
- {key: baz, value: test}
layer: ''
+meterItems:
+ - serviceName: skywalking-agent-test-1
+ meterSize: ge 6
+ meters:
+ - meterId:
+ name: instance_php_process_cpu_utilization
+ tags: []
+ singleValue: ge 0
+ - meterId:
+ name: instance_php_memory_used_mb
+ tags: []
+ singleValue: ge 0
+ - meterId:
+ name: instance_php_memory_peak_mb
+ tags: []
+ singleValue: ge 0
+ - meterId:
+ name: instance_php_virtual_memory_mb
+ tags: []
+ singleValue: ge 0
+ - meterId:
+ name: instance_php_thread_count
+ tags: []
+ singleValue: ge 1
+ - meterId:
+ name: instance_php_open_fd_count
+ tags: []
+ singleValue: ge 1
diff --git a/tests/e2e.rs b/tests/e2e.rs
index 97ba724..a76ce6c 100644
--- a/tests/e2e.rs
+++ b/tests/e2e.rs
@@ -74,7 +74,8 @@
request_swoole_2_predis().await;
request_swoole_2_mongodb().await;
request_swoole_2_memcache().await;
- sleep(Duration::from_secs(3)).await;
+ // Wait for PHM meter report (metrics_report_period=5s on FPM test-1).
+ sleep(Duration::from_secs(8)).await;
request_collector_validate().await;
}
diff --git a/worker/src/lib.rs b/worker/src/lib.rs
index 45df23f..6d12f80 100644
--- a/worker/src/lib.rs
+++ b/worker/src/lib.rs
@@ -14,10 +14,12 @@
// limitations under the License.
pub mod channel;
+pub mod phm;
pub mod reporter;
use crate::{
channel::TxReporter,
+ phm::{PhmConfiguration, boot_phm_metrics},
reporter::{ReporterConfiguration, run_reporter},
};
use skywalking::{
@@ -44,6 +46,7 @@
pub struct WorkerConfiguration {
pub socket_file_path: PathBuf,
pub heart_beat: Option<HeartBeatConfiguration>,
+ pub phm: Option<PhmConfiguration>,
pub reporter_config: ReporterConfiguration,
}
@@ -126,9 +129,13 @@
});
if let Some(heart_beat_config) = config.heart_beat {
- report_properties_and_keep_alive(heart_beat_config, TxReporter(tx_));
+ report_properties_and_keep_alive(heart_beat_config, TxReporter(tx_.clone()));
}
+ let _phm_booting = config
+ .phm
+ .map(|phm_config| boot_phm_metrics(phm_config, TxReporter(tx_.clone())));
+
// Run reporter with blocking.
run_reporter(config.reporter_config, (), Consumer(rx)).await?;
diff --git a/worker/src/phm.rs b/worker/src/phm.rs
new file mode 100644
index 0000000..dc9625c
--- /dev/null
+++ b/worker/src/phm.rs
@@ -0,0 +1,296 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Periodic PHM meter collection in the forked reporter worker. Samples the
+//! parent PHP process via `/proc` and reports through `skywalking::metrics`
+//! `Metricer`, booted from `start_worker` alongside heartbeat reporting.
+
+use crate::channel::TxReporter;
+use skywalking::metrics::{
+ meter::Gauge,
+ metricer::{Booting, Metricer},
+};
+use std::{
+ fs,
+ sync::{
+ Arc,
+ atomic::{AtomicU64, Ordering},
+ },
+ time::{Duration, SystemTime, UNIX_EPOCH},
+};
+use tracing::{debug, trace, warn};
+
+// Meter names registered on the `Metricer` (see `register_gauges`).
+const METRIC_PROCESS_CPU: &str = "instance_php_process_cpu_utilization";
+// Fallback clock ticks per second, used by `cpu_percent` when
+// `sysconf(_SC_CLK_TCK)` does not return a positive value.
+const DEFAULT_CLK_TCK: f64 = 100.0;
+const METRIC_MEMORY_USED_MB: &str = "instance_php_memory_used_mb";
+const METRIC_MEMORY_PEAK_MB: &str = "instance_php_memory_peak_mb";
+const METRIC_THREAD_COUNT: &str = "instance_php_thread_count";
+const METRIC_VIRTUAL_MEMORY_MB: &str = "instance_php_virtual_memory_mb";
+const METRIC_OPEN_FD_COUNT: &str = "instance_php_open_fd_count";
+
+/// Settings needed to boot PHM reporting inside the worker.
+#[derive(Clone)]
+pub struct PhmConfiguration {
+    /// Service name passed to the `Metricer`.
+    pub service_name: String,
+    /// Service instance name passed to the `Metricer`.
+    pub service_instance: String,
+    /// Sampling and report period in seconds; values below 1 are clamped to 1.
+    pub report_period_secs: i64,
+}
+
+/// Subset of [`PhmConfiguration`] consumed by the background collector task.
+#[derive(Clone)]
+struct PhmCollectorConfiguration {
+    /// Interval between `/proc` samples in seconds; clamped to at least 1.
+    report_period_secs: i64,
+}
+
+/// Latest sampled value of every PHM meter, shared between the collector task
+/// (writer) and the registered gauges (readers).
+///
+/// Each cell stores the bit pattern of an `f64` inside an `AtomicU64`, so the
+/// default value (all zero bits) reads back as `0.0`.
+#[derive(Clone, Default)]
+pub struct PhmSamples {
+    /// `VmRSS` from `/proc/{pid}/status`, divided by 1024.
+    memory_used_mb: Arc<AtomicU64>,
+    /// `VmHWM` from `/proc/{pid}/status`, divided by 1024.
+    memory_peak_mb: Arc<AtomicU64>,
+    /// `VmSize` from `/proc/{pid}/status`, divided by 1024.
+    virtual_memory_mb: Arc<AtomicU64>,
+    /// `Threads` from `/proc/{pid}/status`.
+    thread_count: Arc<AtomicU64>,
+    /// Number of entries in `/proc/{pid}/fd`.
+    open_fd_count: Arc<AtomicU64>,
+    /// CPU utilization in percent, computed between two consecutive samples.
+    process_cpu: Arc<AtomicU64>,
+}
+
+impl PhmSamples {
+    /// Writes `value` into `cell` as its raw `f64` bit pattern.
+    fn store(cell: &AtomicU64, value: f64) {
+        cell.store(value.to_bits(), Ordering::Relaxed);
+    }
+
+    /// Returns a getter closure that decodes the `f64` currently held in
+    /// `cell`; used as the value source of a `Gauge`.
+    fn gauge(cell: Arc<AtomicU64>) -> impl Fn() -> f64 + Send + Sync + 'static {
+        move || f64::from_bits(cell.load(Ordering::Relaxed))
+    }
+}
+
+/// Registers one [`Gauge`] per PHM meter on `metricer`; every gauge reads the
+/// matching cell of `samples`.
+pub fn register_gauges(metricer: &mut Metricer, samples: PhmSamples) {
+    let PhmSamples {
+        memory_used_mb,
+        memory_peak_mb,
+        virtual_memory_mb,
+        thread_count,
+        open_fd_count,
+        process_cpu,
+    } = samples;
+
+    // Registration order: memory meters, threads, open FDs, then CPU.
+    let meters = [
+        (METRIC_MEMORY_USED_MB, memory_used_mb),
+        (METRIC_MEMORY_PEAK_MB, memory_peak_mb),
+        (METRIC_VIRTUAL_MEMORY_MB, virtual_memory_mb),
+        (METRIC_THREAD_COUNT, thread_count),
+        (METRIC_OPEN_FD_COUNT, open_fd_count),
+        (METRIC_PROCESS_CPU, process_cpu),
+    ];
+    for (name, cell) in meters {
+        metricer.register(Gauge::new(name, PhmSamples::gauge(cell)));
+    }
+}
+
+/// Previous CPU reading, kept between samples so a utilization delta can be
+/// computed.
+struct CpuStatSample {
+    /// `utime` value read from `/proc/{pid}/stat`.
+    utime: u64,
+    /// `stime` value read from `/proc/{pid}/stat`.
+    stime: u64,
+    /// Wall-clock time of the reading, in milliseconds since the Unix epoch.
+    wall_ms: u128,
+}
+
+/// Samples `/proc` for the parent process and stores the readings in `samples`.
+///
+/// `cpu_sample` carries the previous CPU reading between calls: the first call
+/// only records a baseline, later calls store the utilization since that
+/// baseline. A reading that cannot be obtained leaves the previously stored
+/// value untouched.
+///
+/// Returns `Some(pid)` of the sampled process, or `None` when `/proc/{pid}` no
+/// longer exists.
+fn update_samples(samples: &PhmSamples, cpu_sample: &mut Option<CpuStatSample>) -> Option<i32> {
+    // SAFETY: `getppid` takes no arguments and touches no caller-owned memory.
+    // NOTE(review): the parent PID is re-read on every call; if the PHP process
+    // exits and the worker is re-parented, this would presumably sample the
+    // new parent instead of reporting the target as gone -- confirm.
+    let pid = unsafe { libc::getppid() as i32 };
+    if !process_alive(pid) {
+        warn!(pid, "PHM target PHP process is gone, skip sample");
+        return None;
+    }
+
+    // Memory and thread meters come from `/proc/{pid}/status`.
+    let status = read_proc_status(pid);
+    if let Some(mb) = status.vm_rss_mb {
+        PhmSamples::store(&samples.memory_used_mb, mb);
+    }
+    if let Some(mb) = status.vm_hwm_mb {
+        PhmSamples::store(&samples.memory_peak_mb, mb);
+    }
+    if let Some(mb) = status.vm_size_mb {
+        PhmSamples::store(&samples.virtual_memory_mb, mb);
+    }
+    if let Some(count) = status.threads {
+        PhmSamples::store(&samples.thread_count, count as f64);
+    }
+    if let Some(count) = read_open_fd_count(pid) {
+        PhmSamples::store(&samples.open_fd_count, count);
+    }
+    if let Some((utime, stime)) = read_proc_stat_cpu(pid) {
+        let now_ms = current_time_millis();
+        let cpu = match cpu_sample {
+            // First reading: remember it as the baseline, nothing to report yet.
+            None => {
+                *cpu_sample = Some(CpuStatSample {
+                    utime,
+                    stime,
+                    wall_ms: now_ms,
+                });
+                None
+            }
+            // Later readings: utilization since the previous reading, which is
+            // then replaced by the current one.
+            Some(sample) => {
+                let delta_jiffies =
+                    utime.saturating_sub(sample.utime) + stime.saturating_sub(sample.stime);
+                let delta_wall_ms = now_ms.saturating_sub(sample.wall_ms);
+                sample.utime = utime;
+                sample.stime = stime;
+                sample.wall_ms = now_ms;
+                Some(cpu_percent(delta_jiffies, delta_wall_ms))
+            }
+        };
+        if let Some(cpu) = cpu {
+            trace!(pid, cpu, "update PHM process CPU sample");
+            PhmSamples::store(&samples.process_cpu, cpu);
+        }
+    } else {
+        warn!(pid, "failed to read /proc stat for PHM CPU sampling");
+    }
+    debug!(pid, "PHM proc samples updated");
+    Some(pid)
+}
+
+/// Takes a single sample ahead of `Metricer::boot()` so the first report is
+/// not all zeros.
+pub fn warmup_samples(samples: &PhmSamples) {
+    // Throwaway CPU baseline: it is dropped as soon as this call returns.
+    let mut throwaway_baseline: Option<CpuStatSample> = None;
+    let _ = update_samples(samples, &mut throwaway_baseline);
+}
+
+/// Boots PHM reporting: takes a warm-up sample, spawns the background `/proc`
+/// collector, registers the gauges and boots the `Metricer`.
+///
+/// The report interval is `config.report_period_secs`, clamped to at least one
+/// second. The collector is started with `tokio::spawn`.
+pub fn boot_phm_metrics(config: PhmConfiguration, reporter: TxReporter) -> Booting {
+    let PhmConfiguration {
+        service_name,
+        service_instance,
+        report_period_secs,
+    } = config;
+    let shared = PhmSamples::default();
+
+    warmup_samples(&shared);
+    run_phm_collector(
+        PhmCollectorConfiguration { report_period_secs },
+        shared.clone(),
+    );
+
+    let mut metricer = Metricer::new(service_name, service_instance, reporter);
+    metricer.set_report_interval(Duration::from_secs(report_period_secs.max(1) as u64));
+    register_gauges(&mut metricer, shared);
+    metricer.boot()
+}
+
+/// Spawns the background task that refreshes `samples` once per period.
+///
+/// The task samples immediately, then sleeps for the period (clamped to at
+/// least one second) between samples, and ends once `update_samples` returns
+/// `None`.
+fn run_phm_collector(config: PhmCollectorConfiguration, samples: PhmSamples) {
+    let interval = Duration::from_secs(config.report_period_secs.max(1) as u64);
+    tokio::spawn(async move {
+        let mut previous_cpu: Option<CpuStatSample> = None;
+        while update_samples(&samples, &mut previous_cpu).is_some() {
+            tokio::time::sleep(interval).await;
+        }
+    });
+}
+
+/// Reports whether metadata for `/proc/{pid}` can be read.
+fn process_alive(pid: i32) -> bool {
+    let proc_dir = format!("/proc/{pid}");
+    fs::metadata(proc_dir).is_ok()
+}
+
+/// Fields of interest parsed from `/proc/{pid}/status`. A field is `None` when
+/// it was not found or could not be parsed.
+#[derive(Default)]
+struct ProcStatusFields {
+    /// `VmRSS` value divided by 1024.
+    vm_rss_mb: Option<f64>,
+    /// `VmHWM` value divided by 1024.
+    vm_hwm_mb: Option<f64>,
+    /// `VmSize` value divided by 1024.
+    vm_size_mb: Option<f64>,
+    /// `Threads` value.
+    threads: Option<u64>,
+}
+
+/// Parses `VmRSS`, `VmHWM`, `VmSize` and `Threads` out of `/proc/{pid}/status`.
+///
+/// Returns all-`None` fields when the file cannot be read. Scanning stops as
+/// soon as every field has been found; the first match of each key wins.
+fn read_proc_status(pid: i32) -> ProcStatusFields {
+    let mut parsed = ProcStatusFields::default();
+    let Ok(status_text) = fs::read_to_string(format!("/proc/{pid}/status")) else {
+        return parsed;
+    };
+    for line in status_text.lines() {
+        parsed.vm_rss_mb = parsed
+            .vm_rss_mb
+            .or_else(|| parse_status_kib_line(line, "VmRSS"));
+        parsed.vm_hwm_mb = parsed
+            .vm_hwm_mb
+            .or_else(|| parse_status_kib_line(line, "VmHWM"));
+        parsed.vm_size_mb = parsed
+            .vm_size_mb
+            .or_else(|| parse_status_kib_line(line, "VmSize"));
+        parsed.threads = parsed
+            .threads
+            .or_else(|| parse_status_count_line(line, "Threads"));
+
+        let all_found = parsed.vm_rss_mb.is_some()
+            && parsed.vm_hwm_mb.is_some()
+            && parsed.vm_size_mb.is_some()
+            && parsed.threads.is_some();
+        if all_found {
+            break;
+        }
+    }
+    parsed
+}
+
+/// Parses a `"{key}: <number> ..."` status line and returns the number divided
+/// by 1024.
+///
+/// Returns `None` when `line` does not start with `"{key}:"` or when its second
+/// whitespace-separated token is not a number.
+fn parse_status_kib_line(line: &str, key: &str) -> Option<f64> {
+    // Same test as `starts_with("{key}:")`, without building the prefix string.
+    let has_key = line
+        .strip_prefix(key)
+        .is_some_and(|rest| rest.starts_with(':'));
+    if !has_key {
+        return None;
+    }
+    let kib = line.split_whitespace().nth(1)?.parse::<f64>().ok()?;
+    Some(kib / 1024.0)
+}
+
+/// Parses a `"{key}: <integer> ..."` status line and returns the integer.
+///
+/// Returns `None` when `line` does not start with `"{key}:"` or when its second
+/// whitespace-separated token is not an unsigned integer.
+fn parse_status_count_line(line: &str, key: &str) -> Option<u64> {
+    match line.strip_prefix(key) {
+        // Same test as `starts_with("{key}:")`, without building the prefix.
+        Some(rest) if rest.starts_with(':') => line.split_whitespace().nth(1)?.parse().ok(),
+        _ => None,
+    }
+}
+
+/// Counts the readable entries of `/proc/{pid}/fd`.
+///
+/// Returns `None` when the directory cannot be opened; entries that fail to
+/// read are not counted.
+fn read_open_fd_count(pid: i32) -> Option<f64> {
+    let entries = fs::read_dir(format!("/proc/{pid}/fd")).ok()?;
+    let open_fds = entries.flatten().count();
+    Some(open_fds as f64)
+}
+
+/// Reads the cumulative `utime` and `stime` counters of `pid` from
+/// `/proc/{pid}/stat`.
+///
+/// Returns `None` when the file cannot be read, is truncated, or the two
+/// fields are not unsigned integers.
+fn read_proc_stat_cpu(pid: i32) -> Option<(u64, u64)> {
+    let content = fs::read_to_string(format!("/proc/{pid}/stat")).ok()?;
+    // The command name (field 2) is wrapped in parentheses and may itself
+    // contain spaces or `)`, so parsing resumes after the last `)`.
+    let rparen = content.rfind(')')?;
+    // Checked slicing: a line that ends right after `)` yields `None` instead
+    // of panicking on an out-of-range index.
+    let mut fields = content.get(rparen + 2..)?.split_whitespace();
+    // The first token after the command name is field 3, so `utime` (field 14)
+    // sits at offset 11 and `stime` (field 15) directly follows it.
+    let utime: u64 = fields.nth(11)?.parse().ok()?;
+    let stime: u64 = fields.next()?.parse().ok()?;
+    Some((utime, stime))
+}
+
+/// Converts a CPU tick delta over a wall-clock delta into a percentage
+/// (CPU seconds / wall seconds * 100).
+///
+/// Returns `0.0` when no wall-clock time has elapsed. Ticks are converted with
+/// `sysconf(_SC_CLK_TCK)`, falling back to `DEFAULT_CLK_TCK` when that call
+/// does not return a positive value.
+fn cpu_percent(delta_jiffies: u64, delta_wall_ms: u128) -> f64 {
+    if delta_wall_ms == 0 {
+        return 0.0;
+    }
+    let clk_tck = unsafe { libc::sysconf(libc::_SC_CLK_TCK) };
+    let ticks_per_sec = match clk_tck {
+        ticks if ticks > 0 => ticks as f64,
+        _ => {
+            warn!(
+                clk_tck,
+                "sysconf(_SC_CLK_TCK) unavailable, using default {DEFAULT_CLK_TCK}"
+            );
+            DEFAULT_CLK_TCK
+        }
+    };
+    let cpu_sec = delta_jiffies as f64 / ticks_per_sec;
+    let wall_sec = delta_wall_ms as f64 / 1000.0;
+    cpu_sec / wall_sec * 100.0
+}
+
+/// Returns the current wall-clock time in milliseconds since the Unix epoch,
+/// or `0` when the system clock is set before the epoch.
+fn current_time_millis() -> u128 {
+    match SystemTime::now().duration_since(UNIX_EPOCH) {
+        Ok(since_epoch) => since_epoch.as_millis(),
+        Err(_) => 0,
+    }
+}