iota_metrics/
hardware_metrics.rs

1// Copyright (c) 2025 IOTA Stiftung
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{
5    collections::HashMap,
6    path::{Path, PathBuf},
7    sync::{Arc, Mutex},
8};
9
10use prometheus::{
11    IntGauge, Opts, Registry,
12    core::{Collector, Desc, Number},
13    proto::{LabelPair, Metric, MetricFamily, MetricType},
14};
15use sysinfo::{CpuRefreshKind, Disk, Disks, MemoryRefreshKind, RefreshKind, System};
16
17use crate::RegistryService;
18
19#[derive(thiserror::Error, Debug)]
20pub enum HardwareMetricsErr {
21    #[error("Failed creating metric: {0}")]
22    ErrCreateMetric(prometheus::Error),
23    #[error("Failed registering hardware metrics onto RegistryService: {0}")]
24    ErrRegisterHardwareMetrics(prometheus::Error),
25}
26
27/// Register all hardware matrics: CPU specs, Memory specs/usage, Disk
28/// specs/usage
29/// These metrics are all named with a prefix "hw_"
30/// They are both pushed to iota-proxy and exposed on the /metrics endpoint.
31pub fn register_hardware_metrics(
32    registry_service: &RegistryService,
33    db_path: &Path,
34) -> Result<(), HardwareMetricsErr> {
35    let registry = Registry::new_custom(Some("hw".to_string()), None)
36        .map_err(HardwareMetricsErr::ErrRegisterHardwareMetrics)?;
37    registry
38        .register(Box::new(HardwareMetrics::new(db_path)?))
39        .map_err(HardwareMetricsErr::ErrRegisterHardwareMetrics)?;
40    registry_service.add(registry);
41    Ok(())
42}
43
44pub struct HardwareMetrics {
45    system: Arc<Mutex<System>>,
46    disks: Arc<Mutex<Disks>>,
47    // Descriptions for the static metrics
48    pub static_descriptions: Vec<Desc>,
49    // Static metrics contain metrics that are not expected to change during runtime
50    // e.g. CPU model, memory total, disk total, etc.
51    pub static_metric_families: Vec<MetricFamily>,
52    pub memory_available_collector: IntGauge,
53    // Path where the database is mounted (to identify which disk contains the DB)
54    pub db_path: PathBuf,
55}
56
57impl HardwareMetrics {
58    pub fn new(db_path: &Path) -> Result<Self, HardwareMetricsErr> {
59        let mut system = System::new_with_specifics(
60            RefreshKind::nothing()
61                .with_cpu(CpuRefreshKind::nothing())
62                .with_memory(MemoryRefreshKind::nothing().with_ram()),
63        );
64        system.refresh_all();
65
66        let disks = Disks::new_with_refreshed_list();
67
68        Ok(Self {
69            static_descriptions: Self::static_descriptions(&system, &disks, db_path)?,
70            static_metric_families: Self::static_metric_families(&system, &disks, db_path)?,
71            memory_available_collector: Self::memory_available_collector()?,
72            system: Arc::new(Mutex::new(system)),
73            disks: Arc::new(Mutex::new(disks)),
74            db_path: PathBuf::from(db_path),
75        })
76    }
77
78    pub fn static_descriptions(
79        system: &System,
80        disks: &Disks,
81        db_path: &Path,
82    ) -> Result<Vec<Desc>, HardwareMetricsErr> {
83        let mut descs: Vec<Desc> = Vec::new();
84        for mf in Self::static_metric_families(system, disks, db_path)? {
85            descs.push(Self::metric_family_desc(&mf)?);
86        }
87        Ok(descs)
88    }
89
90    pub fn static_metric_families(
91        system: &System,
92        disks: &Disks,
93        db_path: &Path,
94    ) -> Result<Vec<MetricFamily>, HardwareMetricsErr> {
95        let mut mfs = Vec::new();
96        mfs.push(Self::collect_cpu_specs(system));
97        mfs.extend(Self::memory_total_collector(system)?.collect());
98        for mf in Self::collect_disks_total_bytes(disks, db_path) {
99            mfs.push(mf);
100        }
101        Ok(mfs)
102    }
103
104    fn label(name: &str, value: impl ToString) -> LabelPair {
105        let mut label = LabelPair::new();
106        label.set_name(name.to_string());
107        label.set_value(value.to_string());
108        label
109    }
110
111    fn uint_gauge(
112        name: &str,
113        help: &str,
114        value: u64,
115        labels: &[Option<LabelPair>],
116    ) -> MetricFamily {
117        let mut g = prometheus::proto::Gauge::default();
118        let mut m = Metric::default();
119        let mut mf = MetricFamily::new();
120
121        g.set_value(value.into_f64());
122        m.set_gauge(g);
123        m.set_label(
124            labels
125                .iter()
126                .filter_map(|opt| opt.as_ref())
127                .cloned()
128                .collect::<Vec<_>>(),
129        );
130
131        mf.mut_metric().push(m);
132        mf.set_name(name.to_string());
133        mf.set_help(help.to_string());
134        mf.set_field_type(MetricType::GAUGE);
135        mf
136    }
137
138    fn metric_family_desc(fam: &MetricFamily) -> Result<Desc, HardwareMetricsErr> {
139        Desc::new(
140            fam.name().to_string(),
141            fam.help().to_string(),
142            vec![],
143            HashMap::new(),
144        )
145        .map_err(HardwareMetricsErr::ErrCreateMetric)
146    }
147
148    fn cpu_vendor_id(system: &System) -> String {
149        let vendor_id = system
150            .cpus()
151            .first()
152            .map_or("cpu_vendor_id_unavailable", |cpu| cpu.vendor_id());
153        match vendor_id {
154            "" => "cpu_vendor_id_unavailable",
155            _ => vendor_id,
156        }
157        .to_string()
158    }
159
160    fn cpu_model(system: &System) -> String {
161        let brand = system
162            .cpus()
163            .first()
164            .map_or("cpu_model_unavailable", |cpu| cpu.brand());
165        match brand {
166            "" => "cpu_model_unavailable",
167            _ => brand,
168        }
169        .to_string()
170    }
171
172    fn collect_cpu_specs(system: &System) -> MetricFamily {
173        Self::uint_gauge(
174            "cpu_core_count",
175            "CPU core count (and labels: model,vendor_id,arch)",
176            system.physical_core_count().unwrap_or_default() as u64,
177            &[
178                Some(Self::label("model", Self::cpu_model(system))),
179                Some(Self::label("vendor_id", Self::cpu_vendor_id(system))),
180                Some(Self::label("arch", System::cpu_arch())),
181            ],
182        )
183    }
184
    // Collecting CPU usage per core is deactivated to avoid the performance
    // impact. The previous implementation is kept here for reference:
    //
    // fn collect_cpu_usage(system: &System) -> Result<Vec<MetricFamily>, HardwareMetricsErr> {
    //     let cpu_usage_per_core: Vec<MetricFamily> = system
    //         .cpus()
    //         .iter()
    //         .map(|core| {
    //             let core_name = core.name();
    //             Self::f64gauge(
    //                 format!("cpu_{core_name}_usage"),
    //                 format!("CPU core {core_name} usage in percent"),
    //                 core.cpu_usage() as f64,
    //             )
    //         })
    //         .collect();
    //     Ok(cpu_usage_per_core)
    // }
200
201    fn memory_total_collector(system: &System) -> Result<IntGauge, HardwareMetricsErr> {
202        let mem_total_bytes = system.total_memory();
203        let memory_total_collector =
204            IntGauge::with_opts(Opts::new("memory_total_bytes", "Memory total (bytes)"))
205                .map_err(HardwareMetricsErr::ErrCreateMetric)?;
206        memory_total_collector.set(mem_total_bytes as i64);
207        Ok(memory_total_collector)
208    }
209
210    fn memory_available_collector() -> Result<IntGauge, HardwareMetricsErr> {
211        IntGauge::with_opts(Opts::new(
212            "memory_available_bytes",
213            "Memory available (bytes)",
214        ))
215        .map_err(HardwareMetricsErr::ErrCreateMetric)
216    }
217
218    fn collect_memory_available(&self, system: &System) -> Option<Vec<MetricFamily>> {
219        let memory_available_bytes = match i64::try_from(system.available_memory()) {
220            Ok(bytes) => bytes,
221            Err(e) => {
222                tracing::error!("Failed converting memory_available_bytes to i64: {e}");
223                return None;
224            }
225        };
226        self.memory_available_collector.set(memory_available_bytes);
227        Some(self.memory_available_collector.collect())
228    }
229
230    fn disk_has_db(disk: &Disk, db_path: &Path) -> bool {
231        db_path.starts_with(disk.mount_point())
232    }
233
234    fn collect_disk_available(&self, disks: &Disks) -> Vec<MetricFamily> {
235        let space_available_per_disk: Vec<MetricFamily> = disks
236            .iter()
237            .enumerate()
238            .map(|(idx, disk)| {
239                let disk_name = disk.name().to_string_lossy();
240                let disk_num = idx + 1;
241                Self::uint_gauge(
242                    &format!("disk_{disk_num}_available_bytes",),
243                    &format!("Disk space available (bytes), for disk {disk_num}",),
244                    disk.available_space(),
245                    &[
246                        Some(Self::label("disk_name", disk_name.to_string())),
247                        if Self::disk_has_db(disk, &self.db_path) {
248                            Some(Self::label("is_database_disk", true))
249                        } else {
250                            None
251                        },
252                    ],
253                )
254            })
255            .collect();
256
257        space_available_per_disk
258    }
259
260    fn collect_disks_total_bytes(disks: &Disks, db_path: &Path) -> Vec<MetricFamily> {
261        let total_bytes_per_disk: Vec<MetricFamily> = disks
262            .iter()
263            .enumerate()
264            .map(|(idx, disk)| {
265                let disk_name = disk.name().to_string_lossy();
266                let disk_num = idx + 1;
267                Self::uint_gauge(
268                    &format!("disk_{disk_num}_total_bytes",),
269                    &format!("Disk space total (bytes), for disk {disk_num}",),
270                    disk.total_space(),
271                    &[
272                        Some(Self::label("disk_name", disk_name.to_string())),
273                        if Self::disk_has_db(disk, db_path) {
274                            Some(Self::label("is_database_disk", true))
275                        } else {
276                            None
277                        },
278                    ],
279                )
280            })
281            .collect();
282
283        total_bytes_per_disk
284    }
285}
286
287impl Collector for HardwareMetrics {
288    fn desc(&self) -> Vec<&Desc> {
289        self.static_descriptions.iter().collect()
290    }
291
292    fn collect(&self) -> Vec<MetricFamily> {
293        let mut system = match self.system.lock() {
294            Ok(lock) => lock,
295            Err(e) => {
296                tracing::error!("Failed acquiring lock on System: Lock is poisoned: {e}");
297                return Vec::new();
298            }
299        };
300        system.refresh_memory();
301
302        let mut disks = match self.disks.lock() {
303            Ok(lock) => lock,
304            Err(e) => {
305                tracing::error!("Failed acquiring lock on Disks: Lock is poisoned: {e}");
306                return Vec::new();
307            }
308        };
309        disks.refresh(true);
310
311        let mut mfs = self.static_metric_families.clone();
312        if let Some(families) = self.collect_memory_available(&system) {
313            mfs.extend(families);
314        };
315
316        mfs.extend(self.collect_disk_available(&disks));
317
318        mfs
319    }
320}
321
#[cfg(test)]
mod tests {
    use std::{
        net::SocketAddrV4,
        path::PathBuf,
        sync::LazyLock,
        time::{SystemTime, UNIX_EPOCH},
    };

    use super::*;

    static DB_PATH: LazyLock<PathBuf> = LazyLock::new(|| PathBuf::from("/opt/iota/db"));

    /// Returns the first metric of the family `family_name`, looked up under
    /// its `hw_`-prefixed name (the prefix is added if missing).
    fn find_metric<'a>(
        families: &'a [MetricFamily],
        family_name: &str,
    ) -> Result<&'a Metric, String> {
        let fname_namespaced = format!("hw_{}", family_name.trim_start_matches("hw_"));
        let family = families
            .iter()
            .find(|mf| mf.name() == fname_namespaced)
            .ok_or_else(|| format!("Metric family not found: {fname_namespaced}"))?;
        family
            .get_metric()
            .first()
            .ok_or_else(|| format!("No metrics in family {fname_namespaced}"))
    }

    /// Returns the value of label `label_name` on the first metric of the
    /// family `family_name`.
    fn find_metric_label(
        families: &[MetricFamily],
        family_name: &str,
        label_name: &str,
    ) -> Result<String, String> {
        let metric = find_metric(families, family_name)?;
        let label = metric
            .get_label()
            .iter()
            .find(|l| l.name() == label_name)
            .ok_or_else(|| format!("Label not found: {label_name}"))?;
        Ok(label.value().to_string())
    }

    /// Registers the hardware metrics on a running Prometheus server and
    /// checks that the expected families and labels are gathered.
    #[tokio::test]
    async fn test_collect_hardware_specs() -> Result<(), String> {
        let prom_server_addr: SocketAddrV4 = "0.0.0.0:9194".parse().unwrap();

        let registry_svc = crate::start_prometheus_server(prom_server_addr.into());

        register_hardware_metrics(&registry_svc, &DB_PATH)
            .expect("Failed registering hardware metrics");

        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as i64;

        let mut metric_families = registry_svc.gather_all();
        for metric in metric_families
            .iter_mut()
            .flat_map(|mf| mf.mut_metric().iter_mut())
        {
            metric.set_timestamp_ms(now);
        }
        let families: &[MetricFamily] = &metric_families;

        let cpu_core_count = find_metric(families, "cpu_core_count")?;
        let core_count: usize = cpu_core_count.get_gauge().value() as usize;
        assert!(core_count > 0 && core_count < 513);

        // we only check specs are present in labels
        for label_name in ["model", "vendor_id", "arch"] {
            let _ = find_metric_label(families, "cpu_core_count", label_name)?;
        }

        let mem_total_bytes = find_metric(families, "memory_total_bytes")?
            .get_gauge()
            .value();
        assert!(mem_total_bytes > 0.0);
        let mem_available_bytes = find_metric(families, "memory_available_bytes")?
            .get_gauge()
            .value();
        assert!(mem_available_bytes > 0.0);

        let disk_1_total_bytes = find_metric(families, "disk_1_total_bytes")?;
        assert!(disk_1_total_bytes.get_gauge().value() > 0.0);
        let disk_available = find_metric(families, "disk_1_available_bytes")?;
        assert!(disk_available.get_gauge().value() > 0.0);

        Ok(())
    }
}
399}