1use std::{
5 collections::HashMap,
6 path::{Path, PathBuf},
7 sync::{Arc, Mutex},
8};
9
10use prometheus::{
11 IntGauge, Opts, Registry,
12 core::{Collector, Desc, Number},
13 proto::{LabelPair, Metric, MetricFamily, MetricType},
14};
15use sysinfo::{CpuRefreshKind, Disk, Disks, MemoryRefreshKind, RefreshKind, System};
16
17use crate::RegistryService;
18
19#[derive(thiserror::Error, Debug)]
20pub enum HardwareMetricsErr {
21 #[error("Failed creating metric: {0}")]
22 ErrCreateMetric(prometheus::Error),
23 #[error("Failed registering hardware metrics onto RegistryService: {0}")]
24 ErrRegisterHardwareMetrics(prometheus::Error),
25}
26
27pub fn register_hardware_metrics(
32 registry_service: &RegistryService,
33 db_path: &Path,
34) -> Result<(), HardwareMetricsErr> {
35 let registry = Registry::new_custom(Some("hw".to_string()), None)
36 .map_err(HardwareMetricsErr::ErrRegisterHardwareMetrics)?;
37 registry
38 .register(Box::new(HardwareMetrics::new(db_path)?))
39 .map_err(HardwareMetricsErr::ErrRegisterHardwareMetrics)?;
40 registry_service.add(registry);
41 Ok(())
42}
43
44pub struct HardwareMetrics {
45 system: Arc<Mutex<System>>,
46 disks: Arc<Mutex<Disks>>,
47 pub static_descriptions: Vec<Desc>,
49 pub static_metric_families: Vec<MetricFamily>,
52 pub memory_available_collector: IntGauge,
53 pub db_path: PathBuf,
55}
56
57impl HardwareMetrics {
58 pub fn new(db_path: &Path) -> Result<Self, HardwareMetricsErr> {
59 let mut system = System::new_with_specifics(
60 RefreshKind::nothing()
61 .with_cpu(CpuRefreshKind::nothing())
62 .with_memory(MemoryRefreshKind::nothing().with_ram()),
63 );
64 system.refresh_all();
65
66 let disks = Disks::new_with_refreshed_list();
67
68 Ok(Self {
69 static_descriptions: Self::static_descriptions(&system, &disks, db_path)?,
70 static_metric_families: Self::static_metric_families(&system, &disks, db_path)?,
71 memory_available_collector: Self::memory_available_collector()?,
72 system: Arc::new(Mutex::new(system)),
73 disks: Arc::new(Mutex::new(disks)),
74 db_path: PathBuf::from(db_path),
75 })
76 }
77
78 pub fn static_descriptions(
79 system: &System,
80 disks: &Disks,
81 db_path: &Path,
82 ) -> Result<Vec<Desc>, HardwareMetricsErr> {
83 let mut descs: Vec<Desc> = Vec::new();
84 for mf in Self::static_metric_families(system, disks, db_path)? {
85 descs.push(Self::metric_family_desc(&mf)?);
86 }
87 Ok(descs)
88 }
89
90 pub fn static_metric_families(
91 system: &System,
92 disks: &Disks,
93 db_path: &Path,
94 ) -> Result<Vec<MetricFamily>, HardwareMetricsErr> {
95 let mut mfs = Vec::new();
96 mfs.push(Self::collect_cpu_specs(system));
97 mfs.extend(Self::memory_total_collector(system)?.collect());
98 for mf in Self::collect_disks_total_bytes(disks, db_path) {
99 mfs.push(mf);
100 }
101 Ok(mfs)
102 }
103
104 fn label(name: &str, value: impl ToString) -> LabelPair {
105 let mut label = LabelPair::new();
106 label.set_name(name.to_string());
107 label.set_value(value.to_string());
108 label
109 }
110
111 fn uint_gauge(
112 name: &str,
113 help: &str,
114 value: u64,
115 labels: &[Option<LabelPair>],
116 ) -> MetricFamily {
117 let mut g = prometheus::proto::Gauge::default();
118 let mut m = Metric::default();
119 let mut mf = MetricFamily::new();
120
121 g.set_value(value.into_f64());
122 m.set_gauge(g);
123 m.set_label(
124 labels
125 .iter()
126 .filter_map(|opt| opt.as_ref())
127 .cloned()
128 .collect::<Vec<_>>(),
129 );
130
131 mf.mut_metric().push(m);
132 mf.set_name(name.to_string());
133 mf.set_help(help.to_string());
134 mf.set_field_type(MetricType::GAUGE);
135 mf
136 }
137
138 fn metric_family_desc(fam: &MetricFamily) -> Result<Desc, HardwareMetricsErr> {
139 Desc::new(
140 fam.name().to_string(),
141 fam.help().to_string(),
142 vec![],
143 HashMap::new(),
144 )
145 .map_err(HardwareMetricsErr::ErrCreateMetric)
146 }
147
148 fn cpu_vendor_id(system: &System) -> String {
149 let vendor_id = system
150 .cpus()
151 .first()
152 .map_or("cpu_vendor_id_unavailable", |cpu| cpu.vendor_id());
153 match vendor_id {
154 "" => "cpu_vendor_id_unavailable",
155 _ => vendor_id,
156 }
157 .to_string()
158 }
159
160 fn cpu_model(system: &System) -> String {
161 let brand = system
162 .cpus()
163 .first()
164 .map_or("cpu_model_unavailable", |cpu| cpu.brand());
165 match brand {
166 "" => "cpu_model_unavailable",
167 _ => brand,
168 }
169 .to_string()
170 }
171
172 fn collect_cpu_specs(system: &System) -> MetricFamily {
173 Self::uint_gauge(
174 "cpu_core_count",
175 "CPU core count (and labels: model,vendor_id,arch)",
176 system.physical_core_count().unwrap_or_default() as u64,
177 &[
178 Some(Self::label("model", Self::cpu_model(system))),
179 Some(Self::label("vendor_id", Self::cpu_vendor_id(system))),
180 Some(Self::label("arch", System::cpu_arch())),
181 ],
182 )
183 }
184
185 fn memory_total_collector(system: &System) -> Result<IntGauge, HardwareMetricsErr> {
202 let mem_total_bytes = system.total_memory();
203 let memory_total_collector =
204 IntGauge::with_opts(Opts::new("memory_total_bytes", "Memory total (bytes)"))
205 .map_err(HardwareMetricsErr::ErrCreateMetric)?;
206 memory_total_collector.set(mem_total_bytes as i64);
207 Ok(memory_total_collector)
208 }
209
210 fn memory_available_collector() -> Result<IntGauge, HardwareMetricsErr> {
211 IntGauge::with_opts(Opts::new(
212 "memory_available_bytes",
213 "Memory available (bytes)",
214 ))
215 .map_err(HardwareMetricsErr::ErrCreateMetric)
216 }
217
218 fn collect_memory_available(&self, system: &System) -> Option<Vec<MetricFamily>> {
219 let memory_available_bytes = match i64::try_from(system.available_memory()) {
220 Ok(bytes) => bytes,
221 Err(e) => {
222 tracing::error!("Failed converting memory_available_bytes to i64: {e}");
223 return None;
224 }
225 };
226 self.memory_available_collector.set(memory_available_bytes);
227 Some(self.memory_available_collector.collect())
228 }
229
230 fn disk_has_db(disk: &Disk, db_path: &Path) -> bool {
231 db_path.starts_with(disk.mount_point())
232 }
233
234 fn collect_disk_available(&self, disks: &Disks) -> Vec<MetricFamily> {
235 let space_available_per_disk: Vec<MetricFamily> = disks
236 .iter()
237 .enumerate()
238 .map(|(idx, disk)| {
239 let disk_name = disk.name().to_string_lossy();
240 let disk_num = idx + 1;
241 Self::uint_gauge(
242 &format!("disk_{disk_num}_available_bytes",),
243 &format!("Disk space available (bytes), for disk {disk_num}",),
244 disk.available_space(),
245 &[
246 Some(Self::label("disk_name", disk_name.to_string())),
247 if Self::disk_has_db(disk, &self.db_path) {
248 Some(Self::label("is_database_disk", true))
249 } else {
250 None
251 },
252 ],
253 )
254 })
255 .collect();
256
257 space_available_per_disk
258 }
259
260 fn collect_disks_total_bytes(disks: &Disks, db_path: &Path) -> Vec<MetricFamily> {
261 let total_bytes_per_disk: Vec<MetricFamily> = disks
262 .iter()
263 .enumerate()
264 .map(|(idx, disk)| {
265 let disk_name = disk.name().to_string_lossy();
266 let disk_num = idx + 1;
267 Self::uint_gauge(
268 &format!("disk_{disk_num}_total_bytes",),
269 &format!("Disk space total (bytes), for disk {disk_num}",),
270 disk.total_space(),
271 &[
272 Some(Self::label("disk_name", disk_name.to_string())),
273 if Self::disk_has_db(disk, db_path) {
274 Some(Self::label("is_database_disk", true))
275 } else {
276 None
277 },
278 ],
279 )
280 })
281 .collect();
282
283 total_bytes_per_disk
284 }
285}
286
287impl Collector for HardwareMetrics {
288 fn desc(&self) -> Vec<&Desc> {
289 self.static_descriptions.iter().collect()
290 }
291
292 fn collect(&self) -> Vec<MetricFamily> {
293 let mut system = match self.system.lock() {
294 Ok(lock) => lock,
295 Err(e) => {
296 tracing::error!("Failed acquiring lock on System: Lock is poisoned: {e}");
297 return Vec::new();
298 }
299 };
300 system.refresh_memory();
301
302 let mut disks = match self.disks.lock() {
303 Ok(lock) => lock,
304 Err(e) => {
305 tracing::error!("Failed acquiring lock on Disks: Lock is poisoned: {e}");
306 return Vec::new();
307 }
308 };
309 disks.refresh(true);
310
311 let mut mfs = self.static_metric_families.clone();
312 if let Some(families) = self.collect_memory_available(&system) {
313 mfs.extend(families);
314 };
315
316 mfs.extend(self.collect_disk_available(&disks));
317
318 mfs
319 }
320}
321
#[cfg(test)]
mod tests {
    use std::{
        net::SocketAddrV4,
        path::PathBuf,
        sync::LazyLock,
        time::{SystemTime, UNIX_EPOCH},
    };

    use super::*;

    /// Database path handed to the collector under test.
    static DB_PATH: LazyLock<PathBuf> = LazyLock::new(|| PathBuf::from("/opt/iota/db"));

    /// Registers the hardware metrics on a freshly started registry service
    /// and checks that the expected `hw_`-prefixed families, labels and
    /// positive values are gathered.
    #[tokio::test]
    async fn test_collect_hardware_specs() -> Result<(), String> {
        let prom_server_addr: SocketAddrV4 = "0.0.0.0:9194".parse().unwrap();

        let registry_svc = crate::start_prometheus_server(prom_server_addr.into());

        register_hardware_metrics(&registry_svc, &DB_PATH)
            .expect("Failed registering hardware metrics");

        let now = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_millis() as i64;

        let mut metric_families = registry_svc.gather_all();
        for mf in metric_families.iter_mut() {
            for m in mf.mut_metric() {
                m.set_timestamp_ms(now);
            }
        }

        // Looks up the first metric of a family; the `hw_` namespace prefix
        // is added when the caller did not already supply it.
        let find_metric = |family_name: &str| -> Result<&Metric, String> {
            let fname_namespaced = format!("hw_{}", family_name.trim_start_matches("hw_"));
            let metric = metric_families
                .iter()
                .find(|mf| mf.name() == fname_namespaced)
                .ok_or_else(|| format!("Metric family not found: {fname_namespaced}"))?
                .get_metric()
                .first()
                .ok_or_else(|| format!("No metrics in family {fname_namespaced}"))?;
            Ok(metric)
        };
        // Returns the value of `label_name` on the first metric of a family.
        let find_metric_label = |family_name: &str, label_name: &str| -> Result<String, String> {
            let metric = find_metric(family_name)?;
            Ok(metric
                .get_label()
                .iter()
                .find(|l| l.name() == label_name)
                .ok_or_else(|| format!("Label not found: {label_name}"))?
                .value()
                .to_string())
        };

        let cpu_core_count = find_metric("cpu_core_count")?;
        let core_count: usize = cpu_core_count.get_gauge().value() as usize;
        assert!(core_count > 0 && core_count < 513);

        // Only the presence of these labels is checked, not their values.
        let _ = find_metric_label("cpu_core_count", "model")?;
        let _ = find_metric_label("cpu_core_count", "vendor_id")?;
        let _ = find_metric_label("cpu_core_count", "arch")?;

        let mem_total_bytes = find_metric("memory_total_bytes")?.get_gauge().value();
        assert!(mem_total_bytes > 0.0);
        let mem_available_bytes = find_metric("memory_available_bytes")?.get_gauge().value();
        assert!(mem_available_bytes > 0.0);

        let disk_1_total_bytes = find_metric("disk_1_total_bytes")?;
        assert!(disk_1_total_bytes.get_gauge().value() > 0.0);
        let disk_available = find_metric("disk_1_available_bytes")?;
        assert!(disk_available.get_gauge().value() > 0.0);

        Ok(())
    }
}