iota_metrics/
thread_stall_monitor.rs1use std::{
6 sync::{Arc, Mutex, Once},
7 time::{Duration, Instant},
8};
9
10use tracing::{error, info};
11
12use crate::{get_metrics, spawn_logged_monitored_task};
13
14static THREAD_STALL_MONITOR: Once = Once::new();
15
16const MONITOR_INTERVAL: Duration = Duration::from_millis(50);
17const ALERT_THRESHOLD: Duration = Duration::from_millis(500);
18
19#[inline(never)]
43extern "C" fn thread_monitor_report_stall(duration_ms: u64) {
44 error!("Thread stalled for {}ms", duration_ms);
45}
46
47#[inline(never)]
48extern "C" fn thread_monitor_report_stall_cleared(duration_ms: u64) {
49 error!("Thread stall cleared after {}ms", duration_ms);
50}
51
52pub fn start_thread_stall_monitor() {
56 let mut called = true;
57 THREAD_STALL_MONITOR.call_once(|| {
58 called = false;
59 });
60 if called {
61 return;
62 }
63 if tokio::runtime::Handle::try_current().is_err() {
64 info!("Not running in a tokio runtime, not starting thread stall monitor.");
65 return;
66 }
67
68 let last_update: Arc<Mutex<Instant>> = Arc::new(Mutex::new(Instant::now()));
69
70 {
71 let last_update = last_update.clone();
72 std::thread::spawn(move || {
73 info!("Starting thread stall monitor watchdog thread");
74 let mut stall_duration = None;
75
76 loop {
77 std::thread::sleep(MONITOR_INTERVAL);
78 let now = Instant::now();
79 let last_update = *last_update.lock().unwrap();
80 let time_since_last_update = now - last_update;
81 if time_since_last_update > ALERT_THRESHOLD {
82 if stall_duration.is_none() {
83 thread_monitor_report_stall(time_since_last_update.as_millis() as u64);
84 }
85 stall_duration = Some(time_since_last_update);
86 } else if let Some(dur) = stall_duration {
87 stall_duration = None;
88 thread_monitor_report_stall_cleared(dur.as_millis() as u64);
89 if let Some(metrics) = get_metrics() {
90 metrics.thread_stall_duration_sec.observe(dur.as_secs_f64());
91 }
92 }
93 }
94 });
95 }
96
97 spawn_logged_monitored_task!(
98 async move {
99 info!("Starting thread stall monitor update task");
100 loop {
101 tokio::time::sleep(MONITOR_INTERVAL).await;
102 *last_update.lock().unwrap() = Instant::now();
103 }
104 },
105 "ThreadStallMonitor"
106 );
107}