iota_metrics/
thread_stall_monitor.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::{
6    sync::{Arc, Mutex, Once},
7    time::{Duration, Instant},
8};
9
10use tracing::{error, info};
11
12use crate::{get_metrics, spawn_logged_monitored_task};
13
14static THREAD_STALL_MONITOR: Once = Once::new();
15
16const MONITOR_INTERVAL: Duration = Duration::from_millis(50);
17const ALERT_THRESHOLD: Duration = Duration::from_millis(500);
18
19// These funcs are extern in order to be easily findable by debuggers
20// To catch a thread stall in the act, do the following:
21//
22// Create a file `gdbcmd` with
23//
24//      set logging file gdb.txt
25//      set logging on
26//      set pagination off
27//      set breakpoint pending on
28//
29//      b thread_monitor_report_stall
30//      commands
31//      thread apply all bt
32//      continue
33//      end
34//
35// Then run gdb with:
36//     gdb -x gdbmcmd -p <pid of sui-node>
37//
38// You will need to type `c` to continue the process after it loads.
39//
40// The debugger will now print out all thread stacks every time a thread stall
41// is detected.
42#[inline(never)]
43extern "C" fn thread_monitor_report_stall(duration_ms: u64) {
44    error!("Thread stalled for {}ms", duration_ms);
45}
46
47#[inline(never)]
48extern "C" fn thread_monitor_report_stall_cleared(duration_ms: u64) {
49    error!("Thread stall cleared after {}ms", duration_ms);
50}
51
52/// Monitors temporary stalls in tokio scheduling every MONITOR_INTERVAL.
53/// Calls `thread_monitor_report_stall` if more than ALERT_THRESHOLD has
54/// elapsed. When the stall clears, we observer the duration in a histogram.
55pub fn start_thread_stall_monitor() {
56    let mut called = true;
57    THREAD_STALL_MONITOR.call_once(|| {
58        called = false;
59    });
60    if called {
61        return;
62    }
63    if tokio::runtime::Handle::try_current().is_err() {
64        info!("Not running in a tokio runtime, not starting thread stall monitor.");
65        return;
66    }
67
68    let last_update: Arc<Mutex<Instant>> = Arc::new(Mutex::new(Instant::now()));
69
70    {
71        let last_update = last_update.clone();
72        std::thread::spawn(move || {
73            info!("Starting thread stall monitor watchdog thread");
74            let mut stall_duration = None;
75
76            loop {
77                std::thread::sleep(MONITOR_INTERVAL);
78                let now = Instant::now();
79                let last_update = *last_update.lock().unwrap();
80                let time_since_last_update = now - last_update;
81                if time_since_last_update > ALERT_THRESHOLD {
82                    if stall_duration.is_none() {
83                        thread_monitor_report_stall(time_since_last_update.as_millis() as u64);
84                    }
85                    stall_duration = Some(time_since_last_update);
86                } else if let Some(dur) = stall_duration {
87                    stall_duration = None;
88                    thread_monitor_report_stall_cleared(dur.as_millis() as u64);
89                    if let Some(metrics) = get_metrics() {
90                        metrics.thread_stall_duration_sec.observe(dur.as_secs_f64());
91                    }
92                }
93            }
94        });
95    }
96
97    spawn_logged_monitored_task!(
98        async move {
99            info!("Starting thread stall monitor update task");
100            loop {
101                tokio::time::sleep(MONITOR_INTERVAL).await;
102                *last_update.lock().unwrap() = Instant::now();
103            }
104        },
105        "ThreadStallMonitor"
106    );
107}