iota_metrics/
thread_stall_monitor.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::sync::Once;
6
7use tracing::{error, info};
8
9use crate::{get_metrics, spawn_logged_monitored_task};
10
/// One-shot guard used by `start_thread_stall_monitor` so that only the first
/// call attempts to start the monitor.
static THREAD_STALL_MONITOR: Once = Once::new();

/// How long the monitor task sleeps between checks. A wake-up that is late by
/// more than this amount is reported as a stall.
const MONITOR_INTERVAL: std::time::Duration = std::time::Duration::from_millis(500);
14
15/// Monitors temporary stalls in tokio scheduling every MONITOR_INTERVAL.
16/// Logs an error and increments a metric if more than 2 * MONITOR_INTERVAL has
17/// elapsed, which means the stall lasted longer than MONITOR_INTERVAL.
18pub fn start_thread_stall_monitor() {
19    let mut called = true;
20    THREAD_STALL_MONITOR.call_once(|| {
21        called = false;
22    });
23    if called {
24        return;
25    }
26    if tokio::runtime::Handle::try_current().is_err() {
27        info!("Not running in a tokio runtime, not starting thread stall monitor.");
28        return;
29    }
30
31    spawn_logged_monitored_task!(
32        async move {
33            let Some(metrics) = get_metrics() else {
34                info!("Metrics uninitialized, not starting thread stall monitor.");
35                return;
36            };
37            let mut last_sleep_time = tokio::time::Instant::now();
38            loop {
39                tokio::time::sleep(MONITOR_INTERVAL).await;
40                let current_time = tokio::time::Instant::now();
41                let stalled_duration = current_time - last_sleep_time - MONITOR_INTERVAL;
42                last_sleep_time = current_time;
43                if stalled_duration > MONITOR_INTERVAL {
44                    metrics
45                        .thread_stall_duration_sec
46                        .observe(stalled_duration.as_secs_f64());
47                    // TODO: disable this in simulation tests with artificial thread stalls?
48                    error!(
49                        "Thread stalled for {}s. Possible causes include CPU overload or too much blocking calls.",
50                        stalled_duration.as_secs_f64()
51                    );
52                }
53            }
54        },
55        "ThreadStallMonitor"
56    );
57}