telemetry_subscribers/
span_latency_prom.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5//! This is a module that records Tokio-tracing [span](https://docs.rs/tracing/latest/tracing/span/index.html)
6//! latencies into Prometheus histograms directly.
7//! The name of the Prometheus histogram is
8//! "tracing_span_latencies[_sum/count/bucket]"
9//!
10//! There is also the tracing-timing crate, from which this differs
11//! significantly:
12//! - tracing-timing records latencies between events (logs).  We just want to
13//!   record the latencies of spans.
14//! - tracing-timing does not output to Prometheus, and extracting data from its
15//!   histograms takes extra CPU
16//! - tracing-timing records latencies using HDRHistogram, which is great, but
17//!   uses extra memory when one is already using Prometheus
18//!
19//! Thus this is a much smaller and more focused module.
20//!
21//! ## Making spans visible
22//! This module can only record latencies for spans that get created.  By
23//! default, this is controlled by env_filter and logging levels.
24
25use std::time::Instant;
26
27use prometheus::{Registry, exponential_buckets, register_histogram_vec_with_registry};
28use tracing::{Subscriber, span};
29
30/// A tokio_tracing Layer that records span latencies into Prometheus histograms
31pub struct PrometheusSpanLatencyLayer {
32    span_latencies: prometheus::HistogramVec,
33}
34
35#[derive(Debug)]
36pub enum PrometheusSpanError {
37    /// num_buckets must be positive >= 1
38    ZeroOrNegativeNumBuckets,
39    Prometheus(prometheus::Error),
40}
41
42impl From<prometheus::Error> for PrometheusSpanError {
43    fn from(err: prometheus::Error) -> Self {
44        Self::Prometheus(err)
45    }
46}
47
48const TOP_LATENCY_IN_NS: f64 = 300.0 * 1.0e9;
49const LOWEST_LATENCY_IN_NS: f64 = 500.0;
50
51impl PrometheusSpanLatencyLayer {
52    /// Create a new layer, injecting latencies into the given registry.
53    /// The num_buckets controls how many buckets thus how much memory and time
54    /// series one uses up in Prometheus (and in the application).  10 is
55    /// probably a minimum.
56    pub fn try_new(registry: &Registry, num_buckets: usize) -> Result<Self, PrometheusSpanError> {
57        if num_buckets < 1 {
58            return Err(PrometheusSpanError::ZeroOrNegativeNumBuckets);
59        }
60
61        // Histogram for span latencies must accommodate a wide range of possible
62        // latencies, so don't use the default Prometheus buckets.  Latencies in
63        // NS.  Calculate the multiplier to go from LOWEST to TOP in num_bucket
64        // steps, step n+1 = step n * factor.
65        let factor = (TOP_LATENCY_IN_NS / LOWEST_LATENCY_IN_NS).powf(1.0 / (num_buckets as f64));
66        let buckets = exponential_buckets(LOWEST_LATENCY_IN_NS, factor, num_buckets)?;
67        let span_latencies = register_histogram_vec_with_registry!(
68            "tracing_span_latencies",
69            "Latencies from tokio-tracing spans",
70            &["span_name"],
71            buckets,
72            registry
73        )?;
74        Ok(Self { span_latencies })
75    }
76}
77
78struct PromSpanTimestamp(Instant);
79
80impl<S> tracing_subscriber::Layer<S> for PrometheusSpanLatencyLayer
81where
82    S: Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
83{
84    fn on_new_span(
85        &self,
86        _attrs: &span::Attributes,
87        id: &span::Id,
88        ctx: tracing_subscriber::layer::Context<S>,
89    ) {
90        let span = ctx.span(id).unwrap();
91        // NOTE: there are other extensions that insert timings.  For example,
92        // tracing_subscriber's with_span_events() inserts events at open and close that
93        // contain timings. However, we cannot be guaranteed that those events
94        // would be turned on.
95        span.extensions_mut()
96            .insert(PromSpanTimestamp(Instant::now()));
97    }
98
99    fn on_close(&self, id: span::Id, ctx: tracing_subscriber::layer::Context<'_, S>) {
100        let span = ctx.span(&id).unwrap();
101        let start_time = span
102            .extensions()
103            .get::<PromSpanTimestamp>()
104            .expect("Could not find saved timestamp on span")
105            .0;
106        let elapsed_ns = start_time.elapsed().as_nanos() as u64;
107        self.span_latencies
108            .with_label_values(&[span.name()])
109            .observe(elapsed_ns as f64);
110    }
111}
112
113#[cfg(test)]
114mod tests {
115    use super::*;
116
117    #[test]
118    fn test_prom_span_latency_init() {
119        let registry = prometheus::Registry::new();
120
121        let res = PrometheusSpanLatencyLayer::try_new(&registry, 0);
122        assert!(matches!(
123            res,
124            Err(PrometheusSpanError::ZeroOrNegativeNumBuckets)
125        ));
126    }
127}