iota_graphql_rpc/metrics.rs

// Copyright (c) Mysten Labs, Inc.
// Modifications Copyright (c) 2024 IOTA Stiftung
// SPDX-License-Identifier: Apache-2.0

5use std::{sync::Arc, time::Duration};
6
7use async_graphql::{PathSegment, ServerError};
8use prometheus::{
9    Gauge, Histogram, HistogramVec, IntCounter, IntCounterVec, Registry,
10    register_gauge_with_registry, register_histogram_vec_with_registry,
11    register_histogram_with_registry, register_int_counter_vec_with_registry,
12    register_int_counter_with_registry,
13};
14
15use crate::error::code;
16
// TODO: finetune buckets as we learn more about the distribution of queries
// Buckets (seconds) for overall query and validation latency histograms.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 10., 20., 30., 60., 90.,
];
// Buckets (seconds) for DB fetch latency; finer-grained below 1s than
// LATENCY_SEC_BUCKETS and extended up to 200s for slow queries.
const DB_LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
    5.0, 10.0, 20.0, 40.0, 60.0, 80.0, 100.0, 200.0,
];
// Buckets (node count) for the number of input nodes in a query.
const INPUT_NODES_BUCKETS: &[f64] = &[
    1., 2., 4., 8., 12., 16., 24., 32., 48., 64., 96., 128., 256., 512., 1024.,
];
// Buckets (node count) for the number of output nodes in a response; scaled
// 100x relative to INPUT_NODES_BUCKETS since responses fan out.
const OUTPUT_NODES_BUCKETS: &[f64] = &[
    100., 200., 400., 800., 1200., 1600., 2400., 3200., 4800., 6400., 9600., 12800., 25600.,
    51200., 102400.,
];
// Buckets for query nesting depth.
const QUERY_DEPTH_BUCKETS: &[f64] = &[
    1., 2., 4., 8., 12., 16., 24., 32., 48., 64., 96., 128., 256., 512., 1024.,
];
// Buckets (bytes) for request payload size histograms.
const QUERY_PAYLOAD_SIZE_BUCKETS: &[f64] = &[
    10., 20., 50., 100., 200., 400., 800., 1200., 1600., 2400., 3200., 4800., 6400., 9600., 12800.,
    25600., 51200., 102400.,
];
// Buckets for the (not yet wired up) DB query cost histogram.
const DB_QUERY_COST_BUCKETS: &[f64] = &[
    1., 2., 4., 8., 12., 16., 24., 32., 48., 64., 96., 128., 256., 512., 1024.,
];
42
/// Bundle of all service metrics, cheap to clone: both halves are behind
/// `Arc`s so clones share the same underlying Prometheus collectors.
#[derive(Clone)]
pub(crate) struct Metrics {
    /// Metrics about database fetches.
    pub db_metrics: Arc<DBMetrics>,
    /// Metrics about GraphQL request handling.
    pub request_metrics: Arc<RequestMetrics>,
}
48
/// Prometheus collectors for database access.
#[derive(Clone)]
pub(crate) struct DBMetrics {
    /// The number of fetches grouped by result (success or error)
    pub db_fetches: IntCounterVec,
    /// The fetch latency grouped by result (success or error)
    pub db_fetch_latency: HistogramVec,
    // TODO make this work, blocked by pg.rs (unclear if to use log function or smth else)
    pub _db_query_cost: Histogram,
    // TODO determine if we want this metric, and implement it
    pub _db_fetch_batch_size: HistogramVec,
}
60
/// Prometheus collectors for GraphQL request handling (sizes, latencies,
/// counts, and errors).
#[derive(Clone)]
pub(crate) struct RequestMetrics {
    /// The number of nodes for the input query that passed the query limits
    /// check
    pub input_nodes: Histogram,
    /// The number of nodes in the result
    pub output_nodes: Histogram,
    /// The query depth
    pub query_depth: Histogram,
    /// The size (in bytes) of the payload that is higher than the maximum
    pub query_payload_too_large_size: Histogram,
    /// The size (in bytes) of the payload
    pub query_payload_size: Histogram,
    /// The time it takes to validate the query
    pub query_validation_latency: Histogram,
    /// The time it takes for the GraphQL service to execute the request
    pub query_latency: Histogram,
    /// Number of errors by path and type.
    pub num_errors: IntCounterVec,
    /// Number of queries
    pub num_queries: IntCounter,
    /// Number of queries by top level path
    pub num_queries_top_level: IntCounterVec,
    /// Total inflight requests
    pub inflight_requests: Gauge,
}
87
88impl Metrics {
89    pub(crate) fn new(registry: &Registry) -> Self {
90        let db_metrics = DBMetrics::new(registry);
91        let request_metrics = RequestMetrics::new(registry);
92
93        Self {
94            db_metrics: Arc::new(db_metrics),
95            request_metrics: Arc::new(request_metrics),
96        }
97    }
98
99    /// Updates the DB related metrics (latency, error, success)
100    pub(crate) fn observe_db_data(&self, time: Duration, succeeded: bool) {
101        let label = if succeeded { "success" } else { "error" };
102        self.db_metrics.db_fetches.with_label_values(&[label]).inc();
103        self.db_metrics
104            .db_fetch_latency
105            .with_label_values(&[label])
106            .observe(time.as_secs_f64());
107    }
108
109    /// The total time needed for handling the query
110    pub(crate) fn query_latency(&self, time: Duration) {
111        self.request_metrics
112            .query_latency
113            .observe(time.as_secs_f64());
114    }
115
116    /// The time needed for validating the query
117    pub(crate) fn query_validation_latency(&self, time: Duration) {
118        self.request_metrics
119            .query_validation_latency
120            .observe(time.as_secs_f64());
121    }
122
123    /// Increment the total number of queries by one
124    pub(crate) fn inc_num_queries(&self) {
125        self.request_metrics.num_queries.inc();
126    }
127
128    /// Use this function to increment the number of errors per path and per
129    /// error type. The error type is detected automatically from the passed
130    /// errors.
131    pub(crate) fn inc_errors(&self, errors: &[ServerError]) {
132        for err in errors {
133            if let Some(ext) = &err.extensions {
134                if let Some(async_graphql_value::ConstValue::String(val)) = ext.get("code") {
135                    self.request_metrics
136                        .num_errors
137                        .with_label_values(&[query_label_for_error(&err.path).as_str(), val])
138                        .inc();
139                }
140            } else {
141                self.request_metrics
142                    .num_errors
143                    .with_label_values(&[query_label_for_error(&err.path).as_str(), code::UNKNOWN])
144                    .inc();
145            }
146        }
147    }
148}
149
150impl DBMetrics {
151    pub(crate) fn new(registry: &Registry) -> Self {
152        Self {
153            db_fetches: register_int_counter_vec_with_registry!(
154                "db_fetches",
155                "The number of fetches grouped by result (success or error)",
156                &["type"],
157                registry
158            )
159            .unwrap(),
160            db_fetch_latency: register_histogram_vec_with_registry!(
161                "db_fetch_latency",
162                "The fetch latency grouped by result (success or error)",
163                &["type"],
164                DB_LATENCY_SEC_BUCKETS.to_vec(),
165                registry,
166            )
167            .unwrap(),
168            _db_query_cost: register_histogram_with_registry!(
169                "db_query_cost",
170                "Cost of a DB query",
171                DB_QUERY_COST_BUCKETS.to_vec(),
172                registry,
173            )
174            .unwrap(),
175            _db_fetch_batch_size: register_histogram_vec_with_registry!(
176                "db_fetch_batch_size",
177                "Number of ids fetched per batch",
178                &["type"],
179                registry,
180            )
181            .unwrap(),
182        }
183    }
184}
185
186impl RequestMetrics {
187    pub(crate) fn new(registry: &Registry) -> Self {
188        Self {
189            input_nodes: register_histogram_with_registry!(
190                "input_nodes",
191                "Number of input nodes in the query",
192                INPUT_NODES_BUCKETS.to_vec(),
193                registry,
194            )
195            .unwrap(),
196            output_nodes: register_histogram_with_registry!(
197                "output_nodes",
198                "Number of output nodes in the response",
199                OUTPUT_NODES_BUCKETS.to_vec(),
200                registry,
201            )
202            .unwrap(),
203            query_depth: register_histogram_with_registry!(
204                "query_depth",
205                "Depth of the query",
206                QUERY_DEPTH_BUCKETS.to_vec(),
207                registry
208            )
209            .unwrap(),
210            query_payload_too_large_size: register_histogram_with_registry!(
211                "query_payload_too_large_size",
212                "Query payload size (bytes), that was rejected due to being larger than maximum",
213                QUERY_PAYLOAD_SIZE_BUCKETS.to_vec(),
214                registry,
215            )
216            .unwrap(),
217            query_payload_size: register_histogram_with_registry!(
218                "query_payload_size",
219                "Size of the query payload string",
220                QUERY_PAYLOAD_SIZE_BUCKETS.to_vec(),
221                registry,
222            )
223            .unwrap(),
224            query_validation_latency: register_histogram_with_registry!(
225                "query_validation_latency",
226                "The time to validate the query",
227                LATENCY_SEC_BUCKETS.to_vec(),
228                registry,
229            )
230            .unwrap(),
231            query_latency: register_histogram_with_registry!(
232                "query_latency",
233                "The time needed to resolve and get the result for the request",
234                LATENCY_SEC_BUCKETS.to_vec(),
235                registry,
236            )
237            .unwrap(),
238            num_errors: register_int_counter_vec_with_registry!(
239                "num_errors",
240                "Number of errors by path and error type",
241                &["path", "type"],
242                registry,
243            )
244            .unwrap(),
245            num_queries: register_int_counter_with_registry!(
246                "num_queries",
247                "Total number of queries",
248                registry
249            )
250            .unwrap(),
251            num_queries_top_level: register_int_counter_vec_with_registry!(
252                "num_queries_top_level",
253                "Number of queries for each top level node",
254                &["path"],
255                registry
256            )
257            .unwrap(),
258            inflight_requests: register_gauge_with_registry!(
259                "inflight_requests",
260                "Number of queries that are being resolved at a moment in time",
261                registry
262            )
263            .unwrap(),
264        }
265    }
266}
267
268/// When an error occurs, GraphQL returns a vector of PathSegments,
269/// that we can use to retrieve the last node which contains the error.
270pub(crate) fn query_label_for_error(query: &[PathSegment]) -> String {
271    let fields: Vec<_> = query
272        .iter()
273        .filter_map(|s| {
274            if let PathSegment::Field(name) = s {
275                Some(name)
276            } else {
277                None
278            }
279        })
280        .collect();
281
282    match &fields[..] {
283        [] => "".to_string(),
284        [seg] => seg.to_string(),
285        [fst, .., lst] => format!("{fst}..{lst}"),
286    }
287}