1use std::{sync::Arc, time::Duration};
6
7use async_graphql::{PathSegment, ServerError};
8use prometheus::{
9 Gauge, Histogram, HistogramVec, IntCounter, IntCounterVec, Registry,
10 register_gauge_with_registry, register_histogram_vec_with_registry,
11 register_histogram_with_registry, register_int_counter_vec_with_registry,
12 register_int_counter_with_registry,
13};
14
15use crate::error::code;
16
/// Upper bounds for the request-level latency histograms (`query_latency` and
/// `query_validation_latency`). Observations are recorded with `Duration::as_secs_f64`, so the
/// bounds are in seconds.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, //
    1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 90.0,
];

/// Upper bounds, in seconds, for the `db_fetch_latency` histogram.
const DB_LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.02, 0.05, //
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, //
    1.0, 2.0, 3.0, 5.0, 10.0, 20.0, 40.0, 60.0, 80.0, 100.0, 200.0,
];

/// Upper bounds for the `input_nodes` histogram (node counts).
const INPUT_NODES_BUCKETS: &[f64] = &[
    1.0, 2.0, 4.0, 8.0, 12.0, 16.0, 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 256.0, 512.0, 1024.0,
];

/// Upper bounds for the `output_nodes` histogram (node counts).
const OUTPUT_NODES_BUCKETS: &[f64] = &[
    100.0, 200.0, 400.0, 800.0, 1200.0, 1600.0, 2400.0, 3200.0, 4800.0, 6400.0, 9600.0, 12800.0,
    25600.0, 51200.0, 102400.0,
];

/// Upper bounds for the `query_depth` histogram.
const QUERY_DEPTH_BUCKETS: &[f64] = &[
    1.0, 2.0, 4.0, 8.0, 12.0, 16.0, 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 256.0, 512.0, 1024.0,
];

/// Upper bounds shared by the `query_payload_size` and `query_payload_too_large_size`
/// histograms.
const QUERY_PAYLOAD_SIZE_BUCKETS: &[f64] = &[
    10.0, 20.0, 50.0, 100.0, 200.0, 400.0, 800.0, 1200.0, 1600.0, 2400.0, 3200.0, 4800.0, 6400.0,
    9600.0, 12800.0, 25600.0, 51200.0, 102400.0,
];

/// Upper bounds for the `db_query_cost` histogram.
const DB_QUERY_COST_BUCKETS: &[f64] = &[
    1.0, 2.0, 4.0, 8.0, 12.0, 16.0, 24.0, 32.0, 48.0, 64.0, 96.0, 128.0, 256.0, 512.0, 1024.0,
];
42
/// Top-level handle to the service's metrics, split into a database group and a request group.
///
/// Both groups are held behind `Arc`, so cloning this struct clones the pointers rather than
/// the metric structs themselves.
#[derive(Clone)]
pub(crate) struct Metrics {
    /// Metrics about database fetches.
    pub db_metrics: Arc<DBMetrics>,
    /// Metrics about incoming requests (sizes, latencies, errors, counts).
    pub request_metrics: Arc<RequestMetrics>,
}
48
/// Collectors describing database access.
#[derive(Clone)]
pub(crate) struct DBMetrics {
    /// The number of fetches grouped by result (success or error).
    pub db_fetches: IntCounterVec,
    /// The fetch latency grouped by result (success or error).
    pub db_fetch_latency: HistogramVec,
    /// Cost of a DB query. Registered in `DBMetrics::new`; nothing in this file observes it.
    pub _db_query_cost: Histogram,
    /// Number of ids fetched per batch. Registered in `DBMetrics::new`; nothing in this file
    /// observes it.
    pub _db_fetch_batch_size: HistogramVec,
}
60
/// Collectors describing incoming requests.
#[derive(Clone)]
pub(crate) struct RequestMetrics {
    /// Number of input nodes in the query.
    pub input_nodes: Histogram,
    /// Number of output nodes in the response.
    pub output_nodes: Histogram,
    /// Depth of the query.
    pub query_depth: Histogram,
    /// Query payload size (bytes) of requests that were rejected for exceeding the maximum.
    pub query_payload_too_large_size: Histogram,
    /// Size of the query payload string.
    pub query_payload_size: Histogram,
    /// The time to validate the query.
    pub query_validation_latency: Histogram,
    /// The time needed to resolve and get the result for the request.
    pub query_latency: Histogram,
    /// Number of errors, labelled by path and error type.
    pub num_errors: IntCounterVec,
    /// Total number of queries.
    pub num_queries: IntCounter,
    /// Number of queries for each top level node, labelled by path.
    pub num_queries_top_level: IntCounterVec,
    /// Number of queries that are being resolved at a moment in time.
    pub inflight_requests: Gauge,
}
87
88impl Metrics {
89 pub(crate) fn new(registry: &Registry) -> Self {
90 let db_metrics = DBMetrics::new(registry);
91 let request_metrics = RequestMetrics::new(registry);
92
93 Self {
94 db_metrics: Arc::new(db_metrics),
95 request_metrics: Arc::new(request_metrics),
96 }
97 }
98
99 pub(crate) fn observe_db_data(&self, time: Duration, succeeded: bool) {
101 let label = if succeeded { "success" } else { "error" };
102 self.db_metrics.db_fetches.with_label_values(&[label]).inc();
103 self.db_metrics
104 .db_fetch_latency
105 .with_label_values(&[label])
106 .observe(time.as_secs_f64());
107 }
108
109 pub(crate) fn query_latency(&self, time: Duration) {
111 self.request_metrics
112 .query_latency
113 .observe(time.as_secs_f64());
114 }
115
116 pub(crate) fn query_validation_latency(&self, time: Duration) {
118 self.request_metrics
119 .query_validation_latency
120 .observe(time.as_secs_f64());
121 }
122
123 pub(crate) fn inc_num_queries(&self) {
125 self.request_metrics.num_queries.inc();
126 }
127
128 pub(crate) fn inc_errors(&self, errors: &[ServerError]) {
132 for err in errors {
133 if let Some(ext) = &err.extensions {
134 if let Some(async_graphql_value::ConstValue::String(val)) = ext.get("code") {
135 self.request_metrics
136 .num_errors
137 .with_label_values(&[query_label_for_error(&err.path).as_str(), val])
138 .inc();
139 }
140 } else {
141 self.request_metrics
142 .num_errors
143 .with_label_values(&[query_label_for_error(&err.path).as_str(), code::UNKNOWN])
144 .inc();
145 }
146 }
147 }
148}
149
150impl DBMetrics {
151 pub(crate) fn new(registry: &Registry) -> Self {
152 Self {
153 db_fetches: register_int_counter_vec_with_registry!(
154 "db_fetches",
155 "The number of fetches grouped by result (success or error)",
156 &["type"],
157 registry
158 )
159 .unwrap(),
160 db_fetch_latency: register_histogram_vec_with_registry!(
161 "db_fetch_latency",
162 "The fetch latency grouped by result (success or error)",
163 &["type"],
164 DB_LATENCY_SEC_BUCKETS.to_vec(),
165 registry,
166 )
167 .unwrap(),
168 _db_query_cost: register_histogram_with_registry!(
169 "db_query_cost",
170 "Cost of a DB query",
171 DB_QUERY_COST_BUCKETS.to_vec(),
172 registry,
173 )
174 .unwrap(),
175 _db_fetch_batch_size: register_histogram_vec_with_registry!(
176 "db_fetch_batch_size",
177 "Number of ids fetched per batch",
178 &["type"],
179 registry,
180 )
181 .unwrap(),
182 }
183 }
184}
185
186impl RequestMetrics {
187 pub(crate) fn new(registry: &Registry) -> Self {
188 Self {
189 input_nodes: register_histogram_with_registry!(
190 "input_nodes",
191 "Number of input nodes in the query",
192 INPUT_NODES_BUCKETS.to_vec(),
193 registry,
194 )
195 .unwrap(),
196 output_nodes: register_histogram_with_registry!(
197 "output_nodes",
198 "Number of output nodes in the response",
199 OUTPUT_NODES_BUCKETS.to_vec(),
200 registry,
201 )
202 .unwrap(),
203 query_depth: register_histogram_with_registry!(
204 "query_depth",
205 "Depth of the query",
206 QUERY_DEPTH_BUCKETS.to_vec(),
207 registry
208 )
209 .unwrap(),
210 query_payload_too_large_size: register_histogram_with_registry!(
211 "query_payload_too_large_size",
212 "Query payload size (bytes), that was rejected due to being larger than maximum",
213 QUERY_PAYLOAD_SIZE_BUCKETS.to_vec(),
214 registry,
215 )
216 .unwrap(),
217 query_payload_size: register_histogram_with_registry!(
218 "query_payload_size",
219 "Size of the query payload string",
220 QUERY_PAYLOAD_SIZE_BUCKETS.to_vec(),
221 registry,
222 )
223 .unwrap(),
224 query_validation_latency: register_histogram_with_registry!(
225 "query_validation_latency",
226 "The time to validate the query",
227 LATENCY_SEC_BUCKETS.to_vec(),
228 registry,
229 )
230 .unwrap(),
231 query_latency: register_histogram_with_registry!(
232 "query_latency",
233 "The time needed to resolve and get the result for the request",
234 LATENCY_SEC_BUCKETS.to_vec(),
235 registry,
236 )
237 .unwrap(),
238 num_errors: register_int_counter_vec_with_registry!(
239 "num_errors",
240 "Number of errors by path and error type",
241 &["path", "type"],
242 registry,
243 )
244 .unwrap(),
245 num_queries: register_int_counter_with_registry!(
246 "num_queries",
247 "Total number of queries",
248 registry
249 )
250 .unwrap(),
251 num_queries_top_level: register_int_counter_vec_with_registry!(
252 "num_queries_top_level",
253 "Number of queries for each top level node",
254 &["path"],
255 registry
256 )
257 .unwrap(),
258 inflight_requests: register_gauge_with_registry!(
259 "inflight_requests",
260 "Number of queries that are being resolved at a moment in time",
261 registry
262 )
263 .unwrap(),
264 }
265 }
266}
267
268pub(crate) fn query_label_for_error(query: &[PathSegment]) -> String {
271 let fields: Vec<_> = query
272 .iter()
273 .filter_map(|s| {
274 if let PathSegment::Field(name) = s {
275 Some(name)
276 } else {
277 None
278 }
279 })
280 .collect();
281
282 match &fields[..] {
283 [] => "".to_string(),
284 [seg] => seg.to_string(),
285 [fst, .., lst] => format!("{fst}..{lst}"),
286 }
287}