typed_store/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::{
6    cell::RefCell,
7    sync::{
8        Arc,
9        atomic::{AtomicU64, Ordering},
10    },
11    time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16    HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17    register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24    static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext>  = RefCell::new(PerfContext::default());
25}
26
27const LATENCY_SEC_BUCKETS: &[f64] = &[
28    0.00001, 0.00005, // 10 mcs, 50 mcs
29    0.0001, 0.0002, 0.0003, 0.0004, 0.0005, // 100..500 mcs
30    0.001, 0.002, 0.003, 0.004, 0.005, // 1..5ms
31    0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 10.,
32];
33
34#[derive(Debug, Clone)]
35// A struct for sampling based on number of operations or duration.
36// Sampling happens if the duration expires and after number of operations
37pub struct SamplingInterval {
38    // Sample once every time duration
39    pub once_every_duration: Duration,
40    // Sample once every number of operations
41    pub after_num_ops: u64,
42    // Counter for keeping track of previous sample
43    pub counter: Arc<AtomicU64>,
44}
45
46impl Default for SamplingInterval {
47    fn default() -> Self {
48        // Enabled with 60 second interval
49        SamplingInterval::new(Duration::from_secs(60), 0)
50    }
51}
52
53impl SamplingInterval {
54    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
55        let counter = Arc::new(AtomicU64::new(1));
56        if !once_every_duration.is_zero() {
57            let counter = counter.clone();
58            tokio::task::spawn(async move {
59                loop {
60                    if counter.load(Ordering::SeqCst) > after_num_ops {
61                        counter.store(0, Ordering::SeqCst);
62                    }
63                    tokio::time::sleep(once_every_duration).await;
64                }
65            });
66        }
67        SamplingInterval {
68            once_every_duration,
69            after_num_ops,
70            counter,
71        }
72    }
73    pub fn new_from_self(&self) -> SamplingInterval {
74        SamplingInterval::new(self.once_every_duration, self.after_num_ops)
75    }
76    pub fn sample(&self) -> bool {
77        if self.once_every_duration.is_zero() {
78            self.counter
79                .fetch_add(1, Ordering::Relaxed)
80                .is_multiple_of(self.after_num_ops + 1)
81        } else {
82            self.counter.fetch_add(1, Ordering::Relaxed) == 0
83        }
84    }
85}
86
87#[derive(Debug)]
88pub struct ColumnFamilyMetrics {
89    pub rocksdb_total_sst_files_size: IntGaugeVec,
90    pub rocksdb_total_blob_files_size: IntGaugeVec,
91    pub rocksdb_total_num_files: IntGaugeVec,
92    pub rocksdb_num_level0_files: IntGaugeVec,
93    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
94    pub rocksdb_size_all_mem_tables: IntGaugeVec,
95    pub rocksdb_num_snapshots: IntGaugeVec,
96    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
97    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
98    pub rocksdb_is_write_stopped: IntGaugeVec,
99    pub rocksdb_block_cache_capacity: IntGaugeVec,
100    pub rocksdb_block_cache_usage: IntGaugeVec,
101    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
102    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
103    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
104    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
105    pub rocksdb_compaction_pending: IntGaugeVec,
106    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
107    pub rocksdb_num_running_compactions: IntGaugeVec,
108    pub rocksdb_num_running_flushes: IntGaugeVec,
109    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
110    pub rocksdb_background_errors: IntGaugeVec,
111    pub rocksdb_estimated_num_keys: IntGaugeVec,
112    pub rocksdb_base_level: IntGaugeVec,
113}
114
115impl ColumnFamilyMetrics {
116    pub(crate) fn new(registry: &Registry) -> Self {
117        ColumnFamilyMetrics {
118            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
119                "rocksdb_total_sst_files_size",
120                "The storage size occupied by the sst files in the column family",
121                &["cf_name"],
122                registry,
123            )
124            .unwrap(),
125            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
126                "rocksdb_total_blob_files_size",
127                "The storage size occupied by the blob files in the column family",
128                &["cf_name"],
129                registry,
130            )
131            .unwrap(),
132            rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
133                "rocksdb_total_num_files",
134                "Total number of files used in the column family",
135                &["cf_name"],
136                registry,
137            )
138            .unwrap(),
139            rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
140                "rocksdb_num_level0_files",
141                "Number of level 0 files in the column family",
142                &["cf_name"],
143                registry,
144            )
145            .unwrap(),
146            rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
147                "rocksdb_current_size_active_mem_tables",
148                "The current approximate size of active memtable (bytes).",
149                &["cf_name"],
150                registry,
151            )
152            .unwrap(),
153            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
154                "rocksdb_size_all_mem_tables",
155                "The memory size occupied by the column family's in-memory buffer",
156                &["cf_name"],
157                registry,
158            )
159            .unwrap(),
160            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
161                "rocksdb_num_snapshots",
162                "Number of snapshots held for the column family",
163                &["cf_name"],
164                registry,
165            )
166            .unwrap(),
167            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
168                "rocksdb_oldest_snapshot_time",
169                "Unit timestamp of the oldest unreleased snapshot",
170                &["cf_name"],
171                registry,
172            )
173            .unwrap(),
174            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
175                "rocksdb_actual_delayed_write_rate",
176                "The current actual delayed write rate. 0 means no delay",
177                &["cf_name"],
178                registry,
179            )
180            .unwrap(),
181            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
182                "rocksdb_is_write_stopped",
183                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
184                &["cf_name"],
185                registry,
186            )
187            .unwrap(),
188            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
189                "rocksdb_block_cache_capacity",
190                "The block cache capacity of the column family.",
191                &["cf_name"],
192                registry,
193            )
194            .unwrap(),
195            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
196                "rocksdb_block_cache_usage",
197                "The memory size used by the column family in the block cache.",
198                &["cf_name"],
199                registry,
200            )
201            .unwrap(),
202            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
203                "rocksdb_block_cache_pinned_usage",
204                "The memory size used by the column family in the block cache where entries are pinned",
205                &["cf_name"],
206                registry,
207            )
208            .unwrap(),
209            rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
210                "rocksdb_estimate_table_readers_mem",
211                "The estimated memory size used for reading SST tables in this column
212                family such as filters and index blocks. Note that this number does not
213                include the memory used in block cache.",
214                &["cf_name"],
215                registry,
216            )
217            .unwrap(),
218            rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
219                "rocksdb_num_immutable_mem_tables",
220                "The number of immutable memtables that have not yet been flushed.",
221                &["cf_name"],
222                registry,
223            )
224            .unwrap(),
225            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
226                "rocksdb_mem_table_flush_pending",
227                "A 1 or 0 flag indicating whether a memtable flush is pending.
228                If this number is 1, it means a memtable is waiting for being flushed,
229                but there might be too many L0 files that prevents it from being flushed.",
230                &["cf_name"],
231                registry,
232            )
233            .unwrap(),
234            rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
235                "rocksdb_compaction_pending",
236                "A 1 or 0 flag indicating whether a compaction job is pending.
237                If this number is 1, it means some part of the column family requires
238                compaction in order to maintain shape of LSM tree, but the compaction
239                is pending because the desired compaction job is either waiting for
240                other dependent compactions to be finished or waiting for an available
241                compaction thread.",
242                &["cf_name"],
243                registry,
244            )
245            .unwrap(),
246            rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
247                "rocksdb_estimate_pending_compaction_bytes",
248                "Estimated total number of bytes compaction needs to rewrite to get all levels down
249                to under target size. Not valid for other compactions than level-based.",
250                &["cf_name"],
251                registry,
252            )
253            .unwrap(),
254            rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
255                "rocksdb_num_running_compactions",
256                "The number of compactions that are currently running for the column family.",
257                &["cf_name"],
258                registry,
259            )
260            .unwrap(),
261            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
262                "rocksdb_num_running_flushes",
263                "The number of flushes that are currently running for the column family.",
264                &["cf_name"],
265                registry,
266            )
267            .unwrap(),
268            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
269                "rocksdb_estimate_oldest_key_time",
270                "Estimation of the oldest key timestamp in the DB. Only available
271                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
272                &["cf_name"],
273                registry,
274            )
275            .unwrap(),
276            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
277                "rocksdb_estimated_num_keys",
278                "The estimated number of keys in the table",
279                &["cf_name"],
280                registry,
281            )
282            .unwrap(),
283            rocksdb_background_errors: register_int_gauge_vec_with_registry!(
284                "rocksdb_background_errors",
285                "The accumulated number of RocksDB background errors.",
286                &["cf_name"],
287                registry,
288            )
289            .unwrap(),
290            rocksdb_base_level: register_int_gauge_vec_with_registry!(
291                "rocksdb_base_level",
292                "The number of level to which L0 data will be compacted.",
293                &["cf_name"],
294                registry,
295            )
296            .unwrap(),
297        }
298    }
299}
300
301#[derive(Debug)]
302pub struct OperationMetrics {
303    pub rocksdb_iter_latency_seconds: HistogramVec,
304    pub rocksdb_iter_bytes: HistogramVec,
305    pub rocksdb_iter_keys: HistogramVec,
306    pub rocksdb_get_latency_seconds: HistogramVec,
307    pub rocksdb_get_bytes: HistogramVec,
308    pub rocksdb_multiget_latency_seconds: HistogramVec,
309    pub rocksdb_multiget_bytes: HistogramVec,
310    pub rocksdb_put_latency_seconds: HistogramVec,
311    pub rocksdb_put_bytes: HistogramVec,
312    pub rocksdb_batch_put_bytes: HistogramVec,
313    pub rocksdb_delete_latency_seconds: HistogramVec,
314    pub rocksdb_deletes: IntCounterVec,
315    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
316    pub rocksdb_batch_commit_bytes: HistogramVec,
317    pub rocksdb_num_active_db_handles: IntGaugeVec,
318    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
319    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
320    pub rocksdb_very_slow_puts_count: IntCounterVec,
321    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
322}
323
324impl OperationMetrics {
325    pub(crate) fn new(registry: &Registry) -> Self {
326        OperationMetrics {
327            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
328                "rocksdb_iter_latency_seconds",
329                "Rocksdb iter latency in seconds",
330                &["cf_name"],
331                LATENCY_SEC_BUCKETS.to_vec(),
332                registry,
333            )
334            .unwrap(),
335            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
336                "rocksdb_iter_bytes",
337                "Rocksdb iter size in bytes",
338                &["cf_name"],
339                prometheus::exponential_buckets(1.0, 4.0, 15)
340                    .unwrap()
341                    .to_vec(),
342                registry,
343            )
344            .unwrap(),
345            rocksdb_iter_keys: register_histogram_vec_with_registry!(
346                "rocksdb_iter_keys",
347                "Rocksdb iter num keys",
348                &["cf_name"],
349                registry,
350            )
351            .unwrap(),
352            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
353                "rocksdb_get_latency_seconds",
354                "Rocksdb get latency in seconds",
355                &["cf_name"],
356                LATENCY_SEC_BUCKETS.to_vec(),
357                registry,
358            )
359            .unwrap(),
360            rocksdb_get_bytes: register_histogram_vec_with_registry!(
361                "rocksdb_get_bytes",
362                "Rocksdb get call returned data size in bytes",
363                &["cf_name"],
364                prometheus::exponential_buckets(1.0, 4.0, 15)
365                    .unwrap()
366                    .to_vec(),
367                registry
368            )
369            .unwrap(),
370            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
371                "rocksdb_multiget_latency_seconds",
372                "Rocksdb multiget latency in seconds",
373                &["cf_name"],
374                LATENCY_SEC_BUCKETS.to_vec(),
375                registry,
376            )
377            .unwrap(),
378            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
379                "rocksdb_multiget_bytes",
380                "Rocksdb multiget call returned data size in bytes",
381                &["cf_name"],
382                prometheus::exponential_buckets(1.0, 4.0, 15)
383                    .unwrap()
384                    .to_vec(),
385                registry,
386            )
387            .unwrap(),
388            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
389                "rocksdb_put_latency_seconds",
390                "Rocksdb put latency in seconds",
391                &["cf_name"],
392                LATENCY_SEC_BUCKETS.to_vec(),
393                registry,
394            )
395            .unwrap(),
396            rocksdb_put_bytes: register_histogram_vec_with_registry!(
397                "rocksdb_put_bytes",
398                "Rocksdb put call puts data size in bytes",
399                &["cf_name"],
400                prometheus::exponential_buckets(1.0, 4.0, 15)
401                    .unwrap()
402                    .to_vec(),
403                registry,
404            )
405            .unwrap(),
406            rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
407                "rocksdb_batch_put_bytes",
408                "Rocksdb batch put call puts data size in bytes",
409                &["cf_name"],
410                prometheus::exponential_buckets(1.0, 4.0, 15)
411                    .unwrap()
412                    .to_vec(),
413                registry,
414            )
415            .unwrap(),
416            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
417                "rocksdb_delete_latency_seconds",
418                "Rocksdb delete latency in seconds",
419                &["cf_name"],
420                LATENCY_SEC_BUCKETS.to_vec(),
421                registry,
422            )
423            .unwrap(),
424            rocksdb_deletes: register_int_counter_vec_with_registry!(
425                "rocksdb_deletes",
426                "Rocksdb delete calls",
427                &["cf_name"],
428                registry
429            )
430            .unwrap(),
431            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
432                "rocksdb_write_batch_commit_latency_seconds",
433                "Rocksdb schema batch commit latency in seconds",
434                &["db_name"],
435                LATENCY_SEC_BUCKETS.to_vec(),
436                registry,
437            )
438            .unwrap(),
439            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
440                "rocksdb_batch_commit_bytes",
441                "Rocksdb schema batch commit size in bytes",
442                &["db_name"],
443                prometheus::exponential_buckets(1.0, 4.0, 15)
444                    .unwrap()
445                    .to_vec(),
446                registry,
447            )
448            .unwrap(),
449            rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
450                "rocksdb_num_active_db_handles",
451                "Number of active db handles",
452                &["db_name"],
453                registry,
454            )
455            .unwrap(),
456            rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
457                "rocksdb_num_very_slow_batch_writes",
458                "Number of batch writes that took more than 1 second",
459                &["db_name"],
460                registry,
461            )
462            .unwrap(),
463            rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
464                "rocksdb_very_slow_batch_writes_duration",
465                "Total duration of batch writes that took more than 1 second",
466                &["db_name"],
467                registry,
468            )
469            .unwrap(),
470            rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
471                "rocksdb_num_very_slow_puts",
472                "Number of puts that took more than 1 second",
473                &["cf_name"],
474                registry,
475            )
476            .unwrap(),
477            rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
478                "rocksdb_very_slow_puts_duration",
479                "Total duration of puts that took more than 1 second",
480                &["cf_name"],
481                registry,
482            )
483            .unwrap(),
484        }
485    }
486}
487
488pub struct RocksDBPerfContext;
489
490impl Default for RocksDBPerfContext {
491    fn default() -> Self {
492        set_perf_stats(PerfStatsLevel::EnableTime);
493        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
494            perf_context.borrow_mut().reset();
495        });
496        RocksDBPerfContext {}
497    }
498}
499
500impl Drop for RocksDBPerfContext {
501    fn drop(&mut self) {
502        set_perf_stats(PerfStatsLevel::Disable);
503    }
504}
505
506#[derive(Debug)]
507pub struct ReadPerfContextMetrics {
508    pub user_key_comparison_count: IntCounterVec,
509    pub block_cache_hit_count: IntCounterVec,
510    pub block_read_count: IntCounterVec,
511    pub block_read_byte: IntCounterVec,
512    pub block_read_nanos: IntCounterVec,
513    pub block_checksum_nanos: IntCounterVec,
514    pub block_decompress_nanos: IntCounterVec,
515    pub get_read_bytes: IntCounterVec,
516    pub multiget_read_bytes: IntCounterVec,
517    pub get_snapshot_nanos: IntCounterVec,
518    pub get_from_memtable_nanos: IntCounterVec,
519    pub get_from_memtable_count: IntCounterVec,
520    pub get_post_process_nanos: IntCounterVec,
521    pub get_from_output_files_nanos: IntCounterVec,
522    pub db_mutex_lock_nanos: IntCounterVec,
523    pub db_condition_wait_nanos: IntCounterVec,
524    pub merge_operator_nanos: IntCounterVec,
525    pub read_index_block_nanos: IntCounterVec,
526    pub read_filter_block_nanos: IntCounterVec,
527    pub new_table_block_iter_nanos: IntCounterVec,
528    pub block_seek_nanos: IntCounterVec,
529    pub find_table_nanos: IntCounterVec,
530    pub bloom_memtable_hit_count: IntCounterVec,
531    pub bloom_memtable_miss_count: IntCounterVec,
532    pub bloom_sst_hit_count: IntCounterVec,
533    pub bloom_sst_miss_count: IntCounterVec,
534    pub key_lock_wait_time: IntCounterVec,
535    pub key_lock_wait_count: IntCounterVec,
536    pub internal_delete_skipped_count: IntCounterVec,
537    pub internal_skipped_count: IntCounterVec,
538}
539
540impl ReadPerfContextMetrics {
541    pub(crate) fn new(registry: &Registry) -> Self {
542        ReadPerfContextMetrics {
543            user_key_comparison_count: register_int_counter_vec_with_registry!(
544                "user_key_comparison_count",
545                "Helps us figure out whether too many comparisons in binary search can be a problem,
546                especially when a more expensive comparator is used. Moreover, since number of comparisons
547                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
548                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
549                You may want to check whether flush/compaction can keep up with the write speed",
550                &["cf_name"],
551                registry,
552            )
553            .unwrap(),
554            block_cache_hit_count: register_int_counter_vec_with_registry!(
555                "block_cache_hit_count",
556                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
557                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
558                We can evaluate the block cache efficiency by looking at the two counters over time.",
559                &["cf_name"],
560                registry,
561            )
562            .unwrap(),
563            block_read_count: register_int_counter_vec_with_registry!(
564                "block_read_count",
565                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
566                &["cf_name"],
567                registry,
568            )
569            .unwrap(),
570            block_read_byte: register_int_counter_vec_with_registry!(
571                "block_read_byte",
572                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
573                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
574                of a very large key or value",
575                &["cf_name"],
576                registry,
577            )
578            .unwrap(),
579            block_read_nanos: register_int_counter_vec_with_registry!(
580                "block_read_nanos",
581                "Total nanos spent on block reads",
582                &["cf_name"],
583                registry,
584            )
585            .unwrap(),
586            block_checksum_nanos: register_int_counter_vec_with_registry!(
587                "block_checksum_nanos",
588                "Total nanos spent on verifying block checksum",
589                &["cf_name"],
590                registry,
591            )
592            .unwrap(),
593            block_decompress_nanos: register_int_counter_vec_with_registry!(
594                "block_decompress_nanos",
595                "Total nanos spent on decompressing a block",
596                &["cf_name"],
597                registry,
598            )
599            .unwrap(),
600            get_read_bytes: register_int_counter_vec_with_registry!(
601                "get_read_bytes",
602                "Total bytes for values returned by Get",
603                &["cf_name"],
604                registry,
605            )
606            .unwrap(),
607            multiget_read_bytes: register_int_counter_vec_with_registry!(
608                "multiget_read_bytes",
609                "Total bytes for values returned by MultiGet.",
610                &["cf_name"],
611                registry,
612            )
613            .unwrap(),
614            get_snapshot_nanos: register_int_counter_vec_with_registry!(
615                "get_snapshot_nanos",
616                "Time spent in getting snapshot.",
617                &["cf_name"],
618                registry,
619            )
620            .unwrap(),
621            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
622                "get_from_memtable_nanos",
623                "Time spent on reading data from memtable.",
624                &["cf_name"],
625                registry,
626            )
627            .unwrap(),
628            get_from_memtable_count: register_int_counter_vec_with_registry!(
629                "get_from_memtable_count",
630                "Number of memtables queried",
631                &["cf_name"],
632                registry,
633            )
634            .unwrap(),
635            get_post_process_nanos: register_int_counter_vec_with_registry!(
636                "get_post_process_nanos",
637                "Total nanos spent after Get() finds a key",
638                &["cf_name"],
639                registry,
640            )
641            .unwrap(),
642            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
643                "get_from_output_files_nanos",
644                "Total nanos reading from output files",
645                &["cf_name"],
646                registry,
647            )
648            .unwrap(),
649            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
650                "db_mutex_lock_nanos",
651                "Time spent on acquiring db mutex",
652                &["cf_name"],
653                registry,
654            )
655            .unwrap(),
656            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
657                "db_condition_wait_nanos",
658                "Time spent waiting with a condition variable created with DB Mutex.",
659                &["cf_name"],
660                registry,
661            )
662            .unwrap(),
663            merge_operator_nanos: register_int_counter_vec_with_registry!(
664                "merge_operator_nanos",
665                "Time spent on merge operator.",
666                &["cf_name"],
667                registry,
668            )
669            .unwrap(),
670            read_index_block_nanos: register_int_counter_vec_with_registry!(
671                "read_index_block_nanos",
672                "Time spent on reading index block from block cache or SST file",
673                &["cf_name"],
674                registry,
675            )
676            .unwrap(),
677            read_filter_block_nanos: register_int_counter_vec_with_registry!(
678                "read_filter_block_nanos",
679                "Time spent on reading filter block from block cache or SST file",
680                &["cf_name"],
681                registry,
682            )
683            .unwrap(),
684            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
685                "new_table_block_iter_nanos",
686                "Time spent on creating data block iterator",
687                &["cf_name"],
688                registry,
689            )
690            .unwrap(),
691            block_seek_nanos: register_int_counter_vec_with_registry!(
692                "block_seek_nanos",
693                "Time spent on seeking a key in data/index blocks",
694                &["cf_name"],
695                registry,
696            )
697            .unwrap(),
698            find_table_nanos: register_int_counter_vec_with_registry!(
699                "find_table_nanos",
700                "Time spent on finding or creating a table reader",
701                &["cf_name"],
702                registry,
703            )
704            .unwrap(),
705            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
706                "bloom_memtable_hit_count",
707                "Total number of mem table bloom hits",
708                &["cf_name"],
709                registry,
710            )
711            .unwrap(),
712            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
713                "bloom_memtable_miss_count",
714                "Total number of mem table bloom misses",
715                &["cf_name"],
716                registry,
717            )
718            .unwrap(),
719            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
720                "bloom_sst_hit_count",
721                "Total number of SST table bloom hits",
722                &["cf_name"],
723                registry,
724            )
725            .unwrap(),
726            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
727                "bloom_sst_miss_count",
728                "Total number of SST table bloom misses",
729                &["cf_name"],
730                registry,
731            )
732            .unwrap(),
733            key_lock_wait_time: register_int_counter_vec_with_registry!(
734                "key_lock_wait_time",
735                "Time spent waiting on key locks in transaction lock manager",
736                &["cf_name"],
737                registry,
738            )
739            .unwrap(),
740            key_lock_wait_count: register_int_counter_vec_with_registry!(
741                "key_lock_wait_count",
742                "Number of times acquiring a lock was blocked by another transaction",
743                &["cf_name"],
744                registry,
745            )
746            .unwrap(),
747            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
748                "internal_delete_skipped_count",
749                "Total number of deleted keys skipped during iteration",
750                &["cf_name"],
751                registry,
752            )
753                .unwrap(),
754            internal_skipped_count: register_int_counter_vec_with_registry!(
755                "internal_skipped_count",
756                "Totall number of internal keys skipped during iteration",
757                &["cf_name"],
758                registry,
759            )
760                .unwrap(),
761        }
762    }
763
764    pub fn report_metrics(&self, cf_name: &str) {
765        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
766            set_perf_stats(PerfStatsLevel::Disable);
767            let perf_context = perf_context_cell.borrow();
768            self.user_key_comparison_count
769                .with_label_values(&[cf_name])
770                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
771            self.block_cache_hit_count
772                .with_label_values(&[cf_name])
773                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
774            self.block_read_count
775                .with_label_values(&[cf_name])
776                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
777            self.block_read_byte
778                .with_label_values(&[cf_name])
779                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
780            self.block_read_nanos
781                .with_label_values(&[cf_name])
782                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
783            self.block_read_count
784                .with_label_values(&[cf_name])
785                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
786            self.block_checksum_nanos
787                .with_label_values(&[cf_name])
788                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
789            self.block_decompress_nanos
790                .with_label_values(&[cf_name])
791                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
792            self.get_read_bytes
793                .with_label_values(&[cf_name])
794                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
795            self.multiget_read_bytes
796                .with_label_values(&[cf_name])
797                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
798            self.get_snapshot_nanos
799                .with_label_values(&[cf_name])
800                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
801            self.get_from_memtable_nanos
802                .with_label_values(&[cf_name])
803                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
804            self.get_from_memtable_count
805                .with_label_values(&[cf_name])
806                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
807            self.get_post_process_nanos
808                .with_label_values(&[cf_name])
809                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
810            self.get_from_output_files_nanos
811                .with_label_values(&[cf_name])
812                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
813            self.db_mutex_lock_nanos
814                .with_label_values(&[cf_name])
815                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
816            self.db_condition_wait_nanos
817                .with_label_values(&[cf_name])
818                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
819            self.merge_operator_nanos
820                .with_label_values(&[cf_name])
821                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
822            self.read_index_block_nanos
823                .with_label_values(&[cf_name])
824                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
825            self.read_filter_block_nanos
826                .with_label_values(&[cf_name])
827                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
828            self.new_table_block_iter_nanos
829                .with_label_values(&[cf_name])
830                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
831            self.block_seek_nanos
832                .with_label_values(&[cf_name])
833                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
834            self.find_table_nanos
835                .with_label_values(&[cf_name])
836                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
837            self.bloom_memtable_hit_count
838                .with_label_values(&[cf_name])
839                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
840            self.bloom_memtable_miss_count
841                .with_label_values(&[cf_name])
842                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
843            self.bloom_sst_hit_count
844                .with_label_values(&[cf_name])
845                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
846            self.bloom_sst_miss_count
847                .with_label_values(&[cf_name])
848                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
849            self.key_lock_wait_time
850                .with_label_values(&[cf_name])
851                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
852            self.key_lock_wait_count
853                .with_label_values(&[cf_name])
854                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
855            self.internal_delete_skipped_count
856                .with_label_values(&[cf_name])
857                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
858            self.internal_skipped_count
859                .with_label_values(&[cf_name])
860                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
861        });
862    }
863}
864
865#[derive(Debug)]
866pub struct WritePerfContextMetrics {
867    pub write_wal_nanos: IntCounterVec,
868    pub write_memtable_nanos: IntCounterVec,
869    pub write_delay_nanos: IntCounterVec,
870    pub write_pre_and_post_process_nanos: IntCounterVec,
871    pub write_db_mutex_lock_nanos: IntCounterVec,
872    pub write_db_condition_wait_nanos: IntCounterVec,
873    pub write_key_lock_wait_nanos: IntCounterVec,
874    pub write_key_lock_wait_count: IntCounterVec,
875}
876
877impl WritePerfContextMetrics {
878    pub(crate) fn new(registry: &Registry) -> Self {
879        WritePerfContextMetrics {
880            write_wal_nanos: register_int_counter_vec_with_registry!(
881                "write_wal_nanos",
882                "Total nanos spent on writing to WAL",
883                &["cf_name"],
884                registry,
885            )
886            .unwrap(),
887            write_memtable_nanos: register_int_counter_vec_with_registry!(
888                "write_memtable_nanos",
889                "Total nanos spent on writing to memtable",
890                &["cf_name"],
891                registry,
892            )
893            .unwrap(),
894            write_delay_nanos: register_int_counter_vec_with_registry!(
895                "write_delay_nanos",
896                "Total nanos spent on delaying or throttling write",
897                &["cf_name"],
898                registry,
899            )
900            .unwrap(),
901            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
902                "write_pre_and_post_process_nanos",
903                "Total nanos spent on writing a record, excluding the above four things",
904                &["cf_name"],
905                registry,
906            )
907            .unwrap(),
908            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
909                "write_db_mutex_lock_nanos",
910                "Time spent on acquiring db mutex",
911                &["cf_name"],
912                registry,
913            )
914            .unwrap(),
915            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
916                "write_db_condition_wait_nanos",
917                "Time spent waiting with a condition variable created with DB Mutex.",
918                &["cf_name"],
919                registry,
920            )
921            .unwrap(),
922            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
923                "write_key_lock_wait_time",
924                "Time spent waiting on key locks in transaction lock manager",
925                &["cf_name"],
926                registry,
927            )
928            .unwrap(),
929            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
930                "write_key_lock_wait_count",
931                "Number of times acquiring a lock was blocked by another transaction",
932                &["cf_name"],
933                registry,
934            )
935            .unwrap(),
936        }
937    }
938    pub fn report_metrics(&self, db_name: &str) {
939        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
940            set_perf_stats(PerfStatsLevel::Disable);
941            let perf_context = perf_context_cell.borrow();
942            self.write_wal_nanos
943                .with_label_values(&[db_name])
944                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
945            self.write_memtable_nanos
946                .with_label_values(&[db_name])
947                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
948            self.write_delay_nanos
949                .with_label_values(&[db_name])
950                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
951            self.write_pre_and_post_process_nanos
952                .with_label_values(&[db_name])
953                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
954            self.write_db_mutex_lock_nanos
955                .with_label_values(&[db_name])
956                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
957            self.write_db_condition_wait_nanos
958                .with_label_values(&[db_name])
959                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
960            self.write_key_lock_wait_nanos
961                .with_label_values(&[db_name])
962                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
963            self.write_key_lock_wait_count
964                .with_label_values(&[db_name])
965                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
966        });
967    }
968}
969
970#[derive(Debug)]
971pub struct DBMetrics {
972    pub op_metrics: OperationMetrics,
973    pub cf_metrics: ColumnFamilyMetrics,
974    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
975    pub write_perf_ctx_metrics: WritePerfContextMetrics,
976}
977
978static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
979
980impl DBMetrics {
981    fn new(registry: &Registry) -> Self {
982        DBMetrics {
983            op_metrics: OperationMetrics::new(registry),
984            cf_metrics: ColumnFamilyMetrics::new(registry),
985            read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
986            write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
987        }
988    }
989    pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
990        // Initialize this before creating any instance of DBMap
991        // TODO: Remove static initialization because this basically means we can
992        // only ever initialize db metrics once with a registry whereas
993        // in the code we might want to initialize it with different
994        // registries. The problem is underlying metrics cannot be re-initialized
995        // or prometheus complains. We essentially need to pass in DBMetrics
996        // everywhere we create DBMap as the right fix
997        let _ = ONCE
998            .set(Arc::new(DBMetrics::new(registry)))
999            // this happens many times during tests
1000            .tap_err(|_| warn!("DBMetrics registry overwritten"));
1001        ONCE.get().unwrap()
1002    }
1003    pub fn increment_num_active_dbs(&self, db_name: &str) {
1004        self.op_metrics
1005            .rocksdb_num_active_db_handles
1006            .with_label_values(&[db_name])
1007            .inc();
1008    }
1009    pub fn decrement_num_active_dbs(&self, db_name: &str) {
1010        self.op_metrics
1011            .rocksdb_num_active_db_handles
1012            .with_label_values(&[db_name])
1013            .dec();
1014    }
1015    pub fn get() -> &'static Arc<DBMetrics> {
1016        ONCE.get()
1017            .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
1018    }
1019}