typed_store/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::{
6    cell::RefCell,
7    sync::{
8        Arc,
9        atomic::{AtomicU64, Ordering},
10    },
11    time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16    HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17    register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24    static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext>  = RefCell::new(PerfContext::default());
25}
26
/// Bucket upper bounds, in seconds, used by the latency histograms in this
/// file. They range from 1 ms up to 90 s.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.025, 0.05, // below 100 ms
    0.1, 0.25, 0.5, 1.0, 2.5, 5.0, // 100 ms up to 5 s
    10.0, 20.0, 30.0, 60.0, 90.0, // long tail
];
30
31#[derive(Debug, Clone)]
32// A struct for sampling based on number of operations or duration.
33// Sampling happens if the duration expires and after number of operations
34pub struct SamplingInterval {
35    // Sample once every time duration
36    pub once_every_duration: Duration,
37    // Sample once every number of operations
38    pub after_num_ops: u64,
39    // Counter for keeping track of previous sample
40    pub counter: Arc<AtomicU64>,
41}
42
43impl Default for SamplingInterval {
44    fn default() -> Self {
45        // Enabled with 60 second interval
46        SamplingInterval::new(Duration::from_secs(60), 0)
47    }
48}
49
50impl SamplingInterval {
51    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
52        let counter = Arc::new(AtomicU64::new(1));
53        if !once_every_duration.is_zero() {
54            let counter = counter.clone();
55            tokio::task::spawn(async move {
56                loop {
57                    if counter.load(Ordering::SeqCst) > after_num_ops {
58                        counter.store(0, Ordering::SeqCst);
59                    }
60                    tokio::time::sleep(once_every_duration).await;
61                }
62            });
63        }
64        SamplingInterval {
65            once_every_duration,
66            after_num_ops,
67            counter,
68        }
69    }
70    pub fn new_from_self(&self) -> SamplingInterval {
71        SamplingInterval::new(self.once_every_duration, self.after_num_ops)
72    }
73    pub fn sample(&self) -> bool {
74        if self.once_every_duration.is_zero() {
75            self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0
76        } else {
77            self.counter.fetch_add(1, Ordering::Relaxed) == 0
78        }
79    }
80}
81
82#[derive(Debug)]
83pub struct ColumnFamilyMetrics {
84    pub rocksdb_total_sst_files_size: IntGaugeVec,
85    pub rocksdb_total_blob_files_size: IntGaugeVec,
86    pub rocksdb_total_num_files: IntGaugeVec,
87    pub rocksdb_num_level0_files: IntGaugeVec,
88    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
89    pub rocksdb_size_all_mem_tables: IntGaugeVec,
90    pub rocksdb_num_snapshots: IntGaugeVec,
91    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
92    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
93    pub rocksdb_is_write_stopped: IntGaugeVec,
94    pub rocksdb_block_cache_capacity: IntGaugeVec,
95    pub rocksdb_block_cache_usage: IntGaugeVec,
96    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
97    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
98    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
99    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
100    pub rocksdb_compaction_pending: IntGaugeVec,
101    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
102    pub rocksdb_num_running_compactions: IntGaugeVec,
103    pub rocksdb_num_running_flushes: IntGaugeVec,
104    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
105    pub rocksdb_background_errors: IntGaugeVec,
106    pub rocksdb_estimated_num_keys: IntGaugeVec,
107    pub rocksdb_base_level: IntGaugeVec,
108}
109
110impl ColumnFamilyMetrics {
111    pub(crate) fn new(registry: &Registry) -> Self {
112        ColumnFamilyMetrics {
113            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
114                "rocksdb_total_sst_files_size",
115                "The storage size occupied by the sst files in the column family",
116                &["cf_name"],
117                registry,
118            )
119            .unwrap(),
120            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
121                "rocksdb_total_blob_files_size",
122                "The storage size occupied by the blob files in the column family",
123                &["cf_name"],
124                registry,
125            )
126            .unwrap(),
127            rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
128                "rocksdb_total_num_files",
129                "Total number of files used in the column family",
130                &["cf_name"],
131                registry,
132            )
133            .unwrap(),
134            rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
135                "rocksdb_num_level0_files",
136                "Number of level 0 files in the column family",
137                &["cf_name"],
138                registry,
139            )
140            .unwrap(),
141            rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
142                "rocksdb_current_size_active_mem_tables",
143                "The current approximate size of active memtable (bytes).",
144                &["cf_name"],
145                registry,
146            )
147            .unwrap(),
148            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
149                "rocksdb_size_all_mem_tables",
150                "The memory size occupied by the column family's in-memory buffer",
151                &["cf_name"],
152                registry,
153            )
154            .unwrap(),
155            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
156                "rocksdb_num_snapshots",
157                "Number of snapshots held for the column family",
158                &["cf_name"],
159                registry,
160            )
161            .unwrap(),
162            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
163                "rocksdb_oldest_snapshot_time",
164                "Unit timestamp of the oldest unreleased snapshot",
165                &["cf_name"],
166                registry,
167            )
168            .unwrap(),
169            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
170                "rocksdb_actual_delayed_write_rate",
171                "The current actual delayed write rate. 0 means no delay",
172                &["cf_name"],
173                registry,
174            )
175            .unwrap(),
176            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
177                "rocksdb_is_write_stopped",
178                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
179                &["cf_name"],
180                registry,
181            )
182            .unwrap(),
183            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
184                "rocksdb_block_cache_capacity",
185                "The block cache capacity of the column family.",
186                &["cf_name"],
187                registry,
188            )
189            .unwrap(),
190            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
191                "rocksdb_block_cache_usage",
192                "The memory size used by the column family in the block cache.",
193                &["cf_name"],
194                registry,
195            )
196            .unwrap(),
197            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
198                "rocksdb_block_cache_pinned_usage",
199                "The memory size used by the column family in the block cache where entries are pinned",
200                &["cf_name"],
201                registry,
202            )
203            .unwrap(),
204            rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
205                "rocksdb_estimate_table_readers_mem",
206                "The estimated memory size used for reading SST tables in this column
207                family such as filters and index blocks. Note that this number does not
208                include the memory used in block cache.",
209                &["cf_name"],
210                registry,
211            )
212            .unwrap(),
213            rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
214                "rocksdb_num_immutable_mem_tables",
215                "The number of immutable memtables that have not yet been flushed.",
216                &["cf_name"],
217                registry,
218            )
219            .unwrap(),
220            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
221                "rocksdb_mem_table_flush_pending",
222                "A 1 or 0 flag indicating whether a memtable flush is pending.
223                If this number is 1, it means a memtable is waiting for being flushed,
224                but there might be too many L0 files that prevents it from being flushed.",
225                &["cf_name"],
226                registry,
227            )
228            .unwrap(),
229            rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
230                "rocksdb_compaction_pending",
231                "A 1 or 0 flag indicating whether a compaction job is pending.
232                If this number is 1, it means some part of the column family requires
233                compaction in order to maintain shape of LSM tree, but the compaction
234                is pending because the desired compaction job is either waiting for
235                other dependent compactions to be finished or waiting for an available
236                compaction thread.",
237                &["cf_name"],
238                registry,
239            )
240            .unwrap(),
241            rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
242                "rocksdb_estimate_pending_compaction_bytes",
243                "Estimated total number of bytes compaction needs to rewrite to get all levels down
244                to under target size. Not valid for other compactions than level-based.",
245                &["cf_name"],
246                registry,
247            )
248            .unwrap(),
249            rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
250                "rocksdb_num_running_compactions",
251                "The number of compactions that are currently running for the column family.",
252                &["cf_name"],
253                registry,
254            )
255            .unwrap(),
256            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
257                "rocksdb_num_running_flushes",
258                "The number of flushes that are currently running for the column family.",
259                &["cf_name"],
260                registry,
261            )
262            .unwrap(),
263            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
264                "rocksdb_estimate_oldest_key_time",
265                "Estimation of the oldest key timestamp in the DB. Only available
266                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
267                &["cf_name"],
268                registry,
269            )
270            .unwrap(),
271            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
272                "rocksdb_estimated_num_keys",
273                "The estimated number of keys in the table",
274                &["cf_name"],
275                registry,
276            )
277            .unwrap(),
278            rocksdb_background_errors: register_int_gauge_vec_with_registry!(
279                "rocksdb_background_errors",
280                "The accumulated number of RocksDB background errors.",
281                &["cf_name"],
282                registry,
283            )
284            .unwrap(),
285            rocksdb_base_level: register_int_gauge_vec_with_registry!(
286                "rocksdb_base_level",
287                "The number of level to which L0 data will be compacted.",
288                &["cf_name"],
289                registry,
290            )
291            .unwrap(),
292        }
293    }
294}
295
296#[derive(Debug)]
297pub struct OperationMetrics {
298    pub rocksdb_iter_latency_seconds: HistogramVec,
299    pub rocksdb_iter_bytes: HistogramVec,
300    pub rocksdb_iter_keys: HistogramVec,
301    pub rocksdb_get_latency_seconds: HistogramVec,
302    pub rocksdb_get_bytes: HistogramVec,
303    pub rocksdb_multiget_latency_seconds: HistogramVec,
304    pub rocksdb_multiget_bytes: HistogramVec,
305    pub rocksdb_put_latency_seconds: HistogramVec,
306    pub rocksdb_put_bytes: HistogramVec,
307    pub rocksdb_batch_put_bytes: HistogramVec,
308    pub rocksdb_delete_latency_seconds: HistogramVec,
309    pub rocksdb_deletes: IntCounterVec,
310    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
311    pub rocksdb_batch_commit_bytes: HistogramVec,
312    pub rocksdb_num_active_db_handles: IntGaugeVec,
313    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
314    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
315    pub rocksdb_very_slow_puts_count: IntCounterVec,
316    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
317}
318
319impl OperationMetrics {
320    pub(crate) fn new(registry: &Registry) -> Self {
321        OperationMetrics {
322            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
323                "rocksdb_iter_latency_seconds",
324                "Rocksdb iter latency in seconds",
325                &["cf_name"],
326                LATENCY_SEC_BUCKETS.to_vec(),
327                registry,
328            )
329            .unwrap(),
330            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
331                "rocksdb_iter_bytes",
332                "Rocksdb iter size in bytes",
333                &["cf_name"],
334                prometheus::exponential_buckets(1.0, 4.0, 15)
335                    .unwrap()
336                    .to_vec(),
337                registry,
338            )
339            .unwrap(),
340            rocksdb_iter_keys: register_histogram_vec_with_registry!(
341                "rocksdb_iter_keys",
342                "Rocksdb iter num keys",
343                &["cf_name"],
344                registry,
345            )
346            .unwrap(),
347            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
348                "rocksdb_get_latency_seconds",
349                "Rocksdb get latency in seconds",
350                &["cf_name"],
351                LATENCY_SEC_BUCKETS.to_vec(),
352                registry,
353            )
354            .unwrap(),
355            rocksdb_get_bytes: register_histogram_vec_with_registry!(
356                "rocksdb_get_bytes",
357                "Rocksdb get call returned data size in bytes",
358                &["cf_name"],
359                prometheus::exponential_buckets(1.0, 4.0, 15)
360                    .unwrap()
361                    .to_vec(),
362                registry
363            )
364            .unwrap(),
365            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
366                "rocksdb_multiget_latency_seconds",
367                "Rocksdb multiget latency in seconds",
368                &["cf_name"],
369                LATENCY_SEC_BUCKETS.to_vec(),
370                registry,
371            )
372            .unwrap(),
373            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
374                "rocksdb_multiget_bytes",
375                "Rocksdb multiget call returned data size in bytes",
376                &["cf_name"],
377                prometheus::exponential_buckets(1.0, 4.0, 15)
378                    .unwrap()
379                    .to_vec(),
380                registry,
381            )
382            .unwrap(),
383            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
384                "rocksdb_put_latency_seconds",
385                "Rocksdb put latency in seconds",
386                &["cf_name"],
387                LATENCY_SEC_BUCKETS.to_vec(),
388                registry,
389            )
390            .unwrap(),
391            rocksdb_put_bytes: register_histogram_vec_with_registry!(
392                "rocksdb_put_bytes",
393                "Rocksdb put call puts data size in bytes",
394                &["cf_name"],
395                prometheus::exponential_buckets(1.0, 4.0, 15)
396                    .unwrap()
397                    .to_vec(),
398                registry,
399            )
400            .unwrap(),
401            rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
402                "rocksdb_batch_put_bytes",
403                "Rocksdb batch put call puts data size in bytes",
404                &["cf_name"],
405                prometheus::exponential_buckets(1.0, 4.0, 15)
406                    .unwrap()
407                    .to_vec(),
408                registry,
409            )
410            .unwrap(),
411            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
412                "rocksdb_delete_latency_seconds",
413                "Rocksdb delete latency in seconds",
414                &["cf_name"],
415                LATENCY_SEC_BUCKETS.to_vec(),
416                registry,
417            )
418            .unwrap(),
419            rocksdb_deletes: register_int_counter_vec_with_registry!(
420                "rocksdb_deletes",
421                "Rocksdb delete calls",
422                &["cf_name"],
423                registry
424            )
425            .unwrap(),
426            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
427                "rocksdb_write_batch_commit_latency_seconds",
428                "Rocksdb schema batch commit latency in seconds",
429                &["db_name"],
430                LATENCY_SEC_BUCKETS.to_vec(),
431                registry,
432            )
433            .unwrap(),
434            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
435                "rocksdb_batch_commit_bytes",
436                "Rocksdb schema batch commit size in bytes",
437                &["db_name"],
438                prometheus::exponential_buckets(1.0, 4.0, 15)
439                    .unwrap()
440                    .to_vec(),
441                registry,
442            )
443            .unwrap(),
444            rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
445                "rocksdb_num_active_db_handles",
446                "Number of active db handles",
447                &["db_name"],
448                registry,
449            )
450            .unwrap(),
451            rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
452                "rocksdb_num_very_slow_batch_writes",
453                "Number of batch writes that took more than 1 second",
454                &["db_name"],
455                registry,
456            )
457            .unwrap(),
458            rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
459                "rocksdb_very_slow_batch_writes_duration",
460                "Total duration of batch writes that took more than 1 second",
461                &["db_name"],
462                registry,
463            )
464            .unwrap(),
465            rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
466                "rocksdb_num_very_slow_puts",
467                "Number of puts that took more than 1 second",
468                &["cf_name"],
469                registry,
470            )
471            .unwrap(),
472            rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
473                "rocksdb_very_slow_puts_duration",
474                "Total duration of puts that took more than 1 second",
475                &["cf_name"],
476                registry,
477            )
478            .unwrap(),
479        }
480    }
481}
482
483pub struct RocksDBPerfContext;
484
485impl Default for RocksDBPerfContext {
486    fn default() -> Self {
487        set_perf_stats(PerfStatsLevel::EnableTime);
488        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
489            perf_context.borrow_mut().reset();
490        });
491        RocksDBPerfContext {}
492    }
493}
494
495impl Drop for RocksDBPerfContext {
496    fn drop(&mut self) {
497        set_perf_stats(PerfStatsLevel::Disable);
498    }
499}
500
/// Counters fed from the RocksDB perf context for read operations.
///
/// Every counter is registered with a single `cf_name` label. `report_metrics`
/// reads the calling thread's perf context and adds its values to these
/// counters for the given column family.
#[derive(Debug)]
pub struct ReadPerfContextMetrics {
    // Key comparison and block-level I/O.
    pub user_key_comparison_count: IntCounterVec,
    pub block_cache_hit_count: IntCounterVec,
    pub block_read_count: IntCounterVec,
    pub block_read_byte: IntCounterVec,
    pub block_read_nanos: IntCounterVec,
    pub block_checksum_nanos: IntCounterVec,
    pub block_decompress_nanos: IntCounterVec,
    // Get / MultiGet breakdown.
    pub get_read_bytes: IntCounterVec,
    pub multiget_read_bytes: IntCounterVec,
    pub get_snapshot_nanos: IntCounterVec,
    pub get_from_memtable_nanos: IntCounterVec,
    pub get_from_memtable_count: IntCounterVec,
    pub get_post_process_nanos: IntCounterVec,
    pub get_from_output_files_nanos: IntCounterVec,
    // Time spent on DB-level synchronization and merge operators.
    pub db_mutex_lock_nanos: IntCounterVec,
    pub db_condition_wait_nanos: IntCounterVec,
    pub merge_operator_nanos: IntCounterVec,
    // Index/filter/data block access inside SST files.
    pub read_index_block_nanos: IntCounterVec,
    pub read_filter_block_nanos: IntCounterVec,
    pub new_table_block_iter_nanos: IntCounterVec,
    pub block_seek_nanos: IntCounterVec,
    pub find_table_nanos: IntCounterVec,
    // Bloom filter effectiveness.
    pub bloom_memtable_hit_count: IntCounterVec,
    pub bloom_memtable_miss_count: IntCounterVec,
    pub bloom_sst_hit_count: IntCounterVec,
    pub bloom_sst_miss_count: IntCounterVec,
    // Transaction key locks.
    pub key_lock_wait_time: IntCounterVec,
    pub key_lock_wait_count: IntCounterVec,
    // Keys skipped while iterating.
    pub internal_delete_skipped_count: IntCounterVec,
    pub internal_skipped_count: IntCounterVec,
}
534
535impl ReadPerfContextMetrics {
536    pub(crate) fn new(registry: &Registry) -> Self {
537        ReadPerfContextMetrics {
538            user_key_comparison_count: register_int_counter_vec_with_registry!(
539                "user_key_comparison_count",
540                "Helps us figure out whether too many comparisons in binary search can be a problem,
541                especially when a more expensive comparator is used. Moreover, since number of comparisons
542                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
543                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
544                You may want to check whether flush/compaction can keep up with the write speed",
545                &["cf_name"],
546                registry,
547            )
548            .unwrap(),
549            block_cache_hit_count: register_int_counter_vec_with_registry!(
550                "block_cache_hit_count",
551                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
552                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
553                We can evaluate the block cache efficiency by looking at the two counters over time.",
554                &["cf_name"],
555                registry,
556            )
557            .unwrap(),
558            block_read_count: register_int_counter_vec_with_registry!(
559                "block_read_count",
560                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
561                &["cf_name"],
562                registry,
563            )
564            .unwrap(),
565            block_read_byte: register_int_counter_vec_with_registry!(
566                "block_read_byte",
567                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
568                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
569                of a very large key or value",
570                &["cf_name"],
571                registry,
572            )
573            .unwrap(),
574            block_read_nanos: register_int_counter_vec_with_registry!(
575                "block_read_nanos",
576                "Total nanos spent on block reads",
577                &["cf_name"],
578                registry,
579            )
580            .unwrap(),
581            block_checksum_nanos: register_int_counter_vec_with_registry!(
582                "block_checksum_nanos",
583                "Total nanos spent on verifying block checksum",
584                &["cf_name"],
585                registry,
586            )
587            .unwrap(),
588            block_decompress_nanos: register_int_counter_vec_with_registry!(
589                "block_decompress_nanos",
590                "Total nanos spent on decompressing a block",
591                &["cf_name"],
592                registry,
593            )
594            .unwrap(),
595            get_read_bytes: register_int_counter_vec_with_registry!(
596                "get_read_bytes",
597                "Total bytes for values returned by Get",
598                &["cf_name"],
599                registry,
600            )
601            .unwrap(),
602            multiget_read_bytes: register_int_counter_vec_with_registry!(
603                "multiget_read_bytes",
604                "Total bytes for values returned by MultiGet.",
605                &["cf_name"],
606                registry,
607            )
608            .unwrap(),
609            get_snapshot_nanos: register_int_counter_vec_with_registry!(
610                "get_snapshot_nanos",
611                "Time spent in getting snapshot.",
612                &["cf_name"],
613                registry,
614            )
615            .unwrap(),
616            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
617                "get_from_memtable_nanos",
618                "Time spent on reading data from memtable.",
619                &["cf_name"],
620                registry,
621            )
622            .unwrap(),
623            get_from_memtable_count: register_int_counter_vec_with_registry!(
624                "get_from_memtable_count",
625                "Number of memtables queried",
626                &["cf_name"],
627                registry,
628            )
629            .unwrap(),
630            get_post_process_nanos: register_int_counter_vec_with_registry!(
631                "get_post_process_nanos",
632                "Total nanos spent after Get() finds a key",
633                &["cf_name"],
634                registry,
635            )
636            .unwrap(),
637            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
638                "get_from_output_files_nanos",
639                "Total nanos reading from output files",
640                &["cf_name"],
641                registry,
642            )
643            .unwrap(),
644            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
645                "db_mutex_lock_nanos",
646                "Time spent on acquiring db mutex",
647                &["cf_name"],
648                registry,
649            )
650            .unwrap(),
651            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
652                "db_condition_wait_nanos",
653                "Time spent waiting with a condition variable created with DB Mutex.",
654                &["cf_name"],
655                registry,
656            )
657            .unwrap(),
658            merge_operator_nanos: register_int_counter_vec_with_registry!(
659                "merge_operator_nanos",
660                "Time spent on merge operator.",
661                &["cf_name"],
662                registry,
663            )
664            .unwrap(),
665            read_index_block_nanos: register_int_counter_vec_with_registry!(
666                "read_index_block_nanos",
667                "Time spent on reading index block from block cache or SST file",
668                &["cf_name"],
669                registry,
670            )
671            .unwrap(),
672            read_filter_block_nanos: register_int_counter_vec_with_registry!(
673                "read_filter_block_nanos",
674                "Time spent on reading filter block from block cache or SST file",
675                &["cf_name"],
676                registry,
677            )
678            .unwrap(),
679            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
680                "new_table_block_iter_nanos",
681                "Time spent on creating data block iterator",
682                &["cf_name"],
683                registry,
684            )
685            .unwrap(),
686            block_seek_nanos: register_int_counter_vec_with_registry!(
687                "block_seek_nanos",
688                "Time spent on seeking a key in data/index blocks",
689                &["cf_name"],
690                registry,
691            )
692            .unwrap(),
693            find_table_nanos: register_int_counter_vec_with_registry!(
694                "find_table_nanos",
695                "Time spent on finding or creating a table reader",
696                &["cf_name"],
697                registry,
698            )
699            .unwrap(),
700            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
701                "bloom_memtable_hit_count",
702                "Total number of mem table bloom hits",
703                &["cf_name"],
704                registry,
705            )
706            .unwrap(),
707            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
708                "bloom_memtable_miss_count",
709                "Total number of mem table bloom misses",
710                &["cf_name"],
711                registry,
712            )
713            .unwrap(),
714            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
715                "bloom_sst_hit_count",
716                "Total number of SST table bloom hits",
717                &["cf_name"],
718                registry,
719            )
720            .unwrap(),
721            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
722                "bloom_sst_miss_count",
723                "Total number of SST table bloom misses",
724                &["cf_name"],
725                registry,
726            )
727            .unwrap(),
728            key_lock_wait_time: register_int_counter_vec_with_registry!(
729                "key_lock_wait_time",
730                "Time spent waiting on key locks in transaction lock manager",
731                &["cf_name"],
732                registry,
733            )
734            .unwrap(),
735            key_lock_wait_count: register_int_counter_vec_with_registry!(
736                "key_lock_wait_count",
737                "Number of times acquiring a lock was blocked by another transaction",
738                &["cf_name"],
739                registry,
740            )
741            .unwrap(),
742            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
743                "internal_delete_skipped_count",
744                "Total number of deleted keys skipped during iteration",
745                &["cf_name"],
746                registry,
747            )
748                .unwrap(),
749            internal_skipped_count: register_int_counter_vec_with_registry!(
750                "internal_skipped_count",
751                "Totall number of internal keys skipped during iteration",
752                &["cf_name"],
753                registry,
754            )
755                .unwrap(),
756        }
757    }
758
759    pub fn report_metrics(&self, cf_name: &str) {
760        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
761            set_perf_stats(PerfStatsLevel::Disable);
762            let perf_context = perf_context_cell.borrow();
763            self.user_key_comparison_count
764                .with_label_values(&[cf_name])
765                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
766            self.block_cache_hit_count
767                .with_label_values(&[cf_name])
768                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
769            self.block_read_count
770                .with_label_values(&[cf_name])
771                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
772            self.block_read_byte
773                .with_label_values(&[cf_name])
774                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
775            self.block_read_nanos
776                .with_label_values(&[cf_name])
777                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
778            self.block_read_count
779                .with_label_values(&[cf_name])
780                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
781            self.block_checksum_nanos
782                .with_label_values(&[cf_name])
783                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
784            self.block_decompress_nanos
785                .with_label_values(&[cf_name])
786                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
787            self.get_read_bytes
788                .with_label_values(&[cf_name])
789                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
790            self.multiget_read_bytes
791                .with_label_values(&[cf_name])
792                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
793            self.get_snapshot_nanos
794                .with_label_values(&[cf_name])
795                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
796            self.get_from_memtable_nanos
797                .with_label_values(&[cf_name])
798                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
799            self.get_from_memtable_count
800                .with_label_values(&[cf_name])
801                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
802            self.get_post_process_nanos
803                .with_label_values(&[cf_name])
804                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
805            self.get_from_output_files_nanos
806                .with_label_values(&[cf_name])
807                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
808            self.db_mutex_lock_nanos
809                .with_label_values(&[cf_name])
810                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
811            self.db_condition_wait_nanos
812                .with_label_values(&[cf_name])
813                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
814            self.merge_operator_nanos
815                .with_label_values(&[cf_name])
816                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
817            self.read_index_block_nanos
818                .with_label_values(&[cf_name])
819                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
820            self.read_filter_block_nanos
821                .with_label_values(&[cf_name])
822                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
823            self.new_table_block_iter_nanos
824                .with_label_values(&[cf_name])
825                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
826            self.block_seek_nanos
827                .with_label_values(&[cf_name])
828                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
829            self.find_table_nanos
830                .with_label_values(&[cf_name])
831                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
832            self.bloom_memtable_hit_count
833                .with_label_values(&[cf_name])
834                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
835            self.bloom_memtable_miss_count
836                .with_label_values(&[cf_name])
837                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
838            self.bloom_sst_hit_count
839                .with_label_values(&[cf_name])
840                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
841            self.bloom_sst_miss_count
842                .with_label_values(&[cf_name])
843                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
844            self.key_lock_wait_time
845                .with_label_values(&[cf_name])
846                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
847            self.key_lock_wait_count
848                .with_label_values(&[cf_name])
849                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
850            self.internal_delete_skipped_count
851                .with_label_values(&[cf_name])
852                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
853            self.internal_skipped_count
854                .with_label_values(&[cf_name])
855                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
856        });
857    }
858}
859
860#[derive(Debug)]
861pub struct WritePerfContextMetrics {
862    pub write_wal_nanos: IntCounterVec,
863    pub write_memtable_nanos: IntCounterVec,
864    pub write_delay_nanos: IntCounterVec,
865    pub write_pre_and_post_process_nanos: IntCounterVec,
866    pub write_db_mutex_lock_nanos: IntCounterVec,
867    pub write_db_condition_wait_nanos: IntCounterVec,
868    pub write_key_lock_wait_nanos: IntCounterVec,
869    pub write_key_lock_wait_count: IntCounterVec,
870}
871
872impl WritePerfContextMetrics {
873    pub(crate) fn new(registry: &Registry) -> Self {
874        WritePerfContextMetrics {
875            write_wal_nanos: register_int_counter_vec_with_registry!(
876                "write_wal_nanos",
877                "Total nanos spent on writing to WAL",
878                &["cf_name"],
879                registry,
880            )
881            .unwrap(),
882            write_memtable_nanos: register_int_counter_vec_with_registry!(
883                "write_memtable_nanos",
884                "Total nanos spent on writing to memtable",
885                &["cf_name"],
886                registry,
887            )
888            .unwrap(),
889            write_delay_nanos: register_int_counter_vec_with_registry!(
890                "write_delay_nanos",
891                "Total nanos spent on delaying or throttling write",
892                &["cf_name"],
893                registry,
894            )
895            .unwrap(),
896            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
897                "write_pre_and_post_process_nanos",
898                "Total nanos spent on writing a record, excluding the above four things",
899                &["cf_name"],
900                registry,
901            )
902            .unwrap(),
903            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
904                "write_db_mutex_lock_nanos",
905                "Time spent on acquiring db mutex",
906                &["cf_name"],
907                registry,
908            )
909            .unwrap(),
910            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
911                "write_db_condition_wait_nanos",
912                "Time spent waiting with a condition variable created with DB Mutex.",
913                &["cf_name"],
914                registry,
915            )
916            .unwrap(),
917            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
918                "write_key_lock_wait_time",
919                "Time spent waiting on key locks in transaction lock manager",
920                &["cf_name"],
921                registry,
922            )
923            .unwrap(),
924            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
925                "write_key_lock_wait_count",
926                "Number of times acquiring a lock was blocked by another transaction",
927                &["cf_name"],
928                registry,
929            )
930            .unwrap(),
931        }
932    }
933    pub fn report_metrics(&self, db_name: &str) {
934        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
935            set_perf_stats(PerfStatsLevel::Disable);
936            let perf_context = perf_context_cell.borrow();
937            self.write_wal_nanos
938                .with_label_values(&[db_name])
939                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
940            self.write_memtable_nanos
941                .with_label_values(&[db_name])
942                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
943            self.write_delay_nanos
944                .with_label_values(&[db_name])
945                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
946            self.write_pre_and_post_process_nanos
947                .with_label_values(&[db_name])
948                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
949            self.write_db_mutex_lock_nanos
950                .with_label_values(&[db_name])
951                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
952            self.write_db_condition_wait_nanos
953                .with_label_values(&[db_name])
954                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
955            self.write_key_lock_wait_nanos
956                .with_label_values(&[db_name])
957                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
958            self.write_key_lock_wait_count
959                .with_label_values(&[db_name])
960                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
961        });
962    }
963}
964
965#[derive(Debug)]
966pub struct DBMetrics {
967    pub op_metrics: OperationMetrics,
968    pub cf_metrics: ColumnFamilyMetrics,
969    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
970    pub write_perf_ctx_metrics: WritePerfContextMetrics,
971}
972
973static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
974
975impl DBMetrics {
976    fn new(registry: &Registry) -> Self {
977        DBMetrics {
978            op_metrics: OperationMetrics::new(registry),
979            cf_metrics: ColumnFamilyMetrics::new(registry),
980            read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
981            write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
982        }
983    }
984    pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
985        // Initialize this before creating any instance of DBMap
986        // TODO: Remove static initialization because this basically means we can
987        // only ever initialize db metrics once with a registry whereas
988        // in the code we might want to initialize it with different
989        // registries. The problem is underlying metrics cannot be re-initialized
990        // or prometheus complains. We essentially need to pass in DBMetrics
991        // everywhere we create DBMap as the right fix
992        let _ = ONCE
993            .set(Arc::new(DBMetrics::new(registry)))
994            // this happens many times during tests
995            .tap_err(|_| warn!("DBMetrics registry overwritten"));
996        ONCE.get().unwrap()
997    }
998    pub fn increment_num_active_dbs(&self, db_name: &str) {
999        self.op_metrics
1000            .rocksdb_num_active_db_handles
1001            .with_label_values(&[db_name])
1002            .inc();
1003    }
1004    pub fn decrement_num_active_dbs(&self, db_name: &str) {
1005        self.op_metrics
1006            .rocksdb_num_active_db_handles
1007            .with_label_values(&[db_name])
1008            .dec();
1009    }
1010    pub fn get() -> &'static Arc<DBMetrics> {
1011        ONCE.get()
1012            .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
1013    }
1014}