typed_store/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::{
6    cell::RefCell,
7    sync::{
8        Arc,
9        atomic::{AtomicU64, Ordering},
10    },
11    time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16    HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17    register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24    static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext>  = RefCell::new(PerfContext::default());
25}
26
/// Histogram bucket upper bounds, expressed in seconds, used for the latency histograms.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    // 10 µs and 50 µs
    0.00001, 0.00005,
    // 100 µs through 500 µs
    0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
    // 1 ms through 5 ms
    0.001, 0.002, 0.003, 0.004, 0.005,
    // 10 ms through 10 s
    0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
];
33
/// A struct for sampling based on number of operations or duration.
///
/// When `once_every_duration` is non-zero, a background task periodically resets `counter`
/// (only if it has grown past `after_num_ops`), and the first `sample()` call after such a
/// reset reports `true`. When `once_every_duration` is zero, sampling is driven by the
/// operation count alone.
#[derive(Debug, Clone)]
pub struct SamplingInterval {
    /// Sample once every time duration. A zero duration selects count-based sampling.
    pub once_every_duration: Duration,
    /// Sample once every number of operations. In time-based mode this is the value the
    /// counter must exceed before the periodic reset happens.
    pub after_num_ops: u64,
    /// Counter for keeping track of previous sample. It lives behind an `Arc`, so clones of
    /// this struct share the same counter.
    pub counter: Arc<AtomicU64>,
}
45
46impl Default for SamplingInterval {
47    fn default() -> Self {
48        // Enabled with 60 second interval
49        SamplingInterval::new(Duration::from_secs(60), 0)
50    }
51}
52
53impl SamplingInterval {
54    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
55        let counter = Arc::new(AtomicU64::new(1));
56        if !once_every_duration.is_zero() {
57            let counter = counter.clone();
58            tokio::task::spawn(async move {
59                loop {
60                    if counter.load(Ordering::SeqCst) > after_num_ops {
61                        counter.store(0, Ordering::SeqCst);
62                    }
63                    tokio::time::sleep(once_every_duration).await;
64                }
65            });
66        }
67        SamplingInterval {
68            once_every_duration,
69            after_num_ops,
70            counter,
71        }
72    }
73    pub fn new_from_self(&self) -> SamplingInterval {
74        SamplingInterval::new(self.once_every_duration, self.after_num_ops)
75    }
76    pub fn sample(&self) -> bool {
77        if self.once_every_duration.is_zero() {
78            self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0
79        } else {
80            self.counter.fetch_add(1, Ordering::Relaxed) == 0
81        }
82    }
83}
84
/// Gauges describing the state of a RocksDB column family.
///
/// Every gauge is registered under its field name with a single `cf_name` label.
#[derive(Debug)]
pub struct ColumnFamilyMetrics {
    /// Storage size occupied by the SST files in the column family.
    pub rocksdb_total_sst_files_size: IntGaugeVec,
    /// Storage size occupied by the blob files in the column family.
    pub rocksdb_total_blob_files_size: IntGaugeVec,
    /// Total number of files used in the column family.
    pub rocksdb_total_num_files: IntGaugeVec,
    /// Number of level 0 files in the column family.
    pub rocksdb_num_level0_files: IntGaugeVec,
    /// Current approximate size of the active memtable, in bytes.
    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
    /// Memory size occupied by the column family's in-memory buffer.
    pub rocksdb_size_all_mem_tables: IntGaugeVec,
    /// Number of snapshots held for the column family.
    pub rocksdb_num_snapshots: IntGaugeVec,
    /// Timestamp of the oldest unreleased snapshot.
    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
    /// Current actual delayed write rate; 0 means no delay.
    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
    /// Flag: 1 when writes are stopped on the column family.
    pub rocksdb_is_write_stopped: IntGaugeVec,
    /// Block cache capacity of the column family.
    pub rocksdb_block_cache_capacity: IntGaugeVec,
    /// Memory used by the column family in the block cache.
    pub rocksdb_block_cache_usage: IntGaugeVec,
    /// Memory used by the column family in the block cache for pinned entries.
    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
    /// Estimated memory used for reading SST tables (filters, index blocks), excluding the
    /// block cache.
    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
    /// Number of immutable memtables that have not yet been flushed.
    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
    /// Flag: 1 when a memtable flush is pending.
    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
    /// Flag: 1 when a compaction job is pending.
    pub rocksdb_compaction_pending: IntGaugeVec,
    /// Estimated number of bytes compaction needs to rewrite to bring all levels under their
    /// target size (level-based compaction only).
    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
    /// Number of compactions currently running for the column family.
    pub rocksdb_num_running_compactions: IntGaugeVec,
    /// Number of flushes currently running for the column family.
    pub rocksdb_num_running_flushes: IntGaugeVec,
    /// Estimated timestamp of the oldest key in the DB (FIFO compaction only).
    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
    /// Accumulated number of RocksDB background errors.
    pub rocksdb_background_errors: IntGaugeVec,
    /// Estimated number of keys in the table.
    pub rocksdb_estimated_num_keys: IntGaugeVec,
    /// Level to which L0 data will be compacted.
    pub rocksdb_base_level: IntGaugeVec,
}
112
113impl ColumnFamilyMetrics {
114    pub(crate) fn new(registry: &Registry) -> Self {
115        ColumnFamilyMetrics {
116            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
117                "rocksdb_total_sst_files_size",
118                "The storage size occupied by the sst files in the column family",
119                &["cf_name"],
120                registry,
121            )
122            .unwrap(),
123            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
124                "rocksdb_total_blob_files_size",
125                "The storage size occupied by the blob files in the column family",
126                &["cf_name"],
127                registry,
128            )
129            .unwrap(),
130            rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
131                "rocksdb_total_num_files",
132                "Total number of files used in the column family",
133                &["cf_name"],
134                registry,
135            )
136            .unwrap(),
137            rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
138                "rocksdb_num_level0_files",
139                "Number of level 0 files in the column family",
140                &["cf_name"],
141                registry,
142            )
143            .unwrap(),
144            rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
145                "rocksdb_current_size_active_mem_tables",
146                "The current approximate size of active memtable (bytes).",
147                &["cf_name"],
148                registry,
149            )
150            .unwrap(),
151            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
152                "rocksdb_size_all_mem_tables",
153                "The memory size occupied by the column family's in-memory buffer",
154                &["cf_name"],
155                registry,
156            )
157            .unwrap(),
158            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
159                "rocksdb_num_snapshots",
160                "Number of snapshots held for the column family",
161                &["cf_name"],
162                registry,
163            )
164            .unwrap(),
165            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
166                "rocksdb_oldest_snapshot_time",
167                "Unit timestamp of the oldest unreleased snapshot",
168                &["cf_name"],
169                registry,
170            )
171            .unwrap(),
172            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
173                "rocksdb_actual_delayed_write_rate",
174                "The current actual delayed write rate. 0 means no delay",
175                &["cf_name"],
176                registry,
177            )
178            .unwrap(),
179            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
180                "rocksdb_is_write_stopped",
181                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
182                &["cf_name"],
183                registry,
184            )
185            .unwrap(),
186            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
187                "rocksdb_block_cache_capacity",
188                "The block cache capacity of the column family.",
189                &["cf_name"],
190                registry,
191            )
192            .unwrap(),
193            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
194                "rocksdb_block_cache_usage",
195                "The memory size used by the column family in the block cache.",
196                &["cf_name"],
197                registry,
198            )
199            .unwrap(),
200            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
201                "rocksdb_block_cache_pinned_usage",
202                "The memory size used by the column family in the block cache where entries are pinned",
203                &["cf_name"],
204                registry,
205            )
206            .unwrap(),
207            rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
208                "rocksdb_estimate_table_readers_mem",
209                "The estimated memory size used for reading SST tables in this column
210                family such as filters and index blocks. Note that this number does not
211                include the memory used in block cache.",
212                &["cf_name"],
213                registry,
214            )
215            .unwrap(),
216            rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
217                "rocksdb_num_immutable_mem_tables",
218                "The number of immutable memtables that have not yet been flushed.",
219                &["cf_name"],
220                registry,
221            )
222            .unwrap(),
223            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
224                "rocksdb_mem_table_flush_pending",
225                "A 1 or 0 flag indicating whether a memtable flush is pending.
226                If this number is 1, it means a memtable is waiting for being flushed,
227                but there might be too many L0 files that prevents it from being flushed.",
228                &["cf_name"],
229                registry,
230            )
231            .unwrap(),
232            rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
233                "rocksdb_compaction_pending",
234                "A 1 or 0 flag indicating whether a compaction job is pending.
235                If this number is 1, it means some part of the column family requires
236                compaction in order to maintain shape of LSM tree, but the compaction
237                is pending because the desired compaction job is either waiting for
238                other dependent compactions to be finished or waiting for an available
239                compaction thread.",
240                &["cf_name"],
241                registry,
242            )
243            .unwrap(),
244            rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
245                "rocksdb_estimate_pending_compaction_bytes",
246                "Estimated total number of bytes compaction needs to rewrite to get all levels down
247                to under target size. Not valid for other compactions than level-based.",
248                &["cf_name"],
249                registry,
250            )
251            .unwrap(),
252            rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
253                "rocksdb_num_running_compactions",
254                "The number of compactions that are currently running for the column family.",
255                &["cf_name"],
256                registry,
257            )
258            .unwrap(),
259            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
260                "rocksdb_num_running_flushes",
261                "The number of flushes that are currently running for the column family.",
262                &["cf_name"],
263                registry,
264            )
265            .unwrap(),
266            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
267                "rocksdb_estimate_oldest_key_time",
268                "Estimation of the oldest key timestamp in the DB. Only available
269                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
270                &["cf_name"],
271                registry,
272            )
273            .unwrap(),
274            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
275                "rocksdb_estimated_num_keys",
276                "The estimated number of keys in the table",
277                &["cf_name"],
278                registry,
279            )
280            .unwrap(),
281            rocksdb_background_errors: register_int_gauge_vec_with_registry!(
282                "rocksdb_background_errors",
283                "The accumulated number of RocksDB background errors.",
284                &["cf_name"],
285                registry,
286            )
287            .unwrap(),
288            rocksdb_base_level: register_int_gauge_vec_with_registry!(
289                "rocksdb_base_level",
290                "The number of level to which L0 data will be compacted.",
291                &["cf_name"],
292                registry,
293            )
294            .unwrap(),
295        }
296    }
297}
298
/// Metrics recorded for individual RocksDB operations.
///
/// Per-operation metrics are labelled with `cf_name`; batch-commit, db-handle and slow
/// batch-write metrics are labelled with `db_name`. A few metrics are registered under a name
/// that differs from the field name; those are called out on the field.
#[derive(Debug)]
pub struct OperationMetrics {
    /// Iter latency in seconds (`cf_name`).
    pub rocksdb_iter_latency_seconds: HistogramVec,
    /// Iter size in bytes (`cf_name`).
    pub rocksdb_iter_bytes: HistogramVec,
    /// Number of keys per iter (`cf_name`).
    pub rocksdb_iter_keys: HistogramVec,
    /// Get latency in seconds (`cf_name`).
    pub rocksdb_get_latency_seconds: HistogramVec,
    /// Size in bytes of the data returned by a get call (`cf_name`).
    pub rocksdb_get_bytes: HistogramVec,
    /// Multiget latency in seconds (`cf_name`).
    pub rocksdb_multiget_latency_seconds: HistogramVec,
    /// Size in bytes of the data returned by a multiget call (`cf_name`).
    pub rocksdb_multiget_bytes: HistogramVec,
    /// Put latency in seconds (`cf_name`).
    pub rocksdb_put_latency_seconds: HistogramVec,
    /// Size in bytes of the data written by a put call (`cf_name`).
    pub rocksdb_put_bytes: HistogramVec,
    /// Size in bytes of the data written by a batch put call (`cf_name`).
    pub rocksdb_batch_put_bytes: HistogramVec,
    /// Delete latency in seconds (`cf_name`).
    pub rocksdb_delete_latency_seconds: HistogramVec,
    /// Number of delete calls (`cf_name`).
    pub rocksdb_deletes: IntCounterVec,
    /// Batch commit latency in seconds (`db_name`). Registered as
    /// `rocksdb_write_batch_commit_latency_seconds`.
    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
    /// Batch commit size in bytes (`db_name`).
    pub rocksdb_batch_commit_bytes: HistogramVec,
    /// Number of active db handles (`db_name`).
    pub rocksdb_num_active_db_handles: IntGaugeVec,
    /// Number of batch writes that took more than 1 second (`db_name`). Registered as
    /// `rocksdb_num_very_slow_batch_writes`.
    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
    /// Total duration of batch writes that took more than 1 second (`db_name`). Registered as
    /// `rocksdb_very_slow_batch_writes_duration`.
    // NOTE(review): the field name suggests milliseconds, but neither the registered name nor
    // the help text states a unit; confirm against the call sites.
    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
    /// Number of puts that took more than 1 second (`cf_name`). Registered as
    /// `rocksdb_num_very_slow_puts`.
    pub rocksdb_very_slow_puts_count: IntCounterVec,
    /// Total duration of puts that took more than 1 second (`cf_name`). Registered as
    /// `rocksdb_very_slow_puts_duration`.
    // NOTE(review): unit presumably milliseconds per the field name; confirm against callers.
    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
}
321
322impl OperationMetrics {
323    pub(crate) fn new(registry: &Registry) -> Self {
324        OperationMetrics {
325            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
326                "rocksdb_iter_latency_seconds",
327                "Rocksdb iter latency in seconds",
328                &["cf_name"],
329                LATENCY_SEC_BUCKETS.to_vec(),
330                registry,
331            )
332            .unwrap(),
333            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
334                "rocksdb_iter_bytes",
335                "Rocksdb iter size in bytes",
336                &["cf_name"],
337                prometheus::exponential_buckets(1.0, 4.0, 15)
338                    .unwrap()
339                    .to_vec(),
340                registry,
341            )
342            .unwrap(),
343            rocksdb_iter_keys: register_histogram_vec_with_registry!(
344                "rocksdb_iter_keys",
345                "Rocksdb iter num keys",
346                &["cf_name"],
347                registry,
348            )
349            .unwrap(),
350            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
351                "rocksdb_get_latency_seconds",
352                "Rocksdb get latency in seconds",
353                &["cf_name"],
354                LATENCY_SEC_BUCKETS.to_vec(),
355                registry,
356            )
357            .unwrap(),
358            rocksdb_get_bytes: register_histogram_vec_with_registry!(
359                "rocksdb_get_bytes",
360                "Rocksdb get call returned data size in bytes",
361                &["cf_name"],
362                prometheus::exponential_buckets(1.0, 4.0, 15)
363                    .unwrap()
364                    .to_vec(),
365                registry
366            )
367            .unwrap(),
368            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
369                "rocksdb_multiget_latency_seconds",
370                "Rocksdb multiget latency in seconds",
371                &["cf_name"],
372                LATENCY_SEC_BUCKETS.to_vec(),
373                registry,
374            )
375            .unwrap(),
376            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
377                "rocksdb_multiget_bytes",
378                "Rocksdb multiget call returned data size in bytes",
379                &["cf_name"],
380                prometheus::exponential_buckets(1.0, 4.0, 15)
381                    .unwrap()
382                    .to_vec(),
383                registry,
384            )
385            .unwrap(),
386            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
387                "rocksdb_put_latency_seconds",
388                "Rocksdb put latency in seconds",
389                &["cf_name"],
390                LATENCY_SEC_BUCKETS.to_vec(),
391                registry,
392            )
393            .unwrap(),
394            rocksdb_put_bytes: register_histogram_vec_with_registry!(
395                "rocksdb_put_bytes",
396                "Rocksdb put call puts data size in bytes",
397                &["cf_name"],
398                prometheus::exponential_buckets(1.0, 4.0, 15)
399                    .unwrap()
400                    .to_vec(),
401                registry,
402            )
403            .unwrap(),
404            rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
405                "rocksdb_batch_put_bytes",
406                "Rocksdb batch put call puts data size in bytes",
407                &["cf_name"],
408                prometheus::exponential_buckets(1.0, 4.0, 15)
409                    .unwrap()
410                    .to_vec(),
411                registry,
412            )
413            .unwrap(),
414            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
415                "rocksdb_delete_latency_seconds",
416                "Rocksdb delete latency in seconds",
417                &["cf_name"],
418                LATENCY_SEC_BUCKETS.to_vec(),
419                registry,
420            )
421            .unwrap(),
422            rocksdb_deletes: register_int_counter_vec_with_registry!(
423                "rocksdb_deletes",
424                "Rocksdb delete calls",
425                &["cf_name"],
426                registry
427            )
428            .unwrap(),
429            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
430                "rocksdb_write_batch_commit_latency_seconds",
431                "Rocksdb schema batch commit latency in seconds",
432                &["db_name"],
433                LATENCY_SEC_BUCKETS.to_vec(),
434                registry,
435            )
436            .unwrap(),
437            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
438                "rocksdb_batch_commit_bytes",
439                "Rocksdb schema batch commit size in bytes",
440                &["db_name"],
441                prometheus::exponential_buckets(1.0, 4.0, 15)
442                    .unwrap()
443                    .to_vec(),
444                registry,
445            )
446            .unwrap(),
447            rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
448                "rocksdb_num_active_db_handles",
449                "Number of active db handles",
450                &["db_name"],
451                registry,
452            )
453            .unwrap(),
454            rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
455                "rocksdb_num_very_slow_batch_writes",
456                "Number of batch writes that took more than 1 second",
457                &["db_name"],
458                registry,
459            )
460            .unwrap(),
461            rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
462                "rocksdb_very_slow_batch_writes_duration",
463                "Total duration of batch writes that took more than 1 second",
464                &["db_name"],
465                registry,
466            )
467            .unwrap(),
468            rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
469                "rocksdb_num_very_slow_puts",
470                "Number of puts that took more than 1 second",
471                &["cf_name"],
472                registry,
473            )
474            .unwrap(),
475            rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
476                "rocksdb_very_slow_puts_duration",
477                "Total duration of puts that took more than 1 second",
478                &["cf_name"],
479                registry,
480            )
481            .unwrap(),
482        }
483    }
484}
485
/// Guard type for a RocksDB perf-stats measurement.
///
/// Creating it through `Default` sets the perf-stats level to `EnableTime` and resets this
/// thread's `PER_THREAD_ROCKS_PERF_CONTEXT`; dropping it sets the level to `Disable`.
pub struct RocksDBPerfContext;
487
488impl Default for RocksDBPerfContext {
489    fn default() -> Self {
490        set_perf_stats(PerfStatsLevel::EnableTime);
491        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
492            perf_context.borrow_mut().reset();
493        });
494        RocksDBPerfContext {}
495    }
496}
497
impl Drop for RocksDBPerfContext {
    /// Switches perf-stats collection back off when the guard goes out of scope.
    fn drop(&mut self) {
        set_perf_stats(PerfStatsLevel::Disable);
    }
}
503
/// Counters fed from the RocksDB perf context for read operations.
///
/// Every counter is registered under its field name with a single `cf_name` label.
#[derive(Debug)]
pub struct ReadPerfContextMetrics {
    /// Number of user key comparisons.
    pub user_key_comparison_count: IntCounterVec,
    /// Number of times data blocks were read from the block cache.
    pub block_cache_hit_count: IntCounterVec,
    /// Number of times blocks had to be read from the file system.
    pub block_read_count: IntCounterVec,
    /// Total bytes read from the file system.
    pub block_read_byte: IntCounterVec,
    /// Total nanos spent on block reads.
    pub block_read_nanos: IntCounterVec,
    /// Total nanos spent on verifying block checksums.
    pub block_checksum_nanos: IntCounterVec,
    /// Total nanos spent on decompressing blocks.
    pub block_decompress_nanos: IntCounterVec,
    /// Total bytes for values returned by Get.
    pub get_read_bytes: IntCounterVec,
    /// Total bytes for values returned by MultiGet.
    pub multiget_read_bytes: IntCounterVec,
    /// Time spent in getting a snapshot.
    pub get_snapshot_nanos: IntCounterVec,
    /// Time spent on reading data from the memtable.
    pub get_from_memtable_nanos: IntCounterVec,
    /// Number of memtables queried.
    pub get_from_memtable_count: IntCounterVec,
    /// Total nanos spent after Get() finds a key.
    pub get_post_process_nanos: IntCounterVec,
    /// Total nanos spent reading from output files.
    pub get_from_output_files_nanos: IntCounterVec,
    /// Time spent on acquiring the db mutex.
    pub db_mutex_lock_nanos: IntCounterVec,
    /// Time spent waiting on a condition variable created with the DB mutex.
    pub db_condition_wait_nanos: IntCounterVec,
    /// Time spent in the merge operator.
    pub merge_operator_nanos: IntCounterVec,
    /// Time spent on reading the index block from block cache or SST file.
    pub read_index_block_nanos: IntCounterVec,
    /// Time spent on reading the filter block from block cache or SST file.
    pub read_filter_block_nanos: IntCounterVec,
    /// Time spent on creating a data block iterator.
    pub new_table_block_iter_nanos: IntCounterVec,
    /// Time spent on seeking a key in data/index blocks.
    pub block_seek_nanos: IntCounterVec,
    /// Time spent on finding or creating a table reader.
    pub find_table_nanos: IntCounterVec,
    /// Total number of memtable bloom hits.
    pub bloom_memtable_hit_count: IntCounterVec,
    /// Total number of memtable bloom misses.
    pub bloom_memtable_miss_count: IntCounterVec,
    /// Total number of SST table bloom hits.
    pub bloom_sst_hit_count: IntCounterVec,
    /// Total number of SST table bloom misses.
    pub bloom_sst_miss_count: IntCounterVec,
    /// Time spent waiting on key locks in the transaction lock manager.
    pub key_lock_wait_time: IntCounterVec,
    /// Number of times acquiring a lock was blocked by another transaction.
    pub key_lock_wait_count: IntCounterVec,
    /// Total number of deleted keys skipped during iteration.
    pub internal_delete_skipped_count: IntCounterVec,
    /// Total number of internal keys skipped during iteration.
    pub internal_skipped_count: IntCounterVec,
}
537
538impl ReadPerfContextMetrics {
539    pub(crate) fn new(registry: &Registry) -> Self {
540        ReadPerfContextMetrics {
541            user_key_comparison_count: register_int_counter_vec_with_registry!(
542                "user_key_comparison_count",
543                "Helps us figure out whether too many comparisons in binary search can be a problem,
544                especially when a more expensive comparator is used. Moreover, since number of comparisons
545                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
546                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
547                You may want to check whether flush/compaction can keep up with the write speed",
548                &["cf_name"],
549                registry,
550            )
551            .unwrap(),
552            block_cache_hit_count: register_int_counter_vec_with_registry!(
553                "block_cache_hit_count",
554                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
555                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
556                We can evaluate the block cache efficiency by looking at the two counters over time.",
557                &["cf_name"],
558                registry,
559            )
560            .unwrap(),
561            block_read_count: register_int_counter_vec_with_registry!(
562                "block_read_count",
563                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
564                &["cf_name"],
565                registry,
566            )
567            .unwrap(),
568            block_read_byte: register_int_counter_vec_with_registry!(
569                "block_read_byte",
570                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
571                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
572                of a very large key or value",
573                &["cf_name"],
574                registry,
575            )
576            .unwrap(),
577            block_read_nanos: register_int_counter_vec_with_registry!(
578                "block_read_nanos",
579                "Total nanos spent on block reads",
580                &["cf_name"],
581                registry,
582            )
583            .unwrap(),
584            block_checksum_nanos: register_int_counter_vec_with_registry!(
585                "block_checksum_nanos",
586                "Total nanos spent on verifying block checksum",
587                &["cf_name"],
588                registry,
589            )
590            .unwrap(),
591            block_decompress_nanos: register_int_counter_vec_with_registry!(
592                "block_decompress_nanos",
593                "Total nanos spent on decompressing a block",
594                &["cf_name"],
595                registry,
596            )
597            .unwrap(),
598            get_read_bytes: register_int_counter_vec_with_registry!(
599                "get_read_bytes",
600                "Total bytes for values returned by Get",
601                &["cf_name"],
602                registry,
603            )
604            .unwrap(),
605            multiget_read_bytes: register_int_counter_vec_with_registry!(
606                "multiget_read_bytes",
607                "Total bytes for values returned by MultiGet.",
608                &["cf_name"],
609                registry,
610            )
611            .unwrap(),
612            get_snapshot_nanos: register_int_counter_vec_with_registry!(
613                "get_snapshot_nanos",
614                "Time spent in getting snapshot.",
615                &["cf_name"],
616                registry,
617            )
618            .unwrap(),
619            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
620                "get_from_memtable_nanos",
621                "Time spent on reading data from memtable.",
622                &["cf_name"],
623                registry,
624            )
625            .unwrap(),
626            get_from_memtable_count: register_int_counter_vec_with_registry!(
627                "get_from_memtable_count",
628                "Number of memtables queried",
629                &["cf_name"],
630                registry,
631            )
632            .unwrap(),
633            get_post_process_nanos: register_int_counter_vec_with_registry!(
634                "get_post_process_nanos",
635                "Total nanos spent after Get() finds a key",
636                &["cf_name"],
637                registry,
638            )
639            .unwrap(),
640            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
641                "get_from_output_files_nanos",
642                "Total nanos reading from output files",
643                &["cf_name"],
644                registry,
645            )
646            .unwrap(),
647            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
648                "db_mutex_lock_nanos",
649                "Time spent on acquiring db mutex",
650                &["cf_name"],
651                registry,
652            )
653            .unwrap(),
654            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
655                "db_condition_wait_nanos",
656                "Time spent waiting with a condition variable created with DB Mutex.",
657                &["cf_name"],
658                registry,
659            )
660            .unwrap(),
661            merge_operator_nanos: register_int_counter_vec_with_registry!(
662                "merge_operator_nanos",
663                "Time spent on merge operator.",
664                &["cf_name"],
665                registry,
666            )
667            .unwrap(),
668            read_index_block_nanos: register_int_counter_vec_with_registry!(
669                "read_index_block_nanos",
670                "Time spent on reading index block from block cache or SST file",
671                &["cf_name"],
672                registry,
673            )
674            .unwrap(),
675            read_filter_block_nanos: register_int_counter_vec_with_registry!(
676                "read_filter_block_nanos",
677                "Time spent on reading filter block from block cache or SST file",
678                &["cf_name"],
679                registry,
680            )
681            .unwrap(),
682            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
683                "new_table_block_iter_nanos",
684                "Time spent on creating data block iterator",
685                &["cf_name"],
686                registry,
687            )
688            .unwrap(),
689            block_seek_nanos: register_int_counter_vec_with_registry!(
690                "block_seek_nanos",
691                "Time spent on seeking a key in data/index blocks",
692                &["cf_name"],
693                registry,
694            )
695            .unwrap(),
696            find_table_nanos: register_int_counter_vec_with_registry!(
697                "find_table_nanos",
698                "Time spent on finding or creating a table reader",
699                &["cf_name"],
700                registry,
701            )
702            .unwrap(),
703            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
704                "bloom_memtable_hit_count",
705                "Total number of mem table bloom hits",
706                &["cf_name"],
707                registry,
708            )
709            .unwrap(),
710            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
711                "bloom_memtable_miss_count",
712                "Total number of mem table bloom misses",
713                &["cf_name"],
714                registry,
715            )
716            .unwrap(),
717            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
718                "bloom_sst_hit_count",
719                "Total number of SST table bloom hits",
720                &["cf_name"],
721                registry,
722            )
723            .unwrap(),
724            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
725                "bloom_sst_miss_count",
726                "Total number of SST table bloom misses",
727                &["cf_name"],
728                registry,
729            )
730            .unwrap(),
731            key_lock_wait_time: register_int_counter_vec_with_registry!(
732                "key_lock_wait_time",
733                "Time spent waiting on key locks in transaction lock manager",
734                &["cf_name"],
735                registry,
736            )
737            .unwrap(),
738            key_lock_wait_count: register_int_counter_vec_with_registry!(
739                "key_lock_wait_count",
740                "Number of times acquiring a lock was blocked by another transaction",
741                &["cf_name"],
742                registry,
743            )
744            .unwrap(),
745            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
746                "internal_delete_skipped_count",
747                "Total number of deleted keys skipped during iteration",
748                &["cf_name"],
749                registry,
750            )
751                .unwrap(),
752            internal_skipped_count: register_int_counter_vec_with_registry!(
753                "internal_skipped_count",
754                "Totall number of internal keys skipped during iteration",
755                &["cf_name"],
756                registry,
757            )
758                .unwrap(),
759        }
760    }
761
762    pub fn report_metrics(&self, cf_name: &str) {
763        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
764            set_perf_stats(PerfStatsLevel::Disable);
765            let perf_context = perf_context_cell.borrow();
766            self.user_key_comparison_count
767                .with_label_values(&[cf_name])
768                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
769            self.block_cache_hit_count
770                .with_label_values(&[cf_name])
771                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
772            self.block_read_count
773                .with_label_values(&[cf_name])
774                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
775            self.block_read_byte
776                .with_label_values(&[cf_name])
777                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
778            self.block_read_nanos
779                .with_label_values(&[cf_name])
780                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
781            self.block_read_count
782                .with_label_values(&[cf_name])
783                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
784            self.block_checksum_nanos
785                .with_label_values(&[cf_name])
786                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
787            self.block_decompress_nanos
788                .with_label_values(&[cf_name])
789                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
790            self.get_read_bytes
791                .with_label_values(&[cf_name])
792                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
793            self.multiget_read_bytes
794                .with_label_values(&[cf_name])
795                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
796            self.get_snapshot_nanos
797                .with_label_values(&[cf_name])
798                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
799            self.get_from_memtable_nanos
800                .with_label_values(&[cf_name])
801                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
802            self.get_from_memtable_count
803                .with_label_values(&[cf_name])
804                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
805            self.get_post_process_nanos
806                .with_label_values(&[cf_name])
807                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
808            self.get_from_output_files_nanos
809                .with_label_values(&[cf_name])
810                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
811            self.db_mutex_lock_nanos
812                .with_label_values(&[cf_name])
813                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
814            self.db_condition_wait_nanos
815                .with_label_values(&[cf_name])
816                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
817            self.merge_operator_nanos
818                .with_label_values(&[cf_name])
819                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
820            self.read_index_block_nanos
821                .with_label_values(&[cf_name])
822                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
823            self.read_filter_block_nanos
824                .with_label_values(&[cf_name])
825                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
826            self.new_table_block_iter_nanos
827                .with_label_values(&[cf_name])
828                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
829            self.block_seek_nanos
830                .with_label_values(&[cf_name])
831                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
832            self.find_table_nanos
833                .with_label_values(&[cf_name])
834                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
835            self.bloom_memtable_hit_count
836                .with_label_values(&[cf_name])
837                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
838            self.bloom_memtable_miss_count
839                .with_label_values(&[cf_name])
840                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
841            self.bloom_sst_hit_count
842                .with_label_values(&[cf_name])
843                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
844            self.bloom_sst_miss_count
845                .with_label_values(&[cf_name])
846                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
847            self.key_lock_wait_time
848                .with_label_values(&[cf_name])
849                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
850            self.key_lock_wait_count
851                .with_label_values(&[cf_name])
852                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
853            self.internal_delete_skipped_count
854                .with_label_values(&[cf_name])
855                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
856            self.internal_skipped_count
857                .with_label_values(&[cf_name])
858                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
859        });
860    }
861}
862
863#[derive(Debug)]
864pub struct WritePerfContextMetrics {
865    pub write_wal_nanos: IntCounterVec,
866    pub write_memtable_nanos: IntCounterVec,
867    pub write_delay_nanos: IntCounterVec,
868    pub write_pre_and_post_process_nanos: IntCounterVec,
869    pub write_db_mutex_lock_nanos: IntCounterVec,
870    pub write_db_condition_wait_nanos: IntCounterVec,
871    pub write_key_lock_wait_nanos: IntCounterVec,
872    pub write_key_lock_wait_count: IntCounterVec,
873}
874
875impl WritePerfContextMetrics {
876    pub(crate) fn new(registry: &Registry) -> Self {
877        WritePerfContextMetrics {
878            write_wal_nanos: register_int_counter_vec_with_registry!(
879                "write_wal_nanos",
880                "Total nanos spent on writing to WAL",
881                &["cf_name"],
882                registry,
883            )
884            .unwrap(),
885            write_memtable_nanos: register_int_counter_vec_with_registry!(
886                "write_memtable_nanos",
887                "Total nanos spent on writing to memtable",
888                &["cf_name"],
889                registry,
890            )
891            .unwrap(),
892            write_delay_nanos: register_int_counter_vec_with_registry!(
893                "write_delay_nanos",
894                "Total nanos spent on delaying or throttling write",
895                &["cf_name"],
896                registry,
897            )
898            .unwrap(),
899            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
900                "write_pre_and_post_process_nanos",
901                "Total nanos spent on writing a record, excluding the above four things",
902                &["cf_name"],
903                registry,
904            )
905            .unwrap(),
906            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
907                "write_db_mutex_lock_nanos",
908                "Time spent on acquiring db mutex",
909                &["cf_name"],
910                registry,
911            )
912            .unwrap(),
913            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
914                "write_db_condition_wait_nanos",
915                "Time spent waiting with a condition variable created with DB Mutex.",
916                &["cf_name"],
917                registry,
918            )
919            .unwrap(),
920            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
921                "write_key_lock_wait_time",
922                "Time spent waiting on key locks in transaction lock manager",
923                &["cf_name"],
924                registry,
925            )
926            .unwrap(),
927            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
928                "write_key_lock_wait_count",
929                "Number of times acquiring a lock was blocked by another transaction",
930                &["cf_name"],
931                registry,
932            )
933            .unwrap(),
934        }
935    }
936    pub fn report_metrics(&self, db_name: &str) {
937        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
938            set_perf_stats(PerfStatsLevel::Disable);
939            let perf_context = perf_context_cell.borrow();
940            self.write_wal_nanos
941                .with_label_values(&[db_name])
942                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
943            self.write_memtable_nanos
944                .with_label_values(&[db_name])
945                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
946            self.write_delay_nanos
947                .with_label_values(&[db_name])
948                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
949            self.write_pre_and_post_process_nanos
950                .with_label_values(&[db_name])
951                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
952            self.write_db_mutex_lock_nanos
953                .with_label_values(&[db_name])
954                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
955            self.write_db_condition_wait_nanos
956                .with_label_values(&[db_name])
957                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
958            self.write_key_lock_wait_nanos
959                .with_label_values(&[db_name])
960                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
961            self.write_key_lock_wait_count
962                .with_label_values(&[db_name])
963                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
964        });
965    }
966}
967
968#[derive(Debug)]
969pub struct DBMetrics {
970    pub op_metrics: OperationMetrics,
971    pub cf_metrics: ColumnFamilyMetrics,
972    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
973    pub write_perf_ctx_metrics: WritePerfContextMetrics,
974}
975
976static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
977
978impl DBMetrics {
979    fn new(registry: &Registry) -> Self {
980        DBMetrics {
981            op_metrics: OperationMetrics::new(registry),
982            cf_metrics: ColumnFamilyMetrics::new(registry),
983            read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
984            write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
985        }
986    }
987    pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
988        // Initialize this before creating any instance of DBMap
989        // TODO: Remove static initialization because this basically means we can
990        // only ever initialize db metrics once with a registry whereas
991        // in the code we might want to initialize it with different
992        // registries. The problem is underlying metrics cannot be re-initialized
993        // or prometheus complains. We essentially need to pass in DBMetrics
994        // everywhere we create DBMap as the right fix
995        let _ = ONCE
996            .set(Arc::new(DBMetrics::new(registry)))
997            // this happens many times during tests
998            .tap_err(|_| warn!("DBMetrics registry overwritten"));
999        ONCE.get().unwrap()
1000    }
1001    pub fn increment_num_active_dbs(&self, db_name: &str) {
1002        self.op_metrics
1003            .rocksdb_num_active_db_handles
1004            .with_label_values(&[db_name])
1005            .inc();
1006    }
1007    pub fn decrement_num_active_dbs(&self, db_name: &str) {
1008        self.op_metrics
1009            .rocksdb_num_active_db_handles
1010            .with_label_values(&[db_name])
1011            .dec();
1012    }
1013    pub fn get() -> &'static Arc<DBMetrics> {
1014        ONCE.get()
1015            .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
1016    }
1017}