typed_store/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::{
6    cell::RefCell,
7    sync::{
8        Arc,
9        atomic::{AtomicU64, Ordering},
10    },
11    time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16    HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17    register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24    static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext>  = RefCell::new(PerfContext::default());
25}
26
/// Bucket upper bounds, in seconds, shared by the `*_latency_seconds` histograms
/// registered in this file.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    // Sub-second buckets.
    0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5,
    // One second and above.
    1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 90.0,
];
30
#[derive(Debug, Clone)]
/// Decides, per call to `sample`, whether an operation should be sampled.
///
/// Two modes exist, selected by `once_every_duration`:
/// * zero duration: purely count based, `sample` compares the running counter
///   against `after_num_ops`;
/// * non-zero duration: `new` spawns a background task that periodically resets
///   the counter to 0, and `sample` returns `true` only for the call that observes 0.
///
/// Cloning the struct clones the `Arc`, so clones share one counter.
pub struct SamplingInterval {
    /// Period of the background reset task; zero selects count-based sampling.
    pub once_every_duration: Duration,
    /// Count-based mode: `sample` is true when the counter is a multiple of
    /// `after_num_ops + 1`. Time-based mode: the counter must exceed this value
    /// before the background task resets it.
    pub after_num_ops: u64,
    /// Running operation counter (initialised to 1 by `new`), incremented on every
    /// `sample` call.
    pub counter: Arc<AtomicU64>,
}
42
43impl Default for SamplingInterval {
44    fn default() -> Self {
45        // Enabled with 60 second interval
46        SamplingInterval::new(Duration::from_secs(60), 0)
47    }
48}
49
50impl SamplingInterval {
51    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
52        let counter = Arc::new(AtomicU64::new(1));
53        if !once_every_duration.is_zero() {
54            let counter = counter.clone();
55            tokio::task::spawn(async move {
56                loop {
57                    if counter.load(Ordering::SeqCst) > after_num_ops {
58                        counter.store(0, Ordering::SeqCst);
59                    }
60                    tokio::time::sleep(once_every_duration).await;
61                }
62            });
63        }
64        SamplingInterval {
65            once_every_duration,
66            after_num_ops,
67            counter,
68        }
69    }
70    pub fn new_from_self(&self) -> SamplingInterval {
71        SamplingInterval::new(self.once_every_duration, self.after_num_ops)
72    }
73    pub fn sample(&self) -> bool {
74        if self.once_every_duration.is_zero() {
75            self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0
76        } else {
77            self.counter.fetch_add(1, Ordering::Relaxed) == 0
78        }
79    }
80}
81
/// Prometheus gauges describing the state of a RocksDB column family.
///
/// Every gauge is registered by `ColumnFamilyMetrics::new` under a metric name equal
/// to its field name and carries a single `cf_name` label. The code that sets the
/// values is not part of this struct.
#[derive(Debug)]
pub struct ColumnFamilyMetrics {
    /// Storage size occupied by the SST files of the column family.
    pub rocksdb_total_sst_files_size: IntGaugeVec,
    /// Storage size occupied by the blob files of the column family.
    pub rocksdb_total_blob_files_size: IntGaugeVec,
    /// Approximate size of the active memtable, in bytes.
    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
    /// Memory occupied by the column family's in-memory buffer.
    pub rocksdb_size_all_mem_tables: IntGaugeVec,
    /// Number of snapshots held for the column family.
    pub rocksdb_num_snapshots: IntGaugeVec,
    /// Timestamp of the oldest unreleased snapshot.
    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
    /// Current actual delayed write rate; 0 means no delay.
    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
    /// 1 when writes are stopped on the column family.
    pub rocksdb_is_write_stopped: IntGaugeVec,
    /// Block cache capacity of the column family.
    pub rocksdb_block_cache_capacity: IntGaugeVec,
    /// Memory used by the column family in the block cache.
    pub rocksdb_block_cache_usage: IntGaugeVec,
    /// Memory used in the block cache by pinned entries.
    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
    /// Estimated memory used for reading SST tables (filters, index blocks),
    /// excluding block cache memory.
    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
    /// Number of immutable memtables not yet flushed.
    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
    /// 1 or 0 flag: a memtable flush is pending.
    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
    /// 1 or 0 flag: a compaction job is pending.
    pub rocksdb_compaction_pending: IntGaugeVec,
    /// Estimated bytes compaction must rewrite to bring all levels under target size.
    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
    /// Number of compactions currently running for the column family.
    pub rocksdb_num_running_compactions: IntGaugeVec,
    /// Number of flushes currently running for the column family.
    pub rocksdb_num_running_flushes: IntGaugeVec,
    /// Estimate of the oldest key timestamp in the DB.
    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
    /// Accumulated number of RocksDB background errors.
    pub rocksdb_background_errors: IntGaugeVec,
    /// Estimated number of keys in the table.
    pub rocksdb_estimated_num_keys: IntGaugeVec,
    /// Number of the level to which L0 data will be compacted.
    pub rocksdb_base_level: IntGaugeVec,
}
107
108impl ColumnFamilyMetrics {
109    pub(crate) fn new(registry: &Registry) -> Self {
110        ColumnFamilyMetrics {
111            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
112                "rocksdb_total_sst_files_size",
113                "The storage size occupied by the sst files in the column family",
114                &["cf_name"],
115                registry,
116            )
117            .unwrap(),
118            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
119                "rocksdb_total_blob_files_size",
120                "The storage size occupied by the blob files in the column family",
121                &["cf_name"],
122                registry,
123            )
124            .unwrap(),
125            rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
126                "rocksdb_current_size_active_mem_tables",
127                "The current approximate size of active memtable (bytes).",
128                &["cf_name"],
129                registry,
130            )
131            .unwrap(),
132            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
133                "rocksdb_size_all_mem_tables",
134                "The memory size occupied by the column family's in-memory buffer",
135                &["cf_name"],
136                registry,
137            )
138            .unwrap(),
139            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
140                "rocksdb_num_snapshots",
141                "Number of snapshots held for the column family",
142                &["cf_name"],
143                registry,
144            )
145            .unwrap(),
146            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
147                "rocksdb_oldest_snapshot_time",
148                "Unit timestamp of the oldest unreleased snapshot",
149                &["cf_name"],
150                registry,
151            )
152            .unwrap(),
153            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
154                "rocksdb_actual_delayed_write_rate",
155                "The current actual delayed write rate. 0 means no delay",
156                &["cf_name"],
157                registry,
158            )
159            .unwrap(),
160            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
161                "rocksdb_is_write_stopped",
162                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
163                &["cf_name"],
164                registry,
165            )
166            .unwrap(),
167            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
168                "rocksdb_block_cache_capacity",
169                "The block cache capacity of the column family.",
170                &["cf_name"],
171                registry,
172            )
173            .unwrap(),
174            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
175                "rocksdb_block_cache_usage",
176                "The memory size used by the column family in the block cache.",
177                &["cf_name"],
178                registry,
179            )
180            .unwrap(),
181            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
182                "rocksdb_block_cache_pinned_usage",
183                "The memory size used by the column family in the block cache where entries are pinned",
184                &["cf_name"],
185                registry,
186            )
187            .unwrap(),
188            rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
189                "rocksdb_estimate_table_readers_mem",
190                "The estimated memory size used for reading SST tables in this column
191                family such as filters and index blocks. Note that this number does not
192                include the memory used in block cache.",
193                &["cf_name"],
194                registry,
195            )
196            .unwrap(),
197            rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
198                "rocksdb_num_immutable_mem_tables",
199                "The number of immutable memtables that have not yet been flushed.",
200                &["cf_name"],
201                registry,
202            )
203            .unwrap(),
204            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
205                "rocksdb_mem_table_flush_pending",
206                "A 1 or 0 flag indicating whether a memtable flush is pending.
207                If this number is 1, it means a memtable is waiting for being flushed,
208                but there might be too many L0 files that prevents it from being flushed.",
209                &["cf_name"],
210                registry,
211            )
212            .unwrap(),
213            rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
214                "rocksdb_compaction_pending",
215                "A 1 or 0 flag indicating whether a compaction job is pending.
216                If this number is 1, it means some part of the column family requires
217                compaction in order to maintain shape of LSM tree, but the compaction
218                is pending because the desired compaction job is either waiting for
219                other dependent compactions to be finished or waiting for an available
220                compaction thread.",
221                &["cf_name"],
222                registry,
223            )
224            .unwrap(),
225            rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
226                "rocksdb_estimate_pending_compaction_bytes",
227                "Estimated total number of bytes compaction needs to rewrite to get all levels down
228                to under target size. Not valid for other compactions than level-based.",
229                &["cf_name"],
230                registry,
231            )
232            .unwrap(),
233            rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
234                "rocksdb_num_running_compactions",
235                "The number of compactions that are currently running for the column family.",
236                &["cf_name"],
237                registry,
238            )
239            .unwrap(),
240            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
241                "rocksdb_num_running_flushes",
242                "The number of flushes that are currently running for the column family.",
243                &["cf_name"],
244                registry,
245            )
246            .unwrap(),
247            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
248                "rocksdb_estimate_oldest_key_time",
249                "Estimation of the oldest key timestamp in the DB. Only available
250                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
251                &["cf_name"],
252                registry,
253            )
254            .unwrap(),
255            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
256                "rocksdb_estimated_num_keys",
257                "The estimated number of keys in the table",
258                &["cf_name"],
259                registry,
260            )
261            .unwrap(),
262            rocksdb_background_errors: register_int_gauge_vec_with_registry!(
263                "rocksdb_background_errors",
264                "The accumulated number of RocksDB background errors.",
265                &["cf_name"],
266                registry,
267            )
268            .unwrap(),
269            rocksdb_base_level: register_int_gauge_vec_with_registry!(
270                "rocksdb_base_level",
271                "The number of level to which L0 data will be compacted.",
272                &["cf_name"],
273                registry,
274            )
275            .unwrap(),
276        }
277    }
278}
279
/// Prometheus metrics for individual RocksDB operations (iter, get, multiget, put,
/// delete, batch commit).
///
/// Registered by `OperationMetrics::new`. Per-column-family metrics carry a `cf_name`
/// label, per-database metrics a `db_name` label. A few fields are registered under a
/// metric name that differs from the field name; those are called out below.
#[derive(Debug)]
pub struct OperationMetrics {
    /// Iterator latency in seconds (`cf_name`).
    pub rocksdb_iter_latency_seconds: HistogramVec,
    /// Iterator size in bytes (`cf_name`).
    pub rocksdb_iter_bytes: HistogramVec,
    /// Number of keys per iterator (`cf_name`); registered without explicit buckets.
    pub rocksdb_iter_keys: HistogramVec,
    /// Get latency in seconds (`cf_name`).
    pub rocksdb_get_latency_seconds: HistogramVec,
    /// Size in bytes of data returned by get calls (`cf_name`).
    pub rocksdb_get_bytes: HistogramVec,
    /// Multiget latency in seconds (`cf_name`).
    pub rocksdb_multiget_latency_seconds: HistogramVec,
    /// Size in bytes of data returned by multiget calls (`cf_name`).
    pub rocksdb_multiget_bytes: HistogramVec,
    /// Put latency in seconds (`cf_name`).
    pub rocksdb_put_latency_seconds: HistogramVec,
    /// Size in bytes of data written by put calls (`cf_name`).
    pub rocksdb_put_bytes: HistogramVec,
    /// Size in bytes of data written by batch put calls (`cf_name`).
    pub rocksdb_batch_put_bytes: HistogramVec,
    /// Delete latency in seconds (`cf_name`).
    pub rocksdb_delete_latency_seconds: HistogramVec,
    /// Number of delete calls (`cf_name`).
    pub rocksdb_deletes: IntCounterVec,
    /// Batch commit latency in seconds (`db_name`); registered as
    /// `rocksdb_write_batch_commit_latency_seconds`.
    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
    /// Batch commit size in bytes (`db_name`).
    pub rocksdb_batch_commit_bytes: HistogramVec,
    /// Number of active db handles (`db_name`).
    pub rocksdb_num_active_db_handles: IntGaugeVec,
    /// Number of batch writes that took more than 1 second (`db_name`); registered as
    /// `rocksdb_num_very_slow_batch_writes`.
    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
    /// Total duration of batch writes that took more than 1 second (`db_name`);
    /// registered as `rocksdb_very_slow_batch_writes_duration`.
    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
    /// Number of puts that took more than 1 second (`cf_name`); registered as
    /// `rocksdb_num_very_slow_puts`.
    pub rocksdb_very_slow_puts_count: IntCounterVec,
    /// Total duration of puts that took more than 1 second (`cf_name`); registered as
    /// `rocksdb_very_slow_puts_duration`.
    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
}
302
303impl OperationMetrics {
304    pub(crate) fn new(registry: &Registry) -> Self {
305        OperationMetrics {
306            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
307                "rocksdb_iter_latency_seconds",
308                "Rocksdb iter latency in seconds",
309                &["cf_name"],
310                LATENCY_SEC_BUCKETS.to_vec(),
311                registry,
312            )
313            .unwrap(),
314            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
315                "rocksdb_iter_bytes",
316                "Rocksdb iter size in bytes",
317                &["cf_name"],
318                prometheus::exponential_buckets(1.0, 4.0, 15)
319                    .unwrap()
320                    .to_vec(),
321                registry,
322            )
323            .unwrap(),
324            rocksdb_iter_keys: register_histogram_vec_with_registry!(
325                "rocksdb_iter_keys",
326                "Rocksdb iter num keys",
327                &["cf_name"],
328                registry,
329            )
330            .unwrap(),
331            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
332                "rocksdb_get_latency_seconds",
333                "Rocksdb get latency in seconds",
334                &["cf_name"],
335                LATENCY_SEC_BUCKETS.to_vec(),
336                registry,
337            )
338            .unwrap(),
339            rocksdb_get_bytes: register_histogram_vec_with_registry!(
340                "rocksdb_get_bytes",
341                "Rocksdb get call returned data size in bytes",
342                &["cf_name"],
343                prometheus::exponential_buckets(1.0, 4.0, 15)
344                    .unwrap()
345                    .to_vec(),
346                registry
347            )
348            .unwrap(),
349            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
350                "rocksdb_multiget_latency_seconds",
351                "Rocksdb multiget latency in seconds",
352                &["cf_name"],
353                LATENCY_SEC_BUCKETS.to_vec(),
354                registry,
355            )
356            .unwrap(),
357            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
358                "rocksdb_multiget_bytes",
359                "Rocksdb multiget call returned data size in bytes",
360                &["cf_name"],
361                prometheus::exponential_buckets(1.0, 4.0, 15)
362                    .unwrap()
363                    .to_vec(),
364                registry,
365            )
366            .unwrap(),
367            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
368                "rocksdb_put_latency_seconds",
369                "Rocksdb put latency in seconds",
370                &["cf_name"],
371                LATENCY_SEC_BUCKETS.to_vec(),
372                registry,
373            )
374            .unwrap(),
375            rocksdb_put_bytes: register_histogram_vec_with_registry!(
376                "rocksdb_put_bytes",
377                "Rocksdb put call puts data size in bytes",
378                &["cf_name"],
379                prometheus::exponential_buckets(1.0, 4.0, 15)
380                    .unwrap()
381                    .to_vec(),
382                registry,
383            )
384            .unwrap(),
385            rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
386                "rocksdb_batch_put_bytes",
387                "Rocksdb batch put call puts data size in bytes",
388                &["cf_name"],
389                prometheus::exponential_buckets(1.0, 4.0, 15)
390                    .unwrap()
391                    .to_vec(),
392                registry,
393            )
394            .unwrap(),
395            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
396                "rocksdb_delete_latency_seconds",
397                "Rocksdb delete latency in seconds",
398                &["cf_name"],
399                LATENCY_SEC_BUCKETS.to_vec(),
400                registry,
401            )
402            .unwrap(),
403            rocksdb_deletes: register_int_counter_vec_with_registry!(
404                "rocksdb_deletes",
405                "Rocksdb delete calls",
406                &["cf_name"],
407                registry
408            )
409            .unwrap(),
410            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
411                "rocksdb_write_batch_commit_latency_seconds",
412                "Rocksdb schema batch commit latency in seconds",
413                &["db_name"],
414                LATENCY_SEC_BUCKETS.to_vec(),
415                registry,
416            )
417            .unwrap(),
418            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
419                "rocksdb_batch_commit_bytes",
420                "Rocksdb schema batch commit size in bytes",
421                &["db_name"],
422                prometheus::exponential_buckets(1.0, 4.0, 15)
423                    .unwrap()
424                    .to_vec(),
425                registry,
426            )
427            .unwrap(),
428            rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
429                "rocksdb_num_active_db_handles",
430                "Number of active db handles",
431                &["db_name"],
432                registry,
433            )
434            .unwrap(),
435            rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
436                "rocksdb_num_very_slow_batch_writes",
437                "Number of batch writes that took more than 1 second",
438                &["db_name"],
439                registry,
440            )
441            .unwrap(),
442            rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
443                "rocksdb_very_slow_batch_writes_duration",
444                "Total duration of batch writes that took more than 1 second",
445                &["db_name"],
446                registry,
447            )
448            .unwrap(),
449            rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
450                "rocksdb_num_very_slow_puts",
451                "Number of puts that took more than 1 second",
452                &["cf_name"],
453                registry,
454            )
455            .unwrap(),
456            rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
457                "rocksdb_very_slow_puts_duration",
458                "Total duration of puts that took more than 1 second",
459                &["cf_name"],
460                registry,
461            )
462            .unwrap(),
463        }
464    }
465}
466
/// Scope marker for RocksDB perf-stat collection.
///
/// Creating one through `Default` sets the perf-stats level to `EnableTime` and
/// resets the thread-local `PerfContext`; dropping it sets the level to `Disable`.
pub struct RocksDBPerfContext;
468
469impl Default for RocksDBPerfContext {
470    fn default() -> Self {
471        set_perf_stats(PerfStatsLevel::EnableTime);
472        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
473            perf_context.borrow_mut().reset();
474        });
475        RocksDBPerfContext {}
476    }
477}
478
impl Drop for RocksDBPerfContext {
    /// Sets the perf-stats level back to `Disable` when the value is dropped.
    fn drop(&mut self) {
        // Counterpart of the `EnableTime` level set in `Default::default`.
        set_perf_stats(PerfStatsLevel::Disable);
    }
}
484
/// Prometheus counters mirroring read-path values of RocksDB's `PerfContext`.
///
/// All counters are registered by `ReadPerfContextMetrics::new` under their field
/// name with a single `cf_name` label; `report_metrics` adds the current thread-local
/// `PerfContext` values to them.
#[derive(Debug)]
pub struct ReadPerfContextMetrics {
    /// Number of user key comparisons.
    pub user_key_comparison_count: IntCounterVec,
    /// Times data blocks were read from the block cache.
    pub block_cache_hit_count: IntCounterVec,
    /// Times blocks had to be read from the file system.
    pub block_read_count: IntCounterVec,
    /// Total bytes read from the file system.
    pub block_read_byte: IntCounterVec,
    /// Total nanos spent on block reads.
    pub block_read_nanos: IntCounterVec,
    /// Total nanos spent verifying block checksums.
    pub block_checksum_nanos: IntCounterVec,
    /// Total nanos spent decompressing blocks.
    pub block_decompress_nanos: IntCounterVec,
    /// Total bytes of values returned by Get.
    pub get_read_bytes: IntCounterVec,
    /// Total bytes of values returned by MultiGet.
    pub multiget_read_bytes: IntCounterVec,
    /// Time spent getting a snapshot.
    pub get_snapshot_nanos: IntCounterVec,
    /// Time spent reading data from memtables.
    pub get_from_memtable_nanos: IntCounterVec,
    /// Number of memtables queried.
    pub get_from_memtable_count: IntCounterVec,
    /// Total nanos spent after Get() finds a key.
    pub get_post_process_nanos: IntCounterVec,
    /// Total nanos spent reading from output files.
    pub get_from_output_files_nanos: IntCounterVec,
    /// Time spent acquiring the db mutex.
    pub db_mutex_lock_nanos: IntCounterVec,
    /// Time spent waiting on a condition variable created with the db mutex.
    pub db_condition_wait_nanos: IntCounterVec,
    /// Time spent in the merge operator.
    pub merge_operator_nanos: IntCounterVec,
    /// Time spent reading index blocks from block cache or SST file.
    pub read_index_block_nanos: IntCounterVec,
    /// Time spent reading filter blocks from block cache or SST file.
    pub read_filter_block_nanos: IntCounterVec,
    /// Time spent creating data block iterators.
    pub new_table_block_iter_nanos: IntCounterVec,
    /// Time spent seeking a key in data/index blocks.
    pub block_seek_nanos: IntCounterVec,
    /// Time spent finding or creating a table reader.
    pub find_table_nanos: IntCounterVec,
    /// Total number of memtable bloom hits.
    pub bloom_memtable_hit_count: IntCounterVec,
    /// Total number of memtable bloom misses.
    pub bloom_memtable_miss_count: IntCounterVec,
    /// Total number of SST table bloom hits.
    pub bloom_sst_hit_count: IntCounterVec,
    /// Total number of SST table bloom misses.
    pub bloom_sst_miss_count: IntCounterVec,
    /// Time spent waiting on key locks in the transaction lock manager.
    pub key_lock_wait_time: IntCounterVec,
    /// Times acquiring a lock was blocked by another transaction.
    pub key_lock_wait_count: IntCounterVec,
    /// Total number of deleted keys skipped during iteration.
    pub internal_delete_skipped_count: IntCounterVec,
    /// Total number of internal keys skipped during iteration.
    pub internal_skipped_count: IntCounterVec,
}
518
519impl ReadPerfContextMetrics {
520    pub(crate) fn new(registry: &Registry) -> Self {
521        ReadPerfContextMetrics {
522            user_key_comparison_count: register_int_counter_vec_with_registry!(
523                "user_key_comparison_count",
524                "Helps us figure out whether too many comparisons in binary search can be a problem,
525                especially when a more expensive comparator is used. Moreover, since number of comparisons
526                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
527                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
528                You may want to check whether flush/compaction can keep up with the write speed",
529                &["cf_name"],
530                registry,
531            )
532            .unwrap(),
533            block_cache_hit_count: register_int_counter_vec_with_registry!(
534                "block_cache_hit_count",
535                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
536                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
537                We can evaluate the block cache efficiency by looking at the two counters over time.",
538                &["cf_name"],
539                registry,
540            )
541            .unwrap(),
542            block_read_count: register_int_counter_vec_with_registry!(
543                "block_read_count",
544                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
545                &["cf_name"],
546                registry,
547            )
548            .unwrap(),
549            block_read_byte: register_int_counter_vec_with_registry!(
550                "block_read_byte",
551                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
552                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
553                of a very large key or value",
554                &["cf_name"],
555                registry,
556            )
557            .unwrap(),
558            block_read_nanos: register_int_counter_vec_with_registry!(
559                "block_read_nanos",
560                "Total nanos spent on block reads",
561                &["cf_name"],
562                registry,
563            )
564            .unwrap(),
565            block_checksum_nanos: register_int_counter_vec_with_registry!(
566                "block_checksum_nanos",
567                "Total nanos spent on verifying block checksum",
568                &["cf_name"],
569                registry,
570            )
571            .unwrap(),
572            block_decompress_nanos: register_int_counter_vec_with_registry!(
573                "block_decompress_nanos",
574                "Total nanos spent on decompressing a block",
575                &["cf_name"],
576                registry,
577            )
578            .unwrap(),
579            get_read_bytes: register_int_counter_vec_with_registry!(
580                "get_read_bytes",
581                "Total bytes for values returned by Get",
582                &["cf_name"],
583                registry,
584            )
585            .unwrap(),
586            multiget_read_bytes: register_int_counter_vec_with_registry!(
587                "multiget_read_bytes",
588                "Total bytes for values returned by MultiGet.",
589                &["cf_name"],
590                registry,
591            )
592            .unwrap(),
593            get_snapshot_nanos: register_int_counter_vec_with_registry!(
594                "get_snapshot_nanos",
595                "Time spent in getting snapshot.",
596                &["cf_name"],
597                registry,
598            )
599            .unwrap(),
600            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
601                "get_from_memtable_nanos",
602                "Time spent on reading data from memtable.",
603                &["cf_name"],
604                registry,
605            )
606            .unwrap(),
607            get_from_memtable_count: register_int_counter_vec_with_registry!(
608                "get_from_memtable_count",
609                "Number of memtables queried",
610                &["cf_name"],
611                registry,
612            )
613            .unwrap(),
614            get_post_process_nanos: register_int_counter_vec_with_registry!(
615                "get_post_process_nanos",
616                "Total nanos spent after Get() finds a key",
617                &["cf_name"],
618                registry,
619            )
620            .unwrap(),
621            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
622                "get_from_output_files_nanos",
623                "Total nanos reading from output files",
624                &["cf_name"],
625                registry,
626            )
627            .unwrap(),
628            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
629                "db_mutex_lock_nanos",
630                "Time spent on acquiring db mutex",
631                &["cf_name"],
632                registry,
633            )
634            .unwrap(),
635            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
636                "db_condition_wait_nanos",
637                "Time spent waiting with a condition variable created with DB Mutex.",
638                &["cf_name"],
639                registry,
640            )
641            .unwrap(),
642            merge_operator_nanos: register_int_counter_vec_with_registry!(
643                "merge_operator_nanos",
644                "Time spent on merge operator.",
645                &["cf_name"],
646                registry,
647            )
648            .unwrap(),
649            read_index_block_nanos: register_int_counter_vec_with_registry!(
650                "read_index_block_nanos",
651                "Time spent on reading index block from block cache or SST file",
652                &["cf_name"],
653                registry,
654            )
655            .unwrap(),
656            read_filter_block_nanos: register_int_counter_vec_with_registry!(
657                "read_filter_block_nanos",
658                "Time spent on reading filter block from block cache or SST file",
659                &["cf_name"],
660                registry,
661            )
662            .unwrap(),
663            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
664                "new_table_block_iter_nanos",
665                "Time spent on creating data block iterator",
666                &["cf_name"],
667                registry,
668            )
669            .unwrap(),
670            block_seek_nanos: register_int_counter_vec_with_registry!(
671                "block_seek_nanos",
672                "Time spent on seeking a key in data/index blocks",
673                &["cf_name"],
674                registry,
675            )
676            .unwrap(),
677            find_table_nanos: register_int_counter_vec_with_registry!(
678                "find_table_nanos",
679                "Time spent on finding or creating a table reader",
680                &["cf_name"],
681                registry,
682            )
683            .unwrap(),
684            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
685                "bloom_memtable_hit_count",
686                "Total number of mem table bloom hits",
687                &["cf_name"],
688                registry,
689            )
690            .unwrap(),
691            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
692                "bloom_memtable_miss_count",
693                "Total number of mem table bloom misses",
694                &["cf_name"],
695                registry,
696            )
697            .unwrap(),
698            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
699                "bloom_sst_hit_count",
700                "Total number of SST table bloom hits",
701                &["cf_name"],
702                registry,
703            )
704            .unwrap(),
705            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
706                "bloom_sst_miss_count",
707                "Total number of SST table bloom misses",
708                &["cf_name"],
709                registry,
710            )
711            .unwrap(),
712            key_lock_wait_time: register_int_counter_vec_with_registry!(
713                "key_lock_wait_time",
714                "Time spent waiting on key locks in transaction lock manager",
715                &["cf_name"],
716                registry,
717            )
718            .unwrap(),
719            key_lock_wait_count: register_int_counter_vec_with_registry!(
720                "key_lock_wait_count",
721                "Number of times acquiring a lock was blocked by another transaction",
722                &["cf_name"],
723                registry,
724            )
725            .unwrap(),
726            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
727                "internal_delete_skipped_count",
728                "Total number of deleted keys skipped during iteration",
729                &["cf_name"],
730                registry,
731            )
732                .unwrap(),
733            internal_skipped_count: register_int_counter_vec_with_registry!(
734                "internal_skipped_count",
735                "Totall number of internal keys skipped during iteration",
736                &["cf_name"],
737                registry,
738            )
739                .unwrap(),
740        }
741    }
742
743    pub fn report_metrics(&self, cf_name: &str) {
744        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
745            set_perf_stats(PerfStatsLevel::Disable);
746            let perf_context = perf_context_cell.borrow();
747            self.user_key_comparison_count
748                .with_label_values(&[cf_name])
749                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
750            self.block_cache_hit_count
751                .with_label_values(&[cf_name])
752                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
753            self.block_read_count
754                .with_label_values(&[cf_name])
755                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
756            self.block_read_byte
757                .with_label_values(&[cf_name])
758                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
759            self.block_read_nanos
760                .with_label_values(&[cf_name])
761                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
762            self.block_read_count
763                .with_label_values(&[cf_name])
764                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
765            self.block_checksum_nanos
766                .with_label_values(&[cf_name])
767                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
768            self.block_decompress_nanos
769                .with_label_values(&[cf_name])
770                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
771            self.get_read_bytes
772                .with_label_values(&[cf_name])
773                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
774            self.multiget_read_bytes
775                .with_label_values(&[cf_name])
776                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
777            self.get_snapshot_nanos
778                .with_label_values(&[cf_name])
779                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
780            self.get_from_memtable_nanos
781                .with_label_values(&[cf_name])
782                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
783            self.get_from_memtable_count
784                .with_label_values(&[cf_name])
785                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
786            self.get_post_process_nanos
787                .with_label_values(&[cf_name])
788                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
789            self.get_from_output_files_nanos
790                .with_label_values(&[cf_name])
791                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
792            self.db_mutex_lock_nanos
793                .with_label_values(&[cf_name])
794                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
795            self.db_condition_wait_nanos
796                .with_label_values(&[cf_name])
797                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
798            self.merge_operator_nanos
799                .with_label_values(&[cf_name])
800                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
801            self.read_index_block_nanos
802                .with_label_values(&[cf_name])
803                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
804            self.read_filter_block_nanos
805                .with_label_values(&[cf_name])
806                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
807            self.new_table_block_iter_nanos
808                .with_label_values(&[cf_name])
809                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
810            self.block_seek_nanos
811                .with_label_values(&[cf_name])
812                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
813            self.find_table_nanos
814                .with_label_values(&[cf_name])
815                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
816            self.bloom_memtable_hit_count
817                .with_label_values(&[cf_name])
818                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
819            self.bloom_memtable_miss_count
820                .with_label_values(&[cf_name])
821                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
822            self.bloom_sst_hit_count
823                .with_label_values(&[cf_name])
824                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
825            self.bloom_sst_miss_count
826                .with_label_values(&[cf_name])
827                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
828            self.key_lock_wait_time
829                .with_label_values(&[cf_name])
830                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
831            self.key_lock_wait_count
832                .with_label_values(&[cf_name])
833                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
834            self.internal_delete_skipped_count
835                .with_label_values(&[cf_name])
836                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
837            self.internal_skipped_count
838                .with_label_values(&[cf_name])
839                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
840        });
841    }
842}
843
844#[derive(Debug)]
845pub struct WritePerfContextMetrics {
846    pub write_wal_nanos: IntCounterVec,
847    pub write_memtable_nanos: IntCounterVec,
848    pub write_delay_nanos: IntCounterVec,
849    pub write_pre_and_post_process_nanos: IntCounterVec,
850    pub write_db_mutex_lock_nanos: IntCounterVec,
851    pub write_db_condition_wait_nanos: IntCounterVec,
852    pub write_key_lock_wait_nanos: IntCounterVec,
853    pub write_key_lock_wait_count: IntCounterVec,
854}
855
856impl WritePerfContextMetrics {
857    pub(crate) fn new(registry: &Registry) -> Self {
858        WritePerfContextMetrics {
859            write_wal_nanos: register_int_counter_vec_with_registry!(
860                "write_wal_nanos",
861                "Total nanos spent on writing to WAL",
862                &["cf_name"],
863                registry,
864            )
865            .unwrap(),
866            write_memtable_nanos: register_int_counter_vec_with_registry!(
867                "write_memtable_nanos",
868                "Total nanos spent on writing to memtable",
869                &["cf_name"],
870                registry,
871            )
872            .unwrap(),
873            write_delay_nanos: register_int_counter_vec_with_registry!(
874                "write_delay_nanos",
875                "Total nanos spent on delaying or throttling write",
876                &["cf_name"],
877                registry,
878            )
879            .unwrap(),
880            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
881                "write_pre_and_post_process_nanos",
882                "Total nanos spent on writing a record, excluding the above four things",
883                &["cf_name"],
884                registry,
885            )
886            .unwrap(),
887            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
888                "write_db_mutex_lock_nanos",
889                "Time spent on acquiring db mutex",
890                &["cf_name"],
891                registry,
892            )
893            .unwrap(),
894            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
895                "write_db_condition_wait_nanos",
896                "Time spent waiting with a condition variable created with DB Mutex.",
897                &["cf_name"],
898                registry,
899            )
900            .unwrap(),
901            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
902                "write_key_lock_wait_time",
903                "Time spent waiting on key locks in transaction lock manager",
904                &["cf_name"],
905                registry,
906            )
907            .unwrap(),
908            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
909                "write_key_lock_wait_count",
910                "Number of times acquiring a lock was blocked by another transaction",
911                &["cf_name"],
912                registry,
913            )
914            .unwrap(),
915        }
916    }
917    pub fn report_metrics(&self, db_name: &str) {
918        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
919            set_perf_stats(PerfStatsLevel::Disable);
920            let perf_context = perf_context_cell.borrow();
921            self.write_wal_nanos
922                .with_label_values(&[db_name])
923                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
924            self.write_memtable_nanos
925                .with_label_values(&[db_name])
926                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
927            self.write_delay_nanos
928                .with_label_values(&[db_name])
929                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
930            self.write_pre_and_post_process_nanos
931                .with_label_values(&[db_name])
932                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
933            self.write_db_mutex_lock_nanos
934                .with_label_values(&[db_name])
935                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
936            self.write_db_condition_wait_nanos
937                .with_label_values(&[db_name])
938                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
939            self.write_key_lock_wait_nanos
940                .with_label_values(&[db_name])
941                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
942            self.write_key_lock_wait_count
943                .with_label_values(&[db_name])
944                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
945        });
946    }
947}
948
949#[derive(Debug)]
950pub struct DBMetrics {
951    pub op_metrics: OperationMetrics,
952    pub cf_metrics: ColumnFamilyMetrics,
953    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
954    pub write_perf_ctx_metrics: WritePerfContextMetrics,
955}
956
957static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
958
959impl DBMetrics {
960    fn new(registry: &Registry) -> Self {
961        DBMetrics {
962            op_metrics: OperationMetrics::new(registry),
963            cf_metrics: ColumnFamilyMetrics::new(registry),
964            read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
965            write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
966        }
967    }
968    pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
969        // Initialize this before creating any instance of DBMap
970        // TODO: Remove static initialization because this basically means we can
971        // only ever initialize db metrics once with a registry whereas
972        // in the code we might want to initialize it with different
973        // registries. The problem is underlying metrics cannot be re-initialized
974        // or prometheus complains. We essentially need to pass in DBMetrics
975        // everywhere we create DBMap as the right fix
976        let _ = ONCE
977            .set(Arc::new(DBMetrics::new(registry)))
978            // this happens many times during tests
979            .tap_err(|_| warn!("DBMetrics registry overwritten"));
980        ONCE.get().unwrap()
981    }
982    pub fn increment_num_active_dbs(&self, db_name: &str) {
983        self.op_metrics
984            .rocksdb_num_active_db_handles
985            .with_label_values(&[db_name])
986            .inc();
987    }
988    pub fn decrement_num_active_dbs(&self, db_name: &str) {
989        self.op_metrics
990            .rocksdb_num_active_db_handles
991            .with_label_values(&[db_name])
992            .dec();
993    }
994    pub fn get() -> &'static Arc<DBMetrics> {
995        ONCE.get()
996            .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
997    }
998}