1use std::{
6 cell::RefCell,
7 sync::{
8 Arc,
9 atomic::{AtomicU64, Ordering},
10 },
11 time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16 HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17 register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24 static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext> = RefCell::new(PerfContext::default());
25}
26
/// Upper bounds, in seconds, of the buckets used by every latency histogram
/// registered in this file (1 ms up to 90 s, strictly increasing).
const LATENCY_SEC_BUCKETS: &[f64] = &[
    0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 90.0,
];
30
/// Decides which operations get sampled, either by elapsed time or by
/// operation count.
///
/// Cloning shares the same `counter`, so all clones take part in one sampling
/// sequence.
#[derive(Debug, Clone)]
pub struct SamplingInterval {
    /// When non-zero, sampling is time-based: the counter is periodically reset
    /// at this interval and the first `sample()` call after a reset returns
    /// `true`. When zero, sampling is purely count-based.
    pub once_every_duration: Duration,
    /// In count-based mode, `sample()` returns `true` once every
    /// `after_num_ops + 1` calls. In time-based mode, the periodic reset only
    /// happens once the counter exceeds this value.
    pub after_num_ops: u64,
    /// Number of `sample()` calls observed since the last reset (starts at 1
    /// when built through `SamplingInterval::new`).
    pub counter: Arc<AtomicU64>,
}
42
43impl Default for SamplingInterval {
44 fn default() -> Self {
45 SamplingInterval::new(Duration::from_secs(60), 0)
47 }
48}
49
50impl SamplingInterval {
51 pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
52 let counter = Arc::new(AtomicU64::new(1));
53 if !once_every_duration.is_zero() {
54 let counter = counter.clone();
55 tokio::task::spawn(async move {
56 loop {
57 if counter.load(Ordering::SeqCst) > after_num_ops {
58 counter.store(0, Ordering::SeqCst);
59 }
60 tokio::time::sleep(once_every_duration).await;
61 }
62 });
63 }
64 SamplingInterval {
65 once_every_duration,
66 after_num_ops,
67 counter,
68 }
69 }
70 pub fn new_from_self(&self) -> SamplingInterval {
71 SamplingInterval::new(self.once_every_duration, self.after_num_ops)
72 }
73 pub fn sample(&self) -> bool {
74 if self.once_every_duration.is_zero() {
75 self.counter.fetch_add(1, Ordering::Relaxed) % (self.after_num_ops + 1) == 0
76 } else {
77 self.counter.fetch_add(1, Ordering::Relaxed) == 0
78 }
79 }
80}
81
82#[derive(Debug)]
83pub struct ColumnFamilyMetrics {
84 pub rocksdb_total_sst_files_size: IntGaugeVec,
85 pub rocksdb_total_blob_files_size: IntGaugeVec,
86 pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
87 pub rocksdb_size_all_mem_tables: IntGaugeVec,
88 pub rocksdb_num_snapshots: IntGaugeVec,
89 pub rocksdb_oldest_snapshot_time: IntGaugeVec,
90 pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
91 pub rocksdb_is_write_stopped: IntGaugeVec,
92 pub rocksdb_block_cache_capacity: IntGaugeVec,
93 pub rocksdb_block_cache_usage: IntGaugeVec,
94 pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
95 pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
96 pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
97 pub rocksdb_mem_table_flush_pending: IntGaugeVec,
98 pub rocksdb_compaction_pending: IntGaugeVec,
99 pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
100 pub rocksdb_num_running_compactions: IntGaugeVec,
101 pub rocksdb_num_running_flushes: IntGaugeVec,
102 pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
103 pub rocksdb_background_errors: IntGaugeVec,
104 pub rocksdb_estimated_num_keys: IntGaugeVec,
105 pub rocksdb_base_level: IntGaugeVec,
106}
107
108impl ColumnFamilyMetrics {
109 pub(crate) fn new(registry: &Registry) -> Self {
110 ColumnFamilyMetrics {
111 rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
112 "rocksdb_total_sst_files_size",
113 "The storage size occupied by the sst files in the column family",
114 &["cf_name"],
115 registry,
116 )
117 .unwrap(),
118 rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
119 "rocksdb_total_blob_files_size",
120 "The storage size occupied by the blob files in the column family",
121 &["cf_name"],
122 registry,
123 )
124 .unwrap(),
125 rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
126 "rocksdb_current_size_active_mem_tables",
127 "The current approximate size of active memtable (bytes).",
128 &["cf_name"],
129 registry,
130 )
131 .unwrap(),
132 rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
133 "rocksdb_size_all_mem_tables",
134 "The memory size occupied by the column family's in-memory buffer",
135 &["cf_name"],
136 registry,
137 )
138 .unwrap(),
139 rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
140 "rocksdb_num_snapshots",
141 "Number of snapshots held for the column family",
142 &["cf_name"],
143 registry,
144 )
145 .unwrap(),
146 rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
147 "rocksdb_oldest_snapshot_time",
148 "Unit timestamp of the oldest unreleased snapshot",
149 &["cf_name"],
150 registry,
151 )
152 .unwrap(),
153 rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
154 "rocksdb_actual_delayed_write_rate",
155 "The current actual delayed write rate. 0 means no delay",
156 &["cf_name"],
157 registry,
158 )
159 .unwrap(),
160 rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
161 "rocksdb_is_write_stopped",
162 "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
163 &["cf_name"],
164 registry,
165 )
166 .unwrap(),
167 rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
168 "rocksdb_block_cache_capacity",
169 "The block cache capacity of the column family.",
170 &["cf_name"],
171 registry,
172 )
173 .unwrap(),
174 rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
175 "rocksdb_block_cache_usage",
176 "The memory size used by the column family in the block cache.",
177 &["cf_name"],
178 registry,
179 )
180 .unwrap(),
181 rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
182 "rocksdb_block_cache_pinned_usage",
183 "The memory size used by the column family in the block cache where entries are pinned",
184 &["cf_name"],
185 registry,
186 )
187 .unwrap(),
188 rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
189 "rocksdb_estimate_table_readers_mem",
190 "The estimated memory size used for reading SST tables in this column
191 family such as filters and index blocks. Note that this number does not
192 include the memory used in block cache.",
193 &["cf_name"],
194 registry,
195 )
196 .unwrap(),
197 rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
198 "rocksdb_num_immutable_mem_tables",
199 "The number of immutable memtables that have not yet been flushed.",
200 &["cf_name"],
201 registry,
202 )
203 .unwrap(),
204 rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
205 "rocksdb_mem_table_flush_pending",
206 "A 1 or 0 flag indicating whether a memtable flush is pending.
207 If this number is 1, it means a memtable is waiting for being flushed,
208 but there might be too many L0 files that prevents it from being flushed.",
209 &["cf_name"],
210 registry,
211 )
212 .unwrap(),
213 rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
214 "rocksdb_compaction_pending",
215 "A 1 or 0 flag indicating whether a compaction job is pending.
216 If this number is 1, it means some part of the column family requires
217 compaction in order to maintain shape of LSM tree, but the compaction
218 is pending because the desired compaction job is either waiting for
219 other dependent compactions to be finished or waiting for an available
220 compaction thread.",
221 &["cf_name"],
222 registry,
223 )
224 .unwrap(),
225 rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
226 "rocksdb_estimate_pending_compaction_bytes",
227 "Estimated total number of bytes compaction needs to rewrite to get all levels down
228 to under target size. Not valid for other compactions than level-based.",
229 &["cf_name"],
230 registry,
231 )
232 .unwrap(),
233 rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
234 "rocksdb_num_running_compactions",
235 "The number of compactions that are currently running for the column family.",
236 &["cf_name"],
237 registry,
238 )
239 .unwrap(),
240 rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
241 "rocksdb_num_running_flushes",
242 "The number of flushes that are currently running for the column family.",
243 &["cf_name"],
244 registry,
245 )
246 .unwrap(),
247 rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
248 "rocksdb_estimate_oldest_key_time",
249 "Estimation of the oldest key timestamp in the DB. Only available
250 for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
251 &["cf_name"],
252 registry,
253 )
254 .unwrap(),
255 rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
256 "rocksdb_estimated_num_keys",
257 "The estimated number of keys in the table",
258 &["cf_name"],
259 registry,
260 )
261 .unwrap(),
262 rocksdb_background_errors: register_int_gauge_vec_with_registry!(
263 "rocksdb_background_errors",
264 "The accumulated number of RocksDB background errors.",
265 &["cf_name"],
266 registry,
267 )
268 .unwrap(),
269 rocksdb_base_level: register_int_gauge_vec_with_registry!(
270 "rocksdb_base_level",
271 "The number of level to which L0 data will be compacted.",
272 &["cf_name"],
273 registry,
274 )
275 .unwrap(),
276 }
277 }
278}
279
280#[derive(Debug)]
281pub struct OperationMetrics {
282 pub rocksdb_iter_latency_seconds: HistogramVec,
283 pub rocksdb_iter_bytes: HistogramVec,
284 pub rocksdb_iter_keys: HistogramVec,
285 pub rocksdb_get_latency_seconds: HistogramVec,
286 pub rocksdb_get_bytes: HistogramVec,
287 pub rocksdb_multiget_latency_seconds: HistogramVec,
288 pub rocksdb_multiget_bytes: HistogramVec,
289 pub rocksdb_put_latency_seconds: HistogramVec,
290 pub rocksdb_put_bytes: HistogramVec,
291 pub rocksdb_batch_put_bytes: HistogramVec,
292 pub rocksdb_delete_latency_seconds: HistogramVec,
293 pub rocksdb_deletes: IntCounterVec,
294 pub rocksdb_batch_commit_latency_seconds: HistogramVec,
295 pub rocksdb_batch_commit_bytes: HistogramVec,
296 pub rocksdb_num_active_db_handles: IntGaugeVec,
297 pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
298 pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
299 pub rocksdb_very_slow_puts_count: IntCounterVec,
300 pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
301}
302
303impl OperationMetrics {
304 pub(crate) fn new(registry: &Registry) -> Self {
305 OperationMetrics {
306 rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
307 "rocksdb_iter_latency_seconds",
308 "Rocksdb iter latency in seconds",
309 &["cf_name"],
310 LATENCY_SEC_BUCKETS.to_vec(),
311 registry,
312 )
313 .unwrap(),
314 rocksdb_iter_bytes: register_histogram_vec_with_registry!(
315 "rocksdb_iter_bytes",
316 "Rocksdb iter size in bytes",
317 &["cf_name"],
318 prometheus::exponential_buckets(1.0, 4.0, 15)
319 .unwrap()
320 .to_vec(),
321 registry,
322 )
323 .unwrap(),
324 rocksdb_iter_keys: register_histogram_vec_with_registry!(
325 "rocksdb_iter_keys",
326 "Rocksdb iter num keys",
327 &["cf_name"],
328 registry,
329 )
330 .unwrap(),
331 rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
332 "rocksdb_get_latency_seconds",
333 "Rocksdb get latency in seconds",
334 &["cf_name"],
335 LATENCY_SEC_BUCKETS.to_vec(),
336 registry,
337 )
338 .unwrap(),
339 rocksdb_get_bytes: register_histogram_vec_with_registry!(
340 "rocksdb_get_bytes",
341 "Rocksdb get call returned data size in bytes",
342 &["cf_name"],
343 prometheus::exponential_buckets(1.0, 4.0, 15)
344 .unwrap()
345 .to_vec(),
346 registry
347 )
348 .unwrap(),
349 rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
350 "rocksdb_multiget_latency_seconds",
351 "Rocksdb multiget latency in seconds",
352 &["cf_name"],
353 LATENCY_SEC_BUCKETS.to_vec(),
354 registry,
355 )
356 .unwrap(),
357 rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
358 "rocksdb_multiget_bytes",
359 "Rocksdb multiget call returned data size in bytes",
360 &["cf_name"],
361 prometheus::exponential_buckets(1.0, 4.0, 15)
362 .unwrap()
363 .to_vec(),
364 registry,
365 )
366 .unwrap(),
367 rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
368 "rocksdb_put_latency_seconds",
369 "Rocksdb put latency in seconds",
370 &["cf_name"],
371 LATENCY_SEC_BUCKETS.to_vec(),
372 registry,
373 )
374 .unwrap(),
375 rocksdb_put_bytes: register_histogram_vec_with_registry!(
376 "rocksdb_put_bytes",
377 "Rocksdb put call puts data size in bytes",
378 &["cf_name"],
379 prometheus::exponential_buckets(1.0, 4.0, 15)
380 .unwrap()
381 .to_vec(),
382 registry,
383 )
384 .unwrap(),
385 rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
386 "rocksdb_batch_put_bytes",
387 "Rocksdb batch put call puts data size in bytes",
388 &["cf_name"],
389 prometheus::exponential_buckets(1.0, 4.0, 15)
390 .unwrap()
391 .to_vec(),
392 registry,
393 )
394 .unwrap(),
395 rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
396 "rocksdb_delete_latency_seconds",
397 "Rocksdb delete latency in seconds",
398 &["cf_name"],
399 LATENCY_SEC_BUCKETS.to_vec(),
400 registry,
401 )
402 .unwrap(),
403 rocksdb_deletes: register_int_counter_vec_with_registry!(
404 "rocksdb_deletes",
405 "Rocksdb delete calls",
406 &["cf_name"],
407 registry
408 )
409 .unwrap(),
410 rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
411 "rocksdb_write_batch_commit_latency_seconds",
412 "Rocksdb schema batch commit latency in seconds",
413 &["db_name"],
414 LATENCY_SEC_BUCKETS.to_vec(),
415 registry,
416 )
417 .unwrap(),
418 rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
419 "rocksdb_batch_commit_bytes",
420 "Rocksdb schema batch commit size in bytes",
421 &["db_name"],
422 prometheus::exponential_buckets(1.0, 4.0, 15)
423 .unwrap()
424 .to_vec(),
425 registry,
426 )
427 .unwrap(),
428 rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
429 "rocksdb_num_active_db_handles",
430 "Number of active db handles",
431 &["db_name"],
432 registry,
433 )
434 .unwrap(),
435 rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
436 "rocksdb_num_very_slow_batch_writes",
437 "Number of batch writes that took more than 1 second",
438 &["db_name"],
439 registry,
440 )
441 .unwrap(),
442 rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
443 "rocksdb_very_slow_batch_writes_duration",
444 "Total duration of batch writes that took more than 1 second",
445 &["db_name"],
446 registry,
447 )
448 .unwrap(),
449 rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
450 "rocksdb_num_very_slow_puts",
451 "Number of puts that took more than 1 second",
452 &["cf_name"],
453 registry,
454 )
455 .unwrap(),
456 rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
457 "rocksdb_very_slow_puts_duration",
458 "Total duration of puts that took more than 1 second",
459 &["cf_name"],
460 registry,
461 )
462 .unwrap(),
463 }
464 }
465}
466
/// Scope guard for RocksDB perf-context collection.
///
/// Creating one through `Default` switches perf stats to
/// `PerfStatsLevel::EnableTime` and resets the current thread's perf context;
/// dropping it switches perf stats back to `PerfStatsLevel::Disable`.
pub struct RocksDBPerfContext;
468
469impl Default for RocksDBPerfContext {
470 fn default() -> Self {
471 set_perf_stats(PerfStatsLevel::EnableTime);
472 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
473 perf_context.borrow_mut().reset();
474 });
475 RocksDBPerfContext {}
476 }
477}
478
479impl Drop for RocksDBPerfContext {
480 fn drop(&mut self) {
481 set_perf_stats(PerfStatsLevel::Disable);
482 }
483}
484
485#[derive(Debug)]
486pub struct ReadPerfContextMetrics {
487 pub user_key_comparison_count: IntCounterVec,
488 pub block_cache_hit_count: IntCounterVec,
489 pub block_read_count: IntCounterVec,
490 pub block_read_byte: IntCounterVec,
491 pub block_read_nanos: IntCounterVec,
492 pub block_checksum_nanos: IntCounterVec,
493 pub block_decompress_nanos: IntCounterVec,
494 pub get_read_bytes: IntCounterVec,
495 pub multiget_read_bytes: IntCounterVec,
496 pub get_snapshot_nanos: IntCounterVec,
497 pub get_from_memtable_nanos: IntCounterVec,
498 pub get_from_memtable_count: IntCounterVec,
499 pub get_post_process_nanos: IntCounterVec,
500 pub get_from_output_files_nanos: IntCounterVec,
501 pub db_mutex_lock_nanos: IntCounterVec,
502 pub db_condition_wait_nanos: IntCounterVec,
503 pub merge_operator_nanos: IntCounterVec,
504 pub read_index_block_nanos: IntCounterVec,
505 pub read_filter_block_nanos: IntCounterVec,
506 pub new_table_block_iter_nanos: IntCounterVec,
507 pub block_seek_nanos: IntCounterVec,
508 pub find_table_nanos: IntCounterVec,
509 pub bloom_memtable_hit_count: IntCounterVec,
510 pub bloom_memtable_miss_count: IntCounterVec,
511 pub bloom_sst_hit_count: IntCounterVec,
512 pub bloom_sst_miss_count: IntCounterVec,
513 pub key_lock_wait_time: IntCounterVec,
514 pub key_lock_wait_count: IntCounterVec,
515 pub internal_delete_skipped_count: IntCounterVec,
516 pub internal_skipped_count: IntCounterVec,
517}
518
519impl ReadPerfContextMetrics {
520 pub(crate) fn new(registry: &Registry) -> Self {
521 ReadPerfContextMetrics {
522 user_key_comparison_count: register_int_counter_vec_with_registry!(
523 "user_key_comparison_count",
524 "Helps us figure out whether too many comparisons in binary search can be a problem,
525 especially when a more expensive comparator is used. Moreover, since number of comparisons
526 is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
527 levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
528 You may want to check whether flush/compaction can keep up with the write speed",
529 &["cf_name"],
530 registry,
531 )
532 .unwrap(),
533 block_cache_hit_count: register_int_counter_vec_with_registry!(
534 "block_cache_hit_count",
535 "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
536 times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
537 We can evaluate the block cache efficiency by looking at the two counters over time.",
538 &["cf_name"],
539 registry,
540 )
541 .unwrap(),
542 block_read_count: register_int_counter_vec_with_registry!(
543 "block_read_count",
544 "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
545 &["cf_name"],
546 registry,
547 )
548 .unwrap(),
549 block_read_byte: register_int_counter_vec_with_registry!(
550 "block_read_byte",
551 "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
552 large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
553 of a very large key or value",
554 &["cf_name"],
555 registry,
556 )
557 .unwrap(),
558 block_read_nanos: register_int_counter_vec_with_registry!(
559 "block_read_nanos",
560 "Total nanos spent on block reads",
561 &["cf_name"],
562 registry,
563 )
564 .unwrap(),
565 block_checksum_nanos: register_int_counter_vec_with_registry!(
566 "block_checksum_nanos",
567 "Total nanos spent on verifying block checksum",
568 &["cf_name"],
569 registry,
570 )
571 .unwrap(),
572 block_decompress_nanos: register_int_counter_vec_with_registry!(
573 "block_decompress_nanos",
574 "Total nanos spent on decompressing a block",
575 &["cf_name"],
576 registry,
577 )
578 .unwrap(),
579 get_read_bytes: register_int_counter_vec_with_registry!(
580 "get_read_bytes",
581 "Total bytes for values returned by Get",
582 &["cf_name"],
583 registry,
584 )
585 .unwrap(),
586 multiget_read_bytes: register_int_counter_vec_with_registry!(
587 "multiget_read_bytes",
588 "Total bytes for values returned by MultiGet.",
589 &["cf_name"],
590 registry,
591 )
592 .unwrap(),
593 get_snapshot_nanos: register_int_counter_vec_with_registry!(
594 "get_snapshot_nanos",
595 "Time spent in getting snapshot.",
596 &["cf_name"],
597 registry,
598 )
599 .unwrap(),
600 get_from_memtable_nanos: register_int_counter_vec_with_registry!(
601 "get_from_memtable_nanos",
602 "Time spent on reading data from memtable.",
603 &["cf_name"],
604 registry,
605 )
606 .unwrap(),
607 get_from_memtable_count: register_int_counter_vec_with_registry!(
608 "get_from_memtable_count",
609 "Number of memtables queried",
610 &["cf_name"],
611 registry,
612 )
613 .unwrap(),
614 get_post_process_nanos: register_int_counter_vec_with_registry!(
615 "get_post_process_nanos",
616 "Total nanos spent after Get() finds a key",
617 &["cf_name"],
618 registry,
619 )
620 .unwrap(),
621 get_from_output_files_nanos: register_int_counter_vec_with_registry!(
622 "get_from_output_files_nanos",
623 "Total nanos reading from output files",
624 &["cf_name"],
625 registry,
626 )
627 .unwrap(),
628 db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
629 "db_mutex_lock_nanos",
630 "Time spent on acquiring db mutex",
631 &["cf_name"],
632 registry,
633 )
634 .unwrap(),
635 db_condition_wait_nanos: register_int_counter_vec_with_registry!(
636 "db_condition_wait_nanos",
637 "Time spent waiting with a condition variable created with DB Mutex.",
638 &["cf_name"],
639 registry,
640 )
641 .unwrap(),
642 merge_operator_nanos: register_int_counter_vec_with_registry!(
643 "merge_operator_nanos",
644 "Time spent on merge operator.",
645 &["cf_name"],
646 registry,
647 )
648 .unwrap(),
649 read_index_block_nanos: register_int_counter_vec_with_registry!(
650 "read_index_block_nanos",
651 "Time spent on reading index block from block cache or SST file",
652 &["cf_name"],
653 registry,
654 )
655 .unwrap(),
656 read_filter_block_nanos: register_int_counter_vec_with_registry!(
657 "read_filter_block_nanos",
658 "Time spent on reading filter block from block cache or SST file",
659 &["cf_name"],
660 registry,
661 )
662 .unwrap(),
663 new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
664 "new_table_block_iter_nanos",
665 "Time spent on creating data block iterator",
666 &["cf_name"],
667 registry,
668 )
669 .unwrap(),
670 block_seek_nanos: register_int_counter_vec_with_registry!(
671 "block_seek_nanos",
672 "Time spent on seeking a key in data/index blocks",
673 &["cf_name"],
674 registry,
675 )
676 .unwrap(),
677 find_table_nanos: register_int_counter_vec_with_registry!(
678 "find_table_nanos",
679 "Time spent on finding or creating a table reader",
680 &["cf_name"],
681 registry,
682 )
683 .unwrap(),
684 bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
685 "bloom_memtable_hit_count",
686 "Total number of mem table bloom hits",
687 &["cf_name"],
688 registry,
689 )
690 .unwrap(),
691 bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
692 "bloom_memtable_miss_count",
693 "Total number of mem table bloom misses",
694 &["cf_name"],
695 registry,
696 )
697 .unwrap(),
698 bloom_sst_hit_count: register_int_counter_vec_with_registry!(
699 "bloom_sst_hit_count",
700 "Total number of SST table bloom hits",
701 &["cf_name"],
702 registry,
703 )
704 .unwrap(),
705 bloom_sst_miss_count: register_int_counter_vec_with_registry!(
706 "bloom_sst_miss_count",
707 "Total number of SST table bloom misses",
708 &["cf_name"],
709 registry,
710 )
711 .unwrap(),
712 key_lock_wait_time: register_int_counter_vec_with_registry!(
713 "key_lock_wait_time",
714 "Time spent waiting on key locks in transaction lock manager",
715 &["cf_name"],
716 registry,
717 )
718 .unwrap(),
719 key_lock_wait_count: register_int_counter_vec_with_registry!(
720 "key_lock_wait_count",
721 "Number of times acquiring a lock was blocked by another transaction",
722 &["cf_name"],
723 registry,
724 )
725 .unwrap(),
726 internal_delete_skipped_count: register_int_counter_vec_with_registry!(
727 "internal_delete_skipped_count",
728 "Total number of deleted keys skipped during iteration",
729 &["cf_name"],
730 registry,
731 )
732 .unwrap(),
733 internal_skipped_count: register_int_counter_vec_with_registry!(
734 "internal_skipped_count",
735 "Totall number of internal keys skipped during iteration",
736 &["cf_name"],
737 registry,
738 )
739 .unwrap(),
740 }
741 }
742
743 pub fn report_metrics(&self, cf_name: &str) {
744 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
745 set_perf_stats(PerfStatsLevel::Disable);
746 let perf_context = perf_context_cell.borrow();
747 self.user_key_comparison_count
748 .with_label_values(&[cf_name])
749 .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
750 self.block_cache_hit_count
751 .with_label_values(&[cf_name])
752 .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
753 self.block_read_count
754 .with_label_values(&[cf_name])
755 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
756 self.block_read_byte
757 .with_label_values(&[cf_name])
758 .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
759 self.block_read_nanos
760 .with_label_values(&[cf_name])
761 .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
762 self.block_read_count
763 .with_label_values(&[cf_name])
764 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
765 self.block_checksum_nanos
766 .with_label_values(&[cf_name])
767 .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
768 self.block_decompress_nanos
769 .with_label_values(&[cf_name])
770 .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
771 self.get_read_bytes
772 .with_label_values(&[cf_name])
773 .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
774 self.multiget_read_bytes
775 .with_label_values(&[cf_name])
776 .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
777 self.get_snapshot_nanos
778 .with_label_values(&[cf_name])
779 .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
780 self.get_from_memtable_nanos
781 .with_label_values(&[cf_name])
782 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
783 self.get_from_memtable_count
784 .with_label_values(&[cf_name])
785 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
786 self.get_post_process_nanos
787 .with_label_values(&[cf_name])
788 .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
789 self.get_from_output_files_nanos
790 .with_label_values(&[cf_name])
791 .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
792 self.db_mutex_lock_nanos
793 .with_label_values(&[cf_name])
794 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
795 self.db_condition_wait_nanos
796 .with_label_values(&[cf_name])
797 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
798 self.merge_operator_nanos
799 .with_label_values(&[cf_name])
800 .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
801 self.read_index_block_nanos
802 .with_label_values(&[cf_name])
803 .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
804 self.read_filter_block_nanos
805 .with_label_values(&[cf_name])
806 .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
807 self.new_table_block_iter_nanos
808 .with_label_values(&[cf_name])
809 .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
810 self.block_seek_nanos
811 .with_label_values(&[cf_name])
812 .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
813 self.find_table_nanos
814 .with_label_values(&[cf_name])
815 .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
816 self.bloom_memtable_hit_count
817 .with_label_values(&[cf_name])
818 .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
819 self.bloom_memtable_miss_count
820 .with_label_values(&[cf_name])
821 .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
822 self.bloom_sst_hit_count
823 .with_label_values(&[cf_name])
824 .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
825 self.bloom_sst_miss_count
826 .with_label_values(&[cf_name])
827 .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
828 self.key_lock_wait_time
829 .with_label_values(&[cf_name])
830 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
831 self.key_lock_wait_count
832 .with_label_values(&[cf_name])
833 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
834 self.internal_delete_skipped_count
835 .with_label_values(&[cf_name])
836 .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
837 self.internal_skipped_count
838 .with_label_values(&[cf_name])
839 .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
840 });
841 }
842}
843
844#[derive(Debug)]
845pub struct WritePerfContextMetrics {
846 pub write_wal_nanos: IntCounterVec,
847 pub write_memtable_nanos: IntCounterVec,
848 pub write_delay_nanos: IntCounterVec,
849 pub write_pre_and_post_process_nanos: IntCounterVec,
850 pub write_db_mutex_lock_nanos: IntCounterVec,
851 pub write_db_condition_wait_nanos: IntCounterVec,
852 pub write_key_lock_wait_nanos: IntCounterVec,
853 pub write_key_lock_wait_count: IntCounterVec,
854}
855
856impl WritePerfContextMetrics {
857 pub(crate) fn new(registry: &Registry) -> Self {
858 WritePerfContextMetrics {
859 write_wal_nanos: register_int_counter_vec_with_registry!(
860 "write_wal_nanos",
861 "Total nanos spent on writing to WAL",
862 &["cf_name"],
863 registry,
864 )
865 .unwrap(),
866 write_memtable_nanos: register_int_counter_vec_with_registry!(
867 "write_memtable_nanos",
868 "Total nanos spent on writing to memtable",
869 &["cf_name"],
870 registry,
871 )
872 .unwrap(),
873 write_delay_nanos: register_int_counter_vec_with_registry!(
874 "write_delay_nanos",
875 "Total nanos spent on delaying or throttling write",
876 &["cf_name"],
877 registry,
878 )
879 .unwrap(),
880 write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
881 "write_pre_and_post_process_nanos",
882 "Total nanos spent on writing a record, excluding the above four things",
883 &["cf_name"],
884 registry,
885 )
886 .unwrap(),
887 write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
888 "write_db_mutex_lock_nanos",
889 "Time spent on acquiring db mutex",
890 &["cf_name"],
891 registry,
892 )
893 .unwrap(),
894 write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
895 "write_db_condition_wait_nanos",
896 "Time spent waiting with a condition variable created with DB Mutex.",
897 &["cf_name"],
898 registry,
899 )
900 .unwrap(),
901 write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
902 "write_key_lock_wait_time",
903 "Time spent waiting on key locks in transaction lock manager",
904 &["cf_name"],
905 registry,
906 )
907 .unwrap(),
908 write_key_lock_wait_count: register_int_counter_vec_with_registry!(
909 "write_key_lock_wait_count",
910 "Number of times acquiring a lock was blocked by another transaction",
911 &["cf_name"],
912 registry,
913 )
914 .unwrap(),
915 }
916 }
917 pub fn report_metrics(&self, db_name: &str) {
918 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
919 set_perf_stats(PerfStatsLevel::Disable);
920 let perf_context = perf_context_cell.borrow();
921 self.write_wal_nanos
922 .with_label_values(&[db_name])
923 .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
924 self.write_memtable_nanos
925 .with_label_values(&[db_name])
926 .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
927 self.write_delay_nanos
928 .with_label_values(&[db_name])
929 .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
930 self.write_pre_and_post_process_nanos
931 .with_label_values(&[db_name])
932 .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
933 self.write_db_mutex_lock_nanos
934 .with_label_values(&[db_name])
935 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
936 self.write_db_condition_wait_nanos
937 .with_label_values(&[db_name])
938 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
939 self.write_key_lock_wait_nanos
940 .with_label_values(&[db_name])
941 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
942 self.write_key_lock_wait_count
943 .with_label_values(&[db_name])
944 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
945 });
946 }
947}
948
949#[derive(Debug)]
950pub struct DBMetrics {
951 pub op_metrics: OperationMetrics,
952 pub cf_metrics: ColumnFamilyMetrics,
953 pub read_perf_ctx_metrics: ReadPerfContextMetrics,
954 pub write_perf_ctx_metrics: WritePerfContextMetrics,
955}
956
957static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
958
959impl DBMetrics {
960 fn new(registry: &Registry) -> Self {
961 DBMetrics {
962 op_metrics: OperationMetrics::new(registry),
963 cf_metrics: ColumnFamilyMetrics::new(registry),
964 read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
965 write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
966 }
967 }
968 pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
969 let _ = ONCE
977 .set(Arc::new(DBMetrics::new(registry)))
978 .tap_err(|_| warn!("DBMetrics registry overwritten"));
980 ONCE.get().unwrap()
981 }
982 pub fn increment_num_active_dbs(&self, db_name: &str) {
983 self.op_metrics
984 .rocksdb_num_active_db_handles
985 .with_label_values(&[db_name])
986 .inc();
987 }
988 pub fn decrement_num_active_dbs(&self, db_name: &str) {
989 self.op_metrics
990 .rocksdb_num_active_db_handles
991 .with_label_values(&[db_name])
992 .dec();
993 }
994 pub fn get() -> &'static Arc<DBMetrics> {
995 ONCE.get()
996 .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
997 }
998}