1use std::{
6 cell::RefCell,
7 sync::{
8 Arc,
9 atomic::{AtomicU64, Ordering},
10 },
11 time::Duration,
12};
13
14use once_cell::sync::OnceCell;
15use prometheus::{
16 HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
17 register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
18};
19use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel, perf::set_perf_stats};
20use tap::TapFallible;
21use tracing::warn;
22
23thread_local! {
24 static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext> = RefCell::new(PerfContext::default());
25}
26
27const LATENCY_SEC_BUCKETS: &[f64] = &[
28 0.00001, 0.00005, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.001, 0.002, 0.003, 0.004, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1., 2.5, 5., 10.,
32];
33
34#[derive(Debug, Clone)]
35pub struct SamplingInterval {
38 pub once_every_duration: Duration,
40 pub after_num_ops: u64,
42 pub counter: Arc<AtomicU64>,
44}
45
46impl Default for SamplingInterval {
47 fn default() -> Self {
48 SamplingInterval::new(Duration::from_secs(60), 0)
50 }
51}
52
53impl SamplingInterval {
54 pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
55 let counter = Arc::new(AtomicU64::new(1));
56 if !once_every_duration.is_zero() {
57 let counter = counter.clone();
58 tokio::task::spawn(async move {
59 loop {
60 if counter.load(Ordering::SeqCst) > after_num_ops {
61 counter.store(0, Ordering::SeqCst);
62 }
63 tokio::time::sleep(once_every_duration).await;
64 }
65 });
66 }
67 SamplingInterval {
68 once_every_duration,
69 after_num_ops,
70 counter,
71 }
72 }
73 pub fn new_from_self(&self) -> SamplingInterval {
74 SamplingInterval::new(self.once_every_duration, self.after_num_ops)
75 }
76 pub fn sample(&self) -> bool {
77 if self.once_every_duration.is_zero() {
78 self.counter
79 .fetch_add(1, Ordering::Relaxed)
80 .is_multiple_of(self.after_num_ops + 1)
81 } else {
82 self.counter.fetch_add(1, Ordering::Relaxed) == 0
83 }
84 }
85}
86
87#[derive(Debug)]
88pub struct ColumnFamilyMetrics {
89 pub rocksdb_total_sst_files_size: IntGaugeVec,
90 pub rocksdb_total_blob_files_size: IntGaugeVec,
91 pub rocksdb_total_num_files: IntGaugeVec,
92 pub rocksdb_num_level0_files: IntGaugeVec,
93 pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
94 pub rocksdb_size_all_mem_tables: IntGaugeVec,
95 pub rocksdb_num_snapshots: IntGaugeVec,
96 pub rocksdb_oldest_snapshot_time: IntGaugeVec,
97 pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
98 pub rocksdb_is_write_stopped: IntGaugeVec,
99 pub rocksdb_block_cache_capacity: IntGaugeVec,
100 pub rocksdb_block_cache_usage: IntGaugeVec,
101 pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
102 pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
103 pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
104 pub rocksdb_mem_table_flush_pending: IntGaugeVec,
105 pub rocksdb_compaction_pending: IntGaugeVec,
106 pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
107 pub rocksdb_num_running_compactions: IntGaugeVec,
108 pub rocksdb_num_running_flushes: IntGaugeVec,
109 pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
110 pub rocksdb_background_errors: IntGaugeVec,
111 pub rocksdb_estimated_num_keys: IntGaugeVec,
112 pub rocksdb_base_level: IntGaugeVec,
113}
114
115impl ColumnFamilyMetrics {
116 pub(crate) fn new(registry: &Registry) -> Self {
117 ColumnFamilyMetrics {
118 rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
119 "rocksdb_total_sst_files_size",
120 "The storage size occupied by the sst files in the column family",
121 &["cf_name"],
122 registry,
123 )
124 .unwrap(),
125 rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
126 "rocksdb_total_blob_files_size",
127 "The storage size occupied by the blob files in the column family",
128 &["cf_name"],
129 registry,
130 )
131 .unwrap(),
132 rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
133 "rocksdb_total_num_files",
134 "Total number of files used in the column family",
135 &["cf_name"],
136 registry,
137 )
138 .unwrap(),
139 rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
140 "rocksdb_num_level0_files",
141 "Number of level 0 files in the column family",
142 &["cf_name"],
143 registry,
144 )
145 .unwrap(),
146 rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
147 "rocksdb_current_size_active_mem_tables",
148 "The current approximate size of active memtable (bytes).",
149 &["cf_name"],
150 registry,
151 )
152 .unwrap(),
153 rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
154 "rocksdb_size_all_mem_tables",
155 "The memory size occupied by the column family's in-memory buffer",
156 &["cf_name"],
157 registry,
158 )
159 .unwrap(),
160 rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
161 "rocksdb_num_snapshots",
162 "Number of snapshots held for the column family",
163 &["cf_name"],
164 registry,
165 )
166 .unwrap(),
167 rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
168 "rocksdb_oldest_snapshot_time",
169 "Unit timestamp of the oldest unreleased snapshot",
170 &["cf_name"],
171 registry,
172 )
173 .unwrap(),
174 rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
175 "rocksdb_actual_delayed_write_rate",
176 "The current actual delayed write rate. 0 means no delay",
177 &["cf_name"],
178 registry,
179 )
180 .unwrap(),
181 rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
182 "rocksdb_is_write_stopped",
183 "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
184 &["cf_name"],
185 registry,
186 )
187 .unwrap(),
188 rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
189 "rocksdb_block_cache_capacity",
190 "The block cache capacity of the column family.",
191 &["cf_name"],
192 registry,
193 )
194 .unwrap(),
195 rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
196 "rocksdb_block_cache_usage",
197 "The memory size used by the column family in the block cache.",
198 &["cf_name"],
199 registry,
200 )
201 .unwrap(),
202 rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
203 "rocksdb_block_cache_pinned_usage",
204 "The memory size used by the column family in the block cache where entries are pinned",
205 &["cf_name"],
206 registry,
207 )
208 .unwrap(),
209 rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
210 "rocksdb_estimate_table_readers_mem",
211 "The estimated memory size used for reading SST tables in this column
212 family such as filters and index blocks. Note that this number does not
213 include the memory used in block cache.",
214 &["cf_name"],
215 registry,
216 )
217 .unwrap(),
218 rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
219 "rocksdb_num_immutable_mem_tables",
220 "The number of immutable memtables that have not yet been flushed.",
221 &["cf_name"],
222 registry,
223 )
224 .unwrap(),
225 rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
226 "rocksdb_mem_table_flush_pending",
227 "A 1 or 0 flag indicating whether a memtable flush is pending.
228 If this number is 1, it means a memtable is waiting for being flushed,
229 but there might be too many L0 files that prevents it from being flushed.",
230 &["cf_name"],
231 registry,
232 )
233 .unwrap(),
234 rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
235 "rocksdb_compaction_pending",
236 "A 1 or 0 flag indicating whether a compaction job is pending.
237 If this number is 1, it means some part of the column family requires
238 compaction in order to maintain shape of LSM tree, but the compaction
239 is pending because the desired compaction job is either waiting for
240 other dependent compactions to be finished or waiting for an available
241 compaction thread.",
242 &["cf_name"],
243 registry,
244 )
245 .unwrap(),
246 rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
247 "rocksdb_estimate_pending_compaction_bytes",
248 "Estimated total number of bytes compaction needs to rewrite to get all levels down
249 to under target size. Not valid for other compactions than level-based.",
250 &["cf_name"],
251 registry,
252 )
253 .unwrap(),
254 rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
255 "rocksdb_num_running_compactions",
256 "The number of compactions that are currently running for the column family.",
257 &["cf_name"],
258 registry,
259 )
260 .unwrap(),
261 rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
262 "rocksdb_num_running_flushes",
263 "The number of flushes that are currently running for the column family.",
264 &["cf_name"],
265 registry,
266 )
267 .unwrap(),
268 rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
269 "rocksdb_estimate_oldest_key_time",
270 "Estimation of the oldest key timestamp in the DB. Only available
271 for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
272 &["cf_name"],
273 registry,
274 )
275 .unwrap(),
276 rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
277 "rocksdb_estimated_num_keys",
278 "The estimated number of keys in the table",
279 &["cf_name"],
280 registry,
281 )
282 .unwrap(),
283 rocksdb_background_errors: register_int_gauge_vec_with_registry!(
284 "rocksdb_background_errors",
285 "The accumulated number of RocksDB background errors.",
286 &["cf_name"],
287 registry,
288 )
289 .unwrap(),
290 rocksdb_base_level: register_int_gauge_vec_with_registry!(
291 "rocksdb_base_level",
292 "The number of level to which L0 data will be compacted.",
293 &["cf_name"],
294 registry,
295 )
296 .unwrap(),
297 }
298 }
299}
300
301#[derive(Debug)]
302pub struct OperationMetrics {
303 pub rocksdb_iter_latency_seconds: HistogramVec,
304 pub rocksdb_iter_bytes: HistogramVec,
305 pub rocksdb_iter_keys: HistogramVec,
306 pub rocksdb_get_latency_seconds: HistogramVec,
307 pub rocksdb_get_bytes: HistogramVec,
308 pub rocksdb_multiget_latency_seconds: HistogramVec,
309 pub rocksdb_multiget_bytes: HistogramVec,
310 pub rocksdb_put_latency_seconds: HistogramVec,
311 pub rocksdb_put_bytes: HistogramVec,
312 pub rocksdb_batch_put_bytes: HistogramVec,
313 pub rocksdb_delete_latency_seconds: HistogramVec,
314 pub rocksdb_deletes: IntCounterVec,
315 pub rocksdb_batch_commit_latency_seconds: HistogramVec,
316 pub rocksdb_batch_commit_bytes: HistogramVec,
317 pub rocksdb_num_active_db_handles: IntGaugeVec,
318 pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
319 pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
320 pub rocksdb_very_slow_puts_count: IntCounterVec,
321 pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
322}
323
324impl OperationMetrics {
325 pub(crate) fn new(registry: &Registry) -> Self {
326 OperationMetrics {
327 rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
328 "rocksdb_iter_latency_seconds",
329 "Rocksdb iter latency in seconds",
330 &["cf_name"],
331 LATENCY_SEC_BUCKETS.to_vec(),
332 registry,
333 )
334 .unwrap(),
335 rocksdb_iter_bytes: register_histogram_vec_with_registry!(
336 "rocksdb_iter_bytes",
337 "Rocksdb iter size in bytes",
338 &["cf_name"],
339 prometheus::exponential_buckets(1.0, 4.0, 15)
340 .unwrap()
341 .to_vec(),
342 registry,
343 )
344 .unwrap(),
345 rocksdb_iter_keys: register_histogram_vec_with_registry!(
346 "rocksdb_iter_keys",
347 "Rocksdb iter num keys",
348 &["cf_name"],
349 registry,
350 )
351 .unwrap(),
352 rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
353 "rocksdb_get_latency_seconds",
354 "Rocksdb get latency in seconds",
355 &["cf_name"],
356 LATENCY_SEC_BUCKETS.to_vec(),
357 registry,
358 )
359 .unwrap(),
360 rocksdb_get_bytes: register_histogram_vec_with_registry!(
361 "rocksdb_get_bytes",
362 "Rocksdb get call returned data size in bytes",
363 &["cf_name"],
364 prometheus::exponential_buckets(1.0, 4.0, 15)
365 .unwrap()
366 .to_vec(),
367 registry
368 )
369 .unwrap(),
370 rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
371 "rocksdb_multiget_latency_seconds",
372 "Rocksdb multiget latency in seconds",
373 &["cf_name"],
374 LATENCY_SEC_BUCKETS.to_vec(),
375 registry,
376 )
377 .unwrap(),
378 rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
379 "rocksdb_multiget_bytes",
380 "Rocksdb multiget call returned data size in bytes",
381 &["cf_name"],
382 prometheus::exponential_buckets(1.0, 4.0, 15)
383 .unwrap()
384 .to_vec(),
385 registry,
386 )
387 .unwrap(),
388 rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
389 "rocksdb_put_latency_seconds",
390 "Rocksdb put latency in seconds",
391 &["cf_name"],
392 LATENCY_SEC_BUCKETS.to_vec(),
393 registry,
394 )
395 .unwrap(),
396 rocksdb_put_bytes: register_histogram_vec_with_registry!(
397 "rocksdb_put_bytes",
398 "Rocksdb put call puts data size in bytes",
399 &["cf_name"],
400 prometheus::exponential_buckets(1.0, 4.0, 15)
401 .unwrap()
402 .to_vec(),
403 registry,
404 )
405 .unwrap(),
406 rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
407 "rocksdb_batch_put_bytes",
408 "Rocksdb batch put call puts data size in bytes",
409 &["cf_name"],
410 prometheus::exponential_buckets(1.0, 4.0, 15)
411 .unwrap()
412 .to_vec(),
413 registry,
414 )
415 .unwrap(),
416 rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
417 "rocksdb_delete_latency_seconds",
418 "Rocksdb delete latency in seconds",
419 &["cf_name"],
420 LATENCY_SEC_BUCKETS.to_vec(),
421 registry,
422 )
423 .unwrap(),
424 rocksdb_deletes: register_int_counter_vec_with_registry!(
425 "rocksdb_deletes",
426 "Rocksdb delete calls",
427 &["cf_name"],
428 registry
429 )
430 .unwrap(),
431 rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
432 "rocksdb_write_batch_commit_latency_seconds",
433 "Rocksdb schema batch commit latency in seconds",
434 &["db_name"],
435 LATENCY_SEC_BUCKETS.to_vec(),
436 registry,
437 )
438 .unwrap(),
439 rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
440 "rocksdb_batch_commit_bytes",
441 "Rocksdb schema batch commit size in bytes",
442 &["db_name"],
443 prometheus::exponential_buckets(1.0, 4.0, 15)
444 .unwrap()
445 .to_vec(),
446 registry,
447 )
448 .unwrap(),
449 rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
450 "rocksdb_num_active_db_handles",
451 "Number of active db handles",
452 &["db_name"],
453 registry,
454 )
455 .unwrap(),
456 rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
457 "rocksdb_num_very_slow_batch_writes",
458 "Number of batch writes that took more than 1 second",
459 &["db_name"],
460 registry,
461 )
462 .unwrap(),
463 rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
464 "rocksdb_very_slow_batch_writes_duration",
465 "Total duration of batch writes that took more than 1 second",
466 &["db_name"],
467 registry,
468 )
469 .unwrap(),
470 rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
471 "rocksdb_num_very_slow_puts",
472 "Number of puts that took more than 1 second",
473 &["cf_name"],
474 registry,
475 )
476 .unwrap(),
477 rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
478 "rocksdb_very_slow_puts_duration",
479 "Total duration of puts that took more than 1 second",
480 &["cf_name"],
481 registry,
482 )
483 .unwrap(),
484 }
485 }
486}
487
488pub struct RocksDBPerfContext;
489
490impl Default for RocksDBPerfContext {
491 fn default() -> Self {
492 set_perf_stats(PerfStatsLevel::EnableTime);
493 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
494 perf_context.borrow_mut().reset();
495 });
496 RocksDBPerfContext {}
497 }
498}
499
500impl Drop for RocksDBPerfContext {
501 fn drop(&mut self) {
502 set_perf_stats(PerfStatsLevel::Disable);
503 }
504}
505
506#[derive(Debug)]
507pub struct ReadPerfContextMetrics {
508 pub user_key_comparison_count: IntCounterVec,
509 pub block_cache_hit_count: IntCounterVec,
510 pub block_read_count: IntCounterVec,
511 pub block_read_byte: IntCounterVec,
512 pub block_read_nanos: IntCounterVec,
513 pub block_checksum_nanos: IntCounterVec,
514 pub block_decompress_nanos: IntCounterVec,
515 pub get_read_bytes: IntCounterVec,
516 pub multiget_read_bytes: IntCounterVec,
517 pub get_snapshot_nanos: IntCounterVec,
518 pub get_from_memtable_nanos: IntCounterVec,
519 pub get_from_memtable_count: IntCounterVec,
520 pub get_post_process_nanos: IntCounterVec,
521 pub get_from_output_files_nanos: IntCounterVec,
522 pub db_mutex_lock_nanos: IntCounterVec,
523 pub db_condition_wait_nanos: IntCounterVec,
524 pub merge_operator_nanos: IntCounterVec,
525 pub read_index_block_nanos: IntCounterVec,
526 pub read_filter_block_nanos: IntCounterVec,
527 pub new_table_block_iter_nanos: IntCounterVec,
528 pub block_seek_nanos: IntCounterVec,
529 pub find_table_nanos: IntCounterVec,
530 pub bloom_memtable_hit_count: IntCounterVec,
531 pub bloom_memtable_miss_count: IntCounterVec,
532 pub bloom_sst_hit_count: IntCounterVec,
533 pub bloom_sst_miss_count: IntCounterVec,
534 pub key_lock_wait_time: IntCounterVec,
535 pub key_lock_wait_count: IntCounterVec,
536 pub internal_delete_skipped_count: IntCounterVec,
537 pub internal_skipped_count: IntCounterVec,
538}
539
540impl ReadPerfContextMetrics {
541 pub(crate) fn new(registry: &Registry) -> Self {
542 ReadPerfContextMetrics {
543 user_key_comparison_count: register_int_counter_vec_with_registry!(
544 "user_key_comparison_count",
545 "Helps us figure out whether too many comparisons in binary search can be a problem,
546 especially when a more expensive comparator is used. Moreover, since number of comparisons
547 is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
548 levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
549 You may want to check whether flush/compaction can keep up with the write speed",
550 &["cf_name"],
551 registry,
552 )
553 .unwrap(),
554 block_cache_hit_count: register_int_counter_vec_with_registry!(
555 "block_cache_hit_count",
556 "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
557 times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
558 We can evaluate the block cache efficiency by looking at the two counters over time.",
559 &["cf_name"],
560 registry,
561 )
562 .unwrap(),
563 block_read_count: register_int_counter_vec_with_registry!(
564 "block_read_count",
565 "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
566 &["cf_name"],
567 registry,
568 )
569 .unwrap(),
570 block_read_byte: register_int_counter_vec_with_registry!(
571 "block_read_byte",
572 "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
573 large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
574 of a very large key or value",
575 &["cf_name"],
576 registry,
577 )
578 .unwrap(),
579 block_read_nanos: register_int_counter_vec_with_registry!(
580 "block_read_nanos",
581 "Total nanos spent on block reads",
582 &["cf_name"],
583 registry,
584 )
585 .unwrap(),
586 block_checksum_nanos: register_int_counter_vec_with_registry!(
587 "block_checksum_nanos",
588 "Total nanos spent on verifying block checksum",
589 &["cf_name"],
590 registry,
591 )
592 .unwrap(),
593 block_decompress_nanos: register_int_counter_vec_with_registry!(
594 "block_decompress_nanos",
595 "Total nanos spent on decompressing a block",
596 &["cf_name"],
597 registry,
598 )
599 .unwrap(),
600 get_read_bytes: register_int_counter_vec_with_registry!(
601 "get_read_bytes",
602 "Total bytes for values returned by Get",
603 &["cf_name"],
604 registry,
605 )
606 .unwrap(),
607 multiget_read_bytes: register_int_counter_vec_with_registry!(
608 "multiget_read_bytes",
609 "Total bytes for values returned by MultiGet.",
610 &["cf_name"],
611 registry,
612 )
613 .unwrap(),
614 get_snapshot_nanos: register_int_counter_vec_with_registry!(
615 "get_snapshot_nanos",
616 "Time spent in getting snapshot.",
617 &["cf_name"],
618 registry,
619 )
620 .unwrap(),
621 get_from_memtable_nanos: register_int_counter_vec_with_registry!(
622 "get_from_memtable_nanos",
623 "Time spent on reading data from memtable.",
624 &["cf_name"],
625 registry,
626 )
627 .unwrap(),
628 get_from_memtable_count: register_int_counter_vec_with_registry!(
629 "get_from_memtable_count",
630 "Number of memtables queried",
631 &["cf_name"],
632 registry,
633 )
634 .unwrap(),
635 get_post_process_nanos: register_int_counter_vec_with_registry!(
636 "get_post_process_nanos",
637 "Total nanos spent after Get() finds a key",
638 &["cf_name"],
639 registry,
640 )
641 .unwrap(),
642 get_from_output_files_nanos: register_int_counter_vec_with_registry!(
643 "get_from_output_files_nanos",
644 "Total nanos reading from output files",
645 &["cf_name"],
646 registry,
647 )
648 .unwrap(),
649 db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
650 "db_mutex_lock_nanos",
651 "Time spent on acquiring db mutex",
652 &["cf_name"],
653 registry,
654 )
655 .unwrap(),
656 db_condition_wait_nanos: register_int_counter_vec_with_registry!(
657 "db_condition_wait_nanos",
658 "Time spent waiting with a condition variable created with DB Mutex.",
659 &["cf_name"],
660 registry,
661 )
662 .unwrap(),
663 merge_operator_nanos: register_int_counter_vec_with_registry!(
664 "merge_operator_nanos",
665 "Time spent on merge operator.",
666 &["cf_name"],
667 registry,
668 )
669 .unwrap(),
670 read_index_block_nanos: register_int_counter_vec_with_registry!(
671 "read_index_block_nanos",
672 "Time spent on reading index block from block cache or SST file",
673 &["cf_name"],
674 registry,
675 )
676 .unwrap(),
677 read_filter_block_nanos: register_int_counter_vec_with_registry!(
678 "read_filter_block_nanos",
679 "Time spent on reading filter block from block cache or SST file",
680 &["cf_name"],
681 registry,
682 )
683 .unwrap(),
684 new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
685 "new_table_block_iter_nanos",
686 "Time spent on creating data block iterator",
687 &["cf_name"],
688 registry,
689 )
690 .unwrap(),
691 block_seek_nanos: register_int_counter_vec_with_registry!(
692 "block_seek_nanos",
693 "Time spent on seeking a key in data/index blocks",
694 &["cf_name"],
695 registry,
696 )
697 .unwrap(),
698 find_table_nanos: register_int_counter_vec_with_registry!(
699 "find_table_nanos",
700 "Time spent on finding or creating a table reader",
701 &["cf_name"],
702 registry,
703 )
704 .unwrap(),
705 bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
706 "bloom_memtable_hit_count",
707 "Total number of mem table bloom hits",
708 &["cf_name"],
709 registry,
710 )
711 .unwrap(),
712 bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
713 "bloom_memtable_miss_count",
714 "Total number of mem table bloom misses",
715 &["cf_name"],
716 registry,
717 )
718 .unwrap(),
719 bloom_sst_hit_count: register_int_counter_vec_with_registry!(
720 "bloom_sst_hit_count",
721 "Total number of SST table bloom hits",
722 &["cf_name"],
723 registry,
724 )
725 .unwrap(),
726 bloom_sst_miss_count: register_int_counter_vec_with_registry!(
727 "bloom_sst_miss_count",
728 "Total number of SST table bloom misses",
729 &["cf_name"],
730 registry,
731 )
732 .unwrap(),
733 key_lock_wait_time: register_int_counter_vec_with_registry!(
734 "key_lock_wait_time",
735 "Time spent waiting on key locks in transaction lock manager",
736 &["cf_name"],
737 registry,
738 )
739 .unwrap(),
740 key_lock_wait_count: register_int_counter_vec_with_registry!(
741 "key_lock_wait_count",
742 "Number of times acquiring a lock was blocked by another transaction",
743 &["cf_name"],
744 registry,
745 )
746 .unwrap(),
747 internal_delete_skipped_count: register_int_counter_vec_with_registry!(
748 "internal_delete_skipped_count",
749 "Total number of deleted keys skipped during iteration",
750 &["cf_name"],
751 registry,
752 )
753 .unwrap(),
754 internal_skipped_count: register_int_counter_vec_with_registry!(
755 "internal_skipped_count",
756 "Totall number of internal keys skipped during iteration",
757 &["cf_name"],
758 registry,
759 )
760 .unwrap(),
761 }
762 }
763
764 pub fn report_metrics(&self, cf_name: &str) {
765 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
766 set_perf_stats(PerfStatsLevel::Disable);
767 let perf_context = perf_context_cell.borrow();
768 self.user_key_comparison_count
769 .with_label_values(&[cf_name])
770 .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
771 self.block_cache_hit_count
772 .with_label_values(&[cf_name])
773 .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
774 self.block_read_count
775 .with_label_values(&[cf_name])
776 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
777 self.block_read_byte
778 .with_label_values(&[cf_name])
779 .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
780 self.block_read_nanos
781 .with_label_values(&[cf_name])
782 .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
783 self.block_read_count
784 .with_label_values(&[cf_name])
785 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
786 self.block_checksum_nanos
787 .with_label_values(&[cf_name])
788 .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
789 self.block_decompress_nanos
790 .with_label_values(&[cf_name])
791 .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
792 self.get_read_bytes
793 .with_label_values(&[cf_name])
794 .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
795 self.multiget_read_bytes
796 .with_label_values(&[cf_name])
797 .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
798 self.get_snapshot_nanos
799 .with_label_values(&[cf_name])
800 .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
801 self.get_from_memtable_nanos
802 .with_label_values(&[cf_name])
803 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
804 self.get_from_memtable_count
805 .with_label_values(&[cf_name])
806 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
807 self.get_post_process_nanos
808 .with_label_values(&[cf_name])
809 .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
810 self.get_from_output_files_nanos
811 .with_label_values(&[cf_name])
812 .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
813 self.db_mutex_lock_nanos
814 .with_label_values(&[cf_name])
815 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
816 self.db_condition_wait_nanos
817 .with_label_values(&[cf_name])
818 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
819 self.merge_operator_nanos
820 .with_label_values(&[cf_name])
821 .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
822 self.read_index_block_nanos
823 .with_label_values(&[cf_name])
824 .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
825 self.read_filter_block_nanos
826 .with_label_values(&[cf_name])
827 .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
828 self.new_table_block_iter_nanos
829 .with_label_values(&[cf_name])
830 .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
831 self.block_seek_nanos
832 .with_label_values(&[cf_name])
833 .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
834 self.find_table_nanos
835 .with_label_values(&[cf_name])
836 .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
837 self.bloom_memtable_hit_count
838 .with_label_values(&[cf_name])
839 .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
840 self.bloom_memtable_miss_count
841 .with_label_values(&[cf_name])
842 .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
843 self.bloom_sst_hit_count
844 .with_label_values(&[cf_name])
845 .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
846 self.bloom_sst_miss_count
847 .with_label_values(&[cf_name])
848 .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
849 self.key_lock_wait_time
850 .with_label_values(&[cf_name])
851 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
852 self.key_lock_wait_count
853 .with_label_values(&[cf_name])
854 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
855 self.internal_delete_skipped_count
856 .with_label_values(&[cf_name])
857 .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
858 self.internal_skipped_count
859 .with_label_values(&[cf_name])
860 .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
861 });
862 }
863}
864
865#[derive(Debug)]
866pub struct WritePerfContextMetrics {
867 pub write_wal_nanos: IntCounterVec,
868 pub write_memtable_nanos: IntCounterVec,
869 pub write_delay_nanos: IntCounterVec,
870 pub write_pre_and_post_process_nanos: IntCounterVec,
871 pub write_db_mutex_lock_nanos: IntCounterVec,
872 pub write_db_condition_wait_nanos: IntCounterVec,
873 pub write_key_lock_wait_nanos: IntCounterVec,
874 pub write_key_lock_wait_count: IntCounterVec,
875}
876
877impl WritePerfContextMetrics {
878 pub(crate) fn new(registry: &Registry) -> Self {
879 WritePerfContextMetrics {
880 write_wal_nanos: register_int_counter_vec_with_registry!(
881 "write_wal_nanos",
882 "Total nanos spent on writing to WAL",
883 &["cf_name"],
884 registry,
885 )
886 .unwrap(),
887 write_memtable_nanos: register_int_counter_vec_with_registry!(
888 "write_memtable_nanos",
889 "Total nanos spent on writing to memtable",
890 &["cf_name"],
891 registry,
892 )
893 .unwrap(),
894 write_delay_nanos: register_int_counter_vec_with_registry!(
895 "write_delay_nanos",
896 "Total nanos spent on delaying or throttling write",
897 &["cf_name"],
898 registry,
899 )
900 .unwrap(),
901 write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
902 "write_pre_and_post_process_nanos",
903 "Total nanos spent on writing a record, excluding the above four things",
904 &["cf_name"],
905 registry,
906 )
907 .unwrap(),
908 write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
909 "write_db_mutex_lock_nanos",
910 "Time spent on acquiring db mutex",
911 &["cf_name"],
912 registry,
913 )
914 .unwrap(),
915 write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
916 "write_db_condition_wait_nanos",
917 "Time spent waiting with a condition variable created with DB Mutex.",
918 &["cf_name"],
919 registry,
920 )
921 .unwrap(),
922 write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
923 "write_key_lock_wait_time",
924 "Time spent waiting on key locks in transaction lock manager",
925 &["cf_name"],
926 registry,
927 )
928 .unwrap(),
929 write_key_lock_wait_count: register_int_counter_vec_with_registry!(
930 "write_key_lock_wait_count",
931 "Number of times acquiring a lock was blocked by another transaction",
932 &["cf_name"],
933 registry,
934 )
935 .unwrap(),
936 }
937 }
938 pub fn report_metrics(&self, db_name: &str) {
939 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
940 set_perf_stats(PerfStatsLevel::Disable);
941 let perf_context = perf_context_cell.borrow();
942 self.write_wal_nanos
943 .with_label_values(&[db_name])
944 .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
945 self.write_memtable_nanos
946 .with_label_values(&[db_name])
947 .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
948 self.write_delay_nanos
949 .with_label_values(&[db_name])
950 .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
951 self.write_pre_and_post_process_nanos
952 .with_label_values(&[db_name])
953 .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
954 self.write_db_mutex_lock_nanos
955 .with_label_values(&[db_name])
956 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
957 self.write_db_condition_wait_nanos
958 .with_label_values(&[db_name])
959 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
960 self.write_key_lock_wait_nanos
961 .with_label_values(&[db_name])
962 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
963 self.write_key_lock_wait_count
964 .with_label_values(&[db_name])
965 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
966 });
967 }
968}
969
970#[derive(Debug)]
971pub struct DBMetrics {
972 pub op_metrics: OperationMetrics,
973 pub cf_metrics: ColumnFamilyMetrics,
974 pub read_perf_ctx_metrics: ReadPerfContextMetrics,
975 pub write_perf_ctx_metrics: WritePerfContextMetrics,
976}
977
978static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
979
980impl DBMetrics {
981 fn new(registry: &Registry) -> Self {
982 DBMetrics {
983 op_metrics: OperationMetrics::new(registry),
984 cf_metrics: ColumnFamilyMetrics::new(registry),
985 read_perf_ctx_metrics: ReadPerfContextMetrics::new(registry),
986 write_perf_ctx_metrics: WritePerfContextMetrics::new(registry),
987 }
988 }
989 pub fn init(registry: &Registry) -> &'static Arc<DBMetrics> {
990 let _ = ONCE
998 .set(Arc::new(DBMetrics::new(registry)))
999 .tap_err(|_| warn!("DBMetrics registry overwritten"));
1001 ONCE.get().unwrap()
1002 }
1003 pub fn increment_num_active_dbs(&self, db_name: &str) {
1004 self.op_metrics
1005 .rocksdb_num_active_db_handles
1006 .with_label_values(&[db_name])
1007 .inc();
1008 }
1009 pub fn decrement_num_active_dbs(&self, db_name: &str) {
1010 self.op_metrics
1011 .rocksdb_num_active_db_handles
1012 .with_label_values(&[db_name])
1013 .dec();
1014 }
1015 pub fn get() -> &'static Arc<DBMetrics> {
1016 ONCE.get()
1017 .unwrap_or_else(|| DBMetrics::init(prometheus::default_registry()))
1018 }
1019}