iota_core/epoch/
epoch_metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::sync::Arc;
6
7use prometheus::{IntGauge, Registry, register_int_gauge_with_registry};
8
9pub struct EpochMetrics {
10    /// The current epoch ID. This is updated only when the AuthorityState
11    /// finishes reconfiguration.
12    pub current_epoch: IntGauge,
13
14    /// Current voting right of the validator in the protocol. Updated at the
15    /// start of epochs.
16    pub current_voting_right: IntGauge,
17
18    /// Total duration of the epoch. This is measured from when the current
19    /// epoch store is opened, until the current epoch store is replaced
20    /// with the next epoch store.
21    pub epoch_total_duration: IntGauge,
22
23    /// Number of checkpoints in the epoch.
24    pub epoch_checkpoint_count: IntGauge,
25
26    /// Number of transactions in the epoch.
27    pub epoch_transaction_count: IntGauge,
28
29    /// Total amount of gas rewards (i.e. computation gas cost) in the epoch.
30    pub epoch_total_gas_reward: IntGauge,
31
32    // An active validator reconfigures through the following steps:
33    // 1. Halt validator (a.k.a. close epoch) and stop accepting user transaction certs.
34    // 2. Finishes processing all pending certificates and then send EndOfPublish message.
35    // 3. Stop accepting messages from consensus after seeing 2f+1 EndOfPublish messages.
36    // 4. Creating the last checkpoint of the epoch by augmenting it with AdvanceEpoch transaction.
37    // 5. CheckpointExecutor finishes executing the last checkpoint, and triggers reconfiguration.
38    // 6. During reconfiguration, we tear down consensus, reconfigure state (at which point we
39    //    opens up user certs), and start consensus again.
40    // 7. After reconfiguration, and eventually consensus starts successfully, at some point the
41    //    first checkpoint of the new epoch will be created.
42    // We introduce various metrics to cover the latency of above steps.
43    /// The duration from when the epoch is closed (i.e. validator halted) to
44    /// when all pending certificates are processed (i.e. ready to send
45    /// EndOfPublish message). This is the duration of (1) through (2)
46    /// above.
47    pub epoch_pending_certs_processed_time_since_epoch_close_ms: IntGauge,
48
49    /// The interval from when the epoch is closed to when we receive 2f+1
50    /// EndOfPublish messages. This is the duration of (1) through (3)
51    /// above.
52    pub epoch_end_of_publish_quorum_time_since_epoch_close_ms: IntGauge,
53
54    /// The interval from when the epoch is closed to when we created the last
55    /// checkpoint of the epoch.
56    /// This is the duration of (1) through (4) above.
57    pub epoch_last_checkpoint_created_time_since_epoch_close_ms: IntGauge,
58
59    /// The interval from when the epoch is closed to when we finished executing
60    /// the last transaction of the checkpoint (and hence triggering
61    /// reconfiguration process). This is the duration of (1) through (5)
62    /// above.
63    pub epoch_reconfig_start_time_since_epoch_close_ms: IntGauge,
64
65    /// The total duration when this validator is halted, and hence does not
66    /// accept certs from users. This is the duration of (1) through (6)
67    /// above, and is the most important latency metric reflecting
68    /// reconfiguration delay for each validator.
69    pub epoch_validator_halt_duration_ms: IntGauge,
70
71    /// The interval from when the epoch begins (i.e. right after state
72    /// reconfigure, when the new epoch_store is created), to when the first
73    /// checkpoint of the epoch is ready for creation locally. This is (7)
74    /// above, and is a good proxy to how long it takes for the validator to
75    /// become useful in the network after reconfiguration.
76    // TODO: This needs to be reported properly.
77    pub epoch_first_checkpoint_created_time_since_epoch_begin_ms: IntGauge,
78
79    /// Whether we are running in safe mode where reward distribution and
80    /// tokenomics are disabled.
81    pub is_safe_mode: IntGauge,
82
83    /// When building the last checkpoint of the epoch, we execute advance epoch
84    /// transaction once without committing results to the store. It's
85    /// useful to know whether this execution leads to safe_mode, since in
86    /// theory the result could be different from checkpoint executor.
87    pub checkpoint_builder_advance_epoch_is_safe_mode: IntGauge,
88
89    /// Buffer stake current in effect for this epoch
90    pub effective_buffer_stake: IntGauge,
91
92    /// Set to 1 if the random beacon DKG protocol failed for the most recent
93    /// epoch.
94    pub epoch_random_beacon_dkg_failed: IntGauge,
95
96    /// The number of shares held by this node after the random beacon DKG
97    /// protocol completed.
98    pub epoch_random_beacon_dkg_num_shares: IntGauge,
99
100    /// The amount of time taken from epoch start to completion of random beacon
101    /// DKG protocol, for the most recent epoch.
102    pub epoch_random_beacon_dkg_epoch_start_completion_time_ms: IntGauge,
103
104    /// The amount of time taken to complete random beacon DKG protocol from the
105    /// time it was started (which may be a bit after the epcoh began), for
106    /// the most recent epoch.
107    pub epoch_random_beacon_dkg_completion_time_ms: IntGauge,
108
109    /// The amount of time taken to start first phase of the random beacon DKG
110    /// protocol, at which point the node has submitted a DKG Message, for
111    /// the most recent epoch.
112    pub epoch_random_beacon_dkg_message_time_ms: IntGauge,
113
114    /// The amount of time taken to complete first phase of the random beacon
115    /// DKG protocol, at which point the node has submitted a DKG
116    /// Confirmation, for the most recent epoch.
117    pub epoch_random_beacon_dkg_confirmation_time_ms: IntGauge,
118
119    /// The number of consensus output items in the quarantine.
120    pub consensus_quarantine_queue_size: IntGauge,
121
122    /// The number of shared object assignments in the quarantine.
123    pub shared_object_assignments_size: IntGauge,
124}
125
126impl EpochMetrics {
127    pub fn new(registry: &Registry) -> Arc<Self> {
128        let this = Self {
129            current_epoch: register_int_gauge_with_registry!(
130                "current_epoch",
131                "Current epoch ID",
132                registry
133            )
134            .unwrap(),
135            current_voting_right: register_int_gauge_with_registry!(
136                "current_voting_right",
137                "Current voting right of the validator",
138                registry
139            )
140            .unwrap(),
141            epoch_checkpoint_count: register_int_gauge_with_registry!(
142                "epoch_checkpoint_count",
143                "Number of checkpoints in the epoch",
144                registry
145            ).unwrap(),
146            epoch_total_duration: register_int_gauge_with_registry!(
147                "epoch_total_duration",
148                "Total duration of the epoch",
149                registry
150            ).unwrap(),
151            epoch_transaction_count: register_int_gauge_with_registry!(
152                "epoch_transaction_count",
153                "Number of transactions in the epoch",
154                registry
155            ).unwrap(),
156            epoch_total_gas_reward: register_int_gauge_with_registry!(
157                "epoch_total_gas_reward",
158                "Total amount of gas rewards (i.e. computation gas cost) in the epoch",
159                registry
160            ).unwrap(),
161            epoch_pending_certs_processed_time_since_epoch_close_ms: register_int_gauge_with_registry!(
162                "epoch_pending_certs_processed_time_since_epoch_close_ms",
163                "Time interval from when epoch was closed to when all pending certificates are processed",
164                registry
165            ).unwrap(),
166            epoch_end_of_publish_quorum_time_since_epoch_close_ms: register_int_gauge_with_registry!(
167                "epoch_end_of_publish_quorum_time_since_epoch_close_ms",
168                "Time interval from when epoch was closed to when 2f+1 EndOfPublish messages are received",
169                registry
170            ).unwrap(),
171            epoch_last_checkpoint_created_time_since_epoch_close_ms: register_int_gauge_with_registry!(
172                "epoch_last_checkpoint_created_time_since_epoch_close_ms",
173                "Time interval from when epoch was closed to when the last checkpoint of the epoch is created",
174                registry
175            ).unwrap(),
176            epoch_reconfig_start_time_since_epoch_close_ms: register_int_gauge_with_registry!(
177                "epoch_reconfig_start_time_since_epoch_close_ms",
178                "Total time duration from when epoch was closed to when we begin to reconfigure the validator",
179                registry
180            ).unwrap(),
181            epoch_validator_halt_duration_ms: register_int_gauge_with_registry!(
182                "epoch_validator_halt_duration_ms",
183                "Total time duration when the validator was halted (i.e. epoch closed)",
184                registry
185            ).unwrap(),
186            epoch_first_checkpoint_created_time_since_epoch_begin_ms: register_int_gauge_with_registry!(
187                "epoch_first_checkpoint_created_time_since_epoch_begin_ms",
188                "Time interval from when the epoch opens at new epoch to the first checkpoint is created locally",
189                registry
190            ).unwrap(),
191            is_safe_mode: register_int_gauge_with_registry!(
192                "is_safe_mode",
193                "Whether we are running in safe mode",
194                registry,
195            ).unwrap(),
196            checkpoint_builder_advance_epoch_is_safe_mode: register_int_gauge_with_registry!(
197                "checkpoint_builder_advance_epoch_is_safe_mode",
198                "Whether the advance epoch execution leads to safe mode while building the last checkpoint",
199                registry,
200            ).unwrap(),
201            effective_buffer_stake: register_int_gauge_with_registry!(
202                "effective_buffer_stake",
203                "Buffer stake current in effect for this epoch",
204                registry,
205            ).unwrap(),
206            epoch_random_beacon_dkg_failed: register_int_gauge_with_registry!(
207                "epoch_random_beacon_dkg_failed",
208                "Set to 1 if the random beacon DKG protocol failed for the most recent epoch.",
209                registry
210            )
211            .unwrap(),
212            epoch_random_beacon_dkg_num_shares: register_int_gauge_with_registry!(
213                "epoch_random_beacon_dkg_num_shares",
214                "The number of shares held by this node after the random beacon DKG protocol completed",
215                registry
216            )
217            .unwrap(),
218            epoch_random_beacon_dkg_epoch_start_completion_time_ms: register_int_gauge_with_registry!(
219                "epoch_random_beacon_dkg_epoch_start_completion_time_ms",
220                "The amount of time taken from epoch start to completion of random beacon DKG protocol, for the most recent epoch",
221                registry
222            )
223            .unwrap(),
224            epoch_random_beacon_dkg_completion_time_ms: register_int_gauge_with_registry!(
225                "epoch_random_beacon_dkg_completion_time_ms",
226                "The amount of time taken to complete random beacon DKG protocol from the time it was started (which may be a bit after the epoch began), for the most recent epoch",
227                registry
228            )
229            .unwrap(),
230            epoch_random_beacon_dkg_message_time_ms: register_int_gauge_with_registry!(
231                "epoch_random_beacon_dkg_message_time_ms",
232                "The amount of time taken to start first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Message, for the most recent epoch",
233                registry
234            )
235            .unwrap(),
236            epoch_random_beacon_dkg_confirmation_time_ms: register_int_gauge_with_registry!(
237                "epoch_random_beacon_dkg_confirmation_time_ms",
238                "The amount of time taken to complete first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Confirmation, for the most recent epoch",
239                registry
240            )
241            .unwrap(),
242            consensus_quarantine_queue_size: register_int_gauge_with_registry!(
243                "consensus_quarantine_queue_size",
244                "The number of consensus output items in the quarantine",
245                registry
246            )
247            .unwrap(),
248            shared_object_assignments_size: register_int_gauge_with_registry!(
249                "shared_object_assignments_size",
250                "The number of shared object assignments in the quarantine",
251                registry
252            )
253            .unwrap(),
254        };
255        Arc::new(this)
256    }
257}