iota_core/epoch/
epoch_metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// Modifications Copyright (c) 2024 IOTA Stiftung
3// SPDX-License-Identifier: Apache-2.0
4
5use std::sync::Arc;
6
7use prometheus::{IntGauge, Registry, register_int_gauge_with_registry};
8
9pub struct EpochMetrics {
10    /// The current epoch ID. This is updated only when the AuthorityState
11    /// finishes reconfiguration.
12    pub current_epoch: IntGauge,
13
14    /// Current voting right of the validator in the protocol. Updated at the
15    /// start of epochs.
16    pub current_voting_right: IntGauge,
17
18    /// Total duration of the epoch. This is measured from when the current
19    /// epoch store is opened, until the current epoch store is replaced
20    /// with the next epoch store.
21    pub epoch_total_duration: IntGauge,
22
23    /// Number of checkpoints in the epoch.
24    pub epoch_checkpoint_count: IntGauge,
25
26    /// Number of transactions in the epoch.
27    pub epoch_transaction_count: IntGauge,
28
29    /// Total amount of gas rewards (i.e. computation gas cost) in the epoch.
30    pub epoch_total_gas_reward: IntGauge,
31
32    // An active validator reconfigures through the following steps:
33    // 1. Halt validator (a.k.a. close epoch) and stop accepting user transaction certs.
34    // 2. Finishes processing all pending certificates and then send EndOfPublish message.
35    // 3. Stop accepting messages from consensus after seeing 2f+1 EndOfPublish messages.
36    // 4. Creating the last checkpoint of the epoch by augmenting it with AdvanceEpoch transaction.
37    // 5. CheckpointExecutor finishes executing the last checkpoint, and triggers reconfiguration.
38    // 6. During reconfiguration, we tear down consensus, reconfigure state (at which point we
39    //    opens up user certs), and start consensus again.
40    // 7. After reconfiguration, and eventually consensus starts successfully, at some point the
41    //    first checkpoint of the new epoch will be created.
42    // We introduce various metrics to cover the latency of above steps.
43    /// The duration from when the epoch is closed (i.e. validator halted) to
44    /// when all pending certificates are processed (i.e. ready to send
45    /// EndOfPublish message). This is the duration of (1) through (2)
46    /// above.
47    pub epoch_pending_certs_processed_time_since_epoch_close_ms: IntGauge,
48
49    /// The interval from when the epoch is closed to when we receive 2f+1
50    /// EndOfPublish messages. This is the duration of (1) through (3)
51    /// above.
52    pub epoch_end_of_publish_quorum_time_since_epoch_close_ms: IntGauge,
53
54    /// The interval from when the epoch is closed to when we created the last
55    /// checkpoint of the epoch.
56    /// This is the duration of (1) through (4) above.
57    pub epoch_last_checkpoint_created_time_since_epoch_close_ms: IntGauge,
58
59    /// The interval from when the epoch is closed to when we finished executing
60    /// the last transaction of the checkpoint (and hence triggering
61    /// reconfiguration process). This is the duration of (1) through (5)
62    /// above.
63    pub epoch_reconfig_start_time_since_epoch_close_ms: IntGauge,
64
65    /// The total duration when this validator is halted, and hence does not
66    /// accept certs from users. This is the duration of (1) through (6)
67    /// above, and is the most important latency metric reflecting
68    /// reconfiguration delay for each validator.
69    pub epoch_validator_halt_duration_ms: IntGauge,
70
71    /// The interval from when the epoch begins (i.e. right after state
72    /// reconfigure, when the new epoch_store is created), to when the first
73    /// checkpoint of the epoch is ready for creation locally. This is (7)
74    /// above, and is a good proxy to how long it takes for the validator to
75    /// become useful in the network after reconfiguration.
76    // TODO: This needs to be reported properly.
77    pub epoch_first_checkpoint_created_time_since_epoch_begin_ms: IntGauge,
78
79    /// Whether we are running in safe mode where reward distribution and
80    /// tokenomics are disabled.
81    pub is_safe_mode: IntGauge,
82
83    /// When building the last checkpoint of the epoch, we execute advance epoch
84    /// transaction once without committing results to the store. It's
85    /// useful to know whether this execution leads to safe_mode, since in
86    /// theory the result could be different from checkpoint executor.
87    pub checkpoint_builder_advance_epoch_is_safe_mode: IntGauge,
88
89    /// Buffer stake current in effect for this epoch
90    pub effective_buffer_stake: IntGauge,
91
92    /// Set to 1 if the random beacon DKG protocol failed for the most recent
93    /// epoch.
94    pub epoch_random_beacon_dkg_failed: IntGauge,
95
96    /// The number of shares held by this node after the random beacon DKG
97    /// protocol completed.
98    pub epoch_random_beacon_dkg_num_shares: IntGauge,
99
100    /// The amount of time taken from epoch start to completion of random beacon
101    /// DKG protocol, for the most recent epoch.
102    pub epoch_random_beacon_dkg_epoch_start_completion_time_ms: IntGauge,
103
104    /// The amount of time taken to complete random beacon DKG protocol from the
105    /// time it was started (which may be a bit after the epcoh began), for
106    /// the most recent epoch.
107    pub epoch_random_beacon_dkg_completion_time_ms: IntGauge,
108
109    /// The amount of time taken to start first phase of the random beacon DKG
110    /// protocol, at which point the node has submitted a DKG Message, for
111    /// the most recent epoch.
112    pub epoch_random_beacon_dkg_message_time_ms: IntGauge,
113
114    /// The amount of time taken to complete first phase of the random beacon
115    /// DKG protocol, at which point the node has submitted a DKG
116    /// Confirmation, for the most recent epoch.
117    pub epoch_random_beacon_dkg_confirmation_time_ms: IntGauge,
118}
119
120impl EpochMetrics {
121    pub fn new(registry: &Registry) -> Arc<Self> {
122        let this = Self {
123            current_epoch: register_int_gauge_with_registry!(
124                "current_epoch",
125                "Current epoch ID",
126                registry
127            )
128            .unwrap(),
129            current_voting_right: register_int_gauge_with_registry!(
130                "current_voting_right",
131                "Current voting right of the validator",
132                registry
133            )
134            .unwrap(),
135            epoch_checkpoint_count: register_int_gauge_with_registry!(
136                "epoch_checkpoint_count",
137                "Number of checkpoints in the epoch",
138                registry
139            ).unwrap(),
140            epoch_total_duration: register_int_gauge_with_registry!(
141                "epoch_total_duration",
142                "Total duration of the epoch",
143                registry
144            ).unwrap(),
145            epoch_transaction_count: register_int_gauge_with_registry!(
146                "epoch_transaction_count",
147                "Number of transactions in the epoch",
148                registry
149            ).unwrap(),
150            epoch_total_gas_reward: register_int_gauge_with_registry!(
151                "epoch_total_gas_reward",
152                "Total amount of gas rewards (i.e. computation gas cost) in the epoch",
153                registry
154            ).unwrap(),
155            epoch_pending_certs_processed_time_since_epoch_close_ms: register_int_gauge_with_registry!(
156                "epoch_pending_certs_processed_time_since_epoch_close_ms",
157                "Time interval from when epoch was closed to when all pending certificates are processed",
158                registry
159            ).unwrap(),
160            epoch_end_of_publish_quorum_time_since_epoch_close_ms: register_int_gauge_with_registry!(
161                "epoch_end_of_publish_quorum_time_since_epoch_close_ms",
162                "Time interval from when epoch was closed to when 2f+1 EndOfPublish messages are received",
163                registry
164            ).unwrap(),
165            epoch_last_checkpoint_created_time_since_epoch_close_ms: register_int_gauge_with_registry!(
166                "epoch_last_checkpoint_created_time_since_epoch_close_ms",
167                "Time interval from when epoch was closed to when the last checkpoint of the epoch is created",
168                registry
169            ).unwrap(),
170            epoch_reconfig_start_time_since_epoch_close_ms: register_int_gauge_with_registry!(
171                "epoch_reconfig_start_time_since_epoch_close_ms",
172                "Total time duration from when epoch was closed to when we begin to reconfigure the validator",
173                registry
174            ).unwrap(),
175            epoch_validator_halt_duration_ms: register_int_gauge_with_registry!(
176                "epoch_validator_halt_duration_ms",
177                "Total time duration when the validator was halted (i.e. epoch closed)",
178                registry
179            ).unwrap(),
180            epoch_first_checkpoint_created_time_since_epoch_begin_ms: register_int_gauge_with_registry!(
181                "epoch_first_checkpoint_created_time_since_epoch_begin_ms",
182                "Time interval from when the epoch opens at new epoch to the first checkpoint is created locally",
183                registry
184            ).unwrap(),
185            is_safe_mode: register_int_gauge_with_registry!(
186                "is_safe_mode",
187                "Whether we are running in safe mode",
188                registry,
189            ).unwrap(),
190            checkpoint_builder_advance_epoch_is_safe_mode: register_int_gauge_with_registry!(
191                "checkpoint_builder_advance_epoch_is_safe_mode",
192                "Whether the advance epoch execution leads to safe mode while building the last checkpoint",
193                registry,
194            ).unwrap(),
195            effective_buffer_stake: register_int_gauge_with_registry!(
196                "effective_buffer_stake",
197                "Buffer stake current in effect for this epoch",
198                registry,
199            ).unwrap(),
200            epoch_random_beacon_dkg_failed: register_int_gauge_with_registry!(
201                "epoch_random_beacon_dkg_failed",
202                "Set to 1 if the random beacon DKG protocol failed for the most recent epoch.",
203                registry
204            )
205            .unwrap(),
206            epoch_random_beacon_dkg_num_shares: register_int_gauge_with_registry!(
207                "epoch_random_beacon_dkg_num_shares",
208                "The number of shares held by this node after the random beacon DKG protocol completed",
209                registry
210            )
211            .unwrap(),
212            epoch_random_beacon_dkg_epoch_start_completion_time_ms: register_int_gauge_with_registry!(
213                "epoch_random_beacon_dkg_epoch_start_completion_time_ms",
214                "The amount of time taken from epoch start to completion of random beacon DKG protocol, for the most recent epoch",
215                registry
216            )
217            .unwrap(),
218            epoch_random_beacon_dkg_completion_time_ms: register_int_gauge_with_registry!(
219                "epoch_random_beacon_dkg_completion_time_ms",
220                "The amount of time taken to complete random beacon DKG protocol from the time it was started (which may be a bit after the epoch began), for the most recent epoch",
221                registry
222            )
223            .unwrap(),
224            epoch_random_beacon_dkg_message_time_ms: register_int_gauge_with_registry!(
225                "epoch_random_beacon_dkg_message_time_ms",
226                "The amount of time taken to start first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Message, for the most recent epoch",
227                registry
228            )
229            .unwrap(),
230            epoch_random_beacon_dkg_confirmation_time_ms: register_int_gauge_with_registry!(
231                "epoch_random_beacon_dkg_confirmation_time_ms",
232                "The amount of time taken to complete first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Confirmation, for the most recent epoch",
233                registry
234            )
235            .unwrap(),
236        };
237        Arc::new(this)
238    }
239}