Skip to content

Commit d5cf53d

Browse files
replay: adding better metrics (#7583)
* replay: adding counters for replay-blocking conditions * replay: adding reasm tracking metrics
1 parent f13391c commit d5cf53d

File tree

5 files changed

+108
-10
lines changed

5 files changed

+108
-10
lines changed

book/api/metrics-generated.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,8 +526,15 @@
526526
| <span class="metrics-name">replay_&#8203;reset_&#8203;slot</span> | gauge | The slot at which we last reset the replay stage, or 0 if unknown |
527527
| <span class="metrics-name">replay_&#8203;max_&#8203;live_&#8203;banks</span> | gauge | The maximum number of banks we can have alive |
528528
| <span class="metrics-name">replay_&#8203;live_&#8203;banks</span> | gauge | The number of banks we currently have alive |
529+
| <span class="metrics-name">replay_&#8203;reasm_&#8203;free</span> | gauge | The number of free FEC sets in the reassembly queue |
530+
| <span class="metrics-name">replay_&#8203;reasm_&#8203;latest_&#8203;slot</span> | gauge | Slot of the latest FEC set in the reassembly queue that can be replayed |
531+
| <span class="metrics-name">replay_&#8203;reasm_&#8203;latest_&#8203;fec_&#8203;idx</span> | gauge | FEC set index of the latest FEC set in the reassembly queue that can be replayed |
529532
| <span class="metrics-name">replay_&#8203;slots_&#8203;total</span> | counter | Count of slots replayed successfully |
530533
| <span class="metrics-name">replay_&#8203;transactions_&#8203;total</span> | counter | Count of transactions processed overall on the current fork |
534+
| <span class="metrics-name">replay_&#8203;sched_&#8203;full</span> | counter | Times where sched is full and a FEC set can't be processed |
535+
| <span class="metrics-name">replay_&#8203;reasm_&#8203;empty</span> | counter | Times where reasm is empty and a FEC set can't be processed |
536+
| <span class="metrics-name">replay_&#8203;leader_&#8203;bid_&#8203;wait</span> | counter | Times where replay is blocked by the PoH tile not sending an end of leader message |
537+
| <span class="metrics-name">replay_&#8203;banks_&#8203;full</span> | counter | Times where banks are full and a FEC set can't be processed |
531538
| <span class="metrics-name">replay_&#8203;progcache_&#8203;rooted</span> | counter | Number of program cache entries rooted |
532539
| <span class="metrics-name">replay_&#8203;progcache_&#8203;gc_&#8203;root</span> | counter | Number of program cache entries garbage collected while rooting |
533540
| <span class="metrics-name">replay_&#8203;accdb_&#8203;rooted</span> | counter | Number of account database entries rooted |

src/disco/metrics/generated/fd_metrics_replay.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,15 @@ const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL] = {
1515
DECLARE_METRIC( REPLAY_RESET_SLOT, GAUGE ),
1616
DECLARE_METRIC( REPLAY_MAX_LIVE_BANKS, GAUGE ),
1717
DECLARE_METRIC( REPLAY_LIVE_BANKS, GAUGE ),
18+
DECLARE_METRIC( REPLAY_REASM_FREE, GAUGE ),
19+
DECLARE_METRIC( REPLAY_REASM_LATEST_SLOT, GAUGE ),
20+
DECLARE_METRIC( REPLAY_REASM_LATEST_FEC_IDX, GAUGE ),
1821
DECLARE_METRIC( REPLAY_SLOTS_TOTAL, COUNTER ),
1922
DECLARE_METRIC( REPLAY_TRANSACTIONS_TOTAL, COUNTER ),
23+
DECLARE_METRIC( REPLAY_SCHED_FULL, COUNTER ),
24+
DECLARE_METRIC( REPLAY_REASM_EMPTY, COUNTER ),
25+
DECLARE_METRIC( REPLAY_LEADER_BID_WAIT, COUNTER ),
26+
DECLARE_METRIC( REPLAY_BANKS_FULL, COUNTER ),
2027
DECLARE_METRIC( REPLAY_PROGCACHE_ROOTED, COUNTER ),
2128
DECLARE_METRIC( REPLAY_PROGCACHE_GC_ROOT, COUNTER ),
2229
DECLARE_METRIC( REPLAY_ACCDB_ROOTED, COUNTER ),

src/disco/metrics/generated/fd_metrics_replay.h

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,43 +96,85 @@
9696
#define FD_METRICS_GAUGE_REPLAY_LIVE_BANKS_DESC "The number of banks we currently have alive"
9797
#define FD_METRICS_GAUGE_REPLAY_LIVE_BANKS_CVT (FD_METRICS_CONVERTER_NONE)
9898

99-
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_OFF (125UL)
99+
#define FD_METRICS_GAUGE_REPLAY_REASM_FREE_OFF (125UL)
100+
#define FD_METRICS_GAUGE_REPLAY_REASM_FREE_NAME "replay_reasm_free"
101+
#define FD_METRICS_GAUGE_REPLAY_REASM_FREE_TYPE (FD_METRICS_TYPE_GAUGE)
102+
#define FD_METRICS_GAUGE_REPLAY_REASM_FREE_DESC "The number of free FEC sets in the reassembly queue"
103+
#define FD_METRICS_GAUGE_REPLAY_REASM_FREE_CVT (FD_METRICS_CONVERTER_NONE)
104+
105+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_OFF (126UL)
106+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_NAME "replay_reasm_latest_slot"
107+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_TYPE (FD_METRICS_TYPE_GAUGE)
108+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_DESC "Slot of the latest FEC set in the reassembly queue that can be replayed"
109+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_SLOT_CVT (FD_METRICS_CONVERTER_NONE)
110+
111+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_FEC_IDX_OFF (127UL)
112+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_FEC_IDX_NAME "replay_reasm_latest_fec_idx"
113+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_FEC_IDX_TYPE (FD_METRICS_TYPE_GAUGE)
114+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_FEC_IDX_DESC "FEC set index of the latest FEC set in the reassembly queue that can be replayed"
115+
#define FD_METRICS_GAUGE_REPLAY_REASM_LATEST_FEC_IDX_CVT (FD_METRICS_CONVERTER_NONE)
116+
117+
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_OFF (128UL)
100118
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_NAME "replay_slots_total"
101119
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER)
102120
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_DESC "Count of slots replayed successfully"
103121
#define FD_METRICS_COUNTER_REPLAY_SLOTS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE)
104122

105-
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_OFF (126UL)
123+
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_OFF (129UL)
106124
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_NAME "replay_transactions_total"
107125
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_TYPE (FD_METRICS_TYPE_COUNTER)
108126
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_DESC "Count of transactions processed overall on the current fork"
109127
#define FD_METRICS_COUNTER_REPLAY_TRANSACTIONS_TOTAL_CVT (FD_METRICS_CONVERTER_NONE)
110128

111-
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_OFF (127UL)
129+
#define FD_METRICS_COUNTER_REPLAY_SCHED_FULL_OFF (130UL)
130+
#define FD_METRICS_COUNTER_REPLAY_SCHED_FULL_NAME "replay_sched_full"
131+
#define FD_METRICS_COUNTER_REPLAY_SCHED_FULL_TYPE (FD_METRICS_TYPE_COUNTER)
132+
#define FD_METRICS_COUNTER_REPLAY_SCHED_FULL_DESC "Times where sched is full and a FEC set can't be processed"
133+
#define FD_METRICS_COUNTER_REPLAY_SCHED_FULL_CVT (FD_METRICS_CONVERTER_NONE)
134+
135+
#define FD_METRICS_COUNTER_REPLAY_REASM_EMPTY_OFF (131UL)
136+
#define FD_METRICS_COUNTER_REPLAY_REASM_EMPTY_NAME "replay_reasm_empty"
137+
#define FD_METRICS_COUNTER_REPLAY_REASM_EMPTY_TYPE (FD_METRICS_TYPE_COUNTER)
138+
#define FD_METRICS_COUNTER_REPLAY_REASM_EMPTY_DESC "Times where reasm is empty and a FEC set can't be processed"
139+
#define FD_METRICS_COUNTER_REPLAY_REASM_EMPTY_CVT (FD_METRICS_CONVERTER_NONE)
140+
141+
#define FD_METRICS_COUNTER_REPLAY_LEADER_BID_WAIT_OFF (132UL)
142+
#define FD_METRICS_COUNTER_REPLAY_LEADER_BID_WAIT_NAME "replay_leader_bid_wait"
143+
#define FD_METRICS_COUNTER_REPLAY_LEADER_BID_WAIT_TYPE (FD_METRICS_TYPE_COUNTER)
144+
/* Human-readable description of the replay_leader_bid_wait counter.
   Kept identical to the LeaderBidWait summary in metrics.xml. */
#define FD_METRICS_COUNTER_REPLAY_LEADER_BID_WAIT_DESC "Times where replay is blocked by the PoH tile not sending an end of leader message"
145+
#define FD_METRICS_COUNTER_REPLAY_LEADER_BID_WAIT_CVT (FD_METRICS_CONVERTER_NONE)
146+
147+
#define FD_METRICS_COUNTER_REPLAY_BANKS_FULL_OFF (133UL)
148+
#define FD_METRICS_COUNTER_REPLAY_BANKS_FULL_NAME "replay_banks_full"
149+
#define FD_METRICS_COUNTER_REPLAY_BANKS_FULL_TYPE (FD_METRICS_TYPE_COUNTER)
150+
#define FD_METRICS_COUNTER_REPLAY_BANKS_FULL_DESC "Times where banks are full and a FEC set can't be processed"
151+
#define FD_METRICS_COUNTER_REPLAY_BANKS_FULL_CVT (FD_METRICS_CONVERTER_NONE)
152+
153+
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_OFF (134UL)
112154
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_NAME "replay_progcache_rooted"
113155
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_TYPE (FD_METRICS_TYPE_COUNTER)
114156
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_DESC "Number of program cache entries rooted"
115157
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_ROOTED_CVT (FD_METRICS_CONVERTER_NONE)
116158

117-
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_OFF (128UL)
159+
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_OFF (135UL)
118160
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_NAME "replay_progcache_gc_root"
119161
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_TYPE (FD_METRICS_TYPE_COUNTER)
120162
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_DESC "Number of program cache entries garbage collected while rooting"
121163
#define FD_METRICS_COUNTER_REPLAY_PROGCACHE_GC_ROOT_CVT (FD_METRICS_CONVERTER_NONE)
122164

123-
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_OFF (129UL)
165+
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_OFF (136UL)
124166
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_NAME "replay_accdb_rooted"
125167
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_TYPE (FD_METRICS_TYPE_COUNTER)
126168
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_DESC "Number of account database entries rooted"
127169
#define FD_METRICS_COUNTER_REPLAY_ACCDB_ROOTED_CVT (FD_METRICS_CONVERTER_NONE)
128170

129-
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_OFF (130UL)
171+
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_OFF (137UL)
130172
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_NAME "replay_accdb_gc_root"
131173
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_TYPE (FD_METRICS_TYPE_COUNTER)
132174
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_DESC "Number of account database entries garbage collected"
133175
#define FD_METRICS_COUNTER_REPLAY_ACCDB_GC_ROOT_CVT (FD_METRICS_CONVERTER_NONE)
134176

135-
#define FD_METRICS_REPLAY_TOTAL (19UL)
177+
#define FD_METRICS_REPLAY_TOTAL (26UL)
136178
extern const fd_metrics_meta_t FD_METRICS_REPLAY[FD_METRICS_REPLAY_TOTAL];
137179

138180
#endif /* HEADER_fd_src_disco_metrics_generated_fd_metrics_replay_h */

src/disco/metrics/metrics.xml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,9 +794,18 @@ metric introduced.
794794
<gauge name="MaxLiveBanks" summary="The maximum number of banks we can have alive" />
795795
<gauge name="LiveBanks" summary="The number of banks we currently have alive" />
796796

797+
<gauge name="ReasmFree" summary="The number of free FEC sets in the reassembly queue" />
798+
<gauge name="ReasmLatestSlot" summary="Slot of the latest FEC set in the reassembly queue that can be replayed" />
799+
<gauge name="ReasmLatestFecIdx" summary="FEC set index of the latest FEC set in the reassembly queue that can be replayed" />
800+
797801
<counter name="SlotsTotal" summary="Count of slots replayed successfully" />
798802
<counter name="TransactionsTotal" summary="Count of transactions processed overall on the current fork" />
799803

804+
<counter name="SchedFull" summary="Times where sched is full and a FEC set can't be processed" />
805+
<counter name="ReasmEmpty" summary="Times where reasm is empty and a FEC set can't be processed" />
806+
<counter name="LeaderBidWait" summary="Times where replay is blocked by the PoH tile not sending an end of leader message" />
807+
<counter name="BanksFull" summary="Times where banks are full and a FEC set can't be processed" />
808+
800809
<counter name="ProgcacheRooted" summary="Number of program cache entries rooted" />
801810
<counter name="ProgcacheGcRoot" summary="Number of program cache entries garbage collected while rooting" />
802811

src/discof/replay/fd_replay_tile.c

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,14 @@ struct fd_replay_tile {
406406

407407
ulong slots_total;
408408
ulong transactions_total;
409+
410+
ulong reasm_latest_slot;
411+
ulong reasm_latest_fec_idx;
412+
413+
ulong sched_full;
414+
ulong reasm_empty;
415+
ulong leader_bid_wait;
416+
ulong banks_full;
409417
} metrics;
410418

411419
uchar __attribute__((aligned(FD_MULTI_EPOCH_LEADERS_ALIGN))) mleaders_mem[ FD_MULTI_EPOCH_LEADERS_FOOTPRINT ];
@@ -470,9 +478,20 @@ metrics_write( fd_replay_tile_t * ctx ) {
470478
ulong live_banks = fd_banks_pool_max( bank_pool ) - fd_banks_pool_free( bank_pool );
471479
FD_MGAUGE_SET( REPLAY, LIVE_BANKS, live_banks );
472480

481+
ulong reasm_free = fd_reasm_free( ctx->reasm );
482+
FD_MGAUGE_SET( REPLAY, REASM_FREE, reasm_free );
483+
473484
FD_MCNT_SET( REPLAY, SLOTS_TOTAL, ctx->metrics.slots_total );
474485
FD_MCNT_SET( REPLAY, TRANSACTIONS_TOTAL, ctx->metrics.transactions_total );
475486

487+
FD_MGAUGE_SET( REPLAY, REASM_LATEST_SLOT, ctx->metrics.reasm_latest_slot );
488+
FD_MGAUGE_SET( REPLAY, REASM_LATEST_FEC_IDX, ctx->metrics.reasm_latest_fec_idx );
489+
490+
FD_MCNT_SET( REPLAY, SCHED_FULL, ctx->metrics.sched_full );
491+
FD_MCNT_SET( REPLAY, REASM_EMPTY, ctx->metrics.reasm_empty );
492+
FD_MCNT_SET( REPLAY, LEADER_BID_WAIT, ctx->metrics.leader_bid_wait );
493+
FD_MCNT_SET( REPLAY, BANKS_FULL, ctx->metrics.banks_full );
494+
476495
FD_MCNT_SET( REPLAY, PROGCACHE_ROOTED, ctx->progcache_admin->metrics.root_cnt );
477496
FD_MCNT_SET( REPLAY, PROGCACHE_GC_ROOT, ctx->progcache_admin->metrics.gc_root_cnt );
478497

@@ -1600,8 +1619,18 @@ replay( fd_replay_tile_t * ctx,
16001619
static int
16011620
can_process_fec( fd_replay_tile_t * ctx ) {
16021621
fd_reasm_fec_t * fec;
1603-
if( FD_UNLIKELY( !fd_sched_can_ingest( ctx->sched, 1UL ) ) ) return 0;
1604-
if( FD_UNLIKELY( (fec = fd_reasm_peek( ctx->reasm ))==NULL ) ) return 0;
1622+
if( FD_UNLIKELY( !fd_sched_can_ingest( ctx->sched, 1UL ) ) ) {
1623+
ctx->metrics.sched_full++;
1624+
return 0;
1625+
}
1626+
1627+
if( FD_UNLIKELY( (fec = fd_reasm_peek( ctx->reasm ))==NULL ) ) {
1628+
ctx->metrics.reasm_empty++;
1629+
return 0;
1630+
}
1631+
1632+
ctx->metrics.reasm_latest_slot = fec->slot;
1633+
ctx->metrics.reasm_latest_fec_idx = fec->fec_set_idx;
16051634

16061635
if( FD_UNLIKELY( ctx->is_leader && fec->fec_set_idx==0U && fd_reasm_parent( ctx->reasm, fec )->bank_idx==ctx->leader_bank->idx ) ) {
16071636
/* There's a race that's exceedingly rare, where we receive the
@@ -1618,12 +1647,16 @@ can_process_fec( fd_replay_tile_t * ctx ) {
16181647
ordering invariants in banks and sched. */
16191648
FD_TEST( ctx->recv_block_id );
16201649
FD_TEST( !ctx->recv_poh );
1650+
ctx->metrics.leader_bid_wait++;
16211651
return 0;
16221652
}
16231653

16241654
/* If fec_set_idx is 0, we need a new bank for a new slot. Banks must
16251655
not be full in this case. */
1626-
if( FD_UNLIKELY( fd_banks_is_full( ctx->banks ) && fec->fec_set_idx==0 ) ) return 0;
1656+
if( FD_UNLIKELY( fd_banks_is_full( ctx->banks ) && fec->fec_set_idx==0 ) ) {
1657+
ctx->metrics.banks_full++;
1658+
return 0;
1659+
}
16271660

16281661
/* Otherwise, banks may not be full, so we can always create a new
16291662
bank if needed. Or, if banks are full, the current fec set's

0 commit comments

Comments
 (0)