@@ -130,6 +130,8 @@ struct fd_repair_tile_ctx {
130
130
131
131
fd_wksp_t * wksp ;
132
132
133
+ fd_stem_context_t * stem ;
134
+
133
135
uchar in_kind [ MAX_IN_LINKS ];
134
136
fd_repair_in_ctx_t in_links [ MAX_IN_LINKS ];
135
137
@@ -235,19 +237,19 @@ repair_signer_async( void * signer_ctx,
235
237
ulong len ,
236
238
int sign_type ) {
237
239
fd_repair_tile_ctx_t * ctx = (fd_repair_tile_ctx_t * ) signer_ctx ;
238
-
240
+
239
241
if ( FD_UNLIKELY ( ctx -> repair_sign_cnt == 0 ) ) {
240
242
FD_LOG_ERR (( "No repair_sign links configured for async signing" ));
241
243
}
242
-
244
+
243
245
uint round_robin_idx = (uint )(nonce % ctx -> repair_sign_cnt );
244
246
fd_repair_out_ctx_t * sign_out = & ctx -> repair_sign_out_ctx [ round_robin_idx ];
245
-
247
+
246
248
uchar * dst = fd_chunk_to_laddr ( sign_out -> mem , sign_out -> chunk );
247
249
fd_memcpy ( dst , buffer , len );
248
-
250
+
249
251
ulong sig = ((ulong )nonce << 32 ) | (ulong )(uint )sign_type ;
250
- fd_stem_publish ( ctx -> stem , sign_out -> idx , sig , sign_out -> chunk , len , 0UL , 0UL , 0UL );
252
+ fd_stem_publish ( ctx -> stem , sign_out -> idx , sig , sign_out -> chunk , len , 0UL , 0UL , 0UL );
251
253
sign_out -> chunk = fd_dcache_compact_next ( sign_out -> chunk , len , sign_out -> chunk0 , sign_out -> wmark );
252
254
253
255
ctx -> request_seq = fd_seq_inc ( ctx -> request_seq , 1UL );
@@ -409,7 +411,7 @@ fd_repair_sign_and_send( fd_repair_tile_ctx_t * repair_tile_ctx,
409
411
if ( is_async ) {
410
412
repair_signer_async ( repair_tile_ctx , nonce , buf , buflen , FD_KEYGUARD_SIGN_TYPE_ED25519 );
411
413
return 0UL ;
412
-
414
+
413
415
/* If sync, we sign using keyguard */
414
416
} else {
415
417
fd_signature_t sig ;
@@ -424,7 +426,7 @@ fd_repair_sign_and_send( fd_repair_tile_ctx_t * repair_tile_ctx,
424
426
}
425
427
}
426
428
427
- /* REPAIR TILE REQUEST HANDLING ARCHITECTURE
429
+ /* REQUEST HANDLING ARCHITECTURE
428
430
=============================
429
431
430
432
The repair tile implements two distinct request handling patterns
@@ -440,11 +442,7 @@ fd_repair_sign_and_send( fd_repair_tile_ctx_t * repair_tile_ctx,
440
442
- PINGs & PONGs: Handles peer connectivity and liveness with simple
441
443
round-trip messages.
442
444
443
- - PEER WARM UPs: On receiving peer information in
444
- handle_new_cluster_contact_info, we prepay the RTT cost by sending
445
- a placeholder Repair request immediately.
446
-
447
- 2. ASYNCHRONOUS REQUEST HANDLING
445
+ 2. ASYNCHRONOUS REQUEST HANDLING
448
446
--------------------------------
449
447
Used strictly for repair requests. These requests are sent to the
450
448
sign tile, and the repair tile continues handling other operations
@@ -456,15 +454,19 @@ fd_repair_sign_and_send( fd_repair_tile_ctx_t * repair_tile_ctx,
456
454
- WINDOW_INDEX (exact shred): Requests for a specific shred at a
457
455
known slot and index. Used when the repair tile knows exactly
458
456
which shred is missing from a FEC set.
459
-
457
+
460
458
- HIGHEST_WINDOW_INDEX: Requests for the highest shred in a slot.
461
459
Used to determine the end boundary of a slot when the exact count
462
460
is unknown.
463
-
461
+
464
462
- ORPHAN: Requests for the highest shred in the parent slot of an
465
463
orphaned slot. Used to establish the chain of slot ancestry when a
466
464
slot's parent is missing.
467
465
466
+ - PEER WARM UPs: On receiving peer information in
467
+ handle_new_cluster_contact_info, we prepay the RTT cost by sending
468
+ a placeholder Repair request immediately.
469
+
468
470
Async requests can be distributed across multiple sign tiles using
469
471
round-robin based on the request nonce. This provides load balancing
470
472
and prevents any single sign tile from becoming a bottleneck. */
@@ -519,32 +521,29 @@ fd_repair_send_request_async( fd_repair_tile_ctx_t * ctx,
519
521
long now ){
520
522
fd_active_elem_t * peer = fd_active_table_query (glob -> actives , recipient , NULL );
521
523
if (!peer ) return ;
522
-
524
+
523
525
uint nonce = (uint )glob -> next_nonce ;
524
526
525
527
fd_repair_protocol_t protocol ;
526
528
fd_repair_construct_request_protocol ( glob , & protocol , type , slot , shred_index , recipient , glob -> next_nonce , now );
527
529
glob -> next_nonce ++ ;
528
530
glob -> metrics .send_pkt_cnt ++ ;
529
-
531
+
530
532
if ( FD_UNLIKELY ( fd_repair_pending_sign_req_deque_full ( ctx -> pending_sign_req_deque ) ) ) {
531
533
return ;
532
534
}
533
-
535
+
534
536
fd_repair_pending_sign_req_t * pending = fd_repair_pending_sign_req_deque_push_tail_nocopy ( ctx -> pending_sign_req_deque );
535
-
536
- /* Use the unified sign_and_send function in async mode */
537
+
537
538
fd_repair_sign_and_send ( ctx , & protocol , & peer -> addr , pending -> buf , sizeof (pending -> buf ), 1 , nonce );
538
-
539
- /* Since async requests don't complete the buffer with signature,
540
- we need to track the encoded length */
539
+
541
540
fd_bincode_encode_ctx_t encode_ctx = { .data = pending -> buf , .dataend = pending -> buf + sizeof (pending -> buf ) };
542
541
if ( FD_UNLIKELY ( fd_repair_protocol_encode ( & protocol , & encode_ctx ) != FD_BINCODE_SUCCESS ) ) {
543
542
FD_LOG_CRIT (( "Failed to encode repair message (type %#x)" , protocol .discriminant ));
544
543
}
545
-
544
+
546
545
ulong buflen = (ulong )encode_ctx .data - (ulong )pending -> buf ;
547
-
546
+
548
547
pending -> buflen = buflen ;
549
548
pending -> sig_offset = 4 ;
550
549
pending -> dst_ip_addr = peer -> addr .addr ;
@@ -593,12 +592,12 @@ handle_new_cluster_contact_info( fd_repair_tile_ctx_t * ctx,
593
592
};
594
593
int dup = fd_repair_add_active_peer ( ctx -> repair , & repair_peer , in_dests [i ].pubkey );
595
594
if ( !dup ) {
596
- /* The repair process uses a Ping-Pong protocol that incurs one
597
- round-trip time (RTT) for the initial repair request. To optimize
598
- this, we proactively send a placeholder Repair request as soon as we
599
- receive a peer's contact information for the first time, effectively
600
- prepaying the RTT cost. */
601
- fd_repair_send_request (ctx , ctx -> stem , ctx -> repair , 0 , 0 , 0 , in_dests [i ].pubkey , fd_log_wallclock ());
595
+ /* The repair process uses a Ping-Pong protocol that incurs one
596
+ round-trip time (RTT) for the initial repair request. To optimize
597
+ this, we proactively send a placeholder Repair request as soon as we
598
+ receive a peer's contact information for the first time, effectively
599
+ prepaying the RTT cost. */
600
+ fd_repair_send_request_async (ctx , ctx -> stem , ctx -> repair , 0 , 0 , 0 , in_dests [i ].pubkey , fd_log_wallclock ());
602
601
ulong hash_src = 0xfffffUL & fd_ulong_hash ( (ulong )in_dests [i ].ip4_addr | ((ulong )repair_peer .port <<32 ) );
603
602
FD_LOG_INFO (( "Added repair peer: pubkey %s hash_src %lu" , FD_BASE58_ENC_32_ALLOCA (in_dests [i ].pubkey ), hash_src ));
604
603
}
@@ -611,7 +610,6 @@ before_frag( fd_repair_tile_ctx_t * ctx,
611
610
ulong in_idx ,
612
611
ulong seq FD_PARAM_UNUSED ,
613
612
ulong sig ) {
614
- // FD_LOG_NOTICE(( "repair: before_frag %lu", in_idx ));
615
613
uint in_kind = ctx -> in_kind [ in_idx ];
616
614
if ( FD_LIKELY ( in_kind == IN_KIND_NET ) ) return fd_disco_netmux_sig_proto ( sig )!= DST_PROTO_REPAIR ;
617
615
return 0 ;
@@ -670,7 +668,7 @@ during_frag( fd_repair_tile_ctx_t * ctx,
670
668
fd_memcpy ( ctx -> buffer , dcache_entry , dcache_entry_sz );
671
669
}
672
670
673
- static ulong
671
+ static ulong FD_FN_UNUSED
674
672
fd_repair_send_ping ( fd_repair_tile_ctx_t * repair_tile_ctx ,
675
673
fd_repair_t * glob ,
676
674
fd_pinged_elem_t * val ,
@@ -697,7 +695,7 @@ fd_repair_send_ping( fd_repair_tile_ctx_t * repair_tile_ctx,
697
695
return (ulong )((uchar * )ctx .data - buf );
698
696
}
699
697
700
- static void
698
+ static void FD_FN_UNUSED
701
699
fd_repair_recv_pong (fd_repair_t * glob , fd_gossip_ping_t const * pong , fd_gossip_peer_addr_t const * from ) {
702
700
fd_pinged_elem_t * val = fd_pinged_table_query (glob -> pinged , from , NULL );
703
701
if ( val == NULL || !fd_pubkey_eq ( & val -> id , & pong -> from ) )
@@ -731,10 +729,6 @@ fd_repair_recv_pong(fd_repair_t * glob, fd_gossip_ping_t const * pong, fd_gossip
731
729
val -> good = 1 ;
732
730
}
733
731
734
- /* Pass a raw service request packet into the protocol.
735
- src_addr is the address of the sender
736
- dst_ip4_addr is the dst IPv4 address of the incoming packet (i.e. our IP) */
737
-
738
732
static void
739
733
after_frag ( fd_repair_tile_ctx_t * ctx ,
740
734
ulong in_idx ,
@@ -747,8 +741,9 @@ after_frag( fd_repair_tile_ctx_t * ctx,
747
741
748
742
if ( FD_UNLIKELY ( ctx -> skip_frag ) ) return ;
749
743
744
+ ctx -> stem = stem ;
745
+
750
746
uint in_kind = ctx -> in_kind [ in_idx ];
751
- // FD_LOG_INFO(( "in_idx: %lu, in_kind: %u", in_idx, in_kind ));
752
747
if ( FD_UNLIKELY ( in_kind == IN_KIND_CONTACT ) ) {
753
748
handle_new_cluster_contact_info ( ctx , ctx -> buffer , sz );
754
749
return ;
@@ -763,37 +758,37 @@ after_frag( fd_repair_tile_ctx_t * ctx,
763
758
/* Nonce was packed into sig, so we need to unpack it */
764
759
ulong response_nonce = sig >> 32 ;
765
760
fd_repair_pending_sign_req_t pending ;
766
-
761
+
767
762
/* Iterate over all pending requests, as every request sent to the
768
763
sign tile will be returned. Since the repair_sign links are
769
764
reliable, the incoming sign_repair fragments represent a complete
770
765
set of the previously sent outgoing messages. However, with
771
- multiple sign tiles, the responses may not arrive in order. But,
766
+ multiple sign tiles, the responses may not arrive in order. But,
772
767
we can safely process them sequentially as we encounter them in
773
768
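the deque: entries whose nonce is smaller than the response nonce are popped and discarded without being sent, and the scan stops at the first entry whose nonce is larger. */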
774
769
while ( !fd_repair_pending_sign_req_deque_empty ( ctx -> pending_sign_req_deque ) ) {
775
770
fd_repair_pending_sign_req_t * head_req = fd_repair_pending_sign_req_deque_peek_head ( ctx -> pending_sign_req_deque );
776
-
771
+
777
772
if ( head_req -> nonce > response_nonce ) {
778
773
break ;
779
774
}
780
-
775
+
781
776
pending = fd_repair_pending_sign_req_deque_pop_head ( ctx -> pending_sign_req_deque );
782
-
777
+
783
778
if ( pending .nonce == response_nonce ) {
784
779
fd_memcpy ( pending .buf + pending .sig_offset , ctx -> buffer , 64UL );
785
780
ulong tsorig = fd_frag_meta_ts_comp ( fd_tickcount () );
786
-
781
+
787
782
uint src_ip4_addr = 0U ;
788
783
send_packet ( ctx , stem , 1 , pending .dst_ip_addr , pending .dst_port , src_ip4_addr , pending .buf , pending .buflen , tsorig );
789
784
return ;
790
785
}
791
786
}
792
-
787
+
793
788
FD_LOG_WARNING (( "No matching request found for nonce %lu" , response_nonce ));
794
789
return ;
795
790
}
796
-
791
+
797
792
if ( FD_UNLIKELY ( in_kind == IN_KIND_SHRED ) ) {
798
793
799
794
/* Initialize the forest, which requires the root to be ready. This
@@ -941,10 +936,6 @@ after_credit( fd_repair_tile_ctx_t * ctx,
941
936
int * opt_poll_in ,
942
937
int * charge_busy ) {
943
938
944
- /* TODO: Don't charge the tile as busy if after_credit isn't actually
945
- doing any work. */
946
- * charge_busy = 1 ;
947
-
948
939
if ( FD_LIKELY ( !fd_fec_out_empty ( ctx -> fec_chainer -> out ) && ctx -> store ) ) {
949
940
950
941
fd_fec_out_t out = fd_fec_out_pop_head ( ctx -> fec_chainer -> out );
@@ -1036,7 +1027,6 @@ after_credit( fd_repair_tile_ctx_t * ctx,
1036
1027
fd_stem_publish ( ctx -> stem , REPLAY_OUT_IDX , sig , 0 , 0 , 0 , 0 , tspub );
1037
1028
if ( FD_UNLIKELY ( out .slot_complete ) ) {
1038
1029
fd_reasm_remove ( ctx -> reasm , reasm );
1039
- // FD_LOG_INFO(( "SLOT COMPLETE: %lu, time: %ld", out.slot, fd_log_wallclock() ));
1040
1030
}
1041
1031
}
1042
1032
* opt_poll_in = 1 ;
@@ -1192,17 +1182,7 @@ unprivileged_init( fd_topo_t * topo,
1192
1182
ctx -> in_links [ in_idx ].chunk0 = fd_dcache_compact_chunk0 ( ctx -> in_links [ in_idx ].mem , link -> dcache );
1193
1183
ctx -> in_links [ in_idx ].wmark = fd_dcache_compact_wmark ( ctx -> in_links [ in_idx ].mem , link -> dcache , link -> mtu );
1194
1184
ctx -> in_links [ in_idx ].mtu = link -> mtu ;
1195
-
1196
- if ( ctx -> in_kind [ in_idx ] == IN_KIND_SIGN ) {
1197
- // fd_wksp_t * wksp = fd_wksp_containing( link->dcache );
1198
- // FD_LOG_NOTICE(( "repair tile: link %s[%lu] mem=%p, dcache=%p, wksp=%p, wksp_name=%s, in_idx=%u",
1199
- // link->name, link->kind_id,
1200
- // (void*)ctx->in_links[ in_idx ].mem,
1201
- // link->dcache,
1202
- // (void*)wksp,
1203
- // wksp ? fd_wksp_name( wksp ) : "NULL",
1204
- // in_idx ));
1205
- }
1185
+
1206
1186
FD_TEST ( fd_dcache_compact_is_safe ( ctx -> in_links [in_idx ].mem , link -> dcache , link -> mtu , link -> depth ) );
1207
1187
}
1208
1188
@@ -1247,7 +1227,7 @@ unprivileged_init( fd_topo_t * topo,
1247
1227
ctx -> shredcap_out_chunk0 = fd_dcache_compact_chunk0 ( ctx -> shredcap_out_mem , link -> dcache );
1248
1228
ctx -> shredcap_out_wmark = fd_dcache_compact_wmark ( ctx -> shredcap_out_mem , link -> dcache , link -> mtu );
1249
1229
ctx -> shredcap_out_chunk = ctx -> shredcap_out_chunk0 ;
1250
-
1230
+
1251
1231
} else if ( 0 == strcmp ( link -> name , "ping_sign" ) ) {
1252
1232
ctx -> ping_sign_out_idx = out_idx ;
1253
1233
ctx -> ping_sign_out_mem = topo -> workspaces [ topo -> objs [ link -> dcache_obj_id ].wksp_id ].wksp ;
0 commit comments