9
9
//! sync as failed, log an error and attempt to retry once a new peer joins the node.
10
10
11
11
use crate :: network_beacon_processor:: ChainSegmentProcessId ;
12
+ use crate :: sync:: block_sidecar_coupling:: CouplingError ;
12
13
use crate :: sync:: manager:: BatchProcessResult ;
13
14
use crate :: sync:: network_context:: {
14
15
RangeRequestId , RpcRequestSendError , RpcResponseError , SyncNetworkContext ,
@@ -28,7 +29,7 @@ use std::collections::{
28
29
} ;
29
30
use std:: sync:: Arc ;
30
31
use tracing:: { debug, error, info, instrument, warn} ;
31
- use types:: { Epoch , EthSpec } ;
32
+ use types:: { ColumnIndex , Epoch , EthSpec } ;
32
33
33
34
/// Blocks are downloaded in batches from peers. This constant specifies how many epochs worth of
34
35
/// blocks per batch are requested _at most_. A batch may request less blocks to account for
@@ -223,9 +224,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
223
224
. network_globals
224
225
. peers
225
226
. read ( )
226
- . synced_peers ( )
227
+ . synced_peers_for_epoch ( self . to_be_downloaded , None )
227
228
. next ( )
228
229
. is_some ( )
230
+ // backfill can't progress if we do not have peers in the required subnets post peerdas.
231
+ && self . good_peers_on_sampling_subnets ( self . to_be_downloaded , network)
229
232
{
230
233
// If there are peers to resume with, begin the resume.
231
234
debug ! ( start_epoch = ?self . current_start, awaiting_batches = self . batches. len( ) , processing_target = ?self . processing_target, "Resuming backfill sync" ) ;
@@ -334,6 +337,48 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
334
337
err : RpcResponseError ,
335
338
) -> Result < ( ) , BackFillError > {
336
339
if let Some ( batch) = self . batches . get_mut ( & batch_id) {
340
+ if let RpcResponseError :: BlockComponentCouplingError ( coupling_error) = & err {
341
+ match coupling_error {
342
+ CouplingError :: PeerFailure {
343
+ error,
344
+ faulty_peers,
345
+ action,
346
+ } => {
347
+ debug ! ( ?batch_id, error, "Block components coupling error" ) ;
348
+ // Note: we don't fail the batch here because a `CouplingError` is
349
+ // recoverable by requesting from other honest peers.
350
+ let mut failed_columns = HashSet :: new ( ) ;
351
+ let mut failed_peers = HashSet :: new ( ) ;
352
+ for ( column, peer) in faulty_peers {
353
+ failed_columns. insert ( * column) ;
354
+ failed_peers. insert ( * peer) ;
355
+ }
356
+ for peer in failed_peers. iter ( ) {
357
+ network. report_peer ( * peer, * action, "failed to return columns" ) ;
358
+ }
359
+
360
+ return self . retry_partial_batch (
361
+ network,
362
+ batch_id,
363
+ request_id,
364
+ failed_columns,
365
+ failed_peers,
366
+ ) ;
367
+ }
368
+ CouplingError :: ExceededMaxRetries ( peers, action) => {
369
+ for peer in peers. iter ( ) {
370
+ network. report_peer (
371
+ * peer,
372
+ * action,
373
+ "failed to return columns, exceeded retry attempts" ,
374
+ ) ;
375
+ }
376
+ }
377
+ CouplingError :: InternalError ( msg) => {
378
+ debug ! ( ?batch_id, msg, "Block components coupling internal error" ) ;
379
+ }
380
+ }
381
+ }
337
382
// A batch could be retried without the peer failing the request (disconnecting/
338
383
// sending an error /timeout) if the peer is removed from the chain for other
339
384
// reasons. Check that this block belongs to the expected peer
@@ -903,12 +948,16 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
903
948
network : & mut SyncNetworkContext < T > ,
904
949
batch_id : BatchId ,
905
950
) -> Result < ( ) , BackFillError > {
951
+ if matches ! ( self . state( ) , BackFillState :: Paused ) {
952
+ return Err ( BackFillError :: Paused ) ;
953
+ }
906
954
if let Some ( batch) = self . batches . get_mut ( & batch_id) {
955
+ debug ! ( ?batch_id, "Sending backfill batch" ) ;
907
956
let synced_peers = self
908
957
. network_globals
909
958
. peers
910
959
. read ( )
911
- . synced_peers ( )
960
+ . synced_peers_for_epoch ( batch_id , None )
912
961
. cloned ( )
913
962
. collect :: < HashSet < _ > > ( ) ;
914
963
@@ -967,6 +1016,54 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
967
1016
Ok ( ( ) )
968
1017
}
969
1018
1019
+ /// Retries partial column requests within the batch by creating new requests for the failed columns.
1020
+ #[ instrument( parent = None ,
1021
+ fields( service = "backfill_sync" ) ,
1022
+ name = "backfill_sync" ,
1023
+ skip_all
1024
+ ) ]
1025
+ pub fn retry_partial_batch (
1026
+ & mut self ,
1027
+ network : & mut SyncNetworkContext < T > ,
1028
+ batch_id : BatchId ,
1029
+ id : Id ,
1030
+ failed_columns : HashSet < ColumnIndex > ,
1031
+ mut failed_peers : HashSet < PeerId > ,
1032
+ ) -> Result < ( ) , BackFillError > {
1033
+ if let Some ( batch) = self . batches . get_mut ( & batch_id) {
1034
+ failed_peers. extend ( & batch. failed_peers ( ) ) ;
1035
+ let req = batch. to_blocks_by_range_request ( ) . 0 ;
1036
+
1037
+ let synced_peers = network
1038
+ . network_globals ( )
1039
+ . peers
1040
+ . read ( )
1041
+ . synced_peers ( )
1042
+ . cloned ( )
1043
+ . collect :: < HashSet < _ > > ( ) ;
1044
+
1045
+ match network. retry_columns_by_range (
1046
+ id,
1047
+ & synced_peers,
1048
+ & failed_peers,
1049
+ req,
1050
+ & failed_columns,
1051
+ ) {
1052
+ Ok ( _) => {
1053
+ debug ! (
1054
+ ?batch_id,
1055
+ id, "Retried column requests from different peers"
1056
+ ) ;
1057
+ return Ok ( ( ) ) ;
1058
+ }
1059
+ Err ( e) => {
1060
+ debug ! ( ?batch_id, id, e, "Failed to retry partial batch" ) ;
1061
+ }
1062
+ }
1063
+ }
1064
+ Ok ( ( ) )
1065
+ }
1066
+
970
1067
/// When resuming a chain, this function searches for batches that need to be re-downloaded and
971
1068
/// transitions their state to redownload the batch.
972
1069
#[ instrument( parent = None ,
@@ -1057,6 +1154,11 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
1057
1154
return None ;
1058
1155
}
1059
1156
1157
+ if !self . good_peers_on_sampling_subnets ( self . to_be_downloaded , network) {
1158
+ debug ! ( "Waiting for peers to be available on custody column subnets" ) ;
1159
+ return None ;
1160
+ }
1161
+
1060
1162
let batch_id = self . to_be_downloaded ;
1061
1163
// this batch could have been included already being an optimistic batch
1062
1164
match self . batches . entry ( batch_id) {
@@ -1089,6 +1191,36 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
1089
1191
}
1090
1192
}
1091
1193
1194
+ /// Checks all sampling column subnets for peers. Returns `true` if there is at least one peer in
1195
+ /// every sampling column subnet.
1196
+ ///
1197
+ /// Returns `true` if peerdas isn't enabled for the epoch.
1198
+ fn good_peers_on_sampling_subnets (
1199
+ & self ,
1200
+ epoch : Epoch ,
1201
+ network : & SyncNetworkContext < T > ,
1202
+ ) -> bool {
1203
+ if network. chain . spec . is_peer_das_enabled_for_epoch ( epoch) {
1204
+ // Require peers on all sampling column subnets before sending batches
1205
+ let peers_on_all_custody_subnets = network
1206
+ . network_globals ( )
1207
+ . sampling_subnets ( )
1208
+ . iter ( )
1209
+ . all ( |subnet_id| {
1210
+ let peer_count = network
1211
+ . network_globals ( )
1212
+ . peers
1213
+ . read ( )
1214
+ . good_range_sync_custody_subnet_peers ( * subnet_id)
1215
+ . count ( ) ;
1216
+ peer_count > 0
1217
+ } ) ;
1218
+ peers_on_all_custody_subnets
1219
+ } else {
1220
+ true
1221
+ }
1222
+ }
1223
+
1092
1224
/// Resets the start epoch based on the beacon chain.
1093
1225
///
1094
1226
/// This errors if the beacon chain indicates that backfill sync has already completed or is
0 commit comments