diff --git a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
index bb2c6799f1d..d073d19ebca 100644
--- a/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
+++ b/beacon_node/network/src/sync/custody_backfill_sync/mod.rs
@@ -114,9 +114,12 @@ pub struct CustodyBackFillSync<T: BeaconChainTypes> {
     skipped_batches: HashSet<Epoch>,
 
     /// When a custody backfill sync fails, we keep track of whether a new fully synced peer has joined.
-    /// This signifies that we are able to attempt to restart a failed chain.
+    /// This signifies that we are able to attempt to restart a failed sync.
     restart_failed_sync: bool,
 
+    /// Indicates that the custody backfill sync has failed and is waiting to be retried.
+    failed_sync: bool,
+
     /// Reference to the beacon chain to obtain initial starting points for custody backfill sync.
     beacon_chain: Arc<BeaconChain<T>>,
 
@@ -142,6 +145,7 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
             current_processing_batch: None,
             validated_batches: 0,
             restart_failed_sync: false,
+            failed_sync: false,
             beacon_chain,
             network_globals,
         }
     }
@@ -202,6 +206,7 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
         self.batches.clear();
         self.skipped_batches.clear();
         self.restart_failed_sync = false;
+        self.failed_sync = false;
 
         // Reset all downloading and processing targets
         // NOTE: Lets keep validated_batches for posterity
@@ -244,12 +249,16 @@
                 }
 
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
             }
             CustodyBackFillState::Pending(_) | CustodyBackFillState::Completed => {
                 if self.check_completed() {
+                    self.failed_sync = false;
+                    self.restart_failed_sync = false;
                     self.set_state(CustodyBackFillState::Completed);
                     return Ok(SyncStart::NotSyncing);
                 }
@@ -258,7 +267,18 @@
        if !self.should_start_custody_backfill_sync() {
            return Ok(SyncStart::NotSyncing);
        }
-        self.set_start_epoch();
+
+        // If the last custody backfill attempt failed, only restart once a new fully
+        // synced peer has joined and set `restart_failed_sync`.
+        if self.failed_sync {
+            if !self.restart_failed_sync {
+                return Ok(SyncStart::NotSyncing);
+            }
+            // We can now safely restart a failed sync with a fresh run id.
+            self.restart_sync();
+        } else {
+            self.set_start_epoch();
+        }
        if self
            .network_globals
            .peers
@@ -734,7 +754,6 @@ impl<T: BeaconChainTypes> CustodyBackFillSync<T> {
            "Custody backfill sync completed"
        );
        self.batches.clear();
-        self.restart_failed_sync = false;
        self.processing_target = self.current_start;
        self.to_be_downloaded = self.current_start;
        self.last_batch_downloaded = false;
@@ -1089,11 +1108,15 @@
            return Ok(());
        }
 
+        // Mark this sync as failed and wait for a new fully synced peer before restarting.
+        self.failed_sync = true;
+        self.restart_failed_sync = false;
+
        // Set the state
        self.pause("Sync has failed".to_string());
        // Remove all batches and active requests.
        self.batches.clear();
-        self.restart_failed_sync = false;
+        self.skipped_batches.clear();
 
        // Reset all downloading and processing targets
        // NOTE: Lets keep validated_batches for posterity
@@ -1101,7 +1124,6 @@
        self.to_be_downloaded = self.current_start;
        self.last_batch_downloaded = false;
        self.current_processing_batch = None;
-        self.restart_sync();
 
        Err(error)
    }
@@ -1116,10 +1138,10 @@
    }
 
    /// A fully synced peer has joined us.
-    /// If we are in a failed state, update a local variable to indicate we are able to restart
-    /// the failed sync on the next attempt.
+    /// If the last custody backfill sync failed, update a local variable to indicate we are able
+    /// to restart the failed sync on the next attempt.
    pub fn fully_synced_peer_joined(&mut self) {
-        if matches!(self.state(), CustodyBackFillState::Pending(_)) {
+        if self.failed_sync && matches!(self.state(), CustodyBackFillState::Pending(_)) {
            self.restart_failed_sync = true;
        }
    }
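Taken together, `failed_sync` and `restart_failed_sync` form a small retry gate: a failed custody backfill parks in `Pending` with the restart flag disarmed, a fully synced peer joining afterwards arms the flag, and only then may the next start attempt retry. The sketch below models that gating in isolation; `SyncState`, `CustodySync`, and the method bodies here are simplified stand-ins for illustration, not the real Lighthouse types.

// A minimal model of the retry gating in the diff above. `SyncState` and
// `CustodySync` are assumed stand-in names, not the real Lighthouse types.

#[derive(PartialEq)]
enum SyncState {
    Pending,
    Syncing,
}

struct CustodySync {
    state: SyncState,
    /// Set when a sync attempt fails; cleared once the sync restarts.
    failed_sync: bool,
    /// Armed when a fully synced peer joins while a failure is outstanding.
    restart_failed_sync: bool,
}

impl CustodySync {
    /// Failure path: mark the sync failed and disarm the restart flag, so
    /// only a peer that joins after the failure can re-arm it.
    fn fail(&mut self) {
        self.failed_sync = true;
        self.restart_failed_sync = false;
        self.state = SyncState::Pending;
    }

    /// Peer-join path: arm the restart flag only if the previous attempt
    /// actually failed and the sync is idle.
    fn fully_synced_peer_joined(&mut self) {
        if self.failed_sync && self.state == SyncState::Pending {
            self.restart_failed_sync = true;
        }
    }

    /// Start path: a failed sync is retried only once the flag is armed.
    fn try_start(&mut self) -> bool {
        if self.failed_sync && !self.restart_failed_sync {
            return false; // stay idle until a new fully synced peer joins
        }
        self.failed_sync = false;
        self.restart_failed_sync = false;
        self.state = SyncState::Syncing;
        true
    }
}

fn main() {
    let mut sync = CustodySync {
        state: SyncState::Pending,
        failed_sync: false,
        restart_failed_sync: false,
    };
    assert!(sync.try_start()); // a fresh sync starts immediately
    sync.fail();
    assert!(!sync.try_start()); // failed: blocked until a new peer joins
    sync.fully_synced_peer_joined();
    assert!(sync.try_start()); // retry now permitted
}

Disarming `restart_failed_sync` at the moment of failure means only peers observed after the failure can trigger a retry, so a sync cannot immediately restart against the same peer set that just failed it.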