@@ -107,6 +107,7 @@ use slog_error_chain::InlineErrorChain;
107
107
use std:: net:: { IpAddr , Ipv6Addr , SocketAddr } ;
108
108
use std:: sync:: atomic:: { AtomicBool , Ordering } ;
109
109
use std:: sync:: { Arc , OnceLock } ;
110
+ use std:: time:: Duration ;
110
111
use tokio:: io:: AsyncWriteExt ;
111
112
use tokio:: sync:: Mutex ;
112
113
use tokio:: sync:: oneshot;
@@ -310,6 +311,14 @@ impl From<Error> for omicron_common::api::external::Error {
310
311
}
311
312
}
312
313
314
+ /// Information describing the underlay network, used when activating the switch
315
+ /// zone.
316
+ #[ derive( Debug , Clone ) ]
317
+ pub struct UnderlayInfo {
318
+ pub ip : Ipv6Addr ,
319
+ pub rack_network_config : Option < RackNetworkConfig > ,
320
+ }
321
+
313
322
fn display_zone_init_errors ( errors : & [ ( String , Box < Error > ) ] ) -> String {
314
323
if errors. len ( ) == 1 {
315
324
return format ! (
@@ -550,6 +559,9 @@ enum SwitchZoneState {
550
559
request : SwitchZoneConfig ,
551
560
// The currently running zone
552
561
zone : Box < RunningZone > ,
562
+ // A background task which keeps looping until the zone's uplinks are
563
+ // configured.
564
+ worker : Option < Task > ,
553
565
} ,
554
566
}
555
567
@@ -3151,7 +3163,7 @@ impl ServiceManager {
3151
3163
& self ,
3152
3164
// If we're reconfiguring the switch zone with an underlay address, we
3153
3165
// also need the rack network config to set tfport uplinks.
3154
- underlay_info : Option < ( Ipv6Addr , Option < & RackNetworkConfig > ) > ,
3166
+ underlay_info : Option < UnderlayInfo > ,
3155
3167
baseboard : Baseboard ,
3156
3168
) -> Result < ( ) , Error > {
3157
3169
info ! ( self . inner. log, "Ensuring scrimlet services (enabling services)" ) ;
@@ -3238,33 +3250,98 @@ impl ServiceManager {
3238
3250
}
3239
3251
} ;
3240
3252
3241
- let mut addresses =
3242
- if let Some ( ( ip, _) ) = underlay_info { vec ! [ ip] } else { vec ! [ ] } ;
3253
+ let mut addresses = if let Some ( info) = & underlay_info {
3254
+ vec ! [ info. ip]
3255
+ } else {
3256
+ vec ! [ ]
3257
+ } ;
3243
3258
addresses. push ( Ipv6Addr :: LOCALHOST ) ;
3244
3259
3245
3260
let request =
3246
3261
SwitchZoneConfig { id : Uuid :: new_v4 ( ) , addresses, services } ;
3247
3262
3248
3263
self . ensure_switch_zone (
3249
- // request=
3250
3264
Some ( request) ,
3251
- // filesystems=
3252
3265
filesystems,
3253
- // data_links=
3254
3266
data_links,
3267
+ underlay_info,
3255
3268
)
3256
3269
. await ?;
3257
3270
3258
- // If we've given the switch an underlay address, we also need to inject
3259
- // SMF properties so that tfport uplinks can be created.
3260
- if let Some ( ( ip, Some ( rack_network_config) ) ) = underlay_info {
3261
- self . ensure_switch_zone_uplinks_configured ( ip, rack_network_config)
3262
- . await ?;
3263
- }
3264
-
3265
3271
Ok ( ( ) )
3266
3272
}
3267
3273
3274
+ // Retry ensuring switch zone uplinks until success or we're told to stop.
3275
+ //
3276
+ // TODO-correctness: This is not in great shape, and may get stuck in an
3277
+ // infinite retry loop _within_ one attempt, or may succeed even if it
3278
+ // didn't fully configure all switch zone services. See
3279
+ // <https://github.com/oxidecomputer/omicron/issues/8970> for details.
3280
+ async fn ensure_switch_zone_uplinks_configured_loop (
3281
+ & self ,
3282
+ underlay_info : & UnderlayInfo ,
3283
+ mut exit_rx : oneshot:: Receiver < ( ) > ,
3284
+ ) {
3285
+ // We don't really expect failures trying to initialize the switch zone
3286
+ // unless something is unhealthy. This timeout is somewhat arbitrary,
3287
+ // but we probably don't want to use backoff here.
3288
+ const RETRY_DELAY : Duration = Duration :: from_secs ( 1 ) ;
3289
+
3290
+ // We can only ensure uplinks if we have a rack network config.
3291
+ //
3292
+ // It'd be surprising to have `underlay_info` containing our underlay IP
3293
+ // without a rack network config, but it is technically possible. These
3294
+ // bits of information are ledgered separately, and we could have
3295
+ // ledgered our underlay IP, then crashed before the bootstore gossip
3296
+ // happened to tell us the rack network config, and are now restarting.
3297
+ let Some ( rack_network_config) = & underlay_info. rack_network_config
3298
+ else {
3299
+ return ;
3300
+ } ;
3301
+
3302
+ loop {
3303
+ match self
3304
+ . ensure_switch_zone_uplinks_configured (
3305
+ underlay_info. ip ,
3306
+ & rack_network_config,
3307
+ )
3308
+ . await
3309
+ {
3310
+ Ok ( ( ) ) => {
3311
+ info ! ( self . inner. log, "configured switch zone uplinks" ) ;
3312
+ break ;
3313
+ }
3314
+ Err ( e) => {
3315
+ warn ! (
3316
+ self . inner. log,
3317
+ "Failed to configure switch zone uplinks" ;
3318
+ InlineErrorChain :: new( & e) ,
3319
+ ) ;
3320
+ }
3321
+ }
3322
+
3323
+ tokio:: select! {
3324
+ // If we've been told to stop trying, bail.
3325
+ _ = & mut exit_rx => {
3326
+ info!(
3327
+ self . inner. log,
3328
+ "instructed to give up on switch zone uplink \
3329
+ configuration",
3330
+ ) ;
3331
+ return ;
3332
+ }
3333
+
3334
+ _ = tokio:: time:: sleep( RETRY_DELAY ) => {
3335
+ info!(
3336
+ self . inner. log,
3337
+ "retrying switch zone uplink configuration" ,
3338
+ ) ;
3339
+ continue ;
3340
+ }
3341
+ } ;
3342
+ }
3343
+ }
3344
+
3268
3345
// Ensure our switch zone (at the given IP address) has its uplinks
3269
3346
// configured based on `rack_network_config`. This first requires us to ask
3270
3347
// MGS running in the switch zone which switch we are, so we know which
@@ -3409,6 +3486,8 @@ impl ServiceManager {
3409
3486
vec ! [ ] ,
3410
3487
// data_links=
3411
3488
vec ! [ ] ,
3489
+ // underlay_info=
3490
+ None ,
3412
3491
)
3413
3492
. await
3414
3493
}
@@ -3427,6 +3506,7 @@ impl ServiceManager {
3427
3506
request : SwitchZoneConfig ,
3428
3507
filesystems : Vec < zone:: Fs > ,
3429
3508
data_links : Vec < String > ,
3509
+ underlay_info : Option < UnderlayInfo > ,
3430
3510
) {
3431
3511
let ( exit_tx, exit_rx) = oneshot:: channel ( ) ;
3432
3512
* zone = SwitchZoneState :: Initializing {
@@ -3436,7 +3516,8 @@ impl ServiceManager {
3436
3516
worker : Some ( Task {
3437
3517
exit_tx,
3438
3518
initializer : tokio:: task:: spawn ( async move {
3439
- self . initialize_switch_zone_loop ( exit_rx) . await
3519
+ self . initialize_switch_zone_loop ( underlay_info, exit_rx)
3520
+ . await
3440
3521
} ) ,
3441
3522
} ) ,
3442
3523
} ;
@@ -3448,6 +3529,7 @@ impl ServiceManager {
3448
3529
request : Option < SwitchZoneConfig > ,
3449
3530
filesystems : Vec < zone:: Fs > ,
3450
3531
data_links : Vec < String > ,
3532
+ underlay_info : Option < UnderlayInfo > ,
3451
3533
) -> Result < ( ) , Error > {
3452
3534
let log = & self . inner . log ;
3453
3535
@@ -3462,6 +3544,7 @@ impl ServiceManager {
3462
3544
request,
3463
3545
filesystems,
3464
3546
data_links,
3547
+ underlay_info,
3465
3548
) ;
3466
3549
}
3467
3550
(
@@ -3473,9 +3556,10 @@ impl ServiceManager {
3473
3556
// the next request with our new request.
3474
3557
* request = new_request;
3475
3558
}
3476
- ( SwitchZoneState :: Running { request, zone } , Some ( new_request) )
3477
- if request. addresses != new_request. addresses =>
3478
- {
3559
+ (
3560
+ SwitchZoneState :: Running { request, zone, worker } ,
3561
+ Some ( new_request) ,
3562
+ ) if request. addresses != new_request. addresses => {
3479
3563
// If the switch zone is running but we have new addresses, it
3480
3564
// means we're moving from the bootstrap to the underlay
3481
3565
// network. We need to add an underlay address and route in the
@@ -3511,8 +3595,8 @@ impl ServiceManager {
3511
3595
) ;
3512
3596
}
3513
3597
3514
- // When the request addresses have changed this means the underlay is
3515
- // available now as well.
3598
+ // When the request addresses have changed this means the
3599
+ // underlay is available now as well.
3516
3600
if let Some ( info) = self . inner . sled_info . get ( ) {
3517
3601
info ! (
3518
3602
self . inner. log,
@@ -3823,6 +3907,26 @@ impl ServiceManager {
3823
3907
}
3824
3908
}
3825
3909
}
3910
+
3911
+ // We also need to ensure any uplinks are configured. Spawn a
3912
+ // task that goes into an infinite retry loop until it succeeds.
3913
+ if let Some ( underlay_info) = underlay_info {
3914
+ if let Some ( old_worker) = worker. take ( ) {
3915
+ old_worker. stop ( ) . await ;
3916
+ }
3917
+ let me = self . clone ( ) ;
3918
+ let ( exit_tx, exit_rx) = oneshot:: channel ( ) ;
3919
+ * worker = Some ( Task {
3920
+ exit_tx,
3921
+ initializer : tokio:: task:: spawn ( async move {
3922
+ me. ensure_switch_zone_uplinks_configured_loop (
3923
+ & underlay_info,
3924
+ exit_rx,
3925
+ )
3926
+ . await ;
3927
+ } ) ,
3928
+ } ) ;
3929
+ }
3826
3930
}
3827
3931
( SwitchZoneState :: Running { .. } , Some ( _) ) => {
3828
3932
info ! ( log, "Enabling {zone_typestr} zone (already complete)" ) ;
@@ -3894,6 +3998,7 @@ impl ServiceManager {
3894
3998
* sled_zone = SwitchZoneState :: Running {
3895
3999
request : request. clone ( ) ,
3896
4000
zone : Box :: new ( zone) ,
4001
+ worker : None ,
3897
4002
} ;
3898
4003
Ok ( ( ) )
3899
4004
}
@@ -3902,29 +4007,62 @@ impl ServiceManager {
3902
4007
// initialized, or it has been told to stop.
3903
4008
async fn initialize_switch_zone_loop (
3904
4009
& self ,
4010
+ underlay_info : Option < UnderlayInfo > ,
3905
4011
mut exit_rx : oneshot:: Receiver < ( ) > ,
3906
4012
) {
4013
+ // We don't really expect failures trying to initialize the switch zone
4014
+ // unless something is unhealthy. This timeout is somewhat arbitrary,
4015
+ // but we probably don't want to use backoff here.
4016
+ const RETRY_DELAY : Duration = Duration :: from_secs ( 1 ) ;
4017
+
4018
+ // First, go into a loop to bring up the switch zone; retry until we
4019
+ // succeed or are told to give up via `exit_rx`.
3907
4020
loop {
3908
4021
{
3909
4022
let mut sled_zone = self . inner . switch_zone . lock ( ) . await ;
3910
4023
match self . try_initialize_switch_zone ( & mut sled_zone) . await {
3911
- Ok ( ( ) ) => return ,
3912
- Err ( e) => warn ! (
3913
- self . inner. log,
3914
- "Failed to initialize switch zone: {e}"
3915
- ) ,
4024
+ Ok ( ( ) ) => {
4025
+ info ! ( self . inner. log, "initialized switch zone" ) ;
4026
+ break ;
4027
+ }
4028
+ Err ( e) => {
4029
+ warn ! (
4030
+ self . inner. log, "Failed to initialize switch zone" ;
4031
+ InlineErrorChain :: new( & e) ,
4032
+ ) ;
4033
+ }
3916
4034
}
3917
4035
}
3918
4036
3919
4037
tokio:: select! {
3920
4038
// If we've been told to stop trying, bail.
3921
- _ = & mut exit_rx => return ,
4039
+ _ = & mut exit_rx => {
4040
+ info!(
4041
+ self . inner. log,
4042
+ "instructed to give up on switch zone initialization" ,
4043
+ ) ;
4044
+ return ;
4045
+ }
3922
4046
3923
- // Poll for the device every second - this timeout is somewhat
3924
- // arbitrary, but we probably don't want to use backoff here.
3925
- _ = tokio:: time:: sleep( tokio:: time:: Duration :: from_secs( 1 ) ) => ( ) ,
4047
+ _ = tokio:: time:: sleep( RETRY_DELAY ) => {
4048
+ info!(
4049
+ self . inner. log,
4050
+ "retrying switch zone initialization" ,
4051
+ ) ;
4052
+ continue ;
4053
+ }
3926
4054
} ;
3927
4055
}
4056
+
4057
+ // Then, if we have underlay info, go into a loop trying to configure
4058
+ // our uplinks. As above, retry until we succeed or are told to stop.
4059
+ if let Some ( underlay_info) = underlay_info {
4060
+ self . ensure_switch_zone_uplinks_configured_loop (
4061
+ & underlay_info,
4062
+ exit_rx,
4063
+ )
4064
+ . await ;
4065
+ }
3928
4066
}
3929
4067
}
3930
4068
0 commit comments