Skip to content

Commit 24b6e7c

Browse files
jgallaghercharliepark
authored andcommitted
[sled-agent] Don't block HardwareMonitor when starting the switch zone if MGS isn't reachable (#9002)
See #8970 for context. This doesn't fix _all_ the ways we could get wedged if the switch zone is unhealthy after starting it up, but it does fix the case where it's so broken not even MGS is functional. The changes here are pretty precarious (and took a bunch of retries to get something that worked!). I ran them on dublin while doing some testing for #8480, and was successfully able to start and stop the switch zone even if the sidecar was powered off and MGS was unresponsive.
1 parent 0a362de commit 24b6e7c

File tree

4 files changed

+184
-54
lines changed

4 files changed

+184
-54
lines changed

sled-agent/src/bootstrap/early_networking.rs

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ use omicron_common::api::internal::shared::{
3232
};
3333
use omicron_common::backoff::{
3434
BackoffError, ExponentialBackoff, ExponentialBackoffBuilder, retry_notify,
35-
retry_policy_local,
3635
};
3736
use omicron_ddm_admin_client::DdmError;
3837
use oxnet::IpNet;
3938
use slog::Logger;
39+
use slog_error_chain::InlineErrorChain;
4040
use std::collections::{HashMap, HashSet};
4141
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6};
4242
use std::time::{Duration, Instant};
@@ -376,25 +376,16 @@ impl<'a> EarlyNetworkSetup<'a> {
376376
&format!("http://[{}]:{}", switch_zone_underlay_ip, MGS_PORT),
377377
self.log.new(o!("component" => "MgsClient")),
378378
);
379-
let switch_slot = retry_notify(
380-
retry_policy_local(),
381-
|| async {
382-
mgs_client
383-
.sp_local_switch_id()
384-
.await
385-
.map_err(BackoffError::transient)
386-
.map(|response| response.into_inner().slot)
387-
},
388-
|error, delay| {
389-
warn!(
390-
self.log,
391-
"Failed to get switch ID from MGS (retrying in {delay:?})";
392-
"error" => ?error,
393-
);
394-
},
395-
)
396-
.await
397-
.expect("Expected an infinite retry loop getting our switch ID");
379+
let switch_slot = mgs_client
380+
.sp_local_switch_id()
381+
.await
382+
.map_err(|err| {
383+
EarlyNetworkSetupError::Mgs(format!(
384+
"failed to determine local switch ID via MGS: {}",
385+
InlineErrorChain::new(&err)
386+
))
387+
})?
388+
.slot;
398389

399390
let switch_location = match switch_slot {
400391
0 => SwitchLocation::Switch0,

sled-agent/src/http_entrypoints.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,7 @@ impl SledAgentApi for SledAgentImpl {
10641064
// policy to "off" if this request came from our switch zone;
10651065
// i.e., only allow disabling the switch zone if we have at
10661066
// least some evidence that the _other_ switch zone is up.
1067-
let (our_switch_zone_ip, _) = sa.switch_zone_underlay_info();
1067+
let our_switch_zone_ip = sa.switch_zone_underlay_info().ip;
10681068
if request_context.request.remote_addr().ip()
10691069
== our_switch_zone_ip
10701070
{

sled-agent/src/services.rs

Lines changed: 166 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ use slog_error_chain::InlineErrorChain;
107107
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
108108
use std::sync::atomic::{AtomicBool, Ordering};
109109
use std::sync::{Arc, OnceLock};
110+
use std::time::Duration;
110111
use tokio::io::AsyncWriteExt;
111112
use tokio::sync::Mutex;
112113
use tokio::sync::oneshot;
@@ -310,6 +311,14 @@ impl From<Error> for omicron_common::api::external::Error {
310311
}
311312
}
312313

314+
/// Information describing the underlay network, used when activating the switch
315+
/// zone.
316+
#[derive(Debug, Clone)]
317+
pub struct UnderlayInfo {
318+
pub ip: Ipv6Addr,
319+
pub rack_network_config: Option<RackNetworkConfig>,
320+
}
321+
313322
fn display_zone_init_errors(errors: &[(String, Box<Error>)]) -> String {
314323
if errors.len() == 1 {
315324
return format!(
@@ -550,6 +559,9 @@ enum SwitchZoneState {
550559
request: SwitchZoneConfig,
551560
// The currently running zone
552561
zone: Box<RunningZone>,
562+
// A background task which keeps looping until the zone's uplinks are
563+
// configured.
564+
worker: Option<Task>,
553565
},
554566
}
555567

@@ -3151,7 +3163,7 @@ impl ServiceManager {
31513163
&self,
31523164
// If we're reconfiguring the switch zone with an underlay address, we
31533165
// also need the rack network config to set tfport uplinks.
3154-
underlay_info: Option<(Ipv6Addr, Option<&RackNetworkConfig>)>,
3166+
underlay_info: Option<UnderlayInfo>,
31553167
baseboard: Baseboard,
31563168
) -> Result<(), Error> {
31573169
info!(self.inner.log, "Ensuring scrimlet services (enabling services)");
@@ -3238,33 +3250,98 @@ impl ServiceManager {
32383250
}
32393251
};
32403252

3241-
let mut addresses =
3242-
if let Some((ip, _)) = underlay_info { vec![ip] } else { vec![] };
3253+
let mut addresses = if let Some(info) = &underlay_info {
3254+
vec![info.ip]
3255+
} else {
3256+
vec![]
3257+
};
32433258
addresses.push(Ipv6Addr::LOCALHOST);
32443259

32453260
let request =
32463261
SwitchZoneConfig { id: Uuid::new_v4(), addresses, services };
32473262

32483263
self.ensure_switch_zone(
3249-
// request=
32503264
Some(request),
3251-
// filesystems=
32523265
filesystems,
3253-
// data_links=
32543266
data_links,
3267+
underlay_info,
32553268
)
32563269
.await?;
32573270

3258-
// If we've given the switch an underlay address, we also need to inject
3259-
// SMF properties so that tfport uplinks can be created.
3260-
if let Some((ip, Some(rack_network_config))) = underlay_info {
3261-
self.ensure_switch_zone_uplinks_configured(ip, rack_network_config)
3262-
.await?;
3263-
}
3264-
32653271
Ok(())
32663272
}
32673273

3274+
// Retry ensuring switch zone uplinks until success or we're told to stop.
3275+
//
3276+
// TODO-correctness: This is not in great shape, and may get stuck in an
3277+
// infinite retry loop _within_ one attempt, or may succeed even if it
3278+
// didn't fully configure all switch zone services. See
3279+
// <https://github.com/oxidecomputer/omicron/issues/8970> for details.
3280+
async fn ensure_switch_zone_uplinks_configured_loop(
3281+
&self,
3282+
underlay_info: &UnderlayInfo,
3283+
mut exit_rx: oneshot::Receiver<()>,
3284+
) {
3285+
// We don't really expect failures trying to initialize the switch zone
3286+
// unless something is unhealthy. This timeout is somewhat arbitrary,
3287+
// but we probably don't want to use backoff here.
3288+
const RETRY_DELAY: Duration = Duration::from_secs(1);
3289+
3290+
// We can only ensure uplinks if we have a rack network config.
3291+
//
3292+
// It'd be surprising to have `underlay_info` containing our underlay IP
3293+
// without a rack network config, but it is technically possible. These
3294+
// bits of information are ledgered separately, and we could have
3295+
// ledgered our underlay IP, then crashed before the bootstore gossip
3296+
// happened to tell us the rack network config, and are now restarting.
3297+
let Some(rack_network_config) = &underlay_info.rack_network_config
3298+
else {
3299+
return;
3300+
};
3301+
3302+
loop {
3303+
match self
3304+
.ensure_switch_zone_uplinks_configured(
3305+
underlay_info.ip,
3306+
&rack_network_config,
3307+
)
3308+
.await
3309+
{
3310+
Ok(()) => {
3311+
info!(self.inner.log, "configured switch zone uplinks");
3312+
break;
3313+
}
3314+
Err(e) => {
3315+
warn!(
3316+
self.inner.log,
3317+
"Failed to configure switch zone uplinks";
3318+
InlineErrorChain::new(&e),
3319+
);
3320+
}
3321+
}
3322+
3323+
tokio::select! {
3324+
// If we've been told to stop trying, bail.
3325+
_ = &mut exit_rx => {
3326+
info!(
3327+
self.inner.log,
3328+
"instructed to give up on switch zone uplink \
3329+
configuration",
3330+
);
3331+
return;
3332+
}
3333+
3334+
_ = tokio::time::sleep(RETRY_DELAY) => {
3335+
info!(
3336+
self.inner.log,
3337+
"retrying switch zone uplink configuration",
3338+
);
3339+
continue;
3340+
}
3341+
};
3342+
}
3343+
}
3344+
32683345
// Ensure our switch zone (at the given IP address) has its uplinks
32693346
// configured based on `rack_network_config`. This first requires us to ask
32703347
// MGS running in the switch zone which switch we are, so we know which
@@ -3409,6 +3486,8 @@ impl ServiceManager {
34093486
vec![],
34103487
// data_links=
34113488
vec![],
3489+
// underlay_info=
3490+
None,
34123491
)
34133492
.await
34143493
}
@@ -3427,6 +3506,7 @@ impl ServiceManager {
34273506
request: SwitchZoneConfig,
34283507
filesystems: Vec<zone::Fs>,
34293508
data_links: Vec<String>,
3509+
underlay_info: Option<UnderlayInfo>,
34303510
) {
34313511
let (exit_tx, exit_rx) = oneshot::channel();
34323512
*zone = SwitchZoneState::Initializing {
@@ -3436,7 +3516,8 @@ impl ServiceManager {
34363516
worker: Some(Task {
34373517
exit_tx,
34383518
initializer: tokio::task::spawn(async move {
3439-
self.initialize_switch_zone_loop(exit_rx).await
3519+
self.initialize_switch_zone_loop(underlay_info, exit_rx)
3520+
.await
34403521
}),
34413522
}),
34423523
};
@@ -3448,6 +3529,7 @@ impl ServiceManager {
34483529
request: Option<SwitchZoneConfig>,
34493530
filesystems: Vec<zone::Fs>,
34503531
data_links: Vec<String>,
3532+
underlay_info: Option<UnderlayInfo>,
34513533
) -> Result<(), Error> {
34523534
let log = &self.inner.log;
34533535

@@ -3462,6 +3544,7 @@ impl ServiceManager {
34623544
request,
34633545
filesystems,
34643546
data_links,
3547+
underlay_info,
34653548
);
34663549
}
34673550
(
@@ -3473,9 +3556,10 @@ impl ServiceManager {
34733556
// the next request with our new request.
34743557
*request = new_request;
34753558
}
3476-
(SwitchZoneState::Running { request, zone }, Some(new_request))
3477-
if request.addresses != new_request.addresses =>
3478-
{
3559+
(
3560+
SwitchZoneState::Running { request, zone, worker },
3561+
Some(new_request),
3562+
) if request.addresses != new_request.addresses => {
34793563
// If the switch zone is running but we have new addresses, it
34803564
// means we're moving from the bootstrap to the underlay
34813565
// network. We need to add an underlay address and route in the
@@ -3511,8 +3595,8 @@ impl ServiceManager {
35113595
);
35123596
}
35133597

3514-
// When the request addresses have changed this means the underlay is
3515-
// available now as well.
3598+
// When the request addresses have changed this means the
3599+
// underlay is available now as well.
35163600
if let Some(info) = self.inner.sled_info.get() {
35173601
info!(
35183602
self.inner.log,
@@ -3823,6 +3907,26 @@ impl ServiceManager {
38233907
}
38243908
}
38253909
}
3910+
3911+
// We also need to ensure any uplinks are configured. Spawn a
3912+
// task that goes into an infinite retry loop until it succeeds.
3913+
if let Some(underlay_info) = underlay_info {
3914+
if let Some(old_worker) = worker.take() {
3915+
old_worker.stop().await;
3916+
}
3917+
let me = self.clone();
3918+
let (exit_tx, exit_rx) = oneshot::channel();
3919+
*worker = Some(Task {
3920+
exit_tx,
3921+
initializer: tokio::task::spawn(async move {
3922+
me.ensure_switch_zone_uplinks_configured_loop(
3923+
&underlay_info,
3924+
exit_rx,
3925+
)
3926+
.await;
3927+
}),
3928+
});
3929+
}
38263930
}
38273931
(SwitchZoneState::Running { .. }, Some(_)) => {
38283932
info!(log, "Enabling {zone_typestr} zone (already complete)");
@@ -3894,6 +3998,7 @@ impl ServiceManager {
38943998
*sled_zone = SwitchZoneState::Running {
38953999
request: request.clone(),
38964000
zone: Box::new(zone),
4001+
worker: None,
38974002
};
38984003
Ok(())
38994004
}
@@ -3902,29 +4007,62 @@ impl ServiceManager {
39024007
// inititalized, or it has been told to stop.
39034008
async fn initialize_switch_zone_loop(
39044009
&self,
4010+
underlay_info: Option<UnderlayInfo>,
39054011
mut exit_rx: oneshot::Receiver<()>,
39064012
) {
4013+
// We don't really expect failures trying to initialize the switch zone
4014+
// unless something is unhealthy. This timeout is somewhat arbitrary,
4015+
// but we probably don't want to use backoff here.
4016+
const RETRY_DELAY: Duration = Duration::from_secs(1);
4017+
4018+
// First, go into a loop to bring up the switch zone; retry until we
4019+
// succeed or are told to give up via `exit_rx`.
39074020
loop {
39084021
{
39094022
let mut sled_zone = self.inner.switch_zone.lock().await;
39104023
match self.try_initialize_switch_zone(&mut sled_zone).await {
3911-
Ok(()) => return,
3912-
Err(e) => warn!(
3913-
self.inner.log,
3914-
"Failed to initialize switch zone: {e}"
3915-
),
4024+
Ok(()) => {
4025+
info!(self.inner.log, "initialized switch zone");
4026+
break;
4027+
}
4028+
Err(e) => {
4029+
warn!(
4030+
self.inner.log, "Failed to initialize switch zone";
4031+
InlineErrorChain::new(&e),
4032+
);
4033+
}
39164034
}
39174035
}
39184036

39194037
tokio::select! {
39204038
// If we've been told to stop trying, bail.
3921-
_ = &mut exit_rx => return,
4039+
_ = &mut exit_rx => {
4040+
info!(
4041+
self.inner.log,
4042+
"instructed to give up on switch zone initialization",
4043+
);
4044+
return;
4045+
}
39224046

3923-
// Poll for the device every second - this timeout is somewhat
3924-
// arbitrary, but we probably don't want to use backoff here.
3925-
_ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => (),
4047+
_ = tokio::time::sleep(RETRY_DELAY) => {
4048+
info!(
4049+
self.inner.log,
4050+
"retrying switch zone initialization",
4051+
);
4052+
continue;
4053+
}
39264054
};
39274055
}
4056+
4057+
// Then, if we have underlay info, go into a loop trying to configure
4058+
// our uplinks. As above, retry until we succeed or are told to stop.
4059+
if let Some(underlay_info) = underlay_info {
4060+
self.ensure_switch_zone_uplinks_configured_loop(
4061+
&underlay_info,
4062+
exit_rx,
4063+
)
4064+
.await;
4065+
}
39284066
}
39294067
}
39304068

0 commit comments

Comments
 (0)