Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4e6a2a3
add HardwareMonitorHandle and add it to long running tasks
jgallagher Aug 28, 2025
131bc2e
remove TofinoManager; separate caching of tofino_loaded from ServiceM…
jgallagher Aug 28, 2025
b4bb460
check policy when deciding whether to start or stop switch zone
jgallagher Aug 28, 2025
64db578
listen for policy changes
jgallagher Aug 28, 2025
73af35a
add sled-agent endpoints to get/put switch zone policy
jgallagher Aug 29, 2025
2a8a5cb
openapi
jgallagher Aug 29, 2025
bd24c9e
clippy and update server API versions
jgallagher Aug 29, 2025
d72b7d0
add omdb subcommands to control switch zone policy
jgallagher Aug 29, 2025
6c3b861
reject "switch zone disable" requests from our own switch zone
jgallagher Aug 29, 2025
88bdb3a
Merge remote-tracking branch 'origin/main' into john/omdb-switch-zone…
jgallagher Aug 29, 2025
8fb28dd
Merge remote-tracking branch 'origin/main' into john/omdb-switch-zone…
jgallagher Sep 2, 2025
c0b9899
remove set_tofino_loaded() and add args to ensure_switch_zone_activat…
jgallagher Sep 2, 2025
73e3f3f
api docs
jgallagher Sep 2, 2025
8f8d2c0
rustdoc link
jgallagher Sep 2, 2025
2831991
[sled-agent] Move switch zone uplink config into switch zone startup …
jgallagher Sep 2, 2025
ed046b7
Merge remote-tracking branch 'origin/main' into john/switch-zone-upli…
jgallagher Sep 2, 2025
d014309
UnderlayInfo struct instead of tuple
jgallagher Sep 3, 2025
e65e055
extract ensure_switch_zone_uplinks_configured_loop() method
jgallagher Sep 3, 2025
010432a
also ensure uplinks when reconfiguring switch zone
jgallagher Sep 3, 2025
e321d2e
Merge remote-tracking branch 'origin/main' into john/switch-zone-upli…
jgallagher Sep 3, 2025
1029364
unlock mutex to prevent deadlock
jgallagher Sep 3, 2025
d9e3aec
Merge remote-tracking branch 'origin/main' into john/switch-zone-upli…
jgallagher Sep 3, 2025
62d55e1
move underlay uplink config into a worker task
jgallagher Sep 4, 2025
c0c9892
Merge remote-tracking branch 'origin/main' into john/switch-zone-upli…
jgallagher Sep 4, 2025
a6a642b
Merge remote-tracking branch 'origin/main' into john/switch-zone-upli…
jgallagher Sep 5, 2025
3dc1f40
reword comment
jgallagher Sep 17, 2025
1c40048
Merge branch 'main' into john/switch-zone-uplinks-async
jgallagher Sep 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 11 additions & 20 deletions sled-agent/src/bootstrap/early_networking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ use omicron_common::api::internal::shared::{
};
use omicron_common::backoff::{
BackoffError, ExponentialBackoff, ExponentialBackoffBuilder, retry_notify,
retry_policy_local,
};
use omicron_ddm_admin_client::DdmError;
use oxnet::IpNet;
use slog::Logger;
use slog_error_chain::InlineErrorChain;
use std::collections::{HashMap, HashSet};
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddrV6};
use std::time::{Duration, Instant};
Expand Down Expand Up @@ -376,25 +376,16 @@ impl<'a> EarlyNetworkSetup<'a> {
&format!("http://[{}]:{}", switch_zone_underlay_ip, MGS_PORT),
self.log.new(o!("component" => "MgsClient")),
);
let switch_slot = retry_notify(
retry_policy_local(),
|| async {
mgs_client
.sp_local_switch_id()
.await
.map_err(BackoffError::transient)
.map(|response| response.into_inner().slot)
},
|error, delay| {
warn!(
self.log,
"Failed to get switch ID from MGS (retrying in {delay:?})";
"error" => ?error,
);
},
)
.await
.expect("Expected an infinite retry loop getting our switch ID");
let switch_slot = mgs_client
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We no longer retry this forever here, because our caller will retry if we return an error. (There are still spots later in this function that go into infinite retry loops, so it's possible for MGS to be healthy, we get a successful response here, then get stuck in one of those. But one fix at a time.)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirmed that the retry at the higher level occurs via reading the code.

.sp_local_switch_id()
.await
.map_err(|err| {
EarlyNetworkSetupError::Mgs(format!(
"failed to determine local switch ID via MGS: {}",
InlineErrorChain::new(&err)
))
})?
.slot;

let switch_location = match switch_slot {
0 => SwitchLocation::Switch0,
Expand Down
2 changes: 1 addition & 1 deletion sled-agent/src/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1064,7 +1064,7 @@ impl SledAgentApi for SledAgentImpl {
// policy to "off" if this request came from our switch zone;
// i.e., only allow disabling the switch zone if we have at
// least some evidence that the _other_ switch zone is up.
let (our_switch_zone_ip, _) = sa.switch_zone_underlay_info();
let our_switch_zone_ip = sa.switch_zone_underlay_info().ip;
if request_context.request.remote_addr().ip()
== our_switch_zone_ip
{
Expand Down
191 changes: 163 additions & 28 deletions sled-agent/src/services.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ use slog_error_chain::InlineErrorChain;
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, OnceLock};
use std::time::Duration;
use tokio::io::AsyncWriteExt;
use tokio::sync::Mutex;
use tokio::sync::oneshot;
Expand Down Expand Up @@ -310,6 +311,14 @@ impl From<Error> for omicron_common::api::external::Error {
}
}

/// Information describing the underlay network, used when activating the switch
/// zone.
#[derive(Debug, Clone)]
pub struct UnderlayInfo {
/// Underlay IPv6 address of this sled's switch zone (added to the zone's
/// address list and used when configuring uplinks via MGS).
pub ip: Ipv6Addr,
/// Rack network config, if known; required to configure tfport uplinks.
/// May be `None` on cold boot before the config has been learned —
/// NOTE(review): see PR discussion about bootstore replication; confirm.
pub rack_network_config: Option<RackNetworkConfig>,
}

fn display_zone_init_errors(errors: &[(String, Box<Error>)]) -> String {
if errors.len() == 1 {
return format!(
Expand Down Expand Up @@ -550,6 +559,9 @@ enum SwitchZoneState {
request: SwitchZoneConfig,
// The currently running zone
zone: Box<RunningZone>,
// A background task which keeps looping until the zone's uplinks are
// configured.
worker: Option<Task>,
},
}

Expand Down Expand Up @@ -3151,7 +3163,7 @@ impl ServiceManager {
&self,
// If we're reconfiguring the switch zone with an underlay address, we
// also need the rack network config to set tfport uplinks.
underlay_info: Option<(Ipv6Addr, Option<&RackNetworkConfig>)>,
underlay_info: Option<UnderlayInfo>,
baseboard: Baseboard,
) -> Result<(), Error> {
info!(self.inner.log, "Ensuring scrimlet services (enabling services)");
Expand Down Expand Up @@ -3238,33 +3250,95 @@ impl ServiceManager {
}
};

let mut addresses =
if let Some((ip, _)) = underlay_info { vec![ip] } else { vec![] };
let mut addresses = if let Some(info) = &underlay_info {
vec![info.ip]
} else {
vec![]
};
addresses.push(Ipv6Addr::LOCALHOST);

let request =
SwitchZoneConfig { id: Uuid::new_v4(), addresses, services };

self.ensure_switch_zone(
// request=
Some(request),
// filesystems=
filesystems,
// data_links=
data_links,
underlay_info,
)
.await?;

// If we've given the switch an underlay address, we also need to inject
// SMF properties so that tfport uplinks can be created.
if let Some((ip, Some(rack_network_config))) = underlay_info {
self.ensure_switch_zone_uplinks_configured(ip, rack_network_config)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where we used to get stuck trying forever; instead, we'll now do this uplink configuration either

a) inside the async task we were already spawning if starting the switch zone for the first time
b) inside a new async task we now spawn if we're reconfiguring the switch zone because we just got our network config from RSS

.await?;
}

Ok(())
}

// Retry ensuring switch zone uplinks until success or we're told to stop.
//
// TODO-correctness: This is not in great shape, and may get stuck in an
// infinite retry loop _within_ one attempt, or may succeed even if it
// didn't fully configure all switch zone services. See
// <https://github.com/oxidecomputer/omicron/issues/8970> for details.
async fn ensure_switch_zone_uplinks_configured_loop(
&self,
underlay_info: &UnderlayInfo,
mut exit_rx: oneshot::Receiver<()>,
) {
// We don't really expect failures trying to configure switch zone
// uplinks unless something is unhealthy. This timeout is somewhat
// arbitrary, but we probably don't want to use backoff here.
const RETRY_DELAY: Duration = Duration::from_secs(1);

// We can only ensure uplinks if we have a rack network config.
//
// TODO-correctness How can we have an underlay IP without a rack
// network config??
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder: do we even get here without receiving the sled agent config? Maybe this can be tied to SledAgentState somehow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we do, but it looked tricky to fix up the types. I'll take another look and either try fixing it or file an issue with some details.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the Option comes from the bootstore.get_network_config() call. This call has to return an option because before RSS runs and propagates to all nodes the bootstore configuration is unset. So maybe, inside get_network_config we should wait for a a Some and then remove the optionality later on.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Theoretically, if the bootstore early network config is updated, we'll need to do some reconfiguration, but that happens elsewhere and I believe is driven by an RPW.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, so on a cold boot, there is (or might be?) a window of time where we know our IP (because we ledger it ourselves) but don't know the RackNetworkConfig yet (because we have to unlock the bootstore first)?

If that's right I should probably just remove this comment and keep things as-is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We ledger the RackNetworkConfig as well. The issue is that the RackNetworkConfig is written to a single (or a few) bootstore nodes and replicated between them. I think it may be possible that some nodes haven't yet learned the RackNetworkConfig on cold boot because the crash happened before the gossip. But this is only detectable if the option is None. If there was an old version and a new version hasn't propagated, this is not detectable locally.

There is no "unlocking the bootstore". The bootstore is used solely to enable configuring the network so we can establish time sync with an external NTP server so that when we do unlock the control plane CRDB actually works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks; I'll reword this comment.

let Some(rack_network_config) = &underlay_info.rack_network_config
else {
return;
};

loop {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC we loop here waiting for MGS rather than blocking the early networking because this code runs in its own task. That seems like the crux of the fix.

match self
.ensure_switch_zone_uplinks_configured(
underlay_info.ip,
&rack_network_config,
)
.await
{
Ok(()) => {
info!(self.inner.log, "configured switch zone uplinks");
break;
}
Err(e) => {
warn!(
self.inner.log,
"Failed to configure switch zone uplinks";
InlineErrorChain::new(&e),
);
}
}

tokio::select! {
// If we've been told to stop trying, bail.
_ = &mut exit_rx => {
info!(
self.inner.log,
"instructed to give up on switch zone uplink \
configuration",
);
return;
}

_ = tokio::time::sleep(RETRY_DELAY) => {
info!(
self.inner.log,
"retrying switch zone uplink configuration",
);
continue;
}
};
}
}

// Ensure our switch zone (at the given IP address) has its uplinks
// configured based on `rack_network_config`. This first requires us to ask
// MGS running in the switch zone which switch we are, so we know which
Expand Down Expand Up @@ -3409,6 +3483,8 @@ impl ServiceManager {
vec![],
// data_links=
vec![],
// underlay_info=
None,
)
.await
}
Expand All @@ -3427,6 +3503,7 @@ impl ServiceManager {
request: SwitchZoneConfig,
filesystems: Vec<zone::Fs>,
data_links: Vec<String>,
underlay_info: Option<UnderlayInfo>,
) {
let (exit_tx, exit_rx) = oneshot::channel();
*zone = SwitchZoneState::Initializing {
Expand All @@ -3436,7 +3513,8 @@ impl ServiceManager {
worker: Some(Task {
exit_tx,
initializer: tokio::task::spawn(async move {
self.initialize_switch_zone_loop(exit_rx).await
self.initialize_switch_zone_loop(underlay_info, exit_rx)
.await
}),
}),
};
Expand All @@ -3448,6 +3526,7 @@ impl ServiceManager {
request: Option<SwitchZoneConfig>,
filesystems: Vec<zone::Fs>,
data_links: Vec<String>,
underlay_info: Option<UnderlayInfo>,
) -> Result<(), Error> {
let log = &self.inner.log;

Expand All @@ -3462,6 +3541,7 @@ impl ServiceManager {
request,
filesystems,
data_links,
underlay_info,
);
}
(
Expand All @@ -3473,9 +3553,10 @@ impl ServiceManager {
// the next request with our new request.
*request = new_request;
}
(SwitchZoneState::Running { request, zone }, Some(new_request))
if request.addresses != new_request.addresses =>
{
(
SwitchZoneState::Running { request, zone, worker },
Some(new_request),
) if request.addresses != new_request.addresses => {
// If the switch zone is running but we have new addresses, it
// means we're moving from the bootstrap to the underlay
// network. We need to add an underlay address and route in the
Expand Down Expand Up @@ -3511,8 +3592,8 @@ impl ServiceManager {
);
}

// When the request addresses have changed this means the underlay is
// available now as well.
// When the request addresses have changed this means the
// underlay is available now as well.
if let Some(info) = self.inner.sled_info.get() {
info!(
self.inner.log,
Expand Down Expand Up @@ -3823,6 +3904,26 @@ impl ServiceManager {
}
}
}

// We also need to ensure any uplinks are configured. Spawn a
// task that goes into an infinite retry loop until it succeeds.
if let Some(underlay_info) = underlay_info {
if let Some(old_worker) = worker.take() {
old_worker.stop().await;
}
let me = self.clone();
let (exit_tx, exit_rx) = oneshot::channel();
*worker = Some(Task {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the new task we spawn in the "reconfiguring an existing switch zone" case.

exit_tx,
initializer: tokio::task::spawn(async move {
me.ensure_switch_zone_uplinks_configured_loop(
&underlay_info,
exit_rx,
)
.await;
}),
});
}
}
(SwitchZoneState::Running { .. }, Some(_)) => {
info!(log, "Enabling {zone_typestr} zone (already complete)");
Expand Down Expand Up @@ -3894,6 +3995,7 @@ impl ServiceManager {
*sled_zone = SwitchZoneState::Running {
request: request.clone(),
zone: Box::new(zone),
worker: None,
};
Ok(())
}
Expand All @@ -3902,29 +4004,62 @@ impl ServiceManager {
// initialized, or it has been told to stop.
async fn initialize_switch_zone_loop(
&self,
underlay_info: Option<UnderlayInfo>,
mut exit_rx: oneshot::Receiver<()>,
) {
// We don't really expect failures trying to initialize the switch zone
// unless something is unhealthy. This timeout is somewhat arbitrary,
// but we probably don't want to use backoff here.
const RETRY_DELAY: Duration = Duration::from_secs(1);

// First, go into a loop to bring up the switch zone; retry until we
// succeed or are told to give up via `exit_rx`.
loop {
{
let mut sled_zone = self.inner.switch_zone.lock().await;
match self.try_initialize_switch_zone(&mut sled_zone).await {
Ok(()) => return,
Err(e) => warn!(
self.inner.log,
"Failed to initialize switch zone: {e}"
),
Ok(()) => {
info!(self.inner.log, "initialized switch zone");
break;
}
Err(e) => {
warn!(
self.inner.log, "Failed to initialize switch zone";
InlineErrorChain::new(&e),
);
}
}
}

tokio::select! {
// If we've been told to stop trying, bail.
_ = &mut exit_rx => return,
_ = &mut exit_rx => {
info!(
self.inner.log,
"instructed to give up on switch zone initialization",
);
return;
}

// Poll for the device every second - this timeout is somewhat
// arbitrary, but we probably don't want to use backoff here.
_ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => (),
_ = tokio::time::sleep(RETRY_DELAY) => {
info!(
self.inner.log,
"retrying switch zone initialization",
);
continue;
}
};
}

// Then, if we have underlay info, go into a loop trying to configure
// our uplinks. As above, retry until we succeed or are told to stop.
if let Some(underlay_info) = underlay_info {
self.ensure_switch_zone_uplinks_configured_loop(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This extends what the task we were already spawning for the "start the switch zone" case.

&underlay_info,
exit_rx,
)
.await;
}
}
}

Expand Down
Loading
Loading