-
Notifications
You must be signed in to change notification settings - Fork 60
[sled-agent] Don't block HardwareMonitor when starting the switch zone if MGS isn't reachable
#9002
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 25 commits
4e6a2a3
131bc2e
b4bb460
64db578
73af35a
2a8a5cb
bd24c9e
d72b7d0
6c3b861
88bdb3a
8fb28dd
c0b9899
73e3f3f
8f8d2c0
2831991
ed046b7
d014309
e65e055
010432a
e321d2e
1029364
d9e3aec
62d55e1
c0c9892
a6a642b
3dc1f40
1c40048
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,6 +107,7 @@ use slog_error_chain::InlineErrorChain; | |
| use std::net::{IpAddr, Ipv6Addr, SocketAddr}; | ||
| use std::sync::atomic::{AtomicBool, Ordering}; | ||
| use std::sync::{Arc, OnceLock}; | ||
| use std::time::Duration; | ||
| use tokio::io::AsyncWriteExt; | ||
| use tokio::sync::Mutex; | ||
| use tokio::sync::oneshot; | ||
|
|
@@ -310,6 +311,14 @@ impl From<Error> for omicron_common::api::external::Error { | |
| } | ||
| } | ||
|
|
||
| /// Information describing the underlay network, used when activating the switch | ||
| /// zone. | ||
| #[derive(Debug, Clone)] | ||
| pub struct UnderlayInfo { | ||
| pub ip: Ipv6Addr, | ||
| pub rack_network_config: Option<RackNetworkConfig>, | ||
| } | ||
|
|
||
| fn display_zone_init_errors(errors: &[(String, Box<Error>)]) -> String { | ||
| if errors.len() == 1 { | ||
| return format!( | ||
|
|
@@ -550,6 +559,9 @@ enum SwitchZoneState { | |
| request: SwitchZoneConfig, | ||
| // The currently running zone | ||
| zone: Box<RunningZone>, | ||
| // A background task which keeps looping until the zone's uplinks are | ||
| // configured. | ||
| worker: Option<Task>, | ||
| }, | ||
| } | ||
|
|
||
|
|
@@ -3151,7 +3163,7 @@ impl ServiceManager { | |
| &self, | ||
| // If we're reconfiguring the switch zone with an underlay address, we | ||
| // also need the rack network config to set tfport uplinks. | ||
| underlay_info: Option<(Ipv6Addr, Option<&RackNetworkConfig>)>, | ||
| underlay_info: Option<UnderlayInfo>, | ||
| baseboard: Baseboard, | ||
| ) -> Result<(), Error> { | ||
| info!(self.inner.log, "Ensuring scrimlet services (enabling services)"); | ||
|
|
@@ -3238,33 +3250,95 @@ impl ServiceManager { | |
| } | ||
| }; | ||
|
|
||
| let mut addresses = | ||
| if let Some((ip, _)) = underlay_info { vec![ip] } else { vec![] }; | ||
| let mut addresses = if let Some(info) = &underlay_info { | ||
| vec![info.ip] | ||
| } else { | ||
| vec![] | ||
| }; | ||
| addresses.push(Ipv6Addr::LOCALHOST); | ||
|
|
||
| let request = | ||
| SwitchZoneConfig { id: Uuid::new_v4(), addresses, services }; | ||
|
|
||
| self.ensure_switch_zone( | ||
| // request= | ||
| Some(request), | ||
| // filesystems= | ||
| filesystems, | ||
| // data_links= | ||
| data_links, | ||
| underlay_info, | ||
| ) | ||
| .await?; | ||
|
|
||
| // If we've given the switch an underlay address, we also need to inject | ||
| // SMF properties so that tfport uplinks can be created. | ||
| if let Some((ip, Some(rack_network_config))) = underlay_info { | ||
| self.ensure_switch_zone_uplinks_configured(ip, rack_network_config) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is where we used to get stuck trying forever; instead, we'll now do this uplink configuration either a) inside the async task we were already spawning if starting the switch zone for the first time |
||
| .await?; | ||
| } | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| // Retry ensuring switch zone uplinks until success or we're told to stop. | ||
| // | ||
| // TODO-correctness: This is not in great shape, and may get stuck in an | ||
| // infinite retry loop _within_ one attempt, or may succeed even if it | ||
| // didn't fully configure all switch zone services. See | ||
| // <https://github.com/oxidecomputer/omicron/issues/8970> for details. | ||
| async fn ensure_switch_zone_uplinks_configured_loop( | ||
| &self, | ||
| underlay_info: &UnderlayInfo, | ||
| mut exit_rx: oneshot::Receiver<()>, | ||
| ) { | ||
| // We don't really expect failures trying to initialize the switch zone | ||
| // unless something is unhealthy. This timeout is somewhat arbitrary, | ||
| // but we probably don't want to use backoff here. | ||
| const RETRY_DELAY: Duration = Duration::from_secs(1); | ||
|
|
||
| // We can only ensure uplinks if we have a rack network config. | ||
| // | ||
| // TODO-correctness How can we have an underlay IP without a rack | ||
| // network config?? | ||
|
||
| let Some(rack_network_config) = &underlay_info.rack_network_config | ||
| else { | ||
| return; | ||
| }; | ||
|
|
||
| loop { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC we loop here waiting for MGS rather than blocking the early networking because this code runs in its own task. That seems like the crux of the fix. |
||
| match self | ||
| .ensure_switch_zone_uplinks_configured( | ||
| underlay_info.ip, | ||
| &rack_network_config, | ||
| ) | ||
| .await | ||
| { | ||
| Ok(()) => { | ||
| info!(self.inner.log, "configured switch zone uplinks"); | ||
| break; | ||
| } | ||
| Err(e) => { | ||
| warn!( | ||
| self.inner.log, | ||
| "Failed to configure switch zone uplinks"; | ||
| InlineErrorChain::new(&e), | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| tokio::select! { | ||
| // If we've been told to stop trying, bail. | ||
| _ = &mut exit_rx => { | ||
| info!( | ||
| self.inner.log, | ||
| "instructed to give up on switch zone uplink \ | ||
| configuration", | ||
| ); | ||
| return; | ||
| } | ||
|
|
||
| _ = tokio::time::sleep(RETRY_DELAY) => { | ||
| info!( | ||
| self.inner.log, | ||
| "retrying switch zone uplink configuration", | ||
| ); | ||
| continue; | ||
| } | ||
| }; | ||
| } | ||
| } | ||
|
|
||
| // Ensure our switch zone (at the given IP address) has its uplinks | ||
| // configured based on `rack_network_config`. This first requires us to ask | ||
| // MGS running in the switch zone which switch we are, so we know which | ||
|
|
@@ -3409,6 +3483,8 @@ impl ServiceManager { | |
| vec![], | ||
| // data_links= | ||
| vec![], | ||
| // underlay_info= | ||
| None, | ||
| ) | ||
| .await | ||
| } | ||
|
|
@@ -3427,6 +3503,7 @@ impl ServiceManager { | |
| request: SwitchZoneConfig, | ||
| filesystems: Vec<zone::Fs>, | ||
| data_links: Vec<String>, | ||
| underlay_info: Option<UnderlayInfo>, | ||
| ) { | ||
| let (exit_tx, exit_rx) = oneshot::channel(); | ||
| *zone = SwitchZoneState::Initializing { | ||
|
|
@@ -3436,7 +3513,8 @@ impl ServiceManager { | |
| worker: Some(Task { | ||
| exit_tx, | ||
| initializer: tokio::task::spawn(async move { | ||
| self.initialize_switch_zone_loop(exit_rx).await | ||
| self.initialize_switch_zone_loop(underlay_info, exit_rx) | ||
| .await | ||
| }), | ||
| }), | ||
| }; | ||
|
|
@@ -3448,6 +3526,7 @@ impl ServiceManager { | |
| request: Option<SwitchZoneConfig>, | ||
| filesystems: Vec<zone::Fs>, | ||
| data_links: Vec<String>, | ||
| underlay_info: Option<UnderlayInfo>, | ||
| ) -> Result<(), Error> { | ||
| let log = &self.inner.log; | ||
|
|
||
|
|
@@ -3462,6 +3541,7 @@ impl ServiceManager { | |
| request, | ||
| filesystems, | ||
| data_links, | ||
| underlay_info, | ||
| ); | ||
| } | ||
| ( | ||
|
|
@@ -3473,9 +3553,10 @@ impl ServiceManager { | |
| // the next request with our new request. | ||
| *request = new_request; | ||
| } | ||
| (SwitchZoneState::Running { request, zone }, Some(new_request)) | ||
| if request.addresses != new_request.addresses => | ||
| { | ||
| ( | ||
| SwitchZoneState::Running { request, zone, worker }, | ||
| Some(new_request), | ||
| ) if request.addresses != new_request.addresses => { | ||
| // If the switch zone is running but we have new addresses, it | ||
| // means we're moving from the bootstrap to the underlay | ||
| // network. We need to add an underlay address and route in the | ||
|
|
@@ -3511,8 +3592,8 @@ impl ServiceManager { | |
| ); | ||
| } | ||
|
|
||
| // When the request addresses have changed this means the underlay is | ||
| // available now as well. | ||
| // When the request addresses have changed this means the | ||
| // underlay is available now as well. | ||
| if let Some(info) = self.inner.sled_info.get() { | ||
| info!( | ||
| self.inner.log, | ||
|
|
@@ -3823,6 +3904,26 @@ impl ServiceManager { | |
| } | ||
| } | ||
| } | ||
|
|
||
| // We also need to ensure any uplinks are configured. Spawn a | ||
| // task that goes into an infinite retry loop until it succeeds. | ||
| if let Some(underlay_info) = underlay_info { | ||
| if let Some(old_worker) = worker.take() { | ||
| old_worker.stop().await; | ||
| } | ||
| let me = self.clone(); | ||
| let (exit_tx, exit_rx) = oneshot::channel(); | ||
| *worker = Some(Task { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the new task we spawn in the "reconfiguring an existing switch zone" case. |
||
| exit_tx, | ||
| initializer: tokio::task::spawn(async move { | ||
| me.ensure_switch_zone_uplinks_configured_loop( | ||
| &underlay_info, | ||
| exit_rx, | ||
| ) | ||
| .await; | ||
| }), | ||
| }); | ||
| } | ||
| } | ||
| (SwitchZoneState::Running { .. }, Some(_)) => { | ||
| info!(log, "Enabling {zone_typestr} zone (already complete)"); | ||
|
|
@@ -3894,6 +3995,7 @@ impl ServiceManager { | |
| *sled_zone = SwitchZoneState::Running { | ||
| request: request.clone(), | ||
| zone: Box::new(zone), | ||
| worker: None, | ||
| }; | ||
| Ok(()) | ||
| } | ||
|
|
@@ -3902,29 +4004,62 @@ impl ServiceManager { | |
| // inititalized, or it has been told to stop. | ||
| async fn initialize_switch_zone_loop( | ||
| &self, | ||
| underlay_info: Option<UnderlayInfo>, | ||
| mut exit_rx: oneshot::Receiver<()>, | ||
| ) { | ||
| // We don't really expect failures trying to initialize the switch zone | ||
| // unless something is unhealthy. This timeout is somewhat arbitrary, | ||
| // but we probably don't want to use backoff here. | ||
| const RETRY_DELAY: Duration = Duration::from_secs(1); | ||
|
|
||
| // First, go into a loop to bring up the switch zone; retry until we | ||
| // succeed or are told to give up via `exit_rx`. | ||
| loop { | ||
| { | ||
| let mut sled_zone = self.inner.switch_zone.lock().await; | ||
| match self.try_initialize_switch_zone(&mut sled_zone).await { | ||
| Ok(()) => return, | ||
| Err(e) => warn!( | ||
| self.inner.log, | ||
| "Failed to initialize switch zone: {e}" | ||
| ), | ||
| Ok(()) => { | ||
| info!(self.inner.log, "initialized switch zone"); | ||
| break; | ||
| } | ||
| Err(e) => { | ||
| warn!( | ||
| self.inner.log, "Failed to initialize switch zone"; | ||
| InlineErrorChain::new(&e), | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| tokio::select! { | ||
| // If we've been told to stop trying, bail. | ||
| _ = &mut exit_rx => return, | ||
| _ = &mut exit_rx => { | ||
| info!( | ||
| self.inner.log, | ||
| "instructed to give up on switch zone initialization", | ||
| ); | ||
| return; | ||
| } | ||
|
|
||
| // Poll for the device every second - this timeout is somewhat | ||
| // arbitrary, but we probably don't want to use backoff here. | ||
| _ = tokio::time::sleep(tokio::time::Duration::from_secs(1)) => (), | ||
| _ = tokio::time::sleep(RETRY_DELAY) => { | ||
| info!( | ||
| self.inner.log, | ||
| "retrying switch zone initialization", | ||
| ); | ||
| continue; | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| // Then, if we have underlay info, go into a loop trying to configure | ||
| // our uplinks. As above, retry until we succeed or are told to stop. | ||
| if let Some(underlay_info) = underlay_info { | ||
| self.ensure_switch_zone_uplinks_configured_loop( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This extends what the task we were already spawning for the "start the switch zone" case. |
||
| &underlay_info, | ||
| exit_rx, | ||
| ) | ||
| .await; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We no longer retry this forever here, because our caller will retry if we return an error. (There are still spots later in this function that go into infinite retry loops, so it's possible for MGS to be healthy, we get a successful response here, then get stuck in one of those. But one fix at a time.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Confirmed that the retry at the higher level occurs via reading the code.