Skip to content
118 changes: 49 additions & 69 deletions nexus/reconfigurator/planning/src/blueprint_builder/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ use nexus_types::deployment::PlanningInput;
use nexus_types::deployment::SledFilter;
use nexus_types::deployment::SledResources;
use nexus_types::deployment::TufRepoContentsError;
use nexus_types::deployment::ZpoolFilter;
use nexus_types::deployment::ZpoolName;
use nexus_types::deployment::blueprint_zone_type;
use nexus_types::external_api::views::SledState;
Expand Down Expand Up @@ -512,15 +511,15 @@ pub struct BlueprintBuilder<'a> {
/// The ID that the completed blueprint will have
new_blueprint_id: BlueprintUuid,

// These fields are used to allocate resources for sleds.
input: &'a PlanningInput,

Comment on lines -515 to -517
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we ensure the PlanningInput doesn't creep its way in here in the future?

Copy link
Contributor Author

@jgallagher jgallagher Nov 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's any way to guarantee it, but: I'm working on a few more PRs to remove the need to pass it into BlueprintBuilder::new_based_on() at all. Hopefully at that point the bar for adding it back is pretty high.

// These fields will become part of the final blueprint. See the
// corresponding fields in `Blueprint`.
sled_editors: BTreeMap<SledUuid, SledEditor>,
cockroachdb_setting_preserve_downgrade: CockroachDbPreserveDowngrade,
cockroachdb_fingerprint: String,
target_release_minimum_generation: Generation,
nexus_generation: Generation,
internal_dns_version: Generation,
external_dns_version: Generation,

creator: String,
operations: Vec<Operation>,
Expand Down Expand Up @@ -624,17 +623,19 @@ impl<'a> BlueprintBuilder<'a> {

let editor = match state {
SledState::Active => {
let subnet = input
let details = input
.sled_lookup(SledFilter::Commissioned, *sled_id)
.with_context(|| {
format!(
"failed to find sled details for \
active sled in parent blueprint {sled_id}"
)
})?
.resources
.subnet;
SledEditor::for_existing_active(subnet, sled_cfg.clone())
})?;
SledEditor::for_existing_active(
Arc::new(details.baseboard_id.clone()),
details.resources.subnet,
sled_cfg.clone(),
)
}
SledState::Decommissioned => {
SledEditor::for_existing_decommissioned(sled_cfg.clone())
Expand All @@ -652,6 +653,7 @@ impl<'a> BlueprintBuilder<'a> {
for (sled_id, details) in input.all_sleds(SledFilter::Commissioned) {
if let Entry::Vacant(slot) = sled_editors.entry(sled_id) {
slot.insert(SledEditor::for_new_active(
Arc::new(details.baseboard_id.clone()),
details.resources.subnet,
));
}
Expand All @@ -672,14 +674,19 @@ impl<'a> BlueprintBuilder<'a> {
.clone(),
oximeter_read_policy,
new_blueprint_id: rng.next_blueprint(),
input,
sled_editors,
cockroachdb_setting_preserve_downgrade: parent_blueprint
.cockroachdb_setting_preserve_downgrade,
cockroachdb_fingerprint: input
.cockroachdb_settings()
.state_fingerprint
.clone(),
pending_mgs_updates: parent_blueprint.pending_mgs_updates.clone(),
target_release_minimum_generation: parent_blueprint
.target_release_minimum_generation,
nexus_generation: parent_blueprint.nexus_generation,
internal_dns_version: input.internal_dns_version(),
external_dns_version: input.external_dns_version(),
creator: creator.to_owned(),
operations: Vec::new(),
comments: Vec::new(),
Expand Down Expand Up @@ -738,9 +745,9 @@ impl<'a> BlueprintBuilder<'a> {
// multirack (either DNS will be on a wider subnet or we need to pick a
// particular rack subnet here?).
let any_sled_subnet = self
.input
.all_sled_resources(SledFilter::Commissioned)
.map(|(_sled_id, resources)| resources.subnet)
.sled_editors
.values()
.filter_map(|editor| editor.subnet())
.next()
.ok_or(Error::RackSubnetUnknownNoSleds)?;
let rack_subnet = ReservedRackSubnet::from_subnet(any_sled_subnet);
Expand All @@ -764,10 +771,6 @@ impl<'a> BlueprintBuilder<'a> {
}))
}

pub fn planning_input(&self) -> &PlanningInput {
&self.input
}

/// Iterates over the list of sled IDs for which we have zones.
///
/// This may include decommissioned sleds.
Expand Down Expand Up @@ -879,19 +882,14 @@ impl<'a> BlueprintBuilder<'a> {
sleds,
pending_mgs_updates: self.pending_mgs_updates,
parent_blueprint_id: Some(self.parent_blueprint.id),
internal_dns_version: self.input.internal_dns_version(),
external_dns_version: self.input.external_dns_version(),
internal_dns_version: self.internal_dns_version,
external_dns_version: self.external_dns_version,
target_release_minimum_generation: self
.target_release_minimum_generation,
nexus_generation: self.nexus_generation,
cockroachdb_fingerprint: self
.input
.cockroachdb_settings()
.state_fingerprint
.clone(),
cockroachdb_fingerprint: self.cockroachdb_fingerprint,
cockroachdb_setting_preserve_downgrade: self
.cockroachdb_setting_preserve_downgrade,

clickhouse_cluster_config: self.clickhouse_cluster_config,
oximeter_read_version: self.oximeter_read_policy.version,
oximeter_read_mode: self.oximeter_read_policy.mode,
Expand Down Expand Up @@ -1315,14 +1313,16 @@ impl<'a> BlueprintBuilder<'a> {
"tried to ensure mupdate override for unknown sled {sled_id}"
))
})?;
let baseboard_id = editor.baseboard_id().ok_or_else(|| {
// All commissioned sleds have baseboards; this should never fail.
Error::Planner(anyhow!(
"tried to ensure mupdate override for \
decommissioned sled {sled_id}"
))
})?;

// Also map the editor to the corresponding PendingMgsUpdates.
let sled_details = self
.input
.sled_lookup(SledFilter::InService, sled_id)
.map_err(|error| Error::Planner(anyhow!(error)))?;
let pending_mgs_update =
self.pending_mgs_updates.entry(&sled_details.baseboard_id);
let pending_mgs_update = self.pending_mgs_updates.entry(baseboard_id);
let noop_sled_info = noop_info.sled_info_mut(sled_id)?;

editor
Expand Down Expand Up @@ -1492,14 +1492,14 @@ impl<'a> BlueprintBuilder<'a> {
image_source: BlueprintZoneImageSource,
) -> Result<Ensure, Error> {
let pool_name = ZpoolName::new_external(zpool_id);
let editor = self.sled_editors.get(&sled_id).ok_or_else(|| {
Error::Planner(anyhow!(
"tried to ensure crucible zone for unknown sled {sled_id}"
))
})?;

// If this sled already has a Crucible zone on this pool, do nothing.
let has_crucible_on_this_pool = {
let editor = self.sled_editors.get(&sled_id).ok_or_else(|| {
Error::Planner(anyhow!(
"tried to ensure crucible zone for unknown sled {sled_id}"
))
})?;
let has_crucible_on_this_pool =
editor.zones(BlueprintZoneDisposition::is_in_service).any(|z| {
matches!(
&z.zone_type,
Expand All @@ -1509,17 +1509,19 @@ impl<'a> BlueprintBuilder<'a> {
})
if dataset.pool_name == pool_name
)
})
};
});
if has_crucible_on_this_pool {
return Ok(Ensure::NotNeeded);
}

let sled_info = self.sled_resources(sled_id)?;
if !sled_info.zpools.contains_key(&zpool_id) {
// Double-check that our caller didn't pass a bad sled/zpool combo.
if !editor
.disks(BlueprintPhysicalDiskDisposition::is_in_service)
.any(|disk| disk.pool_id == zpool_id)
{
return Err(Error::Planner(anyhow!(
"adding crucible zone for sled {:?}: \
attempted to use unknown zpool {:?}",
attempted to use unknown zpool {:?}",
sled_id,
pool_name
)));
Expand Down Expand Up @@ -2114,17 +2116,12 @@ impl<'a> BlueprintBuilder<'a> {
))
})?;

// We'll check both the disks available to this sled per our current
// blueprint and the list of all in-service zpools on this sled per our
// planning input, and only pick zpools that are available in both.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the logical change I mentioned in the PR description: I removed this second check that we only pick a zpool that's also InService according to the planning input.

I believe this change should not affect any planner behavior. The planner always decommissions disks before potentially adding new zones, so any disks that are no longer in service in the planning input will also no longer be in service according to the BlueprintPhysicalDiskDisposition by the time we get here.

This change could affect some weirder tests that only go through BlueprintBuilder, but I'm hoping we don't have any that trip over that particular case. If we do I think I'd claim the test is probably in the wrong: it means they'd be decommissioning disks, not updating the disk config to match, then trying to add zones that require a trip through this method to pick an available disk.

let current_sled_disks = editor
// Only choose from zpools that are in-service.
let in_service_zpools = editor
.disks(BlueprintPhysicalDiskDisposition::is_in_service)
.map(|disk_config| disk_config.pool_id)
.collect::<BTreeSet<_>>();

let all_in_service_zpools =
self.sled_resources(sled_id)?.all_zpools(ZpoolFilter::InService);

// We refuse to choose a zpool for a zone of a given `zone_kind` if this
// sled already has a durable zone of that kind on the same zpool. Build
// up a set of invalid zpools for this sled/kind pair.
Expand All @@ -2142,31 +2139,14 @@ impl<'a> BlueprintBuilder<'a> {
skip_zpools.insert(&zone_config.filesystem_pool);
}

for &zpool_id in all_in_service_zpools {
for zpool_id in in_service_zpools {
let zpool_name = ZpoolName::new_external(zpool_id);
if !skip_zpools.contains(&zpool_name)
&& current_sled_disks.contains(&zpool_id)
{
if !skip_zpools.contains(&zpool_name) {
return Ok(zpool_name);
}
}
Err(Error::NoAvailableZpool { sled_id, kind: zone_kind })
}

/// Returns the resources for a sled that hasn't been decommissioned.
fn sled_resources(
&self,
sled_id: SledUuid,
) -> Result<&'a SledResources, Error> {
let details = self
.input
.sled_lookup(SledFilter::Commissioned, sled_id)
.map_err(|error| {
Error::Planner(anyhow!(error).context(format!(
"for sled {sled_id}, error looking up resources"
)))
})?;
Ok(&details.resources)
Err(Error::NoAvailableZpool { sled_id, kind: zone_kind })
}

/// Determine the number of desired external DNS zones by counting
Expand Down
52 changes: 45 additions & 7 deletions nexus/reconfigurator/planning/src/blueprint_editor/sled_editor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ use nexus_types::deployment::BlueprintZoneType;
use nexus_types::deployment::PendingMgsUpdate;
use nexus_types::deployment::blueprint_zone_type;
use nexus_types::external_api::views::SledState;
use nexus_types::inventory::BaseboardId;
use omicron_common::address::Ipv6Subnet;
use omicron_common::address::SLED_PREFIX;
use omicron_common::api::external::Generation;
Expand All @@ -50,6 +51,7 @@ use scalar::ScalarEditor;
use std::iter;
use std::mem;
use std::net::Ipv6Addr;
use std::sync::Arc;
use underlay_ip_allocator::SledUnderlayIpAllocator;

mod datasets;
Expand Down Expand Up @@ -148,6 +150,7 @@ pub enum SledEditError {
pub(crate) struct SledEditor(InnerSledEditor);

#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum InnerSledEditor {
// Internally, `SledEditor` has a variant for each variant of `SledState`,
// as the operations allowed in different states are substantially different
Expand All @@ -159,6 +162,7 @@ enum InnerSledEditor {

impl SledEditor {
pub fn for_existing_active(
baseboard_id: Arc<BaseboardId>,
subnet: Ipv6Subnet<SLED_PREFIX>,
config: BlueprintSledConfig,
) -> Result<Self, SledInputError> {
Expand All @@ -167,7 +171,7 @@ impl SledEditor {
SledState::Active,
"for_existing_active called on non-active sled"
);
let inner = ActiveSledEditor::new(subnet, config)?;
let inner = ActiveSledEditor::new(baseboard_id, subnet, config)?;
Ok(Self(InnerSledEditor::Active(inner)))
}

Expand All @@ -187,8 +191,14 @@ impl SledEditor {
Ok(Self(InnerSledEditor::Decommissioned(inner)))
}

pub fn for_new_active(subnet: Ipv6Subnet<SLED_PREFIX>) -> Self {
Self(InnerSledEditor::Active(ActiveSledEditor::new_empty(subnet)))
pub fn for_new_active(
baseboard_id: Arc<BaseboardId>,
subnet: Ipv6Subnet<SLED_PREFIX>,
) -> Self {
Self(InnerSledEditor::Active(ActiveSledEditor::new_empty(
baseboard_id,
subnet,
)))
}

pub fn finalize(self) -> EditedSled {
Expand All @@ -205,6 +215,26 @@ impl SledEditor {
}
}

/// Returns the baseboard of this sled if it is active, or `None` if it is
/// decommissioned.
pub fn baseboard_id(&self) -> Option<&Arc<BaseboardId>> {
    // Only active sleds carry a baseboard; decommissioned editors keep no
    // hardware identity.
    if let InnerSledEditor::Active(active) = &self.0 {
        Some(&active.baseboard_id)
    } else {
        None
    }
}

/// Returns the subnet of this sled if it is active, or `None` if it is
/// decommissioned.
pub fn subnet(&self) -> Option<Ipv6Subnet<SLED_PREFIX>> {
    // The subnet is owned by the underlay IP allocator, which only exists
    // for active sleds.
    if let InnerSledEditor::Active(active) = &self.0 {
        Some(active.underlay_ip_allocator.subnet())
    } else {
        None
    }
}

pub fn edit_counts(&self) -> SledEditCounts {
match &self.0 {
InnerSledEditor::Active(editor) => editor.edit_counts(),
Expand All @@ -231,9 +261,10 @@ impl SledEditor {
// below, we'll be left in the active state with an empty sled
// editor), but omicron in general is not panic safe and aborts
// on panic. Plus `finalize()` should never panic.
let mut stolen = ActiveSledEditor::new_empty(Ipv6Subnet::new(
Ipv6Addr::LOCALHOST,
));
let mut stolen = ActiveSledEditor::new_empty(
Arc::clone(&editor.baseboard_id),
Ipv6Subnet::new(Ipv6Addr::LOCALHOST),
);
mem::swap(editor, &mut stolen);

let mut finalized = stolen.finalize();
Expand Down Expand Up @@ -475,6 +506,7 @@ impl SledEditor {

#[derive(Debug)]
struct ActiveSledEditor {
baseboard_id: Arc<BaseboardId>,
underlay_ip_allocator: SledUnderlayIpAllocator,
incoming_sled_agent_generation: Generation,
zones: ZonesEditor,
Expand All @@ -494,6 +526,7 @@ pub(crate) struct EditedSled {

impl ActiveSledEditor {
pub fn new(
baseboard_id: Arc<BaseboardId>,
subnet: Ipv6Subnet<SLED_PREFIX>,
config: BlueprintSledConfig,
) -> Result<Self, SledInputError> {
Expand All @@ -508,6 +541,7 @@ impl ActiveSledEditor {
zones.zones(BlueprintZoneDisposition::any).map(|z| z.underlay_ip());

Ok(Self {
baseboard_id,
underlay_ip_allocator: SledUnderlayIpAllocator::new(
subnet, zone_ips,
),
Expand All @@ -523,7 +557,10 @@ impl ActiveSledEditor {
})
}

pub fn new_empty(subnet: Ipv6Subnet<SLED_PREFIX>) -> Self {
pub fn new_empty(
baseboard_id: Arc<BaseboardId>,
subnet: Ipv6Subnet<SLED_PREFIX>,
) -> Self {
// Creating the underlay IP allocator can only fail if we have a zone
// with an IP outside the sled subnet, but we don't have any zones at
// all, so this can't fail. Match explicitly to guard against this error
Expand All @@ -532,6 +569,7 @@ impl ActiveSledEditor {
SledUnderlayIpAllocator::new(subnet, iter::empty());

Self {
baseboard_id,
underlay_ip_allocator,
incoming_sled_agent_generation: Generation::new(),
zones: ZonesEditor::empty(),
Expand Down
Loading
Loading