53 commits
f2c6334
Add debug to test flake step
leftwo Aug 29, 2025
889ab2d
more info when failing
leftwo Aug 29, 2025
93c2bec
more debug logging
leftwo Aug 29, 2025
d251d45
more debug
leftwo Aug 29, 2025
b1b197b
more debug on more tests
leftwo Aug 30, 2025
6f129c4
More debug
leftwo Sep 2, 2025
b002fb2
cargo fmt
leftwo Sep 2, 2025
b7ba833
make some checks loop forever, others take longer
leftwo Sep 2, 2025
f4157a5
loop around checking clean slate/ replacement state
leftwo Sep 2, 2025
459fcaa
fixed incorrect check on last commit
leftwo Sep 2, 2025
b82f35d
Make loop forever for volume delete check
leftwo Sep 2, 2025
1be06ae
fix variable name
leftwo Sep 2, 2025
5fa251a
Make another test loop forever
leftwo Sep 2, 2025
04051f3
more debug, correct pre-test checks
leftwo Sep 3, 2025
5c28a2c
print more when test_cooldown_on_sub_reinc fails
leftwo Sep 3, 2025
fadd5a0
don't unwrap when debugging possible failures
leftwo Sep 3, 2025
967a0d4
debug for test_quiesce_full
leftwo Sep 3, 2025
deb35db
Increase wait time for instance changes to 320
leftwo Sep 3, 2025
0478bbd
more debug, but this ones a problem
leftwo Sep 3, 2025
3ae245c
Merge branch 'main' into alan/why-do-tests-flake
leftwo Sep 3, 2025
43dd58f
Merge branch 'main' into alan/why-do-tests-flake
leftwo Sep 3, 2025
cd16660
Merge branch 'main' into alan/why-do-tests-flake
leftwo Sep 4, 2025
146023c
Give up eventually on assert_region_snapshot_replacement_request_unto…
leftwo Sep 4, 2025
f5e7e7a
Add a comment about a possible flake point
leftwo Sep 4, 2025
0cff9d8
Increase timeout for crucible replacements
leftwo Sep 4, 2025
1d2b8ba
reduce test threads
leftwo Sep 5, 2025
ad9163c
Bump timer for inventory collection
leftwo Sep 5, 2025
a9f7450
Increase timeout for pantry_endpoints to 120 seconds
leftwo Sep 5, 2025
e9e8c03
Cleanup test flake comments
leftwo Sep 5, 2025
46e621d
Remove debugging statements
leftwo Sep 5, 2025
5c43b98
cleanup more comments
leftwo Sep 5, 2025
f226c52
Remove more debug messages
leftwo Sep 5, 2025
abf38b1
Merge branch 'main' into alan/why-do-tests-flake
leftwo Sep 6, 2025
ff71337
comment cleanup
leftwo Sep 6, 2025
9d5d02e
More comment changes
leftwo Sep 6, 2025
e45b4e5
Comment for quiesce eprintln
leftwo Sep 6, 2025
5e83e94
Remove more debug messages
leftwo Sep 6, 2025
35aa542
Update dendrite, lldpd, maghemite, and sidecar-lite to get ipv6 ecmp …
Nieuwejaar Sep 6, 2025
d13dfcf
nix: fix flake versions (#8969)
KaiSforza Sep 6, 2025
67beca5
Explain how to authenticate to the now running control plane. (#8977)
JustinAzoff Sep 6, 2025
6fd635c
Bump web console (fix IP pools) (#9007)
david-crespo Sep 6, 2025
2151b30
bump maghemite to pick up #539 (#9009)
rcgoodfellow Sep 6, 2025
044713a
remove debug messages
leftwo Sep 7, 2025
3c4736c
Update dendrite, lldpd, maghemite, and sidecar-lite to get ipv6 ecmp …
Nieuwejaar Sep 6, 2025
666d5dd
Merge branch 'main' into alan/why-do-tests-flake
leftwo Sep 7, 2025
fca7c7b
Remove more debug logs
leftwo Sep 7, 2025
732cbb1
Remove another debug print, run tests again
leftwo Sep 7, 2025
e06766a
Remove yet another debug print, run tests again
leftwo Sep 7, 2025
7871810
Remove another comment, run tests again
leftwo Sep 8, 2025
687fe86
Update comment, run tests again
leftwo Sep 8, 2025
f4b3271
fix mismerge for mgd_checksums
leftwo Sep 8, 2025
725da36
remove debug function, run tests again
leftwo Sep 8, 2025
9c49f87
PR comments
leftwo Sep 15, 2025
8 changes: 6 additions & 2 deletions .github/buildomat/build-and-test.sh
@@ -128,9 +128,13 @@ ptime -m cargo build -Z unstable-options --timings=json \
#
# We apply our own timeout to ensure that we get a normal failure on timeout
# rather than a buildomat timeout. See oxidecomputer/buildomat#8.
#
# To avoid too many tests running at the same time, we set the number of
# test threads to 2 less than the default (hence --test-threads -2). This
# avoids many test flakes where the test itself would have passed, but the
# system was so overloaded that tests exceeded their default timeouts.
banner test
ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose
ptime -m timeout 2h cargo nextest run --profile ci --locked --verbose \
--test-threads -2

#
# https://github.com/nextest-rs/nextest/issues/16
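For reference, nextest treats a negative --test-threads value as relative to the number of logical CPUs, so the flag above asks for two fewer test threads than the machine has. A minimal sketch of the arithmetic, assuming nextest's documented negative-value semantics (the CPU count is hypothetical; the actual CI machines are not described in this PR):

# With, for example, 16 logical CPUs:
#   default number of test threads  -> 16
#   --test-threads -2               -> 16 - 2 = 14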
4 changes: 3 additions & 1 deletion nexus/src/app/background/tasks/instance_reincarnation.rs
@@ -906,8 +906,10 @@ mod test {
.await;

// Activate the background task again. Now, only instance 2 should be
// restarted.
// restarted. This is a possible test flake point, so print the status
// to get a bit more debug output if the assertion below fails.
let status = assert_activation_ok!(task.activate(&opctx).await);
eprintln!("status: {:?}", status);
assert_eq!(status.total_instances_found(), 1);
assert_eq!(
status.instances_reincarnated,
40 changes: 30 additions & 10 deletions nexus/src/app/sagas/region_snapshot_replacement_start.rs
@@ -1534,17 +1534,37 @@ pub(crate) mod test {
request: &RegionSnapshotReplacement,
) {
let opctx = test_opctx(cptestctx);
let db_request = datastore
.get_region_snapshot_replacement_request_by_id(&opctx, request.id)
.await
.unwrap();

assert_eq!(db_request.new_region_id, None);
assert_eq!(
db_request.replacement_state,
RegionSnapshotReplacementState::Requested
);
assert_eq!(db_request.operating_saga_id, None);
let mut i = 1;
loop {
let db_request = datastore
.get_region_snapshot_replacement_request_by_id(
&opctx, request.id,
)
.await
.unwrap();

assert_eq!(db_request.new_region_id, None);
assert_eq!(db_request.operating_saga_id, None);

if !matches!(
db_request.replacement_state,
RegionSnapshotReplacementState::Requested
) {
eprintln!(
"loop {i} Failed {:?} != Requested",
Contributor: What state was this in when you saw this? This seems like a failure to properly unwind the saga, as it should never have reached this check without going through rsrss_set_saga_id_undo. Likewise, the assert_eq above checking operating_saga_id should have fired if the request was in an intermediate state.

Contributor (author): It does eventually get to the Requested state. When I saw the panic, it would be in the Allocating state; at least that's what I had written down in an earlier comment.

db_request.replacement_state
);
// 200 * 5 = 1000 seconds; at this point something is wrong.
if i > 200 {
panic!("Failed to reach requested state after {i} tries");
}
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
} else {
break;
}
i += 1;
}
}

async fn assert_volume_untouched(
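The hand-rolled 5-second, 200-iteration poll above could also be expressed with the wait_for_condition helper that the crucible_replacements tests in this PR already use. The following is a minimal sketch, not part of the diff, under the assumption that the helper and CondCheckError live at omicron_test_utils::dev::poll; it borrows datastore, opctx, and request from the surrounding test, and its interval and cap mirror the loop above (200 polls at 5 seconds each, about 1000 seconds):

// Assumed import path for the polling helper used by other tests here.
use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};

wait_for_condition(
    || async {
        let db_request = datastore
            .get_region_snapshot_replacement_request_by_id(&opctx, request.id)
            .await
            .unwrap();

        // These should hold on every poll, not just the final one.
        assert_eq!(db_request.new_region_id, None);
        assert_eq!(db_request.operating_saga_id, None);

        if matches!(
            db_request.replacement_state,
            RegionSnapshotReplacementState::Requested
        ) {
            Ok(())
        } else {
            // Not back to Requested yet; sleep for the interval and poll again.
            Err(CondCheckError::<()>::NotYet)
        }
    },
    &std::time::Duration::from_secs(5),
    &std::time::Duration::from_secs(1000),
)
.await
.expect("request returned to the Requested state");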
2 changes: 1 addition & 1 deletion nexus/test-utils/src/resource_helpers.rs
@@ -1685,7 +1685,7 @@ impl<'a, N: NexusServer> DiskTest<'a, N> {
}
},
&Duration::from_millis(50),
&Duration::from_secs(30),
&Duration::from_secs(120),
)
.await
.expect("expected to find inventory collection");
122 changes: 86 additions & 36 deletions nexus/tests/integration_tests/crucible_replacements.rs
@@ -203,7 +203,7 @@ pub(crate) async fn wait_for_all_replacements(
}
},
&std::time::Duration::from_millis(50),
&std::time::Duration::from_secs(180),
&std::time::Duration::from_secs(260),
)
.await
.expect("all replacements finished");
@@ -481,7 +481,7 @@ mod region_replacement {
}
},
&std::time::Duration::from_millis(50),
&std::time::Duration::from_secs(60),
&std::time::Duration::from_secs(260),
)
.await
.expect("request transitioned to expected state");
@@ -1071,24 +1071,34 @@ async fn test_racing_replacements_for_soft_deleted_disk_volume(
activate_background_task(&internal_client, "region_replacement_driver")
.await;

assert!(match last_background_task.last {
let res = match last_background_task.last {
LastResult::Completed(last_result_completed) => {
match serde_json::from_value::<RegionReplacementDriverStatus>(
last_result_completed.details,
) {
Err(e) => {
eprintln!("Json not what we expected");
eprintln!("{e}");
false
}

Ok(v) => !v.drive_invoked_ok.is_empty(),
Ok(v) => {
if !v.drive_invoked_ok.is_empty() {
eprintln!("v.drive_ok: {:?}", v.drive_invoked_ok);
true
} else {
eprintln!("v.drive_ok: {:?} empty", v.drive_invoked_ok);
false
}
}
}
}

_ => {
x => {
eprintln!("Unexpected result here: {:?}", x);
false
}
});
};
assert!(res);

// wait for the drive saga to complete here
wait_for_condition(
@@ -1485,20 +1495,33 @@ mod region_snapshot_replacement {

// Assert no volumes are referencing the snapshot address

let volumes = self
.datastore
.find_volumes_referencing_socket_addr(
&self.opctx(),
self.snapshot_socket_addr,
)
.await
.unwrap();
let mut counter = 1;
loop {
let volumes = self
.datastore
.find_volumes_referencing_socket_addr(
&self.opctx(),
self.snapshot_socket_addr,
)
.await
.unwrap();

if !volumes.is_empty() {
eprintln!("{:?}", volumes);
if !volumes.is_empty() {
eprintln!(
"Volume should be gone, try {counter} {:?}",
volumes
);
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
counter += 1;
if counter > 200 {
panic!(
"Tried 200 times, and still this did not finish"
);
}
} else {
break;
}
}

assert!(volumes.is_empty());
}

/// Assert no Crucible resources are leaked
@@ -1649,32 +1672,59 @@
match result {
InsertStepResult::Inserted { .. } => {}

_ => {
assert!(
false,
"bad result from create_region_snapshot_replacement_step"
InsertStepResult::AlreadyHandled { existing_step_id } => {
let region_snapshot_replace_request = self
.datastore
.get_region_snapshot_replacement_request_by_id(
&self.opctx(),
existing_step_id,
)
.await;
eprintln!(
"we were suppose to create this: {:?} but found it AlreadyHandled, then got {:?}",
Contributor: I like this error message better, but: did you see this?

Contributor (author): I did see it in the beginning, yes. After changing timeouts, though, I never got it to fire again.

self.replacement_request_id,
region_snapshot_replace_request
);
panic!("Something else created our replacement");
}
}
}

pub async fn assert_read_only_target_gone(&self) {
let region_snapshot_replace_request = self
.datastore
.get_region_snapshot_replacement_request_by_id(
&self.opctx(),
self.replacement_request_id,
)
.await
.unwrap();
eprintln!(
"Starting, replace_request_id: {:?}",
self.replacement_request_id
);
let mut i = 1;
loop {
let region_snapshot_replace_request = self
.datastore
.get_region_snapshot_replacement_request_by_id(
&self.opctx(),
self.replacement_request_id,
)
.await
.unwrap();
eprintln!(
"In loop {i} with rs_replace_request: {:?}",
region_snapshot_replace_request
);

assert!(
self.datastore
let res = self
.datastore
.read_only_target_addr(&region_snapshot_replace_request)
.await
.unwrap()
.is_none()
);
.unwrap();

eprintln!("In loop {i} target that should be gone: {:?}", res);
if res.is_none() {
// test passed, move on
break;
}
eprintln!("loop {i}, snapshot that should be gone: {:?}", res);
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
i += 1;
}
}

pub async fn remove_disk_from_snapshot_rop(&self) {
2 changes: 1 addition & 1 deletion nexus/tests/integration_tests/instances.rs
@@ -6891,7 +6891,7 @@ pub async fn instance_wait_for_state_as(
instance_id: InstanceUuid,
state: omicron_common::api::external::InstanceState,
) -> Instance {
const MAX_WAIT: Duration = Duration::from_secs(120);
const MAX_WAIT: Duration = Duration::from_secs(320);

slog::info!(
&client.client_log,
14 changes: 12 additions & 2 deletions sled-agent/src/sim/http_entrypoints_pantry.rs
@@ -399,8 +399,18 @@ mod tests {
let raw_url = format!(
"https://raw.githubusercontent.com/oxidecomputer/crucible/{part}/openapi/crucible-pantry.json",
);
let raw_json =
reqwest::blocking::get(&raw_url).unwrap().text().unwrap();

// The default timeout of 30 seconds was sometimes not enough under
// heavy load.
let raw_json = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(120))
.build()
.unwrap()
.get(&raw_url)
.send()
.unwrap()
.text()
.unwrap();
serde_json::from_str(&raw_json).unwrap()
}
