Skip to content

Commit d70dec4

Browse files
committed
TQ: Remove Faults from test-utils TqState
`Faults` has become a layer of indirection for reaching `crashed_nodes`. Early on when writing this test I figured that we'd have separate actions for connecting and disconnecting nodes in addition to crashing and restarting them. While I didn't open the possibility to asymmetric connectivity (hard to do realistically with TLS!), I made it so that we could track connectivity between alive nodes. With further reflection this seems unnecessary. As of #8993, we crash and restart nodes. We anticipate on restart that every alive node will reconnect at some point. And reconection can trigger the sending of messages destined for a crashed node. This is how retries are implemented in this connection oriented protocol. So the only real thing we are trying to ensure is that those retried messages get interleaved upon connection and don't always end up delivered in the same order at the destination node. This is accomplished by randomising the connection order. If we decide later on that we want to interleave connections via a new action we can add similar logic and remove the automatic `on_connect` calls..
1 parent fe1fe40 commit d70dec4

File tree

2 files changed

+57
-115
lines changed

2 files changed

+57
-115
lines changed

trust-quorum/test-utils/src/state.rs

Lines changed: 47 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ pub struct TqState {
5353
pub member_universe: Vec<PlatformId>,
5454

5555
/// All possible system faults in our test
56-
pub faults: Faults,
56+
pub crashed_nodes: BTreeSet<PlatformId>,
5757

5858
/// All configurations ever generated by a coordinator.
5959
///
@@ -79,7 +79,7 @@ impl TqState {
7979
underlay_network: Vec::new(),
8080
nexus: NexusState::new(),
8181
member_universe,
82-
faults: Faults::default(),
82+
crashed_nodes: BTreeSet::new(),
8383
all_coordinated_configs: IdOrdMap::new(),
8484
expunged: BTreeSet::new(),
8585
}
@@ -91,7 +91,7 @@ impl TqState {
9191
pub fn send_reconfigure_msg(&mut self) {
9292
let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
9393
let epoch_to_config = msg.epoch;
94-
if !self.faults.crashed_nodes.contains(coordinator) {
94+
if !self.crashed_nodes.contains(coordinator) {
9595
let (node, ctx) = self
9696
.sut
9797
.nodes
@@ -122,7 +122,7 @@ impl TqState {
122122
let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
123123

124124
// The coordinator should have received the `ReconfigureMsg` from Nexus
125-
if !self.faults.crashed_nodes.contains(coordinator) {
125+
if !self.crashed_nodes.contains(coordinator) {
126126
let (node, ctx) = self
127127
.sut
128128
.nodes
@@ -131,22 +131,24 @@ impl TqState {
131131
let mut connected_members = 0;
132132
// The coordinator should start preparing by sending a `PrepareMsg` to all
133133
// connected nodes in the membership set.
134-
for member in
135-
msg.members.iter().filter(|&id| id != coordinator).cloned()
134+
for member in msg
135+
.members
136+
.iter()
137+
.filter(|&id| {
138+
!self.crashed_nodes.contains(id) && id != ctx.platform_id()
139+
})
140+
.cloned()
136141
{
137-
if self.faults.is_connected(coordinator.clone(), member.clone())
138-
{
139-
connected_members += 1;
140-
let msg_found = ctx.envelopes().any(|envelope| {
141-
envelope.to == member
142-
&& envelope.from == *coordinator
143-
&& matches!(
144-
envelope.msg.kind,
145-
PeerMsgKind::Prepare { .. }
146-
)
147-
});
148-
assert!(msg_found);
149-
}
142+
connected_members += 1;
143+
let msg_found = ctx.envelopes().any(|envelope| {
144+
envelope.to == member
145+
&& envelope.from == *coordinator
146+
&& matches!(
147+
envelope.msg.kind,
148+
PeerMsgKind::Prepare { .. }
149+
)
150+
});
151+
assert!(msg_found);
150152
}
151153
assert_eq!(connected_members, ctx.envelopes().count());
152154

@@ -176,7 +178,7 @@ impl TqState {
176178
// Only send envelopes to alive nodes
177179
for envelope in ctx
178180
.drain_envelopes()
179-
.filter(|e| !self.faults.crashed_nodes.contains(&e.to))
181+
.filter(|e| !self.crashed_nodes.contains(&e.to))
180182
{
181183
let msgs =
182184
self.bootstrap_network.entry(envelope.to.clone()).or_default();
@@ -241,7 +243,7 @@ impl TqState {
241243
// Create the SUT nodes
242244
self.sut = Sut::new(&self.log, self.member_universe.clone());
243245

244-
self.faults.crashed_nodes = crashed_nodes;
246+
self.crashed_nodes = crashed_nodes;
245247

246248
// Inform nexus about the initial configuration
247249
self.nexus.configs.insert_unique(config).expect("new config");
@@ -251,11 +253,13 @@ impl TqState {
251253
.sut
252254
.nodes
253255
.iter_mut()
254-
.filter(|(id, _)| !self.faults.crashed_nodes.contains(id))
256+
.filter(|(id, _)| !self.crashed_nodes.contains(id))
255257
{
256-
for to in self.member_universe.iter().filter(|id| {
257-
!self.faults.crashed_nodes.contains(id) && from != *id
258-
}) {
258+
for to in self
259+
.member_universe
260+
.iter()
261+
.filter(|id| !self.crashed_nodes.contains(id) && from != *id)
262+
{
259263
node.on_connect(ctx, to.clone());
260264
}
261265
}
@@ -339,7 +343,7 @@ impl TqState {
339343
self.bootstrap_network.remove(&id);
340344

341345
// Keep track of the crashed node
342-
self.faults.crashed_nodes.insert(id.clone());
346+
self.crashed_nodes.insert(id.clone());
343347

344348
// We get to define the semantics of the network with regards to an
345349
// inflight message sourced from a crashed node. We have two choices:
@@ -355,7 +359,7 @@ impl TqState {
355359
.sut
356360
.nodes
357361
.iter_mut()
358-
.filter(|(id, _)| !self.faults.crashed_nodes.contains(id))
362+
.filter(|(id, _)| !self.crashed_nodes.contains(id))
359363
{
360364
node.on_disconnect(ctx, id.clone());
361365
}
@@ -367,7 +371,7 @@ impl TqState {
367371
connection_order: Vec<PlatformId>,
368372
) {
369373
// The node is no longer crashed.
370-
self.faults.crashed_nodes.remove(&id);
374+
self.crashed_nodes.remove(&id);
371375

372376
// We need to clear the mutable state of the `Node`. We do this by
373377
// creating a new `Node` and passing in the existing context which
@@ -390,14 +394,18 @@ impl TqState {
390394
send_envelopes(
391395
peer_ctx,
392396
&mut self.bootstrap_network,
393-
&mut self.faults,
397+
&self.crashed_nodes,
394398
);
395399

396400
let (node, ctx) = self.sut.nodes.get_mut(&id).expect("node exists");
397401
// Inform the restarted node of the connection
398402
node.on_connect(ctx, peer);
399403
// Send any messages output as a result of the connection
400-
send_envelopes(ctx, &mut self.bootstrap_network, &self.faults);
404+
send_envelopes(
405+
ctx,
406+
&mut self.bootstrap_network,
407+
&self.crashed_nodes,
408+
);
401409
}
402410
}
403411

@@ -478,7 +486,7 @@ impl TqState {
478486
}
479487

480488
// Send any messages as a result of handling this message
481-
send_envelopes(ctx, &mut self.bootstrap_network, &self.faults);
489+
send_envelopes(ctx, &mut self.bootstrap_network, &self.crashed_nodes);
482490

483491
// Remove any destinations with zero messages in-flight
484492
self.bootstrap_network.retain(|_, msgs| !msgs.is_empty());
@@ -536,10 +544,10 @@ impl TqState {
536544
fn send_envelopes(
537545
ctx: &mut NodeCtx,
538546
bootstrap_network: &mut BTreeMap<PlatformId, Vec<Envelope>>,
539-
faults: &Faults,
547+
crashed_nodes: &BTreeSet<PlatformId>,
540548
) {
541549
for envelope in
542-
ctx.drain_envelopes().filter(|e| !faults.crashed_nodes.contains(&e.to))
550+
ctx.drain_envelopes().filter(|e| !crashed_nodes.contains(&e.to))
543551
{
544552
let envelopes =
545553
bootstrap_network.entry(envelope.to.clone()).or_default();
@@ -574,55 +582,6 @@ impl Sut {
574582
}
575583
}
576584

577-
/// Faults in our system. It's useful to keep these self contained and not
578-
/// in separate fields in `TestState` so that we can access them all at once
579-
/// independently of other `TestState` fields.
580-
#[derive(Default, Debug, Clone, Diffable)]
581-
pub struct Faults {
582-
// We allow nodes to crash and restart and therefore track crashed nodes here.
583-
//
584-
// A crashed node is implicitly disconnected from every other node. We don't
585-
// bother storing the pairs in `disconnected_nodes`, but instead check both
586-
// fields when necessary.
587-
pub crashed_nodes: BTreeSet<PlatformId>,
588-
589-
/// The set of disconnected nodes
590-
pub disconnected_nodes: DisconnectedNodes,
591-
}
592-
593-
impl Faults {
594-
pub fn is_connected(&self, node1: PlatformId, node2: PlatformId) -> bool {
595-
!self.crashed_nodes.contains(&node1)
596-
&& !self.crashed_nodes.contains(&node2)
597-
&& !self.disconnected_nodes.contains(node1, node2)
598-
}
599-
}
600-
601-
/// For cardinality purposes, we assume all nodes are connected and explicitly
602-
/// disconnect some of them. This allows us to track and compare much less data.
603-
#[derive(Default, Debug, Clone, Diffable)]
604-
pub struct DisconnectedNodes {
605-
// We sort each pair on insert for quick lookups
606-
pairs: BTreeSet<(PlatformId, PlatformId)>,
607-
}
608-
609-
impl DisconnectedNodes {
610-
// Return true if the pair is newly inserted
611-
pub fn insert(&mut self, node1: PlatformId, node2: PlatformId) -> bool {
612-
assert_ne!(node1, node2);
613-
614-
let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
615-
self.pairs.insert(pair)
616-
}
617-
618-
// Return true if the pair of nodes is disconnected, false otherwise
619-
pub fn contains(&self, node1: PlatformId, node2: PlatformId) -> bool {
620-
assert_ne!(node1, node2);
621-
let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
622-
self.pairs.contains(&pair)
623-
}
624-
}
625-
626585
/*****************************************************************************
627586
*
628587
* Diff related display code
@@ -657,7 +616,7 @@ impl Display for TqStateDiff<'_> {
657616
display_bootstrap_network_diff(&self.bootstrap_network, f)?;
658617
display_underlay_network_diff(&self.underlay_network, f)?;
659618
display_nexus_state_diff(&self.nexus, f)?;
660-
display_faults_diff(&self.faults, f)?;
619+
display_faults_diff(&self.crashed_nodes, f)?;
661620
display_expunged_diff(&self.expunged, f)?;
662621

663622
Ok(())
@@ -678,34 +637,22 @@ fn display_expunged_diff(
678637
}
679638

680639
fn display_faults_diff(
681-
diff: &FaultsDiff<'_>,
640+
crashed_nodes: &BTreeSetDiff<'_, PlatformId>,
682641
f: &mut std::fmt::Formatter<'_>,
683642
) -> std::fmt::Result {
684-
if !diff.crashed_nodes.added.is_empty() {
643+
if !crashed_nodes.added.is_empty() {
685644
writeln!(f, " Nodes crashed:")?;
686-
for id in &diff.crashed_nodes.added {
645+
for id in &crashed_nodes.added {
687646
writeln!(f, " {id}")?;
688647
}
689648
}
690-
if !diff.crashed_nodes.removed.is_empty() {
649+
if !crashed_nodes.removed.is_empty() {
691650
writeln!(f, " nodes started:")?;
692-
for id in &diff.crashed_nodes.removed {
651+
for id in &crashed_nodes.removed {
693652
writeln!(f, " {id}")?;
694653
}
695654
}
696655

697-
if !diff.disconnected_nodes.pairs.added.is_empty() {
698-
writeln!(f, " nodes disconnected from each other:")?;
699-
for pair in &diff.disconnected_nodes.pairs.added {
700-
writeln!(f, " {}, {}", pair.0, pair.1)?;
701-
}
702-
}
703-
if !diff.disconnected_nodes.pairs.removed.is_empty() {
704-
writeln!(f, " nodes connected to each other:")?;
705-
for pair in &diff.disconnected_nodes.pairs.removed {
706-
writeln!(f, " {}, {}", pair.0, pair.1)?;
707-
}
708-
}
709656
Ok(())
710657
}
711658

trust-quorum/tests/cluster.rs

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ impl TestState {
172172
// bootstrap network. We could choose not to actually deliver it when
173173
// applying the event, but that means we have events that don't actually
174174
// do anything in our event log, which is quite misleading.
175-
assert!(!self.tq_state.faults.crashed_nodes.contains(destination));
175+
assert!(!self.tq_state.crashed_nodes.contains(destination));
176176

177177
// We pop from the back and push on the front
178178
let envelope = self
@@ -192,7 +192,7 @@ impl TestState {
192192
.tq_state
193193
.member_universe
194194
.iter()
195-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(&m))
195+
.filter(|m| !self.tq_state.crashed_nodes.contains(&m))
196196
.peekable();
197197

198198
if faultable.peek().is_none() {
@@ -222,12 +222,12 @@ impl TestState {
222222
id: Selector,
223223
connection_order_indexes: Vec<Index>,
224224
) -> Vec<Event> {
225-
if self.tq_state.faults.crashed_nodes.is_empty() {
225+
if self.tq_state.crashed_nodes.is_empty() {
226226
return vec![];
227227
}
228228

229229
// Choose the node to restart
230-
let id = id.select(self.tq_state.faults.crashed_nodes.iter()).clone();
230+
let id = id.select(self.tq_state.crashed_nodes.iter()).clone();
231231

232232
// Now order the peer connections
233233

@@ -236,7 +236,7 @@ impl TestState {
236236
.tq_state
237237
.member_universe
238238
.iter()
239-
.filter(|id| !self.tq_state.faults.crashed_nodes.contains(id))
239+
.filter(|id| !self.tq_state.crashed_nodes.contains(id))
240240
.cloned()
241241
.collect();
242242

@@ -272,7 +272,7 @@ impl TestState {
272272
let mut loadable = c
273273
.members
274274
.iter()
275-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
275+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
276276
.peekable();
277277
if loadable.peek().is_some() {
278278
let id = selector.select(loadable).clone();
@@ -311,7 +311,7 @@ impl TestState {
311311
let mut loadable = c
312312
.members
313313
.iter()
314-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
314+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
315315
.peekable();
316316
if loadable.peek().is_none() {
317317
return vec![];
@@ -329,12 +329,7 @@ impl TestState {
329329
}
330330

331331
// If the coordinator is currently down then Nexus should abort.
332-
if self
333-
.tq_state
334-
.faults
335-
.crashed_nodes
336-
.contains(&latest_config.coordinator)
337-
{
332+
if self.tq_state.crashed_nodes.contains(&latest_config.coordinator) {
338333
events.push(Event::AbortConfiguration(latest_config.epoch));
339334
return events;
340335
}
@@ -387,7 +382,7 @@ impl TestState {
387382
let committable: Vec<_> = latest_config
388383
.prepared_members
389384
.difference(&latest_config.committed_members)
390-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
385+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
391386
.collect();
392387

393388
if committable.is_empty() {
@@ -534,7 +529,7 @@ impl TestState {
534529
);
535530
let mut events = vec![Event::Reconfigure(nexus_config)];
536531

537-
if self.tq_state.faults.crashed_nodes.contains(&coordinator) {
532+
if self.tq_state.crashed_nodes.contains(&coordinator) {
538533
// This simulates a timeout on the reply from the coordinator which
539534
// triggers an abort.
540535
events.push(Event::AbortConfiguration(epoch));

0 commit comments

Comments
 (0)