Skip to content

Commit bd56c5f

Browse files
committed
TQ: Remove Faults from test-utils TqState
`Faults` has become a layer of indirection for reaching `crashed_nodes`. Early on when writing this test I figured that we'd have separate actions for connecting and disconnecting nodes in addition to crashing and restarting them. While I didn't open the possibility to asymmetric connectivity (hard to do realistically with TLS!), I made it so that we could track connectivity between alive nodes. With further reflection this seems unnecessary. As of #8993, we crash and restart nodes. We anticipate on restart that every alive node will reconnect at some point. And reconection can trigger the sending of messages destined for a crashed node. This is how retries are implemented in this connection oriented protocol. So the only real thing we are trying to ensure is that those retried messages get interleaved upon connection and don't always end up delivered in the same order at the destination node. This is accomplished by randomising the connection order. If we decide later on that we want to interleave connections via a new action we can add similar logic and remove the automatic `on_connect` calls..
1 parent 847f524 commit bd56c5f

File tree

2 files changed

+55
-113
lines changed

2 files changed

+55
-113
lines changed

trust-quorum/test-utils/src/state.rs

Lines changed: 46 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ pub struct TqState {
5353
pub member_universe: Vec<PlatformId>,
5454

5555
/// All possible system faults in our test
56-
pub faults: Faults,
56+
pub crashed_nodes: BTreeSet<PlatformId>,
5757

5858
/// All configurations ever generated by a coordinator.
5959
///
@@ -79,7 +79,7 @@ impl TqState {
7979
underlay_network: Vec::new(),
8080
nexus: NexusState::new(),
8181
member_universe,
82-
faults: Faults::default(),
82+
crashed_nodes: BTreeSet::new(),
8383
all_coordinated_configs: IdOrdMap::new(),
8484
expunged: BTreeSet::new(),
8585
}
@@ -91,7 +91,7 @@ impl TqState {
9191
pub fn send_reconfigure_msg(&mut self) {
9292
let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
9393
let epoch_to_config = msg.epoch;
94-
if self.faults.crashed_nodes.contains(coordinator) {
94+
if self.crashed_nodes.contains(coordinator) {
9595
// We must abort the configuration. This mimics a timeout.
9696
self.nexus.abort_reconfiguration();
9797
} else {
@@ -125,7 +125,7 @@ impl TqState {
125125
let (coordinator, msg) = self.nexus.reconfigure_msg_for_latest_config();
126126

127127
// The coordinator should have received the `ReconfigureMsg` from Nexus
128-
if !self.faults.crashed_nodes.contains(coordinator) {
128+
if !self.crashed_nodes.contains(coordinator) {
129129
let (node, ctx) = self
130130
.sut
131131
.nodes
@@ -134,22 +134,24 @@ impl TqState {
134134
let mut connected_members = 0;
135135
// The coordinator should start preparing by sending a `PrepareMsg` to all
136136
// connected nodes in the membership set.
137-
for member in
138-
msg.members.iter().filter(|&id| id != coordinator).cloned()
137+
for member in msg
138+
.members
139+
.iter()
140+
.filter(|&id| {
141+
!self.crashed_nodes.contains(id) && id != ctx.platform_id()
142+
})
143+
.cloned()
139144
{
140-
if self.faults.is_connected(coordinator.clone(), member.clone())
141-
{
142-
connected_members += 1;
143-
let msg_found = ctx.envelopes().any(|envelope| {
144-
envelope.to == member
145-
&& envelope.from == *coordinator
146-
&& matches!(
147-
envelope.msg.kind,
148-
PeerMsgKind::Prepare { .. }
149-
)
150-
});
151-
assert!(msg_found);
152-
}
145+
connected_members += 1;
146+
let msg_found = ctx.envelopes().any(|envelope| {
147+
envelope.to == member
148+
&& envelope.from == *coordinator
149+
&& matches!(
150+
envelope.msg.kind,
151+
PeerMsgKind::Prepare { .. }
152+
)
153+
});
154+
assert!(msg_found);
153155
}
154156
assert_eq!(connected_members, ctx.envelopes().count());
155157

@@ -179,7 +181,7 @@ impl TqState {
179181
// Only send envelopes to alive nodes
180182
for envelope in ctx
181183
.drain_envelopes()
182-
.filter(|e| !self.faults.crashed_nodes.contains(&e.to))
184+
.filter(|e| !self.crashed_nodes.contains(&e.to))
183185
{
184186
let msgs =
185187
self.bootstrap_network.entry(envelope.to.clone()).or_default();
@@ -244,7 +246,7 @@ impl TqState {
244246
// Create the SUT nodes
245247
self.sut = Sut::new(&self.log, self.member_universe.clone());
246248

247-
self.faults.crashed_nodes = crashed_nodes;
249+
self.crashed_nodes = crashed_nodes;
248250

249251
// Inform nexus about the initial configuration
250252
self.nexus.configs.insert_unique(config).expect("new config");
@@ -254,11 +256,13 @@ impl TqState {
254256
.sut
255257
.nodes
256258
.iter_mut()
257-
.filter(|(id, _)| !self.faults.crashed_nodes.contains(id))
259+
.filter(|(id, _)| !self.crashed_nodes.contains(id))
258260
{
259-
for to in self.member_universe.iter().filter(|id| {
260-
!self.faults.crashed_nodes.contains(id) && from != *id
261-
}) {
261+
for to in self
262+
.member_universe
263+
.iter()
264+
.filter(|id| !self.crashed_nodes.contains(id) && from != *id)
265+
{
262266
node.on_connect(ctx, to.clone());
263267
}
264268
}
@@ -342,7 +346,7 @@ impl TqState {
342346
self.bootstrap_network.remove(&id);
343347

344348
// Keep track of the crashed node
345-
self.faults.crashed_nodes.insert(id.clone());
349+
self.crashed_nodes.insert(id.clone());
346350

347351
// We get to define the semantics of the network with regards to an
348352
// inflight message sourced from a crashed node. We have two choices:
@@ -358,7 +362,7 @@ impl TqState {
358362
.sut
359363
.nodes
360364
.iter_mut()
361-
.filter(|(id, _)| !self.faults.crashed_nodes.contains(id))
365+
.filter(|(id, _)| !self.crashed_nodes.contains(id))
362366
{
363367
node.on_disconnect(ctx, id.clone());
364368
}
@@ -390,14 +394,18 @@ impl TqState {
390394
send_envelopes(
391395
peer_ctx,
392396
&mut self.bootstrap_network,
393-
&mut self.faults,
397+
&self.crashed_nodes,
394398
);
395399

396400
let (node, ctx) = self.sut.nodes.get_mut(&id).expect("node exists");
397401
// Inform the restarted node of the connection
398402
node.on_connect(ctx, peer);
399403
// Send any messages output as a result of the connection
400-
send_envelopes(ctx, &mut self.bootstrap_network, &self.faults);
404+
send_envelopes(
405+
ctx,
406+
&mut self.bootstrap_network,
407+
&self.crashed_nodes,
408+
);
401409
}
402410
}
403411

@@ -478,7 +486,7 @@ impl TqState {
478486
}
479487

480488
// Send any messages as a result of handling this message
481-
send_envelopes(ctx, &mut self.bootstrap_network, &self.faults);
489+
send_envelopes(ctx, &mut self.bootstrap_network, &self.crashed_nodes);
482490

483491
// Remove any destinations with zero messages in-flight
484492
self.bootstrap_network.retain(|_, msgs| !msgs.is_empty());
@@ -536,10 +544,10 @@ impl TqState {
536544
fn send_envelopes(
537545
ctx: &mut NodeCtx,
538546
bootstrap_network: &mut BTreeMap<PlatformId, Vec<Envelope>>,
539-
faults: &Faults,
547+
crashed_nodes: &BTreeSet<PlatformId>,
540548
) {
541549
for envelope in
542-
ctx.drain_envelopes().filter(|e| !faults.crashed_nodes.contains(&e.to))
550+
ctx.drain_envelopes().filter(|e| !crashed_nodes.contains(&e.to))
543551
{
544552
let envelopes =
545553
bootstrap_network.entry(envelope.to.clone()).or_default();
@@ -574,55 +582,6 @@ impl Sut {
574582
}
575583
}
576584

577-
/// Faults in our system. It's useful to keep these self contained and not
578-
/// in separate fields in `TestState` so that we can access them all at once
579-
/// independently of other `TestState` fields.
580-
#[derive(Default, Debug, Clone, Diffable)]
581-
pub struct Faults {
582-
// We allow nodes to crash and restart and therefore track crashed nodes here.
583-
//
584-
// A crashed node is implicitly disconnected from every other node. We don't
585-
// bother storing the pairs in `disconnected_nodes`, but instead check both
586-
// fields when necessary.
587-
pub crashed_nodes: BTreeSet<PlatformId>,
588-
589-
/// The set of disconnected nodes
590-
pub disconnected_nodes: DisconnectedNodes,
591-
}
592-
593-
impl Faults {
594-
pub fn is_connected(&self, node1: PlatformId, node2: PlatformId) -> bool {
595-
!self.crashed_nodes.contains(&node1)
596-
&& !self.crashed_nodes.contains(&node2)
597-
&& !self.disconnected_nodes.contains(node1, node2)
598-
}
599-
}
600-
601-
/// For cardinality purposes, we assume all nodes are connected and explicitly
602-
/// disconnect some of them. This allows us to track and compare much less data.
603-
#[derive(Default, Debug, Clone, Diffable)]
604-
pub struct DisconnectedNodes {
605-
// We sort each pair on insert for quick lookups
606-
pairs: BTreeSet<(PlatformId, PlatformId)>,
607-
}
608-
609-
impl DisconnectedNodes {
610-
// Return true if the pair is newly inserted
611-
pub fn insert(&mut self, node1: PlatformId, node2: PlatformId) -> bool {
612-
assert_ne!(node1, node2);
613-
614-
let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
615-
self.pairs.insert(pair)
616-
}
617-
618-
// Return true if the pair of nodes is disconnected, false otherwise
619-
pub fn contains(&self, node1: PlatformId, node2: PlatformId) -> bool {
620-
assert_ne!(node1, node2);
621-
let pair = if node1 < node2 { (node1, node2) } else { (node2, node1) };
622-
self.pairs.contains(&pair)
623-
}
624-
}
625-
626585
/*****************************************************************************
627586
*
628587
* Diff related display code
@@ -657,7 +616,7 @@ impl Display for TqStateDiff<'_> {
657616
display_bootstrap_network_diff(&self.bootstrap_network, f)?;
658617
display_underlay_network_diff(&self.underlay_network, f)?;
659618
display_nexus_state_diff(&self.nexus, f)?;
660-
display_faults_diff(&self.faults, f)?;
619+
display_faults_diff(&self.crashed_nodes, f)?;
661620
display_expunged_diff(&self.expunged, f)?;
662621

663622
Ok(())
@@ -678,34 +637,22 @@ fn display_expunged_diff(
678637
}
679638

680639
fn display_faults_diff(
681-
diff: &FaultsDiff<'_>,
640+
crashed_nodes: &BTreeSetDiff<'_, PlatformId>,
682641
f: &mut std::fmt::Formatter<'_>,
683642
) -> std::fmt::Result {
684-
if !diff.crashed_nodes.added.is_empty() {
643+
if !crashed_nodes.added.is_empty() {
685644
writeln!(f, " Nodes crashed:")?;
686-
for id in &diff.crashed_nodes.added {
645+
for id in &crashed_nodes.added {
687646
writeln!(f, " {id}")?;
688647
}
689648
}
690-
if !diff.crashed_nodes.removed.is_empty() {
649+
if !crashed_nodes.removed.is_empty() {
691650
writeln!(f, " nodes started:")?;
692-
for id in &diff.crashed_nodes.removed {
651+
for id in &crashed_nodes.removed {
693652
writeln!(f, " {id}")?;
694653
}
695654
}
696655

697-
if !diff.disconnected_nodes.pairs.added.is_empty() {
698-
writeln!(f, " nodes disconnected from each other:")?;
699-
for pair in &diff.disconnected_nodes.pairs.added {
700-
writeln!(f, " {}, {}", pair.0, pair.1)?;
701-
}
702-
}
703-
if !diff.disconnected_nodes.pairs.removed.is_empty() {
704-
writeln!(f, " nodes connected to each other:")?;
705-
for pair in &diff.disconnected_nodes.pairs.removed {
706-
writeln!(f, " {}, {}", pair.0, pair.1)?;
707-
}
708-
}
709656
Ok(())
710657
}
711658

trust-quorum/tests/cluster.rs

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ impl TestState {
166166
// bootstrap network. We could choose not to actually deliver it when
167167
// applying the event, but that means we have events that don't actually
168168
// do anything in our event log, which is quite misleading.
169-
assert!(!self.tq_state.faults.crashed_nodes.contains(destination));
169+
assert!(!self.tq_state.crashed_nodes.contains(destination));
170170

171171
// We pop from the back and push on the front
172172
let envelope = self
@@ -186,7 +186,7 @@ impl TestState {
186186
.tq_state
187187
.member_universe
188188
.iter()
189-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(&m))
189+
.filter(|m| !self.tq_state.crashed_nodes.contains(&m))
190190
.peekable();
191191

192192
if faultable.peek().is_none() {
@@ -203,12 +203,12 @@ impl TestState {
203203
id: Selector,
204204
connection_order_indexes: Vec<Index>,
205205
) -> Vec<Event> {
206-
if self.tq_state.faults.crashed_nodes.is_empty() {
206+
if self.tq_state.crashed_nodes.is_empty() {
207207
return vec![];
208208
}
209209

210210
// Choose the node to restart
211-
let id = id.select(self.tq_state.faults.crashed_nodes.iter()).clone();
211+
let id = id.select(self.tq_state.crashed_nodes.iter()).clone();
212212

213213
// Now order the peer connections
214214

@@ -217,7 +217,7 @@ impl TestState {
217217
.tq_state
218218
.member_universe
219219
.iter()
220-
.filter(|id| !self.tq_state.faults.crashed_nodes.contains(id))
220+
.filter(|id| !self.tq_state.crashed_nodes.contains(id))
221221
.cloned()
222222
.collect();
223223

@@ -253,7 +253,7 @@ impl TestState {
253253
let mut loadable = c
254254
.members
255255
.iter()
256-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
256+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
257257
.peekable();
258258
if loadable.peek().is_some() {
259259
let id = selector.select(loadable).clone();
@@ -292,7 +292,7 @@ impl TestState {
292292
let mut loadable = c
293293
.members
294294
.iter()
295-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
295+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
296296
.peekable();
297297
if loadable.peek().is_none() {
298298
return vec![];
@@ -311,12 +311,7 @@ impl TestState {
311311

312312
// If the coordinator has crashed then Nexus should abort.
313313
// Crashing is not actually implemented yet, but it will be.
314-
if self
315-
.tq_state
316-
.faults
317-
.crashed_nodes
318-
.contains(&latest_config.coordinator)
319-
{
314+
if self.tq_state.crashed_nodes.contains(&latest_config.coordinator) {
320315
events.push(Event::AbortConfiguration(latest_config.epoch));
321316
return events;
322317
}
@@ -369,7 +364,7 @@ impl TestState {
369364
let committable: Vec<_> = latest_config
370365
.prepared_members
371366
.difference(&latest_config.committed_members)
372-
.filter(|m| !self.tq_state.faults.crashed_nodes.contains(m))
367+
.filter(|m| !self.tq_state.crashed_nodes.contains(m))
373368
.collect();
374369

375370
if committable.is_empty() {

0 commit comments

Comments
 (0)