@@ -139,6 +139,10 @@ impl TestState {
139
139
threshold,
140
140
coordinator,
141
141
) ,
142
+ Action :: CrashNode ( index) => self . action_to_events_crash_node ( index) ,
143
+ Action :: RestartNode { id, connection_order } => {
144
+ self . action_to_events_restart_node ( id, connection_order)
145
+ }
142
146
}
143
147
}
144
148
@@ -154,6 +158,16 @@ impl TestState {
154
158
let destination =
155
159
selector. select ( self . tq_state . bootstrap_network . keys ( ) ) ;
156
160
161
+ // Envelopes should never be sent on the bootstrap network to crashed nodes
162
+ // when events are applied in `TqState::apply_event`.
163
+ //
164
+ // The rationale is that we don't mutate state here, and so we can't
165
+ // choose to pop the message that shouldn't be delivered off the
166
+ // bootstrap network. We could choose not to actually deliver it when
167
+ // applying the event, but that means we have events that don't actually
168
+ // do anything in our event log, which is quite misleading.
169
+ assert ! ( !self . tq_state. faults. crashed_nodes. contains( destination) ) ;
170
+
157
171
// We pop from the back and push on the front
158
172
let envelope = self
159
173
. tq_state
@@ -167,14 +181,84 @@ impl TestState {
167
181
events
168
182
}
169
183
184
+ fn action_to_events_crash_node ( & self , selector : Selector ) -> Vec < Event > {
185
+ let mut faultable = self
186
+ . tq_state
187
+ . member_universe
188
+ . iter ( )
189
+ . filter ( |m| !self . tq_state . faults . crashed_nodes . contains ( & m) )
190
+ . peekable ( ) ;
191
+
192
+ if faultable. peek ( ) . is_none ( ) {
193
+ // All nodes are down
194
+ return vec ! [ ] ;
195
+ }
196
+
197
+ let id = selector. select ( faultable) . clone ( ) ;
198
+ vec ! [ Event :: CrashNode ( id) ]
199
+ }
200
+
201
+ fn action_to_events_restart_node (
202
+ & self ,
203
+ id : Selector ,
204
+ connection_order_indexes : Vec < Index > ,
205
+ ) -> Vec < Event > {
206
+ if self . tq_state . faults . crashed_nodes . is_empty ( ) {
207
+ return vec ! [ ] ;
208
+ }
209
+
210
+ // Choose the node to restart
211
+ let id = id. select ( self . tq_state . faults . crashed_nodes . iter ( ) ) . clone ( ) ;
212
+
213
+ // Now order the peer connections
214
+
215
+ // First find all the peers we want to connect to.
216
+ let mut to_connect: Vec < _ > = self
217
+ . tq_state
218
+ . member_universe
219
+ . iter ( )
220
+ . filter ( |id| !self . tq_state . faults . crashed_nodes . contains ( id) )
221
+ . cloned ( )
222
+ . collect ( ) ;
223
+
224
+ let total_connections = to_connect. len ( ) ;
225
+
226
+ // Then remove them from `to_connect` and put them into `connection_order`.
227
+ let mut connection_order = vec ! [ ] ;
228
+ for index in connection_order_indexes {
229
+ if to_connect. is_empty ( ) {
230
+ break ;
231
+ }
232
+ let i = index. index ( to_connect. len ( ) ) ;
233
+ let dst = to_connect. swap_remove ( i) ;
234
+ connection_order. push ( dst) ;
235
+ }
236
+
237
+ // If there is anything left in `to_connect`, then just extend
238
+ // `connection_order` with it.
239
+ connection_order. extend_from_slice ( & to_connect) ;
240
+
241
+ // Ensure we have exactly the number of connections we want
242
+ assert_eq ! ( connection_order. len( ) , total_connections) ;
243
+
244
+ vec ! [ Event :: RestartNode { id, connection_order } ]
245
+ }
246
+
170
247
fn action_to_events_load_latest_rack_secret (
171
248
& self ,
172
249
selector : Selector ,
173
250
) -> Vec < Event > {
174
251
let mut events = vec ! [ ] ;
175
252
if let Some ( c) = self . tq_state . nexus . last_committed_config ( ) {
176
- let id = selector. select ( c. members . iter ( ) ) . clone ( ) ;
177
- events. push ( Event :: LoadRackSecret ( id, c. epoch ) ) ;
253
+ let mut loadable = c
254
+ . members
255
+ . iter ( )
256
+ . filter ( |m| !self . tq_state . faults . crashed_nodes . contains ( m) )
257
+ . peekable ( ) ;
258
+ if loadable. peek ( ) . is_some ( ) {
259
+ let id = selector. select ( loadable) . clone ( ) ;
260
+ events. push ( Event :: LoadRackSecret ( id, c. epoch ) ) ;
261
+ }
178
262
}
179
263
events
180
264
}
@@ -205,6 +289,14 @@ impl TestState {
205
289
return vec ! [ ] ;
206
290
}
207
291
let c = config. select ( committed_configs_iter) ;
292
+ let mut loadable = c
293
+ . members
294
+ . iter ( )
295
+ . filter ( |m| !self . tq_state . faults . crashed_nodes . contains ( m) )
296
+ . peekable ( ) ;
297
+ if loadable. peek ( ) . is_none ( ) {
298
+ return vec ! [ ] ;
299
+ }
208
300
let id = id. select ( c. members . iter ( ) ) . clone ( ) ;
209
301
vec ! [ Event :: LoadRackSecret ( id, c. epoch) ]
210
302
}
@@ -277,6 +369,7 @@ impl TestState {
277
369
let committable: Vec < _ > = latest_config
278
370
. prepared_members
279
371
. difference ( & latest_config. committed_members )
372
+ . filter ( |m| !self . tq_state . faults . crashed_nodes . contains ( m) )
280
373
. collect ( ) ;
281
374
282
375
if committable. is_empty ( ) {
@@ -703,6 +796,22 @@ pub enum Action {
703
796
threshold : Index ,
704
797
coordinator : Selector ,
705
798
} ,
799
+
800
+ /// Crash a random node in the universe
801
+ #[ weight( 2 ) ]
802
+ CrashNode ( Selector ) ,
803
+
804
+ /// Restart a crashed node if there is one
805
+ ///
806
+ /// We randomize the connection order, because that influences the order
807
+ /// that messages sent on reconnect will get delivered to the newly
808
+ /// connected node.
809
+ #[ weight( 2 ) ]
810
+ RestartNode {
811
+ id : Selector ,
812
+ #[ any( size_range( MEMBER_UNIVERSE_SIZE -1 ..MEMBER_UNIVERSE_SIZE ) . lift( ) ) ]
813
+ connection_order : Vec < Index > ,
814
+ } ,
706
815
}
707
816
708
817
const MIN_CLUSTER_SIZE : usize = 3 ;
@@ -770,6 +879,7 @@ fn test_trust_quorum_protocol(input: TestInput) {
770
879
let ( parent_dir, _) = log_prefix_for_test ( logctx. test_name ( ) ) ;
771
880
let event_log_path = parent_dir. join ( format ! ( "{test_name}.events.json" ) ) ;
772
881
let mut event_log = EventLog :: new ( & event_log_path) ;
882
+ println ! ( "Event log path: {event_log_path}" ) ;
773
883
774
884
let log = logctx. log . new ( o ! ( "component" => "tq-proptest" ) ) ;
775
885
let mut state = TestState :: new ( log. clone ( ) ) ;
@@ -789,6 +899,6 @@ fn test_trust_quorum_protocol(input: TestInput) {
789
899
"skipped_actions" => state. skipped_actions
790
900
) ;
791
901
792
- let _ = std:: fs:: remove_file ( event_log_path) ;
902
+ // let _ = std::fs::remove_file(event_log_path);
793
903
logctx. cleanup_successful ( ) ;
794
904
}
0 commit comments