@@ -2106,3 +2106,155 @@ func TestWALCorruptionBitFlip(t *testing.T) {
2106
2106
}
2107
2107
checkBitFlipErr (err , t )
2108
2108
}
2109
+
2110
+ // TestCrashDuringOpenRandomized is a randomized test that simulates a hard crash
2111
+ // during database opening. It creates a database with some data, then simulates
2112
+ // opening it with injected filesystem slowness and crashes during the open
2113
+ // process. It ensures that the resulting DB state opens successfully, and the
2114
+ // contents of the DB match the expectations based on the keys written.
2115
+ func TestCrashDuringOpenRandomized (t * testing.T ) {
2116
+ seed := time .Now ().UnixNano ()
2117
+ t .Logf ("seed %d" , seed )
2118
+ rng := rand .New (rand .NewPCG (0 , uint64 (seed )))
2119
+
2120
+ // Create initial database with some data.
2121
+ mem := vfs .NewCrashableMem ()
2122
+ failoverOpts := WALFailoverOptions {
2123
+ Secondary : wal.Dir {FS : mem , Dirname : "secondary" },
2124
+ FailoverOptions : wal.FailoverOptions {
2125
+ PrimaryDirProbeInterval : 100 * time .Microsecond ,
2126
+ HealthyProbeLatencyThreshold : 5 * time .Millisecond ,
2127
+ HealthyInterval : 50 * time .Microsecond ,
2128
+ UnhealthySamplingInterval : 10 * time .Microsecond ,
2129
+ UnhealthyOperationLatencyThreshold : func () (time.Duration , bool ) {
2130
+ return 50 * time .Microsecond , true
2131
+ },
2132
+ ElevatedWriteStallThresholdLag : 100 * time .Microsecond ,
2133
+ },
2134
+ }
2135
+ opts := & Options {
2136
+ FS : mem ,
2137
+ FormatMajorVersion : internalFormatNewest ,
2138
+ Logger : testutils.Logger {T : t },
2139
+ MemTableSize : 128 << 10 , // 128 KiB
2140
+ MemTableStopWritesThreshold : 4 ,
2141
+ WALFailover : & failoverOpts ,
2142
+ }
2143
+
2144
+ // Create and populate initial database.
2145
+ d , err := Open ("testdb" , opts )
2146
+ require .NoError (t , err )
2147
+
2148
+ testData := make (map [string ][]byte )
2149
+ for i := range 50 {
2150
+ key := fmt .Sprintf ("key-%d" , i )
2151
+ value := make ([]byte , 100 + rng .IntN (900 )) // 100-1000 bytes
2152
+ for j := range value {
2153
+ value [j ] = byte (i + j )
2154
+ }
2155
+ testData [key ] = value
2156
+ require .NoError (t , d .Set ([]byte (key ), value , Sync ))
2157
+ }
2158
+ require .NoError (t , d .Close ())
2159
+
2160
+ // Now simulate opening with a crash clone we have taken during open.
2161
+ // Create options with latency injection and WAL failover for the slow
2162
+ // open process.
2163
+ mean := time .Duration (rng .ExpFloat64 () * float64 (time .Microsecond ))
2164
+ p := 1.0
2165
+ t .Logf ("injecting mean %s of latency with p=%.3f" , mean , p )
2166
+ slowFS := errorfs .Wrap (mem , errorfs .RandomLatency (
2167
+ errorfs .Randomly (p , seed ),
2168
+ mean ,
2169
+ seed ,
2170
+ 10 * time .Millisecond ,
2171
+ ))
2172
+
2173
+ // Create WAL failover options for the slow open.
2174
+ slowFailoverOpts := failoverOpts
2175
+ slowFailoverOpts .Secondary = wal.Dir {FS : slowFS , Dirname : "secondary" }
2176
+ slowOpts := & Options {
2177
+ FS : mem ,
2178
+ FormatMajorVersion : internalFormatNewest ,
2179
+ Logger : testutils.Logger {T : t },
2180
+ MemTableSize : 128 << 10 ,
2181
+ MemTableStopWritesThreshold : 4 ,
2182
+ WALFailover : & slowFailoverOpts ,
2183
+ }
2184
+
2185
+ // Start opening the database in a goroutine.
2186
+ type openResult struct {
2187
+ db * DB
2188
+ err error
2189
+ }
2190
+ openResultChan := make (chan openResult , 1 )
2191
+ go func () {
2192
+ t .Log ("opening database" )
2193
+ db , err := Open ("testdb" , slowOpts )
2194
+ t .Log ("opened database" )
2195
+ openResultChan <- openResult {db : db , err : err }
2196
+ }()
2197
+
2198
+ // Wait a bit to let the open process make some progress.
2199
+ time .Sleep (time .Millisecond * time .Duration (5 + rng .IntN (10 )))
2200
+
2201
+ // Take crash clone while the open process is still running.
2202
+ t .Log ("taking crash clone during open process" )
2203
+ crashClone := mem .CrashClone (vfs.CrashCloneCfg {
2204
+ UnsyncedDataPercent : rng .IntN (101 ),
2205
+ RNG : rng ,
2206
+ })
2207
+
2208
+ // Wait for the original open to complete (it might succeed or fail).
2209
+ result := <- openResultChan
2210
+ openedDB := result .db
2211
+ if result .err != nil {
2212
+ t .Errorf ("open failed: %v" , result .err )
2213
+ }
2214
+ if openedDB != nil {
2215
+ if err := openedDB .Close (); err != nil {
2216
+ t .Errorf ("failed to close openedDB: %v" , err )
2217
+ }
2218
+
2219
+ }
2220
+ t .Log ("using crashed filesystem for recovery" )
2221
+ // Create WAL failover options for the crashed filesystem recovery.
2222
+ crashedFailoverOpts := failoverOpts
2223
+ crashedFailoverOpts .Secondary = wal.Dir {FS : crashClone , Dirname : "secondary" }
2224
+
2225
+ // Now try to open the crashed filesystem with WAL failover.
2226
+ crashedOpts := & Options {
2227
+ FS : crashClone ,
2228
+ FormatMajorVersion : internalFormatNewest ,
2229
+ Logger : testutils.Logger {T : t },
2230
+ MemTableSize : 128 << 10 ,
2231
+ MemTableStopWritesThreshold : 4 ,
2232
+ WALFailover : & crashedFailoverOpts ,
2233
+ }
2234
+
2235
+ recoveredDB , err := Open ("testdb" , crashedOpts )
2236
+ require .NoError (t , err )
2237
+
2238
+ // Verify that we can read some of the expected data.
2239
+ iter , err := recoveredDB .NewIter (nil )
2240
+ require .NoError (t , err )
2241
+
2242
+ foundKeys := make (map [string ][]byte )
2243
+ for valid := iter .First (); valid ; valid = iter .Next () {
2244
+ key := string (iter .Key ())
2245
+ value := slices .Clone (iter .Value ())
2246
+ foundKeys [key ] = value
2247
+ }
2248
+ require .NoError (t , iter .Close ())
2249
+
2250
+ // Verify that found data matches expected data.
2251
+ require .NotEmpty (t , foundKeys , "no keys found after crash" )
2252
+
2253
+ // Check that all found keys match expected data.
2254
+ for key , foundValue := range foundKeys {
2255
+ expectedValue , exists := testData [key ]
2256
+ require .True (t , exists , "found unexpected key: %s" , key )
2257
+ require .Equal (t , expectedValue , foundValue , "mismatch for key %s" , key )
2258
+ }
2259
+ require .NoError (t , recoveredDB .Close ())
2260
+ }
0 commit comments