@@ -2105,3 +2105,171 @@ func TestWALCorruptionBitFlip(t *testing.T) {
2105
2105
}
2106
2106
checkBitFlipErr (err , t )
2107
2107
}
2108
+
2109
+ // TestCrashDuringOpenRandomized is a randomized test that simulates a hard crash
2110
+ // during database opening. It creates a database with some data, then simulates
2111
+ // opening it with injected filesystem slowness and crashes during the open
2112
+ // process. It ensures that the resulting DB state opens successfully, and the
2113
+ // contents of the DB match the expectations based on the keys written.
2114
+ func TestCrashDuringOpenRandomized (t * testing.T ) {
2115
+ seed := time .Now ().UnixNano ()
2116
+ t .Logf ("seed %d" , seed )
2117
+ rng := rand .New (rand .NewPCG (0 , uint64 (seed )))
2118
+
2119
+ // Create initial database with some data.
2120
+ mem := vfs .NewCrashableMem ()
2121
+ failoverOpts := WALFailoverOptions {
2122
+ Secondary : wal.Dir {FS : mem , Dirname : "secondary" },
2123
+ FailoverOptions : wal.FailoverOptions {
2124
+ PrimaryDirProbeInterval : 100 * time .Microsecond ,
2125
+ HealthyProbeLatencyThreshold : 5 * time .Millisecond ,
2126
+ HealthyInterval : 50 * time .Microsecond ,
2127
+ UnhealthySamplingInterval : 10 * time .Microsecond ,
2128
+ UnhealthyOperationLatencyThreshold : func () (time.Duration , bool ) {
2129
+ return 50 * time .Microsecond , true
2130
+ },
2131
+ ElevatedWriteStallThresholdLag : 100 * time .Microsecond ,
2132
+ },
2133
+ }
2134
+ opts := & Options {
2135
+ FS : mem ,
2136
+ FormatMajorVersion : internalFormatNewest ,
2137
+ Logger : testutils.Logger {T : t },
2138
+ MemTableSize : 128 << 10 , // 128 KiB
2139
+ MemTableStopWritesThreshold : 4 ,
2140
+ WALFailover : & failoverOpts ,
2141
+ }
2142
+
2143
+ // Create and populate initial database.
2144
+ d , err := Open ("testdb" , opts )
2145
+ require .NoError (t , err )
2146
+
2147
+ testData := make (map [string ][]byte )
2148
+ for i := range 50 {
2149
+ key := fmt .Sprintf ("key-%d" , i )
2150
+ value := make ([]byte , 100 + rng .IntN (900 )) // 100-1000 bytes
2151
+ for j := range value {
2152
+ value [j ] = byte (i + j )
2153
+ }
2154
+ testData [key ] = value
2155
+ require .NoError (t , d .Set ([]byte (key ), value , Sync ))
2156
+ }
2157
+ require .NoError (t , d .Flush ())
2158
+ require .NoError (t , d .Close ())
2159
+
2160
+ // Now simulate opening with a crash during open.
2161
+ for attempt := range 3 {
2162
+ t .Logf ("attempt %d" , attempt )
2163
+
2164
+ // Create a crashable clone of the filesystem.
2165
+ crashClone := mem .CrashClone (vfs.CrashCloneCfg {
2166
+ UnsyncedDataPercent : rng .IntN (101 ), // 0-100% unsynced data
2167
+ RNG : rng ,
2168
+ })
2169
+
2170
+ // Create options with latency injection and WAL failover for the slow
2171
+ // open process.
2172
+ mean := time .Duration (rng .ExpFloat64 () * float64 (500 * time .Microsecond ))
2173
+ p := 1.0
2174
+ t .Logf ("Injecting mean %s of latency with p=%.3f" , mean , p )
2175
+ slowFS := errorfs .Wrap (crashClone , errorfs .RandomLatency (
2176
+ errorfs .Randomly (p , seed + int64 (attempt )),
2177
+ mean ,
2178
+ seed + int64 (attempt ),
2179
+ 10 * time .Millisecond ,
2180
+ ))
2181
+
2182
+ // Create WAL failover options for the slow open.
2183
+ slowFailoverOpts := failoverOpts
2184
+ slowFailoverOpts .Secondary = wal.Dir {FS : crashClone , Dirname : "secondary" }
2185
+ slowOpts := & Options {
2186
+ FS : slowFS ,
2187
+ FormatMajorVersion : internalFormatNewest ,
2188
+ Logger : testutils.Logger {T : t },
2189
+ MemTableSize : 128 << 10 ,
2190
+ MemTableStopWritesThreshold : 4 ,
2191
+ WALFailover : & slowFailoverOpts ,
2192
+ }
2193
+
2194
+ // Start opening the database in a goroutine.
2195
+ type openResult struct {
2196
+ db * DB
2197
+ err error
2198
+ }
2199
+ openResultChan := make (chan openResult , 1 )
2200
+ go func () {
2201
+ db , err := Open ("testdb" , slowOpts )
2202
+ openResultChan <- openResult {db : db , err : err }
2203
+ }()
2204
+
2205
+ // Wait a bit to let the open process make some progress.
2206
+ time .Sleep (time .Millisecond * time .Duration (5 + rng .IntN (10 )))
2207
+
2208
+ // Simulate a crash by taking another crash clone.
2209
+ t .Log ("simulating crash during open" )
2210
+ crashedFS := crashClone .CrashClone (vfs.CrashCloneCfg {
2211
+ UnsyncedDataPercent : rng .IntN (101 ), // 0-100% unsynced data
2212
+ RNG : rng ,
2213
+ })
2214
+
2215
+ // Wait for the original open to complete (it might succeed or fail).
2216
+ var openedDB * DB
2217
+ select {
2218
+ case result := <- openResultChan :
2219
+ openedDB = result .db
2220
+ if result .err != nil {
2221
+ t .Logf ("Open failed: %v" , result .err )
2222
+ }
2223
+ if openedDB != nil {
2224
+ if err := openedDB .Close (); err != nil {
2225
+ t .Logf ("Failed to close openedDB: %v" , err )
2226
+ }
2227
+ }
2228
+
2229
+ // Create WAL failover options for the crashed filesystem recovery.
2230
+ crashedFailoverOpts := failoverOpts
2231
+ crashedFailoverOpts .Secondary = wal.Dir {FS : crashedFS , Dirname : "secondary" }
2232
+
2233
+ // Now try to open the crashed filesystem with WAL failover.
2234
+ crashedOpts := & Options {
2235
+ FS : crashedFS ,
2236
+ FormatMajorVersion : internalFormatNewest ,
2237
+ Logger : testutils.Logger {T : t },
2238
+ MemTableSize : 128 << 10 ,
2239
+ MemTableStopWritesThreshold : 4 ,
2240
+ WALFailover : & crashedFailoverOpts ,
2241
+ }
2242
+
2243
+ recoveredDB , err := Open ("testdb" , crashedOpts )
2244
+ if err != nil {
2245
+ t .Errorf ("failed to open crashed database (attempt %d): %v" , attempt , err )
2246
+ continue
2247
+ }
2248
+
2249
+ // Verify that we can read some of the expected data.
2250
+ iter , err := recoveredDB .NewIter (nil )
2251
+ require .NoError (t , err )
2252
+
2253
+ foundKeys := make (map [string ][]byte )
2254
+ for valid := iter .First (); valid ; valid = iter .Next () {
2255
+ key := string (iter .Key ())
2256
+ value := make ([]byte , len (iter .Value ()))
2257
+ copy (value , iter .Value ())
2258
+ foundKeys [key ] = value
2259
+ }
2260
+ require .NoError (t , iter .Close ())
2261
+
2262
+ // Verify that found data matches expected data.
2263
+ if len (foundKeys ) > 0 {
2264
+ t .Logf ("recovered %d keys after crash" , len (foundKeys ))
2265
+ }
2266
+
2267
+ // Check that all found keys match expected data.
2268
+ for key , foundValue := range foundKeys {
2269
+ expectedValue , exists := testData [key ]
2270
+ require .True (t , exists , "found unexpected key: %s" , key )
2271
+ require .Equal (t , expectedValue , foundValue , "mismatch for key %s" , key )
2272
+ }
2273
+ require .NoError (t , recoveredDB .Close ())
2274
+ }
2275
+ }
0 commit comments