Skip to content

Commit d560342

Browse files
committed
db: add TestCrashDuringOpenRandomized
Fixes: #4342
1 parent b33397f commit d560342

File tree

1 file changed

+168
-0
lines changed

1 file changed

+168
-0
lines changed

open_test.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2105,3 +2105,171 @@ func TestWALCorruptionBitFlip(t *testing.T) {
21052105
}
21062106
checkBitFlipErr(err, t)
21072107
}
2108+
2109+
// TestCrashDuringOpenRandomized is a randomized test that simulates a hard crash
2110+
// during database opening. It creates a database with some data, then simulates
2111+
// opening it with injected filesystem slowness and crashes during the open
2112+
// process. It ensures that the resulting DB state opens successfully, and the
2113+
// contents of the DB match the expectations based on the keys written.
2114+
func TestCrashDuringOpenRandomized(t *testing.T) {
2115+
seed := time.Now().UnixNano()
2116+
t.Logf("seed %d", seed)
2117+
rng := rand.New(rand.NewPCG(0, uint64(seed)))
2118+
2119+
// Create initial database with some data.
2120+
mem := vfs.NewCrashableMem()
2121+
failoverOpts := WALFailoverOptions{
2122+
Secondary: wal.Dir{FS: mem, Dirname: "secondary"},
2123+
FailoverOptions: wal.FailoverOptions{
2124+
PrimaryDirProbeInterval: 100 * time.Microsecond,
2125+
HealthyProbeLatencyThreshold: 5 * time.Millisecond,
2126+
HealthyInterval: 50 * time.Microsecond,
2127+
UnhealthySamplingInterval: 10 * time.Microsecond,
2128+
UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
2129+
return 50 * time.Microsecond, true
2130+
},
2131+
ElevatedWriteStallThresholdLag: 100 * time.Microsecond,
2132+
},
2133+
}
2134+
opts := &Options{
2135+
FS: mem,
2136+
FormatMajorVersion: internalFormatNewest,
2137+
Logger: testutils.Logger{T: t},
2138+
MemTableSize: 128 << 10, // 128 KiB
2139+
MemTableStopWritesThreshold: 4,
2140+
WALFailover: &failoverOpts,
2141+
}
2142+
2143+
// Create and populate initial database.
2144+
d, err := Open("testdb", opts)
2145+
require.NoError(t, err)
2146+
2147+
testData := make(map[string][]byte)
2148+
for i := range 50 {
2149+
key := fmt.Sprintf("key-%d", i)
2150+
value := make([]byte, 100+rng.IntN(900)) // 100-1000 bytes
2151+
for j := range value {
2152+
value[j] = byte(i + j)
2153+
}
2154+
testData[key] = value
2155+
require.NoError(t, d.Set([]byte(key), value, Sync))
2156+
}
2157+
require.NoError(t, d.Flush())
2158+
require.NoError(t, d.Close())
2159+
2160+
// Now simulate opening with a crash during open.
2161+
for attempt := range 3 {
2162+
t.Logf("attempt %d", attempt)
2163+
2164+
// Create a crashable clone of the filesystem.
2165+
crashClone := mem.CrashClone(vfs.CrashCloneCfg{
2166+
UnsyncedDataPercent: rng.IntN(101), // 0-100% unsynced data
2167+
RNG: rng,
2168+
})
2169+
2170+
// Create options with latency injection and WAL failover for the slow
2171+
// open process.
2172+
mean := time.Duration(rng.ExpFloat64() * float64(500*time.Microsecond))
2173+
p := 1.0
2174+
t.Logf("Injecting mean %s of latency with p=%.3f", mean, p)
2175+
slowFS := errorfs.Wrap(crashClone, errorfs.RandomLatency(
2176+
errorfs.Randomly(p, seed+int64(attempt)),
2177+
mean,
2178+
seed+int64(attempt),
2179+
10*time.Millisecond,
2180+
))
2181+
2182+
// Create WAL failover options for the slow open.
2183+
slowFailoverOpts := failoverOpts
2184+
slowFailoverOpts.Secondary = wal.Dir{FS: crashClone, Dirname: "secondary"}
2185+
slowOpts := &Options{
2186+
FS: slowFS,
2187+
FormatMajorVersion: internalFormatNewest,
2188+
Logger: testutils.Logger{T: t},
2189+
MemTableSize: 128 << 10,
2190+
MemTableStopWritesThreshold: 4,
2191+
WALFailover: &slowFailoverOpts,
2192+
}
2193+
2194+
// Start opening the database in a goroutine.
2195+
type openResult struct {
2196+
db *DB
2197+
err error
2198+
}
2199+
openResultChan := make(chan openResult, 1)
2200+
go func() {
2201+
db, err := Open("testdb", slowOpts)
2202+
openResultChan <- openResult{db: db, err: err}
2203+
}()
2204+
2205+
// Wait a bit to let the open process make some progress.
2206+
time.Sleep(time.Millisecond * time.Duration(5+rng.IntN(10)))
2207+
2208+
// Simulate a crash by taking another crash clone.
2209+
t.Log("simulating crash during open")
2210+
crashedFS := crashClone.CrashClone(vfs.CrashCloneCfg{
2211+
UnsyncedDataPercent: rng.IntN(101), // 0-100% unsynced data
2212+
RNG: rng,
2213+
})
2214+
2215+
// Wait for the original open to complete (it might succeed or fail).
2216+
var openedDB *DB
2217+
select {
2218+
case result := <-openResultChan:
2219+
openedDB = result.db
2220+
if result.err != nil {
2221+
t.Logf("Open failed: %v", result.err)
2222+
}
2223+
if openedDB != nil {
2224+
if err := openedDB.Close(); err != nil {
2225+
t.Logf("Failed to close openedDB: %v", err)
2226+
}
2227+
}
2228+
2229+
// Create WAL failover options for the crashed filesystem recovery.
2230+
crashedFailoverOpts := failoverOpts
2231+
crashedFailoverOpts.Secondary = wal.Dir{FS: crashedFS, Dirname: "secondary"}
2232+
2233+
// Now try to open the crashed filesystem with WAL failover.
2234+
crashedOpts := &Options{
2235+
FS: crashedFS,
2236+
FormatMajorVersion: internalFormatNewest,
2237+
Logger: testutils.Logger{T: t},
2238+
MemTableSize: 128 << 10,
2239+
MemTableStopWritesThreshold: 4,
2240+
WALFailover: &crashedFailoverOpts,
2241+
}
2242+
2243+
recoveredDB, err := Open("testdb", crashedOpts)
2244+
if err != nil {
2245+
t.Errorf("failed to open crashed database (attempt %d): %v", attempt, err)
2246+
continue
2247+
}
2248+
2249+
// Verify that we can read some of the expected data.
2250+
iter, err := recoveredDB.NewIter(nil)
2251+
require.NoError(t, err)
2252+
2253+
foundKeys := make(map[string][]byte)
2254+
for valid := iter.First(); valid; valid = iter.Next() {
2255+
key := string(iter.Key())
2256+
value := make([]byte, len(iter.Value()))
2257+
copy(value, iter.Value())
2258+
foundKeys[key] = value
2259+
}
2260+
require.NoError(t, iter.Close())
2261+
2262+
// Verify that found data matches expected data.
2263+
if len(foundKeys) > 0 {
2264+
t.Logf("recovered %d keys after crash", len(foundKeys))
2265+
}
2266+
2267+
// Check that all found keys match expected data.
2268+
for key, foundValue := range foundKeys {
2269+
expectedValue, exists := testData[key]
2270+
require.True(t, exists, "found unexpected key: %s", key)
2271+
require.Equal(t, expectedValue, foundValue, "mismatch for key %s", key)
2272+
}
2273+
require.NoError(t, recoveredDB.Close())
2274+
}
2275+
}

0 commit comments

Comments
 (0)