Skip to content

Commit 347d5dc

Browse files
committed
db: add TestCrashDuringOpenRandomized
Fixes: #4342
1 parent 4e1e327 commit 347d5dc

File tree

1 file changed

+152
-0
lines changed

1 file changed

+152
-0
lines changed

open_test.go

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2106,3 +2106,155 @@ func TestWALCorruptionBitFlip(t *testing.T) {
21062106
}
21072107
checkBitFlipErr(err, t)
21082108
}
2109+
2110+
// TestCrashDuringOpenRandomized is a randomized test that simulates a hard crash
2111+
// during database opening. It creates a database with some data, then simulates
2112+
// opening it with injected filesystem slowness and crashes during the open
2113+
// process. It ensures that the resulting DB state opens successfully, and the
2114+
// contents of the DB match the expectations based on the keys written.
2115+
func TestCrashDuringOpenRandomized(t *testing.T) {
2116+
seed := time.Now().UnixNano()
2117+
t.Logf("seed %d", seed)
2118+
rng := rand.New(rand.NewPCG(0, uint64(seed)))
2119+
2120+
// Create initial database with some data.
2121+
mem := vfs.NewCrashableMem()
2122+
failoverOpts := WALFailoverOptions{
2123+
Secondary: wal.Dir{FS: mem, Dirname: "secondary"},
2124+
FailoverOptions: wal.FailoverOptions{
2125+
PrimaryDirProbeInterval: 100 * time.Microsecond,
2126+
HealthyProbeLatencyThreshold: 5 * time.Millisecond,
2127+
HealthyInterval: 50 * time.Microsecond,
2128+
UnhealthySamplingInterval: 10 * time.Microsecond,
2129+
UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
2130+
return 50 * time.Microsecond, true
2131+
},
2132+
ElevatedWriteStallThresholdLag: 100 * time.Microsecond,
2133+
},
2134+
}
2135+
opts := &Options{
2136+
FS: mem,
2137+
FormatMajorVersion: internalFormatNewest,
2138+
Logger: testutils.Logger{T: t},
2139+
MemTableSize: 128 << 10, // 128 KiB
2140+
MemTableStopWritesThreshold: 4,
2141+
WALFailover: &failoverOpts,
2142+
}
2143+
2144+
// Create and populate initial database.
2145+
d, err := Open("testdb", opts)
2146+
require.NoError(t, err)
2147+
2148+
testData := make(map[string][]byte)
2149+
for i := range 50 {
2150+
key := fmt.Sprintf("key-%d", i)
2151+
value := make([]byte, 100+rng.IntN(900)) // 100-1000 bytes
2152+
for j := range value {
2153+
value[j] = byte(i + j)
2154+
}
2155+
testData[key] = value
2156+
require.NoError(t, d.Set([]byte(key), value, Sync))
2157+
}
2158+
require.NoError(t, d.Close())
2159+
2160+
// Now simulate opening with a crash clone we have taken during open.
2161+
// Create options with latency injection and WAL failover for the slow
2162+
// open process.
2163+
mean := time.Duration(rng.ExpFloat64() * float64(time.Microsecond))
2164+
p := 1.0
2165+
t.Logf("injecting mean %s of latency with p=%.3f", mean, p)
2166+
slowFS := errorfs.Wrap(mem, errorfs.RandomLatency(
2167+
errorfs.Randomly(p, seed),
2168+
mean,
2169+
seed,
2170+
10*time.Millisecond,
2171+
))
2172+
2173+
// Create WAL failover options for the slow open.
2174+
slowFailoverOpts := failoverOpts
2175+
slowFailoverOpts.Secondary = wal.Dir{FS: slowFS, Dirname: "secondary"}
2176+
slowOpts := &Options{
2177+
FS: mem,
2178+
FormatMajorVersion: internalFormatNewest,
2179+
Logger: testutils.Logger{T: t},
2180+
MemTableSize: 128 << 10,
2181+
MemTableStopWritesThreshold: 4,
2182+
WALFailover: &slowFailoverOpts,
2183+
}
2184+
2185+
// Start opening the database in a goroutine.
2186+
type openResult struct {
2187+
db *DB
2188+
err error
2189+
}
2190+
openResultChan := make(chan openResult, 1)
2191+
go func() {
2192+
t.Log("opening database")
2193+
db, err := Open("testdb", slowOpts)
2194+
t.Log("opened database")
2195+
openResultChan <- openResult{db: db, err: err}
2196+
}()
2197+
2198+
// Wait a bit to let the open process make some progress.
2199+
time.Sleep(time.Millisecond * time.Duration(5+rng.IntN(10)))
2200+
2201+
// Take crash clone while the open process is still running.
2202+
t.Log("taking crash clone during open process")
2203+
crashClone := mem.CrashClone(vfs.CrashCloneCfg{
2204+
UnsyncedDataPercent: rng.IntN(101),
2205+
RNG: rng,
2206+
})
2207+
2208+
// Wait for the original open to complete (it might succeed or fail).
2209+
result := <-openResultChan
2210+
openedDB := result.db
2211+
if result.err != nil {
2212+
t.Errorf("open failed: %v", result.err)
2213+
}
2214+
if openedDB != nil {
2215+
if err := openedDB.Close(); err != nil {
2216+
t.Errorf("failed to close openedDB: %v", err)
2217+
}
2218+
2219+
}
2220+
t.Log("using crashed filesystem for recovery")
2221+
// Create WAL failover options for the crashed filesystem recovery.
2222+
crashedFailoverOpts := failoverOpts
2223+
crashedFailoverOpts.Secondary = wal.Dir{FS: crashClone, Dirname: "secondary"}
2224+
2225+
// Now try to open the crashed filesystem with WAL failover.
2226+
crashedOpts := &Options{
2227+
FS: crashClone,
2228+
FormatMajorVersion: internalFormatNewest,
2229+
Logger: testutils.Logger{T: t},
2230+
MemTableSize: 128 << 10,
2231+
MemTableStopWritesThreshold: 4,
2232+
WALFailover: &crashedFailoverOpts,
2233+
}
2234+
2235+
recoveredDB, err := Open("testdb", crashedOpts)
2236+
require.NoError(t, err)
2237+
2238+
// Verify that we can read some of the expected data.
2239+
iter, err := recoveredDB.NewIter(nil)
2240+
require.NoError(t, err)
2241+
2242+
foundKeys := make(map[string][]byte)
2243+
for valid := iter.First(); valid; valid = iter.Next() {
2244+
key := string(iter.Key())
2245+
value := slices.Clone(iter.Value())
2246+
foundKeys[key] = value
2247+
}
2248+
require.NoError(t, iter.Close())
2249+
2250+
// Verify that found data matches expected data.
2251+
require.NotEmpty(t, foundKeys, "no keys found after crash")
2252+
2253+
// Check that all found keys match expected data.
2254+
for key, foundValue := range foundKeys {
2255+
expectedValue, exists := testData[key]
2256+
require.True(t, exists, "found unexpected key: %s", key)
2257+
require.Equal(t, expectedValue, foundValue, "mismatch for key %s", key)
2258+
}
2259+
require.NoError(t, recoveredDB.Close())
2260+
}

0 commit comments

Comments
 (0)