
Commit 8ed2677

go/oasis-node/cmd/storage: Add command for offline pruning (POC)
When enabling aggressive pruning on a previously synced node and restarting it immediately, the node may start lagging behind (minutes to hours) while still reporting its status as ready. We should offer validators a maintenance command that can be run offline, when increasing or enabling pruning for the first time, to ensure only healthy nodes join the network.
1 parent ac4c227 commit 8ed2677

2 files changed (+177, −7 lines)

go/consensus/cometbft/abci/prune.go

Lines changed: 2 additions & 0 deletions

@@ -195,6 +195,8 @@ PruneLoop:
 	return nil
 }
 
+// Warning: When registering a new handler DO NOT forget to update the logic for
+// the "oasis-node storage prune" command as well.
 func (p *genericPruner) RegisterHandler(handler consensus.StatePruneHandler) {
 	p.Lock()
 	defer p.Unlock()
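The warning above exists because the new offline command bypasses any prune handlers registered by a running node, so their constraints have to be mirrored in the command itself. As a rough illustration of what such a handler does, here is a minimal sketch of one that vetoes pruning of versions it still needs; the `keepFromHandler` type and the single `Prune(version)` hook are assumptions for illustration only, so check the actual `consensus.StatePruneHandler` definition before relying on this shape.

```go
package main

import "fmt"

// keepFromHandler is a hypothetical prune handler that refuses to prune any
// version at or above keepFrom, e.g. because some component still needs that
// state. The real consensus.StatePruneHandler interface may differ.
type keepFromHandler struct {
	keepFrom uint64
}

// Prune is assumed to be called before a version is pruned; returning an
// error vetoes pruning of that version.
func (h *keepFromHandler) Prune(version uint64) error {
	if version >= h.keepFrom {
		return fmt.Errorf("version %d is still required", version)
	}
	return nil
}
```

Any handler registered only at runtime is invisible to the offline command, hence the reminder to keep the `oasis-node storage prune` logic in sync.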

go/oasis-node/cmd/storage/storage.go

Lines changed: 175 additions & 7 deletions

@@ -6,11 +6,13 @@ import (
 	"errors"
 	"fmt"
 	"io/fs"
+	"math"
 	"os"
 	"path/filepath"
 	"strings"
 	"time"
 
+	cmtBlockstore "github.com/cometbft/cometbft/store"
 	badgerDB "github.com/dgraph-io/badger/v4"
 	"github.com/spf13/cobra"
 
@@ -20,6 +22,7 @@ import (
 	"github.com/oasisprotocol/oasis-core/go/config"
 	"github.com/oasisprotocol/oasis-core/go/consensus/cometbft/abci"
 	cmtCommon "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/common"
+	cmtConfig "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/config"
 	cmtDBProvider "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/db/badger"
 	cmdCommon "github.com/oasisprotocol/oasis-core/go/oasis-node/cmd/common"
 	roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
@@ -70,6 +73,13 @@ WARNING: Ensure you have at least as much of a free disk as your largest databas
 		RunE: doDBCompactions,
 	}
 
+	pruneCmd = &cobra.Command{
+		Use:   "prune-experimental",
+		Args:  cobra.NoArgs,
+		Short: "EXPERIMENTAL: trigger pruning for all consensus databases",
+		RunE:  doPrune,
+	}
+
 	logger = logging.GetLogger("cmd/storage")
 
 	pretty = cmdCommon.Isatty(1)
@@ -385,7 +395,17 @@ func flattenBadgerDB(db *badgerDB.DB, logger *logging.Logger) error {
 }
 
 func compactConsensusNodeDB(dataDir string) error {
-	ldb, ndb, _, err := abci.InitStateStorage(
+	ndb, err := openConsensusStateNodeDB(dataDir)
+	if err != nil {
+		return fmt.Errorf("failed to initialize ABCI storage backend: %w", err)
+	}
+	defer ndb.Close()
+
+	return ndb.Compact()
+}
+
+func openConsensusStateNodeDB(dataDir string) (db.NodeDB, error) {
+	_, ndb, _, err := abci.InitStateStorage(
 		&abci.ApplicationConfig{
 			DataDir: filepath.Join(dataDir, cmtCommon.StateDir),
 			StorageBackend: config.GlobalConfig.Storage.Backend,
@@ -394,16 +414,163 @@ func compactConsensusNodeDB(dataDir string) error {
 			DisableCheckpointer: true,
 		},
 	)
-	if err != nil {
-		return fmt.Errorf("failed to initialize ABCI storage backend: %w", err)
+
+	return ndb, err
+}
+
+func doPrune(_ *cobra.Command, args []string) error {
+	if err := cmdCommon.Init(); err != nil {
+		cmdCommon.EarlyLogAndExit(err)
+	}
+
+	// TODO: Consider validating that the node is configured in the correct mode.
+
+	dataDir := cmdCommon.DataDir()
+	if err := pruneConsensusDBs(dataDir); err != nil {
+		return fmt.Errorf("failed to prune consensus databases: %w", err)
 	}
 
-	// Close the resources. Both Close and Cleanup only close NodeDB.
-	// Closing both here, to prevent resource leaks if things change in the future.
+	return nil
+}
+
+func pruneConsensusDBs(dataDir string) error {
+	if config.GlobalConfig.Consensus.Prune.Strategy == cmtConfig.PruneStrategyNone {
+		logger.Info("skipping consensus pruning", "strategy", cmtConfig.PruneStrategyNone)
+		return nil
+	}
+
+	ndb, err := openConsensusStateNodeDB(dataDir)
+	if err != nil {
+		return fmt.Errorf("failed to open NodeDB: %w", err)
+	}
 	defer ndb.Close()
-	defer ldb.Cleanup()
 
-	return ndb.Compact()
+	latest, ok := ndb.GetLatestVersion()
+	if !ok {
+		logger.Info("skipping consensus pruning as state db is empty")
+		return nil
+	}
+
+	earliest, err := pruneConsensusState(dataDir, ndb, latest)
+	if err != nil {
+		return fmt.Errorf("failed to prune application state: %w", err)
+	}
+
+	if err := pruneCometDBs(dataDir, int64(earliest)); err != nil {
+		return fmt.Errorf("failed to prune CometBFT managed databases: %w", err)
+	}
+
+	return nil
+}
+
+func pruneConsensusState(dataDir string, ndb db.NodeDB, latest uint64) (uint64, error) {
+	if latest < config.GlobalConfig.Consensus.Prune.NumKept {
+		logger.Info("consensus state pruning skipped: latest version is smaller than the number of versions to keep")
+		return latest, nil
+	}
+
+	// When runtimes are configured, we must not prune past the latest reindexed
+	// consensus height, so that light history can be populated correctly.
+	minReindexed, err := minReindexedHeight(dataDir)
+	if err != nil {
+		return 0, fmt.Errorf("failed to fetch minimum reindexed consensus height: %w", err)
+	}
+
+	start := ndb.GetEarliestVersion()
+	end := min(
+		latest-config.GlobalConfig.Consensus.Prune.NumKept, // Does not underflow due to the check at the top.
+		uint64(minReindexed),
+	)
+
+	if end <= start {
+		logger.Info("consensus state already pruned")
+		return end, nil
+	}
+
+	logger.Info("pruning consensus state", "start", start, "end", end)
+	for i := start; i < end; i++ {
+		if err := ndb.Prune(i); err != nil {
+			return 0, fmt.Errorf("failed to prune version %d: %w", i, err)
+		}
+
+		if i%10_000 == 0 { // TODO: Not sure this is even needed.
+			if err := ndb.Sync(); err != nil {
+				return 0, fmt.Errorf("failed to sync NodeDB: %w", err)
+			}
+			logger.Debug("forcing NodeDB disk sync during pruning", "version", i)
+		}
+	}
+
+	if err := ndb.Sync(); err != nil {
+		return 0, fmt.Errorf("failed to sync NodeDB: %w", err)
+	}
+
+	return end, nil
+}
+
+// minReindexedHeight returns the smallest consensus height reindexed by any
+// of the configured runtimes.
+//
+// If no runtimes are configured, it returns math.MaxInt64.
+func minReindexedHeight(dataDir string) (int64, error) {
+	fetchLastReindexedHeight := func(runtimeID common.Namespace) (int64, error) {
+		rtDir := runtimeConfig.GetRuntimeStateDir(dataDir, runtimeID)
+		mode := config.GlobalConfig.Mode
+		hasLocalStorage := mode.HasLocalStorage() && !mode.IsArchive()
+
+		// TODO: Ideally we would not start the whole light history with all background
+		// workers, but that would require us to refactor the existing code.
+		history, err := history.New(runtimeID, rtDir, history.NewNonePrunerFactory(), hasLocalStorage)
+		if err != nil {
+			return 0, fmt.Errorf("failed to open new light history: %w", err)
+		}
+		defer history.Close()
+
+		h, err := history.LastConsensusHeight()
+		if err != nil {
+			return 0, fmt.Errorf("failed to get last consensus height: %w", err)
+		}
+
+		return h, nil
+	}
+
+	var minH int64 = math.MaxInt64
+	for _, rt := range config.GlobalConfig.Runtime.Runtimes {
+		h, err := fetchLastReindexedHeight(rt.ID)
+		if err != nil {
+			return 0, fmt.Errorf("failed to fetch last reindexed height for %s: %w", rt.ID, err)
+		}
+
+		if h < minH {
+			minH = h
+		}
+	}
+
+	return minH, nil
+}
+
+func pruneCometDBs(dataDir string, height int64) error {
+	// TODO: This is a hack. Even if we manage to get this right via BadgerDBProvider
+	// and somehow pass the correct config via context, that would still not be the
+	// intended way to use it. The hack seems worth it, but we should definitely
+	// release this command as experimental first.
+	blockstorePath := fmt.Sprintf("%s/consensus/data/blockstore.badger.db", dataDir)
+	blockDB, err := cmtDBProvider.New(blockstorePath, false)
+	if err != nil {
+		return fmt.Errorf("failed to open blockstore: %w", err)
+	}
+	blockstore := cmtBlockstore.NewBlockStore(blockDB)
+
+	logger.Info("pruning consensus blockstore", "target_height", height)
+	n, err := blockstore.PruneBlocks(height)
+	if err != nil {
+		return fmt.Errorf("failed to prune blocks: %w", err)
+	}
+	logger.Info("consensus blockstore pruning finished", "pruned", n)
+
+	// TODO: Add pruning of state.badger.db.
+
+	return nil
 }
 
 // Register registers the client sub-command and all of its children.
@@ -414,5 +581,6 @@ func Register(parentCmd *cobra.Command) {
 	storageCmd.AddCommand(storageCheckCmd)
 	storageCmd.AddCommand(storageRenameNsCmd)
 	storageCmd.AddCommand(storageCompactCmd)
+	storageCmd.AddCommand(pruneCmd)
 	parentCmd.AddCommand(storageCmd)
 }
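For readers skimming the diff, the core of the new logic is the pruning bound computed in pruneConsensusState: keep the newest NumKept versions and never prune past the smallest consensus height any configured runtime has reindexed, whichever limit is lower; the resulting `end` is then reused as the blockstore pruning height in pruneCometDBs. The self-contained sketch below replays that computation with made-up numbers; all values are illustrative and not taken from a real node.

```go
package main

import "fmt"

func main() {
	// Illustrative values only; in the command they come from the state NodeDB,
	// the node's pruning configuration (Consensus.Prune.NumKept) and the
	// runtimes' light histories.
	var (
		latest       uint64 = 1_000_000 // latest version in the state NodeDB
		numKept      uint64 = 3_600     // number of most recent versions to keep
		minReindexed uint64 = 995_000   // smallest LastConsensusHeight across runtimes
		start        uint64 = 900_000   // earliest version still in the NodeDB
	)

	// Same bound as in pruneConsensusState: never prune into the kept window
	// and never past what the runtimes have reindexed.
	end := min(latest-numKept, minReindexed)

	if end <= start {
		fmt.Println("nothing to prune")
		return
	}
	fmt.Printf("would prune versions [%d, %d)\n", start, end)
	// Prints: would prune versions [900000, 995000)
}
```

With the node stopped, the command registered above as `storage prune-experimental` would perform this computation against the node's own data directory and pruning configuration, then prune both the application state and the CometBFT blockstore up to the resulting height.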
