From 810d0b3d2e2c7f7eb5116956aabe9eccec01fe41 Mon Sep 17 00:00:00 2001 From: Lucas Francisco Lopez Date: Wed, 8 May 2024 12:35:38 +0200 Subject: [PATCH 1/3] Fix snapshot manager reset on chunk timeout --- baseapp/abci.go | 35 +++++++++++++++++---------------- go.mod | 2 ++ go.sum | 2 -- store/snapshots/manager.go | 16 +++++++++++++++ store/snapshots/types/errors.go | 3 +++ 5 files changed, 39 insertions(+), 19 deletions(-) diff --git a/baseapp/abci.go b/baseapp/abci.go index 7e83e68fc012..c861d30e0709 100644 --- a/baseapp/abci.go +++ b/baseapp/abci.go @@ -277,31 +277,32 @@ func (app *BaseApp) OfferSnapshot(req *abci.OfferSnapshotRequest) (*abci.OfferSn return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT}, nil } - err = app.snapshotManager.Restore(snapshot) - switch { - case err == nil: - return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil + if err := app.snapshotManager.Restore(snapshot); err != nil { + return app.handleSnapshotRestoreError(err, req) + } + return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil +} + +// handleSnapshotRestoreError handles specific snapshot restore errors +func (app *BaseApp) handleSnapshotRestoreError(err error, req *abci.OfferSnapshotRequest) (*abci.OfferSnapshotResponse, error) { + app.logger.Error("failed to restore snapshot", "height", req.Snapshot.Height, "format", req.Snapshot.Format, "err", err) + + switch { case errors.Is(err, snapshottypes.ErrUnknownFormat): return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT_FORMAT}, nil case errors.Is(err, snapshottypes.ErrInvalidMetadata): - app.logger.Error( - "rejecting invalid snapshot", - "height", req.Snapshot.Height, - "format", req.Snapshot.Format, - "err", err, - ) return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT}, nil - default: - app.logger.Error( - "failed to restore snapshot", - "height", req.Snapshot.Height, - "format", req.Snapshot.Format, - "err", err, - ) + case errors.Is(err, snapshottypes.ErrTimedOutSnapshotChunks): + if resetErr := app.snapshotManager.Reset(); resetErr != nil { + app.logger.Error("failed to reset snapshot manager", "err", resetErr) + return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ABORT}, nil + } + return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil + default: // We currently don't support resetting the IAVL stores and retrying a // different snapshot, so we ask CometBFT to abort all snapshot restoration. return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ABORT}, nil diff --git a/go.mod b/go.mod index ee13f12cf934..009366764c48 100644 --- a/go.mod +++ b/go.mod @@ -204,6 +204,8 @@ replace ( github.com/syndtr/goleveldb => github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 ) +replace cosmossdk.io/store => ./store // TODO + retract ( // false start by tagging the wrong branch v0.50.0 diff --git a/go.sum b/go.sum index adabd4f3abac..25a4c1b9f16f 100644 --- a/go.sum +++ b/go.sum @@ -10,8 +10,6 @@ cosmossdk.io/log v1.3.1 h1:UZx8nWIkfbbNEWusZqzAx3ZGvu54TZacWib3EzUYmGI= cosmossdk.io/log v1.3.1/go.mod h1:2/dIomt8mKdk6vl3OWJcPk2be3pGOS8OQaLUM/3/tCM= cosmossdk.io/math v1.3.0 h1:RC+jryuKeytIiictDslBP9i1fhkVm6ZDmZEoNP316zE= cosmossdk.io/math v1.3.0/go.mod h1:vnRTxewy+M7BtXBNFybkuhSH4WfedVAAnERHgVFhp3k= -cosmossdk.io/store v1.1.1-0.20240418092142-896cdf1971bc h1:R9O9d75e0qZYUsVV0zzi+D7cNLnX2JrUOQNoIPaF0Bg= -cosmossdk.io/store v1.1.1-0.20240418092142-896cdf1971bc/go.mod h1:amTTatOUV3u1PsKmNb87z6/galCxrRbz9kRdJkL0DyU= cosmossdk.io/x/accounts/defaults/lockup v0.0.0-20240417181816-5e7aae0db1f5 h1:eb0kcGyaYHSS0do7+MIWg7UKlskSH01biRNENbm/zDA= cosmossdk.io/x/accounts/defaults/lockup v0.0.0-20240417181816-5e7aae0db1f5/go.mod h1:drzY4oVisyWvSgpsM7ccQ7IX3efMuVIvd9Eij1Gm/6o= cosmossdk.io/x/tx v0.13.3 h1:Ha4mNaHmxBc6RMun9aKuqul8yHiL78EKJQ8g23Zf73g= diff --git a/store/snapshots/manager.go b/store/snapshots/manager.go index 3bedcd3ae001..d66740535131 100644 --- a/store/snapshots/manager.go +++ b/store/snapshots/manager.go @@ -140,6 +140,22 @@ func (m *Manager) endLocked() { m.restoreChunkIndex = 0 } +// Reset cleans up the current state of the Manager and prepares it for new operations. +// This method should be used with caution. +// State Clearing: Calls endLocked to reset the internal state, which includes closing channels and resetting variables. +// +// Improper use might lead to issues if not all components are reinitialized correctly. +func (m *Manager) Reset() error { + m.mtx.Lock() // Ensure mutual exclusion to prevent race conditions + defer m.mtx.Unlock() + + // Call endLocked to clear the current state + m.endLocked() + + m.logger.Info("Manager state has been reset") + return nil +} + // GetInterval returns snapshot interval represented in heights. func (m *Manager) GetInterval() uint64 { return m.opts.Interval diff --git a/store/snapshots/types/errors.go b/store/snapshots/types/errors.go index c1b5db532e42..de2d1e9d85f4 100644 --- a/store/snapshots/types/errors.go +++ b/store/snapshots/types/errors.go @@ -16,4 +16,7 @@ var ( // ErrInvalidSnapshotVersion is returned when the snapshot version is invalid ErrInvalidSnapshotVersion = errors.New("invalid snapshot version") + + // ErrTimedOutSnapshotChunks is returned when the snapshot chunks takes more than 2min + ErrTimedOutSnapshotChunks = errors.New("invalid snapshot version") ) From bf060e8669414807391b77501fd0a65f57d0fd29 Mon Sep 17 00:00:00 2001 From: Lucas Francisco Lopez Date: Wed, 8 May 2024 15:38:32 +0200 Subject: [PATCH 2/3] minor change --- store/snapshots/types/errors.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/store/snapshots/types/errors.go b/store/snapshots/types/errors.go index de2d1e9d85f4..e0c689275070 100644 --- a/store/snapshots/types/errors.go +++ b/store/snapshots/types/errors.go @@ -18,5 +18,5 @@ var ( ErrInvalidSnapshotVersion = errors.New("invalid snapshot version") // ErrTimedOutSnapshotChunks is returned when the snapshot chunks takes more than 2min - ErrTimedOutSnapshotChunks = errors.New("invalid snapshot version") + ErrTimedOutSnapshotChunks = errors.New("timed out waiting for chunk") ) From 3a8aaff76d069095c319a9fa2036bfff6b9f3954 Mon Sep 17 00:00:00 2001 From: Lucas Francisco Lopez Date: Wed, 8 May 2024 16:18:08 +0200 Subject: [PATCH 3/3] linting issues --- store/types/store.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/store/types/store.go b/store/types/store.go index 835d5b29ff84..6c773f2807d6 100644 --- a/store/types/store.go +++ b/store/types/store.go @@ -4,12 +4,12 @@ import ( "fmt" "io" + crypto "github.com/cometbft/cometbft/api/cometbft/crypto/v1" dbm "github.com/cosmos/cosmos-db" "cosmossdk.io/store/metrics" pruningtypes "cosmossdk.io/store/pruning/types" snapshottypes "cosmossdk.io/store/snapshots/types" - crypto "github.com/cometbft/cometbft/api/cometbft/crypto/v1" ) type Store interface {