Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Reset snapshot manager on chunk timeout to handle new cometBFT offer #20319

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions baseapp/abci.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,31 +277,32 @@ func (app *BaseApp) OfferSnapshot(req *abci.OfferSnapshotRequest) (*abci.OfferSn
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT}, nil
}

err = app.snapshotManager.Restore(snapshot)
switch {
case err == nil:
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil
if err := app.snapshotManager.Restore(snapshot); err != nil {
return app.handleSnapshotRestoreError(err, req)
}

return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil
}

// handleSnapshotRestoreError handles specific snapshot restore errors
func (app *BaseApp) handleSnapshotRestoreError(err error, req *abci.OfferSnapshotRequest) (*abci.OfferSnapshotResponse, error) {
app.logger.Error("failed to restore snapshot", "height", req.Snapshot.Height, "format", req.Snapshot.Format, "err", err)

switch {
case errors.Is(err, snapshottypes.ErrUnknownFormat):
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT_FORMAT}, nil

case errors.Is(err, snapshottypes.ErrInvalidMetadata):
app.logger.Error(
"rejecting invalid snapshot",
"height", req.Snapshot.Height,
"format", req.Snapshot.Format,
"err", err,
)
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_REJECT}, nil

default:
app.logger.Error(
"failed to restore snapshot",
"height", req.Snapshot.Height,
"format", req.Snapshot.Format,
"err", err,
)
case errors.Is(err, snapshottypes.ErrTimedOutSnapshotChunks):
if resetErr := app.snapshotManager.Reset(); resetErr != nil {
app.logger.Error("failed to reset snapshot manager", "err", resetErr)
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ABORT}, nil
}
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ACCEPT}, nil

default:
// We currently don't support resetting the IAVL stores and retrying a
// different snapshot, so we ask CometBFT to abort all snapshot restoration.
return &abci.OfferSnapshotResponse{Result: abci.OFFER_SNAPSHOT_RESULT_ABORT}, nil
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,8 @@ replace (
github.com/syndtr/goleveldb => github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
)

replace cosmossdk.io/store => ./store // TODO

retract (
// false start by tagging the wrong branch
v0.50.0
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ cosmossdk.io/log v1.3.1 h1:UZx8nWIkfbbNEWusZqzAx3ZGvu54TZacWib3EzUYmGI=
cosmossdk.io/log v1.3.1/go.mod h1:2/dIomt8mKdk6vl3OWJcPk2be3pGOS8OQaLUM/3/tCM=
cosmossdk.io/math v1.3.0 h1:RC+jryuKeytIiictDslBP9i1fhkVm6ZDmZEoNP316zE=
cosmossdk.io/math v1.3.0/go.mod h1:vnRTxewy+M7BtXBNFybkuhSH4WfedVAAnERHgVFhp3k=
cosmossdk.io/store v1.1.1-0.20240418092142-896cdf1971bc h1:R9O9d75e0qZYUsVV0zzi+D7cNLnX2JrUOQNoIPaF0Bg=
cosmossdk.io/store v1.1.1-0.20240418092142-896cdf1971bc/go.mod h1:amTTatOUV3u1PsKmNb87z6/galCxrRbz9kRdJkL0DyU=
cosmossdk.io/x/accounts/defaults/lockup v0.0.0-20240417181816-5e7aae0db1f5 h1:eb0kcGyaYHSS0do7+MIWg7UKlskSH01biRNENbm/zDA=
cosmossdk.io/x/accounts/defaults/lockup v0.0.0-20240417181816-5e7aae0db1f5/go.mod h1:drzY4oVisyWvSgpsM7ccQ7IX3efMuVIvd9Eij1Gm/6o=
cosmossdk.io/x/tx v0.13.3 h1:Ha4mNaHmxBc6RMun9aKuqul8yHiL78EKJQ8g23Zf73g=
Expand Down
16 changes: 16 additions & 0 deletions store/snapshots/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ func (m *Manager) endLocked() {
m.restoreChunkIndex = 0
}

// Reset clears the Manager's current state by delegating to endLocked, so
// that a new operation can be started afterwards.
//
// The manager mutex is held for the whole call. Use with caution: nothing
// beyond what endLocked clears is reinitialized here, so any other component
// that depends on the previous state must be reset by the caller.
//
// The returned error is currently always nil.
func (m *Manager) Reset() error {
	m.mtx.Lock()
	defer m.mtx.Unlock()

	// m.mtx is held at this point; endLocked presumably requires that, going
	// by its name (confirm against its definition).
	m.endLocked()
	m.logger.Info("Manager state has been reset")

	return nil
}

// GetInterval returns snapshot interval represented in heights.
func (m *Manager) GetInterval() uint64 {
return m.opts.Interval
Expand Down
3 changes: 3 additions & 0 deletions store/snapshots/types/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ var (

// ErrInvalidSnapshotVersion is returned when the snapshot version is invalid
ErrInvalidSnapshotVersion = errors.New("invalid snapshot version")

// ErrTimedOutSnapshotChunks is returned when waiting for a snapshot chunk takes more than 2min
ErrTimedOutSnapshotChunks = errors.New("timed out waiting for chunk")
)
2 changes: 1 addition & 1 deletion store/types/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import (
"fmt"
"io"

crypto "github.com/cometbft/cometbft/api/cometbft/crypto/v1"
dbm "github.com/cosmos/cosmos-db"

"cosmossdk.io/store/metrics"
pruningtypes "cosmossdk.io/store/pruning/types"
snapshottypes "cosmossdk.io/store/snapshots/types"
crypto "github.com/cometbft/cometbft/api/cometbft/crypto/v1"
)

type Store interface {
Expand Down
Loading