diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8152920da..5c952c9de 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,25 @@
 # Changelog

+## v1.8.0
+BUGFIXES
+* [#1404](https://github.com/bnb-chain/greenfield-storage-provider/pull/1404) fix: use exponential backoff for task retry policy
+* [#1406](https://github.com/bnb-chain/greenfield-storage-provider/pull/1406) fix: change replicate failure error code to 404 when object is not found
+* [#1407](https://github.com/bnb-chain/greenfield-storage-provider/pull/1407) fix: refine error code for bucket migration
+* [#1409](https://github.com/bnb-chain/greenfield-storage-provider/pull/1409) fix: avoid users to select network in universal endpoint page
+* [#1412](https://github.com/bnb-chain/greenfield-storage-provider/pull/1412) fix: bs bucket migrate event bug
+* [#1414](https://github.com/bnb-chain/greenfield-storage-provider/pull/1414) fix: metric check sp
+* [#1415](https://github.com/bnb-chain/greenfield-storage-provider/pull/1415) fix: check sp health retry
+* [#1422](https://github.com/bnb-chain/greenfield-storage-provider/pull/1422) fix: gvg staking storage size
+
+FEATURES
+* [#1405](https://github.com/bnb-chain/greenfield-storage-provider/pull/1405) feat: bs add bucket status field
+* [#1408](https://github.com/bnb-chain/greenfield-storage-provider/pull/1408) feat: add detailed logs for special customized logs
+* [#1416](https://github.com/bnb-chain/greenfield-storage-provider/pull/1416) perf: logic
+
+DOCS
+* [#1402](https://github.com/bnb-chain/greenfield-storage-provider/pull/1402) docs: move SP cmd/module/api docs from docs.bnbchain.org to the sp repo
+* [#1417](https://github.com/bnb-chain/greenfield-storage-provider/pull/1417) fix: readme
+
 ## v1.7.0
 BUGFIXES
 * [#1394](https://github.com/bnb-chain/greenfield-storage-provider/pull/1394) fix: pick new gvg when retry failed replicate piece task
diff --git a/base/gfspvgmgr/virtual_group_manager.go b/base/gfspvgmgr/virtual_group_manager.go
index 76de54502..f92f9d7e5 100644
--- a/base/gfspvgmgr/virtual_group_manager.go
+++ b/base/gfspvgmgr/virtual_group_manager.go
@@ -7,6 +7,7 @@ import (
 	"math/rand"
 	"net/http"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -19,6 +20,7 @@ import (
 	"github.com/bnb-chain/greenfield-storage-provider/core/consensus"
 	"github.com/bnb-chain/greenfield-storage-provider/core/vgmgr"
 	"github.com/bnb-chain/greenfield-storage-provider/pkg/log"
+	"github.com/bnb-chain/greenfield-storage-provider/pkg/metrics"
 	"github.com/bnb-chain/greenfield-storage-provider/util"
 	sptypes "github.com/bnb-chain/greenfield/x/sp/types"
 	virtualgrouptypes "github.com/bnb-chain/greenfield/x/virtualgroup/types"
@@ -30,12 +32,14 @@ const (
 	VirtualGroupManagerSpace            = "VirtualGroupManager"
 	RefreshMetaInterval                 = 5 * time.Second
 	MaxStorageUsageRatio                = 0.95
-	DefaultInitialGVGStakingStorageSize = uint64(1) * 1024 * 1024 * 1024 * 1024 // 1TB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB
-	additionalGVGStakingStorageSize     = uint64(1) * 1024 * 1024 * 1024 * 512  // 0.5TB
+	DefaultInitialGVGStakingStorageSize = uint64(1) * 1024 * 1024 * 1024 * 256 // 256GB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB
+	additionalGVGStakingStorageSize     = uint64(1) * 1024 * 1024 * 1024 * 512 // 0.5TB

-	defaultSPCheckTimeout          = 3 * time.Second
-	defaultSPHealthCheckerInterval = 10 * time.Second
-	httpStatusPath                 = "/status"
+	defaultSPCheckTimeout               = 1 * time.Minute
+	defaultSPHealthCheckerInterval      = 10 * time.Second
+	defaultSPHealthCheckerRetryInterval = 1 * time.Second
+	defaultSPHealthCheckerMaxRetries    = 5
+	httpStatusPath                      = "/status"

 	emptyGVGSafeDeletePeriod = int64(60) * 60 * 24
 )
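The staking-size change above is easier to judge with the numbers written out: against the chain-side family cap of 64 TB, an initial stake of 256 GB per GVG leaves room for far more GVGs (and 0.5 TB top-ups) than the previous 1 TB default. A minimal, illustrative Go sketch of that arithmetic (the constant names are local to this example, not taken from the repo):

```go
package main

import "fmt"

func main() {
	const (
		gib = uint64(1) << 30
		tib = uint64(1) << 40

		maxStoreSizePerFamily = 64 * tib  // chain-side DefaultMaxStoreSizePerFamily
		initialStakePerGVG    = 256 * gib // new DefaultInitialGVGStakingStorageSize
		additionalStake       = 512 * gib // additionalGVGStakingStorageSize (0.5 TiB)
	)
	// 64 TiB / 256 GiB = 256 initial stakes per family (vs. 64 with the old 1 TiB default).
	fmt.Println("initial GVG stakes per family:", maxStoreSizePerFamily/initialStakePerGVG)
	// Each later top-up consumes 0.5 TiB of the same cap.
	fmt.Println("0.5 TiB top-ups per family:", maxStoreSizePerFamily/additionalStake)
}
```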
@@ -525,6 +529,10 @@ func (vgm *virtualGroupManager) FreezeSPAndGVGs(spID uint32, gvgs []*virtualgrou
 	vgm.freezeSPPool.FreezeSPAndGVGs(spID, gvgs)
 }

+func (vgm *virtualGroupManager) ReleaseAllSP() {
+	vgm.freezeSPPool.ReleaseAllSP()
+}
+
 // releaseSPAndGVGLoop runs periodically to release SP from the freeze pool
 func (vgm *virtualGroupManager) releaseSPAndGVGLoop() {
 	ticker := time.NewTicker(ReleaseSPJobInterval)
@@ -773,7 +781,7 @@ func (checker *HealthChecker) checkAllSPHealth() {

 func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
 	if !sp.IsInService() {
-		log.CtxInfow(context.Background(), "the sp is not in service,sp is treated as unhealthy", "sp", sp)
+		log.CtxInfow(context.Background(), "the sp is not in service, sp is treated as unhealthy", "sp", sp)
 		return false
 	}

@@ -785,30 +793,41 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
 		Transport: &http.Transport{
 			TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
 		},
-		Timeout: defaultSPCheckTimeout * time.Second,
+		Timeout: defaultSPCheckTimeout,
 	}

 	// Create an HTTP request to test the validity of the endpoint
 	urlToCheck := fmt.Sprintf("%s%s", endpoint, httpStatusPath)
-	req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
-	if err != nil {
-		return false
-	}
+	for attempt := 0; attempt < defaultSPHealthCheckerMaxRetries; attempt++ {
+		start := time.Now()
+		req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to create request", "sp", sp, "error", err)
+			return false
+		}

-	resp, err := client.Do(req)
-	if err != nil {
-		log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err)
-		return false
-	}
-	defer resp.Body.Close()
+		resp, err := client.Do(req)
+		duration := time.Since(start)
+		metrics.SPHealthCheckerTime.WithLabelValues(strconv.Itoa(int(sp.Id))).Observe(duration.Seconds())
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+			continue
+		}
+		defer resp.Body.Close()

-	if resp.StatusCode != http.StatusOK {
-		log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body)
-		return false
+		if resp.StatusCode == http.StatusOK {
+			log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration)
+			return true
+		} else {
+			metrics.SPHealthCheckerFailureCounter.WithLabelValues(strconv.Itoa(int(sp.Id))).Inc()
+			log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+		}
 	}

-	log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp)
-	return true
+	log.CtxErrorw(context.Background(), "failed to check sp healthy after retries", "sp", sp)
+	return false
 }

 func (checker *HealthChecker) Start() {
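For readers skimming the diff, the reworked checkSPHealth boils down to: probe the SP's /status endpoint up to defaultSPHealthCheckerMaxRetries times, wait defaultSPHealthCheckerRetryInterval between attempts, and record a per-SP duration histogram plus a failure counter. A stripped-down, self-contained sketch of the same retry shape (metrics and chain lookups omitted; names here are illustrative, not from the repo):

```go
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

const (
	checkTimeout  = 1 * time.Minute // mirrors defaultSPCheckTimeout
	retryInterval = 1 * time.Second // mirrors defaultSPHealthCheckerRetryInterval
	maxRetries    = 5               // mirrors defaultSPHealthCheckerMaxRetries
)

// checkEndpointHealthy returns true as soon as one attempt gets HTTP 200,
// and false once every attempt has failed.
func checkEndpointHealthy(ctx context.Context, endpoint string) bool {
	client := &http.Client{Timeout: checkTimeout}
	url := endpoint + "/status"
	for attempt := 0; attempt < maxRetries; attempt++ {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return false // malformed endpoint; retrying will not help
		}
		resp, err := client.Do(req)
		if err == nil {
			ok := resp.StatusCode == http.StatusOK
			resp.Body.Close()
			if ok {
				return true
			}
		}
		time.Sleep(retryInterval)
	}
	return false
}

func main() {
	fmt.Println(checkEndpointHealthy(context.Background(), "http://localhost:9033"))
}
```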
diff --git a/core/vgmgr/virtual_group_manager.go b/core/vgmgr/virtual_group_manager.go
index cbbc2f9dd..779b44106 100644
--- a/core/vgmgr/virtual_group_manager.go
+++ b/core/vgmgr/virtual_group_manager.go
@@ -144,6 +144,9 @@ type VirtualGroupManager interface {
 	// For those SPs which are in the pool will be skipped when creating a GVG, GVGs in the pool will not be chosen to seal Object
 	// until released
 	FreezeSPAndGVGs(spID uint32, gvgs []*virtualgrouptypes.GlobalVirtualGroup)
+	// ReleaseAllSP releases all SPs and their related GVGs, in case there is not enough balance to create a new GVG.
+	// The existing GVGs should be used even if they previously failed to serve.
+	ReleaseAllSP()
 }

 // NewVirtualGroupManager is the virtual group manager init api.
diff --git a/core/vgmgr/virtual_group_manager_mock.go b/core/vgmgr/virtual_group_manager_mock.go
index 9788a813a..244d162f8 100644
--- a/core/vgmgr/virtual_group_manager_mock.go
+++ b/core/vgmgr/virtual_group_manager_mock.go
@@ -278,6 +278,18 @@ func (mr *MockVirtualGroupManagerMockRecorder) FreezeSPAndGVGs(spID, gvgs any) *
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FreezeSPAndGVGs", reflect.TypeOf((*MockVirtualGroupManager)(nil).FreezeSPAndGVGs), spID, gvgs)
 }

+// ReleaseAllSP mocks base method.
+func (m *MockVirtualGroupManager) ReleaseAllSP() {
+	m.ctrl.T.Helper()
+	m.ctrl.Call(m, "ReleaseAllSP")
+}
+
+// ReleaseAllSP indicates an expected call of ReleaseAllSP.
+func (mr *MockVirtualGroupManagerMockRecorder) ReleaseAllSP() *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ReleaseAllSP", reflect.TypeOf((*MockVirtualGroupManager)(nil).ReleaseAllSP))
+}
+
 // GenerateGlobalVirtualGroupMeta mocks base method.
 func (m *MockVirtualGroupManager) GenerateGlobalVirtualGroupMeta(genPolicy GenerateGVGSecondarySPsPolicy, excludeSPsFilter ExcludeFilter) (*GlobalVirtualGroupMeta, error) {
 	m.ctrl.T.Helper()
diff --git a/go.mod b/go.mod
index a0f666fbe..c96c6bb9e 100644
--- a/go.mod
+++ b/go.mod
@@ -10,7 +10,7 @@ require (
 	github.com/aliyun/credentials-go v1.3.0
 	github.com/avast/retry-go/v4 v4.3.1
 	github.com/aws/aws-sdk-go v1.44.159
-	github.com/bnb-chain/greenfield v1.7.1-0.20240521062200-cc41c389096c
+	github.com/bnb-chain/greenfield v1.8.0
 	github.com/bnb-chain/greenfield-common/go v0.0.0-20240228080631-2683b0ee669a
 	github.com/bytedance/gopkg v0.0.0-20221122125632-68358b8ecec6
 	github.com/cometbft/cometbft v0.38.6
@@ -306,7 +306,7 @@ replace (
 	github.com/cometbft/cometbft => github.com/bnb-chain/greenfield-cometbft v1.2.1-0.20240408033601-a6b682aa870e
 	github.com/cometbft/cometbft-db => github.com/bnb-chain/greenfield-cometbft-db v0.8.1-alpha.1
 	github.com/confio/ics23/go => github.com/cosmos/cosmos-sdk/ics23/go v0.8.0
-	github.com/cosmos/cosmos-sdk => github.com/bnb-chain/greenfield-cosmos-sdk v1.6.1-0.20240419024340-b5c75cfd8110
+	github.com/cosmos/cosmos-sdk => github.com/bnb-chain/greenfield-cosmos-sdk v1.8.0
 	github.com/cosmos/iavl => github.com/bnb-chain/greenfield-iavl v0.20.1
 	github.com/forbole/juno/v4 => github.com/bnb-chain/juno/v4 v4.0.0-20240604033531-028f2cc8f76d
 	github.com/gogo/protobuf => github.com/regen-network/protobuf v1.3.3-alpha.regen.1
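With ReleaseAllSP added to the interface and to the generated mock above, manager tests can assert the release path explicitly. A hypothetical snippet (the test name and the gomock import path are assumptions; older checkouts may use github.com/golang/mock/gomock):

```go
package manager_test

import (
	"testing"

	"go.uber.org/mock/gomock" // assumption: import path may differ in this repo

	"github.com/bnb-chain/greenfield-storage-provider/core/vgmgr"
)

func TestReleaseAllSPIsCalledOnPickFailure(t *testing.T) {
	ctrl := gomock.NewController(t)
	defer ctrl.Finish()

	mockVGM := vgmgr.NewMockVirtualGroupManager(ctrl)
	// The new recorder lets a test pin down that the freeze pool is emptied exactly once.
	mockVGM.EXPECT().ReleaseAllSP().Times(1)

	// In a real test this call is made by the code under test
	// (the manager's pickGVGAndReplicate error path shown later in this diff).
	mockVGM.ReleaseAllSP()
}
```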
diff --git a/go.sum b/go.sum
index a86372c8f..47409bfe6 100644
--- a/go.sum
+++ b/go.sum
@@ -176,16 +176,16 @@ github.com/bgentry/speakeasy v0.1.1-0.20220910012023-760eaf8b6816/go.mod h1:+zsy
 github.com/bits-and-blooms/bitset v1.10.0 h1:ePXTeiPEazB5+opbv5fr8umg2R/1NlzgDsyepwsSr88=
 github.com/bits-and-blooms/bitset v1.10.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 github.com/bmizerany/pat v0.0.0-20170815010413-6226ea591a40/go.mod h1:8rLXio+WjiTceGBHIoTvn60HIbs7Hm7bcHjyrSqYB9c=
-github.com/bnb-chain/greenfield v1.7.1-0.20240521062200-cc41c389096c h1:azKKvzAC/yLthneXB1sH6iPwVVj7sRK8X58PuTU/TsM=
-github.com/bnb-chain/greenfield v1.7.1-0.20240521062200-cc41c389096c/go.mod h1:RkoY1ISUFUMNbw2iR7iX8M+ToqmB8AlcRFKpwCJEX3Q=
+github.com/bnb-chain/greenfield v1.8.0 h1:5E8GQF9bS+ltG2PVXOiDXr6zBNL8aG/QhWYp7C2EgDM=
+github.com/bnb-chain/greenfield v1.8.0/go.mod h1:R4itO5Q7d5wj0L9sAXpbrVZMUrdDyRtSjccW8XOEFvI=
 github.com/bnb-chain/greenfield-cometbft v1.2.1-0.20240408033601-a6b682aa870e h1:4ttDy8yBhBUW0gdFyBK0wHMJS5ZtlhBdoYx/O6T6Eqg=
 github.com/bnb-chain/greenfield-cometbft v1.2.1-0.20240408033601-a6b682aa870e/go.mod h1:q9/nqW19iXvxyma5XgcZfxL/OkWI9s5e7yX9ecePz8A=
 github.com/bnb-chain/greenfield-cometbft-db v0.8.1-alpha.1 h1:XcWulGacHVRiSCx90Q8Y//ajOrLNBQWR/KDB89dy3cU=
 github.com/bnb-chain/greenfield-cometbft-db v0.8.1-alpha.1/go.mod h1:ey1CiK4bYo1RBNJLRiVbYr5CMdSxci9S/AZRINLtppI=
 github.com/bnb-chain/greenfield-common/go v0.0.0-20240228080631-2683b0ee669a h1:VjUknQkIcqkjYCt1hmfpinM7kToOBuUU+KykrrqFsEM=
 github.com/bnb-chain/greenfield-common/go v0.0.0-20240228080631-2683b0ee669a/go.mod h1:K9jK80fbahciC+FAvrch8Qsbw9ZkvVgjfKsqrzPTAVA=
-github.com/bnb-chain/greenfield-cosmos-sdk v1.6.1-0.20240419024340-b5c75cfd8110 h1:max1dH2HkKrNZpL2Jv6xwl+XWHsjJC6Ay+caN17u3CI=
-github.com/bnb-chain/greenfield-cosmos-sdk v1.6.1-0.20240419024340-b5c75cfd8110/go.mod h1:siglWrVkM1+6tj8ZPwzMIITWQh7D8gsKJUk0Suz+ul0=
+github.com/bnb-chain/greenfield-cosmos-sdk v1.8.0 h1:XaHBYnlAJNIEVTr9dXp3jzw12gCoIEL5jHiAMp+PX0s=
+github.com/bnb-chain/greenfield-cosmos-sdk v1.8.0/go.mod h1:2bwmwdXYBISnQoMwgAcZTVGt21lMsHZSeeeMByTvDlQ=
 github.com/bnb-chain/greenfield-cosmos-sdk/api v0.0.0-20231206043955-0855e0965bc8 h1:mUMOeNo3K0SZvAhiOHNKW4mmkrhOphBF8tDUyK6e1tY=
 github.com/bnb-chain/greenfield-cosmos-sdk/api v0.0.0-20231206043955-0855e0965bc8/go.mod h1:vhsZxXE9tYJeYB5JR4hPhd6Pc/uPf7j1T8IJ7p9FdeM=
 github.com/bnb-chain/greenfield-cosmos-sdk/math v0.0.0-20231206043955-0855e0965bc8 h1:1Ud7itq03c4Q9h0kBpw1FYlWKN3kco8cgj59vdd50UQ=
diff --git a/modular/manager/manage_task.go b/modular/manager/manage_task.go
index 6938aff06..11cb56661 100644
--- a/modular/manager/manage_task.go
+++ b/modular/manager/manage_task.go
@@ -145,6 +145,8 @@ func (m *ManageModular) pickGVGAndReplicate(ctx context.Context, vgfID uint32, t
 	gvgMeta, err := m.pickGlobalVirtualGroup(ctx, vgfID, task.GetStorageParams())
 	log.CtxInfow(ctx, "pick global virtual group", "time_cost", time.Since(startPickGVGTime).Seconds(), "gvg_meta", gvgMeta, "error", err)
 	if err != nil {
+		// If there is no way to create a new GVG, release all SPs from the freeze pool; that is better than not serving requests.
+		m.virtualGroupManager.ReleaseAllSP()
 		return err
 	}
 	replicateTask := &gfsptask.GfSpReplicatePieceTask{}
diff --git a/pkg/metrics/metric_items.go b/pkg/metrics/metric_items.go
index b5fac7253..26372f317 100644
--- a/pkg/metrics/metric_items.go
+++ b/pkg/metrics/metric_items.go
@@ -61,6 +61,8 @@ var MetricsItems = []prometheus.Collector{
 	ManagerCounter,
 	ManagerTime,
 	GCBlockNumberGauge,
+	SPHealthCheckerTime,
+	SPHealthCheckerFailureCounter,

 	// workflow metrics category
 	PerfApprovalTime,
@@ -247,6 +249,20 @@ var (
 		Name: "gc_block_number",
 		Help: "Track the next gc block number.",
 	}, []string{"gc_block_number"})
+	SPHealthCheckerTime = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "sp_health_checker_request_time",
+			Help: "Request duration in seconds.",
+		},
+		[]string{"sp_id"},
+	)
+	SPHealthCheckerFailureCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "sp_health_checker_request_counter",
+			Help: "Request failure count.",
+		},
+		[]string{"sp_id"},
+	)
 )

 // workflow metrics items
diff --git a/store/sqldb/object_integrity.go b/store/sqldb/object_integrity.go
index b6725dc37..f7d91ded4 100644
--- a/store/sqldb/object_integrity.go
+++ b/store/sqldb/object_integrity.go
@@ -434,7 +434,7 @@ func (s *SpDBImpl) DeleteReplicatePieceChecksum(objectID uint64, segmentIdx uint
 		metrics.SPDBTime.WithLabelValues(SPDBSuccessDelReplicatePieceChecksum).Observe(
 			time.Since(startTime).Seconds())
 	}()
-	err = s.db.Where("object_id = ? and segment_idx = ? and redundancy_index = ? ", objectID, segmentIdx, redundancyIdx).Delete(PieceHashTable{}).Error
+	err = s.db.Where("object_id = ? and segment_index = ? and redundancy_index = ? ", objectID, segmentIdx, redundancyIdx).Delete(PieceHashTable{}).Error
 	return err
 }
diff --git a/store/sqldb/object_integrity_test.go b/store/sqldb/object_integrity_test.go
index 7da578f61..9c5d6f8d6 100644
--- a/store/sqldb/object_integrity_test.go
+++ b/store/sqldb/object_integrity_test.go
@@ -468,7 +468,7 @@ func TestSpDBImpl_DeleteReplicatePieceChecksumSuccess(t *testing.T) {
 	)
 	s, mock := setupDB(t)
 	mock.ExpectBegin()
-	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_idx = ? and redundancy_index = ?").
+	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_index = ? and redundancy_index = ?").
 		WithArgs(objectID, segmentIdx, redundancyIdx).WillReturnResult(sqlmock.NewResult(1, 1))
 	mock.ExpectCommit()
 	err := s.DeleteReplicatePieceChecksum(objectID, segmentIdx, redundancyIdx)
@@ -546,7 +546,7 @@ func TestSpDBImpl_DeleteAllReplicatePieceChecksumSuccess(t *testing.T) {
 	)
 	s, mock := setupDB(t)
 	mock.ExpectBegin()
-	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_idx = ? and redundancy_index = ?").
+	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_index = ? and redundancy_index = ?").
 		WithArgs(objectID, segmentIdx, redundancyIdx).WillReturnResult(sqlmock.NewResult(1, 1))
 	mock.ExpectCommit()
 	err := s.DeleteAllReplicatePieceChecksum(objectID, redundancyIdx, pieceCount)
@@ -562,7 +562,7 @@ func TestSpDBImpl_DeleteAllReplicatePieceChecksumFailure(t *testing.T) {
 	)
 	s, mock := setupDB(t)
 	mock.ExpectBegin()
-	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_idx = ? and redundancy_index = ?").
+	mock.ExpectExec("DELETE FROM `piece_hash` WHERE object_id = ? and segment_index = ? and redundancy_index = ?").
 		WithArgs(objectID, segmentIdx, redundancyIdx).WillReturnError(mockDBInternalError)
 	mock.ExpectRollback()
 	mock.ExpectCommit()
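The segment_idx to segment_index change in the last two files is a column-name fix: the raw WHERE condition passed to gorm must reference the real column of the piece_hash table, and the old spelling pointed at a column that does not exist. A hypothetical sketch of the relationship (the struct fields and tags below are assumptions for illustration, not copied from store/sqldb):

```go
package sketch

import "gorm.io/gorm"

// PieceHashTable is a stand-in for the model behind the `piece_hash` table;
// the column tags here are assumed, but per this diff the persisted column
// for the segment index is named segment_index.
type PieceHashTable struct {
	ObjectID        uint64 `gorm:"primary_key;column:object_id"`
	SegmentIndex    uint32 `gorm:"primary_key;column:segment_index"`
	RedundancyIndex int32  `gorm:"primary_key;column:redundancy_index"`
	PieceChecksum   string `gorm:"column:piece_checksum"`
}

// deleteReplicatePieceChecksum mirrors the fixed query shape: the raw condition
// must name the real column, segment_index, for the DELETE to target the row.
func deleteReplicatePieceChecksum(db *gorm.DB, objectID uint64, segmentIdx uint32, redundancyIdx int32) error {
	return db.Where("object_id = ? and segment_index = ? and redundancy_index = ?",
		objectID, segmentIdx, redundancyIdx).Delete(PieceHashTable{}).Error
}
```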