fix: check sp health retry #1415

Merged · 1 commit · Jun 4, 2024
55 changes: 35 additions & 20 deletions base/gfspvgmgr/virtual_group_manager.go
@@ -7,6 +7,7 @@ import (
	"math/rand"
	"net/http"
	"sort"
+	"strconv"
	"strings"
	"sync"
	"time"
@@ -19,6 +20,7 @@ import (
	"github.com/bnb-chain/greenfield-storage-provider/core/consensus"
	"github.com/bnb-chain/greenfield-storage-provider/core/vgmgr"
	"github.com/bnb-chain/greenfield-storage-provider/pkg/log"
+	"github.com/bnb-chain/greenfield-storage-provider/pkg/metrics"
	"github.com/bnb-chain/greenfield-storage-provider/util"
	sptypes "github.com/bnb-chain/greenfield/x/sp/types"
	virtualgrouptypes "github.com/bnb-chain/greenfield/x/virtualgroup/types"
@@ -33,9 +35,11 @@ const (
	DefaultInitialGVGStakingStorageSize = uint64(2) * 1024 * 1024 * 1024 * 1024 // 2TB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB
	additionalGVGStakingStorageSize = uint64(1) * 1024 * 1024 * 1024 * 1024 // 1TB

-	defaultSPCheckTimeout = 3 * time.Second
-	defaultSPHealthCheckerInterval = 10 * time.Second
-	httpStatusPath = "/status"
+	defaultSPCheckTimeout = 1 * time.Minute
+	defaultSPHealthCheckerInterval = 10 * time.Second
+	defaultSPHealthCheckerRetryInterval = 1 * time.Second
+	defaultSPHealthCheckerMaxRetries = 5
+	httpStatusPath = "/status"

	emptyGVGSafeDeletePeriod = int64(60) * 60 * 24
)
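
An aside on the scale of these new constants, as a rough illustrative calculation rather than code from this PR: assuming the surrounding context does not expire sooner, each attempt may run up to the full client timeout and each failure is followed by the retry pause, so a single unreachable SP can keep one health check busy for about five minutes. The constants below are local stand-ins, not imports from the repository.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Stand-ins for the constants above (not imported from the repository).
	const (
		checkTimeout  = 1 * time.Minute // defaultSPCheckTimeout
		retryInterval = 1 * time.Second // defaultSPHealthCheckerRetryInterval
		maxRetries    = 5               // defaultSPHealthCheckerMaxRetries
	)

	// Worst case: every attempt runs to the client timeout and each failure
	// is followed by the retry pause before the next attempt (or loop exit).
	worst := time.Duration(maxRetries) * (checkTimeout + retryInterval)
	fmt.Println("worst-case duration of one checkSPHealth call:", worst) // 5m5s
}
```
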
@@ -773,7 +777,7 @@ func (checker *HealthChecker) checkAllSPHealth() {

func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
	if !sp.IsInService() {
-		log.CtxInfow(context.Background(), "the sp is not in service,sp is treated as unhealthy", "sp", sp)
+		log.CtxInfow(context.Background(), "the sp is not in service, sp is treated as unhealthy", "sp", sp)
		return false
	}

@@ -785,30 +789,41 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
		},
-		Timeout: defaultSPCheckTimeout * time.Second,
+		Timeout: defaultSPCheckTimeout,
	}

	// Create an HTTP request to test the validity of the endpoint
	urlToCheck := fmt.Sprintf("%s%s", endpoint, httpStatusPath)
-	req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
-	if err != nil {
-		return false
-	}
+	for attempt := 0; attempt < defaultSPHealthCheckerMaxRetries; attempt++ {
+		start := time.Now()
+		req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to create request", "sp", sp, "error", err)
+			return false
+		}

-	resp, err := client.Do(req)
-	if err != nil {
-		log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err)
-		return false
-	}
-	defer resp.Body.Close()
+		resp, err := client.Do(req)
+		duration := time.Since(start)
+		metrics.SPHealthCheckerTime.WithLabelValues(strconv.Itoa(int(sp.Id))).Observe(duration.Seconds())
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+			continue
+		}
+		defer resp.Body.Close()

-	if resp.StatusCode != http.StatusOK {
-		log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body)
-		return false
+		if resp.StatusCode == http.StatusOK {
+			log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration)
+			return true
+		} else {
+			metrics.SPHealthCheckerFailureCounter.WithLabelValues(strconv.Itoa(int(sp.Id))).Inc()
+			log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+		}
	}

-	log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp)
-	return true
+	log.CtxErrorw(context.Background(), "failed to check sp healthy after retries", "sp", sp)
+	return false
}

func (checker *HealthChecker) Start() {
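
To see the new retry behaviour in isolation, here is a minimal, self-contained sketch; it is not the repository's code — the checkHealth function, its constants, and the flaky test server are simplified stand-ins that only mirror the shape of the loop above, using net/http/httptest.

```go
// Illustrative only: a simplified stand-in for the retry loop in checkSPHealth.
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"time"
)

const (
	checkTimeout  = 5 * time.Second        // stands in for defaultSPCheckTimeout
	retryInterval = 100 * time.Millisecond // stands in for defaultSPHealthCheckerRetryInterval
	maxRetries    = 5                      // stands in for defaultSPHealthCheckerMaxRetries
	statusPath    = "/status"
)

// checkHealth mirrors the new loop: each attempt is timed, a transport error
// or non-200 status triggers a sleep-and-retry, and the first 200 response
// short-circuits with success.
func checkHealth(endpoint string) bool {
	client := &http.Client{Timeout: checkTimeout}
	url := endpoint + statusPath

	for attempt := 0; attempt < maxRetries; attempt++ {
		start := time.Now()
		resp, err := client.Get(url)
		duration := time.Since(start)
		if err != nil {
			fmt.Printf("attempt %d: transport error after %v: %v\n", attempt, duration, err)
			time.Sleep(retryInterval)
			continue
		}
		resp.Body.Close()
		if resp.StatusCode == http.StatusOK {
			fmt.Printf("attempt %d: healthy after %v\n", attempt, duration)
			return true
		}
		fmt.Printf("attempt %d: status %d after %v\n", attempt, resp.StatusCode, duration)
		time.Sleep(retryInterval)
	}
	return false
}

func main() {
	// Flaky endpoint: fails twice, then recovers — the retry loop should
	// report healthy on the third attempt instead of failing outright.
	var calls int32
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if atomic.AddInt32(&calls, 1) <= 2 {
			w.WriteHeader(http.StatusServiceUnavailable)
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer srv.Close()

	fmt.Println("healthy:", checkHealth(srv.URL))
}
```

Running it prints two failed attempts followed by a success, which is the point of the change: a transient blip no longer marks the SP unhealthy on the first probe.
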
16 changes: 16 additions & 0 deletions pkg/metrics/metric_items.go
@@ -61,6 +61,8 @@ var MetricsItems = []prometheus.Collector{
	ManagerCounter,
	ManagerTime,
	GCBlockNumberGauge,
+	SPHealthCheckerTime,
+	SPHealthCheckerFailureCounter,

	// workflow metrics category
	PerfApprovalTime,
@@ -247,6 +249,20 @@
		Name: "gc_block_number",
		Help: "Track the next gc block number.",
	}, []string{"gc_block_number"})
+	SPHealthCheckerTime = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "sp_health_checker_request_time",
+			Help: "Request duration in seconds.",
+		},
+		[]string{"sp_id"},
+	)
+	SPHealthCheckerFailureCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "sp_health_checker_request_counter",
+			Help: "Request failure count.",
+		},
+		[]string{"sp_id"},
+	)
)

// workflow metrics items
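
As a quick illustration of how these two collectors are consumed, here is a sketch under simplified assumptions: the metric names and the sp_id label mirror the diff, but the lowercase variable names, the hand-rolled main, and the direct MustRegister call are stand-ins — in the repository, registration goes through MetricsItems and the metrics package.

```go
// Illustrative only: simplified registration and use of the new collectors.
package main

import (
	"net/http"
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	spHealthCheckerTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "sp_health_checker_request_time",
			Help: "Request duration in seconds.",
		},
		[]string{"sp_id"},
	)
	spHealthCheckerFailureCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "sp_health_checker_request_counter",
			Help: "Request failure count.",
		},
		[]string{"sp_id"},
	)
)

func main() {
	prometheus.MustRegister(spHealthCheckerTime, spHealthCheckerFailureCounter)

	// Simulate one health-check attempt against SP id 7: record its duration,
	// and bump the failure counter as the retry loop does on a bad status.
	spID := strconv.Itoa(7)
	start := time.Now()
	time.Sleep(25 * time.Millisecond) // pretend the /status request took this long
	spHealthCheckerTime.WithLabelValues(spID).Observe(time.Since(start).Seconds())
	spHealthCheckerFailureCounter.WithLabelValues(spID).Inc()

	// Expose the metrics so they can be scraped at http://localhost:2112/metrics.
	http.Handle("/metrics", promhttp.Handler())
	_ = http.ListenAndServe(":2112", nil)
}
```

With the checker running, the histogram gives per-SP latency for the /status probe and the counter tracks how often a probe came back non-200, which is what the retry loop increments before sleeping.
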