Skip to content

Commit

Permalink
fix: metric check sp (#1414)
Browse files Browse the repository at this point in the history
  • Loading branch information
BarryTong65 committed Jun 4, 2024
1 parent 8c58fa2 commit f3d5a1a
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 7 deletions.
12 changes: 5 additions & 7 deletions base/gfspvgmgr/virtual_group_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"math/rand"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -40,9 +41,6 @@ const (
defaultSPHealthCheckerMaxRetries = 5
httpStatusPath = "/status"

SPHealthCheckerDuration = "check_sp_health_duration"
SPHealthCheckerFailure = "check_sp_health_total_failure"

emptyGVGSafeDeletePeriod = int64(60) * 60 * 24
)

Expand Down Expand Up @@ -779,7 +777,7 @@ func (checker *HealthChecker) checkAllSPHealth() {

func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
if !sp.IsInService() {
log.CtxInfow(context.Background(), "the sp is not in service,sp is treated as unhealthy", "sp", sp)
log.CtxInfow(context.Background(), "the sp is not in service, sp is treated as unhealthy", "sp", sp)
return false
}

Expand All @@ -806,7 +804,7 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {

resp, err := client.Do(req)
duration := time.Since(start)
metrics.ReqTime.WithLabelValues(SPHealthCheckerDuration).Observe(duration.Seconds())
metrics.SPHealthCheckerTime.WithLabelValues(strconv.Itoa(int(sp.Id))).Observe(duration.Seconds())
if err != nil {
log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration)
time.Sleep(defaultSPHealthCheckerRetryInterval)
Expand All @@ -818,8 +816,8 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration)
return true
} else {
metrics.ReqCounter.WithLabelValues(SPHealthCheckerFailure).Inc()
log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body, "duration", duration)
metrics.SPHealthCheckerFailureCounter.WithLabelValues(strconv.Itoa(int(sp.Id))).Inc()
log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "duration", duration)
time.Sleep(defaultSPHealthCheckerRetryInterval)
}
}
Expand Down
16 changes: 16 additions & 0 deletions pkg/metrics/metric_items.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ var MetricsItems = []prometheus.Collector{
ManagerCounter,
ManagerTime,
GCBlockNumberGauge,
SPHealthCheckerTime,
SPHealthCheckerFailureCounter,

// workflow metrics category
PerfApprovalTime,
Expand Down Expand Up @@ -246,6 +248,20 @@ var (
Name: "gc_block_number",
Help: "Track the next gc block number.",
}, []string{"gc_block_number"})
// SPHealthCheckerTime is a histogram of how long a storage-provider health
// check request takes, in seconds, partitioned by the "sp_id" label.
// The health checker observes it right after the HTTP request returns and
// before the error is inspected, so failed connection attempts are included
// in the distribution as well as successful ones.
// NOTE(review): no Buckets are configured here, so the Prometheus client's
// default bucket layout presumably applies — confirm it suits the expected
// latency range.
SPHealthCheckerTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "sp_health_checker_request_time",
Help: "Request duration in seconds.",
},
[]string{"sp_id"},
)
// SPHealthCheckerFailureCounter counts failed storage-provider health check
// requests, partitioned by the "sp_id" label. Despite the generic
// "request_counter" metric name, it is only incremented on failure (see Help).
// NOTE(review): in the health checker it is incremented on the branch taken
// when a response was received but the check did not succeed; whether
// connection errors are also counted is not visible from here — confirm
// against checkSPHealth.
SPHealthCheckerFailureCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "sp_health_checker_request_counter",
Help: "Request failure count.",
},
[]string{"sp_id"},
)
)

// workflow metrics items
Expand Down

0 comments on commit f3d5a1a

Please sign in to comment.