Skip to content

Commit 5a685d9

Browse files
authored
add more log for diagnose (#915) (#931)
* add more log for diagnose * fix * add more log for diagnose * add more log * address comment --------- Signed-off-by: crazycs520 <crazycs520@gmail.com>
1 parent 2cf75b1 commit 5a685d9

File tree

4 files changed

+158
-10
lines changed

4 files changed

+158
-10
lines changed

internal/locate/region_cache.go

+38
Original file line numberDiff line numberDiff line change
@@ -2392,6 +2392,24 @@ const (
23922392
tombstone
23932393
)
23942394

2395+
// String implements fmt.Stringer interface.
2396+
func (s resolveState) String() string {
2397+
switch s {
2398+
case unresolved:
2399+
return "unresolved"
2400+
case resolved:
2401+
return "resolved"
2402+
case needCheck:
2403+
return "needCheck"
2404+
case deleted:
2405+
return "deleted"
2406+
case tombstone:
2407+
return "tombstone"
2408+
default:
2409+
return fmt.Sprintf("unknown-%v", uint64(s))
2410+
}
2411+
}
2412+
23952413
// IsTiFlash returns true if the storeType is TiFlash
23962414
func (s *Store) IsTiFlash() bool {
23972415
return s.storeType == tikvrpc.TiFlash
@@ -2531,6 +2549,12 @@ func (s *Store) changeResolveStateTo(from, to resolveState) bool {
25312549
return false
25322550
}
25332551
if atomic.CompareAndSwapUint64(&s.state, uint64(from), uint64(to)) {
2552+
logutil.BgLogger().Info("change store resolve state",
2553+
zap.Uint64("store", s.storeID),
2554+
zap.String("addr", s.addr),
2555+
zap.String("from", from.String()),
2556+
zap.String("to", to.String()),
2557+
zap.String("liveness-state", s.getLivenessState().String()))
25342558
return true
25352559
}
25362560
}
@@ -2631,6 +2655,20 @@ const (
26312655
unknown
26322656
)
26332657

2658+
// String implements fmt.Stringer interface.
2659+
func (s livenessState) String() string {
2660+
switch s {
2661+
case unreachable:
2662+
return "unreachable"
2663+
case reachable:
2664+
return "reachable"
2665+
case unknown:
2666+
return "unknown"
2667+
default:
2668+
return fmt.Sprintf("unknown-%v", uint32(s))
2669+
}
2670+
}
2671+
26342672
func (s *Store) startHealthCheckLoopIfNeeded(c *RegionCache, liveness livenessState) {
26352673
// This mechanism doesn't support non-TiKV stores currently.
26362674
if s.storeType != tikvrpc.TiKV {

internal/locate/region_request.go

+86-10
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
package locate
3636

3737
import (
38+
"bytes"
3839
"context"
3940
"fmt"
4041
"math"
@@ -587,14 +588,15 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector
587588
}
588589
// If there is no candidate, fallback to the leader.
589590
if selector.targetIdx < 0 {
591+
leader := selector.replicas[state.leaderIdx]
592+
leaderInvalid := leader.isEpochStale() || (!state.option.leaderOnly && leader.isExhausted(1))
590593
if len(state.option.labels) > 0 {
591-
logutil.BgLogger().Warn(
592-
"unable to find stores with given labels",
593-
zap.Any("labels", state.option.labels),
594-
)
594+
logutil.Logger(bo.GetCtx()).Warn("unable to find stores with given labels",
595+
zap.Uint64("region", selector.region.GetID()),
596+
zap.Bool("leader-invalid", leaderInvalid),
597+
zap.Any("labels", state.option.labels))
595598
}
596-
leader := selector.replicas[state.leaderIdx]
597-
if leader.isEpochStale() || (!state.option.leaderOnly && leader.isExhausted(1)) {
599+
if leaderInvalid {
598600
metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc()
599601
selector.invalidateRegion()
600602
return nil, nil
@@ -1168,6 +1170,7 @@ func (s *RegionRequestSender) SendReqCtx(
11681170
}()
11691171
}
11701172

1173+
totalErrors := make(map[string]int)
11711174
for {
11721175
if retryTimes > 0 {
11731176
req.IsRetryRequest = true
@@ -1200,10 +1203,7 @@ func (s *RegionRequestSender) SendReqCtx(
12001203

12011204
// TODO: Change the returned error to something like "region missing in cache",
12021205
// and handle this error like EpochNotMatch, which means to re-split the request and retry.
1203-
logutil.Logger(bo.GetCtx()).Debug(
1204-
"throwing pseudo region error due to region not found in cache",
1205-
zap.Stringer("region", &regionID),
1206-
)
1206+
s.logSendReqError(bo, "throwing pseudo region error due to no replica available", regionID, retryTimes, req, totalErrors)
12071207
resp, err = tikvrpc.GenRegionErrorResp(req, &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}})
12081208
return resp, nil, retryTimes, err
12091209
}
@@ -1229,6 +1229,8 @@ func (s *RegionRequestSender) SendReqCtx(
12291229
var retry bool
12301230
resp, retry, err = s.sendReqToRegion(bo, rpcCtx, req, timeout)
12311231
if err != nil {
1232+
msg := fmt.Sprintf("send request failed, err: %v", err.Error())
1233+
s.logSendReqError(bo, msg, regionID, retryTimes, req, totalErrors)
12321234
return nil, nil, retryTimes, err
12331235
}
12341236

@@ -1260,14 +1262,19 @@ func (s *RegionRequestSender) SendReqCtx(
12601262
return nil, nil, retryTimes, err
12611263
}
12621264
if regionErr != nil {
1265+
regionErrLabel := regionErrorToLabel(regionErr)
1266+
totalErrors[regionErrLabel]++
12631267
retry, err = s.onRegionError(bo, rpcCtx, req, regionErr)
12641268
if err != nil {
1269+
msg := fmt.Sprintf("send request on region error failed, err: %v", err.Error())
1270+
s.logSendReqError(bo, msg, regionID, retryTimes, req, totalErrors)
12651271
return nil, nil, retryTimes, err
12661272
}
12671273
if retry {
12681274
retryTimes++
12691275
continue
12701276
}
1277+
s.logSendReqError(bo, "send request meet region error without retry", regionID, retryTimes, req, totalErrors)
12711278
} else {
12721279
if s.replicaSelector != nil {
12731280
s.replicaSelector.onSendSuccess()
@@ -1280,6 +1287,75 @@ func (s *RegionRequestSender) SendReqCtx(
12801287
}
12811288
}
12821289

1290+
func (s *RegionRequestSender) logSendReqError(bo *retry.Backoffer, msg string, regionID RegionVerID, retryTimes int, req *tikvrpc.Request, totalErrors map[string]int) {
1291+
var replicaStatus []string
1292+
replicaSelectorState := "nil"
1293+
cacheRegionIsValid := "unknown"
1294+
if s.replicaSelector != nil {
1295+
switch s.replicaSelector.state.(type) {
1296+
case *accessKnownLeader:
1297+
replicaSelectorState = "accessKnownLeader"
1298+
case *accessFollower:
1299+
replicaSelectorState = "accessFollower"
1300+
case *accessByKnownProxy:
1301+
replicaSelectorState = "accessByKnownProxy"
1302+
case *tryFollower:
1303+
replicaSelectorState = "tryFollower"
1304+
case *tryNewProxy:
1305+
replicaSelectorState = "tryNewProxy"
1306+
case *invalidLeader:
1307+
replicaSelectorState = "invalidLeader"
1308+
case *invalidStore:
1309+
replicaSelectorState = "invalidStore"
1310+
case *stateBase:
1311+
replicaSelectorState = "stateBase"
1312+
case nil:
1313+
replicaSelectorState = "nil"
1314+
}
1315+
if s.replicaSelector.region != nil {
1316+
if s.replicaSelector.region.isValid() {
1317+
cacheRegionIsValid = "true"
1318+
} else {
1319+
cacheRegionIsValid = "false"
1320+
}
1321+
}
1322+
for _, replica := range s.replicaSelector.replicas {
1323+
replicaStatus = append(replicaStatus, fmt.Sprintf("peer: %v, store: %v, isEpochStale: %v, attempts: %v, replica-epoch: %v, store-epoch: %v, store-state: %v, store-liveness-state: %v",
1324+
replica.peer.GetId(),
1325+
replica.store.storeID,
1326+
replica.isEpochStale(),
1327+
replica.attempts,
1328+
replica.epoch,
1329+
atomic.LoadUint32(&replica.store.epoch),
1330+
replica.store.getResolveState(),
1331+
replica.store.getLivenessState(),
1332+
))
1333+
}
1334+
}
1335+
var totalErrorStr bytes.Buffer
1336+
for err, cnt := range totalErrors {
1337+
if totalErrorStr.Len() > 0 {
1338+
totalErrorStr.WriteString(", ")
1339+
}
1340+
totalErrorStr.WriteString(err)
1341+
totalErrorStr.WriteString(":")
1342+
totalErrorStr.WriteString(strconv.Itoa(cnt))
1343+
}
1344+
logutil.Logger(bo.GetCtx()).Info(msg,
1345+
zap.Uint64("req-ts", req.GetStartTS()),
1346+
zap.String("req-type", req.Type.String()),
1347+
zap.String("region", regionID.String()),
1348+
zap.String("region-is-valid", cacheRegionIsValid),
1349+
zap.Int("retry-times", retryTimes),
1350+
zap.String("replica-read-type", req.ReplicaReadType.String()),
1351+
zap.String("replica-selector-state", replicaSelectorState),
1352+
zap.Bool("stale-read", req.StaleRead),
1353+
zap.String("replica-status", strings.Join(replicaStatus, "; ")),
1354+
zap.Int("total-backoff-ms", bo.GetTotalSleep()),
1355+
zap.Int("total-backoff-times", bo.GetTotalBackoffTimes()),
1356+
zap.String("total-region-errors", totalErrorStr.String()))
1357+
}
1358+
12831359
// RPCCancellerCtxKey is context key attach rpc send cancelFunc collector to ctx.
12841360
type RPCCancellerCtxKey struct{}
12851361

internal/retry/backoff.go

+14
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@
3535
package retry
3636

3737
import (
38+
"bytes"
3839
"context"
3940
"fmt"
4041
"math"
42+
"strconv"
4143
"strings"
4244
"sync/atomic"
4345
"time"
@@ -150,6 +152,18 @@ func (b *Backoffer) BackoffWithCfgAndMaxSleep(cfg *Config, maxSleepMs int, err e
150152
errMsg += "\n" + err.Error()
151153
}
152154
}
155+
var backoffDetail bytes.Buffer
156+
totalTimes := 0
157+
for name, times := range b.backoffTimes {
158+
totalTimes += times
159+
if backoffDetail.Len() > 0 {
160+
backoffDetail.WriteString(", ")
161+
}
162+
backoffDetail.WriteString(name)
163+
backoffDetail.WriteString(":")
164+
backoffDetail.WriteString(strconv.Itoa(times))
165+
}
166+
errMsg += fmt.Sprintf("\ntotal-backoff-times: %v, backoff-detail: %v", totalTimes, backoffDetail.String())
153167
returnedErr := err
154168
if longestSleepCfg != nil {
155169
errMsg += fmt.Sprintf("\nlongest sleep type: %s, time: %dms", longestSleepCfg.String(), longestSleepTime)

kv/store_vars.go

+20
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
package kv
3636

3737
import (
38+
"fmt"
39+
3840
"go.uber.org/atomic"
3941
)
4042

@@ -72,3 +74,21 @@ const (
7274
func (r ReplicaReadType) IsFollowerRead() bool {
7375
return r != ReplicaReadLeader
7476
}
77+
78+
// String implements fmt.Stringer interface.
79+
func (r ReplicaReadType) String() string {
80+
switch r {
81+
case ReplicaReadLeader:
82+
return "leader"
83+
case ReplicaReadFollower:
84+
return "follower"
85+
case ReplicaReadMixed:
86+
return "mixed"
87+
case ReplicaReadLearner:
88+
return "learner"
89+
case ReplicaReadPreferLeader:
90+
return "prefer-leader"
91+
default:
92+
return fmt.Sprintf("unknown-%v", byte(r))
93+
}
94+
}

0 commit comments

Comments
 (0)