35
35
package locate
36
36
37
37
import (
38
+ "bytes"
38
39
"context"
39
40
"fmt"
40
41
"math"
@@ -587,14 +588,15 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector
587
588
}
588
589
// If there is no candidate, fallback to the leader.
589
590
if selector .targetIdx < 0 {
591
+ leader := selector .replicas [state .leaderIdx ]
592
+ leaderInvalid := leader .isEpochStale () || (! state .option .leaderOnly && leader .isExhausted (1 ))
590
593
if len (state .option .labels ) > 0 {
591
- logutil .BgLogger () .Warn (
592
- "unable to find stores with given labels" ,
593
- zap .Any ( "labels " , state . option . labels ),
594
- )
594
+ logutil .Logger ( bo . GetCtx ()) .Warn ("unable to find stores with given labels" ,
595
+ zap . Uint64 ( "region" , selector . region . GetID ()) ,
596
+ zap .Bool ( "leader-invalid " , leaderInvalid ),
597
+ zap . Any ( "labels" , state . option . labels ) )
595
598
}
596
- leader := selector .replicas [state .leaderIdx ]
597
- if leader .isEpochStale () || (! state .option .leaderOnly && leader .isExhausted (1 )) {
599
+ if leaderInvalid {
598
600
metrics .TiKVReplicaSelectorFailureCounter .WithLabelValues ("exhausted" ).Inc ()
599
601
selector .invalidateRegion ()
600
602
return nil , nil
@@ -1168,6 +1170,7 @@ func (s *RegionRequestSender) SendReqCtx(
1168
1170
}()
1169
1171
}
1170
1172
1173
+ totalErrors := make (map [string ]int )
1171
1174
for {
1172
1175
if retryTimes > 0 {
1173
1176
req .IsRetryRequest = true
@@ -1200,10 +1203,7 @@ func (s *RegionRequestSender) SendReqCtx(
1200
1203
1201
1204
// TODO: Change the returned error to something like "region missing in cache",
1202
1205
// and handle this error like EpochNotMatch, which means to re-split the request and retry.
1203
- logutil .Logger (bo .GetCtx ()).Debug (
1204
- "throwing pseudo region error due to region not found in cache" ,
1205
- zap .Stringer ("region" , & regionID ),
1206
- )
1206
+ s .logSendReqError (bo , "throwing pseudo region error due to no replica available" , regionID , retryTimes , req , totalErrors )
1207
1207
resp , err = tikvrpc .GenRegionErrorResp (req , & errorpb.Error {EpochNotMatch : & errorpb.EpochNotMatch {}})
1208
1208
return resp , nil , retryTimes , err
1209
1209
}
@@ -1229,6 +1229,8 @@ func (s *RegionRequestSender) SendReqCtx(
1229
1229
var retry bool
1230
1230
resp , retry , err = s .sendReqToRegion (bo , rpcCtx , req , timeout )
1231
1231
if err != nil {
1232
+ msg := fmt .Sprintf ("send request failed, err: %v" , err .Error ())
1233
+ s .logSendReqError (bo , msg , regionID , retryTimes , req , totalErrors )
1232
1234
return nil , nil , retryTimes , err
1233
1235
}
1234
1236
@@ -1260,14 +1262,19 @@ func (s *RegionRequestSender) SendReqCtx(
1260
1262
return nil , nil , retryTimes , err
1261
1263
}
1262
1264
if regionErr != nil {
1265
+ regionErrLabel := regionErrorToLabel (regionErr )
1266
+ totalErrors [regionErrLabel ]++
1263
1267
retry , err = s .onRegionError (bo , rpcCtx , req , regionErr )
1264
1268
if err != nil {
1269
+ msg := fmt .Sprintf ("send request on region error failed, err: %v" , err .Error ())
1270
+ s .logSendReqError (bo , msg , regionID , retryTimes , req , totalErrors )
1265
1271
return nil , nil , retryTimes , err
1266
1272
}
1267
1273
if retry {
1268
1274
retryTimes ++
1269
1275
continue
1270
1276
}
1277
+ s .logSendReqError (bo , "send request meet region error without retry" , regionID , retryTimes , req , totalErrors )
1271
1278
} else {
1272
1279
if s .replicaSelector != nil {
1273
1280
s .replicaSelector .onSendSuccess ()
@@ -1280,6 +1287,75 @@ func (s *RegionRequestSender) SendReqCtx(
1280
1287
}
1281
1288
}
1282
1289
1290
+ func (s * RegionRequestSender ) logSendReqError (bo * retry.Backoffer , msg string , regionID RegionVerID , retryTimes int , req * tikvrpc.Request , totalErrors map [string ]int ) {
1291
+ var replicaStatus []string
1292
+ replicaSelectorState := "nil"
1293
+ cacheRegionIsValid := "unknown"
1294
+ if s .replicaSelector != nil {
1295
+ switch s .replicaSelector .state .(type ) {
1296
+ case * accessKnownLeader :
1297
+ replicaSelectorState = "accessKnownLeader"
1298
+ case * accessFollower :
1299
+ replicaSelectorState = "accessFollower"
1300
+ case * accessByKnownProxy :
1301
+ replicaSelectorState = "accessByKnownProxy"
1302
+ case * tryFollower :
1303
+ replicaSelectorState = "tryFollower"
1304
+ case * tryNewProxy :
1305
+ replicaSelectorState = "tryNewProxy"
1306
+ case * invalidLeader :
1307
+ replicaSelectorState = "invalidLeader"
1308
+ case * invalidStore :
1309
+ replicaSelectorState = "invalidStore"
1310
+ case * stateBase :
1311
+ replicaSelectorState = "stateBase"
1312
+ case nil :
1313
+ replicaSelectorState = "nil"
1314
+ }
1315
+ if s .replicaSelector .region != nil {
1316
+ if s .replicaSelector .region .isValid () {
1317
+ cacheRegionIsValid = "true"
1318
+ } else {
1319
+ cacheRegionIsValid = "false"
1320
+ }
1321
+ }
1322
+ for _ , replica := range s .replicaSelector .replicas {
1323
+ replicaStatus = append (replicaStatus , fmt .Sprintf ("peer: %v, store: %v, isEpochStale: %v, attempts: %v, replica-epoch: %v, store-epoch: %v, store-state: %v, store-liveness-state: %v" ,
1324
+ replica .peer .GetId (),
1325
+ replica .store .storeID ,
1326
+ replica .isEpochStale (),
1327
+ replica .attempts ,
1328
+ replica .epoch ,
1329
+ atomic .LoadUint32 (& replica .store .epoch ),
1330
+ replica .store .getResolveState (),
1331
+ replica .store .getLivenessState (),
1332
+ ))
1333
+ }
1334
+ }
1335
+ var totalErrorStr bytes.Buffer
1336
+ for err , cnt := range totalErrors {
1337
+ if totalErrorStr .Len () > 0 {
1338
+ totalErrorStr .WriteString (", " )
1339
+ }
1340
+ totalErrorStr .WriteString (err )
1341
+ totalErrorStr .WriteString (":" )
1342
+ totalErrorStr .WriteString (strconv .Itoa (cnt ))
1343
+ }
1344
+ logutil .Logger (bo .GetCtx ()).Info (msg ,
1345
+ zap .Uint64 ("req-ts" , req .GetStartTS ()),
1346
+ zap .String ("req-type" , req .Type .String ()),
1347
+ zap .String ("region" , regionID .String ()),
1348
+ zap .String ("region-is-valid" , cacheRegionIsValid ),
1349
+ zap .Int ("retry-times" , retryTimes ),
1350
+ zap .String ("replica-read-type" , req .ReplicaReadType .String ()),
1351
+ zap .String ("replica-selector-state" , replicaSelectorState ),
1352
+ zap .Bool ("stale-read" , req .StaleRead ),
1353
+ zap .String ("replica-status" , strings .Join (replicaStatus , "; " )),
1354
+ zap .Int ("total-backoff-ms" , bo .GetTotalSleep ()),
1355
+ zap .Int ("total-backoff-times" , bo .GetTotalBackoffTimes ()),
1356
+ zap .String ("total-region-errors" , totalErrorStr .String ()))
1357
+ }
1358
+
1283
1359
// RPCCancellerCtxKey is context key attach rpc send cancelFunc collector to ctx.
1284
1360
type RPCCancellerCtxKey struct {}
1285
1361
0 commit comments