From 42e13fffd8a1700d394bba915411bcaccca96220 Mon Sep 17 00:00:00 2001 From: yisaer Date: Mon, 12 Apr 2021 18:12:36 +0800 Subject: [PATCH 1/6] add metrcis Signed-off-by: yisaer --- server/schedule/filter/filters.go | 2 ++ server/schedule/metrics.go | 18 +++++++++++++++++ server/schedule/region_scatterer.go | 31 +++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/server/schedule/filter/filters.go b/server/schedule/filter/filters.go index e8cefee7206..92b88587023 100644 --- a/server/schedule/filter/filters.go +++ b/server/schedule/filter/filters.go @@ -686,6 +686,8 @@ const ( EngineKey = "engine" // EngineTiFlash is the tiflash value of the engine label. EngineTiFlash = "tiflash" + // EngineTiKV indicates the tikv engine in metrics + EngineTiKV = "tikv" ) var allSpecialUses = []string{SpecialUseHotRegion, SpecialUseReserved} diff --git a/server/schedule/metrics.go b/server/schedule/metrics.go index 80205d8068d..0feddf6a159 100644 --- a/server/schedule/metrics.go +++ b/server/schedule/metrics.go @@ -73,6 +73,22 @@ var ( Name: "store_limit_cost", Help: "limit rate cost of store.", }, []string{"store", "limit_type"}) + + scatterCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "schedule", + Name: "scatter_operators_count", + Help: "Counter of region scatter operators.", + }, []string{"type", "event"}) + + scatterDistributionCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "schedule", + Name: "scatter_distribution", + Help: "Counter of the distribution in scatter.", + }, []string{"store", "is_leader", "engine"}) ) func init() { @@ -83,4 +99,6 @@ func init() { prometheus.MustRegister(storeLimitRateGauge) prometheus.MustRegister(storeLimitCostCounter) prometheus.MustRegister(operatorWaitCounter) + prometheus.MustRegister(scatterCounter) + prometheus.MustRegister(scatterDistributionCounter) } diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 89e8261d213..010287bd4a5 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -137,6 +137,7 @@ const maxRetryLimit = 30 func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { regions := r.cluster.ScanRegions(startKey, endKey, -1) if len(regions) < 1 { + scatterCounter.WithLabelValues("skip", "empty_region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regions)) @@ -155,6 +156,7 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s // ScatterRegionsByID directly scatter regions by ScatterRegions func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { if len(regionsID) < 1 { + scatterCounter.WithLabelValues("skip", "empty_region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regionsID)) @@ -162,6 +164,8 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r for _, id := range regionsID { region := r.cluster.GetRegion(id) if region == nil { + scatterCounter.WithLabelValues("skip", "no_region").Inc() + log.Warn("failed to find region during scatter", zap.Uint64("region-id", id)) failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id)) continue } @@ -187,6 +191,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r // and the value of the failures indicates the failure error. func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) ([]*operator.Operator, error) { if len(regions) < 1 { + scatterCounter.WithLabelValues("skip", "empty_region").Inc() return nil, errors.New("empty region") } if retryLimit > maxRetryLimit { @@ -226,14 +231,20 @@ func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, fa func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) { if !opt.IsRegionReplicated(r.cluster, region) { r.cluster.AddSuspectRegions(region.GetID()) + scatterCounter.WithLabelValues("skip", "not_replicated").Inc() + log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is not fully replicated", region.GetID()) } if region.GetLeader() == nil { + scatterCounter.WithLabelValues("skip", "no_leader").Inc() + log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d has no leader", region.GetID()) } if r.cluster.IsRegionHot(region) { + scatterCounter.WithLabelValues("skip", "hot").Inc() + log.Warn("region too hot during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is hot", region.GetID()) } @@ -286,6 +297,7 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * op, err := operator.CreateScatterRegionOperator("scatter-region", r.cluster, region, targetPeers, targetLeader) if err != nil { + scatterCounter.WithLabelValues("fail", "").Inc() for _, peer := range region.GetPeers() { targetPeers[peer.GetStoreId()] = peer } @@ -293,8 +305,11 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * log.Debug("fail to create scatter region operator", errs.ZapError(err)) return nil } - r.Put(targetPeers, targetLeader, group) - op.SetPriorityLevel(core.HighPriority) + if op != nil { + scatterCounter.WithLabelValues("success", "").Inc() + r.Put(targetPeers, targetLeader, group) + op.SetPriorityLevel(core.HighPriority) + } return op } @@ -375,10 +390,22 @@ func (r *RegionScatterer) Put(peers map[uint64]*metapb.Peer, leaderStoreID uint6 store := r.cluster.GetStore(storeID) if ordinaryFilter.Target(r.cluster.GetOpts(), store) { r.ordinaryEngine.selectedPeer.Put(storeID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", storeID), + fmt.Sprintf("%v", false), + filter.EngineTiKV).Inc() } else { engine := store.GetLabelValue(filter.EngineKey) r.specialEngines[engine].selectedPeer.Put(storeID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", storeID), + fmt.Sprintf("%v", false), + engine).Inc() } } r.ordinaryEngine.selectedLeader.Put(leaderStoreID, group) + scatterDistributionCounter.WithLabelValues( + fmt.Sprintf("%v", leaderStoreID), + fmt.Sprintf("%v", true), + filter.EngineTiKV).Inc() } From f89a6473878b95da2146bf23a795fb958b0ac159 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Tue, 13 Apr 2021 13:17:31 +0800 Subject: [PATCH 2/6] add metrics json Signed-off-by: Song Gao --- metrics/grafana/pd.json | 214 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 6f423e1de39..26f869a8ef6 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -6853,6 +6853,220 @@ "title": "Scheduler", "type": "row" }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 1437, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "tidb-cluster", + "description": "", + "fill": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 1433, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"skip\"}[1m])) by (event)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "skip-{{event}}", + "refId": "A" + }, + { + "expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"fail\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "fail", + "refId": "B" + }, + { + "expr": "delta(pd_schedule_scatter_operators_count{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"success\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "success", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "scatter operator event", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "tidb-cluster", + "fill": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 1435, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"false\"}[1m])) by (store)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "peer-{{store}}", + "refId": "A" + }, + { + "expr": "sum(delta(pd_schedule_scatter_distribution{tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",engine=\"tikv\",is_leader=\"true\"}[1m])) by (store)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "leader-{{store}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "scatter store selection", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "opm", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Scatter and Splitter", + "type": "row" + }, { "collapsed": true, "gridPos": { From 9b672f9ffad4ac6c5a0a017d63ed9c03f49036eb Mon Sep 17 00:00:00 2001 From: yisaer Date: Tue, 13 Apr 2021 15:29:09 +0800 Subject: [PATCH 3/6] address the comment Signed-off-by: yisaer --- server/schedule/region_scatterer.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 010287bd4a5..1efc3fc5f28 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -137,7 +137,7 @@ const maxRetryLimit = 30 func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { regions := r.cluster.ScanRegions(startKey, endKey, -1) if len(regions) < 1 { - scatterCounter.WithLabelValues("skip", "empty_region").Inc() + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regions)) @@ -156,7 +156,7 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s // ScatterRegionsByID directly scatter regions by ScatterRegions func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) ([]*operator.Operator, map[uint64]error, error) { if len(regionsID) < 1 { - scatterCounter.WithLabelValues("skip", "empty_region").Inc() + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regionsID)) @@ -164,7 +164,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r for _, id := range regionsID { region := r.cluster.GetRegion(id) if region == nil { - scatterCounter.WithLabelValues("skip", "no_region").Inc() + scatterCounter.WithLabelValues("skip", "no-region").Inc() log.Warn("failed to find region during scatter", zap.Uint64("region-id", id)) failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id)) continue @@ -191,7 +191,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r // and the value of the failures indicates the failure error. func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) ([]*operator.Operator, error) { if len(regions) < 1 { - scatterCounter.WithLabelValues("skip", "empty_region").Inc() + scatterCounter.WithLabelValues("skip", "empty-region").Inc() return nil, errors.New("empty region") } if retryLimit > maxRetryLimit { @@ -231,13 +231,13 @@ func (r *RegionScatterer) ScatterRegions(regions map[uint64]*core.RegionInfo, fa func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) { if !opt.IsRegionReplicated(r.cluster, region) { r.cluster.AddSuspectRegions(region.GetID()) - scatterCounter.WithLabelValues("skip", "not_replicated").Inc() + scatterCounter.WithLabelValues("skip", "not-replicated").Inc() log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is not fully replicated", region.GetID()) } if region.GetLeader() == nil { - scatterCounter.WithLabelValues("skip", "no_leader").Inc() + scatterCounter.WithLabelValues("skip", "no-leader").Inc() log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d has no leader", region.GetID()) } From 8137f98b7493754a1c03ce2634f8acbcd3090cdb Mon Sep 17 00:00:00 2001 From: yisaer Date: Tue, 13 Apr 2021 15:31:55 +0800 Subject: [PATCH 4/6] address the comment Signed-off-by: yisaer --- server/schedule/region_scatterer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 1efc3fc5f28..6c29bf7cf8e 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -320,7 +320,7 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI return nil } filters := []filter.Filter{ - filter.NewExcludedFilter("scatter-region", nil, selectedStores), + filter.NewExcludedFilter(r.name, nil, selectedStores), } scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore) filters = append(filters, context.filters...) From 521c3f99995828da9affd6416e154e9a5ad5bd27 Mon Sep 17 00:00:00 2001 From: yisaer Date: Tue, 13 Apr 2021 19:05:13 +0800 Subject: [PATCH 5/6] address the comment Signed-off-by: yisaer --- server/schedule/filter/filters.go | 26 ++++++++++++++++++++++++++ server/schedule/region_scatterer.go | 3 ++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/server/schedule/filter/filters.go b/server/schedule/filter/filters.go index 92b88587023..f4058c17fcd 100644 --- a/server/schedule/filter/filters.go +++ b/server/schedule/filter/filters.go @@ -759,6 +759,32 @@ func (f *isolationFilter) Target(opt *config.PersistOptions, store *core.StoreIn return true } +// NewStoreBusyFilter creates storeBusyFilter +func NewStoreBusyFilter(scope string) *storeBusyFilter { + return &storeBusyFilter{scope: scope} +} + +// StoreBusyFilter +type storeBusyFilter struct { + scope string +} + +func (f *storeBusyFilter) Scope() string { + return f.scope +} + +func (f *storeBusyFilter) Type() string { + return "store-busy-filter" +} + +func (f *storeBusyFilter) Source(opt *config.PersistOptions, store *core.StoreInfo) bool { + return true +} + +func (f *storeBusyFilter) Target(opt *config.PersistOptions, store *core.StoreInfo) bool { + return !store.IsBusy() +} + // createRegionForRuleFit is used to create a clone region with RegionCreateOptions which is only used for // FitRegion in filter func createRegionForRuleFit(startKey, endKey []byte, diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 6c29bf7cf8e..28ff12f5ec9 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -325,10 +325,11 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore) filters = append(filters, context.filters...) filters = append(filters, scoreGuard) + filters = append(filters, filter.NewStoreBusyFilter(r.name)) stores := r.cluster.GetStores() candidates := make([]uint64, 0) for _, store := range stores { - if filter.Target(r.cluster.GetOpts(), store, filters) && !store.IsBusy() { + if filter.Target(r.cluster.GetOpts(), store, filters) { candidates = append(candidates, store.GetID()) } } From 147cc104e902224392a00c229fbb3d630c820459 Mon Sep 17 00:00:00 2001 From: yisaer Date: Wed, 14 Apr 2021 11:00:31 +0800 Subject: [PATCH 6/6] address the comment Signed-off-by: yisaer --- server/schedule/filter/filters.go | 28 +--------------------------- server/schedule/region_scatterer.go | 1 - 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/server/schedule/filter/filters.go b/server/schedule/filter/filters.go index f4058c17fcd..62cf42315f4 100644 --- a/server/schedule/filter/filters.go +++ b/server/schedule/filter/filters.go @@ -389,7 +389,7 @@ func (f *StoreStateFilter) anyConditionMatch(typ int, opt *config.PersistOptions funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy, f.exceedAddLimit, f.tooManySnapshots, f.tooManyPendingPeers} case scatterRegionTarget: - funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected} + funcs = []conditionFunc{f.isTombstone, f.isOffline, f.isDown, f.isDisconnected, f.isBusy} } for _, cf := range funcs { if cf(opt, store) { @@ -759,32 +759,6 @@ func (f *isolationFilter) Target(opt *config.PersistOptions, store *core.StoreIn return true } -// NewStoreBusyFilter creates storeBusyFilter -func NewStoreBusyFilter(scope string) *storeBusyFilter { - return &storeBusyFilter{scope: scope} -} - -// StoreBusyFilter -type storeBusyFilter struct { - scope string -} - -func (f *storeBusyFilter) Scope() string { - return f.scope -} - -func (f *storeBusyFilter) Type() string { - return "store-busy-filter" -} - -func (f *storeBusyFilter) Source(opt *config.PersistOptions, store *core.StoreInfo) bool { - return true -} - -func (f *storeBusyFilter) Target(opt *config.PersistOptions, store *core.StoreInfo) bool { - return !store.IsBusy() -} - // createRegionForRuleFit is used to create a clone region with RegionCreateOptions which is only used for // FitRegion in filter func createRegionForRuleFit(startKey, endKey []byte, diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 28ff12f5ec9..a09202844f1 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -325,7 +325,6 @@ func (r *RegionScatterer) selectCandidates(region *core.RegionInfo, sourceStoreI scoreGuard := filter.NewPlacementSafeguard(r.name, r.cluster, region, sourceStore) filters = append(filters, context.filters...) filters = append(filters, scoreGuard) - filters = append(filters, filter.NewStoreBusyFilter(r.name)) stores := r.cluster.GetStores() candidates := make([]uint64, 0) for _, store := range stores {