From 9d580d03e62e0dad3627de3f050a4a32f8f3e207 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 28 May 2024 13:40:20 +0800 Subject: [PATCH 01/47] *: batch process peer task (#8213) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/cluster/cluster.go | 7 +- pkg/core/peer.go | 31 --------- pkg/mcs/scheduling/server/cluster.go | 4 +- pkg/mock/mockcluster/mockcluster.go | 26 +------ pkg/statistics/hot_cache.go | 9 +-- pkg/statistics/hot_cache_task.go | 43 +++++++++--- pkg/statistics/hot_peer_cache.go | 87 ++++++++++++------------ pkg/statistics/hot_peer_cache_test.go | 98 +++++++++++---------------- pkg/statistics/utils/kind.go | 10 +-- server/cluster/cluster.go | 3 +- server/cluster/cluster_test.go | 32 +++++++++ tools/pd-ctl/tests/hot/hot_test.go | 3 +- 12 files changed, 165 insertions(+), 188 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 8bd2616f41f7..ab97c7899db9 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -35,12 +35,7 @@ type Cluster interface { func HandleStatsAsync(c Cluster, region *core.RegionInfo) { c.GetHotStat().CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) c.GetHotStat().CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - reportInterval := region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) - c.GetHotStat().CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) - } + c.GetHotStat().CheckWriteAsync(statistics.NewCheckWritePeerTask(region)) c.GetCoordinator().GetSchedulersController().CheckTransferWitnessLeader(region) } diff --git a/pkg/core/peer.go b/pkg/core/peer.go index 659886e6d394..1f888ba58eb7 100644 --- a/pkg/core/peer.go +++ b/pkg/core/peer.go @@ -77,34 +77,3 @@ func CountInJointState(peers ...*metapb.Peer) int { } return count } - -// PeerInfo provides peer information -type PeerInfo struct { - *metapb.Peer - loads []float64 - interval uint64 -} - -// NewPeerInfo creates PeerInfo -func NewPeerInfo(meta *metapb.Peer, loads []float64, interval uint64) *PeerInfo { - return &PeerInfo{ - Peer: meta, - loads: loads, - interval: interval, - } -} - -// GetLoads provides loads -func (p *PeerInfo) GetLoads() []float64 { - return p.loads -} - -// GetPeerID provides peer id -func (p *PeerInfo) GetPeerID() uint64 { - return p.GetId() -} - -// GetInterval returns reporting interval -func (p *PeerInfo) GetInterval() uint64 { - return p.interval -} diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index c6c365b03ad8..d711ab2d4f6e 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -9,6 +9,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" @@ -442,8 +443,7 @@ func (c *Cluster) HandleStoreHeartbeat(heartbeat *schedulingpb.StoreHeartbeatReq utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } // Here we will compare the reported regions with the previous hot peers 
to decide if it is still hot. diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index e5b3e39a5020..3f9710c48fd2 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -896,14 +896,7 @@ func (mc *Cluster) CheckRegionRead(region *core.RegionInfo) []*statistics.HotPee items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionWrite checks region write info with all peers @@ -913,14 +906,7 @@ func (mc *Cluster) CheckRegionWrite(region *core.RegionInfo) []*statistics.HotPe items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckWritePeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckWritePeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionLeaderRead checks region read info with leader peer @@ -930,13 +916,7 @@ func (mc *Cluster) CheckRegionLeaderRead(region *core.RegionInfo) []*statistics. items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - peer := region.GetLeader() - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, []*metapb.Peer{region.GetLeader()}, region.GetLoads(), interval)...) } // ObserveRegionsStats records the current stores stats from region stats. diff --git a/pkg/statistics/hot_cache.go b/pkg/statistics/hot_cache.go index 799fb240d108..26548c8b47eb 100644 --- a/pkg/statistics/hot_cache.go +++ b/pkg/statistics/hot_cache.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/smallnest/chanx" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/statistics/utils" @@ -172,14 +173,14 @@ func (w *HotCache) Update(item *HotPeerStat, kind utils.RWType) { // CheckWritePeerSync checks the write status, returns update items. // This is used for mockcluster, for test purpose. -func (w *HotCache) CheckWritePeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.writeCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckWritePeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.writeCache.checkPeerFlow(region, peers, loads, interval) } // CheckReadPeerSync checks the read status, returns update items. // This is used for mockcluster, for test purpose. 
-func (w *HotCache) CheckReadPeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.readCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckReadPeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.readCache.checkPeerFlow(region, peers, loads, interval) } // ExpiredReadItems returns the read items which are already expired. diff --git a/pkg/statistics/hot_cache_task.go b/pkg/statistics/hot_cache_task.go index fa224b522ff1..01731f3fe4d5 100644 --- a/pkg/statistics/hot_cache_task.go +++ b/pkg/statistics/hot_cache_task.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" ) @@ -25,22 +26,46 @@ type FlowItemTask interface { runTask(cache *hotPeerCache) } -type checkPeerTask struct { - peerInfo *core.PeerInfo +type checkReadPeerTask struct { regionInfo *core.RegionInfo + peers []*metapb.Peer + loads []float64 + interval uint64 } -// NewCheckPeerTask creates task to update peerInfo -func NewCheckPeerTask(peerInfo *core.PeerInfo, regionInfo *core.RegionInfo) FlowItemTask { - return &checkPeerTask{ - peerInfo: peerInfo, +// NewCheckReadPeerTask creates task to update peerInfo +func NewCheckReadPeerTask(regionInfo *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) FlowItemTask { + return &checkReadPeerTask{ regionInfo: regionInfo, + peers: peers, + loads: loads, + interval: interval, } } -func (t *checkPeerTask) runTask(cache *hotPeerCache) { - stat := cache.checkPeerFlow(t.peerInfo, t.regionInfo) - if stat != nil { +func (t *checkReadPeerTask) runTask(cache *hotPeerCache) { + stats := cache.checkPeerFlow(t.regionInfo, t.peers, t.loads, t.interval) + for _, stat := range stats { + cache.updateStat(stat) + } +} + +type checkWritePeerTask struct { + region *core.RegionInfo +} + +// NewCheckWritePeerTask creates task to update peerInfo +func NewCheckWritePeerTask(region *core.RegionInfo) FlowItemTask { + return &checkWritePeerTask{ + region: region, + } +} + +func (t *checkWritePeerTask) runTask(cache *hotPeerCache) { + reportInterval := t.region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + stats := cache.checkPeerFlow(t.region, t.region.GetPeers(), t.region.GetWriteLoads(), interval) + for _, stat := range stats { cache.updateStat(stat) } } diff --git a/pkg/statistics/hot_peer_cache.go b/pkg/statistics/hot_peer_cache.go index cd27dcad4c8d..3a3d3519bd96 100644 --- a/pkg/statistics/hot_peer_cache.go +++ b/pkg/statistics/hot_peer_cache.go @@ -174,58 +174,61 @@ func (f *hotPeerCache) collectExpiredItems(region *core.RegionInfo) []*HotPeerSt // checkPeerFlow checks the flow information of a peer. // Notice: checkPeerFlow couldn't be used concurrently. // checkPeerFlow will update oldItem's rollingLoads into newItem, thus we should use write lock here. 
-func (f *hotPeerCache) checkPeerFlow(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - interval := peer.GetInterval() +func (f *hotPeerCache) checkPeerFlow(region *core.RegionInfo, peers []*metapb.Peer, deltaLoads []float64, interval uint64) []*HotPeerStat { if Denoising && interval < HotRegionReportMinInterval { // for test or simulator purpose return nil } - storeID := peer.GetStoreId() - deltaLoads := peer.GetLoads() + f.collectPeerMetrics(deltaLoads, interval) // update metrics regionID := region.GetID() - oldItem := f.getOldHotPeerStat(regionID, storeID) - - // check whether the peer is allowed to be inherited - source := utils.Direct - if oldItem == nil { - for _, storeID := range f.getAllStoreIDs(region) { - oldItem = f.getOldHotPeerStat(regionID, storeID) - if oldItem != nil && oldItem.allowInherited { - source = utils.Inherit - break + + regionPeers := region.GetPeers() + stats := make([]*HotPeerStat, 0, len(peers)) + for _, peer := range peers { + storeID := peer.GetStoreId() + oldItem := f.getOldHotPeerStat(regionID, storeID) + + // check whether the peer is allowed to be inherited + source := utils.Direct + if oldItem == nil { + for _, storeID := range f.getAllStoreIDs(region) { + oldItem = f.getOldHotPeerStat(regionID, storeID) + if oldItem != nil && oldItem.allowInherited { + source = utils.Inherit + break + } } } - } - - // check new item whether is hot - if oldItem == nil { - regionStats := f.kind.RegionStats() - thresholds := f.calcHotThresholds(storeID) - isHot := slice.AnyOf(regionStats, func(i int) bool { - return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] - }) - if !isHot { - return nil + // check new item whether is hot + if oldItem == nil { + regionStats := f.kind.RegionStats() + thresholds := f.calcHotThresholds(storeID) + isHot := slice.AnyOf(regionStats, func(i int) bool { + return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] + }) + if !isHot { + continue + } } - } - - peers := region.GetPeers() - newItem := &HotPeerStat{ - StoreID: storeID, - RegionID: regionID, - Loads: f.kind.GetLoadRatesFromPeer(peer), - isLeader: region.GetLeader().GetStoreId() == storeID, - actionType: utils.Update, - stores: make([]uint64, len(peers)), - } - for i, peer := range peers { - newItem.stores[i] = peer.GetStoreId() - } - if oldItem == nil { - return f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second) + newItem := &HotPeerStat{ + StoreID: storeID, + RegionID: regionID, + Loads: f.kind.GetLoadRates(deltaLoads, interval), + isLeader: region.GetLeader().GetStoreId() == storeID, + actionType: utils.Update, + stores: make([]uint64, len(regionPeers)), + } + for i, peer := range regionPeers { + newItem.stores[i] = peer.GetStoreId() + } + if oldItem == nil { + stats = append(stats, f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second)) + continue + } + stats = append(stats, f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source)) } - return f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source) + return stats } // checkColdPeer checks the collect the un-heartbeat peer and maintain it. 
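For context, a minimal sketch (not part of the patch) of how callers queue hot-peer checks after this change: the per-peer loop that built core.PeerInfo is gone, and a whole region is submitted as one batched task. The package name "example" and the way the *statistics.HotCache handle, region, peer, loads, and interval are obtained are assumptions; the task constructors and Check*Async calls follow the hunks above.

package example

import (
	"github.com/pingcap/kvproto/pkg/metapb"
	"github.com/tikv/pd/pkg/core"
	"github.com/tikv/pd/pkg/statistics"
)

// queueWriteCheck replaces the old per-peer loop that built core.PeerInfo:
// the whole region is submitted as a single batched write task.
func queueWriteCheck(cache *statistics.HotCache, region *core.RegionInfo) {
	cache.CheckWriteAsync(statistics.NewCheckWritePeerTask(region))
}

// queueReadCheck keeps the caller-chosen peer and loads (as in the store
// heartbeat path above) and passes them through one batched read task.
func queueReadCheck(cache *statistics.HotCache, region *core.RegionInfo, peer *metapb.Peer, loads []float64, interval uint64) {
	cache.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval))
}

The write task derives its own loads and interval inside runTask from region.GetWriteLoads() and region.GetInterval(), as shown in the hot_cache_task.go hunk above, so the caller only hands over the region.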
diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index 36f922d38307..c116e020f544 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -109,14 +109,7 @@ func checkFlow(cache *hotPeerCache, region *core.RegionInfo, peers []*metapb.Pee reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() res = append(res, cache.collectExpiredItems(region)...) - for _, peer := range peers { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - res = append(res, item) - } - } - return res + return append(res, cache.checkPeerFlow(region, peers, region.GetLoads(), interval)...) } func updateFlow(cache *hotPeerCache, res []*HotPeerStat) []*HotPeerStat { @@ -318,13 +311,13 @@ func TestUpdateHotPeerStat(t *testing.T) { }() // skip interval=0 - interval := 0 + interval := uint64(0) deltaLoads := []float64{0.0, 0.0, 0.0} utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem := cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem := cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Nil(newItem) // new peer, interval is larger than report interval, but no hot @@ -333,8 +326,8 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 1.0 utils.MinHotThresholds[utils.RegionReadKeys] = 1.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 1.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Nil(newItem) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Empty(newItem) // new peer, interval is less than report interval interval = 4 @@ -342,50 +335,49 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.NotNil(newItem) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is less than report interval - interval = 4 deltaLoads = []float64{60.0, 60.0, 60.0} - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot - newItem.AntiCount = utils.Read.DefaultAntiCount() - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + newItem[0].AntiCount = utils.Read.DefaultAntiCount() + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of 
interval is less than report interval - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot interval = 10 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(2, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(2, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold utils.MinHotThresholds[utils.RegionReadBytes] = 10.0 utils.MinHotThresholds[utils.RegionReadKeys] = 10.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 10.0 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m-1, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m-1, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold for i := 0; i < 2*m-1; i++ { - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } - re.Less(newItem.HotDegree, 0) - re.Equal(0, newItem.AntiCount) - re.Equal(utils.Remove, newItem.actionType) + re.Less(newItem[0].HotDegree, 0) + re.Equal(0, newItem[0].AntiCount) + re.Equal(utils.Remove, newItem[0].actionType) } func TestThresholdWithUpdateHotPeerStat(t *testing.T) { @@ -688,9 +680,8 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { StartTimestamp: start, EndTimestamp: end, })) - newPeer := core.NewPeerInfo(meta.Peers[0], region.GetLoads(), end-start) - stat := cache.checkPeerFlow(newPeer, newRegion) - if stat != nil { + stats := cache.checkPeerFlow(newRegion, newRegion.GetPeers(), newRegion.GetLoads(), end-start) + for _, stat := range stats { cache.updateStat(stat) } } @@ -717,22 +708,11 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { func BenchmarkCheckRegionFlow(b *testing.B) { cache := NewHotPeerCache(context.Background(), utils.Read) region := buildRegion(utils.Read, 3, 10) - peerInfos := make([]*core.PeerInfo, 0) - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), 10) - peerInfos = append(peerInfos, peerInfo) - } b.ResetTimer() for i := 0; i < b.N; i++ { - items := make([]*HotPeerStat, 0) - for _, peerInfo := range peerInfos { - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - for _, ret := range items { - cache.updateStat(ret) + stats := cache.checkPeerFlow(region, region.GetPeers(), region.GetLoads(), 10) + for _, stat := range stats { + cache.updateStat(stat) } } } diff --git a/pkg/statistics/utils/kind.go b/pkg/statistics/utils/kind.go index 4d44b8d57e10..089732f759f8 100644 --- a/pkg/statistics/utils/kind.go +++ b/pkg/statistics/utils/kind.go @@ -14,10 +14,6 @@ package utils -import 
( - "github.com/tikv/pd/pkg/core" -) - const ( // BytePriority indicates hot-region-scheduler prefer byte dim BytePriority = "byte" @@ -230,10 +226,8 @@ func (rw RWType) DefaultAntiCount() int { } } -// GetLoadRatesFromPeer gets the load rates of the read or write type from PeerInfo. -func (rw RWType) GetLoadRatesFromPeer(peer *core.PeerInfo) []float64 { - deltaLoads := peer.GetLoads() - interval := peer.GetInterval() +// GetLoadRates gets the load rates of the read or write type. +func (rw RWType) GetLoadRates(deltaLoads []float64, interval uint64) []float64 { loads := make([]float64, DimLen) for dim, k := range rw.RegionStats() { loads[dim] = deltaLoads[k] / float64(interval) diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 148b43541a23..057814b718bc 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -959,8 +959,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } } for _, stat := range stats.GetSnapshotStats() { diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 945e354bb6cb..0f08153c8ae7 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" @@ -3730,3 +3731,34 @@ func waitNoResponse(re *require.Assertions, stream mockhbstream.HeartbeatStream) return res == nil }) } + +func BenchmarkHandleStatsAsync(b *testing.B) { + // Setup: create a new instance of Cluster + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _, opt, _ := newTestScheduleConfig() + c := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) + c.coordinator = schedule.NewCoordinator(ctx, c, nil) + c.SetPrepared() + region := core.NewRegionInfo(&metapb.Region{ + Id: 1, + RegionEpoch: &metapb.RegionEpoch{ + ConfVer: 1, + Version: 1, + }, + StartKey: []byte{byte(2)}, + EndKey: []byte{byte(3)}, + Peers: []*metapb.Peer{{Id: 11, StoreId: uint64(1)}}, + }, nil, + core.SetApproximateSize(10), + core.SetReportInterval(0, 10), + ) + + // Reset timer after setup + b.ResetTimer() + // Run HandleStatsAsync b.N times + for i := 0; i < b.N; i++ { + cluster.HandleStatsAsync(c, region) + } +} diff --git a/tools/pd-ctl/tests/hot/hot_test.go b/tools/pd-ctl/tests/hot/hot_test.go index 7661704aa41d..f65b811b36a8 100644 --- a/tools/pd-ctl/tests/hot/hot_test.go +++ b/tools/pd-ctl/tests/hot/hot_test.go @@ -188,11 +188,10 @@ func (suite *hotTestSuite) checkHot(cluster *pdTests.TestCluster) { Id: 100 + regionIDCounter, StoreId: hotStoreID, } - peerInfo := core.NewPeerInfo(leader, loads, reportInterval) region := core.NewRegionInfo(&metapb.Region{ Id: hotRegionID, }, leader) - hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{leader}, loads, reportInterval)) testutil.Eventually(re, func() bool { hotPeerStat := getHotPeerStat(utils.Read, hotRegionID, hotStoreID) return 
hotPeerStat != nil From 5eb66e09360987ffe35b91cf1849d5628b2f52fb Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Tue, 28 May 2024 14:49:20 +0800 Subject: [PATCH 02/47] *: refactor store info (#6830) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/basic_cluster.go | 195 +---------- pkg/core/store.go | 194 +++++++---- pkg/keyspace/keyspace.go | 10 +- pkg/mcs/scheduling/server/apis/v1/api.go | 4 +- pkg/mcs/scheduling/server/cluster.go | 20 +- pkg/mock/mockcluster/mockcluster.go | 12 +- pkg/schedule/checker/rule_checker_test.go | 2 +- pkg/schedule/placement/fit_test.go | 2 +- pkg/schedule/scatter/region_scatterer_test.go | 2 +- pkg/schedule/schedulers/balance_test.go | 2 +- pkg/storage/leveldb_backend.go | 0 pkg/storage/storage_test.go | 4 +- .../unsafe_recovery_controller.go | 4 +- server/api/admin.go | 6 +- server/api/stats.go | 2 +- server/cluster/cluster.go | 305 +++--------------- server/cluster/cluster_test.go | 136 ++++---- server/cluster/scheduling_controller.go | 2 +- server/grpc_service.go | 2 +- server/server.go | 2 - tests/integrations/mcs/scheduling/api_test.go | 6 +- .../mcs/scheduling/config_test.go | 2 +- .../integrations/mcs/scheduling/meta_test.go | 4 +- .../mcs/scheduling/server_test.go | 2 +- tests/server/api/region_test.go | 2 +- tests/server/cluster/cluster_test.go | 2 +- 26 files changed, 293 insertions(+), 631 deletions(-) mode change 100644 => 100755 pkg/storage/leveldb_backend.go diff --git a/pkg/core/basic_cluster.go b/pkg/core/basic_cluster.go index d70b620db3bc..2392b7ddac69 100644 --- a/pkg/core/basic_cluster.go +++ b/pkg/core/basic_cluster.go @@ -14,218 +14,43 @@ package core -import ( - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core/storelimit" - "github.com/tikv/pd/pkg/utils/syncutil" -) - // BasicCluster provides basic data member and interface for a tikv cluster. type BasicCluster struct { - Stores struct { - mu syncutil.RWMutex - *StoresInfo - } - + *StoresInfo *RegionsInfo } // NewBasicCluster creates a BasicCluster. func NewBasicCluster() *BasicCluster { return &BasicCluster{ - Stores: struct { - mu syncutil.RWMutex - *StoresInfo - }{StoresInfo: NewStoresInfo()}, - + StoresInfo: NewStoresInfo(), RegionsInfo: NewRegionsInfo(), } } -/* Stores read operations */ - -// GetStores returns all Stores in the cluster. -func (bc *BasicCluster) GetStores() []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStores() -} - -// GetMetaStores gets a complete set of metapb.Store. -func (bc *BasicCluster) GetMetaStores() []*metapb.Store { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetMetaStores() -} - -// GetStore searches for a store by ID. -func (bc *BasicCluster) GetStore(storeID uint64) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(storeID) -} - -// GetRegionStores returns all Stores that contains the region's peer. -func (bc *BasicCluster) GetRegionStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetStoreIDs() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. 
-func (bc *BasicCluster) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetNonWitnessVoters() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetFollowerStores returns all Stores that contains the region's follower peer. -func (bc *BasicCluster) GetFollowerStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetFollowers() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetLeaderStore returns all Stores that contains the region's leader peer. -func (bc *BasicCluster) GetLeaderStore(region *RegionInfo) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) -} - -// GetStoreCount returns the total count of storeInfo. -func (bc *BasicCluster) GetStoreCount() int { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStoreCount() -} - -/* Stores Write operations */ - -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. -func (bc *BasicCluster) PauseLeaderTransfer(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (bc *BasicCluster) ResumeLeaderTransfer(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowStoreEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowTrendEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (bc *BasicCluster) SlowTrendRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (bc *BasicCluster) SlowStoreRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowStoreRecovered(storeID) -} - -// ResetStoreLimit resets the limit for a specific store. -func (bc *BasicCluster) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResetStoreLimit(storeID, limitType, ratePerSec...) -} - // UpdateStoreStatus updates the information of the store. 
func (bc *BasicCluster) UpdateStoreStatus(storeID uint64) { - leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.RegionsInfo.GetStoreStats(storeID) - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) -} - -// PutStore put a store. -func (bc *BasicCluster) PutStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SetStore(store) -} - -// ResetStores resets the store cache. -func (bc *BasicCluster) ResetStores() { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.StoresInfo = NewStoresInfo() -} - -// DeleteStore deletes a store. -func (bc *BasicCluster) DeleteStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.DeleteStore(store) + leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.GetStoreStats(storeID) + bc.StoresInfo.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) } /* Regions read operations */ // GetLeaderStoreByRegionID returns the leader store of the given region. func (bc *BasicCluster) GetLeaderStoreByRegionID(regionID uint64) *StoreInfo { - region := bc.RegionsInfo.GetRegion(regionID) + region := bc.GetRegion(regionID) if region == nil || region.GetLeader() == nil { return nil } - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) + return bc.GetStore(region.GetLeader().GetStoreId()) } func (bc *BasicCluster) getWriteRate( f func(storeID uint64) (bytesRate, keysRate float64), ) (storeIDs []uint64, bytesRates, keysRates []float64) { - bc.Stores.mu.RLock() - count := len(bc.Stores.stores) - storeIDs = make([]uint64, 0, count) - for _, store := range bc.Stores.stores { - storeIDs = append(storeIDs, store.GetID()) - } - bc.Stores.mu.RUnlock() + storeIDs = bc.GetStoreIDs() + count := len(storeIDs) bytesRates = make([]float64, 0, count) keysRates = make([]float64, 0, count) for _, id := range storeIDs { @@ -238,12 +63,12 @@ func (bc *BasicCluster) getWriteRate( // GetStoresLeaderWriteRate get total write rate of each store's leaders. func (bc *BasicCluster) GetStoresLeaderWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreLeaderWriteRate) + return bc.getWriteRate(bc.GetStoreLeaderWriteRate) } // GetStoresWriteRate get total write rate of each store's regions. func (bc *BasicCluster) GetStoresWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreWriteRate) + return bc.getWriteRate(bc.GetStoreWriteRate) } // UpdateAllStoreStatus updates the information of all stores. diff --git a/pkg/core/store.go b/pkg/core/store.go index 9b660754496e..5baedafdb05e 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/pkg/utils/typeutil" "go.uber.org/zap" ) @@ -639,6 +640,7 @@ func MergeLabels(origin []*metapb.StoreLabel, labels []*metapb.StoreLabel) []*me // StoresInfo contains information about all stores. 
type StoresInfo struct { + syncutil.RWMutex stores map[uint64]*StoreInfo } @@ -649,8 +651,12 @@ func NewStoresInfo() *StoresInfo { } } +/* Stores read operations */ + // GetStore returns a copy of the StoreInfo with the specified storeID. func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { + s.RLock() + defer s.RUnlock() store, ok := s.stores[storeID] if !ok { return nil @@ -658,13 +664,121 @@ func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { return store } -// SetStore sets a StoreInfo with storeID. -func (s *StoresInfo) SetStore(store *StoreInfo) { +// GetStores gets a complete set of StoreInfo. +func (s *StoresInfo) GetStores() []*StoreInfo { + s.RLock() + defer s.RUnlock() + stores := make([]*StoreInfo, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store) + } + return stores +} + +// GetMetaStores gets a complete set of metapb.Store. +func (s *StoresInfo) GetMetaStores() []*metapb.Store { + s.RLock() + defer s.RUnlock() + stores := make([]*metapb.Store, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store.GetMeta()) + } + return stores +} + +// GetStoreIDs returns a list of store ids. +func (s *StoresInfo) GetStoreIDs() []uint64 { + s.RLock() + defer s.RUnlock() + count := len(s.stores) + storeIDs := make([]uint64, 0, count) + for _, store := range s.stores { + storeIDs = append(storeIDs, store.GetID()) + } + return storeIDs +} + +// GetFollowerStores returns all Stores that contains the region's follower peer. +func (s *StoresInfo) GetFollowerStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetFollowers() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetRegionStores returns all Stores that contains the region's peer. +func (s *StoresInfo) GetRegionStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetStoreIDs() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetLeaderStore returns all Stores that contains the region's leader peer. +func (s *StoresInfo) GetLeaderStore(region *RegionInfo) *StoreInfo { + s.RLock() + defer s.RUnlock() + if store, ok := s.stores[region.GetLeader().GetStoreId()]; ok && store != nil { + return store + } + return nil +} + +// GetStoreCount returns the total count of storeInfo. +func (s *StoresInfo) GetStoreCount() int { + s.RLock() + defer s.RUnlock() + return len(s.stores) +} + +// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. +func (s *StoresInfo) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetNonWitnessVoters() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +/* Stores write operations */ + +// PutStore sets a StoreInfo with storeID. +func (s *StoresInfo) PutStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() + s.putStoreLocked(store) +} + +// putStoreLocked sets a StoreInfo with storeID. +func (s *StoresInfo) putStoreLocked(store *StoreInfo) { s.stores[store.GetID()] = store } +// ResetStores resets the store cache. 
+func (s *StoresInfo) ResetStores() { + s.Lock() + defer s.Unlock() + s.stores = make(map[uint64]*StoreInfo) +} + // PauseLeaderTransfer pauses a StoreInfo with storeID. func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -679,6 +793,8 @@ func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { // ResumeLeaderTransfer cleans a store's pause state. The store can be selected // as source or target of TransferLeader again. func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's pause state, but it is not found. It may be cleanup", @@ -691,6 +807,8 @@ func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { // SlowStoreEvicted marks a store as a slow store and prevents transferring // leader to the store func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -704,6 +822,8 @@ func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { // SlowStoreRecovered cleans the evicted state of a store. func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted as a slow store state, but it is not found. It may be cleanup", @@ -716,6 +836,8 @@ func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { // SlowTrendEvicted marks a store as a slow trend and prevents transferring // leader to the store func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -729,6 +851,8 @@ func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { // SlowTrendRecovered cleans the evicted by trend state of a store. func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted by trend as a slow store state, but it is not found. It may be cleanup", @@ -740,76 +864,24 @@ func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { // ResetStoreLimit resets the limit for a specific store. func (s *StoresInfo) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { s.stores[storeID] = store.Clone(ResetStoreLimit(limitType, ratePerSec...)) } } -// GetStores gets a complete set of StoreInfo. -func (s *StoresInfo) GetStores() []*StoreInfo { - stores := make([]*StoreInfo, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store) - } - return stores -} - -// GetMetaStores gets a complete set of metapb.Store. -func (s *StoresInfo) GetMetaStores() []*metapb.Store { - stores := make([]*metapb.Store, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store.GetMeta()) - } - return stores -} - // DeleteStore deletes tombstone record form store func (s *StoresInfo) DeleteStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() delete(s.stores, store.GetID()) } -// GetStoreCount returns the total count of storeInfo. -func (s *StoresInfo) GetStoreCount() int { - return len(s.stores) -} - -// SetLeaderCount sets the leader count to a storeInfo. 
-func (s *StoresInfo) SetLeaderCount(storeID uint64, leaderCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderCount(leaderCount)) - } -} - -// SetRegionCount sets the region count to a storeInfo. -func (s *StoresInfo) SetRegionCount(storeID uint64, regionCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionCount(regionCount)) - } -} - -// SetPendingPeerCount sets the pending count to a storeInfo. -func (s *StoresInfo) SetPendingPeerCount(storeID uint64, pendingPeerCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetPendingPeerCount(pendingPeerCount)) - } -} - -// SetLeaderSize sets the leader size to a storeInfo. -func (s *StoresInfo) SetLeaderSize(storeID uint64, leaderSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderSize(leaderSize)) - } -} - -// SetRegionSize sets the region size to a storeInfo. -func (s *StoresInfo) SetRegionSize(storeID uint64, regionSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionSize(regionSize)) - } -} - // UpdateStoreStatus updates the information of the store. func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount int, leaderSize int64, regionSize int64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { newStore := store.ShallowClone(SetLeaderCount(leaderCount), SetRegionCount(regionCount), @@ -818,7 +890,7 @@ func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, SetPendingPeerCount(pendingPeerCount), SetLeaderSize(leaderSize), SetRegionSize(regionSize)) - s.SetStore(newStore) + s.putStoreLocked(newStore) } } diff --git a/pkg/keyspace/keyspace.go b/pkg/keyspace/keyspace.go index d84b3698f69a..b37ec7f0fcaa 100644 --- a/pkg/keyspace/keyspace.go +++ b/pkg/keyspace/keyspace.go @@ -343,20 +343,20 @@ func (manager *Manager) splitKeyspaceRegion(id uint32, waitRegionSplit bool) (er for { select { case <-ticker.C: - regionsInfo := manager.cluster.GetBasicCluster().RegionsInfo - region := regionsInfo.GetRegionByKey(rawLeftBound) + c := manager.cluster.GetBasicCluster() + region := c.GetRegionByKey(rawLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawLeftBound) { continue } - region = regionsInfo.GetRegionByKey(rawRightBound) + region = c.GetRegionByKey(rawRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawRightBound) { continue } - region = regionsInfo.GetRegionByKey(txnLeftBound) + region = c.GetRegionByKey(txnLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnLeftBound) { continue } - region = regionsInfo.GetRegionByKey(txnRightBound) + region = c.GetRegionByKey(txnRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnRightBound) { continue } diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index be3277f3fc63..39aa11927ca4 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -272,7 +272,7 @@ func deleteAllRegionCache(c *gin.Context) { c.String(http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs().Error()) return } - cluster.DropCacheAllRegion() + cluster.ResetRegionCache() c.String(http.StatusOK, "All regions are removed from server cache.") } @@ -297,7 +297,7 @@ func deleteRegionCacheByID(c *gin.Context) { 
c.String(http.StatusBadRequest, err.Error()) return } - cluster.DropCacheRegion(regionID) + cluster.RemoveRegionIfExist(regionID) c.String(http.StatusOK, "The region is removed from server cache.") } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index d711ab2d4f6e..caaafe42c87e 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -69,9 +69,9 @@ const ( collectWaitTime = time.Minute // heartbeat relative const - heartbeatTaskRunner = "heartbeat-task-runner" - statisticsTaskRunner = "statistics-task-runner" - logTaskRunner = "log-task-runner" + heartbeatTaskRunner = "heartbeat-task-runner" + miscTaskRunner = "misc-task-runner" + logTaskRunner = "log-task-runner" ) var syncRunner = ratelimit.NewSyncRunner() @@ -100,7 +100,7 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, checkMembershipCh: checkMembershipCh, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) @@ -521,7 +521,7 @@ func (c *Cluster) collectMetrics() { // collect hot cache metrics c.hotStat.CollectMetrics() // collect the lock metrics - c.RegionsInfo.CollectWaitLockMetrics() + c.CollectWaitLockMetrics() } func resetMetrics() { @@ -688,16 +688,6 @@ func (c *Cluster) SetPrepared() { c.coordinator.GetPrepareChecker().SetPrepared() } -// DropCacheAllRegion removes all cached regions. -func (c *Cluster) DropCacheAllRegion() { - c.ResetRegionCache() -} - -// DropCacheRegion removes a region from the cache. -func (c *Cluster) DropCacheRegion(id uint64) { - c.RemoveRegionIfExist(id) -} - // IsSchedulingHalted returns whether the scheduling is halted. // Currently, the microservice scheduling is halted when: // - The `HaltScheduling` persist option is set to true. diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index 3f9710c48fd2..5d3aba2d2e8e 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -138,11 +138,6 @@ func (mc *Cluster) GetStoresLoads() map[uint64][]float64 { return mc.HotStat.GetStoresLoads() } -// GetStore gets a store with a given store ID. -func (mc *Cluster) GetStore(storeID uint64) *core.StoreInfo { - return mc.Stores.GetStore(storeID) -} - // IsRegionHot checks if the region is hot. func (mc *Cluster) IsRegionHot(region *core.RegionInfo) bool { return mc.HotCache.IsRegionHot(region, mc.GetHotRegionCacheHitsThreshold()) @@ -561,11 +556,6 @@ func (mc *Cluster) AddLeaderRegionWithWriteInfo( return items } -// DropCacheAllRegion removes all regions from the cache. -func (mc *Cluster) DropCacheAllRegion() { - mc.ResetRegionCache() -} - // UpdateStoreLeaderWeight updates store leader weight. 
func (mc *Cluster) UpdateStoreLeaderWeight(storeID uint64, weight float64) { store := mc.GetStore(storeID) @@ -752,7 +742,7 @@ func (mc *Cluster) UpdateStoreStatus(id uint64) { pendingPeerCount := mc.GetStorePendingPeerCount(id) leaderSize := mc.GetStoreLeaderRegionSize(id) regionSize := mc.GetStoreRegionSize(id) - store := mc.Stores.GetStore(id) + store := mc.GetStore(id) stats := &pdpb.StoreStats{} stats.Capacity = defaultStoreCapacity stats.Available = stats.Capacity - uint64(store.GetRegionSize()*units.MiB) diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index e69b956134b5..e1cc702fd36b 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -1980,7 +1980,7 @@ func makeStores() placement.StoreSet { if zone == 1 && host == 1 { labels["type"] = "read" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) } } } diff --git a/pkg/schedule/placement/fit_test.go b/pkg/schedule/placement/fit_test.go index aa5c66059f7d..cc49d25640c5 100644 --- a/pkg/schedule/placement/fit_test.go +++ b/pkg/schedule/placement/fit_test.go @@ -47,7 +47,7 @@ func makeStores() StoreSet { if id == 1111 || id == 2111 || id == 3111 { labels["disk"] = "ssd" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) } } } diff --git a/pkg/schedule/scatter/region_scatterer_test.go b/pkg/schedule/scatter/region_scatterer_test.go index b0027e0e4155..89e55e5c9c72 100644 --- a/pkg/schedule/scatter/region_scatterer_test.go +++ b/pkg/schedule/scatter/region_scatterer_test.go @@ -216,7 +216,7 @@ func scatterSpecial(re *require.Assertions, numOrdinaryStores, numSpecialStores, leaderStoreID := region.GetLeader().GetStoreId() for _, peer := range region.GetPeers() { storeID := peer.GetStoreId() - store := tc.Stores.GetStore(storeID) + store := tc.GetStore(storeID) if store.GetLabelValue("engine") == "tiflash" { countSpecialPeers[storeID]++ } else { diff --git a/pkg/schedule/schedulers/balance_test.go b/pkg/schedule/schedulers/balance_test.go index 234acfd6d264..26214ed5456c 100644 --- a/pkg/schedule/schedulers/balance_test.go +++ b/pkg/schedule/schedulers/balance_test.go @@ -697,7 +697,7 @@ func (suite *balanceLeaderRangeSchedulerTestSuite) TestReSortStores() { suite.tc.AddLeaderStore(4, 100) suite.tc.AddLeaderStore(5, 100) suite.tc.AddLeaderStore(6, 0) - stores := suite.tc.Stores.GetStores() + stores := suite.tc.GetStores() sort.Slice(stores, func(i, j int) bool { return stores[i].GetID() < stores[j].GetID() }) diff --git a/pkg/storage/leveldb_backend.go b/pkg/storage/leveldb_backend.go old mode 100644 new mode 100755 diff --git a/pkg/storage/storage_test.go b/pkg/storage/storage_test.go index 4525ec6091c0..460489ecd10f 100644 --- a/pkg/storage/storage_test.go +++ b/pkg/storage/storage_test.go @@ -100,7 +100,7 @@ func TestLoadStores(t *testing.T) { n := 10 stores := mustSaveStores(re, storage, n) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) re.Equal(n, cache.GetStoreCount()) for _, store := range cache.GetMetaStores() { @@ -117,7 +117,7 @@ func TestStoreWeight(t *testing.T) { mustSaveStores(re, storage, n) 
re.NoError(storage.SaveStoreWeight(1, 2.0, 3.0)) re.NoError(storage.SaveStoreWeight(2, 0.2, 0.3)) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) leaderWeights := []float64{1.0, 2.0, 0.2} regionWeights := []float64{1.0, 3.0, 0.3} for i := 0; i < n; i++ { diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index d2f6125c3f3a..89cd6e6393c8 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -107,7 +107,7 @@ const ( type cluster interface { core.StoreSetInformer - DropCacheAllRegion() + ResetRegionCache() AllocID() (uint64, error) BuryStore(storeID uint64, forceBury bool) error GetSchedulerConfig() sc.SchedulerConfigProvider @@ -544,7 +544,7 @@ func (u *Controller) changeStage(stage stage) { case Finished: if u.step > 1 { // == 1 means no operation has done, no need to invalid cache - u.cluster.DropCacheAllRegion() + u.cluster.ResetRegionCache() } output.Info = "Unsafe recovery Finished" output.Details = u.getAffectedTableDigest() diff --git a/server/api/admin.go b/server/api/admin.go index ab5ba8822871..dd81985b514c 100644 --- a/server/api/admin.go +++ b/server/api/admin.go @@ -60,7 +60,7 @@ func (h *adminHandler) DeleteRegionCache(w http.ResponseWriter, r *http.Request) h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -100,7 +100,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques return } // Remove region from cache. - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -116,7 +116,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques func (h *adminHandler) DeleteAllRegionCache(w http.ResponseWriter, r *http.Request) { var err error rc := getCluster(r) - rc.DropCacheAllRegion() + rc.ResetRegionCache() if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer() } diff --git a/server/api/stats.go b/server/api/stats.go index 915d33ddfdff..5aa8fcb72a69 100644 --- a/server/api/stats.go +++ b/server/api/stats.go @@ -47,7 +47,7 @@ func (h *statsHandler) GetRegionStatus(w http.ResponseWriter, r *http.Request) { startKey, endKey := r.URL.Query().Get("start_key"), r.URL.Query().Get("end_key") var stats *statistics.RegionStats if r.URL.Query().Has("count") { - stats = rc.GetRegionCount([]byte(startKey), []byte(endKey)) + stats = rc.GetRegionStatsCount([]byte(startKey), []byte(endKey)) } else { stats = rc.GetRegionStatsByRange([]byte(startKey), []byte(endKey)) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 057814b718bc..70d6b46b9803 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -107,9 +107,9 @@ const ( minSnapshotDurationSec = 5 // heartbeat relative const - heartbeatTaskRunner = "heartbeat-async" - statisticsTaskRunner = "statistics-async" - logTaskRunner = "log-async" + heartbeatTaskRunner = "heartbeat-async" + miscTaskRunner = "misc-async" + logTaskRunner = "log-async" ) // Server is the interface for cluster. 
@@ -143,6 +143,8 @@ type RaftCluster struct { ctx context.Context cancel context.CancelFunc + *core.BasicCluster // cached cluster info + etcdClient *clientv3.Client httpClient *http.Client @@ -159,7 +161,6 @@ type RaftCluster struct { // This below fields are all read-only, we cannot update itself after the raft cluster starts. clusterID uint64 id id.Allocator - core *core.BasicCluster // cached cluster info opt *config.PersistOptions limiter *StoreLimiter *schedulingController @@ -201,10 +202,10 @@ func NewRaftCluster(ctx context.Context, clusterID uint64, basicCluster *core.Ba regionSyncer: regionSyncer, httpClient: httpClient, etcdClient: etcdClient, - core: basicCluster, + BasicCluster: basicCluster, storage: storage, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } } @@ -251,10 +252,10 @@ func (c *RaftCluster) LoadClusterStatus() (*Status, error) { } func (c *RaftCluster) isInitialized() bool { - if c.core.GetTotalRegionCount() > 1 { + if c.GetTotalRegionCount() > 1 { return true } - region := c.core.GetRegionByKey(nil) + region := c.GetRegionByKey(nil) return region != nil && len(region.GetVoters()) >= int(c.opt.GetReplicationConfig().MaxReplicas) && len(region.GetPendingPeers()) == 0 @@ -295,7 +296,7 @@ func (c *RaftCluster) InitCluster( return err } } - c.schedulingController = newSchedulingController(c.ctx, c.core, c.opt, c.ruleManager) + c.schedulingController = newSchedulingController(c.ctx, c.BasicCluster, c.opt, c.ruleManager) return nil } @@ -644,9 +645,9 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { return nil, nil } - c.core.ResetStores() + c.ResetStores() start := time.Now() - if err := c.storage.LoadStores(c.core.PutStore); err != nil { + if err := c.storage.LoadStores(c.PutStore); err != nil { return nil, err } log.Info("load stores", @@ -657,11 +658,11 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { start = time.Now() // used to load region from kv storage to cache storage. - if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.core.CheckAndPutRegion); err != nil { + if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.CheckAndPutRegion); err != nil { return nil, err } log.Info("load regions", - zap.Int("count", c.core.GetTotalRegionCount()), + zap.Int("count", c.GetTotalRegionCount()), zap.Duration("cost", time.Since(start)), ) @@ -729,7 +730,7 @@ func (c *RaftCluster) runUpdateStoreStats() { case <-ticker.C: // Update related stores. 
start := time.Now() - c.core.UpdateAllStoreStatus() + c.UpdateAllStoreStatus() updateStoreStatsGauge.Set(time.Since(start).Seconds()) } } @@ -868,8 +869,6 @@ func (c *RaftCluster) GetUnsafeRecoveryController() *unsaferecovery.Controller { func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest, resp *pdpb.StoreHeartbeatResponse) error { stats := heartbeat.GetStats() storeID := stats.GetStoreId() - c.Lock() - defer c.Unlock() store := c.GetStore(storeID) if store == nil { return errors.Errorf("store %v not found", storeID) @@ -917,10 +916,10 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest newStore = newStore.Clone(core.SetLastPersistTime(nowTime)) } } - if store := c.core.GetStore(storeID); store != nil { + if store := c.GetStore(storeID); store != nil { statistics.UpdateStoreHeartbeatMetrics(store) } - c.core.PutStore(newStore) + c.PutStore(newStore) var ( regions map[uint64]*core.RegionInfo interval uint64 @@ -989,7 +988,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest // processReportBuckets update the bucket information. func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { - region := c.core.GetRegion(buckets.GetRegionId()) + region := c.GetRegion(buckets.GetRegionId()) if region == nil { regionCacheMissCounter.Inc() return errors.Errorf("region %v not found", buckets.GetRegionId()) @@ -1022,7 +1021,7 @@ var syncRunner = ratelimit.NewSyncRunner() // processRegionHeartbeat updates the region information. func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *core.RegionInfo) error { tracer := ctx.Tracer - origin, _, err := c.core.PreCheckPutRegion(region) + origin, _, err := c.PreCheckPutRegion(region) tracer.OnPreCheckFinished() if err != nil { return err @@ -1082,7 +1081,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // check its validation again here. // // However, it can't solve the race condition of concurrent heartbeats from the same region. - if overlaps, err = c.core.CheckAndPutRootTree(ctx, region); err != nil { + if overlaps, err = c.CheckAndPutRootTree(ctx, region); err != nil { tracer.OnSaveCacheFinished() return err } @@ -1173,158 +1172,7 @@ func (c *RaftCluster) putMetaLocked(meta *metapb.Cluster) error { // GetBasicCluster returns the basic cluster. func (c *RaftCluster) GetBasicCluster() *core.BasicCluster { - return c.core -} - -// GetRegionByKey gets regionInfo by region key from cluster. -func (c *RaftCluster) GetRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetRegionByKey(regionKey) -} - -// GetPrevRegionByKey gets previous region and leader peer by the region key from cluster. -func (c *RaftCluster) GetPrevRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetPrevRegionByKey(regionKey) -} - -// ScanRegions scans region with start key, until the region contains endKey, or -// total number greater than limit. -func (c *RaftCluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo { - return c.core.ScanRegions(startKey, endKey, limit) -} - -// GetRegion searches for a region by ID. -func (c *RaftCluster) GetRegion(regionID uint64) *core.RegionInfo { - return c.core.GetRegion(regionID) -} - -// GetMetaRegions gets regions from cluster. -func (c *RaftCluster) GetMetaRegions() []*metapb.Region { - return c.core.GetMetaRegions() -} - -// GetRegions returns all regions' information in detail. 
-func (c *RaftCluster) GetRegions() []*core.RegionInfo { - return c.core.GetRegions() -} - -// ValidRegion is used to decide if the region is valid. -func (c *RaftCluster) ValidRegion(region *metapb.Region) error { - return c.core.ValidRegion(region) -} - -// GetTotalRegionCount returns total count of regions -func (c *RaftCluster) GetTotalRegionCount() int { - return c.core.GetTotalRegionCount() -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegions(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegionsByType(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// RandLeaderRegions returns some random regions that has leader on the store. -func (c *RaftCluster) RandLeaderRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLeaderRegions(storeID, ranges) -} - -// RandFollowerRegions returns some random regions that has a follower on the store. -func (c *RaftCluster) RandFollowerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandFollowerRegions(storeID, ranges) -} - -// RandPendingRegions returns some random regions that has a pending peer on the store. -func (c *RaftCluster) RandPendingRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandPendingRegions(storeID, ranges) -} - -// RandLearnerRegions returns some random regions that has a learner peer on the store. -func (c *RaftCluster) RandLearnerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLearnerRegions(storeID, ranges) -} - -// RandWitnessRegions returns some random regions that has a witness peer on the store. -func (c *RaftCluster) RandWitnessRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandWitnessRegions(storeID, ranges) -} - -// GetLeaderStore returns all stores that contains the region's leader peer. -func (c *RaftCluster) GetLeaderStore(region *core.RegionInfo) *core.StoreInfo { - return c.core.GetLeaderStore(region) -} - -// GetNonWitnessVoterStores returns all stores that contains the region's non-witness voter peer. -func (c *RaftCluster) GetNonWitnessVoterStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetNonWitnessVoterStores(region) -} - -// GetFollowerStores returns all stores that contains the region's follower peer. -func (c *RaftCluster) GetFollowerStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetFollowerStores(region) -} - -// GetRegionStores returns all stores that contains the region's peer. -func (c *RaftCluster) GetRegionStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetRegionStores(region) -} - -// GetStoreCount returns the count of stores. -func (c *RaftCluster) GetStoreCount() int { - return c.core.GetStoreCount() -} - -// GetStoreRegionCount returns the number of regions for a given store. -func (c *RaftCluster) GetStoreRegionCount(storeID uint64) int { - return c.core.GetStoreRegionCount(storeID) -} - -// GetAverageRegionSize returns the average region approximate size. -func (c *RaftCluster) GetAverageRegionSize() int64 { - return c.core.GetAverageRegionSize() -} - -// DropCacheRegion removes a region from the cache. 
-func (c *RaftCluster) DropCacheRegion(id uint64) { - c.core.RemoveRegionIfExist(id) -} - -// DropCacheAllRegion removes all regions from the cache. -func (c *RaftCluster) DropCacheAllRegion() { - c.core.ResetRegionCache() -} - -// GetMetaStores gets stores from cluster. -func (c *RaftCluster) GetMetaStores() []*metapb.Store { - return c.core.GetMetaStores() -} - -// GetStores returns all stores in the cluster. -func (c *RaftCluster) GetStores() []*core.StoreInfo { - return c.core.GetStores() -} - -// GetLeaderStoreByRegionID returns the leader store of the given region. -func (c *RaftCluster) GetLeaderStoreByRegionID(regionID uint64) *core.StoreInfo { - return c.core.GetLeaderStoreByRegionID(regionID) -} - -// GetStore gets store from cluster. -func (c *RaftCluster) GetStore(storeID uint64) *core.StoreInfo { - return c.core.GetStore(storeID) -} - -// GetAdjacentRegions returns regions' information that are adjacent with the specific region ID. -func (c *RaftCluster) GetAdjacentRegions(region *core.RegionInfo) (*core.RegionInfo, *core.RegionInfo) { - return c.core.GetAdjacentRegions(region) -} - -// GetRangeHoles returns all range holes, i.e the key ranges without any region info. -func (c *RaftCluster) GetRangeHoles() [][]string { - return c.core.GetRangeHoles() + return c.BasicCluster } // UpdateStoreLabels updates a store's location labels @@ -1360,8 +1208,8 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error { return c.putStoreImpl(newStore, true) } -// PutStore puts a store. -func (c *RaftCluster) PutStore(store *metapb.Store) error { +// PutMetaStore puts a store. +func (c *RaftCluster) PutMetaStore(store *metapb.Store) error { if err := c.putStoreImpl(store, false); err != nil { return err } @@ -1374,9 +1222,6 @@ func (c *RaftCluster) PutStore(store *metapb.Store) error { // If 'force' is true, the store's labels will overwrite those labels which already existed in the store. // If 'force' is false, the store's labels will merge into those labels which already existed in the store. func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { - c.Lock() - defer c.Unlock() - if store.GetId() == 0 { return errors.Errorf("invalid put store %v", store) } @@ -1418,7 +1263,7 @@ func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { if err := c.checkStoreLabels(s); err != nil { return err } - return c.putStoreLocked(s) + return c.setStore(s) } func (c *RaftCluster) checkStoreVersion(store *metapb.Store) error { @@ -1463,9 +1308,6 @@ func (c *RaftCluster) checkStoreLabels(s *core.StoreInfo) error { // RemoveStore marks a store as offline in cluster. // State transition: Up -> Offline. 
func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1490,9 +1332,9 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), zap.Bool("physically-destroyed", newStore.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { - regionSize := float64(c.core.GetStoreRegionSize(storeID)) + regionSize := float64(c.GetStoreRegionSize(storeID)) c.resetProgress(storeID, store.GetAddress()) c.progressManager.AddProgress(encodeRemovingProgressKey(storeID), regionSize, regionSize, nodeStateCheckJobInterval, progress.WindowDurationOption(c.GetCoordinator().GetPatrolRegionsDuration())) // record the current store limit in memory @@ -1555,9 +1397,6 @@ func (c *RaftCluster) getUpStores() []uint64 { // BuryStore marks a store as tombstone in cluster. // If forceBury is false, the store should be offlined and emptied before calling this func. func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1582,8 +1421,8 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { zap.String("store-address", newStore.GetAddress()), zap.String("state", store.GetState().String()), zap.Bool("physically-destroyed", store.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) - c.onStoreVersionChangeLocked() + err := c.setStore(newStore) + c.OnStoreVersionChange() if err == nil { // clean up the residual information. delete(c.prevStoreLimit, storeID) @@ -1599,40 +1438,6 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { return err } -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. -func (c *RaftCluster) PauseLeaderTransfer(storeID uint64) error { - return c.core.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (c *RaftCluster) ResumeLeaderTransfer(storeID uint64) { - c.core.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (c *RaftCluster) SlowStoreEvicted(storeID uint64) error { - return c.core.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (c *RaftCluster) SlowTrendEvicted(storeID uint64) error { - return c.core.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (c *RaftCluster) SlowTrendRecovered(storeID uint64) { - c.core.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (c *RaftCluster) SlowStoreRecovered(storeID uint64) { - c.core.SlowStoreRecovered(storeID) -} - // NeedAwakenAllRegionsInStore checks whether we should do AwakenRegions operation. 
func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bool, slowStoreIDs []uint64) { store := c.GetStore(storeID) @@ -1664,9 +1469,6 @@ func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bo // UpStore up a store from offline func (c *RaftCluster) UpStore(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1697,7 +1499,7 @@ func (c *RaftCluster) UpStore(storeID uint64) error { log.Warn("store has been up", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { if exist { // persist the store limit @@ -1711,9 +1513,6 @@ func (c *RaftCluster) UpStore(storeID uint64) error { // ReadyToServe change store's node state to Serving. func (c *RaftCluster) ReadyToServe(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1735,7 +1534,7 @@ func (c *RaftCluster) ReadyToServe(storeID uint64) error { log.Info("store has changed to serving", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { c.resetProgress(storeID, store.GetAddress()) } @@ -1758,16 +1557,16 @@ func (c *RaftCluster) SetStoreWeight(storeID uint64, leaderWeight, regionWeight core.SetRegionWeight(regionWeight), ) - return c.putStoreLocked(newStore) + return c.setStore(newStore) } -func (c *RaftCluster) putStoreLocked(store *core.StoreInfo) error { +func (c *RaftCluster) setStore(store *core.StoreInfo) error { if c.storage != nil { if err := c.storage.SaveStoreMeta(store.GetMeta()); err != nil { return err } } - c.core.PutStore(store) + c.PutStore(store) if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { c.updateStoreStatistics(store.GetID(), store.IsSlow()) } @@ -1833,11 +1632,11 @@ func (c *RaftCluster) checkStores() { offlineStore := store.GetMeta() id := offlineStore.GetId() - regionSize := c.core.GetStoreRegionSize(id) + regionSize := c.GetStoreRegionSize(id) if c.IsPrepared() { c.updateProgress(id, store.GetAddress(), removingAction, float64(regionSize), float64(regionSize), false /* dec */) } - regionCount := c.core.GetStoreRegionCount(id) + regionCount := c.GetStoreRegionCount(id) // If the store is empty, it can be buried. 
if regionCount == 0 { if err := c.BuryStore(id, false); err != nil { @@ -1865,7 +1664,7 @@ func (c *RaftCluster) checkStores() { func (c *RaftCluster) getThreshold(stores []*core.StoreInfo, store *core.StoreInfo) float64 { start := time.Now() if !c.opt.IsPlacementRulesEnabled() { - regionSize := c.core.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) + regionSize := c.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) weight := getStoreTopoWeight(store, stores, c.opt.GetLocationLabels(), c.opt.GetMaxReplicas()) return float64(regionSize) * weight * 0.9 } @@ -1905,7 +1704,7 @@ func (c *RaftCluster) calculateRange(stores []*core.StoreInfo, store *core.Store matchStores = append(matchStores, s) } } - regionSize := c.core.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) + regionSize := c.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) weight := getStoreTopoWeight(store, matchStores, rule.LocationLabels, rule.Count) storeSize += float64(regionSize) * weight log.Debug("calculate range result", @@ -2071,13 +1870,10 @@ func encodePreparingProgressKey(storeID uint64) string { // RemoveTombStoneRecords removes the tombStone Records. func (c *RaftCluster) RemoveTombStoneRecords() error { - c.Lock() - defer c.Unlock() - var failedStores []uint64 for _, store := range c.GetStores() { if store.IsRemoved() { - if c.core.GetStoreRegionCount(store.GetID()) > 0 { + if c.GetStoreRegionCount(store.GetID()) > 0 { log.Warn("skip removing tombstone", zap.Stringer("store", store.GetMeta())) failedStores = append(failedStores, store.GetID()) continue @@ -2115,7 +1911,7 @@ func (c *RaftCluster) deleteStore(store *core.StoreInfo) error { return err } } - c.core.DeleteStore(store) + c.DeleteStore(store) return nil } @@ -2156,12 +1952,6 @@ func (c *RaftCluster) resetProgressIndicator() { // OnStoreVersionChange changes the version of the cluster when needed. func (c *RaftCluster) OnStoreVersionChange() { - c.RLock() - defer c.RUnlock() - c.onStoreVersionChangeLocked() -} - -func (c *RaftCluster) onStoreVersionChangeLocked() { var minVersion *semver.Version stores := c.GetStores() for _, s := range stores { @@ -2219,13 +2009,13 @@ func (c *RaftCluster) PutMetaCluster(meta *metapb.Cluster) error { // GetRegionStatsByRange returns region statistics from cluster. func (c *RaftCluster) GetRegionStatsByRange(startKey, endKey []byte) *statistics.RegionStats { - return statistics.GetRegionStats(c.core.ScanRegions(startKey, endKey, -1)) + return statistics.GetRegionStats(c.ScanRegions(startKey, endKey, -1)) } -// GetRegionCount returns the number of regions in the range. -func (c *RaftCluster) GetRegionCount(startKey, endKey []byte) *statistics.RegionStats { +// GetRegionStatsCount returns the number of regions in the range. 
+func (c *RaftCluster) GetRegionStatsCount(startKey, endKey []byte) *statistics.RegionStats { stats := &statistics.RegionStats{} - stats.Count = c.core.GetRegionCount(startKey, endKey) + stats.Count = c.GetRegionCount(startKey, endKey) return stats } @@ -2237,7 +2027,7 @@ func (c *RaftCluster) putRegion(region *core.RegionInfo) error { return err } } - c.core.PutRegion(region) + c.PutRegion(region) return nil } @@ -2292,7 +2082,7 @@ func (c *RaftCluster) AddStoreLimit(store *metapb.Store) { func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { cfg := c.opt.GetScheduleConfig().Clone() for _, limitType := range storelimit.TypeNameValue { - c.core.ResetStoreLimit(storeID, limitType) + c.ResetStoreLimit(storeID, limitType) } delete(cfg.StoreLimit, storeID) c.opt.SetScheduleConfig(cfg) @@ -2312,16 +2102,13 @@ func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { // SetMinResolvedTS sets up a store with min resolved ts. func (c *RaftCluster) SetMinResolvedTS(storeID, minResolvedTS uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) } newStore := store.Clone(core.SetMinResolvedTS(minResolvedTS)) - c.core.PutStore(newStore) + c.PutStore(newStore) return nil } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 0f08153c8ae7..ee7c477476be 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -93,7 +93,7 @@ func TestStoreHeartbeat(t *testing.T) { } re.Error(cluster.HandleStoreHeartbeat(req, resp)) - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.Equal(i+1, cluster.GetStoreCount()) re.Equal(int64(0), store.GetLastHeartbeatTS().UnixNano()) @@ -215,7 +215,7 @@ func TestFilterUnhealthyStore(t *testing.T) { Available: 50, RegionCount: 1, } - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.NotNil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -228,7 +228,7 @@ func TestFilterUnhealthyStore(t *testing.T) { RegionCount: 1, } newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) - re.NoError(cluster.putStoreLocked(newStore)) + re.NoError(cluster.setStore(newStore)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.Nil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -253,7 +253,7 @@ func TestSetOfflineStore(t *testing.T) { // Put 6 stores. for _, store := range newTestStores(6, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up -> offline @@ -295,7 +295,7 @@ func TestSetOfflineStore(t *testing.T) { // test clean up tombstone store toCleanStore := cluster.GetStore(1).Clone().GetMeta() toCleanStore.LastHeartbeat = time.Now().Add(-40 * 24 * time.Hour).UnixNano() - cluster.PutStore(toCleanStore) + cluster.PutMetaStore(toCleanStore) cluster.checkStores() re.Nil(cluster.GetStore(1)) } @@ -312,7 +312,7 @@ func TestSetOfflineWithReplica(t *testing.T) { // Put 4 stores. for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.RemoveStore(2, false)) @@ -351,7 +351,7 @@ func TestSetOfflineStoreWithEvictLeader(t *testing.T) { // Put 3 stores. 
for _, store := range newTestStores(3, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } _, err = addEvictLeaderScheduler(cluster, 1) @@ -378,7 +378,7 @@ func TestForceBuryStore(t *testing.T) { stores := newTestStores(2, "5.3.0") stores[1] = stores[1].Clone(core.SetLastHeartbeatTS(time.Now())) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.BuryStore(uint64(1), true)) re.Error(cluster.BuryStore(uint64(2), true)) @@ -396,7 +396,7 @@ func TestReuseAddress(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) // Put 4 stores. for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up // store 2: offline @@ -420,9 +420,9 @@ func TestReuseAddress(t *testing.T) { if storeInfo.IsPhysicallyDestroyed() || storeInfo.IsRemoved() { // try to start a new store with the same address with store which is physically destroyed or tombstone should be success - re.NoError(cluster.PutStore(newStore)) + re.NoError(cluster.PutMetaStore(newStore)) } else { - re.Error(cluster.PutStore(newStore)) + re.Error(cluster.PutMetaStore(newStore)) } } } @@ -450,7 +450,7 @@ func TestUpStore(t *testing.T) { // Put 5 stores. for _, store := range newTestStores(5, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // set store 1 offline @@ -490,7 +490,7 @@ func TestRemovingProcess(t *testing.T) { // Put 5 stores. stores := newTestStores(5, "5.0.0") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } regions := newTestRegions(100, 5, 1) var regionInStore1 []*core.RegionInfo @@ -518,7 +518,7 @@ func TestRemovingProcess(t *testing.T) { if i >= 5 { break } - cluster.DropCacheRegion(region.GetID()) + cluster.RemoveRegionIfExist(region.GetID()) i++ } cluster.checkStores() @@ -553,13 +553,13 @@ func TestDeleteStoreUpdatesClusterVersion(t *testing.T) { // Put 3 new 4.0.9 stores. for _, store := range newTestStores(3, "4.0.9") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) // Upgrade 2 stores to 5.0.0. 
for _, store := range newTestStores(2, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) @@ -582,14 +582,14 @@ func TestStoreClusterVersion(t *testing.T) { s1.Version = "5.0.1" s2.Version = "5.0.3" s3.Version = "5.0.5" - re.NoError(cluster.PutStore(s2)) + re.NoError(cluster.PutMetaStore(s2)) re.Equal(s2.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s1)) + re.NoError(cluster.PutMetaStore(s1)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s3)) + re.NoError(cluster.PutMetaStore(s3)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) } @@ -679,7 +679,7 @@ func TestBucketHeartbeat(t *testing.T) { n, np := uint64(2), uint64(2) regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), regions[0])) @@ -729,31 +729,31 @@ func TestRegionHeartbeat(t *testing.T) { regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } for i, region := range regions { // region does not exist. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is the same, not updated. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) origin := region // region is updated. region = origin.Clone(core.WithIncVersion()) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (Version). stale := origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is updated @@ -763,13 +763,13 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (ConfVer). stale = origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add a down peer. @@ -781,38 +781,38 @@ func TestRegionHeartbeat(t *testing.T) { })) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Add a pending peer. 
region = region.Clone(core.WithPendingPeers([]*metapb.Peer{region.GetPeers()[rand.Intn(len(region.GetPeers()))]})) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear down peers. region = region.Clone(core.WithDownPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear pending peers. region = region.Clone(core.WithPendingPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Remove peers. origin = region region = origin.Clone(core.SetPeers(region.GetPeers()[:1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add peers. region = origin regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Change one peer to witness @@ -822,47 +822,47 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change leader. region = region.Clone(core.WithLeader(region.GetPeers()[1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateSize. region = region.Clone(core.SetApproximateSize(144)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateKeys. region = region.Clone(core.SetApproximateKeys(144000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes written. region = region.Clone(core.SetWrittenBytes(24000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes read. 
region = region.Clone(core.SetReadBytes(1080000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Flashback region = region.Clone(core.WithFlashback(true, 1)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) region = region.Clone(core.WithFlashback(false, 0)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) } regionCounts := make(map[uint64]int) @@ -894,10 +894,10 @@ func TestRegionHeartbeat(t *testing.T) { time.Sleep(50 * time.Millisecond) for _, store := range cluster.GetStores() { - re.Equal(cluster.core.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) - re.Equal(cluster.core.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) - re.Equal(cluster.core.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) - re.Equal(cluster.core.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) + re.Equal(cluster.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) + re.Equal(cluster.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) + re.Equal(cluster.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) + re.Equal(cluster.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) } // Test with storage. @@ -1133,7 +1133,7 @@ func TestRegionLabelIsolationLevel(t *testing.T) { State: metapb.StoreState_Up, Labels: labels, } - re.NoError(cluster.putStoreLocked(core.NewStoreInfo(store))) + re.NoError(cluster.setStore(core.NewStoreInfo(store))) } peers := make([]*metapb.Peer, 0, 4) @@ -1296,7 +1296,7 @@ func TestOfflineAndMerge(t *testing.T) { // Put 4 stores. for _, store := range newTestStores(4, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } peers := []*metapb.Peer{ @@ -1351,7 +1351,7 @@ func TestStoreConfigUpdate(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) // Case1: big region. @@ -1436,7 +1436,7 @@ func TestSyncConfigContext(t *testing.T) { })) stores := newTestStores(1, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } // trip schema header now := time.Now() @@ -1458,7 +1458,7 @@ func TestStoreConfigSync(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) @@ -1503,7 +1503,7 @@ func TestUpdateStorePendingPeerCount(t *testing.T) { tc.RaftCluster.coordinator = schedule.NewCoordinator(ctx, tc.RaftCluster, nil) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } tc.RaftCluster.wg.Add(1) go tc.RaftCluster.runUpdateStoreStats() @@ -1678,7 +1678,7 @@ func TestCalculateStoreSize1(t *testing.T) { }, }...) 
s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1762,7 +1762,7 @@ func TestCalculateStoreSize2(t *testing.T) { } labels = append(labels, []*metapb.StoreLabel{{Key: "rack", Value: "r1"}, {Key: "host", Value: "h1"}}...) s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1812,7 +1812,7 @@ func TestStores(t *testing.T) { id := store.GetID() re.Nil(cache.GetStore(id)) re.Error(cache.PauseLeaderTransfer(id)) - cache.SetStore(store) + cache.PutStore(store) re.Equal(store, cache.GetStore(id)) re.Equal(i+1, cache.GetStoreCount()) re.NoError(cache.PauseLeaderTransfer(id)) @@ -1843,7 +1843,7 @@ func Test(t *testing.T) { _, opts, err := newTestScheduleConfig() re.NoError(err) tc := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opts, storage.NewStorageWithMemoryBackend()) - cache := tc.core + cache := tc.BasicCluster for i := uint64(0); i < n; i++ { region := regions[i] @@ -1961,7 +1961,7 @@ func TestAwakenStore(t *testing.T) { stores := newTestStores(n, "6.5.0") re.True(stores[0].NeedAwakenStore()) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } for i := uint64(1); i <= n; i++ { re.False(cluster.slowStat.ExistsSlowStores()) @@ -1971,7 +1971,7 @@ func TestAwakenStore(t *testing.T) { now := time.Now() store4 := stores[0].Clone(core.SetLastHeartbeatTS(now), core.SetLastAwakenTime(now.Add(-11*time.Minute))) - re.NoError(cluster.putStoreLocked(store4)) + re.NoError(cluster.setStore(store4)) store1 := cluster.GetStore(1) re.True(store1.NeedAwakenStore()) @@ -2013,7 +2013,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) stores := newTestStores(1, "6.5.1") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Empty(cluster.GetStore(1).GetLabels()) // Update label. @@ -2105,7 +2105,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { newStore := typeutil.DeepClone(cluster.GetStore(1).GetMeta(), core.StoreFactory) newStore.Labels = nil // Store rebooting will call PutStore. - err = cluster.PutStore(newStore) + err = cluster.PutMetaStore(newStore) re.NoError(err) // Check the label after rebooting. 
re.Equal([]*metapb.StoreLabel{{Key: "mode", Value: "readonly"}}, cluster.GetStore(1).GetLabels()) @@ -2142,7 +2142,7 @@ func newTestRaftCluster( s storage.Storage, ) *RaftCluster { opt.GetScheduleConfig().EnableHeartbeatConcurrentRunner = false - rc := &RaftCluster{serverCtx: ctx, core: core.NewBasicCluster(), storage: s} + rc := &RaftCluster{serverCtx: ctx, BasicCluster: core.NewBasicCluster(), storage: s} rc.InitCluster(id, opt, nil, nil) rc.ruleManager = placement.NewRuleManager(ctx, storage.NewStorageWithMemoryBackend(), rc, opt) if opt.IsPlacementRulesEnabled() { @@ -2151,7 +2151,7 @@ func newTestRaftCluster( panic(err) } } - rc.schedulingController = newSchedulingController(rc.ctx, rc.core, rc.opt, rc.ruleManager) + rc.schedulingController = newSchedulingController(rc.ctx, rc.BasicCluster, rc.opt, rc.ruleManager) return rc } @@ -2324,7 +2324,7 @@ func (c *testCluster) addRegionStore(storeID uint64, regionCount int, regionSize c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderRegion(regionID uint64, leaderStoreID uint64, followerStoreIDs ...uint64) error { @@ -2347,7 +2347,7 @@ func (c *testCluster) updateLeaderCount(storeID uint64, leaderCount int) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { @@ -2363,7 +2363,7 @@ func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreDown(storeID uint64) error { @@ -2374,7 +2374,7 @@ func (c *testCluster) setStoreDown(storeID uint64) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreOffline(storeID uint64) error { @@ -2382,7 +2382,7 @@ func (c *testCluster) setStoreOffline(storeID uint64) error { newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) LoadRegion(regionID uint64, followerStoreIDs ...uint64) error { @@ -2966,7 +2966,7 @@ func TestShouldRun(t *testing.T) { nr := &metapb.Region{Id: 6, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(7, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(7, tc.GetClusterNotFromStorageRegionsCnt()) } func TestShouldRunWithNonLeaderRegions(t *testing.T) { @@ -3009,7 +3009,7 @@ func TestShouldRunWithNonLeaderRegions(t *testing.T) { nr := &metapb.Region{Id: 9, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(9, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(9, tc.GetClusterNotFromStorageRegionsCnt()) // Now, after server is prepared, there exist some regions with no leader. 
re.Equal(uint64(0), tc.GetRegion(10).GetLeader().GetStoreId()) diff --git a/server/cluster/scheduling_controller.go b/server/cluster/scheduling_controller.go index 20d5a6bceaea..ca846eaa8858 100644 --- a/server/cluster/scheduling_controller.go +++ b/server/cluster/scheduling_controller.go @@ -195,7 +195,7 @@ func (sc *schedulingController) collectSchedulingMetrics() { // collect hot cache metrics sc.hotStat.CollectMetrics() // collect the lock metrics - sc.RegionsInfo.CollectWaitLockMetrics() + sc.CollectWaitLockMetrics() } func (sc *schedulingController) removeStoreStatistics(storeID uint64) { diff --git a/server/grpc_service.go b/server/grpc_service.go index 2b3ee232686b..acfc87fcf718 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -826,7 +826,7 @@ func (s *GrpcServer) PutStore(ctx context.Context, request *pdpb.PutStoreRequest }, nil } - if err := rc.PutStore(store); err != nil { + if err := rc.PutMetaStore(store); err != nil { return &pdpb.PutStoreResponse{ Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, err.Error()), }, nil diff --git a/server/server.go b/server/server.go index af9f48f8c9b0..1d38a5ee4956 100644 --- a/server/server.go +++ b/server/server.go @@ -1555,8 +1555,6 @@ func (s *Server) UpdateGRPCServiceRateLimiter(serviceLabel string, opts ...ratel // GetClusterStatus gets cluster status. func (s *Server) GetClusterStatus() (*cluster.Status, error) { - s.cluster.Lock() - defer s.cluster.Unlock() return s.cluster.LoadClusterStatus() } diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index cf2c6dd2508f..365ab1ca493e 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -498,19 +498,19 @@ func (suite *apiTestSuite) checkAdminRegionCacheForward(cluster *tests.TestClust apiServer := cluster.GetLeaderServer().GetServer() schedulingServer := cluster.GetSchedulingPrimaryServer() re.Equal(3, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) addr := cluster.GetLeaderServer().GetAddr() urlPrefix := fmt.Sprintf("%s/pd/api/v1/admin/cache/region", addr) err := testutil.CheckDelete(tests.TestDialClient, fmt.Sprintf("%s/%s", urlPrefix, "30"), testutil.StatusOK(re)) re.NoError(err) re.Equal(2, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) err = testutil.CheckDelete(tests.TestDialClient, urlPrefix+"s", testutil.StatusOK(re)) re.NoError(err) re.Equal(0, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) } func (suite *apiTestSuite) TestFollowerForward() { diff --git a/tests/integrations/mcs/scheduling/config_test.go b/tests/integrations/mcs/scheduling/config_test.go index d78833797312..54622d5c515b 100644 --- a/tests/integrations/mcs/scheduling/config_test.go +++ b/tests/integrations/mcs/scheduling/config_test.go @@ -175,7 +175,7 @@ func (suite *configTestSuite) TestSchedulerConfigWatch() { }) assertEvictLeaderStoreIDs(re, storage, []uint64{1}) // Update the scheduler by adding a store. 
- err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/integrations/mcs/scheduling/meta_test.go b/tests/integrations/mcs/scheduling/meta_test.go index abc1efd9021f..11782590ab95 100644 --- a/tests/integrations/mcs/scheduling/meta_test.go +++ b/tests/integrations/mcs/scheduling/meta_test.go @@ -79,7 +79,7 @@ func (suite *metaTestSuite) TestStoreWatch() { ) re.NoError(err) for i := uint64(1); i <= 4; i++ { - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: i, Address: fmt.Sprintf("mock-%d", i), State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano()}, ) } @@ -102,7 +102,7 @@ func (suite *metaTestSuite) TestStoreWatch() { }) // test synchronized store labels - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: 5, Address: "mock-5", State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano(), Labels: []*metapb.StoreLabel{{Key: "zone", Value: "z1"}}}, ) testutil.Eventually(re, func() bool { diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index 38c1cc6a41bd..82da47d18f3f 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -310,7 +310,7 @@ func (suite *serverTestSuite) TestSchedulerSync() { checkEvictLeaderSchedulerExist(re, schedulersController, true) checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) // Add a store_id to the evict-leader-scheduler through the API server. 
- err = suite.pdLeader.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeader.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/server/api/region_test.go b/tests/server/api/region_test.go index 2ff0b5d4b860..23ebceaefd60 100644 --- a/tests/server/api/region_test.go +++ b/tests/server/api/region_test.go @@ -407,7 +407,7 @@ func (suite *regionTestSuite) checkRegionsReplicated(cluster *tests.TestCluster) func checkRegionCount(re *require.Assertions, cluster *tests.TestCluster, count uint64) { leader := cluster.GetLeaderServer() tu.Eventually(re, func() bool { - return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count == int(count) + return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}) == int(count) }) if sche := cluster.GetSchedulingPrimaryServer(); sche != nil { tu.Eventually(re, func() bool { diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 61a4561c55a7..07bcf3ee2a19 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -601,7 +601,7 @@ func TestRaftClusterMultipleRestart(t *testing.T) { store := newMetaStore(storeID, "127.0.0.1:4", "2.1.0", metapb.StoreState_Offline, getTestDeployPath(storeID)) rc := leaderServer.GetRaftCluster() re.NotNil(rc) - err = rc.PutStore(store) + err = rc.PutMetaStore(store) re.NoError(err) re.NotNil(tc) rc.Stop() From b1cbc7151f40e6e34c0582820aca1463c8e8c8c4 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 28 May 2024 15:58:21 +0800 Subject: [PATCH 03/47] tests: fix the testify usage of re.Positive/Negative (#8221) ref tikv/pd#4399 Signed-off-by: JmPotato --- pkg/schedule/schedulers/evict_slow_trend_test.go | 2 +- pkg/statistics/hot_peer_cache_test.go | 2 +- tests/integrations/client/http_client_test.go | 4 ++-- tests/integrations/mcs/tso/keyspace_group_manager_test.go | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/schedule/schedulers/evict_slow_trend_test.go b/pkg/schedule/schedulers/evict_slow_trend_test.go index 834ef3376398..dd6807f4a853 100644 --- a/pkg/schedule/schedulers/evict_slow_trend_test.go +++ b/pkg/schedule/schedulers/evict_slow_trend_test.go @@ -105,7 +105,7 @@ func (suite *evictSlowTrendTestSuite) TestEvictSlowTrendBasicFuncs() { re.Equal(slowCandidate{}, es2.conf.evictCandidate) es2.conf.markCandidateRecovered() lastCapturedCandidate = es2.conf.lastCapturedCandidate() - re.Greater(lastCapturedCandidate.recoverTS.Compare(recoverTS), 0) + re.Positive(lastCapturedCandidate.recoverTS.Compare(recoverTS)) re.Equal(lastCapturedCandidate.storeID, store.GetID()) // Test capture another store 2 diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index c116e020f544..db215238604f 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -375,7 +375,7 @@ func TestUpdateHotPeerStat(t *testing.T) { cache.updateStat(newItem[0]) newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } - re.Less(newItem[0].HotDegree, 0) + re.Negative(newItem[0].HotDegree) re.Equal(0, newItem[0].AntiCount) re.Equal(utils.Remove, newItem[0].actionType) } diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 9e712b808f38..33652da9be00 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -174,11 +174,11 @@ func (suite *httpClientTestSuite) checkMeta(mode 
mode, client pd.Client) { re.Equal("INPROGRESS", state) regionStats, err := client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.NotEmpty(regionStats.StoreLeaderCount) regionStats, err = client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.Empty(regionStats.StoreLeaderCount) hotReadRegions, err := client.GetHotReadRegions(env.ctx) re.NoError(err) diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index f7b892ce77d7..25d9516bf632 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -300,7 +300,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplit() { // Check the split TSO from keyspace group `newID` now. splitTS, err := suite.requestTSO(re, 222, newID) re.NoError(err) - re.Greater(tsoutil.CompareTimestamp(&splitTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&splitTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) requestTSO( @@ -636,7 +636,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMerge() { } return err == nil && tsoutil.CompareTimestamp(&mergedTS, &pdpb.Timestamp{}) > 0 }, testutil.WithTickInterval(5*time.Second), testutil.WithWaitFor(time.Minute)) - re.Greater(tsoutil.CompareTimestamp(&mergedTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&mergedTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeClient() { From b7d8b94060e3c60693829574a40286b08d444f16 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 28 May 2024 17:52:21 +0800 Subject: [PATCH 04/47] controller: fix error retry and add more metrics (#8219) close tikv/pd#8217 controller: fix error retry and add more metrics Signed-off-by: nolouch Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../resource_group/controller/controller.go | 71 ++++++++++++------- .../controller/controller_test.go | 15 ++++ client/resource_group/controller/limiter.go | 27 ++++++- client/resource_group/controller/metrics.go | 18 ++++- 4 files changed, 101 insertions(+), 30 deletions(-) diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go index 11ea3f7997d9..1910e37eff8d 100755 --- a/client/resource_group/controller/controller.go +++ b/client/resource_group/controller/controller.go @@ -515,7 +515,7 @@ func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Contex request := gc.collectRequestAndConsumption(typ) if request != nil { c.run.currentRequests = append(c.run.currentRequests, request) - gc.tokenRequestCounter.Inc() + gc.metrics.tokenRequestCounter.Inc() } return true }) @@ -632,13 +632,9 @@ type groupCostController struct { calculators []ResourceCalculator handleRespFunc func(*rmpb.TokenBucketResponse) - successfulRequestDuration prometheus.Observer - failedLimitReserveDuration prometheus.Observer - requestRetryCounter prometheus.Counter - failedRequestCounter prometheus.Counter - tokenRequestCounter prometheus.Counter - - mu struct { + // metrics + metrics *groupMetricsCollection + mu struct { sync.Mutex consumption *rmpb.Consumption storeCounter map[uint64]*rmpb.Consumption @@ -685,6 +681,30 @@ type groupCostController struct { 
tombstone bool } +type groupMetricsCollection struct { + successfulRequestDuration prometheus.Observer + failedLimitReserveDuration prometheus.Observer + requestRetryCounter prometheus.Counter + failedRequestCounterWithOthers prometheus.Counter + failedRequestCounterWithThrottled prometheus.Counter + tokenRequestCounter prometheus.Counter +} + +func initMetrics(oldName, name string) *groupMetricsCollection { + const ( + otherType = "others" + throttledType = "throttled" + ) + return &groupMetricsCollection{ + successfulRequestDuration: successfulRequestDuration.WithLabelValues(oldName, name), + failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(oldName, name), + failedRequestCounterWithOthers: failedRequestCounter.WithLabelValues(oldName, name, otherType), + failedRequestCounterWithThrottled: failedRequestCounter.WithLabelValues(oldName, name, throttledType), + requestRetryCounter: requestRetryCounter.WithLabelValues(oldName, name), + tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(oldName, name), + } +} + type tokenCounter struct { getTokenBucketFunc func() *rmpb.TokenBucket @@ -725,16 +745,13 @@ func newGroupCostController( default: return nil, errs.ErrClientResourceGroupConfigUnavailable.FastGenByArgs("not supports the resource type") } + ms := initMetrics(group.Name, group.Name) gc := &groupCostController{ - meta: group, - name: group.Name, - mainCfg: mainCfg, - mode: group.GetMode(), - successfulRequestDuration: successfulRequestDuration.WithLabelValues(group.Name, group.Name), - failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(group.Name, group.Name), - failedRequestCounter: failedRequestCounter.WithLabelValues(group.Name, group.Name), - requestRetryCounter: requestRetryCounter.WithLabelValues(group.Name, group.Name), - tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(group.Name, group.Name), + meta: group, + name: group.Name, + mainCfg: mainCfg, + mode: group.GetMode(), + metrics: ms, calculators: []ResourceCalculator{ newKVCalculator(mainCfg), newSQLCalculator(mainCfg), @@ -789,7 +806,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RUMode: gc.run.requestUnitTokens = make(map[rmpb.RequestUnitType]*tokenCounter) for typ := range requestUnitLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -803,7 +820,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RawMode: gc.run.resourceTokens = make(map[rmpb.RawResourceType]*tokenCounter) for typ := range requestResourceLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -1233,7 +1250,7 @@ func (gc *groupCostController) onRequestWait( res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } case rmpb.GroupMode_RUMode: @@ -1243,18 +1260,20 @@ func (gc *groupCostController) onRequestWait( res = 
append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } } - gc.requestRetryCounter.Inc() + gc.metrics.requestRetryCounter.Inc() time.Sleep(gc.mainCfg.WaitRetryInterval) waitDuration += gc.mainCfg.WaitRetryInterval } if err != nil { - gc.failedRequestCounter.Inc() - if d.Seconds() > 0 { - gc.failedLimitReserveDuration.Observe(d.Seconds()) + if errs.ErrClientResourceGroupThrottled.Equal(err) { + gc.metrics.failedRequestCounterWithThrottled.Inc() + gc.metrics.failedLimitReserveDuration.Observe(d.Seconds()) + } else { + gc.metrics.failedRequestCounterWithOthers.Inc() } gc.mu.Lock() sub(gc.mu.consumption, delta) @@ -1264,7 +1283,7 @@ func (gc *groupCostController) onRequestWait( }) return nil, nil, waitDuration, 0, err } - gc.successfulRequestDuration.Observe(d.Seconds()) + gc.metrics.successfulRequestDuration.Observe(d.Seconds()) waitDuration += d } diff --git a/client/resource_group/controller/controller_test.go b/client/resource_group/controller/controller_test.go index fea4a133ad0b..4f4ec5927931 100644 --- a/client/resource_group/controller/controller_test.go +++ b/client/resource_group/controller/controller_test.go @@ -26,6 +26,7 @@ import ( rmpb "github.com/pingcap/kvproto/pkg/resource_manager" "github.com/stretchr/testify/require" + "github.com/tikv/pd/client/errs" ) func createTestGroupCostController(re *require.Assertions) *groupCostController { @@ -117,3 +118,17 @@ func TestRequestAndResponseConsumption(t *testing.T) { re.Equal(expectedConsumption.TotalCpuTimeMs, consumption.TotalCpuTimeMs, caseNum) } } + +func TestResourceGroupThrottledError(t *testing.T) { + re := require.New(t) + gc := createTestGroupCostController(re) + gc.initRunState() + req := &TestRequestInfo{ + isWrite: true, + writeBytes: 10000000, + } + // The group is throttled + _, _, _, _, err := gc.onRequestWait(context.TODO(), req) + re.Error(err) + re.True(errs.ErrClientResourceGroupThrottled.Equal(err)) +} diff --git a/client/resource_group/controller/limiter.go b/client/resource_group/controller/limiter.go index a726b0e219a7..2e42f591b8be 100644 --- a/client/resource_group/controller/limiter.go +++ b/client/resource_group/controller/limiter.go @@ -26,6 +26,7 @@ import ( "time" "github.com/pingcap/log" + "github.com/prometheus/client_golang/prometheus" "github.com/tikv/pd/client/errs" "go.uber.org/zap" ) @@ -81,6 +82,15 @@ type Limiter struct { isLowProcess bool // remainingNotifyTimes is used to limit notify when the speed limit is already set. remainingNotifyTimes int + name string + + // metrics + metrics *limiterMetricsCollection +} + +// limiterMetricsCollection is a collection of metrics for a limiter. +type limiterMetricsCollection struct { + lowTokenNotifyCounter prometheus.Counter } // Limit returns the maximum overall event rate. @@ -106,8 +116,9 @@ func NewLimiter(now time.Time, r Limit, b int64, tokens float64, lowTokensNotify // NewLimiterWithCfg returns a new Limiter that allows events up to rate r and permits // bursts of at most b tokens. 
-func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { +func NewLimiterWithCfg(name string, now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { lim := &Limiter{ + name: name, limit: Limit(cfg.NewRate), last: now, tokens: cfg.NewTokens, @@ -115,6 +126,9 @@ func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensN notifyThreshold: cfg.NotifyThreshold, lowTokensNotifyChan: lowTokensNotifyChan, } + lim.metrics = &limiterMetricsCollection{ + lowTokenNotifyCounter: lowTokenRequestNotifyCounter.WithLabelValues(lim.name), + } log.Debug("new limiter", zap.String("limiter", fmt.Sprintf("%+v", lim))) return lim } @@ -224,6 +238,14 @@ func (lim *Limiter) SetupNotificationThreshold(threshold float64) { lim.notifyThreshold = threshold } +// SetName sets the name of the limiter. +func (lim *Limiter) SetName(name string) *Limiter { + lim.mu.Lock() + defer lim.mu.Unlock() + lim.name = name + return lim +} + // notify tries to send a non-blocking notification on notifyCh and disables // further notifications (until the next Reconfigure or StartNotification). func (lim *Limiter) notify() { @@ -234,6 +256,9 @@ func (lim *Limiter) notify() { lim.isLowProcess = true select { case lim.lowTokensNotifyChan <- struct{}{}: + if lim.metrics != nil { + lim.metrics.lowTokenNotifyCounter.Inc() + } default: } } diff --git a/client/resource_group/controller/metrics.go b/client/resource_group/controller/metrics.go index 4261705a6f6a..30a0b850c7d0 100644 --- a/client/resource_group/controller/metrics.go +++ b/client/resource_group/controller/metrics.go @@ -24,6 +24,8 @@ const ( // TODO: remove old label in 8.x resourceGroupNameLabel = "name" newResourceGroupNameLabel = "resource_group" + + errType = "type" ) var ( @@ -40,7 +42,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "success", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600}, // 0.0005 ~ 1h Help: "Bucketed histogram of wait duration of successful request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -49,7 +51,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "limit_reserve_time_failed", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600, 86400}, // 0.0005 ~ 24h Help: "Bucketed histogram of wait duration of failed request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -59,7 +61,7 @@ var ( Subsystem: requestSubsystem, Name: "fail", Help: "Counter of failed request.", - }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, errType}) requestRetryCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -73,6 +75,7 @@ var ( prometheus.HistogramOpts{ Namespace: namespace, Subsystem: tokenRequestSubsystem, + Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 8s Name: "duration", Help: "Bucketed histogram of latency(s) of token request.", }, []string{"type"}) @@ -84,6 +87,14 @@ var ( Name: "resource_group", Help: "Counter of token request by every resource group.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + + lowTokenRequestNotifyCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: 
namespace, + Subsystem: tokenRequestSubsystem, + Name: "low_token_notified", + Help: "Counter of low token request.", + }, []string{newResourceGroupNameLabel}) ) var ( @@ -100,4 +111,5 @@ func init() { prometheus.MustRegister(requestRetryCounter) prometheus.MustRegister(tokenRequestDuration) prometheus.MustRegister(resourceGroupTokenRequestCounter) + prometheus.MustRegister(lowTokenRequestNotifyCounter) } From 4820bc5f505824a0212bc54c8a2d92cb5d3251dc Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 29 May 2024 15:26:52 +0800 Subject: [PATCH 05/47] OWNERS: Auto Sync OWNERS files from community membership (#8163) Signed-off-by: Ti Chi Robot Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- OWNERS | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000000..5911dfd3b663 --- /dev/null +++ b/OWNERS @@ -0,0 +1,26 @@ +# See the OWNERS docs at https://go.k8s.io/owners +approvers: + - AndreMouche + - binshi-bing + - bufferflies + - CabinfeverB + - Connor1996 + - disksing + - huachaohuang + - HunDunDM + - HuSharp + - JmPotato + - lhy1024 + - nolouch + - overvenus + - qiuyesuifeng + - rleungx + - siddontang + - Yisaer + - zhouqiang-cl +reviewers: + - BusyJay + - howardlau1999 + - Luffbee + - shafreeck + - xhebox From c498063583dabfbc35a1bb3198fe9224f806d744 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 29 May 2024 17:49:22 +0800 Subject: [PATCH 06/47] tests: test HTTP client initialization with different suites (#8224) ref tikv/pd#7300 Use different suites to test HTTP client initialization instead of maintaining different modes manually. Signed-off-by: JmPotato --- tests/integrations/client/http_client_test.go | 465 ++++++++---------- 1 file changed, 196 insertions(+), 269 deletions(-) diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 33652da9be00..fa109946e4ba 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -41,190 +41,169 @@ import ( "github.com/tikv/pd/tests" ) -type mode int - -// We have two ways to create HTTP client. -// 1. using `NewClient` which created `DefaultPDServiceDiscovery` -// 2. using `NewClientWithServiceDiscovery` which pass a `PDServiceDiscovery` as parameter -// test cases should be run in both modes. -const ( - defaultServiceDiscovery mode = iota - specificServiceDiscovery -) - type httpClientTestSuite struct { suite.Suite - env map[mode]*httpClientTestEnv + // 1. Using `NewClient` will create a `DefaultPDServiceDiscovery` internal. + // 2. Using `NewClientWithServiceDiscovery` will need a `PDServiceDiscovery` to be passed in. 
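The two comments above describe the construction paths this refactor covers. Rather than one suite that runs every test body twice through a mode parameter, the same suite type is now instantiated twice with a flag, and testify runs every test method once per instantiation. A self-contained sketch of that pattern with illustrative names (the real suite and its SetupSuite follow in the diff):

package example_test

import (
	"testing"

	"github.com/stretchr/testify/suite"
)

// exampleTestSuite mirrors the shape of httpClientTestSuite: one boolean
// selects the client construction path in SetupSuite, and every Test*
// method runs once for each instantiation below.
type exampleTestSuite struct {
	suite.Suite
	withServiceDiscovery bool
}

func (s *exampleTestSuite) SetupSuite() {
	if s.withServiceDiscovery {
		// build the HTTP client from an explicit service discovery instance
	} else {
		// build the HTTP client directly from the endpoint list
	}
}

func (s *exampleTestSuite) TestShared() {
	s.Require().True(true) // shared assertions run in both configurations
}

func TestExample(t *testing.T) {
	suite.Run(t, &exampleTestSuite{withServiceDiscovery: false})
}

func TestExampleWithServiceDiscovery(t *testing.T) {
	suite.Run(t, &exampleTestSuite{withServiceDiscovery: true})
}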
+ withServiceDiscovery bool + ctx context.Context + cancelFunc context.CancelFunc + cluster *tests.TestCluster + endpoints []string + client pd.Client } -type httpClientTestEnv struct { - ctx context.Context - cancelFunc context.CancelFunc - cluster *tests.TestCluster - endpoints []string +func TestHTTPClientTestSuite(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: false, + }) } -func TestHTTPClientTestSuite(t *testing.T) { - suite.Run(t, new(httpClientTestSuite)) +func TestHTTPClientTestSuiteWithServiceDiscovery(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: true, + }) } func (suite *httpClientTestSuite) SetupSuite() { - suite.env = make(map[mode]*httpClientTestEnv) re := suite.Require() + suite.ctx, suite.cancelFunc = context.WithCancel(context.Background()) - for _, mode := range []mode{defaultServiceDiscovery, specificServiceDiscovery} { - env := &httpClientTestEnv{} - env.ctx, env.cancelFunc = context.WithCancel(context.Background()) + cluster, err := tests.NewTestCluster(suite.ctx, 2) + re.NoError(err) - cluster, err := tests.NewTestCluster(env.ctx, 2) - re.NoError(err) + err = cluster.RunInitialServers() + re.NoError(err) + leader := cluster.WaitLeader() + re.NotEmpty(leader) + leaderServer := cluster.GetLeaderServer() - err = cluster.RunInitialServers() + err = leaderServer.BootstrapCluster() + re.NoError(err) + for _, region := range []*core.RegionInfo{ + core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), + core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), + } { + err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) re.NoError(err) - leader := cluster.WaitLeader() - re.NotEmpty(leader) - leaderServer := cluster.GetLeaderServer() - - err = leaderServer.BootstrapCluster() + } + var ( + testServers = cluster.GetServers() + endpoints = make([]string, 0, len(testServers)) + ) + for _, s := range testServers { + addr := s.GetConfig().AdvertiseClientUrls + url, err := url.Parse(addr) re.NoError(err) - for _, region := range []*core.RegionInfo{ - core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), - core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), - } { - err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) - re.NoError(err) - } - var ( - testServers = cluster.GetServers() - endpoints = make([]string, 0, len(testServers)) - ) - for _, s := range testServers { - addr := s.GetConfig().AdvertiseClientUrls - url, err := url.Parse(addr) - re.NoError(err) - endpoints = append(endpoints, url.Host) - } - env.endpoints = endpoints - env.cluster = cluster - - suite.env[mode] = env + endpoints = append(endpoints, url.Host) } -} - -func (suite *httpClientTestSuite) TearDownSuite() { - for _, env := range suite.env { - env.cancelFunc() - env.cluster.Destroy() + suite.endpoints = endpoints + suite.cluster = cluster + + if suite.withServiceDiscovery { + // Run test with specific service discovery. + cli := setupCli(suite.ctx, re, suite.endpoints) + sd := cli.GetServiceDiscovery() + suite.client = pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) + } else { + // Run test with default service discovery. + suite.client = pd.NewClient("pd-http-client-it-http", suite.endpoints) } } -// RunTestInTwoModes is to run test in two modes. -func (suite *httpClientTestSuite) RunTestInTwoModes(test func(mode mode, client pd.Client)) { - // Run test with specific service discovery. 
- cli := setupCli(suite.env[specificServiceDiscovery].ctx, suite.Require(), suite.env[specificServiceDiscovery].endpoints) - sd := cli.GetServiceDiscovery() - client := pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) - test(specificServiceDiscovery, client) - client.Close() - - // Run test with default service discovery. - client = pd.NewClient("pd-http-client-it-http", suite.env[defaultServiceDiscovery].endpoints) - test(defaultServiceDiscovery, client) - client.Close() +func (suite *httpClientTestSuite) TearDownSuite() { + suite.cancelFunc() + suite.client.Close() + suite.cluster.Destroy() } func (suite *httpClientTestSuite) TestMeta() { - suite.RunTestInTwoModes(suite.checkMeta) -} - -func (suite *httpClientTestSuite) checkMeta(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - replicateConfig, err := client.GetReplicateConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + replicateConfig, err := client.GetReplicateConfig(ctx) re.NoError(err) re.Equal(3.0, replicateConfig["max-replicas"]) - region, err := client.GetRegionByID(env.ctx, 10) + region, err := client.GetRegionByID(ctx, 10) re.NoError(err) re.Equal(int64(10), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a1")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.EndKey) - region, err = client.GetRegionByKey(env.ctx, []byte("a2")) + region, err = client.GetRegionByKey(ctx, []byte("a2")) re.NoError(err) re.Equal(int64(11), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a3")), region.EndKey) - regions, err := client.GetRegions(env.ctx) + regions, err := client.GetRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) + regions, err = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByStoreID(env.ctx, 1) + regions, err = client.GetRegionsByStoreID(ctx, 1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetEmptyRegions(env.ctx) + regions, err = client.GetEmptyRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - state, err := client.GetRegionsReplicatedStateByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) + state, err := client.GetRegionsReplicatedStateByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) re.NoError(err) re.Equal("INPROGRESS", state) - regionStats, err := client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) + regionStats, err := client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) re.NoError(err) re.Positive(regionStats.Count) re.NotEmpty(regionStats.StoreLeaderCount) - regionStats, err = client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) + regionStats, err = client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) re.NoError(err) re.Positive(regionStats.Count) re.Empty(regionStats.StoreLeaderCount) - hotReadRegions, err := client.GetHotReadRegions(env.ctx) + hotReadRegions, err := client.GetHotReadRegions(ctx) re.NoError(err) re.Len(hotReadRegions.AsPeer, 1) re.Len(hotReadRegions.AsLeader, 1) - 
hotWriteRegions, err := client.GetHotWriteRegions(env.ctx) + hotWriteRegions, err := client.GetHotWriteRegions(ctx) re.NoError(err) re.Len(hotWriteRegions.AsPeer, 1) re.Len(hotWriteRegions.AsLeader, 1) - historyHorRegions, err := client.GetHistoryHotRegions(env.ctx, &pd.HistoryHotRegionsRequest{ + historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{ StartTime: 0, EndTime: time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond), }) re.NoError(err) re.Empty(historyHorRegions.HistoryHotRegion) - store, err := client.GetStores(env.ctx) + store, err := client.GetStores(ctx) re.NoError(err) re.Equal(1, store.Count) re.Len(store.Stores, 1) storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different? - store2, err := client.GetStore(env.ctx, storeID) + store2, err := client.GetStore(ctx, storeID) re.NoError(err) re.EqualValues(storeID, store2.Store.ID) - version, err := client.GetClusterVersion(env.ctx) + version, err := client.GetClusterVersion(ctx) re.NoError(err) re.Equal("0.0.0", version) - rgs, _ := client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) + rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) re.Equal(int64(0), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) re.Equal(int64(2), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) re.Equal(int64(1), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) re.Equal(int64(2), rgs.Count) } func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() { - suite.RunTestInTwoModes(suite.checkGetMinResolvedTSByStoresIDs) -} - -func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() testMinResolvedTS := tsoutil.TimeToTS(time.Now()) - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() err := raftCluster.SetMinResolvedTS(1, testMinResolvedTS) re.NoError(err) // Make sure the min resolved TS is updated. @@ -233,18 +212,18 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl return minResolvedTS == testMinResolvedTS }) // Wait for the cluster-level min resolved TS to be initialized. - minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(env.ctx, nil) + minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(ctx, nil) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Empty(storeMinResolvedTSMap) // Get the store-level min resolved TS. - minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 1) re.Equal(minResolvedTS, storeMinResolvedTSMap[1]) // Get the store-level min resolved TS with an invalid store ID. 
- minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1, 2}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1, 2}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 2) @@ -253,22 +232,19 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl } func (suite *httpClientTestSuite) TestRule() { - suite.RunTestInTwoModes(suite.checkRule) -} - -func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - bundles, err := client.GetAllPlacementRuleBundles(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + bundles, err := client.GetAllPlacementRuleBundles(ctx) re.NoError(err) re.Len(bundles, 1) re.Equal(placement.DefaultGroupID, bundles[0].ID) - bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, placement.DefaultGroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(bundles[0], bundle) // Check if we have the default rule. - checkRuleResult(re, env, client, &pd.Rule{ + suite.checkRuleResult(ctx, re, &pd.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, Role: pd.Voter, @@ -277,7 +253,7 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { EndKey: []byte{}, }, 1, true) // Should be the same as the rules in the bundle. - checkRuleResult(re, env, client, bundle.Rules[0], 1, true) + suite.checkRuleResult(ctx, re, bundle.Rules[0], 1, true) testRule := &pd.Rule{ GroupID: placement.DefaultGroupID, ID: "test", @@ -286,39 +262,39 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte{}, EndKey: []byte{}, } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) - err = client.DeletePlacementRule(env.ctx, placement.DefaultGroupID, "test") + suite.checkRuleResult(ctx, re, testRule, 2, true) + err = client.DeletePlacementRule(ctx, placement.DefaultGroupID, "test") re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) + suite.checkRuleResult(ctx, re, testRule, 1, false) testRuleOp := &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpAdd, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) + suite.checkRuleResult(ctx, re, testRule, 2, true) testRuleOp = &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpDel, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) - err = client.SetPlacementRuleBundles(env.ctx, []*pd.GroupBundle{ + suite.checkRuleResult(ctx, re, testRule, 1, false) + err = client.SetPlacementRuleBundles(ctx, []*pd.GroupBundle{ { ID: placement.DefaultGroupID, Rules: []*pd.Rule{testRule}, }, }, true) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) - ruleGroups, err := client.GetAllPlacementRuleGroups(env.ctx) + suite.checkRuleResult(ctx, re, testRule, 1, true) + ruleGroups, err := client.GetAllPlacementRuleGroups(ctx) re.NoError(err) re.Len(ruleGroups, 1) re.Equal(placement.DefaultGroupID, ruleGroups[0].ID) - ruleGroup, 
err := client.GetPlacementRuleGroupByID(env.ctx, placement.DefaultGroupID) + ruleGroup, err := client.GetPlacementRuleGroupByID(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(ruleGroups[0], ruleGroup) testRuleGroup := &pd.RuleGroup{ @@ -326,14 +302,14 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { Index: 1, Override: true, } - err = client.SetPlacementRuleGroup(env.ctx, testRuleGroup) + err = client.SetPlacementRuleGroup(ctx, testRuleGroup) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) re.Equal(testRuleGroup, ruleGroup) - err = client.DeletePlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + err = client.DeletePlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Empty(ruleGroup) // Test the start key and end key. @@ -345,34 +321,33 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte("a1"), EndKey: []byte(""), } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) + suite.checkRuleResult(ctx, re, testRule, 1, true) } -func checkRuleResult( - re *require.Assertions, - env *httpClientTestEnv, - client pd.Client, +func (suite *httpClientTestSuite) checkRuleResult( + ctx context.Context, re *require.Assertions, rule *pd.Rule, totalRuleCount int, exist bool, ) { + client := suite.client if exist { - got, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + got, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.NoError(err) // skip comparison of the generated field got.StartKeyHex = rule.StartKeyHex got.EndKeyHex = rule.EndKeyHex re.Equal(rule, got) } else { - _, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + _, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) } // Check through the `GetPlacementRulesByGroup` API. - rules, err := client.GetPlacementRulesByGroup(env.ctx, rule.GroupID) + rules, err := client.GetPlacementRulesByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, rules, rule, totalRuleCount, exist) // Check through the `GetPlacementRuleBundleByGroup` API. 
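	// The bundle view of the group should agree with the per-group rule listing checked just above.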
- bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, rule.GroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, bundle.Rules, rule, totalRuleCount, exist) } @@ -400,14 +375,11 @@ func checkRuleFunc( } func (suite *httpClientTestSuite) TestRegionLabel() { - suite.RunTestInTwoModes(suite.checkRegionLabel) -} - -func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - labelRules, err := client.GetAllRegionLabelRules(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + labelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 1) re.Equal("keyspaces/0", labelRules[0].ID) @@ -418,9 +390,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) RuleType: "key-range", Data: labeler.MakeKeyRanges("1234", "5678"), } - err = client.SetRegionLabelRule(env.ctx, labelRule) + err = client.SetRegionLabelRule(ctx, labelRule) re.NoError(err) - labelRules, err = client.GetAllRegionLabelRules(env.ctx) + labelRules, err = client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(labelRules, func(i, j int) bool { @@ -440,9 +412,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) SetRules: []*pd.LabelRule{labelRule}, DeleteRules: []string{"rule1"}, } - err = client.PatchRegionLabelRules(env.ctx, patch) + err = client.PatchRegionLabelRules(ctx, patch) re.NoError(err) - allLabelRules, err := client.GetAllRegionLabelRules(env.ctx) + allLabelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(allLabelRules, func(i, j int) bool { @@ -451,7 +423,7 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) re.Equal(labelRule.ID, allLabelRules[1].ID) re.Equal(labelRule.Labels, allLabelRules[1].Labels) re.Equal(labelRule.RuleType, allLabelRules[1].RuleType) - labelRules, err = client.GetRegionLabelRulesByIDs(env.ctx, []string{"keyspaces/0", "rule2"}) + labelRules, err = client.GetRegionLabelRulesByIDs(ctx, []string{"keyspaces/0", "rule2"}) re.NoError(err) sort.Slice(labelRules, func(i, j int) bool { return labelRules[i].ID < labelRules[j].ID @@ -460,24 +432,21 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) } func (suite *httpClientTestSuite) TestAccelerateSchedule() { - suite.RunTestInTwoModes(suite.checkAccelerateSchedule) -} - -func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() suspectRegions := raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err := client.AccelerateSchedule(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) + err := client.AccelerateSchedule(ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) re.NoError(err) suspectRegions = raftCluster.GetSuspectRegions() re.Len(suspectRegions, 1) raftCluster.ClearSuspectRegions() suspectRegions = raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err = client.AccelerateScheduleInBatch(env.ctx, []*pd.KeyRange{ + err = client.AccelerateScheduleInBatch(ctx, []*pd.KeyRange{ pd.NewKeyRange([]byte("a1"), []byte("a2")), 
pd.NewKeyRange([]byte("a2"), []byte("a3")), }) @@ -487,24 +456,21 @@ func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.C } func (suite *httpClientTestSuite) TestConfig() { - suite.RunTestInTwoModes(suite.checkConfig) -} - -func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetConfig(ctx) re.NoError(err) re.Equal(float64(4), config["schedule"].(map[string]any)["leader-schedule-limit"]) newConfig := map[string]any{ "schedule.leader-schedule-limit": float64(8), } - err = client.SetConfig(env.ctx, newConfig) + err = client.SetConfig(ctx, newConfig) re.NoError(err) - config, err = client.GetConfig(env.ctx) + config, err = client.GetConfig(ctx) re.NoError(err) re.Equal(float64(8), config["schedule"].(map[string]any)["leader-schedule-limit"]) @@ -512,15 +478,15 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.leader-schedule-limit": float64(16), } - err = client.SetConfig(env.ctx, newConfig, 5) + err = client.SetConfig(ctx, newConfig, 5) re.NoError(err) - resp, err := env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err := suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Equal([]byte("16"), resp.Kvs[0].Value) // delete the config with TTL. - err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Empty(resp.Kvs) @@ -528,81 +494,72 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.max-pending-peer-count": uint64(math.MaxInt32), } - err = client.SetConfig(env.ctx, newConfig, 4) + err = client.SetConfig(ctx, newConfig, 4) re.NoError(err) - c := env.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() + c := suite.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() re.Equal(uint64(math.MaxInt32), c) - err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") re.NoError(err) re.Empty(resp.Kvs) } func (suite *httpClientTestSuite) TestScheduleConfig() { - suite.RunTestInTwoModes(suite.checkScheduleConfig) -} - -func (suite *httpClientTestSuite) checkScheduleConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetScheduleConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(4), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) config["hot-region-schedule-limit"] = float64(8) - err = client.SetScheduleConfig(env.ctx, config) + err = client.SetScheduleConfig(ctx, config) 
re.NoError(err) - config, err = client.GetScheduleConfig(env.ctx) + config, err = client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(8), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) } func (suite *httpClientTestSuite) TestSchedulers() { - suite.RunTestInTwoModes(suite.checkSchedulers) -} - -func (suite *httpClientTestSuite) checkSchedulers(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - schedulers, err := client.GetSchedulers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + schedulers, err := client.GetSchedulers(ctx) re.NoError(err) re.Empty(schedulers) - err = client.CreateScheduler(env.ctx, "evict-leader-scheduler", 1) + err = client.CreateScheduler(ctx, "evict-leader-scheduler", 1) re.NoError(err) - schedulers, err = client.GetSchedulers(env.ctx) + schedulers, err = client.GetSchedulers(ctx) re.NoError(err) re.Len(schedulers, 1) - err = client.SetSchedulerDelay(env.ctx, "evict-leader-scheduler", 100) + err = client.SetSchedulerDelay(ctx, "evict-leader-scheduler", 100) re.NoError(err) - err = client.SetSchedulerDelay(env.ctx, "not-exist", 100) + err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message } func (suite *httpClientTestSuite) TestSetStoreLabels() { - suite.RunTestInTwoModes(suite.checkSetStoreLabels) -} - -func (suite *httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - resp, err := client.GetStores(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + resp, err := client.GetStores(ctx) re.NoError(err) setStore := resp.Stores[0] re.Empty(setStore.Store.Labels, nil) storeLabels := map[string]string{ "zone": "zone1", } - err = client.SetStoreLabels(env.ctx, 1, storeLabels) + err = client.SetStoreLabels(ctx, 1, storeLabels) re.NoError(err) - resp, err = client.GetStores(env.ctx) + resp, err = client.GetStores(ctx) re.NoError(err) for _, store := range resp.Stores { if store.Store.ID == setStore.Store.ID { @@ -614,67 +571,52 @@ func (suite *httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Clien } func (suite *httpClientTestSuite) TestTransferLeader() { - suite.RunTestInTwoModes(suite.checkTransferLeader) -} - -func (suite *httpClientTestSuite) checkTransferLeader(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - members, err := client.GetMembers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + members, err := client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) - leader, err := client.GetLeader(env.ctx) + leader, err := client.GetLeader(ctx) re.NoError(err) // Transfer leader to another pd for _, member := range members.Members { if member.GetName() != leader.GetName() { - err = client.TransferLeader(env.ctx, member.GetName()) + err = client.TransferLeader(ctx, member.GetName()) re.NoError(err) break } } - newLeader := env.cluster.WaitLeader() + newLeader := suite.cluster.WaitLeader() re.NotEmpty(newLeader) re.NoError(err) re.NotEqual(leader.GetName(), newLeader) // Force to update the members info. 
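	// testutil.Eventually retries the closure below until it returns true or the wait times out,
	// since the client's view of the leader can briefly lag behind the election.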
testutil.Eventually(re, func() bool { - leader, err = client.GetLeader(env.ctx) + leader, err = client.GetLeader(ctx) re.NoError(err) return newLeader == leader.GetName() }) - members, err = client.GetMembers(env.ctx) + members, err = client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) re.Equal(leader.GetName(), members.Leader.GetName()) } func (suite *httpClientTestSuite) TestVersion() { - suite.RunTestInTwoModes(suite.checkVersion) -} - -func (suite *httpClientTestSuite) checkVersion(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - ver, err := client.GetPDVersion(env.ctx) + ver, err := suite.client.GetPDVersion(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, ver) } func (suite *httpClientTestSuite) TestStatus() { - suite.RunTestInTwoModes(suite.checkStatus) -} - -func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - status, err := client.GetStatus(env.ctx) + status, err := suite.client.GetStatus(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, status.Version) re.Equal(versioninfo.PDGitHash, status.GitHash) @@ -683,48 +625,41 @@ func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { } func (suite *httpClientTestSuite) TestAdmin() { - suite.RunTestInTwoModes(suite.checkAdmin) -} - -func (suite *httpClientTestSuite) checkAdmin(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - err := client.SetSnapshotRecoveringMark(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + err := client.SetSnapshotRecoveringMark(ctx) re.NoError(err) - err = client.ResetTS(env.ctx, 123, true) + err = client.ResetTS(ctx, 123, true) re.NoError(err) - err = client.ResetBaseAllocID(env.ctx, 456) + err = client.ResetBaseAllocID(ctx, 456) re.NoError(err) - err = client.DeleteSnapshotRecoveringMark(env.ctx) + err = client.DeleteSnapshotRecoveringMark(ctx) re.NoError(err) } func (suite *httpClientTestSuite) TestWithBackoffer() { - suite.RunTestInTwoModes(suite.checkWithBackoffer) -} - -func (suite *httpClientTestSuite) checkWithBackoffer(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() // Should return with 404 error without backoffer. - rule, err := client.GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + rule, err := client.GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) // Should return with 404 error even with an infinite backoffer. rule, err = client. WithBackoffer(retry.InitialBackoffer(100*time.Millisecond, time.Second, 0)). 
- GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) } func (suite *httpClientTestSuite) TestRedirectWithMetrics() { re := suite.Require() - env := suite.env[defaultServiceDiscovery] - cli := setupCli(env.ctx, suite.Require(), env.endpoints) + cli := setupCli(suite.ctx, re, suite.endpoints) defer cli.Close() sd := cli.GetServiceDiscovery() @@ -785,12 +720,10 @@ func (suite *httpClientTestSuite) TestRedirectWithMetrics() { } func (suite *httpClientTestSuite) TestUpdateKeyspaceGCManagementType() { - suite.RunTestInTwoModes(suite.checkUpdateKeyspaceGCManagementType) -} - -func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() keyspaceName := "DEFAULT" expectGCManagementType := "keyspace_level_gc" @@ -800,10 +733,10 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, GCManagementType: expectGCManagementType, }, } - err := client.UpdateKeyspaceGCManagementType(env.ctx, keyspaceName, &keyspaceSafePointVersionConfig) + err := client.UpdateKeyspaceGCManagementType(ctx, keyspaceName, &keyspaceSafePointVersionConfig) re.NoError(err) - keyspaceMetaRes, err := client.GetKeyspaceMetaByName(env.ctx, keyspaceName) + keyspaceMetaRes, err := client.GetKeyspaceMetaByName(ctx, keyspaceName) re.NoError(err) val, ok := keyspaceMetaRes.Config["gc_management_type"] @@ -813,14 +746,8 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, } func (suite *httpClientTestSuite) TestGetHealthStatus() { - suite.RunTestInTwoModes(suite.checkGetHealthStatus) -} - -func (suite *httpClientTestSuite) checkGetHealthStatus(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - healths, err := client.GetHealthStatus(env.ctx) + healths, err := suite.client.GetHealthStatus(suite.ctx) re.NoError(err) re.Len(healths, 2) sort.Slice(healths, func(i, j int) bool { From 52389b04f21726b54117ee29acf62923480ccbde Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 30 May 2024 15:07:21 +0800 Subject: [PATCH 07/47] simulator: make store,region,replica configurable in cases (#8215) ref tikv/pd#8135 Signed-off-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/configutil/configutil.go | 7 +++ tools/pd-simulator/main.go | 34 +++++------ .../pd-simulator/simulator/cases/add_nodes.go | 49 +++++++-------- .../simulator/cases/add_nodes_dynamic.go | 60 +++++++++---------- .../simulator/cases/balance_leader.go | 42 +++++++------ .../simulator/cases/balance_region.go | 45 +++++++------- tools/pd-simulator/simulator/cases/cases.go | 42 +++++-------- .../simulator/cases/delete_nodes.go | 55 +++++++++-------- .../cases/diagnose_label_isolation.go | 7 ++- .../simulator/cases/diagnose_rule.go | 5 +- .../pd-simulator/simulator/cases/hot_read.go | 32 +++++----- .../pd-simulator/simulator/cases/hot_write.go | 33 +++++----- .../simulator/cases/import_data.go | 33 +++++----- .../simulator/cases/makeup_down_replica.go | 55 +++++++---------- .../simulator/cases/region_merge.go | 41 ++++++------- .../simulator/cases/region_split.go | 25 ++++---- tools/pd-simulator/simulator/client.go | 5 +- .../simulator/{ => config}/config.go | 23 ++++--- tools/pd-simulator/simulator/conn.go | 3 +- 
tools/pd-simulator/simulator/drive.go | 11 ++-- tools/pd-simulator/simulator/node.go | 5 +- tools/pd-simulator/simulator/raft.go | 5 +- .../simulator/simutil/case_config.go | 34 ----------- tools/pd-simulator/simulator/task.go | 2 +- 24 files changed, 303 insertions(+), 350 deletions(-) rename tools/pd-simulator/simulator/{ => config}/config.go (85%) delete mode 100644 tools/pd-simulator/simulator/simutil/case_config.go diff --git a/pkg/utils/configutil/configutil.go b/pkg/utils/configutil/configutil.go index 2e7c74d9f8c9..086f74ff8426 100644 --- a/pkg/utils/configutil/configutil.go +++ b/pkg/utils/configutil/configutil.go @@ -171,3 +171,10 @@ func AdjustPath(p *string) { *p = absPath } } + +// AdjustBool adjusts the value of a bool variable. +func AdjustBool(v *bool, defValue bool) { + if !*v { + *v = defValue + } +} diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 73f4a0bba12b..04de914f5f0a 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -38,21 +38,19 @@ import ( "github.com/tikv/pd/tools/pd-analysis/analysis" "github.com/tikv/pd/tools/pd-simulator/simulator" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) var ( - pdAddr = flag.String("pd-endpoints", "", "pd address") - configFile = flag.String("config", "conf/simconfig.toml", "config file") - caseName = flag.String("case", "", "case name") - serverLogLevel = flag.String("serverLog", "info", "pd server log level") - simLogLevel = flag.String("simLog", "info", "simulator log level") - simLogFile = flag.String("log-file", "", "simulator log file") - regionNum = flag.Int("regionNum", 0, "regionNum of one store") - storeNum = flag.Int("storeNum", 0, "storeNum") - enableTransferRegionCounter = flag.Bool("enableTransferRegionCounter", false, "enableTransferRegionCounter") - statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") + pdAddr = flag.String("pd-endpoints", "", "pd address") + configFile = flag.String("config", "conf/simconfig.toml", "config file") + caseName = flag.String("case", "", "case name") + serverLogLevel = flag.String("serverLog", "info", "pd server log level") + simLogLevel = flag.String("simLog", "info", "simulator log level") + simLogFile = flag.String("log-file", "", "simulator log file") + statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") ) func main() { @@ -63,14 +61,12 @@ func main() { flag.Parse() simutil.InitLogger(*simLogLevel, *simLogFile) - simutil.InitCaseConfig(*storeNum, *regionNum, *enableTransferRegionCounter) statistics.Denoising = false - if simutil.CaseConfigure.EnableTransferRegionCounter { - analysis.GetTransferCounter().Init(simutil.CaseConfigure.StoreNum, simutil.CaseConfigure.RegionNum) - } - schedulers.Register() // register schedulers, which is needed by simConfig.Adjust - simConfig := simulator.NewSimConfig(*serverLogLevel) + simConfig := sc.NewSimConfig(*serverLogLevel) + if simConfig.EnableTransferRegionCounter { + analysis.GetTransferCounter().Init(simConfig.TotalStore, simConfig.TotalRegion) + } var meta toml.MetaData var err error if *configFile != "" { @@ -97,7 +93,7 @@ func main() { } } -func run(simCase string, simConfig *simulator.SimConfig) { +func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { go runHTTPServer() simStart(*pdAddr, simCase, simConfig) @@ -136,7 +132,7 @@ func runHTTPServer() { } // NewSingleServer creates 
a pd server for simulator. -func NewSingleServer(ctx context.Context, simConfig *simulator.SimConfig) (*server.Server, testutil.CleanupFunc) { +func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) if err == nil { log.ReplaceGlobals(simConfig.ServerConfig.Logger, simConfig.ServerConfig.LogProps) @@ -161,7 +157,7 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *simulator.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) if err != nil { diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go index 241b34a94735..5c73fe9764c9 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ b/tools/pd-simulator/simulator/cases/add_nodes.go @@ -15,35 +15,35 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodes() *Case { +func newAddNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -54,21 +54,18 @@ func newAddNodes() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, 
totalRegion*replica/totalStore) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go index 59b0b54e1ca4..aa585b489235 100644 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go @@ -15,24 +15,22 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newAddNodesDynamic() *Case { +func newAddNodesDynamic(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - for i := 1; i <= int(noEmptyStoreNum); i++ { + for i := 0; i < noEmptyStoreNum; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -40,15 +38,17 @@ func newAddNodesDynamic() *Case { } var ids []uint64 - for i := 1; i <= storeNum-int(noEmptyStoreNum); i++ { + for i := 0; i < totalStore-noEmptyStoreNum; i++ { ids = append(ids, IDAllocator.nextID()) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%noEmptyStoreNum + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -59,11 +59,11 @@ func newAddNodesDynamic() *Case { }) } - numNodes := int(noEmptyStoreNum) + currentStoreCount := noEmptyStoreNum e := &AddNodesDescriptor{} e.Step = func(tick int64) uint64 { - if tick%100 == 0 && numNodes < storeNum { - numNodes++ + if tick%100 == 0 && currentStoreCount < totalStore { + currentStoreCount++ nodeID := ids[0] ids = append(ids[:0], ids[1:]...) 
return nodeID @@ -72,21 +72,21 @@ func newAddNodesDynamic() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == storeNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) - for i := 1; i <= numNodes; i++ { + if currentStoreCount != totalStore { + return false + } + for i := 1; i <= currentStoreCount; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) + peerCount := regions.GetStoreRegionCount(uint64(i)) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } + if !isUniform(peerCount, totalRegion*replica/totalStore) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index bbc7ce97f687..c5315f85d8e7 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -18,28 +18,35 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newBalanceLeader() *Case { +func newBalanceLeader(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeNum)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%(storeNum-1)) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%(storeNum-1)) + 1}, + leaderStoreID := simCase.Stores[totalStore-1].ID + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: leaderStoreID, + }) + for j := 1; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%(totalStore-1) + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -50,17 +57,14 @@ func newBalanceLeader() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - res = res && isUniform(leaderCount, regionNum/3, threshold) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts)) - return res + return 
true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index 3b0c46f1670a..a559a335c97a 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -19,21 +19,18 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRedundantBalanceRegion() *Case { +func newRedundantBalanceRegion(config *sc.SimConfig) *Case { var simCase Case - storeNum := simutil.CaseConfigure.StoreNum - regionNum := simutil.CaseConfigure.RegionNum - if storeNum == 0 || regionNum == 0 { - storeNum, regionNum = 6, 4000 - } + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - for i := 0; i < storeNum; i++ { + for i := 0; i < totalStore; i++ { s := &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, @@ -44,11 +41,13 @@ func newRedundantBalanceRegion() *Case { simCase.Stores = append(simCase.Stores, s) } - for i := 0; i < regionNum; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,30 +56,26 @@ func newRedundantBalanceRegion() *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) + storesLastUpdateTime := make([]int64, totalStore+1) + storeLastAvailable := make([]uint64, totalStore+1) simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { - res := true curTime := time.Now().Unix() - storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { available := stats[i].GetAvailable() - storesAvailable = append(storesAvailable, available) if curTime-storesLastUpdateTime[i] > 60 { if storeLastAvailable[i] != available { - res = false + return false } if stats[i].ToCompactionSize != 0 { - res = false + return false } storesLastUpdateTime[i] = curTime storeLastAvailable[i] = available } else { - res = false + return false } } - simutil.Logger.Info("current counts", zap.Uint64s("storesAvailable", storesAvailable)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 0a8967a8d866..f2e79a819248 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -15,12 +15,14 @@ package cases import ( + "math/rand" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) // Store is used to simulate tikv. 
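The rewritten simulator cases above replace the hard-coded three-peer literals with a loop driven by the configured replica count, spreading each region's peers across consecutive stores. A small, runnable sketch of just that placement rule (identifiers here are illustrative; the real loops build *metapb.Peer values as shown in the diff):

package main

import "fmt"

// assignPeerStores returns the 1-based store IDs that region i's peers land
// on when `replica` peers are spread round-robin over `totalStore` stores,
// mirroring the loops introduced for the simulator cases.
func assignPeerStores(i, replica, totalStore int) []uint64 {
	stores := make([]uint64, 0, replica)
	for j := 0; j < replica; j++ {
		stores = append(stores, uint64((i+j)%totalStore+1))
	}
	return stores
}

func main() {
	// Region 0 with 3 replicas over 5 stores lands on stores 1, 2 and 3;
	// region 1 lands on stores 2, 3 and 4, and so on.
	fmt.Println(assignPeerStores(0, 3, 5))
	fmt.Println(assignPeerStores(1, 3, 5))
}

The case checkers then only need to compare per-store leader counts against totalRegion/totalStore and per-store peer counts against totalRegion*replica/totalStore, using the ±5% isUniform helper defined in the cases.go hunk that follows.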
@@ -86,7 +88,7 @@ func (a *idAllocator) GetID() uint64 { var IDAllocator idAllocator // CaseMap is a mapping of the cases to the their corresponding initialize functions. -var CaseMap = map[string]func() *Case{ +var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, "add-nodes": newAddNodes, @@ -106,43 +108,27 @@ var CaseMap = map[string]func() *Case{ } // NewCase creates a new case. -func NewCase(name string) *Case { +func NewCase(name string, simConfig *config.SimConfig) *Case { if f, ok := CaseMap[name]; ok { - return f() + return f(simConfig) } return nil } -func leaderAndRegionIsUniform(leaderCount, regionCount, regionNum int, threshold float64) bool { - return isUniform(leaderCount, regionNum/3, threshold) && isUniform(regionCount, regionNum, threshold) -} - -func isUniform(count, meanCount int, threshold float64) bool { +func isUniform(count, meanCount int) bool { + threshold := 0.05 maxCount := int((1.0 + threshold) * float64(meanCount)) minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } -func getStoreNum() int { - storeNum := simutil.CaseConfigure.StoreNum - if storeNum < 3 { - simutil.Logger.Fatal("store num should be larger than or equal to 3") - } - return storeNum -} - -func getRegionNum() int { - regionNum := simutil.CaseConfigure.RegionNum - if regionNum <= 0 { - simutil.Logger.Fatal("region num should be larger than 0") +func getNoEmptyStoreNum(storeNum int, replica int) int { + noEmptyStoreNum := rand.Intn(storeNum) + if noEmptyStoreNum < replica { + return replica } - return regionNum -} - -func getNoEmptyStoreNum(storeNum int, noEmptyRatio float64) uint64 { - noEmptyStoreNum := uint64(float64(storeNum) * noEmptyRatio) - if noEmptyStoreNum < 3 || noEmptyStoreNum == uint64(storeNum) { - noEmptyStoreNum = 3 + if noEmptyStoreNum == storeNum { + return storeNum - 1 } return noEmptyStoreNum } diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go index 4ba8e5064a4c..80650cf109d2 100644 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ b/tools/pd-simulator/simulator/cases/delete_nodes.go @@ -20,28 +20,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newDeleteNodes() *Case { +func newDeleteNodes(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + noEmptyStoreNum := totalStore - 1 + for i := 1; i <= totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: 
IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -57,12 +60,12 @@ func newDeleteNodes() *Case { ids = append(ids, store.ID) } - numNodes := storeNum + currentStoreCount := totalStore e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { - if numNodes > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(numNodes) - numNodes-- + if currentStoreCount > noEmptyStoreNum && tick%100 == 0 { + idx := rand.Intn(currentStoreCount) + currentStoreCount-- nodeID := ids[idx] ids = append(ids[:idx], ids[idx+1:]...) return nodeID @@ -71,21 +74,21 @@ func newDeleteNodes() *Case { } simCase.Events = []EventDescriptor{e} - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == noEmptyStoreNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) + if currentStoreCount != noEmptyStoreNum { + return false + } for _, i := range ids { leaderCount := regions.GetStoreLeaderCount(i) - regionCount := regions.GetStoreRegionCount(i) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum*storeNum/noEmptyStoreNum, threshold) + peerCount := regions.GetStoreRegionCount(i) + if !isUniform(leaderCount, totalRegion/noEmptyStoreNum) { + return false + } + if !isUniform(peerCount, totalRegion*replica/noEmptyStoreNum) { + return false + } } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go index 7fa50e56197b..090371366083 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go +++ b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go @@ -21,12 +21,13 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newLabelNotMatch1() *Case { +func newLabelNotMatch1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -88,7 +89,7 @@ func newLabelNotMatch1() *Case { return &simCase } -func newLabelIsolation1() *Case { +func newLabelIsolation1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -154,7 +155,7 @@ func newLabelIsolation1() *Case { return &simCase } -func newLabelIsolation2() *Case { +func newLabelIsolation2(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"dc", "zone", "host"} diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 15c5942d810e..5d34e051071c 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -21,12 +21,13 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newRule1() *Case { 
+func newRule1(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) @@ -126,7 +127,7 @@ func newRule1() *Case { return &simCase } -func newRule2() *Case { +func newRule2(_ *sc.SimConfig) *Case { var simCase Case simCase.Rules = make([]*placement.Rule, 0) diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index d4ec6831d95d..50ad08d6011b 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -15,35 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotRead() *Case { +func newHotRead(config *sc.SimConfig) *Case { var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -56,7 +55,7 @@ func newHotRead() *Case { // Events description // select regions on store 1 as hot read regions. - selectRegionNum := 4 * storeNum + selectRegionNum := 4 * totalStore readFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -73,12 +72,11 @@ func newHotRead() *Case { simCase.Events = []EventDescriptor{e} // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) for id := range readFlow { leaderStore := regions.GetRegion(id).GetLeader().GetStoreId() leaderCount[int(leaderStore-1)]++ } - simutil.Logger.Info("current hot region counts", zap.Reflect("hot-region", leaderCount)) // check count diff < 2. 
var min, max int diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index 8428afa75b58..a30afd1a8ec2 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -15,34 +15,34 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotWrite() *Case { +func newHotWrite(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - storeNum, regionNum := getStoreNum(), getRegionNum() // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -55,7 +55,7 @@ func newHotWrite() *Case { // Events description // select regions on store 1 as hot write regions. - selectStoreNum := storeNum + selectStoreNum := totalStore writeFlow := make(map[uint64]int64, selectStoreNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -74,8 +74,8 @@ func newHotWrite() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) - peerCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) + peerCount := make([]int, totalStore) for id := range writeFlow { region := regions.GetRegion(id) leaderCount[int(region.GetLeader().GetStoreId()-1)]++ @@ -83,7 +83,6 @@ func newHotWrite() *Case { peerCount[int(p.GetStoreId()-1)]++ } } - simutil.Logger.Info("current hot region counts", zap.Reflect("leader", leaderCount), zap.Reflect("peer", peerCount)) // check count diff <= 2. 
var minLeader, maxLeader, minPeer, maxPeer int diff --git a/tools/pd-simulator/simulator/cases/import_data.go b/tools/pd-simulator/simulator/cases/import_data.go index 6cf3b79a736f..b9f448a6cf6e 100644 --- a/tools/pd-simulator/simulator/cases/import_data.go +++ b/tools/pd-simulator/simulator/cases/import_data.go @@ -17,7 +17,6 @@ package cases import ( "bytes" "fmt" - "math/rand" "os" "github.com/docker/go-units" @@ -26,27 +25,33 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/codec" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newImportData() *Case { +func newImportData(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + // Initialize the cluster - for i := 1; i <= 10; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < getRegionNum(); i++ { - storeIDs := rand.Perm(10) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -65,7 +70,7 @@ func newImportData() *Case { table12 := string(codec.EncodeBytes(codec.GenerateTableKey(12))) table13 := string(codec.EncodeBytes(codec.GenerateTableKey(13))) e.Step = func(tick int64) map[string]int64 { - if tick > int64(getRegionNum())/10 { + if tick > int64(totalRegion)/10 { return nil } return map[string]int64{ @@ -141,14 +146,14 @@ func newImportData() *Case { if dev > 0.02 { simutil.Logger.Warn("Not balanced, change scheduler or store limit", zap.Float64("dev score", dev)) } - if checkCount > uint64(getRegionNum())/5 { + if checkCount > uint64(totalRegion)/5 { isEnd = true - } else if checkCount > uint64(getRegionNum())/10 { + } else if checkCount > uint64(totalRegion)/10 { isEnd = dev < 0.01 } if isEnd { - renderPlot("new_region.html", newRegionCount, int(checkCount), 0, getRegionNum()/10) - renderPlot("all_region.html", allRegionCount, int(checkCount), 28*getRegionNum()/100, getRegionNum()/3) + renderPlot("new_region.html", newRegionCount, int(checkCount), 0, totalRegion/10) + renderPlot("all_region.html", allRegionCount, int(checkCount), 28*totalRegion/100, totalRegion/3) } return isEnd } diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 86c9b4cac1da..28de9577cfc1 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -18,27 +18,31 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newMakeupDownReplicas() *Case { +func 
newMakeupDownReplicas(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + noEmptyStoreNum := totalStore - 1 + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64((i)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -49,7 +53,7 @@ func newMakeupDownReplicas() *Case { }) } - numNodes := storeNum + numNodes := totalStore down := false e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { @@ -65,31 +69,16 @@ func newMakeupDownReplicas() *Case { simCase.Events = []EventDescriptor{e} simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount - } - simutil.Logger.Info("current region counts", zap.Ints("region", regionCounts)) - - if down && sum < storeNum*regionNum { - // only need to print once - down = false - simutil.Logger.Error("making up replicas don't start immediately") + if !down { return false } - - res := true - threshold := 0.05 - for index, regionCount := range regionCounts { - if index == 0 { // storeId == 1 - continue + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if isUniform(peerCount, replica*totalRegion/noEmptyStoreNum) { + return false } - res = res && isUniform(regionCount, storeNum*regionNum/noEmptyStoreNum, threshold) } - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 3d5d57f804fc..953b0e309e19 100644 --- a/tools/pd-simulator/simulator/cases/region_merge.go +++ b/tools/pd-simulator/simulator/cases/region_merge.go @@ -15,33 +15,33 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionMerge() *Case { +func newRegionMerge(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum, regionNum := getStoreNum(), getRegionNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := 
rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -52,18 +52,13 @@ func newRegionMerge() *Case { }) } // Checker description - threshold := 0.05 mergeRatio := 4 // when max-merge-region-size is 20, per region will reach 40MB simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount + currentPeerCount := 0 + for i := 1; i <= totalStore; i++ { + currentPeerCount += regions.GetStoreRegionCount(uint64(i)) } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts), zap.Int64("average region size", regions.GetAverageRegionSize())) - return isUniform(sum, storeNum*regionNum/mergeRatio, threshold) + return isUniform(currentPeerCount, totalRegion*replica/mergeRatio) } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_split.go b/tools/pd-simulator/simulator/cases/region_split.go index b85cd319494f..7b712f4dc483 100644 --- a/tools/pd-simulator/simulator/cases/region_split.go +++ b/tools/pd-simulator/simulator/cases/region_split.go @@ -18,16 +18,15 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionSplit() *Case { +func newRegionSplit(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum := getStoreNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: uint64(i), Status: metapb.StoreState_Up, @@ -57,15 +56,13 @@ func newRegionSplit() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - res = res && regionCount > 5 + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if peerCount < 5 { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 808c991e97f6..50ed57995dfb 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -30,6 +30,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" "google.golang.org/grpc" @@ 
-45,7 +46,7 @@ type Client interface { PutStore(ctx context.Context, store *metapb.Store) error StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error - PutPDConfig(*PDConfig) error + PutPDConfig(*sc.PDConfig) error Close() } @@ -316,7 +317,7 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { return nil } -func (c *client) PutPDConfig(config *PDConfig) error { +func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) ruleOps := make([]*placement.RuleOp, 0) diff --git a/tools/pd-simulator/simulator/config.go b/tools/pd-simulator/simulator/config/config.go similarity index 85% rename from tools/pd-simulator/simulator/config.go rename to tools/pd-simulator/simulator/config/config.go index 4f197fb83c22..01bf8199ab42 100644 --- a/tools/pd-simulator/simulator/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package simulator +package config import ( "fmt" @@ -31,8 +31,11 @@ import ( ) const ( - // tick - defaultSimTickInterval = 100 * time.Millisecond + // simulator + defaultSimTickInterval = 100 * time.Millisecond + defaultTotalStore = 3 + defaultTotalRegion = 1000 + defaultEnableTransferRegionCounter = false // store defaultStoreIOMBPerSecond = 40 defaultStoreHeartbeat = 10 * time.Second @@ -53,9 +56,12 @@ const ( // SimConfig is the simulator configuration. type SimConfig struct { - // tick - CaseName string `toml:"case-name"` - SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` + // Simulator + CaseName string `toml:"case-name"` + TotalStore int `toml:"total-store"` + TotalRegion int `toml:"total-region"` + EnableTransferRegionCounter bool `toml:"enable-transfer-region-counter"` + SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` // store StoreIOMBPerSecond int64 `toml:"store-io-per-second"` StoreVersion string `toml:"store-version"` @@ -99,6 +105,9 @@ func NewSimConfig(serverLogLevel string) *SimConfig { // Adjust is used to adjust configurations func (sc *SimConfig) Adjust(meta *toml.MetaData) error { configutil.AdjustDuration(&sc.SimTickInterval, defaultSimTickInterval) + configutil.AdjustInt(&sc.TotalStore, defaultTotalStore) + configutil.AdjustInt(&sc.TotalRegion, defaultTotalRegion) + configutil.AdjustBool(&sc.EnableTransferRegionCounter, defaultEnableTransferRegionCounter) configutil.AdjustInt64(&sc.StoreIOMBPerSecond, defaultStoreIOMBPerSecond) configutil.AdjustString(&sc.StoreVersion, versioninfo.PDReleaseVersion) configutil.AdjustDuration(&sc.RaftStore.RegionHeartBeatInterval, defaultRegionHeartbeat) @@ -118,7 +127,7 @@ func (sc *SimConfig) Adjust(meta *toml.MetaData) error { return sc.ServerConfig.Adjust(meta, false) } -func (sc *SimConfig) speed() uint64 { +func (sc *SimConfig) Speed() uint64 { return uint64(time.Second / sc.SimTickInterval.Duration) } diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index 588fec246d4c..b95b33ee63df 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -17,6 +17,7 @@ package simulator import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" ) // Connection records the information of connection among 
nodes. @@ -26,7 +27,7 @@ type Connection struct { } // NewConnection creates nodes according to the configuration and returns the connection among nodes. -func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *SimConfig) (*Connection, error) { +func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *config.SimConfig) (*Connection, error) { conn := &Connection{ pdAddr: pdAddr, Nodes: make(map[uint64]*Node), diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index c7f64324c19d..3d2bce746750 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.etcd.io/etcd/clientv3" @@ -42,17 +43,17 @@ type Driver struct { eventRunner *EventRunner raftEngine *RaftEngine conn *Connection - simConfig *SimConfig - pdConfig *PDConfig + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *SimConfig) (*Driver, error) { - simCase := cases.NewCase(caseName) +func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Driver, error) { + simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) } - pdConfig := &PDConfig{} + pdConfig := &config.PDConfig{} pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 68a10a8638e1..883b5d4474b6 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -27,6 +27,7 @@ import ( "github.com/tikv/pd/pkg/ratelimit" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" @@ -57,7 +58,7 @@ type Node struct { } // NewNode returns a Node. 
-func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { +func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) { ctx, cancel := context.WithCancel(context.Background()) store := &metapb.Store{ Id: s.ID, @@ -93,7 +94,7 @@ func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { cancel() return nil, err } - ratio := config.speed() + ratio := config.Speed() speed := config.StoreIOMBPerSecond * units.MiB * int64(ratio) return &Node{ Store: store, diff --git a/tools/pd-simulator/simulator/raft.go b/tools/pd-simulator/simulator/raft.go index fccf75781d39..d416f69ff801 100644 --- a/tools/pd-simulator/simulator/raft.go +++ b/tools/pd-simulator/simulator/raft.go @@ -22,6 +22,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) @@ -34,12 +35,12 @@ type RaftEngine struct { regionChange map[uint64][]uint64 regionSplitSize int64 regionSplitKeys int64 - storeConfig *SimConfig + storeConfig *config.SimConfig useTiDBEncodedKey bool } // NewRaftEngine creates the initialized raft with the configuration. -func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *SimConfig) *RaftEngine { +func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *config.SimConfig) *RaftEngine { r := &RaftEngine{ regionsInfo: core.NewRegionsInfo(), conn: conn, diff --git a/tools/pd-simulator/simulator/simutil/case_config.go b/tools/pd-simulator/simulator/simutil/case_config.go deleted file mode 100644 index a34035c15aa3..000000000000 --- a/tools/pd-simulator/simulator/simutil/case_config.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2019 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package simutil - -// CaseConfig is to save flags -type CaseConfig struct { - StoreNum int - RegionNum int - EnableTransferRegionCounter bool -} - -// CaseConfigure is an global instance for CaseConfig -var CaseConfigure *CaseConfig - -// InitCaseConfig is to init caseConfigure -func InitCaseConfig(storeNum, regionNum int, enableTransferRegionCounter bool) { - CaseConfigure = &CaseConfig{ - StoreNum: storeNum, - RegionNum: regionNum, - EnableTransferRegionCounter: enableTransferRegionCounter, - } -} diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index a19854b53ba3..c0bfa1e691b8 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -415,7 +415,7 @@ func (a *addPeer) tick(engine *RaftEngine, region *core.RegionInfo) (newRegion * pendingPeers := append(region.GetPendingPeers(), a.peer) return region.Clone(core.WithAddPeer(a.peer), core.WithIncConfVer(), core.WithPendingPeers(pendingPeers)), false } - speed := engine.storeConfig.speed() + speed := engine.storeConfig.Speed() // Step 2: Process Snapshot if !processSnapshot(sendNode, a.sendingStat, speed) { return nil, false From 71490f72b4c57a70f4f5b4e3486018859f85189c Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 30 May 2024 16:46:21 +0800 Subject: [PATCH 08/47] pkg/member: Fixing residual counts in campaign times (#8226) close tikv/pd#8225 Signed-off-by: husharp --- pkg/election/leadership.go | 15 +++++++------- pkg/election/leadership_test.go | 33 ++++++++++++++++++++++++++++++ pkg/member/member.go | 3 ++- tests/server/member/member_test.go | 10 +++++++-- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/pkg/election/leadership.go b/pkg/election/leadership.go index 02f519dbc75d..3ee413818a50 100644 --- a/pkg/election/leadership.go +++ b/pkg/election/leadership.go @@ -34,11 +34,12 @@ import ( ) const ( - defaultCampaignTimesSlot = 10 - watchLoopUnhealthyTimeout = 60 * time.Second - campaignTimesRecordTimeout = 5 * time.Minute + defaultCampaignTimesSlot = 10 + watchLoopUnhealthyTimeout = 60 * time.Second ) +var campaignTimesRecordTimeout = 5 * time.Minute + // GetLeader gets the corresponding leader from etcd by given leaderPath (as the key). func GetLeader(c *clientv3.Client, leaderPath string) (*pdpb.Member, int64, error) { leader := &pdpb.Member{} @@ -114,6 +115,7 @@ func (ls *Leadership) GetLeaderKey() string { } // GetCampaignTimesNum is used to get the campaign times of the leader within `campaignTimesRecordTimeout`. +// Need to make sure `AddCampaignTimes` is called before this function. func (ls *Leadership) GetCampaignTimesNum() int { if ls == nil { return 0 @@ -129,8 +131,8 @@ func (ls *Leadership) ResetCampaignTimes() { ls.campaignTimes = make([]time.Time, 0, defaultCampaignTimesSlot) } -// addCampaignTimes is used to add the campaign times of the leader. -func (ls *Leadership) addCampaignTimes() { +// AddCampaignTimes is used to add the campaign times of the leader. 
+func (ls *Leadership) AddCampaignTimes() { if ls == nil { return } @@ -138,7 +140,7 @@ func (ls *Leadership) addCampaignTimes() { if time.Since(ls.campaignTimes[i]) > campaignTimesRecordTimeout { // remove the time which is more than `campaignTimesRecordTimeout` // array is sorted by time - ls.campaignTimes = ls.campaignTimes[i:] + ls.campaignTimes = ls.campaignTimes[i+1:] break } } @@ -148,7 +150,6 @@ func (ls *Leadership) addCampaignTimes() { // Campaign is used to campaign the leader with given lease and returns a leadership func (ls *Leadership) Campaign(leaseTimeout int64, leaderData string, cmps ...clientv3.Cmp) error { - ls.addCampaignTimes() ls.leaderValue = leaderData // Create a new lease to campaign newLease := &lease{ diff --git a/pkg/election/leadership_test.go b/pkg/election/leadership_test.go index 1fde4ddeba7e..40f0bcbee239 100644 --- a/pkg/election/leadership_test.go +++ b/pkg/election/leadership_test.go @@ -262,3 +262,36 @@ func TestRequestProgress(t *testing.T) { checkWatcherRequestProgress(false) checkWatcherRequestProgress(true) } + +func TestCampaignTimes(t *testing.T) { + re := require.New(t) + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + leadership := NewLeadership(client, "test_leader", "test_leader") + + // all the campaign times are within the timeout. + campaignTimesRecordTimeout = 10 * time.Second + defer func() { + campaignTimesRecordTimeout = 5 * time.Minute + }() + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(3, leadership.GetCampaignTimesNum()) + + // only the last 2 records are valid. + campaignTimesRecordTimeout = 200 * time.Millisecond + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(2, leadership.GetCampaignTimesNum()) + + time.Sleep(200 * time.Millisecond) + // need to wait for the next addCampaignTimes to update the campaign time. + re.Equal(2, leadership.GetCampaignTimesNum()) + // check campaign leader frequency. + leadership.AddCampaignTimes() + re.Equal(1, leadership.GetCampaignTimesNum()) +} diff --git a/pkg/member/member.go b/pkg/member/member.go index af504d839638..bbf46d8f167f 100644 --- a/pkg/member/member.go +++ b/pkg/member/member.go @@ -182,11 +182,12 @@ func (m *EmbeddedEtcdMember) GetLastLeaderUpdatedTime() time.Time { // and make it become a PD leader. // leader should be changed when campaign leader frequently. func (m *EmbeddedEtcdMember) CampaignLeader(ctx context.Context, leaseTimeout int64) error { + m.leadership.AddCampaignTimes() failpoint.Inject("skipCampaignLeaderCheck", func() { failpoint.Return(m.leadership.Campaign(leaseTimeout, m.MemberValue())) }) - if m.leadership.GetCampaignTimesNum() >= campaignLeaderFrequencyTimes { + if m.leadership.GetCampaignTimesNum() > campaignLeaderFrequencyTimes { if err := m.ResignEtcdLeader(ctx, m.Name(), ""); err != nil { return err } diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index c581eb393907..edff14a3b987 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -328,20 +328,26 @@ func TestCampaignLeaderFrequently(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cluster, err := tests.NewTestCluster(ctx, 5) + cluster, err := tests.NewTestCluster(ctx, 3) defer cluster.Destroy() re.NoError(err) err = cluster.RunInitialServers() re.NoError(err) + // the 1st time campaign leader. 
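
Aside: the residual-count fix above hinges on dropping every record older than campaignTimesRecordTimeout before appending the new campaign time; since the slice is sorted by time, everything up to and including the first expired index (scanning from newest to oldest) is discarded, which is why the kept suffix starts at i+1 rather than i. A small standalone sketch of that pruning step, not part of the patch (pruneExpired is a hypothetical helper), assuming the same sorted-slice invariant:

package main

import (
	"fmt"
	"time"
)

// pruneExpired drops all records older than ttl from a slice sorted in
// ascending time order, mirroring the campaignTimes bookkeeping above.
func pruneExpired(times []time.Time, ttl time.Duration) []time.Time {
	for i := len(times) - 1; i >= 0; i-- {
		if time.Since(times[i]) > ttl {
			// times[0..i] are all expired; keep only the newer suffix.
			return times[i+1:]
		}
	}
	return times
}

func main() {
	now := time.Now()
	records := []time.Time{now.Add(-10 * time.Minute), now.Add(-6 * time.Minute), now.Add(-time.Minute)}
	kept := pruneExpired(records, 5*time.Minute)
	fmt.Println(len(kept)) // 1: only the record from one minute ago survives a 5-minute window
}
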
cluster.WaitLeader() leader := cluster.GetLeader() re.NotEmpty(cluster.GetLeader()) - for i := 0; i < 3; i++ { + // need to prevent 3 times(including the above 1st time) campaign leader in 5 min. + for i := 0; i < 2; i++ { cluster.GetLeaderServer().ResetPDLeader() cluster.WaitLeader() + re.Equal(leader, cluster.GetLeader()) } + // check for the 4th time. + cluster.GetLeaderServer().ResetPDLeader() + cluster.WaitLeader() // PD leader should be different from before because etcd leader changed. re.NotEmpty(cluster.GetLeader()) re.NotEqual(leader, cluster.GetLeader()) From 632cda452a5284d272330d02278ed4882355a7aa Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 30 May 2024 17:25:22 +0800 Subject: [PATCH 09/47] api/middleware: avoid redirecting when the leader remains unchanged (#8228) ref tikv/pd#7300 Avoid redirecting when the leader remains unchanged. Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/apiutil/serverapi/middleware.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 18dd2f52155f..1cd3d5b53d63 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -208,10 +208,16 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http w.Header().Add(apiutil.XForwardedToMicroServiceHeader, "true") } else if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) == 0 { leader := h.waitForLeader(r) + // The leader has not been elected yet. if leader == nil { http.Error(w, "no leader", http.StatusServiceUnavailable) return } + // If the leader is the current server now, we can handle the request directly. 
+ if h.s.GetMember().IsLeader() || leader.GetName() == h.s.Name() { + next(w, r) + return + } clientUrls = leader.GetClientUrls() r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) } else { From 19c9852decda4cb49a2319b453c4f01c6a26014f Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 31 May 2024 12:28:22 +0800 Subject: [PATCH 10/47] tools: support triggering an event through HTTP API (#5677) close tikv/pd#5451, ref tikv/pd#5468 Signed-off-by: Ryan Leung --- tools/pd-simulator/main.go | 30 +---- .../pd-simulator/simulator/cases/add_nodes.go | 71 ------------ .../simulator/cases/add_nodes_dynamic.go | 92 --------------- .../simulator/cases/balance_leader.go | 9 +- .../simulator/cases/balance_region.go | 7 +- tools/pd-simulator/simulator/cases/cases.go | 16 --- .../simulator/cases/delete_nodes.go | 94 ---------------- .../pd-simulator/simulator/cases/hot_read.go | 7 +- .../pd-simulator/simulator/cases/hot_write.go | 7 +- .../simulator/cases/makeup_down_replica.go | 7 +- .../simulator/cases/region_merge.go | 7 +- tools/pd-simulator/simulator/conn.go | 10 ++ tools/pd-simulator/simulator/drive.go | 59 +++++++--- tools/pd-simulator/simulator/event.go | 106 ++++++++++++------ tools/pd-simulator/simulator/simutil/id.go | 39 +++++++ 15 files changed, 190 insertions(+), 371 deletions(-) delete mode 100644 tools/pd-simulator/simulator/cases/add_nodes.go delete mode 100644 tools/pd-simulator/simulator/cases/add_nodes_dynamic.go delete mode 100644 tools/pd-simulator/simulator/cases/delete_nodes.go create mode 100644 tools/pd-simulator/simulator/simutil/id.go diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 04de914f5f0a..45b3ecd75c9c 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -17,8 +17,6 @@ package main import ( "context" "fmt" - "net/http" - "net/http/pprof" "os" "os/signal" "syscall" @@ -26,7 +24,6 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" - "github.com/prometheus/client_golang/prometheus/promhttp" flag "github.com/spf13/pflag" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" @@ -95,8 +92,7 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { - go runHTTPServer() - simStart(*pdAddr, simCase, simConfig) + simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) err := local.Run() @@ -109,28 +105,10 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } - simStart(local.GetAddr(), simCase, simConfig, clean) + simStart(local.GetAddr(), "", simCase, simConfig, clean) } } -func runHTTPServer() { - http.Handle("/metrics", promhttp.Handler()) - // profile API - http.HandleFunc("/pprof/profile", pprof.Profile) - http.HandleFunc("/pprof/trace", pprof.Trace) - http.HandleFunc("/pprof/symbol", pprof.Symbol) - http.Handle("/pprof/heap", pprof.Handler("heap")) - http.Handle("/pprof/mutex", pprof.Handler("mutex")) - http.Handle("/pprof/allocs", pprof.Handler("allocs")) - http.Handle("/pprof/block", pprof.Handler("block")) - http.Handle("/pprof/goroutine", pprof.Handler("goroutine")) - server := &http.Server{ - Addr: *statusAddress, - ReadHeaderTimeout: 3 * time.Second, - } - server.ListenAndServe() -} - // NewSingleServer creates a pd server for simulator. 
func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) @@ -157,9 +135,9 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() - driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) + driver, err := simulator.NewDriver(pdAddr, statusAddress, simCase, simConfig) if err != nil { simutil.Logger.Fatal("create driver error", zap.Error(err)) } diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go deleted file mode 100644 index 5c73fe9764c9..000000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2017 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cases - -import ( - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newAddNodes(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - - for i := 0; i < totalStore; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%noEmptyStoreNum + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - for i := 1; i <= totalStore; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - peerCount := regions.GetStoreRegionCount(uint64(i)) - if !isUniform(leaderCount, totalRegion/totalStore) { - return false - } - if !isUniform(peerCount, totalRegion*replica/totalStore) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go deleted file mode 100644 index aa585b489235..000000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2018 TiKV Project Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cases - -import ( - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newAddNodesDynamic(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := getNoEmptyStoreNum(totalStore, replica) - - for i := 0; i < noEmptyStoreNum; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - var ids []uint64 - for i := 0; i < totalStore-noEmptyStoreNum; i++ { - ids = append(ids, IDAllocator.nextID()) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%noEmptyStoreNum + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - currentStoreCount := noEmptyStoreNum - e := &AddNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if tick%100 == 0 && currentStoreCount < totalStore { - currentStoreCount++ - nodeID := ids[0] - ids = append(ids[:0], ids[1:]...) 
- return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - if currentStoreCount != totalStore { - return false - } - for i := 1; i <= currentStoreCount; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - peerCount := regions.GetStoreRegionCount(uint64(i)) - if !isUniform(leaderCount, totalRegion/totalStore) { - return false - } - if !isUniform(peerCount, totalRegion*replica/totalStore) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index c5315f85d8e7..fd9028bc91af 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newBalanceLeader(config *sc.SimConfig) *Case { @@ -30,7 +31,7 @@ func newBalanceLeader(config *sc.SimConfig) *Case { replica := int(config.ServerConfig.Replication.MaxReplicas) for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -39,17 +40,17 @@ func newBalanceLeader(config *sc.SimConfig) *Case { for i := 0; i < totalRegion; i++ { peers := make([]*metapb.Peer, 0, replica) peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: leaderStoreID, }) for j := 1; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%(totalStore-1) + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index a559a335c97a..82a7ac2d7042 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -21,6 +21,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newRedundantBalanceRegion(config *sc.SimConfig) *Case { @@ -32,7 +33,7 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { for i := 0; i < totalStore; i++ { s := &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, } if i%2 == 1 { @@ -45,12 +46,12 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], }) diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index f2e79a819248..00b5404669fe 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ 
b/tools/pd-simulator/simulator/cases/cases.go @@ -15,8 +15,6 @@ package cases import ( - "math/rand" - "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" @@ -91,9 +89,6 @@ var IDAllocator idAllocator var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, - "add-nodes": newAddNodes, - "add-nodes-dynamic": newAddNodesDynamic, - "delete-nodes": newDeleteNodes, "region-split": newRegionSplit, "region-merge": newRegionMerge, "hot-read": newHotRead, @@ -121,14 +116,3 @@ func isUniform(count, meanCount int) bool { minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } - -func getNoEmptyStoreNum(storeNum int, replica int) int { - noEmptyStoreNum := rand.Intn(storeNum) - if noEmptyStoreNum < replica { - return replica - } - if noEmptyStoreNum == storeNum { - return storeNum - 1 - } - return noEmptyStoreNum -} diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go deleted file mode 100644 index 80650cf109d2..000000000000 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2018 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cases - -import ( - "math/rand" - - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" -) - -func newDeleteNodes(config *sc.SimConfig) *Case { - var simCase Case - - totalStore := config.TotalStore - totalRegion := config.TotalRegion - replica := int(config.ServerConfig.Replication.MaxReplicas) - noEmptyStoreNum := totalStore - 1 - for i := 1; i <= totalStore; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < totalRegion; i++ { - peers := make([]*metapb.Peer, 0, replica) - for j := 0; j < replica; j++ { - peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), - StoreId: uint64((i+j)%totalStore + 1), - }) - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - ids := make([]uint64, 0, len(simCase.Stores)) - for _, store := range simCase.Stores { - ids = append(ids, store.ID) - } - - currentStoreCount := totalStore - e := &DeleteNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if currentStoreCount > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(currentStoreCount) - currentStoreCount-- - nodeID := ids[idx] - ids = append(ids[:idx], ids[idx+1:]...) 
- return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - if currentStoreCount != noEmptyStoreNum { - return false - } - for _, i := range ids { - leaderCount := regions.GetStoreLeaderCount(i) - peerCount := regions.GetStoreRegionCount(i) - if !isUniform(leaderCount, totalRegion/noEmptyStoreNum) { - return false - } - if !isUniform(peerCount, totalRegion*replica/noEmptyStoreNum) { - return false - } - } - return true - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index 50ad08d6011b..d154886b0a47 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newHotRead(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newHotRead(config *sc.SimConfig) *Case { // Initialize the cluster for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newHotRead(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index a30afd1a8ec2..e73ca6f3ce3b 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newHotWrite(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newHotWrite(config *sc.SimConfig) *Case { // Initialize the cluster for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newHotWrite(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 28de9577cfc1..a5ee63e71a0b 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + 
"github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newMakeupDownReplicas(config *sc.SimConfig) *Case { @@ -31,7 +32,7 @@ func newMakeupDownReplicas(config *sc.SimConfig) *Case { noEmptyStoreNum := totalStore - 1 for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -40,12 +41,12 @@ func newMakeupDownReplicas(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 953b0e309e19..8097565d1a72 100644 --- a/tools/pd-simulator/simulator/cases/region_merge.go +++ b/tools/pd-simulator/simulator/cases/region_merge.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" + "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) func newRegionMerge(config *sc.SimConfig) *Case { @@ -30,7 +31,7 @@ func newRegionMerge(config *sc.SimConfig) *Case { for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } @@ -39,12 +40,12 @@ func newRegionMerge(config *sc.SimConfig) *Case { peers := make([]*metapb.Peer, 0, replica) for j := 0; j < replica; j++ { peers = append(peers, &metapb.Peer{ - Id: IDAllocator.nextID(), + Id: simutil.IDAllocator.NextID(), StoreId: uint64((i+j)%totalStore + 1), }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 10 * units.MiB, diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index b95b33ee63df..4be8a2b76dc9 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -52,3 +52,13 @@ func (c *Connection) nodeHealth(storeID uint64) bool { return n.GetNodeState() == metapb.NodeState_Preparing || n.GetNodeState() == metapb.NodeState_Serving } + +func (c *Connection) getNodes() []*Node { + var nodes []*Node + for _, n := range c.Nodes { + if n.GetNodeState() != metapb.NodeState_Removed { + nodes = append(nodes, n) + } + } + return nodes +} diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index 3d2bce746750..700dd58f87aa 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -16,6 +16,8 @@ package simulator import ( "context" + "net/http" + "net/http/pprof" "path" "strconv" "sync" @@ -23,6 +25,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" @@ -35,20 +38,21 @@ import ( // Driver promotes the cluster status change. 
type Driver struct { - wg sync.WaitGroup - pdAddr string - simCase *cases.Case - client Client - tickCount int64 - eventRunner *EventRunner - raftEngine *RaftEngine - conn *Connection - simConfig *config.SimConfig - pdConfig *config.PDConfig + wg sync.WaitGroup + pdAddr string + statusAddress string + simCase *cases.Case + client Client + tickCount int64 + eventRunner *EventRunner + raftEngine *RaftEngine + conn *Connection + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Driver, error) { +func NewDriver(pdAddr, statusAddress, caseName string, simConfig *config.SimConfig) (*Driver, error) { simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) @@ -57,10 +61,11 @@ func NewDriver(pdAddr string, caseName string, simConfig *config.SimConfig) (*Dr pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ - pdAddr: pdAddr, - simCase: simCase, - simConfig: simConfig, - pdConfig: pdConfig, + pdAddr: pdAddr, + statusAddress: statusAddress, + simCase: simCase, + simConfig: simConfig, + pdConfig: pdConfig, }, nil } @@ -77,6 +82,9 @@ func (d *Driver) Prepare() error { d.updateNodeAvailable() + if d.statusAddress != "" { + go d.runHTTPServer() + } // Bootstrap. store, region, err := d.GetBootstrapInfo(d.raftEngine) if err != nil { @@ -95,7 +103,7 @@ func (d *Driver) Prepare() error { // Setup alloc id. // TODO: This is a hack way. Once we have reset alloc ID API, we need to replace it. - maxID := cases.IDAllocator.GetID() + maxID := simutil.IDAllocator.GetID() requestTimeout := 10 * time.Second etcdTimeout := 3 * time.Second etcdClient, err := clientv3.New(clientv3.Config{ @@ -123,7 +131,7 @@ func (d *Driver) Prepare() error { return errors.WithStack(err) } if id > maxID { - cases.IDAllocator.ResetID() + simutil.IDAllocator.ResetID() break } } @@ -226,3 +234,20 @@ func (d *Driver) updateNodeAvailable() { } } } + +func (d *Driver) runHTTPServer() { + http.Handle("/metrics", promhttp.Handler()) + // profile API + http.HandleFunc("/pprof/profile", pprof.Profile) + http.HandleFunc("/pprof/trace", pprof.Trace) + http.HandleFunc("/pprof/symbol", pprof.Symbol) + http.Handle("/pprof/heap", pprof.Handler("heap")) + http.Handle("/pprof/mutex", pprof.Handler("mutex")) + http.Handle("/pprof/allocs", pprof.Handler("allocs")) + http.Handle("/pprof/block", pprof.Handler("block")) + http.Handle("/pprof/goroutine", pprof.Handler("goroutine")) + eventHandler := newEventHandler(d.eventRunner) + http.HandleFunc("/event", eventHandler.createEvent) + // nolint + http.ListenAndServe(d.statusAddress, nil) +} diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 04ad10a0db84..8be8f89d759e 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -15,6 +15,12 @@ package simulator import ( + "context" + "fmt" + "math/rand" + "net/http" + "sync" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/tikv/pd/pkg/core" @@ -30,6 +36,7 @@ type Event interface { // EventRunner includes all events. 
type EventRunner struct { + sync.RWMutex events []Event raftEngine *RaftEngine } @@ -46,6 +53,33 @@ func NewEventRunner(events []cases.EventDescriptor, raftEngine *RaftEngine) *Eve return er } +type eventHandler struct { + er *EventRunner +} + +func newEventHandler(er *EventRunner) *eventHandler { + return &eventHandler{ + er: er, + } +} + +func (e *eventHandler) createEvent(w http.ResponseWriter, r *http.Request) { + event := r.URL.Query().Get("event") + if len(event) < 1 { + fmt.Fprintf(w, "no given event") + return + } + switch event { + case "add-node": + e.er.addEvent(&AddNode{}) + return + case "down-node": + e.er.addEvent(&DownNode{}) + return + default: + } +} + func parserEvent(e cases.EventDescriptor) Event { switch t := e.(type) { case *cases.WriteFlowOnSpotDescriptor: @@ -54,16 +88,20 @@ func parserEvent(e cases.EventDescriptor) Event { return &WriteFlowOnRegion{descriptor: t} case *cases.ReadFlowOnRegionDescriptor: return &ReadFlowOnRegion{descriptor: t} - case *cases.AddNodesDescriptor: - return &AddNodes{descriptor: t} - case *cases.DeleteNodesDescriptor: - return &DeleteNodes{descriptor: t} } return nil } +func (er *EventRunner) addEvent(e Event) { + er.Lock() + defer er.Unlock() + er.events = append(er.events, e) +} + // Tick ticks the event run func (er *EventRunner) Tick(tickCount int64) { + er.Lock() + defer er.Unlock() var finishedIndex int for i, e := range er.events { isFinished := e.Run(er.raftEngine, tickCount) @@ -126,24 +164,18 @@ func (e *ReadFlowOnRegion) Run(raft *RaftEngine, tickCount int64) bool { return false } -// AddNodes adds nodes. -type AddNodes struct { - descriptor *cases.AddNodesDescriptor -} +// AddNode adds nodes. +type AddNode struct{} // Run implements the event interface. -func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool { - id := e.descriptor.Step(tickCount) - if id == 0 { - return false - } - - if _, ok := raft.conn.Nodes[id]; ok { - simutil.Logger.Info("node has already existed", zap.Uint64("node-id", id)) +func (*AddNode) Run(raft *RaftEngine, _ int64) bool { + config := raft.storeConfig + nodes := raft.conn.getNodes() + id, err := nodes[0].client.AllocID(context.TODO()) + if err != nil { + simutil.Logger.Error("alloc node id failed", zap.Error(err)) return false } - - config := raft.storeConfig s := &cases.Store{ ID: id, Status: metapb.StoreState_Up, @@ -152,49 +184,51 @@ func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool { } n, err := NewNode(s, raft.conn.pdAddr, config) if err != nil { - simutil.Logger.Error("add node failed", zap.Uint64("node-id", id), zap.Error(err)) + simutil.Logger.Error("create node failed", zap.Error(err)) return false } - raft.conn.Nodes[id] = n + + raft.conn.Nodes[s.ID] = n n.raftEngine = raft err = n.Start() if err != nil { - simutil.Logger.Error("start node failed", zap.Uint64("node-id", id), zap.Error(err)) + delete(raft.conn.Nodes, s.ID) + simutil.Logger.Error("start node failed", zap.Uint64("node-id", s.ID), zap.Error(err)) + return false } - return false + return true } -// DeleteNodes deletes nodes. -type DeleteNodes struct { - descriptor *cases.DeleteNodesDescriptor -} +// DownNode deletes nodes. +type DownNode struct{} // Run implements the event interface. 
-func (e *DeleteNodes) Run(raft *RaftEngine, tickCount int64) bool { - id := e.descriptor.Step(tickCount) - if id == 0 { +func (*DownNode) Run(raft *RaftEngine, _ int64) bool { + nodes := raft.conn.getNodes() + if len(nodes) == 0 { + simutil.Logger.Error("can not find any node") return false } - - node := raft.conn.Nodes[id] + i := rand.Intn(len(nodes)) + node := nodes[i] if node == nil { - simutil.Logger.Error("node is not existed", zap.Uint64("node-id", id)) + simutil.Logger.Error("node is not existed", zap.Uint64("node-id", node.Id)) return false } - delete(raft.conn.Nodes, id) + delete(raft.conn.Nodes, node.Id) node.Stop() regions := raft.GetRegions() for _, region := range regions { storeIDs := region.GetStoreIDs() - if _, ok := storeIDs[id]; ok { + if _, ok := storeIDs[node.Id]; ok { downPeer := &pdpb.PeerStats{ - Peer: region.GetStorePeer(id), + Peer: region.GetStorePeer(node.Id), DownSeconds: 24 * 60 * 60, } region = region.Clone(core.WithDownPeers(append(region.GetDownPeers(), downPeer))) raft.SetRegion(region) } } - return false + return true } diff --git a/tools/pd-simulator/simulator/simutil/id.go b/tools/pd-simulator/simulator/simutil/id.go new file mode 100644 index 000000000000..8badddff3f1b --- /dev/null +++ b/tools/pd-simulator/simulator/simutil/id.go @@ -0,0 +1,39 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package simutil + +// IDAllocator is used to alloc unique ID. +type idAllocator struct { + id uint64 +} + +// NextID gets the next unique ID. +func (a *idAllocator) NextID() uint64 { + a.id++ + return a.id +} + +// ResetID resets the IDAllocator. +func (a *idAllocator) ResetID() { + a.id = 0 +} + +// GetID gets the current ID. +func (a *idAllocator) GetID() uint64 { + return a.id +} + +// IDAllocator is used to alloc unique ID. +var IDAllocator idAllocator From 199b01792159e5d8e83ef419a5053401e998bb0e Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 31 May 2024 16:29:52 +0800 Subject: [PATCH 11/47] client/retry: only return the latest error in backoffer (#8227) ref tikv/pd#8142 Due to the return of historical errors causing the client's retry logic to fail, and since we currently do not need to obtain all errors during retries, this PR removes `multierr` from backoffer and add tests to ensure the correctness of the retry logic. 
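As a rough sketch of the caller-visible effect (illustrative only, not part of the diff: `ctx`, the failing closure, and the error strings are assumptions; only `InitialBackoffer` and `Exec` come from the retry package changed below):

    bo := retry.InitialBackoffer(100*time.Millisecond, time.Second, 3*time.Second)
    attempt := 0
    err := bo.Exec(ctx, func() error {
        attempt++
        return fmt.Errorf("attempt %d failed", attempt)
    })
    // Before this change, err aggregated every attempt via multierr
    // ("attempt 1 failed; attempt 2 failed; ...").
    // After this change, err is only the error from the final attempt.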
Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/go.mod | 2 +- client/http/client.go | 5 +- client/retry/backoff.go | 45 ++++++-------- client/retry/backoff_test.go | 62 +++++++++++++++---- tests/integrations/client/http_client_test.go | 50 +++++++++++++-- 5 files changed, 119 insertions(+), 45 deletions(-) diff --git a/client/go.mod b/client/go.mod index 89799796521b..6baa2f112f40 100644 --- a/client/go.mod +++ b/client/go.mod @@ -16,7 +16,6 @@ require ( github.com/stretchr/testify v1.8.2 go.uber.org/atomic v1.10.0 go.uber.org/goleak v1.1.11 - go.uber.org/multierr v1.11.0 go.uber.org/zap v1.24.0 golang.org/x/exp v0.0.0-20230711005742-c3f37128e5a4 google.golang.org/grpc v1.62.1 @@ -34,6 +33,7 @@ require ( github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.46.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect + go.uber.org/multierr v1.11.0 // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/sys v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/client/http/client.go b/client/http/client.go index 30144ebe2c50..7b34193c2a41 100644 --- a/client/http/client.go +++ b/client/http/client.go @@ -153,10 +153,11 @@ func (ci *clientInner) requestWithRetry( } // Copy a new backoffer for each request. bo := *reqInfo.bo - // Backoffer also needs to check the status code to determine whether to retry. + // Set the retryable checker for the backoffer if it's not set. bo.SetRetryableChecker(func(err error) bool { + // Backoffer also needs to check the status code to determine whether to retry. return err != nil && !noNeedRetry(statusCode) - }) + }, false) return bo.Exec(ctx, execFunc) } diff --git a/client/retry/backoff.go b/client/retry/backoff.go index 580e466badb4..9161ad0fea16 100644 --- a/client/retry/backoff.go +++ b/client/retry/backoff.go @@ -24,12 +24,9 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/log" - "go.uber.org/multierr" "go.uber.org/zap" ) -const maxRecordErrorCount = 20 - // Option is used to customize the backoffer. type Option func(*Backoffer) @@ -50,7 +47,7 @@ type Backoffer struct { // total defines the max total time duration cost in retrying. If it's 0, it means infinite retry until success. total time.Duration // retryableChecker is used to check if the error is retryable. - // By default, all errors are retryable. + // If it's not set, it will always retry unconditionally no matter what the error is. retryableChecker func(err error) bool // logInterval defines the log interval for retrying. logInterval time.Duration @@ -69,28 +66,22 @@ func (bo *Backoffer) Exec( ) error { defer bo.resetBackoff() var ( - allErrors error - err error - after *time.Timer + err error + after *time.Timer ) fnName := getFunctionName(fn) for { err = fn() bo.attempt++ - if bo.attempt < maxRecordErrorCount { - // multierr.Append will ignore nil error. 
- allErrors = multierr.Append(allErrors, err) - } - if !bo.isRetryable(err) { + if err == nil || !bo.isRetryable(err) { break } currentInterval := bo.nextInterval() bo.nextLogTime += currentInterval - if err != nil { - if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { - bo.nextLogTime %= bo.logInterval - log.Warn("call PD API failed and retrying", zap.String("api", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) - } + if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { + bo.nextLogTime %= bo.logInterval + log.Warn("[pd.backoffer] exec fn failed and retrying", + zap.String("fn-name", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) } if after == nil { after = time.NewTimer(currentInterval) @@ -100,7 +91,7 @@ func (bo *Backoffer) Exec( select { case <-ctx.Done(): after.Stop() - return multierr.Append(allErrors, errors.Trace(ctx.Err())) + return errors.Trace(ctx.Err()) case <-after.C: failpoint.Inject("backOffExecute", func() { testBackOffExecuteFlag = true @@ -115,7 +106,7 @@ func (bo *Backoffer) Exec( } } } - return allErrors + return err } // InitialBackoffer make the initial state for retrying. @@ -132,12 +123,9 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer total = base } bo := &Backoffer{ - base: base, - max: max, - total: total, - retryableChecker: func(err error) bool { - return err != nil - }, + base: base, + max: max, + total: total, next: base, currentTotal: 0, attempt: 0, @@ -148,8 +136,11 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer return bo } -// SetRetryableChecker sets the retryable checker. -func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool) { +// SetRetryableChecker sets the retryable checker, `overwrite` flag is used to indicate whether to overwrite the existing checker. +func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool, overwrite bool) { + if !overwrite && bo.retryableChecker != nil { + return + } bo.retryableChecker = checker } diff --git a/client/retry/backoff_test.go b/client/retry/backoff_test.go index 8df06b75f941..22d487b18858 100644 --- a/client/retry/backoff_test.go +++ b/client/retry/backoff_test.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "errors" + "fmt" "testing" "time" @@ -87,24 +88,64 @@ func TestBackoffer(t *testing.T) { return expectedErr }) re.InDelta(total, time.Since(start), float64(250*time.Millisecond)) - re.ErrorContains(err, "test; test; test; test") + re.ErrorContains(err, "test") re.ErrorIs(err, expectedErr) re.Equal(4, execCount) re.True(isBackofferReset(bo)) - // Test the retryable checker. + // Test the error returned. execCount = 0 - bo = InitialBackoffer(base, max, total) - bo.SetRetryableChecker(func(error) bool { - return execCount < 2 + err = bo.Exec(ctx, func() error { + execCount++ + return fmt.Errorf("test %d", execCount) }) + re.Error(err) + re.Equal("test 4", err.Error()) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) + execCount = 0 err = bo.Exec(ctx, func() error { + if execCount == 1 { + return nil + } execCount++ - return nil + return expectedErr }) + re.Equal(1, execCount) re.NoError(err) + re.True(isBackofferReset(bo)) + + // Test the retryable checker. 
+ execCount = 0 + bo = InitialBackoffer(base, max, total) + retryableChecker := func(error) bool { + return execCount < 2 + } + bo.SetRetryableChecker(retryableChecker, false) + execFunc := func() error { + execCount++ + return expectedErr + } + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(2, execCount) + re.True(isBackofferReset(bo)) + // Test the retryable checker with overwrite. + execCount = 0 + retryableChecker = func(error) bool { + return execCount < 4 + } + bo.SetRetryableChecker(retryableChecker, false) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) re.Equal(2, execCount) re.True(isBackofferReset(bo)) + execCount = 0 + bo.SetRetryableChecker(retryableChecker, true) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) } func isBackofferReset(bo *Backoffer) bool { @@ -129,21 +170,20 @@ func TestBackofferWithLog(t *testing.T) { // 10 + 20 + 40 + 80(log) + 100(log) * 9 >= 1000, so log ten times. re.Len(ms, 10) // 10 + 20 + 40 + 80 + 100 * 9, 13 times retry. - rfc := `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc := `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) // 10 + 20 + 40 + 80(log), 4 times retry. - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[0], rfc) - bo.resetBackoff() err = bo.Exec(ctx, testFn) re.ErrorIs(err, errTest) ms = lg.Messages() re.Len(ms, 20) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[len1], rfc) } diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index fa109946e4ba..9d7e0985940e 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -21,6 +21,7 @@ import ( "net/url" "sort" "strings" + "sync" "testing" "time" @@ -531,14 +532,15 @@ func (suite *httpClientTestSuite) TestSchedulers() { defer cancel() schedulers, err := client.GetSchedulers(ctx) re.NoError(err) - re.Empty(schedulers) + const schedulerName = "evict-leader-scheduler" + re.NotContains(schedulers, schedulerName) - err = client.CreateScheduler(ctx, "evict-leader-scheduler", 1) + err = client.CreateScheduler(ctx, schedulerName, 1) re.NoError(err) schedulers, err = client.GetSchedulers(ctx) re.NoError(err) - re.Len(schedulers, 1) - err = client.SetSchedulerDelay(ctx, "evict-leader-scheduler", 100) + re.Contains(schedulers, schedulerName) + err = client.SetSchedulerDelay(ctx, schedulerName, 100) re.NoError(err) err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message @@ -757,3 +759,43 @@ func (suite *httpClientTestSuite) TestGetHealthStatus() { re.Equal("pd2", healths[1].Name) re.True(healths[0].Health && healths[1].Health) } + +func (suite *httpClientTestSuite) TestRetryOnLeaderChange() { + re := suite.Require() + ctx, cancel := 
context.WithCancel(suite.ctx) + defer cancel() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + bo := retry.InitialBackoffer(100*time.Millisecond, time.Second, 0) + client := suite.client.WithBackoffer(bo) + for { + healths, err := client.GetHealthStatus(ctx) + if err != nil && strings.Contains(err.Error(), "context canceled") { + return + } + re.NoError(err) + re.Len(healths, 2) + select { + case <-ctx.Done(): + return + default: + } + } + }() + + leader := suite.cluster.GetLeaderServer() + re.NotNil(leader) + for i := 0; i < 3; i++ { + leader.ResignLeader() + re.NotEmpty(suite.cluster.WaitLeader()) + leader = suite.cluster.GetLeaderServer() + re.NotNil(leader) + } + + // Cancel the context to stop the goroutine. + cancel() + wg.Wait() +} From a929a546a790222299b556e449816e622288a5d1 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 3 Jun 2024 16:28:25 +0800 Subject: [PATCH 12/47] client/http, api/middleware: enhance the retry logic of the HTTP client (#8229) ref tikv/pd#7300 Schedule a member change check when the HTTP status code is 503 or receives a leader/primary change error. Signed-off-by: JmPotato --- client/client.go | 11 ----- client/errs/errno.go | 13 +++-- client/errs/errs.go | 18 +++++++ client/http/client.go | 49 +++++++++++++------ client/http/request_info.go | 11 +++++ client/pd_service_discovery_test.go | 3 +- client/resource_manager_client.go | 7 +-- client/tso_dispatcher.go | 2 +- errors.toml | 10 ++++ pkg/errs/errno.go | 9 ++-- .../apiutil/multiservicesapi/middleware.go | 4 +- pkg/utils/apiutil/serverapi/middleware.go | 4 +- server/apiv2/middlewares/redirector.go | 4 +- tests/integrations/client/client_test.go | 3 +- .../mcs/tso/keyspace_group_manager_test.go | 5 +- tests/server/cluster/cluster_test.go | 2 +- 16 files changed, 99 insertions(+), 56 deletions(-) diff --git a/client/client.go b/client/client.go index 1865fd0866ed..1c8ef3cafe84 100644 --- a/client/client.go +++ b/client/client.go @@ -1431,17 +1431,6 @@ func (c *client) scatterRegionsWithOptions(ctx context.Context, regionsID []uint return resp, nil } -// IsLeaderChange will determine whether there is a leader change. -func IsLeaderChange(err error) bool { - if err == errs.ErrClientTSOStreamClosed { - return true - } - errMsg := err.Error() - return strings.Contains(errMsg, errs.NotLeaderErr) || - strings.Contains(errMsg, errs.MismatchLeaderErr) || - strings.Contains(errMsg, errs.NotServedErr) -} - const ( httpSchemePrefix = "http://" httpsSchemePrefix = "https://" diff --git a/client/errs/errno.go b/client/errs/errno.go index 50c136dd5f2e..0dbcb4fe147a 100644 --- a/client/errs/errno.go +++ b/client/errs/errno.go @@ -20,21 +20,20 @@ import ( "github.com/pingcap/errors" ) +// Note: keep the same as the ones defined on the server side to ensure the client can use them correctly. const ( + // NoLeaderErr indicates there is no leader in the cluster currently. + NoLeaderErr = "no leader" // NotLeaderErr indicates the non-leader member received the requests which should be received by leader. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. - NotLeaderErr = "is not leader" + NotLeaderErr = "not leader" // MismatchLeaderErr indicates the non-leader member received the requests which should be received by leader. 
- // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. MismatchLeaderErr = "mismatch leader id" // NotServedErr indicates an tso node/pod received the requests for the keyspace groups which are not served by it. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. NotServedErr = "is not served" // RetryTimeoutErr indicates the server is busy. RetryTimeoutErr = "retry timeout" + // NotPrimaryErr indicates the non-primary member received the requests which should be received by primary. + NotPrimaryErr = "not primary" ) // client errors diff --git a/client/errs/errs.go b/client/errs/errs.go index 47f7c29a467d..da333efda4c3 100644 --- a/client/errs/errs.go +++ b/client/errs/errs.go @@ -15,11 +15,29 @@ package errs import ( + "strings" + "github.com/pingcap/errors" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) +// IsLeaderChange will determine whether there is a leader/primary change. +func IsLeaderChange(err error) bool { + if err == nil { + return false + } + if err == ErrClientTSOStreamClosed { + return true + } + errMsg := err.Error() + return strings.Contains(errMsg, NoLeaderErr) || + strings.Contains(errMsg, NotLeaderErr) || + strings.Contains(errMsg, MismatchLeaderErr) || + strings.Contains(errMsg, NotServedErr) || + strings.Contains(errMsg, NotPrimaryErr) +} + // ZapError is used to make the log output easier. func ZapError(err error, causeError ...error) zap.Field { if err == nil { diff --git a/client/http/client.go b/client/http/client.go index 7b34193c2a41..123ca6164220 100644 --- a/client/http/client.go +++ b/client/http/client.go @@ -120,10 +120,25 @@ func (ci *clientInner) requestWithRetry( headerOpts ...HeaderOption, ) error { var ( + serverURL string + isLeader bool statusCode int err error + logFields = append(reqInfo.logFields(), zap.String("source", ci.source)) ) execFunc := func() error { + defer func() { + // If the status code is 503, it indicates that there may be PD leader/follower changes. + // If the error message contains the leader/primary change information, it indicates that there may be PD leader/primary change. + if statusCode == http.StatusServiceUnavailable || errs.IsLeaderChange(err) { + ci.sd.ScheduleCheckMemberChanged() + } + log.Debug("[pd] http request finished", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) + }() // It will try to send the request to the PD leader first and then try to send the request to the other PD followers. clients := ci.sd.GetAllServiceClients() if len(clients) == 0 { @@ -131,17 +146,21 @@ func (ci *clientInner) requestWithRetry( } skipNum := 0 for _, cli := range clients { - url := cli.GetURL() - if reqInfo.targetURL != "" && reqInfo.targetURL != url { + serverURL = cli.GetURL() + isLeader = cli.IsConnectedToLeader() + if len(reqInfo.targetURL) > 0 && reqInfo.targetURL != serverURL { skipNum++ continue } - statusCode, err = ci.doRequest(ctx, url, reqInfo, headerOpts...) + statusCode, err = ci.doRequest(ctx, serverURL, reqInfo, headerOpts...) 
if err == nil || noNeedRetry(statusCode) { return err } - log.Debug("[pd] request url failed", - zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("url", url), zap.Error(err)) + log.Debug("[pd] http request url failed", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) } if skipNum == len(clients) { return errs.ErrClientNoTargetMember @@ -169,26 +188,21 @@ func noNeedRetry(statusCode int) bool { func (ci *clientInner) doRequest( ctx context.Context, - url string, reqInfo *requestInfo, + serverURL string, reqInfo *requestInfo, headerOpts ...HeaderOption, ) (int, error) { var ( - source = ci.source callerID = reqInfo.callerID name = reqInfo.name method = reqInfo.method body = reqInfo.body res = reqInfo.res respHandler = reqInfo.respHandler + url = reqInfo.getURL(serverURL) + logFields = append(reqInfo.logFields(), + zap.String("source", ci.source), + zap.String("url", url)) ) - url = reqInfo.getURL(url) - logFields := []zap.Field{ - zap.String("source", source), - zap.String("name", name), - zap.String("url", url), - zap.String("method", method), - zap.String("caller-id", callerID), - } log.Debug("[pd] request the http url", logFields...) req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewBuffer(body)) if err != nil { @@ -229,11 +243,14 @@ func (ci *clientInner) doRequest( if readErr != nil { logFields = append(logFields, zap.NamedError("read-body-error", err)) } else { + // API server will return a JSON body containing the detailed error message + // when the status code is not `http.StatusOK` 200. + bs = bytes.TrimSpace(bs) logFields = append(logFields, zap.ByteString("body", bs)) } log.Error("[pd] request failed with a non-200 status", logFields...) - return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s'", resp.Status) + return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s', body: '%s'", resp.Status, bs) } if res == nil { diff --git a/client/http/request_info.go b/client/http/request_info.go index 202eab1150fe..3fb91c6ca970 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/tikv/pd/client/retry" + "go.uber.org/zap" ) // The following constants are the names of the requests. 
@@ -157,3 +158,13 @@ func (ri *requestInfo) WithTargetURL(targetURL string) *requestInfo { func (ri *requestInfo) getURL(addr string) string { return fmt.Sprintf("%s%s", addr, ri.uri) } + +func (ri *requestInfo) logFields() []zap.Field { + return []zap.Field{ + zap.String("caller-id", ri.callerID), + zap.String("name", ri.name), + zap.String("uri", ri.uri), + zap.String("method", ri.method), + zap.String("target-url", ri.targetURL), + } +} diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index f4cde0e1911c..44171873b1a4 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/grpcutil" "github.com/tikv/pd/client/testutil" "google.golang.org/grpc" @@ -205,7 +206,7 @@ func (suite *serviceClientTestSuite) TestServiceClient() { re.NotNil(leaderConn) _, err := pb.NewGreeterClient(followerConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) - re.ErrorContains(err, "not leader") + re.ErrorContains(err, errs.NotLeaderErr) resp, err := pb.NewGreeterClient(leaderConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) re.NoError(err) re.Equal("Hello pd", resp.GetMessage()) diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 872b241cfe72..98b123c08234 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -16,7 +16,6 @@ package pd import ( "context" - "strings" "time" "github.com/gogo/protobuf/proto" @@ -35,10 +34,6 @@ const ( modify actionType = 1 groupSettingsPathPrefix = "resource_group/settings" controllerConfigPathPrefix = "resource_group/controller" - // errNotPrimary is returned when the requested server is not primary. - errNotPrimary = "not primary" - // errNotLeader is returned when the requested server is not pd leader. - errNotLeader = "not leader" ) // GroupSettingsPathPrefixBytes is used to watch or get resource groups. @@ -83,7 +78,7 @@ func (c *client) resourceManagerClient() (rmpb.ResourceManagerClient, error) { // gRPCErrorHandler is used to handle the gRPC error returned by the resource manager service. func (c *client) gRPCErrorHandler(err error) { - if strings.Contains(err.Error(), errNotPrimary) || strings.Contains(err.Error(), errNotLeader) { + if errs.IsLeaderChange(err) { c.pdSvcDiscovery.ScheduleCheckMemberChanged() } } diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index d5b52ad60390..0919fd847447 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -303,7 +303,7 @@ tsoBatchLoop: cancel() stream = nil // Because ScheduleCheckMemberChanged is asynchronous, if the leader changes, we better call `updateMember` ASAP. 
- if IsLeaderChange(err) { + if errs.IsLeaderChange(err) { if err := bo.Exec(ctx, svcDiscovery.CheckMemberChanged); err != nil { select { case <-ctx.Done(): diff --git a/errors.toml b/errors.toml index 64101000478b..a61c23a6fbdf 100644 --- a/errors.toml +++ b/errors.toml @@ -16,11 +16,21 @@ error = ''' redirect failed ''' +["PD:apiutil:ErrRedirectNoLeader"] +error = ''' +redirect finds no leader +''' + ["PD:apiutil:ErrRedirectToNotLeader"] error = ''' redirect to not leader ''' +["PD:apiutil:ErrRedirectToNotPrimary"] +error = ''' +redirect to not primary +''' + ["PD:autoscaling:ErrEmptyMetricsResponse"] error = ''' metrics response from Prometheus is empty diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 8c3e914531bd..1f56a821032f 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -195,10 +195,11 @@ var ( // apiutil errors var ( - ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) - ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) - // ErrRedirectToNotLeader is the error message for redirect to not leader. - ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) + ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) + ErrRedirectNoLeader = errors.Normalize("redirect finds no leader", errors.RFCCodeText("PD:apiutil:ErrRedirectNoLeader")) + ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirectToNotPrimary = errors.Normalize("redirect to not primary", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotPrimary")) ) // grpcutil errors diff --git a/pkg/utils/apiutil/multiservicesapi/middleware.go b/pkg/utils/apiutil/multiservicesapi/middleware.go index ed34ecc6afb3..4343adcc981a 100644 --- a/pkg/utils/apiutil/multiservicesapi/middleware.go +++ b/pkg/utils/apiutil/multiservicesapi/middleware.go @@ -48,8 +48,8 @@ func ServiceRedirector() gin.HandlerFunc { // Prevent more than one redirection. if name := c.Request.Header.Get(ServiceRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotPrimary)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotPrimary.FastGenByArgs().Error()) return } diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 1cd3d5b53d63..0718702b5a55 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -210,7 +210,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http leader := h.waitForLeader(r) // The leader has not been elected yet. if leader == nil { - http.Error(w, "no leader", http.StatusServiceUnavailable) + http.Error(w, errs.ErrRedirectNoLeader.FastGenByArgs().Error(), http.StatusServiceUnavailable) return } // If the leader is the current server now, we can handle the request directly. 
@@ -222,7 +222,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) } else { // Prevent more than one redirection among PD/API servers. - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) return } diff --git a/server/apiv2/middlewares/redirector.go b/server/apiv2/middlewares/redirector.go index 37c06de1585a..9c2c40811753 100644 --- a/server/apiv2/middlewares/redirector.go +++ b/server/apiv2/middlewares/redirector.go @@ -43,8 +43,8 @@ func Redirector() gin.HandlerFunc { // Prevent more than one redirection. if name := c.Request.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotLeader.FastGenByArgs().Error()) return } diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index dfe7a6980c78..65acd8977262 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -40,6 +40,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" pd "github.com/tikv/pd/client" + clierrs "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/retry" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/errs" @@ -528,7 +529,7 @@ func TestGlobalAndLocalTSO(t *testing.T) { re.NotEmpty(cluster.WaitLeader()) _, _, err = cli.GetTS(ctx) re.Error(err) - re.True(pd.IsLeaderChange(err)) + re.True(clierrs.IsLeaderChange(err)) _, _, err = cli.GetTS(ctx) re.NoError(err) re.NoError(failpoint.Disable("github.com/tikv/pd/client/skipUpdateMember")) diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index 25d9516bf632..6d861962d9b6 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" pd "github.com/tikv/pd/client" + clierrs "github.com/tikv/pd/client/errs" "github.com/tikv/pd/pkg/election" "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" @@ -467,8 +468,8 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) dispatchClient( errMsg := err.Error() // Ignore the errors caused by the split and context cancellation. 
if strings.Contains(errMsg, "context canceled") || - strings.Contains(errMsg, "not leader") || - strings.Contains(errMsg, "not served") || + strings.Contains(errMsg, clierrs.NotLeaderErr) || + strings.Contains(errMsg, clierrs.NotServedErr) || strings.Contains(errMsg, "ErrKeyspaceNotAssigned") || strings.Contains(errMsg, "ErrKeyspaceGroupIsMerging") { continue diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 07bcf3ee2a19..9e70a52d11d9 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -662,7 +662,7 @@ func TestNotLeader(t *testing.T) { grpcStatus, ok := status.FromError(err) re.True(ok) re.Equal(codes.Unavailable, grpcStatus.Code()) - re.Equal("not leader", grpcStatus.Message()) + re.ErrorContains(server.ErrNotLeader, grpcStatus.Message()) } func TestStoreVersionChange(t *testing.T) { From fcec1882ec12655b1f1bf31c55f56aaf20dc7dfb Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 4 Jun 2024 16:04:26 +0800 Subject: [PATCH 13/47] server/join: log detailed info when a join failure member is detected (#8243) ref tikv/pd#7983 Log the detailed info when a join failure member is detected to help troubleshoot. Signed-off-by: JmPotato --- server/join/join.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/server/join/join.go b/server/join/join.go index d17110633136..1319dc08d07e 100644 --- a/server/join/join.go +++ b/server/join/join.go @@ -136,7 +136,11 @@ func PrepareJoinCluster(cfg *config.Config) error { existed := false for _, m := range listResp.Members { if len(m.Name) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", m.ID), + zap.Strings("peer-urls", m.PeerURLs), + zap.Strings("client-urls", m.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", m.ID) } if m.Name == cfg.Name { existed = true @@ -184,7 +188,11 @@ func PrepareJoinCluster(cfg *config.Config) error { listSucc = true } if len(n) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", memb.ID), + zap.Strings("peer-urls", memb.PeerURLs), + zap.Strings("client-urls", memb.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", memb.ID) } for _, m := range memb.PeerURLs { pds = append(pds, fmt.Sprintf("%s=%s", n, m)) From 492a8735fd8935a475982a54934800c546524854 Mon Sep 17 00:00:00 2001 From: wuhuizuo Date: Tue, 4 Jun 2024 16:49:25 +0800 Subject: [PATCH 14/47] chore: add prow OWNERS files to control the approvals for critical configuration files (#8218) close tikv/pd#8167 Signed-off-by: wuhuizuo --- OWNERS_ALIASES | 6 ++++++ client/resource_group/controller/OWNERS | 7 +++++++ client/tlsutil/OWNERS | 7 +++++++ conf/OWNERS | 7 +++++++ pkg/encryption/OWNERS | 7 +++++++ pkg/mcs/resourcemanager/server/OWNERS | 7 +++++++ pkg/mcs/scheduling/server/config/OWNERS | 7 +++++++ pkg/mcs/tso/server/OWNERS | 7 +++++++ pkg/schedule/config/OWNERS | 7 +++++++ pkg/schedule/schedulers/OWNERS | 7 +++++++ server/config/OWNERS | 7 +++++++ 11 files changed, 76 insertions(+) create mode 100644 OWNERS_ALIASES create mode 100644 client/resource_group/controller/OWNERS create mode 100644 client/tlsutil/OWNERS create mode 100644 conf/OWNERS create mode 100644 pkg/encryption/OWNERS create mode 100644 
pkg/mcs/resourcemanager/server/OWNERS create mode 100644 pkg/mcs/scheduling/server/config/OWNERS create mode 100644 pkg/mcs/tso/server/OWNERS create mode 100644 pkg/schedule/config/OWNERS create mode 100644 pkg/schedule/schedulers/OWNERS create mode 100644 server/config/OWNERS diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES new file mode 100644 index 000000000000..516a466c91e7 --- /dev/null +++ b/OWNERS_ALIASES @@ -0,0 +1,6 @@ +# Sort the member alphabetically. +aliases: + sig-critical-approvers-config: + - easonn7 + - kevin-xianliu + - niubell diff --git a/client/resource_group/controller/OWNERS b/client/resource_group/controller/OWNERS new file mode 100644 index 000000000000..aa02465dbd9e --- /dev/null +++ b/client/resource_group/controller/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/tlsutil/OWNERS b/client/tlsutil/OWNERS new file mode 100644 index 000000000000..211db06feeec --- /dev/null +++ b/client/tlsutil/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|tlsconfig\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/conf/OWNERS b/conf/OWNERS new file mode 100644 index 000000000000..1a435c490897 --- /dev/null +++ b/conf/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.toml)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/encryption/OWNERS b/pkg/encryption/OWNERS new file mode 100644 index 000000000000..aa02465dbd9e --- /dev/null +++ b/pkg/encryption/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/resourcemanager/server/OWNERS b/pkg/mcs/resourcemanager/server/OWNERS new file mode 100644 index 000000000000..aa02465dbd9e --- /dev/null +++ b/pkg/mcs/resourcemanager/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/scheduling/server/config/OWNERS b/pkg/mcs/scheduling/server/config/OWNERS new file mode 100644 index 000000000000..aa02465dbd9e --- /dev/null +++ b/pkg/mcs/scheduling/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/tso/server/OWNERS b/pkg/mcs/tso/server/OWNERS new file mode 100644 index 000000000000..aa02465dbd9e --- /dev/null +++ b/pkg/mcs/tso/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/config/OWNERS b/pkg/schedule/config/OWNERS new file mode 100644 index 000000000000..ce5d15ddc193 --- /dev/null +++ b/pkg/schedule/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|store_config)\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/schedulers/OWNERS b/pkg/schedule/schedulers/OWNERS new file mode 100644 index 
000000000000..ae96e4f1f422 --- /dev/null +++ b/pkg/schedule/schedulers/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|hot_region_config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/server/config/OWNERS b/server/config/OWNERS new file mode 100644 index 000000000000..179de4843e68 --- /dev/null +++ b/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|service_middleware_config)\\.go)$": + approvers: + - sig-critical-approvers-config From d44d7212b3d3fce03adf0f8420bcd5c2cab7f7b3 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Tue, 4 Jun 2024 19:14:43 +0800 Subject: [PATCH 15/47] ctl: fix https client panic (#8239) * fix https client Signed-off-by: okJiang <819421878@qq.com> * fix comment & add one ut Signed-off-by: okJiang <819421878@qq.com> * EnableTraverseRunHooks Signed-off-by: okJiang <819421878@qq.com> * fix comment Signed-off-by: okJiang <819421878@qq.com> * empty Signed-off-by: okJiang <819421878@qq.com> --------- Signed-off-by: okJiang <819421878@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tools/pd-ctl/pdctl/command/global.go | 72 ++++++++----------- tools/pd-ctl/pdctl/command/global_test.go | 58 ++++++++++++++++ tools/pd-ctl/pdctl/ctl.go | 1 + tools/pd-ctl/tests/health/health_test.go | 84 +++++++++++++++++++++++ 4 files changed, 171 insertions(+), 44 deletions(-) create mode 100644 tools/pd-ctl/pdctl/command/global_test.go diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index f7c04c3ca5cc..b29e2b632785 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -55,23 +55,15 @@ var PDCli pd.Client func requirePDClient(cmd *cobra.Command, _ []string) error { var ( - caPath string - err error + tlsConfig *tls.Config + err error ) - caPath, err = cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - var certPath, keyPath string - certPath, err = cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err = cmd.Flags().GetString("key") - if err != nil { - return err - } - return initNewPDClientWithTLS(cmd, caPath, certPath, keyPath) + tlsConfig, err = parseTLSConfig(cmd) + if err != nil { + return err } - return initNewPDClient(cmd) + + return initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) } // shouldInitPDClient checks whether we should create a new PD client according to the cluster information. @@ -111,44 +103,36 @@ func initNewPDClient(cmd *cobra.Command, opts ...pd.ClientOption) error { return nil } -func initNewPDClientWithTLS(cmd *cobra.Command, caPath, certPath, keyPath string) error { - tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) - if err != nil { - return err - } - initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) - return nil -} - // TODO: replace dialClient with the PD HTTP client completely. 
var dialClient = &http.Client{ Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, PDControlCallerID), } -// RequireHTTPSClient creates a HTTPS client if the related flags are set -func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { +func parseTLSConfig(cmd *cobra.Command) (*tls.Config, error) { caPath, err := cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - certPath, err := cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err := cmd.Flags().GetString("key") - if err != nil { - return err - } - err = initHTTPSClient(caPath, certPath, keyPath) - if err != nil { - cmd.Println(err) - return err - } + if err != nil || len(caPath) == 0 { + return nil, err + } + certPath, err := cmd.Flags().GetString("cert") + if err != nil { + return nil, err + } + keyPath, err := cmd.Flags().GetString("key") + if err != nil { + return nil, err } - return nil -} - -func initHTTPSClient(caPath, certPath, keyPath string) error { tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) if err != nil { + return nil, err + } + + return tlsConfig, nil +} + +// RequireHTTPSClient creates a HTTPS client if the related flags are set +func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { + tlsConfig, err := parseTLSConfig(cmd) + if err != nil || tlsConfig == nil { return err } dialClient = &http.Client{ diff --git a/tools/pd-ctl/pdctl/command/global_test.go b/tools/pd-ctl/pdctl/command/global_test.go new file mode 100644 index 000000000000..86eb4366d045 --- /dev/null +++ b/tools/pd-ctl/pdctl/command/global_test.go @@ -0,0 +1,58 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package command + +import ( + "os" + "os/exec" + "testing" + + "github.com/spf13/cobra" + "github.com/stretchr/testify/require" +) + +func TestParseTLSConfig(t *testing.T) { + re := require.New(t) + + rootCmd := &cobra.Command{ + Use: "pd-ctl", + Short: "Placement Driver control", + SilenceErrors: true, + } + certPath := "../../tests/cert" + rootCmd.Flags().String("cacert", certPath+"/ca.pem", "path of file that contains list of trusted SSL CAs") + rootCmd.Flags().String("cert", certPath+"/client.pem", "path of file that contains X509 certificate in PEM format") + rootCmd.Flags().String("key", certPath+"/client-key.pem", "path of file that contains X509 key in PEM format") + + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + certScript := "../../tests/cert_opt.sh" + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsConfig, err := parseTLSConfig(rootCmd) + re.NoError(err) + re.NotNil(tlsConfig) +} diff --git a/tools/pd-ctl/pdctl/ctl.go b/tools/pd-ctl/pdctl/ctl.go index f8eaff5e76e4..fbacd65dc535 100644 --- a/tools/pd-ctl/pdctl/ctl.go +++ b/tools/pd-ctl/pdctl/ctl.go @@ -30,6 +30,7 @@ import ( func init() { cobra.EnablePrefixMatching = true + cobra.EnableTraverseRunHooks = true } // GetRootCmd is exposed for integration tests. But it can be embedded into another suite, too. diff --git a/tools/pd-ctl/tests/health/health_test.go b/tools/pd-ctl/tests/health/health_test.go index 9150a56c91b2..f1d3c7cfbf11 100644 --- a/tools/pd-ctl/tests/health/health_test.go +++ b/tools/pd-ctl/tests/health/health_test.go @@ -17,14 +17,21 @@ package health_test import ( "context" "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" "testing" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/utils/grpcutil" "github.com/tikv/pd/server/api" "github.com/tikv/pd/server/cluster" + "github.com/tikv/pd/server/config" pdTests "github.com/tikv/pd/tests" ctl "github.com/tikv/pd/tools/pd-ctl/pdctl" "github.com/tikv/pd/tools/pd-ctl/tests" + "go.etcd.io/etcd/pkg/transport" ) func TestHealth(t *testing.T) { @@ -68,3 +75,80 @@ func TestHealth(t *testing.T) { re.NoError(json.Unmarshal(output, &h)) re.Equal(healths, h) } + +func TestHealthTLS(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + certPath := "../cert" + certScript := "../cert_opt.sh" + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsInfo := transport.TLSInfo{ + KeyFile: filepath.Join(certPath, "pd-server-key.pem"), + CertFile: filepath.Join(certPath, "pd-server.pem"), + TrustedCAFile: filepath.Join(certPath, "ca.pem"), + } + tc, err := pdTests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) { + conf.Security.TLSConfig = grpcutil.TLSConfig{ + KeyPath: tlsInfo.KeyFile, + CertPath: tlsInfo.CertFile, + CAPath: tlsInfo.TrustedCAFile, + } + conf.AdvertiseClientUrls = strings.ReplaceAll(conf.AdvertiseClientUrls, "http", "https") + conf.ClientUrls = 
strings.ReplaceAll(conf.ClientUrls, "http", "https") + conf.AdvertisePeerUrls = strings.ReplaceAll(conf.AdvertisePeerUrls, "http", "https") + conf.PeerUrls = strings.ReplaceAll(conf.PeerUrls, "http", "https") + conf.InitialCluster = strings.ReplaceAll(conf.InitialCluster, "http", "https") + }) + re.NoError(err) + defer tc.Destroy() + err = tc.RunInitialServers() + re.NoError(err) + tc.WaitLeader() + cmd := ctl.GetRootCmd() + + client := tc.GetEtcdClient() + members, err := cluster.GetMembers(client) + re.NoError(err) + healthMembers := cluster.CheckHealth(tc.GetHTTPClient(), members) + healths := []api.Health{} + for _, member := range members { + h := api.Health{ + Name: member.Name, + MemberID: member.MemberId, + ClientUrls: member.ClientUrls, + Health: false, + } + if _, ok := healthMembers[member.GetMemberId()]; ok { + h.Health = true + } + healths = append(healths, h) + } + + pdAddr := tc.GetConfig().GetClientURL() + pdAddr = strings.ReplaceAll(pdAddr, "http", "https") + args := []string{"-u", pdAddr, "health", + "--cacert=../cert/ca.pem", + "--cert=../cert/client.pem", + "--key=../cert/client-key.pem"} + output, err := tests.ExecuteCommand(cmd, args...) + re.NoError(err) + h := make([]api.Health, len(healths)) + re.NoError(json.Unmarshal(output, &h)) + re.Equal(healths, h) +} From 82d3a4a241e12d76218e1aba7a5845c1793305b1 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Wed, 5 Jun 2024 11:11:56 +0800 Subject: [PATCH 16/47] grafana: use log2 for y-axis of uptime (#8240) close tikv/pd#8241 Close #8241 grafana: use log2 for y-axis of uptime Signed-off-by: Jack Yu Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- metrics/grafana/pd.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 69afb93f531c..7965a341f6c5 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -2096,7 +2096,7 @@ { "format": "dtdurations", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": "0", "show": true From 301fabbedb64088f794b24809259efffe388d77d Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 5 Jun 2024 11:20:55 +0800 Subject: [PATCH 17/47] tools/simulator: add store api and replace simulator http with SDK (#8245) ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/http/interface.go | 9 ++++ client/http/request_info.go | 1 + tests/integrations/client/http_client_test.go | 34 +++++++++++---- tools/pd-simulator/main.go | 6 +++ tools/pd-simulator/simulator/cases/cases.go | 4 +- .../simulator/cases/diagnose_rule.go | 23 +++++----- tools/pd-simulator/simulator/client.go | 43 ++++--------------- tools/pd-simulator/simulator/config/config.go | 4 +- tools/pd-simulator/simulator/event.go | 6 +++ tools/pd-simulator/simulator/node.go | 1 + 10 files changed, 73 insertions(+), 58 deletions(-) diff --git a/client/http/interface.go b/client/http/interface.go index 11c24beaefd8..3684e19b1f50 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -49,6 +49,7 @@ type Client interface { GetRegionStatusByKeyRange(context.Context, *KeyRange, bool) (*RegionStats, error) GetStores(context.Context) (*StoresInfo, error) GetStore(context.Context, uint64) (*StoreInfo, error) + DeleteStore(context.Context, uint64) error SetStoreLabels(context.Context, int64, map[string]string) error GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ @@ -440,6 +441,14 @@ func (c *client) 
GetStore(ctx context.Context, storeID uint64) (*StoreInfo, erro return &store, nil } +// DeleteStore deletes the store by ID. +func (c *client) DeleteStore(ctx context.Context, storeID uint64) error { + return c.request(ctx, newRequestInfo(). + WithName(deleteStoreName). + WithURI(StoreByID(storeID)). + WithMethod(http.MethodDelete)) +} + // GetClusterVersion gets the cluster version. func (c *client) GetClusterVersion(ctx context.Context) (string, error) { var version string diff --git a/client/http/request_info.go b/client/http/request_info.go index 3fb91c6ca970..40bd03682504 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -39,6 +39,7 @@ const ( getRegionStatusByKeyRangeName = "GetRegionStatusByKeyRange" getStoresName = "GetStores" getStoreName = "GetStore" + deleteStoreName = "DeleteStore" setStoreLabelsName = "SetStoreLabels" getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 9d7e0985940e..f4a48dcd63e1 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -26,6 +26,7 @@ import ( "time" "github.com/pingcap/errors" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" @@ -80,6 +81,15 @@ func (suite *httpClientTestSuite) SetupSuite() { leaderServer := cluster.GetLeaderServer() err = leaderServer.BootstrapCluster() + // Add 2 more stores to the cluster. + for i := 2; i <= 4; i++ { + tests.MustPutStore(re, cluster, &metapb.Store{ + Id: uint64(i), + State: metapb.StoreState_Up, + NodeState: metapb.NodeState_Serving, + LastHeartbeat: time.Now().UnixNano(), + }) + } re.NoError(err) for _, region := range []*core.RegionInfo{ core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), @@ -165,29 +175,29 @@ func (suite *httpClientTestSuite) TestMeta() { re.Empty(regionStats.StoreLeaderCount) hotReadRegions, err := client.GetHotReadRegions(ctx) re.NoError(err) - re.Len(hotReadRegions.AsPeer, 1) - re.Len(hotReadRegions.AsLeader, 1) + re.Len(hotReadRegions.AsPeer, 4) + re.Len(hotReadRegions.AsLeader, 4) hotWriteRegions, err := client.GetHotWriteRegions(ctx) re.NoError(err) - re.Len(hotWriteRegions.AsPeer, 1) - re.Len(hotWriteRegions.AsLeader, 1) + re.Len(hotWriteRegions.AsPeer, 4) + re.Len(hotWriteRegions.AsLeader, 4) historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{ StartTime: 0, EndTime: time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond), }) re.NoError(err) re.Empty(historyHorRegions.HistoryHotRegion) - store, err := client.GetStores(ctx) + stores, err := client.GetStores(ctx) re.NoError(err) - re.Equal(1, store.Count) - re.Len(store.Stores, 1) - storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different? + re.Equal(4, stores.Count) + re.Len(stores.Stores, 4) + storeID := uint64(stores.Stores[0].Store.ID) // TODO: why type is different? 
store2, err := client.GetStore(ctx, storeID) re.NoError(err) re.EqualValues(storeID, store2.Store.ID) version, err := client.GetClusterVersion(ctx) re.NoError(err) - re.Equal("0.0.0", version) + re.Equal("1.0.0", version) rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) re.Equal(int64(0), rgs.Count) rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) @@ -196,6 +206,12 @@ func (suite *httpClientTestSuite) TestMeta() { re.Equal(int64(1), rgs.Count) rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) re.Equal(int64(2), rgs.Count) + // store 2 origin status:offline + err = client.DeleteStore(ctx, 2) + re.NoError(err) + store2, err = client.GetStore(ctx, 2) + re.NoError(err) + re.Equal(int64(metapb.StoreState_Offline), store2.Store.State) } func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() { diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 45b3ecd75c9c..e3dc43ca122e 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -25,6 +25,7 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" flag "github.com/spf13/pflag" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/utils/logutil" @@ -92,6 +93,7 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{*pdAddr}) simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) @@ -105,6 +107,7 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{local.GetAddr()}) simStart(local.GetAddr(), "", simCase, simConfig, clean) } } @@ -183,6 +186,9 @@ EXIT: analysis.GetTransferCounter().PrintResult() } + if simulator.PDHTTPClient != nil { + simulator.PDHTTPClient.Close() + } if simResult != "OK" { os.Exit(1) } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 00b5404669fe..c4e2f9999785 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -16,8 +16,8 @@ package cases import ( "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" @@ -57,7 +57,7 @@ type Case struct { TableNumber int Checker CheckerFunc // To check the schedule is finished. 
- Rules []*placement.Rule + Rules []*pdHttp.Rule Labels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 5d34e051071c..2cd11b9624a0 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -19,6 +19,7 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" @@ -30,15 +31,15 @@ import ( func newRule1(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) - simCase.Rules = append(simCase.Rules, &placement.Rule{ + simCase.Rules = make([]*pdHttp.Rule, 0) + simCase.Rules = append(simCase.Rules, &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Learner, + Role: pdHttp.Learner, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -46,14 +47,14 @@ func newRule1(_ *sc.SimConfig) *Case { }, }, LocationLabels: []string{"host"}, - }, &placement.Rule{ + }, &pdHttp.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, StartKeyHex: "", EndKeyHex: "", - Role: placement.Voter, + Role: pdHttp.Voter, Count: 5, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -130,16 +131,16 @@ func newRule1(_ *sc.SimConfig) *Case { func newRule2(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) + simCase.Rules = make([]*pdHttp.Rule, 0) simCase.Rules = append(simCase.Rules, - &placement.Rule{ + &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Leader, + Role: pdHttp.Leader, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 50ed57995dfb..113eadab5e0f 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -15,11 +15,7 @@ package simulator import ( - "bytes" "context" - "encoding/json" - "fmt" - "net/http" "strings" "sync" "time" @@ -27,8 +23,8 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" @@ -54,12 +50,12 @@ type Client interface { const ( pdTimeout = time.Second maxInitClusterRetries = 100 - httpPrefix = "pd/api/v1" ) var ( // errFailInitClusterID is returned when failed to load clusterID from all supplied PD addresses. 
errFailInitClusterID = errors.New("[pd] failed to get cluster id") + PDHTTPClient pdHttp.Client ) type client struct { @@ -67,7 +63,6 @@ type client struct { tag string clusterID uint64 clientConn *grpc.ClientConn - httpClient *http.Client reportRegionHeartbeatCh chan *core.RegionInfo receiveRegionHeartbeatCh chan *pdpb.RegionHeartbeatResponse @@ -88,7 +83,6 @@ func NewClient(pdAddr string, tag string) (Client, <-chan *pdpb.RegionHeartbeatR ctx: ctx, cancel: cancel, tag: tag, - httpClient: &http.Client{}, } cc, err := c.createConn() if err != nil { @@ -319,46 +313,27 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { - path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) - ruleOps := make([]*placement.RuleOp, 0) + ruleOps := make([]*pdHttp.RuleOp, 0) for _, rule := range config.PlacementRules { - ruleOps = append(ruleOps, &placement.RuleOp{ + ruleOps = append(ruleOps, &pdHttp.RuleOp{ Rule: rule, - Action: placement.RuleOpAdd, + Action: pdHttp.RuleOpAdd, }) } - content, _ := json.Marshal(ruleOps) - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetPlacementRuleInBatch(c.ctx, ruleOps) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add placement rule success", zap.String("rules", string(content))) + simutil.Logger.Info("add placement rule success", zap.Any("rules", config.PlacementRules)) } if len(config.LocationLabels) > 0 { - path := fmt.Sprintf("%s/%s/config", c.url, httpPrefix) data := make(map[string]any) data["location-labels"] = config.LocationLabels - content, err := json.Marshal(data) - if err != nil { - return err - } - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") - if err != nil { - return err - } - res, err := c.httpClient.Do(req) + err := PDHTTPClient.SetConfig(c.ctx, data) if err != nil { return err } - defer res.Body.Close() - simutil.Logger.Info("add location labels success", zap.String("labels", string(content))) + simutil.Logger.Info("add location labels success", zap.Any("labels", config.LocationLabels)) } return nil } diff --git a/tools/pd-simulator/simulator/config/config.go b/tools/pd-simulator/simulator/config/config.go index 01bf8199ab42..6598cf35c0f1 100644 --- a/tools/pd-simulator/simulator/config/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -21,8 +21,8 @@ import ( "github.com/BurntSushi/toml" "github.com/docker/go-units" + pdHttp "github.com/tikv/pd/client/http" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/typeutil" @@ -133,6 +133,6 @@ func (sc *SimConfig) Speed() uint64 { // PDConfig saves some config which may be changed in PD. 
type PDConfig struct { - PlacementRules []*placement.Rule + PlacementRules []*pdHttp.Rule LocationLabels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 8be8f89d759e..20c75b583843 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -216,6 +216,12 @@ func (*DownNode) Run(raft *RaftEngine, _ int64) bool { return false } delete(raft.conn.Nodes, node.Id) + // delete store + err := PDHTTPClient.DeleteStore(context.Background(), node.Id) + if err != nil { + simutil.Logger.Error("put store failed", zap.Uint64("node-id", node.Id), zap.Error(err)) + return false + } node.Stop() regions := raft.GetRegions() diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 883b5d4474b6..c51cdfd8a38e 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -72,6 +72,7 @@ func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) StoreId: s.ID, Capacity: uint64(config.RaftStore.Capacity), StartTime: uint32(time.Now().Unix()), + Available: uint64(config.RaftStore.Capacity), }, } tag := fmt.Sprintf("store %d", s.ID) From 0bf9e90559f3d1efc7ded573505ddd6886f75264 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 5 Jun 2024 16:50:56 +0800 Subject: [PATCH 18/47] storelimit: fix datarace from `getOrCreateStoreLimit` (#8254) close tikv/pd#8253 Signed-off-by: lhy1024 --- pkg/core/storelimit/store_limit.go | 20 ++++++++-- .../operator/operator_controller_test.go | 37 +++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/pkg/core/storelimit/store_limit.go b/pkg/core/storelimit/store_limit.go index 8d70b2918a13..e35ec773d80c 100644 --- a/pkg/core/storelimit/store_limit.go +++ b/pkg/core/storelimit/store_limit.go @@ -17,6 +17,7 @@ package storelimit import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/ratelimit" + "github.com/tikv/pd/pkg/utils/syncutil" ) const ( @@ -106,7 +107,7 @@ func (l *StoreRateLimit) Rate(typ Type) float64 { if l.limits[typ] == nil { return 0.0 } - return l.limits[typ].ratePerSec + return l.limits[typ].GetRatePerSec() } // Take takes count tokens from the bucket without blocking. @@ -128,12 +129,15 @@ func (l *StoreRateLimit) Reset(rate float64, typ Type) { // limit the operators of a store type limit struct { - limiter *ratelimit.RateLimiter - ratePerSec float64 + limiter *ratelimit.RateLimiter + ratePerSecMutex syncutil.RWMutex + ratePerSec float64 } // Reset resets the rate limit. func (l *limit) Reset(ratePerSec float64) { + l.ratePerSecMutex.Lock() + defer l.ratePerSecMutex.Unlock() if l.ratePerSec == ratePerSec { return } @@ -155,6 +159,8 @@ func (l *limit) Reset(ratePerSec float64) { // Available returns the number of available tokens // It returns true if the rate per second is zero. func (l *limit) Available(n int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } @@ -164,8 +170,16 @@ func (l *limit) Available(n int64) bool { // Take takes count tokens from the bucket without blocking. 
func (l *limit) Take(count int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } return l.limiter.AllowN(int(count)) } + +func (l *limit) GetRatePerSec() float64 { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() + return l.ratePerSec +} diff --git a/pkg/schedule/operator/operator_controller_test.go b/pkg/schedule/operator/operator_controller_test.go index d3c50667fe07..2b16516c4c79 100644 --- a/pkg/schedule/operator/operator_controller_test.go +++ b/pkg/schedule/operator/operator_controller_test.go @@ -955,3 +955,40 @@ func (suite *operatorControllerTestSuite) TestInvalidStoreId() { // Although store 3 does not exist in PD, PD can also send op to TiKV. re.Equal(pdpb.OperatorStatus_RUNNING, oc.GetOperatorStatus(1).Status) } + +func TestConcurrentAddOperatorAndSetStoreLimit(t *testing.T) { + re := require.New(t) + opt := mockconfig.NewTestOptions() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tc := mockcluster.NewCluster(ctx, opt) + stream := hbstream.NewTestHeartbeatStreams(ctx, tc.ID, tc, false /* no need to run */) + oc := NewController(ctx, tc.GetBasicCluster(), tc.GetSharedConfig(), stream) + + regionNum := 1000 + limit := 1600.0 + storeID := uint64(2) + for i := 1; i < 4; i++ { + tc.AddRegionStore(uint64(i), regionNum) + tc.SetStoreLimit(uint64(i), storelimit.AddPeer, limit) + } + for i := 1; i <= regionNum; i++ { + tc.AddLeaderRegion(uint64(i), 1, 3, 4) + } + + // Add operator and set store limit concurrently + var wg sync.WaitGroup + for i := 1; i < 10; i++ { + wg.Add(1) + go func(i uint64) { + defer wg.Done() + for j := 1; j < 10; j++ { + regionID := uint64(j) + i*100 + op := NewTestOperator(regionID, tc.GetRegion(regionID).GetRegionEpoch(), OpRegion, AddPeer{ToStore: storeID, PeerID: regionID}) + re.True(oc.AddOperator(op)) + tc.SetStoreLimit(storeID, storelimit.AddPeer, limit-float64(j)) // every goroutine set a different limit + } + }(uint64(i)) + } + wg.Wait() +} From 494c0e956b622a9b3cec8b98180b942cbbe5a0f3 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 6 Jun 2024 10:41:55 +0800 Subject: [PATCH 19/47] tools/simulator: avoid redundant schedule (#8257) close tikv/pd#5290, ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tools/pd-simulator/main.go | 2 ++ tools/pd-simulator/simulator/client.go | 7 +++++++ tools/pd-simulator/simulator/node.go | 3 +++ 3 files changed, 12 insertions(+) diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index e3dc43ca122e..05763cc93b83 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -154,6 +154,8 @@ func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimCon tick := time.NewTicker(tickInterval) defer tick.Stop() sc := make(chan os.Signal, 1) + // halt scheduling + simulator.ChooseToHaltPDSchedule(true) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 113eadab5e0f..0bbbebe46021 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -16,6 +16,7 @@ package simulator import ( "context" + "strconv" "strings" "sync" "time" @@ -366,3 +367,9 @@ func (c *client) requestHeader() *pdpb.RequestHeader { ClusterId: c.clusterID, } } + +func ChooseToHaltPDSchedule(halt bool) { + PDHTTPClient.SetConfig(context.Background(), map[string]any{ + "schedule.halt-scheduling": 
strconv.FormatBool(halt), + }) +} diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index c51cdfd8a38e..fe8dc74a9445 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -172,6 +172,8 @@ func (n *Node) stepTask() { } } +var schedulerCheck sync.Once + func (n *Node) stepHeartBeat() { config := n.raftEngine.storeConfig @@ -182,6 +184,7 @@ func (n *Node) stepHeartBeat() { period = uint64(config.RaftStore.RegionHeartBeatInterval.Duration / config.SimTickInterval.Duration) if n.tick%period == 0 { n.regionHeartBeat() + schedulerCheck.Do(func() { ChooseToHaltPDSchedule(false) }) } } From f69d600f4b6a0c20a8c75b941ee8c055c48f74e7 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 7 Jun 2024 12:53:57 +0800 Subject: [PATCH 20/47] tests/realcluster: using real pd with race (#8270) ref tikv/pd#7298 Signed-off-by: husharp --- tests/integrations/realcluster/deploy.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index d6cd0b27f72e..8cce60e8ee60 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -15,10 +15,12 @@ curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh $TIUP_BIN_DIR update playground cd ../../.. -if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/pd-server" ] && [ ! -e "bin/tiflash" ]; then +if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/tiflash" ]; then color-green "downloading binaries..." color-green "this may take a few minutes, you can also download them manually and put them in the bin directory." + make pd-server WITH_RACE=1 $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \ + --pd.binpath ./bin/pd-server \ > $CUR_PATH/playground.log 2>&1 & else color-green "using existing binaries..." From e767c012fb46d7bd6425a89beb1ccb45d7d94473 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 7 Jun 2024 17:56:58 +0800 Subject: [PATCH 21/47] schedule: fix datarace in `operator.check` (#8264) close tikv/pd#8263 Signed-off-by: lhy1024 --- pkg/schedule/operator/operator.go | 5 ++-- pkg/schedule/operator/operator_controller.go | 2 +- pkg/schedule/operator/operator_test.go | 25 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/pkg/schedule/operator/operator.go b/pkg/schedule/operator/operator.go index de197c4fba40..4d57d4fc6c7a 100644 --- a/pkg/schedule/operator/operator.go +++ b/pkg/schedule/operator/operator.go @@ -376,10 +376,11 @@ func (o *Operator) Check(region *core.RegionInfo) OpStep { defer func() { _ = o.CheckTimeout() }() for step := atomic.LoadInt32(&o.currentStep); int(step) < len(o.steps); step++ { if o.steps[int(step)].IsFinish(region) { - if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, time.Now().UnixNano()) { + current := time.Now() + if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, current.UnixNano()) { startTime, _ := o.getCurrentTimeAndStep() operatorStepDuration.WithLabelValues(reflect.TypeOf(o.steps[int(step)]).Name()). 
- Observe(time.Unix(0, o.stepsTime[step]).Sub(startTime).Seconds()) + Observe(current.Sub(startTime).Seconds()) } atomic.StoreInt32(&o.currentStep, step+1) } else { diff --git a/pkg/schedule/operator/operator_controller.go b/pkg/schedule/operator/operator_controller.go index d63e843f52a1..fe93bd98756a 100644 --- a/pkg/schedule/operator/operator_controller.go +++ b/pkg/schedule/operator/operator_controller.go @@ -461,7 +461,7 @@ func (oc *Controller) checkAddOperator(isPromoting bool, ops ...*Operator) (bool return false, NotInCreateStatus } if !isPromoting && oc.wopStatus.getCount(op.Desc()) >= oc.config.GetSchedulerMaxWaitingOperator() { - log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.ops[op.Desc()]), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) + log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.getCount(op.Desc())), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) operatorCounter.WithLabelValues(op.Desc(), "exceed-max-waiting").Inc() return false, ExceedWaitLimit } diff --git a/pkg/schedule/operator/operator_test.go b/pkg/schedule/operator/operator_test.go index 693f5c174753..1f44d813f1e5 100644 --- a/pkg/schedule/operator/operator_test.go +++ b/pkg/schedule/operator/operator_test.go @@ -17,6 +17,7 @@ package operator import ( "context" "encoding/json" + "sync" "sync/atomic" "testing" "time" @@ -570,3 +571,27 @@ func (suite *operatorTestSuite) TestToJSONObject() { obj = op.ToJSONObject() suite.Equal(TIMEOUT, obj.Status) } + +func TestOperatorCheckConcurrently(t *testing.T) { + re := require.New(t) + region := newTestRegion(1, 1, [2]uint64{1, 1}, [2]uint64{2, 2}) + // addPeer1, transferLeader1, removePeer3 + steps := []OpStep{ + AddPeer{ToStore: 1, PeerID: 1}, + TransferLeader{FromStore: 3, ToStore: 1}, + RemovePeer{FromStore: 3}, + } + op := NewTestOperator(1, &metapb.RegionEpoch{}, OpAdmin|OpLeader|OpRegion, steps...) + re.Equal(constant.Urgent, op.GetPriorityLevel()) + checkSteps(re, op, steps) + op.Start() + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func() { + defer wg.Done() + re.Nil(op.Check(region)) + }() + } + wg.Wait() +} From c015f140f49a47e60b3884a9179e79e92482461c Mon Sep 17 00:00:00 2001 From: Hu# Date: Tue, 11 Jun 2024 13:20:59 +0800 Subject: [PATCH 22/47] rc: fix group change will meet data race (#8268) close tikv/pd#8267 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/resourcemanager/server/manager.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index ef402b8cbf98..418d188823f4 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -129,7 +129,9 @@ func (m *Manager) Init(ctx context.Context) error { return err } // Load resource group meta info from storage. 
+ m.Lock() m.groups = make(map[string]*ResourceGroup) + m.Unlock() handler := func(k, v string) { group := &rmpb.ResourceGroup{} if err := proto.Unmarshal([]byte(v), group); err != nil { From 934816460a04cf674e79271b7099ddcc1ed35326 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 12 Jun 2024 20:42:00 +0800 Subject: [PATCH 23/47] mcs: add more comments about scheduler redirect (#8279) ref tikv/pd#5839 Signed-off-by: lhy1024 --- server/api/server.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/api/server.go b/server/api/server.go index ad614593b2fc..7b7066c4f779 100644 --- a/server/api/server.go +++ b/server/api/server.go @@ -51,7 +51,7 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP // "/checker/{name}", http.MethodPost // "/checker/{name}", http.MethodGet // "/schedulers", http.MethodGet - // "/schedulers/{name}", http.MethodPost + // "/schedulers/{name}", http.MethodPost, which is to be used to pause or resume the scheduler rather than create a new scheduler // "/schedulers/diagnostic/{name}", http.MethodGet // "/scheduler-config", http.MethodGet // "/hotspot/regions/read", http.MethodGet @@ -62,6 +62,8 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP // Following requests are **not** redirected: // "/schedulers", http.MethodPost // "/schedulers/{name}", http.MethodDelete + // Because the writing of all the config of the scheduling service is in the API server, + // we should not post and delete the scheduler directly in the scheduling service. router.PathPrefix(apiPrefix).Handler(negroni.New( serverapi.NewRuntimeServiceValidator(svr, group), serverapi.NewRedirector(svr, @@ -163,7 +165,7 @@ func NewHandler(_ context.Context, svr *server.Server) (http.Handler, apiutil.AP mcs.SchedulingServiceName, []string{http.MethodGet}), serverapi.MicroserviceRedirectRule( - prefix+"/schedulers/", // Note: this means "/schedulers/{name}" + prefix+"/schedulers/", // Note: this means "/schedulers/{name}", which is to be used to pause or resume the scheduler scheapi.APIPathPrefix+"/schedulers", mcs.SchedulingServiceName, []string{http.MethodPost}), From bf02fb5515267af20378b3d6db3c63181ddc0db5 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Thu, 13 Jun 2024 10:56:59 +0800 Subject: [PATCH 24/47] pkg: remove old duplicated task (#8234) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/region.go | 10 +++--- pkg/mcs/scheduling/server/cluster.go | 22 ++++++------ pkg/ratelimit/runner.go | 51 ++++++++++++++++++---------- pkg/ratelimit/runner_test.go | 34 ++++++++++++++++--- server/cluster/cluster.go | 25 +++++++------- 5 files changed, 91 insertions(+), 51 deletions(-) diff --git a/pkg/core/region.go b/pkg/core/region.go index 19c1d0d4794d..5d7880823e80 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -16,7 +16,6 @@ package core import ( "bytes" - "context" "encoding/hex" "fmt" "math" @@ -751,21 +750,22 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { logRunner := ctx.LogRunner // print log asynchronously debug, info := d, i + regionID := region.GetID() if logRunner != nil { debug = func(msg string, fields ...zap.Field) { logRunner.RunTask( - ctx.Context, + regionID, "DebugLog", - func(_ context.Context) { + func() { d(msg, fields...) 
}, ) } info = func(msg string, fields ...zap.Field) { logRunner.RunTask( - ctx.Context, + regionID, "InfoLog", - func(_ context.Context) { + func() { i(msg, fields...) }, ) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index caaafe42c87e..5fac3e1604ae 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -607,15 +607,15 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. _, saveCache, _, retained := core.GenerateRegionGuideFunc(true)(ctx, region, origin) - + regionID := region.GetID() if !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.ObserveRegionStatsAsync, - func(_ context.Context) { + func() { if c.regionStats.RegionStatsNeedUpdate(region) { cluster.Collect(c, region, hasRegionStats) } @@ -625,9 +625,9 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c // region is not updated to the subtree. if origin.GetRef() < 2 { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(true), @@ -649,18 +649,18 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c return err } ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(retained), ) tracer.OnUpdateSubTreeFinished() ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.HandleOverlaps, - func(_ context.Context) { + func() { cluster.HandleOverlaps(c, overlaps) }, ) @@ -668,9 +668,9 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c tracer.OnSaveCacheFinished() // handle region stats ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.CollectRegionStatsAsync, - func(_ context.Context) { + func() { cluster.Collect(c, region, hasRegionStats) }, ) diff --git a/pkg/ratelimit/runner.go b/pkg/ratelimit/runner.go index 17a45067f3dd..2d88e36106e3 100644 --- a/pkg/ratelimit/runner.go +++ b/pkg/ratelimit/runner.go @@ -42,16 +42,16 @@ const ( // Runner is the interface for running tasks. type Runner interface { - RunTask(ctx context.Context, name string, f func(context.Context), opts ...TaskOption) error + RunTask(id uint64, name string, f func(), opts ...TaskOption) error Start() Stop() } // Task is a task to be run. type Task struct { - ctx context.Context + id uint64 submittedAt time.Time - f func(context.Context) + f func() name string // retained indicates whether the task should be dropped if the task queue exceeds maxPendingDuration. retained bool @@ -60,17 +60,22 @@ type Task struct { // ErrMaxWaitingTasksExceeded is returned when the number of waiting tasks exceeds the maximum. var ErrMaxWaitingTasksExceeded = errors.New("max waiting tasks exceeded") -// ConcurrentRunner is a simple task runner that limits the number of concurrent tasks. 
+type taskID struct { + id uint64 + name string +} + type ConcurrentRunner struct { name string limiter *ConcurrencyLimiter maxPendingDuration time.Duration taskChan chan *Task - pendingTasks []*Task pendingMu sync.Mutex stopChan chan struct{} wg sync.WaitGroup - pendingTaskCount map[string]int64 + pendingTaskCount map[string]int + pendingTasks []*Task + existTasks map[taskID]*Task maxWaitingDuration prometheus.Gauge } @@ -82,7 +87,8 @@ func NewConcurrentRunner(name string, limiter *ConcurrencyLimiter, maxPendingDur maxPendingDuration: maxPendingDuration, taskChan: make(chan *Task), pendingTasks: make([]*Task, 0, initialCapacity), - pendingTaskCount: make(map[string]int64), + pendingTaskCount: make(map[string]int), + existTasks: make(map[taskID]*Task), maxWaitingDuration: RunnerTaskMaxWaitingDuration.WithLabelValues(name), } return s @@ -101,6 +107,7 @@ func (cr *ConcurrentRunner) Start() { cr.stopChan = make(chan struct{}) cr.wg.Add(1) ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() go func() { defer cr.wg.Done() for { @@ -139,7 +146,7 @@ func (cr *ConcurrentRunner) Start() { func (cr *ConcurrentRunner) run(task *Task, token *TaskToken) { start := time.Now() - task.f(task.ctx) + task.f() if token != nil { cr.limiter.ReleaseToken(token) cr.processPendingTasks() @@ -157,6 +164,7 @@ func (cr *ConcurrentRunner) processPendingTasks() { case cr.taskChan <- task: cr.pendingTasks = cr.pendingTasks[1:] cr.pendingTaskCount[task.name]-- + delete(cr.existTasks, taskID{id: task.id, name: task.name}) default: } return @@ -170,11 +178,12 @@ func (cr *ConcurrentRunner) Stop() { } // RunTask runs the task asynchronously. -func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(context.Context), opts ...TaskOption) error { +func (cr *ConcurrentRunner) RunTask(id uint64, name string, f func(), opts ...TaskOption) error { task := &Task{ - ctx: ctx, - name: name, - f: f, + id: id, + name: name, + f: f, + submittedAt: time.Now(), } for _, opt := range opts { opt(task) @@ -187,7 +196,15 @@ func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(con }() pendingTaskNum := len(cr.pendingTasks) + tid := taskID{task.id, task.name} if pendingTaskNum > 0 { + // Here we use a map to find the task with the same ID. + // Then replace the old task with the new one. + if t, ok := cr.existTasks[tid]; ok { + t.f = f + t.submittedAt = time.Now() + return nil + } if !task.retained { maxWait := time.Since(cr.pendingTasks[0].submittedAt) if maxWait > cr.maxPendingDuration { @@ -195,15 +212,13 @@ func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(con return ErrMaxWaitingTasksExceeded } } - // We use the max task number to limit the memory usage. - // It occupies around 1.5GB memory when there is 20000000 pending task. - if len(cr.pendingTasks) > maxPendingTaskNum { + if pendingTaskNum > maxPendingTaskNum { RunnerFailedTasks.WithLabelValues(cr.name, task.name).Inc() return ErrMaxWaitingTasksExceeded } } - task.submittedAt = time.Now() cr.pendingTasks = append(cr.pendingTasks, task) + cr.existTasks[tid] = task cr.pendingTaskCount[task.name]++ return nil } @@ -217,8 +232,8 @@ func NewSyncRunner() *SyncRunner { } // RunTask runs the task synchronously. 
-func (*SyncRunner) RunTask(ctx context.Context, _ string, f func(context.Context), _ ...TaskOption) error { - f(ctx) +func (*SyncRunner) RunTask(_ uint64, _ string, f func(), _ ...TaskOption) error { + f() return nil } diff --git a/pkg/ratelimit/runner_test.go b/pkg/ratelimit/runner_test.go index 0241536686b8..0335a78bcbe6 100644 --- a/pkg/ratelimit/runner_test.go +++ b/pkg/ratelimit/runner_test.go @@ -15,7 +15,6 @@ package ratelimit import ( - "context" "sync" "testing" "time" @@ -34,9 +33,9 @@ func TestConcurrentRunner(t *testing.T) { time.Sleep(50 * time.Millisecond) wg.Add(1) err := runner.RunTask( - context.Background(), + uint64(i), "test1", - func(context.Context) { + func() { defer wg.Done() time.Sleep(100 * time.Millisecond) }, @@ -54,9 +53,9 @@ func TestConcurrentRunner(t *testing.T) { for i := 0; i < 10; i++ { wg.Add(1) err := runner.RunTask( - context.Background(), + uint64(i), "test2", - func(context.Context) { + func() { defer wg.Done() time.Sleep(100 * time.Millisecond) }, @@ -74,4 +73,29 @@ func TestConcurrentRunner(t *testing.T) { } wg.Wait() }) + + t.Run("DuplicatedTask", func(t *testing.T) { + runner := NewConcurrentRunner("test", NewConcurrencyLimiter(1), time.Minute) + runner.Start() + defer runner.Stop() + for i := 1; i < 11; i++ { + regionID := uint64(i) + if i == 10 { + regionID = 4 + } + err := runner.RunTask( + regionID, + "test3", + func() { + time.Sleep(time.Second) + }, + ) + require.NoError(t, err) + time.Sleep(1 * time.Millisecond) + } + + updatedSubmitted := runner.pendingTasks[1].submittedAt + lastSubmitted := runner.pendingTasks[len(runner.pendingTasks)-1].submittedAt + require.Greater(t, updatedSubmitted, lastSubmitted) + }) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 70d6b46b9803..be0ba39b8996 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1038,6 +1038,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // Save to cache if meta or leader is updated, or contains any down/pending peer. saveKV, saveCache, needSync, retained := regionGuide(ctx, region, origin) tracer.OnRegionGuideFinished() + regionID := region.GetID() if !saveKV && !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. @@ -1046,9 +1047,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // We need to think of a better way to reduce this part of the cost in the future. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.ObserveRegionStatsAsync, - func(_ context.Context) { + func() { if c.regionStats.RegionStatsNeedUpdate(region) { cluster.Collect(c, region, hasRegionStats) } @@ -1058,9 +1059,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // region is not updated to the subtree. 
if origin.GetRef() < 2 { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(true), @@ -1086,9 +1087,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio return err } ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(retained), @@ -1097,9 +1098,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.HandleOverlaps, - func(_ context.Context) { + func() { cluster.HandleOverlaps(c, overlaps) }, ) @@ -1110,9 +1111,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnSaveCacheFinished() // handle region stats ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.CollectRegionStatsAsync, - func(_ context.Context) { + func() { // TODO: Due to the accuracy requirements of the API "/regions/check/xxx", // region stats needs to be collected in API mode. // We need to think of a better way to reduce this part of the cost in the future. @@ -1124,9 +1125,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio if c.storage != nil { if saveKV { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.SaveRegionToKV, - func(_ context.Context) { + func() { // If there are concurrent heartbeats from the same region, the last write will win even if // writes to storage in the critical area. So don't use mutex to protect it. // Not successfully saved to storage is not fatal, it only leads to longer warm-up From c75e98bd5a601f8ff531a6b389f2e0c9f95ea333 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 13 Jun 2024 11:21:00 +0800 Subject: [PATCH 25/47] *: fix error file permissions (#8280) ref tikv/pd#4399 Signed-off-by: lhy1024 --- client/resource_group/controller/controller.go | 0 pkg/storage/leveldb_backend.go | 0 pkg/utils/apiutil/serverapi/middleware.go | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 client/resource_group/controller/controller.go mode change 100755 => 100644 pkg/storage/leveldb_backend.go mode change 100755 => 100644 pkg/utils/apiutil/serverapi/middleware.go diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go old mode 100755 new mode 100644 diff --git a/pkg/storage/leveldb_backend.go b/pkg/storage/leveldb_backend.go old mode 100755 new mode 100644 diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go old mode 100755 new mode 100644 From e52f5be81d559595a731c9b1ca235d33b95fc978 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 13 Jun 2024 14:45:00 +0800 Subject: [PATCH 26/47] server, metrics: let server TSO handle duration including failed requests (#8282) ref tikv/pd#8281 - Delete the 99.999% percentile data because it does not correspond with TiDB and is too tail-end, which can easily mislead. - Emphasize PD server/client in the panel title. - Add corresponding 90/99/99.9% percentile data on the client handle duration for easier comparison. - The PD server TSO handle duration now includes the failed requests, directly reflecting TSO HA anomalies in the monitoring data. 
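In the diff below, the last point amounts to a one-line reordering in server/grpc_service.go: the histogram observation moves ahead of the error check, so a failed HandleRequest call is still recorded. A condensed sketch of the resulting flow (the start timestamp and the surrounding tracing code are assumed from context and not shown in full here):

    ts, err := s.tsoAllocatorManager.HandleRequest(ctx, request.GetDcLocation(), count)
    // Observe before returning on error so that failed requests also show up
    // in the pd_server_handle_tso_duration_seconds histogram.
    tsoHandleDuration.Observe(time.Since(start).Seconds())
    if err != nil {
        return status.Errorf(codes.Unknown, err.Error())
    }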
Signed-off-by: JmPotato --- metrics/grafana/pd.json | 50 +++++++++++++++++++++-------------------- server/grpc_service.go | 2 +- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 7965a341f6c5..abfe049b905e 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -10633,20 +10633,13 @@ "refId": "C", "step": 2 }, - { - "expr": "histogram_quantile(0.99999, sum(rate(pd_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.999% tso", - "refId": "D" - }, { "expr": "histogram_quantile(0.90, sum(rate(tso_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "90% tso", - "refId": "E", + "refId": "D", "step": 2 }, { @@ -10655,7 +10648,7 @@ "hide": false, "intervalFactor": 2, "legendFormat": "99% tso", - "refId": "F", + "refId": "E", "step": 2 }, { @@ -10664,22 +10657,15 @@ "hide": false, "intervalFactor": 2, "legendFormat": "99.9% tso", - "refId": "G", + "refId": "F", "step": 2 - }, - { - "expr": "histogram_quantile(0.99999, sum(rate(tso_server_handle_tso_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "99.999% tso", - "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "PD server TSO handle time", + "title": "PD server TSO handle duration", "tooltip": { "msResolution": false, "shared": true, @@ -10766,26 +10752,42 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.98, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", - "hide": false, + "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type)", "intervalFactor": 2, - "legendFormat": "{{type}} 98th percentile", + "legendFormat": "avg {{type}}", "refId": "A", "step": 2 }, { - "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type)", + "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "hide": false, "intervalFactor": 2, - "legendFormat": "{{type}} average", + "legendFormat": "90% {{type}}", "refId": "B", "step": 2 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "99% {{type}}", + "refId": "C", + "step": 2 + }, + { + "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "99.9% {{type}}", + "refId": "D", + "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Handle requests duration", + "title": "PD client requests handle duration", "tooltip": { "msResolution": false, "shared": true, diff --git a/server/grpc_service.go b/server/grpc_service.go index acfc87fcf718..5bc1f1109651 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -578,10 +578,10 @@ func (s *GrpcServer) Tso(stream pdpb.PD_TsoServer) error { ctx, task := trace.NewTask(ctx, "tso") ts, err := s.tsoAllocatorManager.HandleRequest(ctx, request.GetDcLocation(), count) task.End() + tsoHandleDuration.Observe(time.Since(start).Seconds()) if err != nil { return status.Errorf(codes.Unknown, err.Error()) } - tsoHandleDuration.Observe(time.Since(start).Seconds()) response := &pdpb.TsoResponse{ Header: s.header(), Timestamp: &ts, From 14c68f6e2af2842ed49d5955dc568c1a1c443171 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 13 Jun 2024 15:00:00 +0800 Subject: [PATCH 27/47] api: supports `/regions/key` by hex key (#8262) close tikv/pd#8261 api: supports `/regions/key` by hex key Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/region_tree.go | 2 +- pkg/utils/apiutil/apiutil.go | 24 +++++++++++++++++++++ pkg/utils/apiutil/apiutil_test.go | 36 +++++++++++++++++++++++++++++++ server/api/region.go | 33 ++++++++++++++-------------- server/api/region_test.go | 35 ++++++++++++++++++++++++++++++ 5 files changed, 113 insertions(+), 17 deletions(-) diff --git a/pkg/core/region_tree.go b/pkg/core/region_tree.go index d4ef4a880fc8..cf146c05f907 100644 --- a/pkg/core/region_tree.go +++ b/pkg/core/region_tree.go @@ -272,7 +272,7 @@ func (t *regionTree) find(item *regionItem) *regionItem { // until f return false func (t *regionTree) scanRange(startKey []byte, f func(*RegionInfo) bool) { region := &RegionInfo{meta: &metapb.Region{StartKey: startKey}} - // find if there is a region with key range [s, d), s < startKey < d + // find if there is a region with key range [s, d), s <= startKey < d fn := func(item *regionItem) bool { r := item return f(r.RegionInfo) diff --git a/pkg/utils/apiutil/apiutil.go b/pkg/utils/apiutil/apiutil.go index d0745ada2717..20465d8376c9 100644 --- a/pkg/utils/apiutil/apiutil.go +++ b/pkg/utils/apiutil/apiutil.go @@ -336,6 +336,30 @@ func ParseKey(name string, input map[string]any) ([]byte, string, error) { return returned, rawKey, nil } +// ParseHexKeys decodes hexadecimal src into DecodedLen(len(src)) bytes if the format is "hex". +// +// ParseHexKeys expects that each key contains only +// hexadecimal characters and each key has even length. +// If existing one key is malformed, ParseHexKeys returns +// the original bytes. +func ParseHexKeys(format string, keys [][]byte) (decodedBytes [][]byte, err error) { + if format != "hex" { + return keys, nil + } + + for _, key := range keys { + // We can use the source slice itself as the destination + // because the decode loop increments by one and then the 'seen' byte is not used anymore. + // Reference to hex.DecodeString() + n, err := hex.Decode(key, key) + if err != nil { + return keys, err + } + decodedBytes = append(decodedBytes, key[:n]) + } + return decodedBytes, nil +} + // ReadJSON reads a JSON data from r and then closes it. 
// An error due to invalid json will be returned as a JSONError func ReadJSON(r io.ReadCloser, data any) error { diff --git a/pkg/utils/apiutil/apiutil_test.go b/pkg/utils/apiutil/apiutil_test.go index aee21621dd23..3e8a998d5fd6 100644 --- a/pkg/utils/apiutil/apiutil_test.go +++ b/pkg/utils/apiutil/apiutil_test.go @@ -204,3 +204,39 @@ func TestGetIPPortFromHTTPRequest(t *testing.T) { re.Equal(testCase.port, port, "case %d", idx) } } + +func TestParseHexKeys(t *testing.T) { + re := require.New(t) + // Test for hex format + hexBytes := [][]byte{[]byte(""), []byte("67"), []byte("0001020304050607"), []byte("08090a0b0c0d0e0f"), []byte("f0f1f2f3f4f5f6f7")} + parseKeys, err := ParseHexKeys("hex", hexBytes) + re.NoError(err) + expectedBytes := [][]byte{[]byte(""), []byte("g"), []byte("\x00\x01\x02\x03\x04\x05\x06\x07"), []byte("\x08\t\n\x0b\x0c\r\x0e\x0f"), []byte("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7")} + re.Equal(expectedBytes, parseKeys) + // Test for other format NOT hex + hexBytes = [][]byte{[]byte("hello")} + parseKeys, err = ParseHexKeys("other", hexBytes) + re.NoError(err) + re.Len(parseKeys, 1) + re.Equal([]byte("hello"), parseKeys[0]) + // Test for wrong key + hexBytes = [][]byte{[]byte("world")} + parseKeys, err = ParseHexKeys("hex", hexBytes) + re.Error(err) + re.Len(parseKeys, 1) + re.Equal([]byte("world"), parseKeys[0]) + // Test for the first key is not valid, but the second key is valid + hexBytes = [][]byte{[]byte("world"), []byte("0001020304050607")} + parseKeys, err = ParseHexKeys("hex", hexBytes) + re.Error(err) + re.Len(parseKeys, 2) + re.Equal([]byte("world"), parseKeys[0]) + re.NotEqual([]byte("\x00\x01\x02\x03\x04\x05\x06\x07"), parseKeys[1]) + // Test for the first key is valid, but the second key is not valid + hexBytes = [][]byte{[]byte("0001020304050607"), []byte("world")} + parseKeys, err = ParseHexKeys("hex", hexBytes) + re.Error(err) + re.Len(parseKeys, 2) + re.NotEqual([]byte("\x00\x01\x02\x03\x04\x05\x06\x07"), parseKeys[0]) + re.Equal([]byte("world"), parseKeys[1]) +} diff --git a/server/api/region.go b/server/api/region.go index 974b5e4fa120..c6bc3d9e6996 100644 --- a/server/api/region.go +++ b/server/api/region.go @@ -16,7 +16,6 @@ package api import ( "container/heap" - "encoding/hex" "fmt" "net/http" "net/url" @@ -86,24 +85,20 @@ func (h *regionHandler) GetRegionByID(w http.ResponseWriter, r *http.Request) { func (h *regionHandler) GetRegion(w http.ResponseWriter, r *http.Request) { rc := getCluster(r) vars := mux.Vars(r) - key := vars["key"] - key, err := url.QueryUnescape(key) + key, err := url.QueryUnescape(vars["key"]) if err != nil { h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } // decode hex if query has params with hex format - formatStr := r.URL.Query().Get("format") - if formatStr == "hex" { - keyBytes, err := hex.DecodeString(key) - if err != nil { - h.rd.JSON(w, http.StatusBadRequest, err.Error()) - return - } - key = string(keyBytes) + paramsByte := [][]byte{[]byte(key)} + paramsByte, err = apiutil.ParseHexKeys(r.URL.Query().Get("format"), paramsByte) + if err != nil { + h.rd.JSON(w, http.StatusBadRequest, err.Error()) + return } - regionInfo := rc.GetRegionByKey([]byte(key)) + regionInfo := rc.GetRegionByKey(paramsByte[0]) b, err := response.MarshalRegionInfoJSON(r.Context(), regionInfo) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) @@ -174,15 +169,21 @@ func (h *regionsHandler) GetRegions(w http.ResponseWriter, r *http.Request) { // @Router /regions/key [get] func (h *regionsHandler) ScanRegions(w 
http.ResponseWriter, r *http.Request) { rc := getCluster(r) - startKey := r.URL.Query().Get("key") - endKey := r.URL.Query().Get("end_key") - limit, err := h.AdjustLimit(r.URL.Query().Get("limit")) + query := r.URL.Query() + paramsByte := [][]byte{[]byte(query.Get("key")), []byte(query.Get("end_key"))} + paramsByte, err := apiutil.ParseHexKeys(query.Get("format"), paramsByte) + if err != nil { + h.rd.JSON(w, http.StatusBadRequest, err.Error()) + return + } + + limit, err := h.AdjustLimit(query.Get("limit")) if err != nil { h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - regions := rc.ScanRegions([]byte(startKey), []byte(endKey), limit) + regions := rc.ScanRegions(paramsByte[0], paramsByte[1], limit) b, err := response.MarshalRegionsInfoJSON(r.Context(), regions) if err != nil { h.rd.JSON(w, http.StatusInternalServerError, err.Error()) diff --git a/server/api/region_test.go b/server/api/region_test.go index 0e5dcd976783..886322321751 100644 --- a/server/api/region_test.go +++ b/server/api/region_test.go @@ -468,6 +468,41 @@ func (suite *getRegionTestSuite) TestScanRegionByKeys() { for i, v := range regionIDs { re.Equal(regions.Regions[i].ID, v) } + url = fmt.Sprintf("%s/regions/key?key=%s&format=hex", suite.urlPrefix, hex.EncodeToString([]byte("b"))) + regionIDs = []uint64{3, 4, 5, 99} + regions = &response.RegionsInfo{} + err = tu.ReadGetJSON(re, testDialClient, url, regions) + re.NoError(err) + re.Len(regionIDs, regions.Count) + for i, v := range regionIDs { + re.Equal(regions.Regions[i].ID, v) + } + url = fmt.Sprintf("%s/regions/key?key=%s&end_key=%s&format=hex", + suite.urlPrefix, hex.EncodeToString([]byte("b")), hex.EncodeToString([]byte("g"))) + regionIDs = []uint64{3, 4} + regions = &response.RegionsInfo{} + err = tu.ReadGetJSON(re, testDialClient, url, regions) + re.NoError(err) + re.Len(regionIDs, regions.Count) + for i, v := range regionIDs { + re.Equal(regions.Regions[i].ID, v) + } + url = fmt.Sprintf("%s/regions/key?key=%s&end_key=%s&format=hex", + suite.urlPrefix, hex.EncodeToString([]byte("b")), hex.EncodeToString([]byte{0xFF, 0xFF, 0xCC})) + regionIDs = []uint64{3, 4, 5, 99} + regions = &response.RegionsInfo{} + err = tu.ReadGetJSON(re, testDialClient, url, regions) + re.NoError(err) + re.Len(regionIDs, regions.Count) + for i, v := range regionIDs { + re.Equal(regions.Regions[i].ID, v) + } + // test invalid key + url = fmt.Sprintf("%s/regions/key?key=%s&format=hex", suite.urlPrefix, "invalid") + err = tu.CheckGetJSON(testDialClient, url, nil, + tu.Status(re, http.StatusBadRequest), + tu.StringEqual(re, "\"encoding/hex: invalid byte: U+0069 'i'\"\n")) + re.NoError(err) } // Start a new test suite to prevent from being interfered by other tests. 
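The patch above routes both the single-region lookup and the range scan through the new apiutil.ParseHexKeys helper. A minimal sketch of the scan path, slightly simplified from the ScanRegions handler in server/api/region.go (limit handling and other error paths shortened):

    // Start and end keys are decoded in one call; for any format other
    // than "hex" the helper returns the raw bytes unchanged.
    query := r.URL.Query()
    paramsByte := [][]byte{[]byte(query.Get("key")), []byte(query.Get("end_key"))}
    paramsByte, err := apiutil.ParseHexKeys(query.Get("format"), paramsByte)
    if err != nil {
        h.rd.JSON(w, http.StatusBadRequest, err.Error())
        return
    }
    regions := rc.ScanRegions(paramsByte[0], paramsByte[1], limit)

With this in place, a request like GET /regions/key?key=62&end_key=67&format=hex scans the raw range ["b", "g"), which is what the hex-encoded cases added to region_test.go exercise.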
From 3e34b803155529e7a1f42c667baa3ba4eb0a724e Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Thu, 13 Jun 2024 17:28:30 +0800 Subject: [PATCH 28/47] *: optimize memory usage (#8164) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/cluster/cluster.go | 22 ++- pkg/core/region.go | 91 ++++----- pkg/core/region_test.go | 8 +- pkg/core/region_tree.go | 14 +- pkg/core/region_tree_test.go | 4 +- pkg/mcs/scheduling/server/cluster.go | 16 +- pkg/mcs/scheduling/server/grpc_service.go | 3 +- pkg/schedule/checker/rule_checker_test.go | 4 +- pkg/schedule/schedulers/hot_region_test.go | 6 +- pkg/schedule/schedulers/scheduler_test.go | 9 +- pkg/statistics/hot_cache.go | 90 ++++++--- pkg/statistics/hot_cache_task.go | 204 --------------------- pkg/statistics/hot_peer_cache.go | 70 +++---- pkg/statistics/hot_peer_cache_test.go | 66 +++---- server/cluster/cluster.go | 16 +- server/cluster/cluster_test.go | 66 ++++++- server/grpc_service.go | 6 +- tests/server/cluster/cluster_test.go | 12 +- tools/pd-ctl/tests/hot/hot_test.go | 8 +- 19 files changed, 322 insertions(+), 393 deletions(-) delete mode 100644 pkg/statistics/hot_cache_task.go diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index ab97c7899db9..2cf5787646a8 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -33,9 +33,25 @@ type Cluster interface { // HandleStatsAsync handles the flow asynchronously. func HandleStatsAsync(c Cluster, region *core.RegionInfo) { - c.GetHotStat().CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) - c.GetHotStat().CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - c.GetHotStat().CheckWriteAsync(statistics.NewCheckWritePeerTask(region)) + checkWritePeerTask := func(cache *statistics.HotPeerCache) { + reportInterval := region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + stats := cache.CheckPeerFlow(region, region.GetPeers(), region.GetWriteLoads(), interval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + + checkExpiredTask := func(cache *statistics.HotPeerCache) { + expiredStats := cache.CollectExpiredItems(region) + for _, stat := range expiredStats { + cache.UpdateStat(stat) + } + } + + c.GetHotStat().CheckWriteAsync(checkExpiredTask) + c.GetHotStat().CheckReadAsync(checkExpiredTask) + c.GetHotStat().CheckWriteAsync(checkWritePeerTask) c.GetCoordinator().GetSchedulersController().CheckTransferWitnessLeader(region) } diff --git a/pkg/core/region.go b/pkg/core/region.go index 5d7880823e80..7f70d7182856 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -55,7 +55,6 @@ func errRegionIsStale(region *metapb.Region, origin *metapb.Region) error { // the properties are Read-Only once created except buckets. // the `buckets` could be modified by the request `report buckets` with greater version. type RegionInfo struct { - term uint64 meta *metapb.Region learners []*metapb.Peer witnesses []*metapb.Peer @@ -63,6 +62,7 @@ type RegionInfo struct { leader *metapb.Peer downPeers []*pdpb.PeerStats pendingPeers []*metapb.Peer + term uint64 cpuUsage uint64 writtenBytes uint64 writtenKeys uint64 @@ -136,26 +136,22 @@ func NewRegionInfo(region *metapb.Region, leader *metapb.Peer, opts ...RegionCre // classifyVoterAndLearner sorts out voter and learner from peers into different slice. 
func classifyVoterAndLearner(region *RegionInfo) { - learners := make([]*metapb.Peer, 0, 1) - voters := make([]*metapb.Peer, 0, len(region.meta.Peers)) - witnesses := make([]*metapb.Peer, 0, 1) + region.learners = make([]*metapb.Peer, 0, 1) + region.voters = make([]*metapb.Peer, 0, len(region.meta.Peers)) + region.witnesses = make([]*metapb.Peer, 0, 1) for _, p := range region.meta.Peers { if IsLearner(p) { - learners = append(learners, p) + region.learners = append(region.learners, p) } else { - voters = append(voters, p) + region.voters = append(region.voters, p) } - // Whichever peer role can be a witness if IsWitness(p) { - witnesses = append(witnesses, p) + region.witnesses = append(region.witnesses, p) } } - sort.Sort(peerSlice(learners)) - sort.Sort(peerSlice(voters)) - sort.Sort(peerSlice(witnesses)) - region.learners = learners - region.voters = voters - region.witnesses = witnesses + sort.Sort(peerSlice(region.learners)) + sort.Sort(peerSlice(region.voters)) + sort.Sort(peerSlice(region.witnesses)) } // peersEqualTo returns true when the peers are not changed, which may caused by: the region leader not changed, @@ -213,7 +209,7 @@ type RegionHeartbeatRequest interface { } // RegionFromHeartbeat constructs a Region from region heartbeat. -func RegionFromHeartbeat(heartbeat RegionHeartbeatRequest, opts ...RegionCreateOption) *RegionInfo { +func RegionFromHeartbeat(heartbeat RegionHeartbeatRequest, flowRoundDivisor int) *RegionInfo { // Convert unit to MB. // If region isn't empty and less than 1MB, use 1MB instead. // The size of empty region will be correct by the previous RegionInfo. @@ -223,20 +219,21 @@ func RegionFromHeartbeat(heartbeat RegionHeartbeatRequest, opts ...RegionCreateO } region := &RegionInfo{ - term: heartbeat.GetTerm(), - meta: heartbeat.GetRegion(), - leader: heartbeat.GetLeader(), - downPeers: heartbeat.GetDownPeers(), - pendingPeers: heartbeat.GetPendingPeers(), - writtenBytes: heartbeat.GetBytesWritten(), - writtenKeys: heartbeat.GetKeysWritten(), - readBytes: heartbeat.GetBytesRead(), - readKeys: heartbeat.GetKeysRead(), - approximateSize: int64(regionSize), - approximateKeys: int64(heartbeat.GetApproximateKeys()), - interval: heartbeat.GetInterval(), - queryStats: heartbeat.GetQueryStats(), - source: Heartbeat, + term: heartbeat.GetTerm(), + meta: heartbeat.GetRegion(), + leader: heartbeat.GetLeader(), + downPeers: heartbeat.GetDownPeers(), + pendingPeers: heartbeat.GetPendingPeers(), + writtenBytes: heartbeat.GetBytesWritten(), + writtenKeys: heartbeat.GetKeysWritten(), + readBytes: heartbeat.GetBytesRead(), + readKeys: heartbeat.GetKeysRead(), + approximateSize: int64(regionSize), + approximateKeys: int64(heartbeat.GetApproximateKeys()), + interval: heartbeat.GetInterval(), + queryStats: heartbeat.GetQueryStats(), + source: Heartbeat, + flowRoundDivisor: uint64(flowRoundDivisor), } // scheduling service doesn't need the following fields. 
@@ -246,10 +243,6 @@ func RegionFromHeartbeat(heartbeat RegionHeartbeatRequest, opts ...RegionCreateO region.cpuUsage = h.GetCpuUsage() } - for _, opt := range opts { - opt(region) - } - if region.writtenKeys >= ImpossibleFlowSize || region.writtenBytes >= ImpossibleFlowSize { region.writtenKeys = 0 region.writtenBytes = 0 @@ -957,11 +950,11 @@ func (r *RegionsInfo) getRegionLocked(regionID uint64) *RegionInfo { func (r *RegionsInfo) CheckAndPutRegion(region *RegionInfo) []*RegionInfo { r.t.Lock() origin := r.getRegionLocked(region.GetID()) - var ols []*regionItem + var ols []*RegionInfo if origin == nil || !bytes.Equal(origin.GetStartKey(), region.GetStartKey()) || !bytes.Equal(origin.GetEndKey(), region.GetEndKey()) { ols = r.tree.overlaps(®ionItem{RegionInfo: region}) } - err := check(region, origin, convertItemsToRegions(ols)) + err := check(region, origin, ols) if err != nil { log.Debug("region is stale", zap.Stringer("origin", origin.GetMeta()), errs.ZapError(err)) // return the state region to delete. @@ -988,25 +981,17 @@ func (r *RegionsInfo) PreCheckPutRegion(region *RegionInfo) (*RegionInfo, []*Reg return origin, overlaps, err } -func convertItemsToRegions(items []*regionItem) []*RegionInfo { - regions := make([]*RegionInfo, 0, len(items)) - for _, item := range items { - regions = append(regions, item.RegionInfo) - } - return regions -} - // AtomicCheckAndPutRegion checks if the region is valid to put, if valid then put. func (r *RegionsInfo) AtomicCheckAndPutRegion(ctx *MetaProcessContext, region *RegionInfo) ([]*RegionInfo, error) { tracer := ctx.Tracer r.t.Lock() - var ols []*regionItem + var ols []*RegionInfo origin := r.getRegionLocked(region.GetID()) if origin == nil || !bytes.Equal(origin.GetStartKey(), region.GetStartKey()) || !bytes.Equal(origin.GetEndKey(), region.GetEndKey()) { ols = r.tree.overlaps(®ionItem{RegionInfo: region}) } tracer.OnCheckOverlapsFinished() - err := check(region, origin, convertItemsToRegions(ols)) + err := check(region, origin, ols) if err != nil { r.t.Unlock() tracer.OnValidateRegionFinished() @@ -1026,13 +1011,13 @@ func (r *RegionsInfo) AtomicCheckAndPutRegion(ctx *MetaProcessContext, region *R func (r *RegionsInfo) CheckAndPutRootTree(ctx *MetaProcessContext, region *RegionInfo) ([]*RegionInfo, error) { tracer := ctx.Tracer r.t.Lock() - var ols []*regionItem + var ols []*RegionInfo origin := r.getRegionLocked(region.GetID()) if origin == nil || !bytes.Equal(origin.GetStartKey(), region.GetStartKey()) || !bytes.Equal(origin.GetEndKey(), region.GetEndKey()) { ols = r.tree.overlaps(®ionItem{RegionInfo: region}) } tracer.OnCheckOverlapsFinished() - err := check(region, origin, convertItemsToRegions(ols)) + err := check(region, origin, ols) if err != nil { r.t.Unlock() tracer.OnValidateRegionFinished() @@ -1123,7 +1108,7 @@ func (r *RegionsInfo) updateSubTreeLocked(rangeChanged bool, overlaps []*RegionI if len(overlaps) == 0 { // If the range has changed but the overlapped regions are not provided, collect them by `[]*regionItem`. for _, item := range r.getOverlapRegionFromOverlapTreeLocked(region) { - r.removeRegionFromSubTreeLocked(item.RegionInfo) + r.removeRegionFromSubTreeLocked(item) } } else { // Remove all provided overlapped regions from the subtrees. 
@@ -1164,7 +1149,7 @@ func (r *RegionsInfo) updateSubTreeLocked(rangeChanged bool, overlaps []*RegionI setPeers(r.pendingPeers, region.GetPendingPeers()) } -func (r *RegionsInfo) getOverlapRegionFromOverlapTreeLocked(region *RegionInfo) []*regionItem { +func (r *RegionsInfo) getOverlapRegionFromOverlapTreeLocked(region *RegionInfo) []*RegionInfo { return r.overlapTree.overlaps(®ionItem{RegionInfo: region}) } @@ -1174,9 +1159,7 @@ func (r *RegionsInfo) GetRelevantRegions(region *RegionInfo) (origin *RegionInfo defer r.t.RUnlock() origin = r.getRegionLocked(region.GetID()) if origin == nil || !bytes.Equal(origin.GetStartKey(), region.GetStartKey()) || !bytes.Equal(origin.GetEndKey(), region.GetEndKey()) { - for _, item := range r.tree.overlaps(®ionItem{RegionInfo: region}) { - overlaps = append(overlaps, item.RegionInfo) - } + return origin, r.tree.overlaps(®ionItem{RegionInfo: region}) } return } @@ -1211,7 +1194,7 @@ func (r *RegionsInfo) SetRegion(region *RegionInfo) (*RegionInfo, []*RegionInfo, return r.setRegionLocked(region, false) } -func (r *RegionsInfo) setRegionLocked(region *RegionInfo, withOverlaps bool, ol ...*regionItem) (*RegionInfo, []*RegionInfo, bool) { +func (r *RegionsInfo) setRegionLocked(region *RegionInfo, withOverlaps bool, ol ...*RegionInfo) (*RegionInfo, []*RegionInfo, bool) { var ( item *regionItem // Pointer to the *RegionInfo of this ID. origin *RegionInfo @@ -1311,7 +1294,7 @@ func (r *RegionsInfo) TreeLen() int { } // GetOverlaps returns the regions which are overlapped with the specified region range. -func (r *RegionsInfo) GetOverlaps(region *RegionInfo) []*regionItem { +func (r *RegionsInfo) GetOverlaps(region *RegionInfo) []*RegionInfo { r.t.RLock() defer r.t.RUnlock() return r.tree.overlaps(®ionItem{RegionInfo: region}) diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index aaf440eeeea2..ce59c0075d09 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -156,18 +156,19 @@ func TestSortedEqual(t *testing.T) { re.Equal(testCase.isEqual, SortedPeersEqual(regionA.GetVoters(), regionB.GetVoters())) } + flowRoundDivisor := 3 // test RegionFromHeartbeat for _, testCase := range testCases { regionA := RegionFromHeartbeat(&pdpb.RegionHeartbeatRequest{ Region: &metapb.Region{Id: 100, Peers: pickPeers(testCase.idsA)}, DownPeers: pickPeerStats(testCase.idsA), PendingPeers: pickPeers(testCase.idsA), - }) + }, flowRoundDivisor) regionB := RegionFromHeartbeat(&pdpb.RegionHeartbeatRequest{ Region: &metapb.Region{Id: 100, Peers: pickPeers(testCase.idsB)}, DownPeers: pickPeerStats(testCase.idsB), PendingPeers: pickPeers(testCase.idsB), - }) + }, flowRoundDivisor) re.Equal(testCase.isEqual, SortedPeersEqual(regionA.GetVoters(), regionB.GetVoters())) re.Equal(testCase.isEqual, SortedPeersEqual(regionA.GetVoters(), regionB.GetVoters())) re.Equal(testCase.isEqual, SortedPeersEqual(regionA.GetPendingPeers(), regionB.GetPendingPeers())) @@ -950,9 +951,10 @@ func BenchmarkRegionFromHeartbeat(b *testing.B) { PendingPeers: []*metapb.Peer{peers[1]}, DownPeers: []*pdpb.PeerStats{{Peer: peers[2], DownSeconds: 100}}, } + flowRoundDivisor := 3 b.ResetTimer() for i := 0; i < b.N; i++ { - RegionFromHeartbeat(regionReq) + RegionFromHeartbeat(regionReq, flowRoundDivisor) } } diff --git a/pkg/core/region_tree.go b/pkg/core/region_tree.go index cf146c05f907..9a148eeed18a 100644 --- a/pkg/core/region_tree.go +++ b/pkg/core/region_tree.go @@ -104,7 +104,7 @@ func (t *regionTree) notFromStorageRegionsCount() int { } // GetOverlaps returns the range items that has 
some intersections with the given items. -func (t *regionTree) overlaps(item *regionItem) []*regionItem { +func (t *regionTree) overlaps(item *regionItem) []*RegionInfo { // note that Find() gets the last item that is less or equal than the item. // in the case: |_______a_______|_____b_____|___c___| // new item is |______d______| @@ -116,12 +116,12 @@ func (t *regionTree) overlaps(item *regionItem) []*regionItem { result = item } endKey := item.GetEndKey() - var overlaps []*regionItem + var overlaps []*RegionInfo t.tree.AscendGreaterOrEqual(result, func(i *regionItem) bool { if len(endKey) > 0 && bytes.Compare(endKey, i.GetStartKey()) <= 0 { return false } - overlaps = append(overlaps, i) + overlaps = append(overlaps, i.RegionInfo) return true }) return overlaps @@ -130,7 +130,7 @@ func (t *regionTree) overlaps(item *regionItem) []*regionItem { // update updates the tree with the region. // It finds and deletes all the overlapped regions first, and then // insert the region. -func (t *regionTree) update(item *regionItem, withOverlaps bool, overlaps ...*regionItem) []*RegionInfo { +func (t *regionTree) update(item *regionItem, withOverlaps bool, overlaps ...*RegionInfo) []*RegionInfo { region := item.RegionInfo t.totalSize += region.approximateSize regionWriteBytesRate, regionWriteKeysRate := region.GetWriteRate() @@ -145,7 +145,7 @@ func (t *regionTree) update(item *regionItem, withOverlaps bool, overlaps ...*re } for _, old := range overlaps { - t.tree.Delete(old) + t.tree.Delete(®ionItem{RegionInfo: old}) } t.tree.ReplaceOrInsert(item) if t.countRef { @@ -153,7 +153,7 @@ func (t *regionTree) update(item *regionItem, withOverlaps bool, overlaps ...*re } result := make([]*RegionInfo, len(overlaps)) for i, overlap := range overlaps { - old := overlap.RegionInfo + old := overlap result[i] = old log.Debug("overlapping region", zap.Uint64("region-id", old.GetID()), @@ -174,7 +174,7 @@ func (t *regionTree) update(item *regionItem, withOverlaps bool, overlaps ...*re return result } -// updateStat is used to update statistics when regionItem.RegionInfo is directly replaced. +// updateStat is used to update statistics when RegionInfo is directly replaced. 
func (t *regionTree) updateStat(origin *RegionInfo, region *RegionInfo) { t.totalSize += region.approximateSize regionWriteBytesRate, regionWriteKeysRate := region.GetWriteRate() diff --git a/pkg/core/region_tree_test.go b/pkg/core/region_tree_test.go index 5886103191c7..2726b4fdab55 100644 --- a/pkg/core/region_tree_test.go +++ b/pkg/core/region_tree_test.go @@ -159,8 +159,8 @@ func TestRegionTree(t *testing.T) { updateNewItem(tree, regionA) updateNewItem(tree, regionC) re.Nil(tree.overlaps(newRegionItem([]byte("b"), []byte("c")))) - re.Equal(regionC, tree.overlaps(newRegionItem([]byte("c"), []byte("d")))[0].RegionInfo) - re.Equal(regionC, tree.overlaps(newRegionItem([]byte("a"), []byte("cc")))[1].RegionInfo) + re.Equal(regionC, tree.overlaps(newRegionItem([]byte("c"), []byte("d")))[0]) + re.Equal(regionC, tree.overlaps(newRegionItem([]byte("a"), []byte("cc")))[1]) re.Nil(tree.search([]byte{})) re.Equal(regionA, tree.search([]byte("a"))) re.Nil(tree.search([]byte("b"))) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 5fac3e1604ae..b18db7c07986 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -443,11 +443,23 @@ func (c *Cluster) HandleStoreHeartbeat(heartbeat *schedulingpb.StoreHeartbeatReq utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) + checkReadPeerTask := func(cache *statistics.HotPeerCache) { + stats := cache.CheckPeerFlow(region, []*metapb.Peer{peer}, loads, interval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + c.hotStat.CheckReadAsync(checkReadPeerTask) } // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. - c.hotStat.CheckReadAsync(statistics.NewCollectUnReportedPeerTask(storeID, regions, interval)) + collectUnReportedPeerTask := func(cache *statistics.HotPeerCache) { + stats := cache.CheckColdPeer(storeID, regions, interval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + c.hotStat.CheckReadAsync(collectUnReportedPeerTask) return nil } diff --git a/pkg/mcs/scheduling/server/grpc_service.go b/pkg/mcs/scheduling/server/grpc_service.go index 605ec73dad5e..842e876885c1 100644 --- a/pkg/mcs/scheduling/server/grpc_service.go +++ b/pkg/mcs/scheduling/server/grpc_service.go @@ -158,7 +158,8 @@ func (s *Service) RegionHeartbeat(stream schedulingpb.Scheduling_RegionHeartbeat s.hbStreams.BindStream(storeID, server) lastBind = time.Now() } - region := core.RegionFromHeartbeat(request) + // scheduling service doesn't sync the pd server config, so we use 0 here + region := core.RegionFromHeartbeat(request, 0) err = c.HandleRegionHeartbeat(region) if err != nil { // TODO: if we need to send the error back to API server. 
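The checkReadPeerTask and collectUnReportedPeerTask closures above show the pattern this commit adopts everywhere: instead of allocating a FlowItemTask implementation per request, callers hand the hot cache a plain func(*statistics.HotPeerCache). A hedged sketch of how a result-returning task can be composed on top of the new CheckReadAsync signature follows; the helper name and the surrounding package are illustrative, and production code would also select on a context or timeout, as RegionStats does further down.

package example

import "github.com/tikv/pd/pkg/statistics"

// hotReadPeerCount submits a closure task to the read cache and waits for the
// answer on a buffered channel, the same shape RegionStats and GetHotPeerStat
// use in this patch. It assumes hotStat is an already-running *HotCache.
func hotReadPeerCount(hotStat *statistics.HotCache, minHotDegree int) int {
	ret := make(chan int, 1)
	task := func(cache *statistics.HotPeerCache) {
		n := 0
		for _, peers := range cache.RegionStats(minHotDegree) {
			n += len(peers)
		}
		ret <- n
	}
	if !hotStat.CheckReadAsync(task) {
		// The queue is over chanMaxLength; report nothing rather than block.
		return 0
	}
	// A real caller would also select on a context or timeout here.
	return <-ret
}
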
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index e1cc702fd36b..f99208a988be 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -2053,7 +2053,7 @@ func (suite *ruleCheckerTestAdvancedSuite) TestReplaceAnExistingPeerCases() { {"111_learner,211_learner,311_learner,151_leader,252,351", []string{"3/voter//", "3/learner/type=read/"}, ""}, } groupName := "a_test" - for i, cas := range testCases { + for _, cas := range testCases { bundle := placement.GroupBundle{ ID: groupName, Index: 1000, @@ -2071,7 +2071,7 @@ func (suite *ruleCheckerTestAdvancedSuite) TestReplaceAnExistingPeerCases() { suite.cluster.PutRegion(region) op := suite.rc.Check(region) if len(cas.opStr) > 0 { - re.Contains(op.String(), cas.opStr, i, cas.opStr) + re.Contains(op.String(), cas.opStr, cas.opStr) } suite.ruleManager.DeleteGroupBundle(groupName, false) } diff --git a/pkg/schedule/schedulers/hot_region_test.go b/pkg/schedule/schedulers/hot_region_test.go index e5b722a488d4..304698c915e2 100644 --- a/pkg/schedule/schedulers/hot_region_test.go +++ b/pkg/schedule/schedulers/hot_region_test.go @@ -35,6 +35,7 @@ import ( "github.com/tikv/pd/pkg/storage" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/utils/operatorutil" + "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/pkg/versioninfo" ) @@ -1287,8 +1288,9 @@ func TestHotReadRegionScheduleByteRateOnly(t *testing.T) { {11, []uint64{1, 2, 3}, 7 * units.KiB, 0, 0}, }) - re.True(tc.IsRegionHot(tc.GetRegion(1))) - re.False(tc.IsRegionHot(tc.GetRegion(11))) + testutil.Eventually(re, func() bool { + return tc.IsRegionHot(tc.GetRegion(1)) && !tc.IsRegionHot(tc.GetRegion(11)) + }) // check randomly pick hot region r := tc.HotRegionsFromStore(2, utils.Read) re.Len(r, 3) diff --git a/pkg/schedule/schedulers/scheduler_test.go b/pkg/schedule/schedulers/scheduler_test.go index d30ef3ad0aa0..1480d76b75b1 100644 --- a/pkg/schedule/schedulers/scheduler_test.go +++ b/pkg/schedule/schedulers/scheduler_test.go @@ -31,6 +31,7 @@ import ( "github.com/tikv/pd/pkg/statistics/utils" "github.com/tikv/pd/pkg/storage" "github.com/tikv/pd/pkg/utils/operatorutil" + "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/versioninfo" ) @@ -218,7 +219,9 @@ func TestHotRegionScheduleAbnormalReplica(t *testing.T) { tc.AddRegionWithReadInfo(1, 1, 512*units.KiB*utils.StoreHeartBeatReportInterval, 0, 0, utils.StoreHeartBeatReportInterval, []uint64{2}) tc.AddRegionWithReadInfo(2, 2, 512*units.KiB*utils.StoreHeartBeatReportInterval, 0, 0, utils.StoreHeartBeatReportInterval, []uint64{1, 3}) tc.AddRegionWithReadInfo(3, 1, 512*units.KiB*utils.StoreHeartBeatReportInterval, 0, 0, utils.StoreHeartBeatReportInterval, []uint64{2, 3}) - re.True(tc.IsRegionHot(tc.GetRegion(1))) + testutil.Eventually(re, func() bool { + return tc.IsRegionHot(tc.GetRegion(1)) + }) re.False(hb.IsScheduleAllowed(tc)) } @@ -314,8 +317,6 @@ func TestSpecialUseHotRegion(t *testing.T) { cd := ConfigSliceDecoder(BalanceRegionType, []string{"", ""}) bs, err := CreateScheduler(BalanceRegionType, oc, storage, cd) re.NoError(err) - hs, err := CreateScheduler(utils.Write.String(), oc, storage, cd) - re.NoError(err) tc.SetClusterVersion(versioninfo.MinSupportedVersion(versioninfo.Version4_0)) tc.AddRegionStore(1, 10) @@ -351,6 +352,8 @@ func TestSpecialUseHotRegion(t *testing.T) { tc.AddLeaderRegionWithWriteInfo(3, 1, 512*units.KiB*utils.RegionHeartBeatReportInterval, 0, 0, 
utils.RegionHeartBeatReportInterval, []uint64{2, 3}) tc.AddLeaderRegionWithWriteInfo(4, 2, 512*units.KiB*utils.RegionHeartBeatReportInterval, 0, 0, utils.RegionHeartBeatReportInterval, []uint64{1, 3}) tc.AddLeaderRegionWithWriteInfo(5, 3, 512*units.KiB*utils.RegionHeartBeatReportInterval, 0, 0, utils.RegionHeartBeatReportInterval, []uint64{1, 2}) + hs, err := CreateScheduler(utils.Write.String(), oc, storage, cd) + re.NoError(err) ops, _ = hs.Schedule(tc, false) re.Len(ops, 1) operatorutil.CheckTransferPeer(re, ops[0], operator.OpHotRegion, 1, 4) diff --git a/pkg/statistics/hot_cache.go b/pkg/statistics/hot_cache.go index 26548c8b47eb..3f076734a7b2 100644 --- a/pkg/statistics/hot_cache.go +++ b/pkg/statistics/hot_cache.go @@ -34,8 +34,8 @@ var ( // HotCache is a cache hold hot regions. type HotCache struct { ctx context.Context - writeCache *hotPeerCache - readCache *hotPeerCache + writeCache *HotPeerCache + readCache *HotPeerCache } // NewHotCache creates a new hot spot cache. @@ -51,7 +51,7 @@ func NewHotCache(ctx context.Context) *HotCache { } // CheckWriteAsync puts the flowItem into queue, and check it asynchronously -func (w *HotCache) CheckWriteAsync(task FlowItemTask) bool { +func (w *HotCache) CheckWriteAsync(task func(cache *HotPeerCache)) bool { if w.writeCache.taskQueue.Len() > chanMaxLength { return false } @@ -64,7 +64,7 @@ func (w *HotCache) CheckWriteAsync(task FlowItemTask) bool { } // CheckReadAsync puts the flowItem into queue, and check it asynchronously -func (w *HotCache) CheckReadAsync(task FlowItemTask) bool { +func (w *HotCache) CheckReadAsync(task func(cache *HotPeerCache)) bool { if w.readCache.taskQueue.Len() > chanMaxLength { return false } @@ -78,52 +78,86 @@ func (w *HotCache) CheckReadAsync(task FlowItemTask) bool { // RegionStats returns hot items according to kind func (w *HotCache) RegionStats(kind utils.RWType, minHotDegree int) map[uint64][]*HotPeerStat { - task := newCollectRegionStatsTask(minHotDegree) + ret := make(chan map[uint64][]*HotPeerStat, 1) + collectRegionStatsTask := func(cache *HotPeerCache) { + ret <- cache.RegionStats(minHotDegree) + } var succ bool switch kind { case utils.Write: - succ = w.CheckWriteAsync(task) + succ = w.CheckWriteAsync(collectRegionStatsTask) case utils.Read: - succ = w.CheckReadAsync(task) + succ = w.CheckReadAsync(collectRegionStatsTask) } if !succ { return nil } - return task.waitRet(w.ctx) + select { + case <-w.ctx.Done(): + return nil + case r := <-ret: + return r + } } // IsRegionHot checks if the region is hot. func (w *HotCache) IsRegionHot(region *core.RegionInfo, minHotDegree int) bool { - checkRegionHotWriteTask := newCheckRegionHotTask(region, minHotDegree) - checkRegionHotReadTask := newCheckRegionHotTask(region, minHotDegree) + retWrite := make(chan bool, 1) + retRead := make(chan bool, 1) + checkRegionHotWriteTask := func(cache *HotPeerCache) { + retWrite <- cache.isRegionHotWithAnyPeers(region, minHotDegree) + } + checkRegionHotReadTask := func(cache *HotPeerCache) { + retRead <- cache.isRegionHotWithAnyPeers(region, minHotDegree) + } succ1 := w.CheckWriteAsync(checkRegionHotWriteTask) succ2 := w.CheckReadAsync(checkRegionHotReadTask) if succ1 && succ2 { - return checkRegionHotWriteTask.waitRet(w.ctx) || checkRegionHotReadTask.waitRet(w.ctx) + select { + case <-w.ctx.Done(): + return false + case r := <-retWrite: + return r + case r := <-retRead: + return r + } } return false } // GetHotPeerStat returns hot peer stat with specified regionID and storeID. 
func (w *HotCache) GetHotPeerStat(kind utils.RWType, regionID, storeID uint64) *HotPeerStat { - task := newGetHotPeerStatTask(regionID, storeID) + ret := make(chan *HotPeerStat, 1) + getHotPeerStatTask := func(cache *HotPeerCache) { + ret <- cache.getHotPeerStat(regionID, storeID) + } + var succ bool switch kind { case utils.Read: - succ = w.CheckReadAsync(task) + succ = w.CheckReadAsync(getHotPeerStatTask) case utils.Write: - succ = w.CheckWriteAsync(task) + succ = w.CheckWriteAsync(getHotPeerStatTask) } if !succ { return nil } - return task.waitRet(w.ctx) + select { + case <-w.ctx.Done(): + return nil + case r := <-ret: + return r + } } // CollectMetrics collects the hot cache metrics. func (w *HotCache) CollectMetrics() { - w.CheckWriteAsync(newCollectMetricsTask()) - w.CheckReadAsync(newCollectMetricsTask()) + w.CheckWriteAsync(func(cache *HotPeerCache) { + cache.collectMetrics() + }) + w.CheckReadAsync(func(cache *HotPeerCache) { + cache.collectMetrics() + }) } // ResetHotCacheStatusMetrics resets the hot cache metrics. @@ -131,7 +165,7 @@ func ResetHotCacheStatusMetrics() { hotCacheStatusGauge.Reset() } -func (w *HotCache) updateItems(queue *chanx.UnboundedChan[FlowItemTask], runTask func(task FlowItemTask)) { +func (w *HotCache) updateItems(queue *chanx.UnboundedChan[func(*HotPeerCache)], runTask func(task func(*HotPeerCache))) { defer logutil.LogPanic() for { @@ -144,18 +178,18 @@ func (w *HotCache) updateItems(queue *chanx.UnboundedChan[FlowItemTask], runTask } } -func (w *HotCache) runReadTask(task FlowItemTask) { +func (w *HotCache) runReadTask(task func(cache *HotPeerCache)) { if task != nil { // TODO: do we need a run-task timeout to protect the queue won't be stuck by a task? - task.runTask(w.readCache) + task(w.readCache) readTaskMetrics.Set(float64(w.readCache.taskQueue.Len())) } } -func (w *HotCache) runWriteTask(task FlowItemTask) { +func (w *HotCache) runWriteTask(task func(cache *HotPeerCache)) { if task != nil { // TODO: do we need a run-task timeout to protect the queue won't be stuck by a task? - task.runTask(w.writeCache) + task(w.writeCache) writeTaskMetrics.Set(float64(w.writeCache.taskQueue.Len())) } } @@ -165,34 +199,34 @@ func (w *HotCache) runWriteTask(task FlowItemTask) { func (w *HotCache) Update(item *HotPeerStat, kind utils.RWType) { switch kind { case utils.Write: - w.writeCache.updateStat(item) + w.writeCache.UpdateStat(item) case utils.Read: - w.readCache.updateStat(item) + w.readCache.UpdateStat(item) } } // CheckWritePeerSync checks the write status, returns update items. // This is used for mockcluster, for test purpose. func (w *HotCache) CheckWritePeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { - return w.writeCache.checkPeerFlow(region, peers, loads, interval) + return w.writeCache.CheckPeerFlow(region, peers, loads, interval) } // CheckReadPeerSync checks the read status, returns update items. // This is used for mockcluster, for test purpose. func (w *HotCache) CheckReadPeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { - return w.readCache.checkPeerFlow(region, peers, loads, interval) + return w.readCache.CheckPeerFlow(region, peers, loads, interval) } // ExpiredReadItems returns the read items which are already expired. // This is used for mockcluster, for test purpose. 
func (w *HotCache) ExpiredReadItems(region *core.RegionInfo) []*HotPeerStat { - return w.readCache.collectExpiredItems(region) + return w.readCache.CollectExpiredItems(region) } // ExpiredWriteItems returns the write items which are already expired. // This is used for mockcluster, for test purpose. func (w *HotCache) ExpiredWriteItems(region *core.RegionInfo) []*HotPeerStat { - return w.writeCache.collectExpiredItems(region) + return w.writeCache.CollectExpiredItems(region) } // GetThresholds returns thresholds. diff --git a/pkg/statistics/hot_cache_task.go b/pkg/statistics/hot_cache_task.go deleted file mode 100644 index 01731f3fe4d5..000000000000 --- a/pkg/statistics/hot_cache_task.go +++ /dev/null @@ -1,204 +0,0 @@ -// Copyright 2021 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package statistics - -import ( - "context" - - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" -) - -// FlowItemTask indicates the task in flowItem queue -type FlowItemTask interface { - runTask(cache *hotPeerCache) -} - -type checkReadPeerTask struct { - regionInfo *core.RegionInfo - peers []*metapb.Peer - loads []float64 - interval uint64 -} - -// NewCheckReadPeerTask creates task to update peerInfo -func NewCheckReadPeerTask(regionInfo *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) FlowItemTask { - return &checkReadPeerTask{ - regionInfo: regionInfo, - peers: peers, - loads: loads, - interval: interval, - } -} - -func (t *checkReadPeerTask) runTask(cache *hotPeerCache) { - stats := cache.checkPeerFlow(t.regionInfo, t.peers, t.loads, t.interval) - for _, stat := range stats { - cache.updateStat(stat) - } -} - -type checkWritePeerTask struct { - region *core.RegionInfo -} - -// NewCheckWritePeerTask creates task to update peerInfo -func NewCheckWritePeerTask(region *core.RegionInfo) FlowItemTask { - return &checkWritePeerTask{ - region: region, - } -} - -func (t *checkWritePeerTask) runTask(cache *hotPeerCache) { - reportInterval := t.region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - stats := cache.checkPeerFlow(t.region, t.region.GetPeers(), t.region.GetWriteLoads(), interval) - for _, stat := range stats { - cache.updateStat(stat) - } -} - -type checkExpiredTask struct { - region *core.RegionInfo -} - -// NewCheckExpiredItemTask creates task to collect expired items -func NewCheckExpiredItemTask(region *core.RegionInfo) FlowItemTask { - return &checkExpiredTask{ - region: region, - } -} - -func (t *checkExpiredTask) runTask(cache *hotPeerCache) { - expiredStats := cache.collectExpiredItems(t.region) - for _, stat := range expiredStats { - cache.updateStat(stat) - } -} - -type collectUnReportedPeerTask struct { - storeID uint64 - regions map[uint64]*core.RegionInfo - interval uint64 -} - -// NewCollectUnReportedPeerTask creates task to collect unreported peers -func NewCollectUnReportedPeerTask(storeID uint64, regions map[uint64]*core.RegionInfo, interval uint64) 
FlowItemTask { - return &collectUnReportedPeerTask{ - storeID: storeID, - regions: regions, - interval: interval, - } -} - -func (t *collectUnReportedPeerTask) runTask(cache *hotPeerCache) { - stats := cache.checkColdPeer(t.storeID, t.regions, t.interval) - for _, stat := range stats { - cache.updateStat(stat) - } -} - -type collectRegionStatsTask struct { - minDegree int - ret chan map[uint64][]*HotPeerStat -} - -func newCollectRegionStatsTask(minDegree int) *collectRegionStatsTask { - return &collectRegionStatsTask{ - minDegree: minDegree, - ret: make(chan map[uint64][]*HotPeerStat, 1), - } -} - -func (t *collectRegionStatsTask) runTask(cache *hotPeerCache) { - t.ret <- cache.RegionStats(t.minDegree) -} - -// TODO: do we need a wait-return timeout? -func (t *collectRegionStatsTask) waitRet(ctx context.Context) map[uint64][]*HotPeerStat { - select { - case <-ctx.Done(): - return nil - case ret := <-t.ret: - return ret - } -} - -type checkRegionHotTask struct { - region *core.RegionInfo - minHotDegree int - ret chan bool -} - -func newCheckRegionHotTask(region *core.RegionInfo, minDegree int) *checkRegionHotTask { - return &checkRegionHotTask{ - region: region, - minHotDegree: minDegree, - ret: make(chan bool, 1), - } -} - -func (t *checkRegionHotTask) runTask(cache *hotPeerCache) { - t.ret <- cache.isRegionHotWithAnyPeers(t.region, t.minHotDegree) -} - -// TODO: do we need a wait-return timeout? -func (t *checkRegionHotTask) waitRet(ctx context.Context) bool { - select { - case <-ctx.Done(): - return false - case r := <-t.ret: - return r - } -} - -type collectMetricsTask struct { -} - -func newCollectMetricsTask() *collectMetricsTask { - return &collectMetricsTask{} -} - -func (*collectMetricsTask) runTask(cache *hotPeerCache) { - cache.collectMetrics() -} - -type getHotPeerStatTask struct { - regionID uint64 - storeID uint64 - ret chan *HotPeerStat -} - -func newGetHotPeerStatTask(regionID, storeID uint64) *getHotPeerStatTask { - return &getHotPeerStatTask{ - regionID: regionID, - storeID: storeID, - ret: make(chan *HotPeerStat, 1), - } -} - -func (t *getHotPeerStatTask) runTask(cache *hotPeerCache) { - t.ret <- cache.getHotPeerStat(t.regionID, t.storeID) -} - -// TODO: do we need a wait-return timeout? -func (t *getHotPeerStatTask) waitRet(ctx context.Context) *HotPeerStat { - select { - case <-ctx.Done(): - return nil - case r := <-t.ret: - return r - } -} diff --git a/pkg/statistics/hot_peer_cache.go b/pkg/statistics/hot_peer_cache.go index 3a3d3519bd96..4db0c304bb95 100644 --- a/pkg/statistics/hot_peer_cache.go +++ b/pkg/statistics/hot_peer_cache.go @@ -57,27 +57,27 @@ type thresholds struct { metrics [utils.DimLen + 1]prometheus.Gauge // 0 is for byte, 1 is for key, 2 is for query, 3 is for total length. } -// hotPeerCache saves the hot peer's statistics. -type hotPeerCache struct { +// HotPeerCache saves the hot peer's statistics. +type HotPeerCache struct { kind utils.RWType peersOfStore map[uint64]*utils.TopN // storeID -> hot peers storesOfRegion map[uint64]map[uint64]struct{} // regionID -> storeIDs regionsOfStore map[uint64]map[uint64]struct{} // storeID -> regionIDs topNTTL time.Duration - taskQueue *chanx.UnboundedChan[FlowItemTask] + taskQueue *chanx.UnboundedChan[func(*HotPeerCache)] thresholdsOfStore map[uint64]*thresholds // storeID -> thresholds metrics map[uint64][utils.ActionTypeLen]prometheus.Gauge // storeID -> metrics // TODO: consider to remove store info when store is offline. 
} -// NewHotPeerCache creates a hotPeerCache -func NewHotPeerCache(ctx context.Context, kind utils.RWType) *hotPeerCache { - return &hotPeerCache{ +// NewHotPeerCache creates a HotPeerCache +func NewHotPeerCache(ctx context.Context, kind utils.RWType) *HotPeerCache { + return &HotPeerCache{ kind: kind, peersOfStore: make(map[uint64]*utils.TopN), storesOfRegion: make(map[uint64]map[uint64]struct{}), regionsOfStore: make(map[uint64]map[uint64]struct{}), - taskQueue: chanx.NewUnboundedChan[FlowItemTask](ctx, queueCap), + taskQueue: chanx.NewUnboundedChan[func(*HotPeerCache)](ctx, queueCap), thresholdsOfStore: make(map[uint64]*thresholds), topNTTL: time.Duration(3*kind.ReportInterval()) * time.Second, metrics: make(map[uint64][utils.ActionTypeLen]prometheus.Gauge), @@ -86,7 +86,7 @@ func NewHotPeerCache(ctx context.Context, kind utils.RWType) *hotPeerCache { // TODO: rename RegionStats as PeerStats // RegionStats returns hot items -func (f *hotPeerCache) RegionStats(minHotDegree int) map[uint64][]*HotPeerStat { +func (f *HotPeerCache) RegionStats(minHotDegree int) map[uint64][]*HotPeerStat { res := make(map[uint64][]*HotPeerStat) defaultAntiCount := f.kind.DefaultAntiCount() for storeID, peers := range f.peersOfStore { @@ -102,7 +102,7 @@ func (f *hotPeerCache) RegionStats(minHotDegree int) map[uint64][]*HotPeerStat { return res } -func (f *hotPeerCache) updateStat(item *HotPeerStat) { +func (f *HotPeerCache) UpdateStat(item *HotPeerStat) { switch item.actionType { case utils.Remove: f.removeItem(item) @@ -116,7 +116,7 @@ func (f *hotPeerCache) updateStat(item *HotPeerStat) { f.incMetrics(item.actionType, item.StoreID) } -func (f *hotPeerCache) incMetrics(action utils.ActionType, storeID uint64) { +func (f *HotPeerCache) incMetrics(action utils.ActionType, storeID uint64) { if _, ok := f.metrics[storeID]; !ok { store := storeTag(storeID) kind := f.kind.String() @@ -129,7 +129,7 @@ func (f *hotPeerCache) incMetrics(action utils.ActionType, storeID uint64) { f.metrics[storeID][action].Inc() } -func (f *hotPeerCache) collectPeerMetrics(loads []float64, interval uint64) { +func (f *HotPeerCache) collectPeerMetrics(loads []float64, interval uint64) { regionHeartbeatIntervalHist.Observe(float64(interval)) if interval == 0 { return @@ -153,8 +153,8 @@ func (f *hotPeerCache) collectPeerMetrics(loads []float64, interval uint64) { } } -// collectExpiredItems collects expired items, mark them as needDelete and puts them into inherit items -func (f *hotPeerCache) collectExpiredItems(region *core.RegionInfo) []*HotPeerStat { +// CollectExpiredItems collects expired items, mark them as needDelete and puts them into inherit items +func (f *HotPeerCache) CollectExpiredItems(region *core.RegionInfo) []*HotPeerStat { regionID := region.GetID() items := make([]*HotPeerStat, 0) if ids, ok := f.storesOfRegion[regionID]; ok { @@ -171,10 +171,10 @@ func (f *hotPeerCache) collectExpiredItems(region *core.RegionInfo) []*HotPeerSt return items } -// checkPeerFlow checks the flow information of a peer. -// Notice: checkPeerFlow couldn't be used concurrently. -// checkPeerFlow will update oldItem's rollingLoads into newItem, thus we should use write lock here. -func (f *hotPeerCache) checkPeerFlow(region *core.RegionInfo, peers []*metapb.Peer, deltaLoads []float64, interval uint64) []*HotPeerStat { +// CheckPeerFlow checks the flow information of a peer. +// Notice: CheckPeerFlow couldn't be used concurrently. +// CheckPeerFlow will update oldItem's rollingLoads into newItem, thus we should use write lock here. 
+func (f *HotPeerCache) CheckPeerFlow(region *core.RegionInfo, peers []*metapb.Peer, deltaLoads []float64, interval uint64) []*HotPeerStat { if Denoising && interval < HotRegionReportMinInterval { // for test or simulator purpose return nil } @@ -231,8 +231,8 @@ func (f *hotPeerCache) checkPeerFlow(region *core.RegionInfo, peers []*metapb.Pe return stats } -// checkColdPeer checks the collect the un-heartbeat peer and maintain it. -func (f *hotPeerCache) checkColdPeer(storeID uint64, reportRegions map[uint64]*core.RegionInfo, interval uint64) (ret []*HotPeerStat) { +// CheckColdPeer checks the collect the un-heartbeat peer and maintain it. +func (f *HotPeerCache) CheckColdPeer(storeID uint64, reportRegions map[uint64]*core.RegionInfo, interval uint64) (ret []*HotPeerStat) { // for test or simulator purpose if Denoising && interval < HotRegionReportMinInterval { return @@ -278,7 +278,7 @@ func (f *hotPeerCache) checkColdPeer(storeID uint64, reportRegions map[uint64]*c return } -func (f *hotPeerCache) collectMetrics() { +func (f *HotPeerCache) collectMetrics() { for _, thresholds := range f.thresholdsOfStore { thresholds.metrics[utils.ByteDim].Set(thresholds.rates[utils.ByteDim]) thresholds.metrics[utils.KeyDim].Set(thresholds.rates[utils.KeyDim]) @@ -287,7 +287,7 @@ func (f *hotPeerCache) collectMetrics() { } } -func (f *hotPeerCache) getOldHotPeerStat(regionID, storeID uint64) *HotPeerStat { +func (f *HotPeerCache) getOldHotPeerStat(regionID, storeID uint64) *HotPeerStat { if hotPeers, ok := f.peersOfStore[storeID]; ok { if v := hotPeers.Get(regionID); v != nil { return v.(*HotPeerStat) @@ -296,7 +296,7 @@ func (f *hotPeerCache) getOldHotPeerStat(regionID, storeID uint64) *HotPeerStat return nil } -func (f *hotPeerCache) calcHotThresholds(storeID uint64) []float64 { +func (f *HotPeerCache) calcHotThresholds(storeID uint64) []float64 { // check whether the thresholds is updated recently t, ok := f.thresholdsOfStore[storeID] if ok && time.Since(t.updatedTime) <= ThresholdsUpdateInterval { @@ -336,7 +336,7 @@ func (f *hotPeerCache) calcHotThresholds(storeID uint64) []float64 { } // gets the storeIDs, including old region and new region -func (f *hotPeerCache) getAllStoreIDs(region *core.RegionInfo) []uint64 { +func (f *HotPeerCache) getAllStoreIDs(region *core.RegionInfo) []uint64 { regionPeers := region.GetPeers() ret := make([]uint64, 0, len(regionPeers)) isInSlice := func(id uint64) bool { @@ -364,7 +364,7 @@ func (f *hotPeerCache) getAllStoreIDs(region *core.RegionInfo) []uint64 { return ret } -func (f *hotPeerCache) isOldColdPeer(oldItem *HotPeerStat, storeID uint64) bool { +func (f *HotPeerCache) isOldColdPeer(oldItem *HotPeerStat, storeID uint64) bool { isOldPeer := func() bool { for _, id := range oldItem.stores { if id == storeID { @@ -384,7 +384,7 @@ func (f *hotPeerCache) isOldColdPeer(oldItem *HotPeerStat, storeID uint64) bool return isOldPeer() && !isInHotCache() } -func (f *hotPeerCache) justTransferLeader(region *core.RegionInfo, oldItem *HotPeerStat) bool { +func (f *HotPeerCache) justTransferLeader(region *core.RegionInfo, oldItem *HotPeerStat) bool { if region == nil { return false } @@ -406,7 +406,7 @@ func (f *hotPeerCache) justTransferLeader(region *core.RegionInfo, oldItem *HotP return false } -func (f *hotPeerCache) isRegionHotWithAnyPeers(region *core.RegionInfo, hotDegree int) bool { +func (f *HotPeerCache) isRegionHotWithAnyPeers(region *core.RegionInfo, hotDegree int) bool { for _, peer := range region.GetPeers() { if f.isRegionHotWithPeer(region, peer, hotDegree) { 
return true @@ -415,7 +415,7 @@ func (f *hotPeerCache) isRegionHotWithAnyPeers(region *core.RegionInfo, hotDegre return false } -func (f *hotPeerCache) isRegionHotWithPeer(region *core.RegionInfo, peer *metapb.Peer, hotDegree int) bool { +func (f *HotPeerCache) isRegionHotWithPeer(region *core.RegionInfo, peer *metapb.Peer, hotDegree int) bool { if peer == nil { return false } @@ -425,7 +425,7 @@ func (f *hotPeerCache) isRegionHotWithPeer(region *core.RegionInfo, peer *metapb return false } -func (f *hotPeerCache) getHotPeerStat(regionID, storeID uint64) *HotPeerStat { +func (f *HotPeerCache) getHotPeerStat(regionID, storeID uint64) *HotPeerStat { if peers, ok := f.peersOfStore[storeID]; ok { if stat := peers.Get(regionID); stat != nil { return stat.(*HotPeerStat) @@ -434,7 +434,7 @@ func (f *hotPeerCache) getHotPeerStat(regionID, storeID uint64) *HotPeerStat { return nil } -func (f *hotPeerCache) updateHotPeerStat(region *core.RegionInfo, newItem, oldItem *HotPeerStat, deltaLoads []float64, interval time.Duration, source utils.SourceKind) *HotPeerStat { +func (f *HotPeerCache) updateHotPeerStat(region *core.RegionInfo, newItem, oldItem *HotPeerStat, deltaLoads []float64, interval time.Duration, source utils.SourceKind) *HotPeerStat { regionStats := f.kind.RegionStats() if source == utils.Inherit { @@ -495,7 +495,7 @@ func (f *hotPeerCache) updateHotPeerStat(region *core.RegionInfo, newItem, oldIt return newItem } -func (f *hotPeerCache) updateNewHotPeerStat(newItem *HotPeerStat, deltaLoads []float64, interval time.Duration) *HotPeerStat { +func (f *HotPeerCache) updateNewHotPeerStat(newItem *HotPeerStat, deltaLoads []float64, interval time.Duration) *HotPeerStat { regionStats := f.kind.RegionStats() // interval is not 0 which is guaranteed by the caller. if interval.Seconds() >= float64(f.kind.ReportInterval()) { @@ -514,7 +514,7 @@ func (f *hotPeerCache) updateNewHotPeerStat(newItem *HotPeerStat, deltaLoads []f return newItem } -func (f *hotPeerCache) putItem(item *HotPeerStat) { +func (f *HotPeerCache) putItem(item *HotPeerStat) { peers, ok := f.peersOfStore[item.StoreID] if !ok { peers = utils.NewTopN(utils.DimLen, TopNN, f.topNTTL) @@ -535,7 +535,7 @@ func (f *hotPeerCache) putItem(item *HotPeerStat) { regions[item.RegionID] = struct{}{} } -func (f *hotPeerCache) removeItem(item *HotPeerStat) { +func (f *HotPeerCache) removeItem(item *HotPeerStat) { if peers, ok := f.peersOfStore[item.StoreID]; ok { peers.Remove(item.RegionID) } @@ -549,12 +549,12 @@ func (f *hotPeerCache) removeItem(item *HotPeerStat) { // removeAllItem removes all items of the cache. // It is used for test. 
-func (f *hotPeerCache) removeAllItem() { +func (f *HotPeerCache) removeAllItem() { for _, peers := range f.peersOfStore { for _, peer := range peers.GetAll() { item := peer.(*HotPeerStat) item.actionType = utils.Remove - f.updateStat(item) + f.UpdateStat(item) } } } @@ -590,7 +590,7 @@ func inheritItem(newItem, oldItem *HotPeerStat) { newItem.AntiCount = oldItem.AntiCount } -func (f *hotPeerCache) interval() time.Duration { +func (f *HotPeerCache) interval() time.Duration { return time.Duration(f.kind.ReportInterval()) * time.Second } diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index db215238604f..ce4e352bc3d3 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -93,7 +93,7 @@ func TestCache(t *testing.T) { } } -func orderingPeers(cache *hotPeerCache, region *core.RegionInfo) []*metapb.Peer { +func orderingPeers(cache *HotPeerCache, region *core.RegionInfo) []*metapb.Peer { var peers []*metapb.Peer for _, peer := range region.GetPeers() { if cache.getOldHotPeerStat(region.GetID(), peer.StoreId) != nil { @@ -105,23 +105,23 @@ func orderingPeers(cache *hotPeerCache, region *core.RegionInfo) []*metapb.Peer return peers } -func checkFlow(cache *hotPeerCache, region *core.RegionInfo, peers []*metapb.Peer) (res []*HotPeerStat) { +func checkFlow(cache *HotPeerCache, region *core.RegionInfo, peers []*metapb.Peer) (res []*HotPeerStat) { reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - res = append(res, cache.collectExpiredItems(region)...) - return append(res, cache.checkPeerFlow(region, peers, region.GetLoads(), interval)...) + res = append(res, cache.CollectExpiredItems(region)...) + return append(res, cache.CheckPeerFlow(region, peers, region.GetLoads(), interval)...) } -func updateFlow(cache *hotPeerCache, res []*HotPeerStat) []*HotPeerStat { +func updateFlow(cache *HotPeerCache, res []*HotPeerStat) []*HotPeerStat { for _, p := range res { - cache.updateStat(p) + cache.UpdateStat(p) } return res } -type check func(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) +type check func(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) -func checkAndUpdate(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { +func checkAndUpdate(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { res = checkFlow(cache, region, region.GetPeers()) if len(expect) != 0 { re.Len(res, expect[0]) @@ -131,7 +131,7 @@ func checkAndUpdate(re *require.Assertions, cache *hotPeerCache, region *core.Re // Check and update peers in the specified order that old item that he items that have not expired come first, and the items that have expired come second. // This order is also similar to the previous version. By the way the order in now version is random. 
-func checkAndUpdateWithOrdering(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { +func checkAndUpdateWithOrdering(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { res = checkFlow(cache, region, orderingPeers(cache, region)) if len(expect) != 0 { re.Len(res, expect[0]) @@ -139,7 +139,7 @@ func checkAndUpdateWithOrdering(re *require.Assertions, cache *hotPeerCache, reg return updateFlow(cache, res) } -func checkAndUpdateSkipOne(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { +func checkAndUpdateSkipOne(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, expect ...int) (res []*HotPeerStat) { res = checkFlow(cache, region, region.GetPeers()[1:]) if len(expect) != 0 { re.Len(res, expect[0]) @@ -147,7 +147,7 @@ func checkAndUpdateSkipOne(re *require.Assertions, cache *hotPeerCache, region * return updateFlow(cache, res) } -func checkHit(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, kind utils.RWType, actionType utils.ActionType) { +func checkHit(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, kind utils.RWType, actionType utils.ActionType) { var peers []*metapb.Peer if kind == utils.Read { peers = []*metapb.Peer{region.GetLeader()} @@ -171,7 +171,7 @@ func checkOp(re *require.Assertions, ret []*HotPeerStat, storeID uint64, actionT } // checkIntervalSum checks whether the interval sum of the peers are different. -func checkIntervalSum(cache *hotPeerCache, region *core.RegionInfo) bool { +func checkIntervalSum(cache *HotPeerCache, region *core.RegionInfo) bool { var intervalSums []int for _, peer := range region.GetPeers() { oldItem := cache.getOldHotPeerStat(region.GetID(), peer.StoreId) @@ -317,7 +317,7 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem := cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + newItem := cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Nil(newItem) // new peer, interval is larger than report interval, but no hot @@ -326,7 +326,7 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 1.0 utils.MinHotThresholds[utils.RegionReadKeys] = 1.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 1.0 - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Empty(newItem) // new peer, interval is less than report interval @@ -335,45 +335,45 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.NotNil(newItem) re.Equal(0, newItem[0].HotDegree) re.Equal(0, newItem[0].AntiCount) // sum of interval is less than report interval deltaLoads = []float64{60.0, 60.0, 60.0} - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Equal(0, 
newItem[0].HotDegree) re.Equal(0, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot newItem[0].AntiCount = utils.Read.DefaultAntiCount() - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Equal(1, newItem[0].HotDegree) re.Equal(2*m, newItem[0].AntiCount) // sum of interval is less than report interval - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Equal(1, newItem[0].HotDegree) re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot interval = 10 - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Equal(2, newItem[0].HotDegree) re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold utils.MinHotThresholds[utils.RegionReadBytes] = 10.0 utils.MinHotThresholds[utils.RegionReadKeys] = 10.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 10.0 - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Equal(1, newItem[0].HotDegree) re.Equal(2*m-1, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold for i := 0; i < 2*m-1; i++ { - cache.updateStat(newItem[0]) - newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + cache.UpdateStat(newItem[0]) + newItem = cache.CheckPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } re.Negative(newItem[0].HotDegree) re.Equal(0, newItem[0].AntiCount) @@ -422,7 +422,7 @@ func testMetrics(re *require.Assertions, interval, byteRate, expectThreshold flo } else { item = cache.updateHotPeerStat(nil, newItem, oldItem, loads, time.Duration(interval)*time.Second, utils.Direct) } - cache.updateStat(item) + cache.UpdateStat(item) } thresholds := cache.calcHotThresholds(storeID) if i < TopNN { @@ -521,7 +521,7 @@ func TestRemoveFromCacheRandom(t *testing.T) { } } -func checkCoolDown(re *require.Assertions, cache *hotPeerCache, region *core.RegionInfo, expect bool) { +func checkCoolDown(re *require.Assertions, cache *HotPeerCache, region *core.RegionInfo, expect bool) { item := cache.getOldHotPeerStat(region.GetID(), region.GetLeader().GetStoreId()) re.Equal(expect, item.IsNeedCoolDownTransferLeader(3, cache.kind)) } @@ -680,9 +680,9 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { StartTimestamp: start, EndTimestamp: end, })) - stats := cache.checkPeerFlow(newRegion, newRegion.GetPeers(), newRegion.GetLoads(), end-start) + stats := cache.CheckPeerFlow(newRegion, newRegion.GetPeers(), newRegion.GetLoads(), end-start) for _, stat := range stats { - cache.updateStat(stat) + cache.UpdateStat(stat) } } if ThresholdsUpdateInterval == 0 { @@ -710,9 +710,9 @@ func BenchmarkCheckRegionFlow(b *testing.B) { region := buildRegion(utils.Read, 3, 10) b.ResetTimer() for i := 0; i < b.N; i++ { - stats := cache.checkPeerFlow(region, region.GetPeers(), region.GetLoads(), 10) + stats := 
cache.CheckPeerFlow(region, region.GetPeers(), region.GetLoads(), 10) for _, stat := range stats { - cache.updateStat(stat) + cache.UpdateStat(stat) } } } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index be0ba39b8996..851632bd61a0 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -958,7 +958,13 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) + checkReadPeerTask := func(cache *statistics.HotPeerCache) { + stats := cache.CheckPeerFlow(region, []*metapb.Peer{peer}, loads, interval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + c.hotStat.CheckReadAsync(checkReadPeerTask) } } for _, stat := range stats.GetSnapshotStats() { @@ -981,7 +987,13 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest } if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. - c.hotStat.CheckReadAsync(statistics.NewCollectUnReportedPeerTask(storeID, regions, interval)) + collectUnReportedPeerTask := func(cache *statistics.HotPeerCache) { + stats := cache.CheckColdPeer(storeID, regions, interval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + c.hotStat.CheckReadAsync(collectUnReportedPeerTask) } return nil } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index ee7c477476be..baf862131a5f 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -32,6 +32,7 @@ import ( "github.com/pingcap/kvproto/pkg/eraftpb" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/log" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" @@ -55,6 +56,7 @@ import ( "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/statistics/utils" "github.com/tikv/pd/pkg/storage" + "github.com/tikv/pd/pkg/utils/logutil" "github.com/tikv/pd/pkg/utils/operatorutil" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/utils/typeutil" @@ -3736,7 +3738,6 @@ func BenchmarkHandleStatsAsync(b *testing.B) { // Setup: create a new instance of Cluster ctx, cancel := context.WithCancel(context.Background()) defer cancel() - _, opt, _ := newTestScheduleConfig() c := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) c.coordinator = schedule.NewCoordinator(ctx, c, nil) @@ -3754,7 +3755,6 @@ func BenchmarkHandleStatsAsync(b *testing.B) { core.SetApproximateSize(10), core.SetReportInterval(0, 10), ) - // Reset timer after setup b.ResetTimer() // Run HandleStatsAsync b.N times @@ -3762,3 +3762,65 @@ func BenchmarkHandleStatsAsync(b *testing.B) { cluster.HandleStatsAsync(c, region) } } + +func BenchmarkHandleRegionHeartbeat(b *testing.B) { + // Setup: create a new instance of Cluster + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _, opt, _ := newTestScheduleConfig() + c := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) + c.coordinator = schedule.NewCoordinator(ctx, c, nil) + c.SetPrepared() + log.SetLevel(logutil.StringToZapLogLevel("fatal")) + peers := []*metapb.Peer{ + {Id: 11, StoreId: 1}, + {Id: 22, StoreId: 2}, + {Id: 33, StoreId: 3}, + } + queryStats := 
&pdpb.QueryStats{ + Get: 5, + Coprocessor: 6, + Scan: 7, + Put: 8, + Delete: 9, + DeleteRange: 10, + AcquirePessimisticLock: 11, + Rollback: 12, + Prewrite: 13, + Commit: 14, + } + interval := &pdpb.TimeInterval{StartTimestamp: 0, EndTimestamp: 10} + downPeers := []*pdpb.PeerStats{{Peer: peers[1], DownSeconds: 100}, {Peer: peers[2], DownSeconds: 100}} + pendingPeers := []*metapb.Peer{peers[1], peers[2]} + + var requests []*pdpb.RegionHeartbeatRequest + for i := 0; i < 1000000; i++ { + request := &pdpb.RegionHeartbeatRequest{ + Region: &metapb.Region{Id: 10, Peers: peers, StartKey: []byte{byte(i)}, EndKey: []byte{byte(i + 1)}}, + Leader: peers[0], + DownPeers: downPeers, + PendingPeers: pendingPeers, + BytesWritten: 10, + BytesRead: 20, + KeysWritten: 100, + KeysRead: 200, + ApproximateSize: 30 * units.MiB, + ApproximateKeys: 300, + Interval: interval, + QueryStats: queryStats, + Term: 1, + CpuUsage: 100, + } + requests = append(requests, request) + } + flowRoundDivisor := opt.GetPDServerConfig().FlowRoundByDigit + + // Reset timer after setup + b.ResetTimer() + // Run HandleRegionHeartbeat b.N times + for i := 0; i < b.N; i++ { + region := core.RegionFromHeartbeat(requests[i], flowRoundDivisor) + c.HandleRegionHeartbeat(region) + } +} diff --git a/server/grpc_service.go b/server/grpc_service.go index 5bc1f1109651..e16fa4a8d4fa 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1169,7 +1169,7 @@ func (s *GrpcServer) ReportBuckets(stream pdpb.PD_ReportBucketsServer) error { func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error { var ( server = &heartbeatServer{stream: stream} - flowRoundOption = core.WithFlowRoundByDigit(s.persistOptions.GetPDServerConfig().FlowRoundByDigit) + flowRoundDivisor = s.persistOptions.GetPDServerConfig().FlowRoundByDigit cancel context.CancelFunc lastBind time.Time errCh chan error @@ -1264,11 +1264,11 @@ func (s *GrpcServer) RegionHeartbeat(stream pdpb.PD_RegionHeartbeatServer) error regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "bind").Inc() s.hbStreams.BindStream(storeID, server) // refresh FlowRoundByDigit - flowRoundOption = core.WithFlowRoundByDigit(s.persistOptions.GetPDServerConfig().FlowRoundByDigit) + flowRoundDivisor = s.persistOptions.GetPDServerConfig().FlowRoundByDigit lastBind = time.Now() } - region := core.RegionFromHeartbeat(request, flowRoundOption) + region := core.RegionFromHeartbeat(request, flowRoundDivisor) if region.GetLeader() == nil { log.Error("invalid request, the leader is nil", zap.Reflect("request", request), errs.ZapError(errs.ErrLeaderNil)) regionHeartbeatCounter.WithLabelValues(storeAddress, storeLabel, "report", "invalid-leader").Inc() diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 9e70a52d11d9..e03ef2fe3184 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -1333,22 +1333,22 @@ func TestStaleTermHeartbeat(t *testing.T) { Term: 5, ApproximateSize: 10, } - - region := core.RegionFromHeartbeat(regionReq) + flowRoundDivisor := leaderServer.GetConfig().PDServerCfg.FlowRoundByDigit + region := core.RegionFromHeartbeat(regionReq, flowRoundDivisor) err = rc.HandleRegionHeartbeat(region) re.NoError(err) // Transfer leader regionReq.Term = 6 regionReq.Leader = peers[1] - region = core.RegionFromHeartbeat(regionReq) + region = core.RegionFromHeartbeat(regionReq, flowRoundDivisor) err = rc.HandleRegionHeartbeat(region) re.NoError(err) // issue #3379 regionReq.KeysWritten = 
uint64(18446744073709551615) // -1 regionReq.BytesWritten = uint64(18446744073709550602) // -1024 - region = core.RegionFromHeartbeat(regionReq) + region = core.RegionFromHeartbeat(regionReq, flowRoundDivisor) re.Equal(uint64(0), region.GetKeysWritten()) re.Equal(uint64(0), region.GetBytesWritten()) err = rc.HandleRegionHeartbeat(region) @@ -1357,14 +1357,14 @@ func TestStaleTermHeartbeat(t *testing.T) { // Stale heartbeat, update check should fail regionReq.Term = 5 regionReq.Leader = peers[0] - region = core.RegionFromHeartbeat(regionReq) + region = core.RegionFromHeartbeat(regionReq, flowRoundDivisor) err = rc.HandleRegionHeartbeat(region) re.Error(err) // Allow regions that are created by unsafe recover to send a heartbeat, even though they // are considered "stale" because their conf ver and version are both equal to 1. regionReq.Region.RegionEpoch.ConfVer = 1 - region = core.RegionFromHeartbeat(regionReq) + region = core.RegionFromHeartbeat(regionReq, flowRoundDivisor) err = rc.HandleRegionHeartbeat(region) re.NoError(err) } diff --git a/tools/pd-ctl/tests/hot/hot_test.go b/tools/pd-ctl/tests/hot/hot_test.go index f65b811b36a8..dea49a1ffdd0 100644 --- a/tools/pd-ctl/tests/hot/hot_test.go +++ b/tools/pd-ctl/tests/hot/hot_test.go @@ -191,7 +191,13 @@ func (suite *hotTestSuite) checkHot(cluster *pdTests.TestCluster) { region := core.NewRegionInfo(&metapb.Region{ Id: hotRegionID, }, leader) - hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{leader}, loads, reportInterval)) + checkReadPeerTask := func(cache *statistics.HotPeerCache) { + stats := cache.CheckPeerFlow(region, []*metapb.Peer{leader}, loads, reportInterval) + for _, stat := range stats { + cache.UpdateStat(stat) + } + } + hotStat.CheckReadAsync(checkReadPeerTask) testutil.Eventually(re, func() bool { hotPeerStat := getHotPeerStat(utils.Read, hotRegionID, hotStoreID) return hotPeerStat != nil From 9dff6e60277a2ccb62cb6b513a94b408f01a77d0 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 13 Jun 2024 17:41:59 +0800 Subject: [PATCH 29/47] mcs: tso server compare address without scheme (#8283) close tikv/pd#8284 Signed-off-by: lhy1024 Co-authored-by: Hu# --- pkg/keyspace/tso_keyspace_group.go | 2 +- pkg/storage/endpoint/tso_keyspace_group.go | 9 +++++++++ pkg/tso/keyspace_group_manager.go | 4 ++-- pkg/tso/keyspace_group_manager_test.go | 2 +- pkg/utils/typeutil/comparison.go | 9 +++++++++ pkg/utils/typeutil/comparison_test.go | 8 ++++++++ server/apiv2/handlers/tso_keyspace_group.go | 2 +- 7 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pkg/keyspace/tso_keyspace_group.go b/pkg/keyspace/tso_keyspace_group.go index 5ed9747e9234..f42abeee23da 100644 --- a/pkg/keyspace/tso_keyspace_group.go +++ b/pkg/keyspace/tso_keyspace_group.go @@ -874,7 +874,7 @@ func (m *GroupManager) SetPriorityForKeyspaceGroup(id uint32, node string, prior inKeyspaceGroup := false members := make([]endpoint.KeyspaceGroupMember, 0, len(kg.Members)) for _, member := range kg.Members { - if member.Address == node { + if member.CompareAddress(node) { inKeyspaceGroup = true member.Priority = priority } diff --git a/pkg/storage/endpoint/tso_keyspace_group.go b/pkg/storage/endpoint/tso_keyspace_group.go index ba322336feb4..d24b6e0dd1a4 100644 --- a/pkg/storage/endpoint/tso_keyspace_group.go +++ b/pkg/storage/endpoint/tso_keyspace_group.go @@ -20,6 +20,7 @@ import ( "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/storage/kv" + "github.com/tikv/pd/pkg/utils/typeutil" "go.etcd.io/etcd/clientv3" ) @@ -80,6 +81,14 @@ type 
KeyspaceGroupMember struct { Priority int `json:"priority"` } +// CompareAddress compares the address with the given address. +// It compares the address without the scheme. +// Otherwise, it will not work when we update the scheme from http to https. +// Issue: https://github.com/tikv/pd/issues/8284 +func (m *KeyspaceGroupMember) CompareAddress(addr string) bool { + return typeutil.EqualBaseURLs(m.Address, addr) +} + // SplitState defines the split state of a keyspace group. type SplitState struct { // SplitSource is the current keyspace group ID from which the keyspace group is split. diff --git a/pkg/tso/keyspace_group_manager.go b/pkg/tso/keyspace_group_manager.go index b2af48f08da1..ae4cca833746 100644 --- a/pkg/tso/keyspace_group_manager.go +++ b/pkg/tso/keyspace_group_manager.go @@ -290,7 +290,7 @@ func (s *state) getNextPrimaryToReset( if member.Priority > maxPriority { maxPriority = member.Priority } - if member.Address == localAddress { + if member.CompareAddress(localAddress) { localPriority = member.Priority } } @@ -667,7 +667,7 @@ func (kgm *KeyspaceGroupManager) primaryPriorityCheckLoop() { func (kgm *KeyspaceGroupManager) isAssignedToMe(group *endpoint.KeyspaceGroup) bool { return slice.AnyOf(group.Members, func(i int) bool { - return group.Members[i].Address == kgm.tsoServiceID.ServiceAddr + return group.Members[i].CompareAddress(kgm.tsoServiceID.ServiceAddr) }) } diff --git a/pkg/tso/keyspace_group_manager_test.go b/pkg/tso/keyspace_group_manager_test.go index ad67c49fa5ef..0e237fb32f0f 100644 --- a/pkg/tso/keyspace_group_manager_test.go +++ b/pkg/tso/keyspace_group_manager_test.go @@ -891,7 +891,7 @@ func collectAssignedKeyspaceGroupIDs(re *require.Assertions, kgm *KeyspaceGroupM re.Equal(i, int(am.kgID)) re.Equal(i, int(kg.ID)) for _, m := range kg.Members { - if m.Address == kgm.tsoServiceID.ServiceAddr { + if m.CompareAddress(kgm.tsoServiceID.ServiceAddr) { ids = append(ids, uint32(i)) break } diff --git a/pkg/utils/typeutil/comparison.go b/pkg/utils/typeutil/comparison.go index c976ac471021..d86e50543294 100644 --- a/pkg/utils/typeutil/comparison.go +++ b/pkg/utils/typeutil/comparison.go @@ -17,6 +17,7 @@ package typeutil import ( "math" "sort" + "strings" "time" ) @@ -78,3 +79,11 @@ func AreStringSlicesEquivalent(a, b []string) bool { func Float64Equal(a, b float64) bool { return math.Abs(a-b) <= 1e-6 } + +// EqualBaseURLs compares two URLs without scheme. 
+func EqualBaseURLs(url1, url2 string) bool { + trimScheme := func(s string) string { + return strings.TrimPrefix(strings.TrimPrefix(s, "https://"), "http://") + } + return trimScheme(url1) == trimScheme(url2) +} diff --git a/pkg/utils/typeutil/comparison_test.go b/pkg/utils/typeutil/comparison_test.go index b53e961b4eea..21c802ce382c 100644 --- a/pkg/utils/typeutil/comparison_test.go +++ b/pkg/utils/typeutil/comparison_test.go @@ -66,3 +66,11 @@ func TestAreStringSlicesEquivalent(t *testing.T) { re.False(AreStringSlicesEquivalent([]string{}, []string{"a", "b"})) re.False(AreStringSlicesEquivalent([]string{"a", "b"}, []string{})) } + +func TestCompareURLsWithoutScheme(t *testing.T) { + re := require.New(t) + re.True(EqualBaseURLs("", "")) + re.True(EqualBaseURLs("http://127.0.0.1", "http://127.0.0.1")) + re.True(EqualBaseURLs("http://127.0.0.1", "https://127.0.0.1")) + re.True(EqualBaseURLs("127.0.0.1", "http://127.0.0.1")) +} diff --git a/server/apiv2/handlers/tso_keyspace_group.go b/server/apiv2/handlers/tso_keyspace_group.go index 835bda9d7bb5..d11cc2adab81 100644 --- a/server/apiv2/handlers/tso_keyspace_group.go +++ b/server/apiv2/handlers/tso_keyspace_group.go @@ -507,7 +507,7 @@ func SetPriorityForKeyspaceGroup(c *gin.Context) { // check if node exists members := kg.Members if slice.NoneOf(members, func(i int) bool { - return members[i].Address == node + return members[i].CompareAddress(node) }) { c.AbortWithStatusJSON(http.StatusBadRequest, "tso node does not exist in the keyspace group") } From f1e85de36e9c75e716a705495eb0f8b2e6397ea6 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Fri, 14 Jun 2024 14:34:13 +0800 Subject: [PATCH 30/47] *: refine code, decrease indent, and rename (#8276) ref tikv/pd#4399 - decrease indent for code readability - use `continue loop` to instead with `foundDisabled` variable, this is more concise - no other logic updates - rename `process` to `progress` Signed-off-by: okJiang <819421878@qq.com> Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/progress/progress.go | 122 +++++++++--------- pkg/schedule/schedulers/evict_slow_store.go | 9 +- .../schedulers/scheduler_controller.go | 35 +++-- server/cluster/cluster.go | 37 +++--- 4 files changed, 103 insertions(+), 100 deletions(-) diff --git a/pkg/progress/progress.go b/pkg/progress/progress.go index 855aa793a83e..8319a395ac84 100644 --- a/pkg/progress/progress.go +++ b/pkg/progress/progress.go @@ -130,47 +130,50 @@ func (m *Manager) UpdateProgress(progress string, current, remaining float64, is m.Lock() defer m.Unlock() - if p, exist := m.progresses[progress]; exist { - for _, op := range opts { - op(p) - } - p.remaining = remaining - if p.total < remaining { - p.total = remaining - } + p, exist := m.progresses[progress] + if !exist { + return + } - p.history.PushBack(current) - p.currentWindowLength++ + for _, op := range opts { + op(p) + } + p.remaining = remaining + if p.total < remaining { + p.total = remaining + } - // try to move `front` into correct place. - for p.currentWindowLength > p.windowLength { - p.front = p.front.Next() - p.currentWindowLength-- - } - for p.currentWindowLength < p.windowLength && p.front.Prev() != nil { - p.front = p.front.Prev() - p.currentWindowLength++ - } + p.history.PushBack(current) + p.currentWindowLength++ - for p.history.Len() > p.windowCapacity { - p.history.Remove(p.history.Front()) - } + // try to move `front` into correct place. 
+ for p.currentWindowLength > p.windowLength { + p.front = p.front.Next() + p.currentWindowLength-- + } + for p.currentWindowLength < p.windowLength && p.front.Prev() != nil { + p.front = p.front.Prev() + p.currentWindowLength++ + } - // It means it just init and we haven't update the progress - if p.history.Len() <= 1 { - p.lastSpeed = 0 - } else if isInc { - // the value increases, e.g., [1, 2, 3] - p.lastSpeed = (current - p.front.Value.(float64)) / - (float64(p.currentWindowLength-1) * p.updateInterval.Seconds()) - } else { - // the value decreases, e.g., [3, 2, 1] - p.lastSpeed = (p.front.Value.(float64) - current) / - (float64(p.currentWindowLength-1) * p.updateInterval.Seconds()) - } - if p.lastSpeed < 0 { - p.lastSpeed = 0 - } + for p.history.Len() > p.windowCapacity { + p.history.Remove(p.history.Front()) + } + + // It means it just init and we haven't update the progress + if p.history.Len() <= 1 { + p.lastSpeed = 0 + } else if isInc { + // the value increases, e.g., [1, 2, 3] + p.lastSpeed = (current - p.front.Value.(float64)) / + (float64(p.currentWindowLength-1) * p.updateInterval.Seconds()) + } else { + // the value decreases, e.g., [3, 2, 1] + p.lastSpeed = (p.front.Value.(float64) - current) / + (float64(p.currentWindowLength-1) * p.updateInterval.Seconds()) + } + if p.lastSpeed < 0 { + p.lastSpeed = 0 } } @@ -201,39 +204,40 @@ func (m *Manager) GetProgresses(filter func(p string) bool) []string { m.RLock() defer m.RUnlock() - processes := []string{} + progresses := make([]string, 0, len(m.progresses)) for p := range m.progresses { if filter(p) { - processes = append(processes, p) + progresses = append(progresses, p) } } - return processes + return progresses } // Status returns the current progress status of a give name. -func (m *Manager) Status(progress string) (process, leftSeconds, currentSpeed float64, err error) { +func (m *Manager) Status(progressName string) (progress, leftSeconds, currentSpeed float64, err error) { m.RLock() defer m.RUnlock() - if p, exist := m.progresses[progress]; exist { - process = 1 - p.remaining/p.total - if process < 0 { - process = 0 - err = errs.ErrProgressWrongStatus.FastGenByArgs(fmt.Sprintf("the remaining: %v is larger than the total: %v", p.remaining, p.total)) - return - } - currentSpeed = p.lastSpeed - // When the progress is newly added, there is no last speed. - if p.lastSpeed == 0 && p.history.Len() <= 1 { - currentSpeed = 0 - } - - leftSeconds = p.remaining / currentSpeed - if math.IsNaN(leftSeconds) || math.IsInf(leftSeconds, 0) { - leftSeconds = math.MaxFloat64 - } + p, exist := m.progresses[progressName] + if !exist { + err = errs.ErrProgressNotFound.FastGenByArgs(fmt.Sprintf("the progress: %s", progressName)) + return + } + progress = 1 - p.remaining/p.total + if progress < 0 { + progress = 0 + err = errs.ErrProgressWrongStatus.FastGenByArgs(fmt.Sprintf("the remaining: %v is larger than the total: %v", p.remaining, p.total)) return } - err = errs.ErrProgressNotFound.FastGenByArgs(fmt.Sprintf("the progress: %s", progress)) + currentSpeed = p.lastSpeed + // When the progress is newly added, there is no last speed. 
+ if p.lastSpeed == 0 && p.history.Len() <= 1 { + currentSpeed = 0 + } + + leftSeconds = p.remaining / currentSpeed + if math.IsNaN(leftSeconds) || math.IsInf(leftSeconds, 0) { + leftSeconds = math.MaxFloat64 + } return } diff --git a/pkg/schedule/schedulers/evict_slow_store.go b/pkg/schedule/schedulers/evict_slow_store.go index ab30b2568234..9b13e292c87c 100644 --- a/pkg/schedule/schedulers/evict_slow_store.go +++ b/pkg/schedule/schedulers/evict_slow_store.go @@ -282,7 +282,6 @@ func (s *evictSlowStoreScheduler) IsScheduleAllowed(cluster sche.SchedulerCluste func (s *evictSlowStoreScheduler) Schedule(cluster sche.SchedulerCluster, _ bool) ([]*operator.Operator, []plan.Plan) { evictSlowStoreCounter.Inc() - var ops []*operator.Operator if s.conf.evictStore() != 0 { store := cluster.GetStore(s.conf.evictStore()) @@ -298,7 +297,7 @@ func (s *evictSlowStoreScheduler) Schedule(cluster sche.SchedulerCluster, _ bool return s.schedulerEvictLeader(cluster), nil } s.cleanupEvictLeader(cluster) - return ops, nil + return nil, nil } var slowStore *core.StoreInfo @@ -311,14 +310,14 @@ func (s *evictSlowStoreScheduler) Schedule(cluster sche.SchedulerCluster, _ bool if (store.IsPreparing() || store.IsServing()) && store.IsSlow() { // Do nothing if there is more than one slow store. if slowStore != nil { - return ops, nil + return nil, nil } slowStore = store } } if slowStore == nil || slowStore.GetSlowScore() < slowStoreEvictThreshold { - return ops, nil + return nil, nil } // If there is only one slow store, evict leaders from that store. @@ -327,7 +326,7 @@ func (s *evictSlowStoreScheduler) Schedule(cluster sche.SchedulerCluster, _ bool err := s.prepareEvictLeader(cluster, slowStore.GetID()) if err != nil { log.Info("prepare for evicting leader failed", zap.Error(err), zap.Uint64("store-id", slowStore.GetID())) - return ops, nil + return nil, nil } return s.schedulerEvictLeader(cluster), nil } diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index 334a2f1199af..ea480a06845e 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -456,6 +456,7 @@ func (s *ScheduleController) Stop() { // Schedule tries to create some operators. func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator { +retry: for i := 0; i < maxScheduleRetries; i++ { // no need to retry if schedule should stop to speed exit select { @@ -470,29 +471,27 @@ func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator { if diagnosable { s.diagnosticRecorder.SetResultFromPlans(ops, plans) } - foundDisabled := false + if len(ops) == 0 { + continue + } + + // If we have schedule, reset interval to the minimal interval. + s.nextInterval = s.Scheduler.GetMinInterval() for _, op := range ops { - if labelMgr := s.cluster.GetRegionLabeler(); labelMgr != nil { - region := s.cluster.GetRegion(op.RegionID()) - if region == nil { - continue - } - if labelMgr.ScheduleDisabled(region) { - denySchedulersByLabelerCounter.Inc() - foundDisabled = true - break - } + region := s.cluster.GetRegion(op.RegionID()) + if region == nil { + continue retry } - } - if len(ops) > 0 { - // If we have schedule, reset interval to the minimal interval. 
- s.nextInterval = s.Scheduler.GetMinInterval() - // try regenerating operators - if foundDisabled { + labelMgr := s.cluster.GetRegionLabeler() + if labelMgr == nil { continue } - return ops + if labelMgr.ScheduleDisabled(region) { + denySchedulersByLabelerCounter.Inc() + continue retry + } } + return ops } s.nextInterval = s.Scheduler.GetNextInterval(s.nextInterval) return nil diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 851632bd61a0..747cf2dc5388 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1345,21 +1345,22 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), zap.Bool("physically-destroyed", newStore.IsPhysicallyDestroyed())) - err := c.setStore(newStore) - if err == nil { - regionSize := float64(c.GetStoreRegionSize(storeID)) - c.resetProgress(storeID, store.GetAddress()) - c.progressManager.AddProgress(encodeRemovingProgressKey(storeID), regionSize, regionSize, nodeStateCheckJobInterval, progress.WindowDurationOption(c.GetCoordinator().GetPatrolRegionsDuration())) - // record the current store limit in memory - c.prevStoreLimit[storeID] = map[storelimit.Type]float64{ - storelimit.AddPeer: c.GetStoreLimitByType(storeID, storelimit.AddPeer), - storelimit.RemovePeer: c.GetStoreLimitByType(storeID, storelimit.RemovePeer), - } - // TODO: if the persist operation encounters error, the "Unlimited" will be rollback. - // And considering the store state has changed, RemoveStore is actually successful. - _ = c.SetStoreLimit(storeID, storelimit.RemovePeer, storelimit.Unlimited) + + if err := c.setStore(newStore); err != nil { + return err } - return err + regionSize := float64(c.GetStoreRegionSize(storeID)) + c.resetProgress(storeID, store.GetAddress()) + c.progressManager.AddProgress(encodeRemovingProgressKey(storeID), regionSize, regionSize, nodeStateCheckJobInterval, progress.WindowDurationOption(c.GetCoordinator().GetPatrolRegionsDuration())) + // record the current store limit in memory + c.prevStoreLimit[storeID] = map[storelimit.Type]float64{ + storelimit.AddPeer: c.GetStoreLimitByType(storeID, storelimit.AddPeer), + storelimit.RemovePeer: c.GetStoreLimitByType(storeID, storelimit.RemovePeer), + } + // TODO: if the persist operation encounters error, the "Unlimited" will be rollback. + // And considering the store state has changed, RemoveStore is actually successful. + _ = c.SetStoreLimit(storeID, storelimit.RemovePeer, storelimit.Unlimited) + return nil } func (c *RaftCluster) checkReplicaBeforeOfflineStore(storeID uint64) error { @@ -1846,14 +1847,14 @@ func (c *RaftCluster) updateProgress(storeID uint64, storeAddress, action string return } c.progressManager.UpdateProgress(progressName, current, remaining, isInc, opts...) 
- process, ls, cs, err := c.progressManager.Status(progressName) + progress, leftSeconds, currentSpeed, err := c.progressManager.Status(progressName) if err != nil { log.Error("get progress status failed", zap.String("progress", progressName), zap.Float64("remaining", remaining), errs.ZapError(err)) return } - storesProgressGauge.WithLabelValues(storeAddress, storeLabel, action).Set(process) - storesSpeedGauge.WithLabelValues(storeAddress, storeLabel, action).Set(cs) - storesETAGauge.WithLabelValues(storeAddress, storeLabel, action).Set(ls) + storesProgressGauge.WithLabelValues(storeAddress, storeLabel, action).Set(progress) + storesSpeedGauge.WithLabelValues(storeAddress, storeLabel, action).Set(currentSpeed) + storesETAGauge.WithLabelValues(storeAddress, storeLabel, action).Set(leftSeconds) } func (c *RaftCluster) resetProgress(storeID uint64, storeAddress string) { From 13174b5d4cab4d958f0b4ea2718ea7bb74992bc7 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Fri, 14 Jun 2024 14:41:44 +0800 Subject: [PATCH 31/47] *: enable errcheck (#8277) ref tikv/pd#1919 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .golangci.yml | 13 ++++++++++++- client/http/types.go | 2 +- client/pd_service_discovery.go | 4 +++- client/resource_manager_client.go | 4 +++- client/tso_client.go | 4 +++- client/tso_service_discovery.go | 8 ++++++-- pkg/core/region.go | 4 ++-- pkg/election/leadership.go | 2 +- pkg/election/lease.go | 4 +++- pkg/mcs/scheduling/server/cluster.go | 10 +++++----- pkg/mcs/utils/util.go | 8 ++++++-- pkg/schedule/scatter/region_scatterer.go | 2 +- pkg/utils/apiutil/apiutil.go | 6 +++--- pkg/utils/typeutil/clone.go | 2 +- server/cluster/cluster.go | 12 ++++++------ 15 files changed, 56 insertions(+), 29 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 283de8e96b0a..be6dc92b18c5 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -14,7 +14,6 @@ linters: - testifylint - gofmt - revive - disable: - errcheck linters-settings: gocritic: @@ -199,3 +198,15 @@ linters-settings: severity: warning disabled: false exclude: [""] +issues: + exclude-rules: + - path: (_test\.go|pkg/mock/.*\.go|tests/.*\.go) + linters: + - errcheck + # following path will enable in the future + - path: (pd-analysis|pd-api-bench|pd-backup|pd-ctl|pd-heartbeat-bench|pd-recover|pd-simulator|pd-tso-bench|pd-ut|regions-dump|stores-dump) + linters: + - errcheck + - path: (pkg/schedule/labeler/labeler.go|pkg/mcs/tso/server/config.go|pkg/tso/admin.go|pkg/mcs/tso/server/grpc_service.go|pkg/schedule/schedulers/split_bucket.go|server/api/plugin_disable.go|server/api/plugin_disable.go|server/api/operator.go|server/api/region.go|pkg/schedule/schedulers/balance_leader.go|pkg/mcs/resourcemanager/server/server.go|pkg/mcs/scheduling/server/grpc_service.go|pkg/mcs/resourcemanager/server/.*\.go|plugin/scheduler_example/evict_leader.go|server/api/.*\.go|pkg/replication/replication_mode.go|pkg/mcs/scheduling/server/server.go|pkg/storage/endpoint/gc_safe_point.go|server/.*\.go|pkg/schedule/schedulers/.*\.go|pkg/schedule/placement/rule.go|pkg/mcs/utils/util.go|pkg/keyspace/tso_keyspace_group.go|pkg/tso/allocator_manager.go|pkg/core/store_stats.go|pkg/autoscaling/handler.go|pkg/core/store_stats.go|pkg/keyspace/keyspace.go|pkg/storage/hot_region_storage.go|pkg/syncer/server.go) + linters: + - errcheck diff --git a/client/http/types.go b/client/http/types.go index f7273068b8cd..ab6240494360 100644 --- a/client/http/types.go +++ b/client/http/types.go @@ -366,7 +366,7 @@ func (r *Rule) 
String() string { // Clone returns a copy of Rule. func (r *Rule) Clone() *Rule { var clone Rule - json.Unmarshal([]byte(r.String()), &clone) + _ = json.Unmarshal([]byte(r.String()), &clone) clone.StartKey = append(r.StartKey[:0:0], r.StartKey...) clone.EndKey = append(r.EndKey[:0:0], r.EndKey...) return &clone diff --git a/client/pd_service_discovery.go b/client/pd_service_discovery.go index 97e82ec3321b..9378ed278e0f 100644 --- a/client/pd_service_discovery.go +++ b/client/pd_service_discovery.go @@ -805,7 +805,9 @@ func (c *pdServiceDiscovery) SetTSOLocalServURLsUpdatedCallback(callback tsoLoca func (c *pdServiceDiscovery) SetTSOGlobalServURLUpdatedCallback(callback tsoGlobalServURLUpdatedFunc) { url := c.getLeaderURL() if len(url) > 0 { - callback(url) + if err := callback(url); err != nil { + log.Error("[tso] failed to call back when tso global service url update", zap.String("url", url), errs.ZapError(err)) + } } c.tsoGlobalAllocLeaderUpdatedCb = callback } diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 98b123c08234..19adbd199b01 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -319,7 +319,9 @@ func (c *client) handleResourceTokenDispatcher(dispatcherCtx context.Context, tb // If the stream is nil or the leader has changed, try to reconnect. if toReconnect { connection.reset() - c.tryResourceManagerConnect(dispatcherCtx, &connection) + if err := c.tryResourceManagerConnect(dispatcherCtx, &connection); err != nil { + log.Error("[resource_manager] try to connect token leader failed", errs.ZapError(err)) + } log.Info("[resource_manager] token leader may change, try to reconnect the stream") stream, streamCtx = connection.stream, connection.ctx } diff --git a/client/tso_client.go b/client/tso_client.go index 72b09d8054df..5e221eae4785 100644 --- a/client/tso_client.go +++ b/client/tso_client.go @@ -118,7 +118,9 @@ func (c *tsoClient) getOption() *option { return c.option } func (c *tsoClient) getServiceDiscovery() ServiceDiscovery { return c.svcDiscovery } func (c *tsoClient) setup() { - c.svcDiscovery.CheckMemberChanged() + if err := c.svcDiscovery.CheckMemberChanged(); err != nil { + log.Warn("[tso] failed to check member changed", errs.ZapError(err)) + } c.updateTSODispatcher() // Start the daemons. diff --git a/client/tso_service_discovery.go b/client/tso_service_discovery.go index f88403bb3220..443d455e911d 100644 --- a/client/tso_service_discovery.go +++ b/client/tso_service_discovery.go @@ -339,7 +339,9 @@ func (c *tsoServiceDiscovery) ScheduleCheckMemberChanged() { // CheckMemberChanged Immediately check if there is any membership change among the primary/secondaries in // a primary/secondary configured cluster. 
func (c *tsoServiceDiscovery) CheckMemberChanged() error { - c.apiSvcDiscovery.CheckMemberChanged() + if err := c.apiSvcDiscovery.CheckMemberChanged(); err != nil { + log.Warn("[tso] failed to check member changed", errs.ZapError(err)) + } if err := c.retry(tsoQueryRetryMaxTimes, tsoQueryRetryInterval, c.updateMember); err != nil { log.Error("[tso] failed to update member", errs.ZapError(err)) return err @@ -366,7 +368,9 @@ func (c *tsoServiceDiscovery) SetTSOLocalServURLsUpdatedCallback(callback tsoLoc func (c *tsoServiceDiscovery) SetTSOGlobalServURLUpdatedCallback(callback tsoGlobalServURLUpdatedFunc) { url := c.getPrimaryURL() if len(url) > 0 { - callback(url) + if err := callback(url); err != nil { + log.Error("[tso] failed to call back when tso global service url update", zap.String("url", url), errs.ZapError(err)) + } } c.globalAllocPrimariesUpdatedCb = callback } diff --git a/pkg/core/region.go b/pkg/core/region.go index 7f70d7182856..df4cfc17be2b 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -746,7 +746,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { regionID := region.GetID() if logRunner != nil { debug = func(msg string, fields ...zap.Field) { - logRunner.RunTask( + _ = logRunner.RunTask( regionID, "DebugLog", func() { @@ -755,7 +755,7 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { ) } info = func(msg string, fields ...zap.Field) { - logRunner.RunTask( + _ = logRunner.RunTask( regionID, "InfoLog", func() { diff --git a/pkg/election/leadership.go b/pkg/election/leadership.go index 3ee413818a50..f252eabe0726 100644 --- a/pkg/election/leadership.go +++ b/pkg/election/leadership.go @@ -161,7 +161,7 @@ func (ls *Leadership) Campaign(leaseTimeout int64, leaderData string, cmps ...cl failpoint.Inject("skipGrantLeader", func(val failpoint.Value) { var member pdpb.Member - member.Unmarshal([]byte(leaderData)) + _ = member.Unmarshal([]byte(leaderData)) name, ok := val.(string) if ok && member.Name == name { failpoint.Return(errors.Errorf("failed to grant lease")) diff --git a/pkg/election/lease.go b/pkg/election/lease.go index a6b49fb99f84..45d702def5e1 100644 --- a/pkg/election/lease.go +++ b/pkg/election/lease.go @@ -84,7 +84,9 @@ func (l *lease) Close() error { if l.ID.Load() != nil { leaseID = l.ID.Load().(clientv3.LeaseID) } - l.lease.Revoke(ctx, leaseID) + if _, err := l.lease.Revoke(ctx, leaseID); err != nil { + log.Error("revoke lease failed", zap.String("purpose", l.Purpose), errs.ZapError(err)) + } return l.lease.Close() } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index b18db7c07986..4062ed38fd6a 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -624,7 +624,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c // Due to some config changes need to update the region stats as well, // so we do some extra checks here. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.ObserveRegionStatsAsync, func() { @@ -636,7 +636,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c } // region is not updated to the subtree. 
if origin.GetRef() < 2 { - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.UpdateSubTree, func() { @@ -660,7 +660,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c tracer.OnSaveCacheFinished() return err } - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.UpdateSubTree, func() { @@ -669,7 +669,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c ratelimit.WithRetained(retained), ) tracer.OnUpdateSubTreeFinished() - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.HandleOverlaps, func() { @@ -679,7 +679,7 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c } tracer.OnSaveCacheFinished() // handle region stats - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.CollectRegionStatsAsync, func() { diff --git a/pkg/mcs/utils/util.go b/pkg/mcs/utils/util.go index b6ac2eb37e52..fb78f0b4be3c 100644 --- a/pkg/mcs/utils/util.go +++ b/pkg/mcs/utils/util.go @@ -266,7 +266,9 @@ func StopHTTPServer(s server) { ch := make(chan struct{}) go func() { defer close(ch) - s.GetHTTPServer().Shutdown(ctx) + if err := s.GetHTTPServer().Shutdown(ctx); err != nil { + log.Error("http server graceful shutdown failed", errs.ZapError(err)) + } }() select { @@ -274,7 +276,9 @@ func StopHTTPServer(s server) { case <-ctx.Done(): // Took too long, manually close open transports log.Warn("http server graceful shutdown timeout, forcing close") - s.GetHTTPServer().Close() + if err := s.GetHTTPServer().Close(); err != nil { + log.Warn("http server close failed", errs.ZapError(err)) + } // concurrent Graceful Shutdown should be interrupted <-ch } diff --git a/pkg/schedule/scatter/region_scatterer.go b/pkg/schedule/scatter/region_scatterer.go index bdec5c98c9c1..100b9eb764dd 100644 --- a/pkg/schedule/scatter/region_scatterer.go +++ b/pkg/schedule/scatter/region_scatterer.go @@ -255,7 +255,7 @@ func (r *RegionScatterer) scatterRegions(regions map[uint64]*core.RegionInfo, fa continue } failpoint.Inject("scatterHbStreamsDrain", func() { - r.opController.GetHBStreams().Drain(1) + _ = r.opController.GetHBStreams().Drain(1) r.opController.RemoveOperator(op, operator.AdminStop) }) } diff --git a/pkg/utils/apiutil/apiutil.go b/pkg/utils/apiutil/apiutil.go index 20465d8376c9..2503ba9aecfb 100644 --- a/pkg/utils/apiutil/apiutil.go +++ b/pkg/utils/apiutil/apiutil.go @@ -116,14 +116,14 @@ func TagJSONError(err error) error { func ErrorResp(rd *render.Render, w http.ResponseWriter, err error) { if err == nil { log.Error("nil is given to errorResp") - rd.JSON(w, http.StatusInternalServerError, "nil error") + _ = rd.JSON(w, http.StatusInternalServerError, "nil error") return } if errCode := errcode.CodeChain(err); errCode != nil { w.Header().Set("TiDB-Error-Code", errCode.Code().CodeStr().String()) - rd.JSON(w, errCode.Code().HTTPCode(), errcode.NewJSONFormat(errCode)) + _ = rd.JSON(w, errCode.Code().HTTPCode(), errcode.NewJSONFormat(errCode)) } else { - rd.JSON(w, http.StatusInternalServerError, err.Error()) + _ = rd.JSON(w, http.StatusInternalServerError, err.Error()) } } diff --git a/pkg/utils/typeutil/clone.go b/pkg/utils/typeutil/clone.go index c8e27b1c2065..783c7b44be2b 100644 --- a/pkg/utils/typeutil/clone.go +++ b/pkg/utils/typeutil/clone.go @@ -36,6 +36,6 @@ func DeepClone[T Codec](src T, factory func() T) T { return dst } dst := factory() - dst.Unmarshal(b) + _ = dst.Unmarshal(b) return dst } diff --git a/server/cluster/cluster.go 
b/server/cluster/cluster.go index 747cf2dc5388..534d8361b2a2 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1058,7 +1058,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // region stats needs to be collected in API mode. // We need to think of a better way to reduce this part of the cost in the future. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { - ctx.MiscRunner.RunTask( + _ = ctx.MiscRunner.RunTask( regionID, ratelimit.ObserveRegionStatsAsync, func() { @@ -1070,7 +1070,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio } // region is not updated to the subtree. if origin.GetRef() < 2 { - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.UpdateSubTree, func() { @@ -1098,7 +1098,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnSaveCacheFinished() return err } - ctx.TaskRunner.RunTask( + _ = ctx.TaskRunner.RunTask( regionID, ratelimit.UpdateSubTree, func() { @@ -1109,7 +1109,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnUpdateSubTreeFinished() if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { - ctx.MiscRunner.RunTask( + _ = ctx.MiscRunner.RunTask( regionID, ratelimit.HandleOverlaps, func() { @@ -1122,7 +1122,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnSaveCacheFinished() // handle region stats - ctx.MiscRunner.RunTask( + _ = ctx.MiscRunner.RunTask( regionID, ratelimit.CollectRegionStatsAsync, func() { @@ -1136,7 +1136,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnCollectRegionStatsFinished() if c.storage != nil { if saveKV { - ctx.MiscRunner.RunTask( + _ = ctx.MiscRunner.RunTask( regionID, ratelimit.SaveRegionToKV, func() { From 463aee9fa0b73ad632ead64e9e4b6f3c8950462d Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 14 Jun 2024 17:41:44 +0800 Subject: [PATCH 32/47] etcdutil: check if the endpoint has been removed before evicting (#8287) close tikv/pd#8286 Once a member is removed from the cluster, its endpoint should no longer exist in the health checker. This PR adds a check to prevent the removed endpoint from being evicted again unexpectedly. Signed-off-by: JmPotato Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/utils/etcdutil/health_checker.go | 11 ++++++---- pkg/utils/etcdutil/health_checker_test.go | 26 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/pkg/utils/etcdutil/health_checker.go b/pkg/utils/etcdutil/health_checker.go index 44bddd8b183d..c5ece5a0804b 100644 --- a/pkg/utils/etcdutil/health_checker.go +++ b/pkg/utils/etcdutil/health_checker.go @@ -273,7 +273,7 @@ func (checker *healthChecker) updateEvictedEps(lastEps, pickedEps []string) { for _, ep := range pickedEps { pickedSet[ep] = true } - // Reset the count to 0 if it's in evictedEps but not in the pickedEps. + // Reset the count to 0 if it's in `evictedEps` but not in `pickedEps`. checker.evictedEps.Range(func(key, value any) bool { ep := key.(string) count := value.(int) @@ -286,18 +286,21 @@ func (checker *healthChecker) updateEvictedEps(lastEps, pickedEps []string) { } return true }) - // Find all endpoints which are in the lastEps but not in the pickedEps, - // and add them to the evictedEps. 
+ // Find all endpoints which are in `lastEps` and `healthyClients` but not in `pickedEps`, + // and add them to the `evictedEps`. for _, ep := range lastEps { if pickedSet[ep] { continue } + if hc := checker.loadClient(ep); hc == nil { + continue + } checker.evictedEps.Store(ep, 0) log.Info("evicted etcd endpoint found", zap.String("endpoint", ep), zap.String("source", checker.source)) } - // Find all endpoints which are in both pickedEps and evictedEps to + // Find all endpoints which are in both `pickedEps` and `evictedEps` to // increase their picked count. for _, ep := range pickedEps { if count, ok := checker.evictedEps.Load(ep); ok { diff --git a/pkg/utils/etcdutil/health_checker_test.go b/pkg/utils/etcdutil/health_checker_test.go index 82dd1362ba26..07a8024e63ca 100644 --- a/pkg/utils/etcdutil/health_checker_test.go +++ b/pkg/utils/etcdutil/health_checker_test.go @@ -35,6 +35,8 @@ func check(re *require.Assertions, testCases []*testCase) { probeCh := make(chan healthProbe, len(tc.healthProbes)) for _, probe := range tc.healthProbes { probeCh <- probe + // Mock that all the endpoints are healthy. + checker.healthyClients.LoadOrStore(probe.ep, &healthyClient{}) } close(probeCh) // Pick and filter the endpoints. @@ -361,3 +363,27 @@ func TestLatencyPick(t *testing.T) { } check(re, testCases) } + +func TestUpdateEvictedEpsAfterRemoval(t *testing.T) { + re := require.New(t) + var ( + checker = &healthChecker{} + lastEps = []string{"A", "B", "C"} + pickedEps = []string{"A", "C"} + ) + // All endpoints are healthy. + for _, ep := range lastEps { + checker.healthyClients.Store(ep, &healthyClient{}) + } + checker.updateEvictedEps(lastEps, pickedEps) + // B should be evicted. + _, ok := checker.evictedEps.Load("B") + re.True(ok) + // Remove the endpoint B to mock member removal. + checker.healthyClients.Delete("B") + checker.evictedEps.Delete("B") + checker.updateEvictedEps(lastEps, pickedEps) + // B should not be evicted since it has been removed. 
+ _, ok = checker.evictedEps.Load("B") + re.False(ok) +} From c1d422ec4924799a5ff7da6606644959e339f1c0 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 14 Jun 2024 18:08:43 +0800 Subject: [PATCH 33/47] *: enable error check for mcs, keyspace and labeler (#8288) ref tikv/pd#1919 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .golangci.yml | 2 +- pkg/autoscaling/handler.go | 8 ++++---- pkg/keyspace/keyspace.go | 7 ++++++- pkg/keyspace/tso_keyspace_group.go | 2 +- pkg/mcs/utils/util.go | 2 +- pkg/schedule/labeler/labeler.go | 6 ++++-- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index be6dc92b18c5..4f5c96cd3433 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -207,6 +207,6 @@ issues: - path: (pd-analysis|pd-api-bench|pd-backup|pd-ctl|pd-heartbeat-bench|pd-recover|pd-simulator|pd-tso-bench|pd-ut|regions-dump|stores-dump) linters: - errcheck - - path: (pkg/schedule/labeler/labeler.go|pkg/mcs/tso/server/config.go|pkg/tso/admin.go|pkg/mcs/tso/server/grpc_service.go|pkg/schedule/schedulers/split_bucket.go|server/api/plugin_disable.go|server/api/plugin_disable.go|server/api/operator.go|server/api/region.go|pkg/schedule/schedulers/balance_leader.go|pkg/mcs/resourcemanager/server/server.go|pkg/mcs/scheduling/server/grpc_service.go|pkg/mcs/resourcemanager/server/.*\.go|plugin/scheduler_example/evict_leader.go|server/api/.*\.go|pkg/replication/replication_mode.go|pkg/mcs/scheduling/server/server.go|pkg/storage/endpoint/gc_safe_point.go|server/.*\.go|pkg/schedule/schedulers/.*\.go|pkg/schedule/placement/rule.go|pkg/mcs/utils/util.go|pkg/keyspace/tso_keyspace_group.go|pkg/tso/allocator_manager.go|pkg/core/store_stats.go|pkg/autoscaling/handler.go|pkg/core/store_stats.go|pkg/keyspace/keyspace.go|pkg/storage/hot_region_storage.go|pkg/syncer/server.go) + - path: (pkg/tso/admin.go|pkg/schedule/schedulers/split_bucket.go|server/api/plugin_disable.go|server/api/plugin_disable.go|server/api/operator.go|server/api/region.go|pkg/schedule/schedulers/balance_leader.go|plugin/scheduler_example/evict_leader.go|server/api/.*\.go|pkg/replication/replication_mode.go|pkg/storage/endpoint/gc_safe_point.go|server/.*\.go|pkg/schedule/schedulers/.*\.go|pkg/schedule/placement/rule.go|pkg/tso/allocator_manager.go|pkg/core/store_stats.go|pkg/core/store_stats.go|pkg/storage/hot_region_storage.go|pkg/syncer/server.go) linters: - errcheck diff --git a/pkg/autoscaling/handler.go b/pkg/autoscaling/handler.go index ea248fdcc555..7bffa8ec1560 100644 --- a/pkg/autoscaling/handler.go +++ b/pkg/autoscaling/handler.go @@ -41,22 +41,22 @@ func NewHTTPHandler(svr *server.Server, rd *render.Render) *HTTPHandler { func (h *HTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { rc := h.svr.GetRaftCluster() if rc == nil { - h.rd.JSON(w, http.StatusInternalServerError, errs.ErrNotBootstrapped.FastGenByArgs().Error()) + _ = h.rd.JSON(w, http.StatusInternalServerError, errs.ErrNotBootstrapped.FastGenByArgs().Error()) return } data, err := io.ReadAll(r.Body) r.Body.Close() if err != nil { - h.rd.JSON(w, http.StatusInternalServerError, err.Error()) + _ = h.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } strategy := Strategy{} if err := json.Unmarshal(data, &strategy); err != nil { - h.rd.JSON(w, http.StatusBadRequest, err.Error()) + _ = h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } plan := calculate(rc, h.svr.GetPDServerConfig(), &strategy) - h.rd.JSON(w, http.StatusOK, plan) + _ = 
h.rd.JSON(w, http.StatusOK, plan) } diff --git a/pkg/keyspace/keyspace.go b/pkg/keyspace/keyspace.go index b37ec7f0fcaa..26fd4db10f0f 100644 --- a/pkg/keyspace/keyspace.go +++ b/pkg/keyspace/keyspace.go @@ -321,7 +321,12 @@ func (manager *Manager) splitKeyspaceRegion(id uint32, waitRegionSplit bool) (er } defer func() { if err != nil { - cl.GetRegionLabeler().DeleteLabelRule(keyspaceRule.ID) + if err := cl.GetRegionLabeler().DeleteLabelRule(keyspaceRule.ID); err != nil { + log.Warn("[keyspace] failed to delete region label for keyspace", + zap.Uint32("keyspace-id", id), + zap.Error(err), + ) + } } }() diff --git a/pkg/keyspace/tso_keyspace_group.go b/pkg/keyspace/tso_keyspace_group.go index f42abeee23da..30f4d0d88b4e 100644 --- a/pkg/keyspace/tso_keyspace_group.go +++ b/pkg/keyspace/tso_keyspace_group.go @@ -426,7 +426,7 @@ func (m *GroupManager) UpdateKeyspaceForGroup(userKind endpoint.UserKind, groupI failpoint.Inject("externalAllocNode", func(val failpoint.Value) { failpointOnce.Do(func() { addrs := val.(string) - m.SetNodesForKeyspaceGroup(utils.DefaultKeyspaceGroupID, strings.Split(addrs, ",")) + _ = m.SetNodesForKeyspaceGroup(utils.DefaultKeyspaceGroupID, strings.Split(addrs, ",")) }) }) m.Lock() diff --git a/pkg/mcs/utils/util.go b/pkg/mcs/utils/util.go index fb78f0b4be3c..b70b050617ed 100644 --- a/pkg/mcs/utils/util.go +++ b/pkg/mcs/utils/util.go @@ -324,6 +324,6 @@ func StopGRPCServer(s server) { // Exit exits the program with the given code. func Exit(code int) { - log.Sync() + _ = log.Sync() os.Exit(code) } diff --git a/pkg/schedule/labeler/labeler.go b/pkg/schedule/labeler/labeler.go index 2b2bf8abda78..7670ccdedd79 100644 --- a/pkg/schedule/labeler/labeler.go +++ b/pkg/schedule/labeler/labeler.go @@ -201,10 +201,12 @@ func (l *RegionLabeler) getAndCheckRule(id string, now time.Time) *LabelRule { return rule } if len(rule.Labels) == 0 { - l.DeleteLabelRuleLocked(id) + if err := l.DeleteLabelRuleLocked(id); err != nil { + log.Error("failed to delete label rule", zap.String("rule-key", id), zap.Error(err)) + } return nil } - l.SaveLabelRuleLocked(rule) + _ = l.SaveLabelRuleLocked(rule) return rule } From cb28ca8ed2201343b3f785f8fe2aa2edd5d78402 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Wed, 19 Jun 2024 13:31:17 +0800 Subject: [PATCH 34/47] *: enable error check for some files (#8301) ref tikv/pd#1919 Signed-off-by: okJiang <819421878@qq.com> --- .golangci.yml | 2 +- pkg/core/store_stats.go | 5 +- pkg/schedule/placement/rule.go | 2 +- pkg/storage/hot_region_storage.go | 4 +- pkg/tso/allocator_manager.go | 64 ++++++------------------ plugin/scheduler_example/evict_leader.go | 46 ++++++++++------- 6 files changed, 49 insertions(+), 74 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 4f5c96cd3433..e938c24cc590 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -207,6 +207,6 @@ issues: - path: (pd-analysis|pd-api-bench|pd-backup|pd-ctl|pd-heartbeat-bench|pd-recover|pd-simulator|pd-tso-bench|pd-ut|regions-dump|stores-dump) linters: - errcheck - - path: 
(pkg/tso/admin.go|pkg/schedule/schedulers/split_bucket.go|server/api/plugin_disable.go|server/api/plugin_disable.go|server/api/operator.go|server/api/region.go|pkg/schedule/schedulers/balance_leader.go|plugin/scheduler_example/evict_leader.go|server/api/.*\.go|pkg/replication/replication_mode.go|pkg/storage/endpoint/gc_safe_point.go|server/.*\.go|pkg/schedule/schedulers/.*\.go|pkg/schedule/placement/rule.go|pkg/tso/allocator_manager.go|pkg/core/store_stats.go|pkg/core/store_stats.go|pkg/storage/hot_region_storage.go|pkg/syncer/server.go) + - path: (pkg/tso/admin.go|pkg/schedule/schedulers/split_bucket.go|server/api/plugin_disable.go|server/api/plugin_disable.go|server/api/operator.go|server/api/region.go|pkg/schedule/schedulers/balance_leader.go|server/api/.*\.go|pkg/replication/replication_mode.go|pkg/storage/endpoint/gc_safe_point.go|server/.*\.go|pkg/schedule/schedulers/.*\.go|pkg/syncer/server.go) linters: - errcheck diff --git a/pkg/core/store_stats.go b/pkg/core/store_stats.go index bcc90a58a2b1..d68f8b8e43ca 100644 --- a/pkg/core/store_stats.go +++ b/pkg/core/store_stats.go @@ -18,6 +18,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/tikv/pd/pkg/movingaverage" "github.com/tikv/pd/pkg/utils/syncutil" + "github.com/tikv/pd/pkg/utils/typeutil" ) type storeStats struct { @@ -56,10 +57,8 @@ func (ss *storeStats) GetStoreStats() *pdpb.StoreStats { // CloneStoreStats returns the statistics information cloned from the store. func (ss *storeStats) CloneStoreStats() *pdpb.StoreStats { ss.mu.RLock() - b, _ := ss.rawStats.Marshal() + stats := typeutil.DeepClone(ss.rawStats, StoreStatsFactory) ss.mu.RUnlock() - stats := &pdpb.StoreStats{} - stats.Unmarshal(b) return stats } diff --git a/pkg/schedule/placement/rule.go b/pkg/schedule/placement/rule.go index 75ccd509ee81..07054b7b1cd9 100644 --- a/pkg/schedule/placement/rule.go +++ b/pkg/schedule/placement/rule.go @@ -90,7 +90,7 @@ func (r *Rule) String() string { // Clone returns a copy of Rule. func (r *Rule) Clone() *Rule { var clone Rule - json.Unmarshal([]byte(r.String()), &clone) + _ = json.Unmarshal([]byte(r.String()), &clone) clone.StartKey = append(r.StartKey[:0:0], r.StartKey...) clone.EndKey = append(r.EndKey[:0:0], r.EndKey...) 
return &clone diff --git a/pkg/storage/hot_region_storage.go b/pkg/storage/hot_region_storage.go index 0393035c85bd..c08825dbba1c 100644 --- a/pkg/storage/hot_region_storage.go +++ b/pkg/storage/hot_region_storage.go @@ -171,7 +171,9 @@ func (h *HotRegionStorage) backgroundDelete() { there may be residual hot regions, you can remove it manually, [pd-dir]/data/hot-region.`) continue } - h.delete(int(curReservedDays)) + if err := h.delete(int(curReservedDays)); err != nil { + log.Error("delete hot region meet error", errs.ZapError(err)) + } case <-h.hotRegionInfoCtx.Done(): return } diff --git a/pkg/tso/allocator_manager.go b/pkg/tso/allocator_manager.go index f1683de1352d..62a4fb97a570 100644 --- a/pkg/tso/allocator_manager.go +++ b/pkg/tso/allocator_manager.go @@ -624,11 +624,13 @@ func (am *AllocatorManager) campaignAllocatorLeader( dcLocationInfo *pdpb.GetDCLocationInfoResponse, isNextLeader bool, ) { - log.Info("start to campaign local tso allocator leader", + logger := log.With( logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), zap.String("dc-location", allocator.GetDCLocation()), zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + zap.String("name", am.member.Name()), + ) + logger.Info("start to campaign local tso allocator leader") cmps := make([]clientv3.Cmp, 0) nextLeaderKey := am.nextLeaderKey(allocator.GetDCLocation()) if !isNextLeader { @@ -648,18 +650,9 @@ func (am *AllocatorManager) campaignAllocatorLeader( }) if err := allocator.CampaignAllocatorLeader(am.leaderLease, cmps...); err != nil { if err.Error() == errs.ErrEtcdTxnConflict.Error() { - log.Info("failed to campaign local tso allocator leader due to txn conflict, another allocator may campaign successfully", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + logger.Info("failed to campaign local tso allocator leader due to txn conflict, another allocator may campaign successfully") } else { - log.Error("failed to campaign local tso allocator leader due to etcd error", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name()), - errs.ZapError(err)) + logger.Error("failed to campaign local tso allocator leader due to etcd error", errs.ZapError(err)) } return } @@ -670,44 +663,25 @@ func (am *AllocatorManager) campaignAllocatorLeader( defer am.ResetAllocatorGroup(allocator.GetDCLocation()) // Maintain the Local TSO Allocator leader go allocator.KeepAllocatorLeader(ctx) - log.Info("campaign local tso allocator leader ok", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) - log.Info("initialize the local TSO allocator", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + logger.Info("Complete campaign local tso allocator leader, begin to initialize the local TSO allocator") if err := allocator.Initialize(int(dcLocationInfo.Suffix)); err != nil { - log.Error("failed to initialize the local TSO allocator", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 
0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - errs.ZapError(err)) + log.Error("failed to initialize the local TSO allocator", errs.ZapError(err)) return } if dcLocationInfo.GetMaxTs().GetPhysical() != 0 { if err := allocator.WriteTSO(dcLocationInfo.GetMaxTs()); err != nil { - log.Error("failed to write the max local TSO after member changed", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - errs.ZapError(err)) + log.Error("failed to write the max local TSO after member changed", errs.ZapError(err)) return } } am.compareAndSetMaxSuffix(dcLocationInfo.Suffix) allocator.EnableAllocatorLeader() // The next leader is me, delete it to finish campaigning - am.deleteNextLeaderID(allocator.GetDCLocation()) - log.Info("local tso allocator leader is ready to serve", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + if err := am.deleteNextLeaderID(allocator.GetDCLocation()); err != nil { + logger.Warn("failed to delete next leader key after campaign local tso allocator leader", errs.ZapError(err)) + } + logger.Info("local tso allocator leader is ready to serve") leaderTicker := time.NewTicker(mcsutils.LeaderTickInterval) defer leaderTicker.Stop() @@ -716,20 +690,12 @@ func (am *AllocatorManager) campaignAllocatorLeader( select { case <-leaderTicker.C: if !allocator.IsAllocatorLeader() { - log.Info("no longer a local tso allocator leader because lease has expired, local tso allocator leader will step down", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + logger.Info("no longer a local tso allocator leader because lease has expired, local tso allocator leader will step down") return } case <-ctx.Done(): // Server is closed and it should return nil. - log.Info("server is closed, reset the local tso allocator", - logutil.CondUint32("keyspace-group-id", am.kgID, am.kgID > 0), - zap.String("dc-location", allocator.GetDCLocation()), - zap.Any("dc-location-info", dcLocationInfo), - zap.String("name", am.member.Name())) + logger.Info("server is closed, reset the local tso allocator") return } } diff --git a/plugin/scheduler_example/evict_leader.go b/plugin/scheduler_example/evict_leader.go index f761b3812c57..1e26a97e12c0 100644 --- a/plugin/scheduler_example/evict_leader.go +++ b/plugin/scheduler_example/evict_leader.go @@ -259,7 +259,7 @@ func (handler *evictLeaderHandler) UpdateConfig(w http.ResponseWriter, r *http.R id = (uint64)(idFloat) if _, exists = handler.config.StoreIDWitRanges[id]; !exists { if err := handler.config.cluster.PauseLeaderTransfer(id); err != nil { - handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) + _ = handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } } @@ -273,47 +273,55 @@ func (handler *evictLeaderHandler) UpdateConfig(w http.ResponseWriter, r *http.R args = append(args, handler.config.getRanges(id)...) 
} - handler.config.BuildWithArgs(args) - err := handler.config.Persist() + err := handler.config.BuildWithArgs(args) if err != nil { - handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) + _ = handler.rd.JSON(w, http.StatusBadRequest, err.Error()) + return } - handler.rd.JSON(w, http.StatusOK, nil) + err = handler.config.Persist() + if err != nil { + _ = handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) + return + } + _ = handler.rd.JSON(w, http.StatusOK, nil) } func (handler *evictLeaderHandler) ListConfig(w http.ResponseWriter, _ *http.Request) { conf := handler.config.Clone() - handler.rd.JSON(w, http.StatusOK, conf) + _ = handler.rd.JSON(w, http.StatusOK, conf) } func (handler *evictLeaderHandler) DeleteConfig(w http.ResponseWriter, r *http.Request) { idStr := mux.Vars(r)["store_id"] id, err := strconv.ParseUint(idStr, 10, 64) if err != nil { - handler.rd.JSON(w, http.StatusBadRequest, err.Error()) + _ = handler.rd.JSON(w, http.StatusBadRequest, err.Error()) return } handler.config.mu.Lock() defer handler.config.mu.Unlock() _, exists := handler.config.StoreIDWitRanges[id] - if exists { - delete(handler.config.StoreIDWitRanges, id) - handler.config.cluster.ResumeLeaderTransfer(id) + if !exists { + _ = handler.rd.JSON(w, http.StatusInternalServerError, errors.New("the config does not exist")) + return + } + delete(handler.config.StoreIDWitRanges, id) + handler.config.cluster.ResumeLeaderTransfer(id) - handler.config.mu.Unlock() - handler.config.Persist() + handler.config.mu.Unlock() + if err := handler.config.Persist(); err != nil { handler.config.mu.Lock() - - var resp any - if len(handler.config.StoreIDWitRanges) == 0 { - resp = noStoreInSchedulerInfo - } - handler.rd.JSON(w, http.StatusOK, resp) + _ = handler.rd.JSON(w, http.StatusInternalServerError, err.Error()) return } + handler.config.mu.Lock() - handler.rd.JSON(w, http.StatusInternalServerError, errors.New("the config does not exist")) + var resp any + if len(handler.config.StoreIDWitRanges) == 0 { + resp = noStoreInSchedulerInfo + } + _ = handler.rd.JSON(w, http.StatusOK, resp) } func newEvictLeaderHandler(config *evictLeaderSchedulerConfig) http.Handler { From fbd6bd2c7fe1db0a2b1d5a70797d5d0cb3814682 Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 19 Jun 2024 13:46:17 +0800 Subject: [PATCH 35/47] test: add test for clone unmarshal (#8308) ref tikv/pd#4399 Signed-off-by: husharp --- client/http/types_test.go | 27 +++++++++++++++++++++++++++ pkg/schedule/placement/rule_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/client/http/types_test.go b/client/http/types_test.go index 1b8df4f8ed69..904476ceda11 100644 --- a/client/http/types_test.go +++ b/client/http/types_test.go @@ -308,3 +308,30 @@ func mustMarshalAndUnmarshalRuleOp(re *require.Assertions, ruleOp *RuleOp) *Rule re.NoError(err) return newRuleOp } + +// startKey and endKey are json:"-" which means cannot be Unmarshal from json +// We need to take care of `Clone` method. 
+func TestRuleKeyClone(t *testing.T) { + re := require.New(t) + r := &Rule{ + StartKey: []byte{1, 2, 3}, + EndKey: []byte{4, 5, 6}, + } + + clone := r.Clone() + // Modify the original rule + r.StartKey[0] = 9 + r.EndKey[0] = 9 + + // The clone should not be affected + re.Equal([]byte{1, 2, 3}, clone.StartKey) + re.Equal([]byte{4, 5, 6}, clone.EndKey) + + // Modify the clone + clone.StartKey[0] = 8 + clone.EndKey[0] = 8 + + // The original rule should not be affected + re.Equal([]byte{9, 2, 3}, r.StartKey) + re.Equal([]byte{9, 5, 6}, r.EndKey) +} diff --git a/pkg/schedule/placement/rule_test.go b/pkg/schedule/placement/rule_test.go index 75d7bab23c9e..c7b8dd97ef6c 100644 --- a/pkg/schedule/placement/rule_test.go +++ b/pkg/schedule/placement/rule_test.go @@ -186,3 +186,30 @@ func TestBuildRuleList(t *testing.T) { re.Equal(testCase.expect.ranges, result.ranges) } } + +// startKey and endKey are json:"-" which means cannot be Unmarshal from json +// We need to take care of `Clone` method. +func TestRuleKeyClone(t *testing.T) { + re := require.New(t) + r := &Rule{ + StartKey: []byte{1, 2, 3}, + EndKey: []byte{4, 5, 6}, + } + + clone := r.Clone() + // Modify the original rule + r.StartKey[0] = 9 + r.EndKey[0] = 9 + + // The clone should not be affected + re.Equal([]byte{1, 2, 3}, clone.StartKey) + re.Equal([]byte{4, 5, 6}, clone.EndKey) + + // Modify the clone + clone.StartKey[0] = 8 + clone.EndKey[0] = 8 + + // The original rule should not be affected + re.Equal([]byte{9, 2, 3}, r.StartKey) + re.Equal([]byte{9, 5, 6}, r.EndKey) +} From 89ca8dfff428ceddb5b015f64f425e92118779f9 Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 19 Jun 2024 14:41:47 +0800 Subject: [PATCH 36/47] tools/simulator: support simulator with multiple pds (#8304) ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- tools/pd-simulator/main.go | 3 - tools/pd-simulator/simulator/client.go | 406 ++++++++++++++++++------- tools/pd-simulator/simulator/conn.go | 10 +- tools/pd-simulator/simulator/drive.go | 62 ++-- tools/pd-simulator/simulator/event.go | 4 +- tools/pd-simulator/simulator/node.go | 70 ++--- 6 files changed, 360 insertions(+), 195 deletions(-) diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 05763cc93b83..12254c1a947b 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -25,7 +25,6 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" flag "github.com/spf13/pflag" - pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/utils/logutil" @@ -93,7 +92,6 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { - simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{*pdAddr}) simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) @@ -107,7 +105,6 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } - simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{local.GetAddr()}) simStart(local.GetAddr(), "", simCase, simConfig, clean) } } diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 0bbbebe46021..f5bd379d17e0 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -16,6 +16,7 @@ package simulator import ( "context" + "fmt" "strconv" "strings" 
"sync" @@ -24,6 +25,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + pd "github.com/tikv/pd/client" pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" @@ -37,32 +39,41 @@ import ( // Client is a PD (Placement Driver) client. // It should not be used after calling Close(). type Client interface { - GetClusterID(ctx context.Context) uint64 - AllocID(ctx context.Context) (uint64, error) - Bootstrap(ctx context.Context, store *metapb.Store, region *metapb.Region) error - PutStore(ctx context.Context, store *metapb.Store) error - StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error - RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error - PutPDConfig(*sc.PDConfig) error + AllocID(context.Context) (uint64, error) + PutStore(context.Context, *metapb.Store) error + StoreHeartbeat(context.Context, *pdpb.StoreStats) error + RegionHeartbeat(context.Context, *core.RegionInfo) error + HeartbeatStreamLoop() + ChangeConn(*grpc.ClientConn) error Close() } const ( pdTimeout = time.Second maxInitClusterRetries = 100 + // retry to get leader URL + leaderChangedWaitTime = 100 * time.Millisecond + retryTimes = 10 ) var ( // errFailInitClusterID is returned when failed to load clusterID from all supplied PD addresses. errFailInitClusterID = errors.New("[pd] failed to get cluster id") PDHTTPClient pdHttp.Client + sd pd.ServiceDiscovery + ClusterID uint64 ) +// requestHeader returns a header for fixed ClusterID. +func requestHeader() *pdpb.RequestHeader { + return &pdpb.RequestHeader{ + ClusterId: ClusterID, + } +} + type client struct { - url string tag string - clusterID uint64 clientConn *grpc.ClientConn reportRegionHeartbeatCh chan *core.RegionInfo @@ -74,29 +85,15 @@ type client struct { } // NewClient creates a PD client. 
-func NewClient(pdAddr string, tag string) (Client, <-chan *pdpb.RegionHeartbeatResponse, error) { - simutil.Logger.Info("create pd client with endpoints", zap.String("tag", tag), zap.String("pd-address", pdAddr)) +func NewClient(tag string) (Client, <-chan *pdpb.RegionHeartbeatResponse, error) { ctx, cancel := context.WithCancel(context.Background()) c := &client{ - url: pdAddr, reportRegionHeartbeatCh: make(chan *core.RegionInfo, 1), receiveRegionHeartbeatCh: make(chan *pdpb.RegionHeartbeatResponse, 1), ctx: ctx, cancel: cancel, tag: tag, } - cc, err := c.createConn() - if err != nil { - return nil, nil, err - } - c.clientConn = cc - if err := c.initClusterID(); err != nil { - return nil, nil, err - } - simutil.Logger.Info("init cluster id", zap.String("tag", c.tag), zap.Uint64("cluster-id", c.clusterID)) - c.wg.Add(1) - go c.heartbeatStreamLoop() - return c, c.receiveRegionHeartbeatCh, nil } @@ -104,39 +101,18 @@ func (c *client) pdClient() pdpb.PDClient { return pdpb.NewPDClient(c.clientConn) } -func (c *client) initClusterID() error { - ctx, cancel := context.WithCancel(c.ctx) - defer cancel() - for i := 0; i < maxInitClusterRetries; i++ { - members, err := c.getMembers(ctx) - if err != nil || members.GetHeader() == nil { - simutil.Logger.Error("failed to get cluster id", zap.String("tag", c.tag), zap.Error(err)) - continue - } - c.clusterID = members.GetHeader().GetClusterId() - return nil - } - - return errors.WithStack(errFailInitClusterID) -} - -func (c *client) getMembers(ctx context.Context) (*pdpb.GetMembersResponse, error) { - members, err := c.pdClient().GetMembers(ctx, &pdpb.GetMembersRequest{}) +func createConn(url string) (*grpc.ClientConn, error) { + cc, err := grpc.Dial(strings.TrimPrefix(url, "http://"), grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { return nil, errors.WithStack(err) } - if members.GetHeader().GetError() != nil { - return nil, errors.WithStack(errors.New(members.GetHeader().GetError().String())) - } - return members, nil + return cc, nil } -func (c *client) createConn() (*grpc.ClientConn, error) { - cc, err := grpc.Dial(strings.TrimPrefix(c.url, "http://"), grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - return nil, errors.WithStack(err) - } - return cc, nil +func (c *client) ChangeConn(cc *grpc.ClientConn) error { + c.clientConn = cc + simutil.Logger.Info("change pd client with endpoints", zap.String("tag", c.tag), zap.String("pd-address", cc.Target())) + return nil } func (c *client) createHeartbeatStream() (pdpb.PD_RegionHeartbeatClient, context.Context, context.CancelFunc) { @@ -166,7 +142,8 @@ func (c *client) createHeartbeatStream() (pdpb.PD_RegionHeartbeatClient, context return stream, ctx, cancel } -func (c *client) heartbeatStreamLoop() { +func (c *client) HeartbeatStreamLoop() { + c.wg.Add(1) defer c.wg.Done() for { stream, ctx, cancel := c.createHeartbeatStream() @@ -187,6 +164,23 @@ func (c *client) heartbeatStreamLoop() { return } wg.Wait() + + // update connection to recreate heartbeat stream + for i := 0; i < retryTimes; i++ { + sd.ScheduleCheckMemberChanged() + time.Sleep(leaderChangedWaitTime) + if client := sd.GetServiceClient(); client != nil { + _, conn, err := getLeaderURL(ctx, client.GetClientConn()) + if err != nil { + simutil.Logger.Error("[HeartbeatStreamLoop] failed to get leader URL", zap.Error(err)) + continue + } + if err = c.ChangeConn(conn); err == nil { + break + } + } + } + simutil.Logger.Info("recreate heartbeat stream", zap.String("tag", c.tag)) } } @@ -196,6 
+190,7 @@ func (c *client) receiveRegionHeartbeat(ctx context.Context, stream pdpb.PD_Regi resp, err := stream.Recv() if err != nil { errCh <- err + simutil.Logger.Error("receive regionHeartbeat error", zap.String("tag", c.tag), zap.Error(err)) return } select { @@ -213,7 +208,7 @@ func (c *client) reportRegionHeartbeat(ctx context.Context, stream pdpb.PD_Regio case r := <-c.reportRegionHeartbeatCh: region := r.Clone() request := &pdpb.RegionHeartbeatRequest{ - Header: c.requestHeader(), + Header: requestHeader(), Region: region.GetMeta(), Leader: region.GetLeader(), DownPeers: region.GetDownPeers(), @@ -227,6 +222,7 @@ func (c *client) reportRegionHeartbeat(ctx context.Context, stream pdpb.PD_Regio if err != nil { errCh <- err simutil.Logger.Error("report regionHeartbeat error", zap.String("tag", c.tag), zap.Error(err)) + return } case <-ctx.Done(): return @@ -235,6 +231,11 @@ func (c *client) reportRegionHeartbeat(ctx context.Context, stream pdpb.PD_Regio } func (c *client) Close() { + if c.cancel == nil { + simutil.Logger.Info("pd client has been closed", zap.String("tag", c.tag)) + return + } + simutil.Logger.Info("closing pd client", zap.String("tag", c.tag)) c.cancel() c.wg.Wait() @@ -243,14 +244,10 @@ func (c *client) Close() { } } -func (c *client) GetClusterID(context.Context) uint64 { - return c.clusterID -} - func (c *client) AllocID(ctx context.Context) (uint64, error) { ctx, cancel := context.WithTimeout(ctx, pdTimeout) resp, err := c.pdClient().AllocID(ctx, &pdpb.AllocIDRequest{ - Header: c.requestHeader(), + Header: requestHeader(), }) cancel() if err != nil { @@ -262,57 +259,259 @@ func (c *client) AllocID(ctx context.Context) (uint64, error) { return resp.GetId(), nil } -func (c *client) Bootstrap(ctx context.Context, store *metapb.Store, region *metapb.Region) error { +func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { ctx, cancel := context.WithTimeout(ctx, pdTimeout) - defer cancel() - req := &pdpb.IsBootstrappedRequest{ - Header: &pdpb.RequestHeader{ - ClusterId: c.clusterID, - }, - } - resp, err := c.pdClient().IsBootstrapped(ctx, req) - if resp.GetBootstrapped() { - simutil.Logger.Fatal("failed to bootstrap, server is not clean") - } - if err != nil { - return err - } newStore := typeutil.DeepClone(store, core.StoreFactory) - newRegion := typeutil.DeepClone(region, core.RegionFactory) - - res, err := c.pdClient().Bootstrap(ctx, &pdpb.BootstrapRequest{ - Header: c.requestHeader(), + resp, err := c.pdClient().PutStore(ctx, &pdpb.PutStoreRequest{ + Header: requestHeader(), Store: newStore, - Region: newRegion, }) + cancel() if err != nil { return err } - if res.GetHeader().GetError() != nil { - return errors.Errorf("bootstrap failed: %s", resp.GetHeader().GetError().String()) + if resp.Header.GetError() != nil { + simutil.Logger.Error("put store error", zap.Reflect("error", resp.Header.GetError())) + return nil } return nil } -func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { +func (c *client) StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error { ctx, cancel := context.WithTimeout(ctx, pdTimeout) - newStore := typeutil.DeepClone(store, core.StoreFactory) - resp, err := c.pdClient().PutStore(ctx, &pdpb.PutStoreRequest{ - Header: c.requestHeader(), - Store: newStore, + newStats := typeutil.DeepClone(stats, core.StoreStatsFactory) + resp, err := c.pdClient().StoreHeartbeat(ctx, &pdpb.StoreHeartbeatRequest{ + Header: requestHeader(), + Stats: newStats, }) cancel() if err != nil { return err } if 
resp.Header.GetError() != nil { - simutil.Logger.Error("put store error", zap.Reflect("error", resp.Header.GetError())) + simutil.Logger.Error("store heartbeat error", zap.Reflect("error", resp.Header.GetError())) return nil } return nil } -func (c *client) PutPDConfig(config *sc.PDConfig) error { +func (c *client) RegionHeartbeat(_ context.Context, region *core.RegionInfo) error { + c.reportRegionHeartbeatCh <- region + return nil +} + +type RetryClient struct { + client Client + retryCount int +} + +func NewRetryClient(node *Node) *RetryClient { + // Init PD client and putting it into node. + tag := fmt.Sprintf("store %d", node.Store.Id) + var ( + client Client + receiveRegionHeartbeatCh <-chan *pdpb.RegionHeartbeatResponse + err error + ) + + // Client should wait if PD server is not ready. + for i := 0; i < maxInitClusterRetries; i++ { + client, receiveRegionHeartbeatCh, err = NewClient(tag) + if err == nil { + break + } + time.Sleep(time.Second) + } + + if err != nil { + simutil.Logger.Fatal("create client failed", zap.Error(err)) + } + node.client = client + + // Init RetryClient + retryClient := &RetryClient{ + client: client, + retryCount: retryTimes, + } + // check leader url firstly + retryClient.requestWithRetry(func() (any, error) { + return nil, errors.New("retry to create client") + }) + // start heartbeat stream + node.receiveRegionHeartbeatCh = receiveRegionHeartbeatCh + go client.HeartbeatStreamLoop() + + return retryClient +} + +func (rc *RetryClient) requestWithRetry(f func() (any, error)) (any, error) { + // execute the function directly + if res, err := f(); err == nil { + return res, nil + } + // retry to get leader URL + for i := 0; i < rc.retryCount; i++ { + sd.ScheduleCheckMemberChanged() + time.Sleep(100 * time.Millisecond) + if client := sd.GetServiceClient(); client != nil { + _, conn, err := getLeaderURL(context.Background(), client.GetClientConn()) + if err != nil { + simutil.Logger.Error("[retry] failed to get leader URL", zap.Error(err)) + return nil, err + } + if err = rc.client.ChangeConn(conn); err != nil { + simutil.Logger.Error("failed to change connection", zap.Error(err)) + return nil, err + } + return f() + } + } + return nil, errors.New("failed to retry") +} + +func getLeaderURL(ctx context.Context, conn *grpc.ClientConn) (string, *grpc.ClientConn, error) { + pdCli := pdpb.NewPDClient(conn) + members, err := pdCli.GetMembers(ctx, &pdpb.GetMembersRequest{}) + if err != nil { + return "", nil, err + } + if members.GetHeader().GetError() != nil { + return "", nil, errors.New(members.GetHeader().GetError().String()) + } + ClusterID = members.GetHeader().GetClusterId() + if ClusterID == 0 { + return "", nil, errors.New("cluster id is 0") + } + if members.GetLeader() == nil { + return "", nil, errors.New("leader is nil") + } + leaderURL := members.GetLeader().ClientUrls[0] + conn, err = createConn(leaderURL) + return leaderURL, conn, err +} + +func (rc *RetryClient) AllocID(ctx context.Context) (uint64, error) { + res, err := rc.requestWithRetry(func() (any, error) { + id, err := rc.client.AllocID(ctx) + return id, err + }) + if err != nil { + return 0, err + } + return res.(uint64), nil +} + +func (rc *RetryClient) PutStore(ctx context.Context, store *metapb.Store) error { + _, err := rc.requestWithRetry(func() (any, error) { + err := rc.client.PutStore(ctx, store) + return nil, err + }) + return err +} + +func (rc *RetryClient) StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error { + _, err := rc.requestWithRetry(func() (any, error) { + err 
:= rc.client.StoreHeartbeat(ctx, stats) + return nil, err + }) + return err +} + +func (rc *RetryClient) RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error { + _, err := rc.requestWithRetry(func() (any, error) { + err := rc.client.RegionHeartbeat(ctx, region) + return nil, err + }) + return err +} + +func (*RetryClient) ChangeConn(_ *grpc.ClientConn) error { + panic("unImplement") +} + +func (rc *RetryClient) HeartbeatStreamLoop() { + rc.client.HeartbeatStreamLoop() +} + +func (rc *RetryClient) Close() { + rc.client.Close() +} + +// Bootstrap bootstraps the cluster and using the given PD address firstly. +// because before bootstrapping the cluster, PDServiceDiscovery can not been started. +func Bootstrap(ctx context.Context, pdAddrs string, store *metapb.Store, region *metapb.Region) ( + leaderURL string, pdCli pdpb.PDClient, err error) { + urls := strings.Split(pdAddrs, ",") + if len(urls) == 0 { + return "", nil, errors.New("empty pd address") + } + +retry: + for i := 0; i < maxInitClusterRetries; i++ { + time.Sleep(100 * time.Millisecond) + for _, url := range urls { + conn, err := createConn(url) + if err != nil { + continue + } + leaderURL, conn, err = getLeaderURL(ctx, conn) + if err != nil { + continue + } + pdCli = pdpb.NewPDClient(conn) + break retry + } + } + if ClusterID == 0 { + return "", nil, errors.WithStack(errFailInitClusterID) + } + simutil.Logger.Info("get cluster id successfully", zap.Uint64("cluster-id", ClusterID)) + + // Check if the cluster is already bootstrapped. + ctx, cancel := context.WithTimeout(ctx, pdTimeout) + defer cancel() + req := &pdpb.IsBootstrappedRequest{ + Header: requestHeader(), + } + resp, err := pdCli.IsBootstrapped(ctx, req) + if resp.GetBootstrapped() { + simutil.Logger.Fatal("failed to bootstrap, server is not clean") + } + if err != nil { + return "", nil, err + } + // Bootstrap the cluster. + newStore := typeutil.DeepClone(store, core.StoreFactory) + newRegion := typeutil.DeepClone(region, core.RegionFactory) + var res *pdpb.BootstrapResponse + for i := 0; i < maxInitClusterRetries; i++ { + // Bootstrap the cluster. 
+ res, err = pdCli.Bootstrap(ctx, &pdpb.BootstrapRequest{ + Header: requestHeader(), + Store: newStore, + Region: newRegion, + }) + if err != nil { + continue + } + if res.GetHeader().GetError() != nil { + continue + } + break + } + if err != nil { + return "", nil, err + } + if res.GetHeader().GetError() != nil { + return "", nil, errors.New(res.GetHeader().GetError().String()) + } + + return leaderURL, pdCli, nil +} + +/* PDHTTPClient is a client for PD HTTP API, these are the functions that are used in the simulator */ + +func PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { ruleOps := make([]*pdHttp.RuleOp, 0) for _, rule := range config.PlacementRules { @@ -321,7 +520,7 @@ func (c *client) PutPDConfig(config *sc.PDConfig) error { Action: pdHttp.RuleOpAdd, }) } - err := PDHTTPClient.SetPlacementRuleInBatch(c.ctx, ruleOps) + err := PDHTTPClient.SetPlacementRuleInBatch(context.Background(), ruleOps) if err != nil { return err } @@ -330,7 +529,7 @@ func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.LocationLabels) > 0 { data := make(map[string]any) data["location-labels"] = config.LocationLabels - err := PDHTTPClient.SetConfig(c.ctx, data) + err := PDHTTPClient.SetConfig(context.Background(), data) if err != nil { return err } @@ -339,35 +538,6 @@ func (c *client) PutPDConfig(config *sc.PDConfig) error { return nil } -func (c *client) StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error { - ctx, cancel := context.WithTimeout(ctx, pdTimeout) - newStats := typeutil.DeepClone(stats, core.StoreStatsFactory) - resp, err := c.pdClient().StoreHeartbeat(ctx, &pdpb.StoreHeartbeatRequest{ - Header: c.requestHeader(), - Stats: newStats, - }) - cancel() - if err != nil { - return err - } - if resp.Header.GetError() != nil { - simutil.Logger.Error("store heartbeat error", zap.Reflect("error", resp.Header.GetError())) - return nil - } - return nil -} - -func (c *client) RegionHeartbeat(_ context.Context, region *core.RegionInfo) error { - c.reportRegionHeartbeatCh <- region - return nil -} - -func (c *client) requestHeader() *pdpb.RequestHeader { - return &pdpb.RequestHeader{ - ClusterId: c.clusterID, - } -} - func ChooseToHaltPDSchedule(halt bool) { PDHTTPClient.SetConfig(context.Background(), map[string]any{ "schedule.halt-scheduling": strconv.FormatBool(halt), diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index 4be8a2b76dc9..b1000c0f17b6 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -22,19 +22,17 @@ import ( // Connection records the information of connection among nodes. type Connection struct { - pdAddr string - Nodes map[uint64]*Node + Nodes map[uint64]*Node } // NewConnection creates nodes according to the configuration and returns the connection among nodes. 
-func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *config.SimConfig) (*Connection, error) { +func NewConnection(simCase *cases.Case, storeConfig *config.SimConfig) (*Connection, error) { conn := &Connection{ - pdAddr: pdAddr, - Nodes: make(map[uint64]*Node), + Nodes: make(map[uint64]*Node), } for _, store := range simCase.Stores { - node, err := NewNode(store, pdAddr, storeConfig) + node, err := NewNode(store, storeConfig) if err != nil { return nil, err } diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index 700dd58f87aa..0296710b7050 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -20,12 +20,16 @@ import ( "net/http/pprof" "path" "strconv" + "strings" "sync" "time" "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" "github.com/prometheus/client_golang/prometheus/promhttp" + pd "github.com/tikv/pd/client" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" @@ -42,7 +46,6 @@ type Driver struct { pdAddr string statusAddress string simCase *cases.Case - client Client tickCount int64 eventRunner *EventRunner raftEngine *RaftEngine @@ -71,7 +74,7 @@ func NewDriver(pdAddr, statusAddress, caseName string, simConfig *config.SimConf // Prepare initializes cluster information, bootstraps cluster and starts nodes. func (d *Driver) Prepare() error { - conn, err := NewConnection(d.simCase, d.pdAddr, d.simConfig) + conn, err := NewConnection(d.simCase, d.simConfig) if err != nil { return err } @@ -79,22 +82,27 @@ func (d *Driver) Prepare() error { d.raftEngine = NewRaftEngine(d.simCase, d.conn, d.simConfig) d.eventRunner = NewEventRunner(d.simCase.Events, d.raftEngine) - d.updateNodeAvailable() if d.statusAddress != "" { go d.runHTTPServer() } + + if err = d.allocID(); err != nil { + return err + } + + return d.Start() +} + +func (d *Driver) allocID() error { // Bootstrap. 
store, region, err := d.GetBootstrapInfo(d.raftEngine) if err != nil { return err } - d.client = d.conn.Nodes[store.GetId()].client - ctx, cancel := context.WithTimeout(context.Background(), pdTimeout) - err = d.client.Bootstrap(ctx, store, region) - cancel() + leaderURL, pdCli, err := Bootstrap(context.Background(), d.pdAddr, store, region) if err != nil { simutil.Logger.Fatal("bootstrap error", zap.Error(err)) } else { @@ -107,15 +115,14 @@ func (d *Driver) Prepare() error { requestTimeout := 10 * time.Second etcdTimeout := 3 * time.Second etcdClient, err := clientv3.New(clientv3.Config{ - Endpoints: []string{d.pdAddr}, + Endpoints: []string{leaderURL}, DialTimeout: etcdTimeout, }) if err != nil { return err } - ctx, cancel = context.WithTimeout(context.Background(), requestTimeout) - clusterID := d.client.GetClusterID(ctx) - rootPath := path.Join("/pd", strconv.FormatUint(clusterID, 10)) + ctx, cancel := context.WithTimeout(context.Background(), requestTimeout) + rootPath := path.Join("/pd", strconv.FormatUint(ClusterID, 10)) allocIDPath := path.Join(rootPath, "alloc_id") _, err = etcdClient.Put(ctx, allocIDPath, string(typeutil.Uint64ToBytes(maxID+1000))) if err != nil { @@ -125,22 +132,34 @@ func (d *Driver) Prepare() error { cancel() for { - var id uint64 - id, err = d.client.AllocID(context.Background()) + var resp *pdpb.AllocIDResponse + resp, err = pdCli.AllocID(context.Background(), &pdpb.AllocIDRequest{ + Header: requestHeader(), + }) if err != nil { return errors.WithStack(err) } - if id > maxID { + if resp.Id > maxID { simutil.IDAllocator.ResetID() break } } + return nil +} - err = d.Start() - if err != nil { +func (d *Driver) updateNodesClient() error { + urls := strings.Split(d.pdAddr, ",") + ctx, cancel := context.WithCancel(context.Background()) + sd = pd.NewDefaultPDServiceDiscovery(ctx, cancel, urls, nil) + if err := sd.Init(); err != nil { return err } + // Init PD HTTP client. + PDHTTPClient = pdHttp.NewClientWithServiceDiscovery("pd-simulator", sd) + for _, node := range d.conn.Nodes { + node.client = NewRetryClient(node) + } return nil } @@ -174,19 +193,18 @@ func (d *Driver) Check() bool { // Start starts all nodes. 
func (d *Driver) Start() error { + if err := d.updateNodesClient(); err != nil { + return err + } + for _, n := range d.conn.Nodes { err := n.Start() if err != nil { return err } } - d.ChangePDConfig() - return nil -} -// ChangePDConfig changes pd config -func (d *Driver) ChangePDConfig() error { - d.client.PutPDConfig(d.pdConfig) + PutPDConfig(d.pdConfig) return nil } diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 20c75b583843..8e01a8f5f40e 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -182,7 +182,7 @@ func (*AddNode) Run(raft *RaftEngine, _ int64) bool { Capacity: uint64(config.RaftStore.Capacity), Version: config.StoreVersion, } - n, err := NewNode(s, raft.conn.pdAddr, config) + n, err := NewNode(s, config) if err != nil { simutil.Logger.Error("create node failed", zap.Error(err)) return false @@ -190,6 +190,8 @@ func (*AddNode) Run(raft *RaftEngine, _ int64) bool { raft.conn.Nodes[s.ID] = n n.raftEngine = raft + n.client = NewRetryClient(n) + err = n.Start() if err != nil { delete(raft.conn.Nodes, s.ID) diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index fe8dc74a9445..8238a6486c1b 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -42,23 +42,24 @@ const ( type Node struct { *metapb.Store syncutil.RWMutex - stats *info.StoreStats - tick uint64 - wg sync.WaitGroup - tasks map[uint64]*Task + stats *info.StoreStats + tick uint64 + wg sync.WaitGroup + tasks map[uint64]*Task + ctx context.Context + cancel context.CancelFunc + raftEngine *RaftEngine + limiter *ratelimit.RateLimiter + sizeMutex syncutil.Mutex + hasExtraUsedSpace bool + snapStats []*pdpb.SnapshotStat + // PD client client Client receiveRegionHeartbeatCh <-chan *pdpb.RegionHeartbeatResponse - ctx context.Context - cancel context.CancelFunc - raftEngine *RaftEngine - limiter *ratelimit.RateLimiter - sizeMutex syncutil.Mutex - hasExtraUsedSpace bool - snapStats []*pdpb.SnapshotStat } // NewNode returns a Node. -func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) { +func NewNode(s *cases.Store, config *sc.SimConfig) (*Node, error) { ctx, cancel := context.WithCancel(context.Background()) store := &metapb.Store{ Id: s.ID, @@ -75,40 +76,19 @@ func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) Available: uint64(config.RaftStore.Capacity), }, } - tag := fmt.Sprintf("store %d", s.ID) - var ( - client Client - receiveRegionHeartbeatCh <-chan *pdpb.RegionHeartbeatResponse - err error - ) - // Client should wait if PD server is not ready. 
- for i := 0; i < maxInitClusterRetries; i++ { - client, receiveRegionHeartbeatCh, err = NewClient(pdAddr, tag) - if err == nil { - break - } - time.Sleep(time.Second) - } - - if err != nil { - cancel() - return nil, err - } ratio := config.Speed() speed := config.StoreIOMBPerSecond * units.MiB * int64(ratio) return &Node{ - Store: store, - stats: stats, - client: client, - ctx: ctx, - cancel: cancel, - tasks: make(map[uint64]*Task), - receiveRegionHeartbeatCh: receiveRegionHeartbeatCh, - limiter: ratelimit.NewRateLimiter(float64(speed), int(speed)), - tick: uint64(rand.Intn(storeHeartBeatPeriod)), - hasExtraUsedSpace: s.HasExtraUsedSpace, - snapStats: make([]*pdpb.SnapshotStat, 0), + Store: store, + stats: stats, + ctx: ctx, + cancel: cancel, + tasks: make(map[uint64]*Task), + limiter: ratelimit.NewRateLimiter(float64(speed), int(speed)), + tick: uint64(rand.Intn(storeHeartBeatPeriod)), + hasExtraUsedSpace: s.HasExtraUsedSpace, + snapStats: make([]*pdpb.SnapshotStat, 0), }, nil } @@ -205,7 +185,7 @@ func (n *Node) storeHeartBeat() { n.stats.SnapshotStats = stats err := n.client.StoreHeartbeat(ctx, &n.stats.StoreStats) if err != nil { - simutil.Logger.Info("report heartbeat error", + simutil.Logger.Info("report store heartbeat error", zap.Uint64("node-id", n.GetId()), zap.Error(err)) } @@ -230,7 +210,7 @@ func (n *Node) regionHeartBeat() { ctx, cancel := context.WithTimeout(n.ctx, pdTimeout) err := n.client.RegionHeartbeat(ctx, region) if err != nil { - simutil.Logger.Info("report heartbeat error", + simutil.Logger.Info("report region heartbeat error", zap.Uint64("node-id", n.Id), zap.Uint64("region-id", region.GetID()), zap.Error(err)) @@ -247,7 +227,7 @@ func (n *Node) reportRegionChange() { ctx, cancel := context.WithTimeout(n.ctx, pdTimeout) err := n.client.RegionHeartbeat(ctx, region) if err != nil { - simutil.Logger.Info("report heartbeat error", + simutil.Logger.Info("report region change heartbeat error", zap.Uint64("node-id", n.Id), zap.Uint64("region-id", region.GetID()), zap.Error(err)) From 6f8286be18804cdc1a333935b5d19f49ff516628 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Wed, 19 Jun 2024 15:49:17 +0800 Subject: [PATCH 37/47] mcs: update node every restart (#8302) close tikv/pd#8154 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/keyspace/tso_keyspace_group.go | 62 ++++++------ server/apiv2/handlers/tso_keyspace_group.go | 12 ++- .../mcs/keyspace/tso_keyspace_group_test.go | 96 +++++++++++++++++++ 3 files changed, 138 insertions(+), 32 deletions(-) diff --git a/pkg/keyspace/tso_keyspace_group.go b/pkg/keyspace/tso_keyspace_group.go index 30f4d0d88b4e..29b8add740cc 100644 --- a/pkg/keyspace/tso_keyspace_group.go +++ b/pkg/keyspace/tso_keyspace_group.go @@ -36,6 +36,7 @@ import ( "github.com/tikv/pd/pkg/utils/etcdutil" "github.com/tikv/pd/pkg/utils/logutil" "github.com/tikv/pd/pkg/utils/syncutil" + "github.com/tikv/pd/pkg/utils/typeutil" "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/mvcc/mvccpb" "go.uber.org/zap" @@ -181,10 +182,6 @@ func (m *GroupManager) allocNodesToAllKeyspaceGroups(ctx context.Context) { return case <-ticker.C: } - countOfNodes := m.GetNodesCount() - if countOfNodes < utils.DefaultKeyspaceGroupReplicaCount { - continue - } groups, err := m.store.LoadKeyspaceGroups(utils.DefaultKeyspaceGroupID, 0) if err != nil { log.Error("failed to load all keyspace groups", zap.Error(err)) @@ -194,23 +191,26 @@ func (m *GroupManager) allocNodesToAllKeyspaceGroups(ctx context.Context) { if len(groups) 
== 0 { continue } - withError := false for _, group := range groups { - if len(group.Members) < utils.DefaultKeyspaceGroupReplicaCount { - nodes, err := m.AllocNodesForKeyspaceGroup(group.ID, utils.DefaultKeyspaceGroupReplicaCount) + existMembers := make(map[string]struct{}) + for _, member := range group.Members { + if exist, addr := m.IsExistNode(member.Address); exist { + existMembers[addr] = struct{}{} + } + } + numExistMembers := len(existMembers) + if numExistMembers != 0 && numExistMembers == len(group.Members) && numExistMembers == m.GetNodesCount() { + continue + } + if numExistMembers < utils.DefaultKeyspaceGroupReplicaCount { + nodes, err := m.AllocNodesForKeyspaceGroup(group.ID, existMembers, utils.DefaultKeyspaceGroupReplicaCount) if err != nil { - withError = true log.Error("failed to alloc nodes for keyspace group", zap.Uint32("keyspace-group-id", group.ID), zap.Error(err)) continue } group.Members = nodes } } - if !withError { - // all keyspace groups have equal or more than default replica count - log.Info("all keyspace groups have equal or more than default replica count, stop to alloc node") - return - } } } @@ -745,7 +745,7 @@ func (m *GroupManager) GetNodesCount() int { } // AllocNodesForKeyspaceGroup allocates nodes for the keyspace group. -func (m *GroupManager) AllocNodesForKeyspaceGroup(id uint32, desiredReplicaCount int) ([]endpoint.KeyspaceGroupMember, error) { +func (m *GroupManager) AllocNodesForKeyspaceGroup(id uint32, existMembers map[string]struct{}, desiredReplicaCount int) ([]endpoint.KeyspaceGroupMember, error) { m.Lock() defer m.Unlock() ctx, cancel := context.WithTimeout(m.ctx, allocNodesTimeout) @@ -770,32 +770,34 @@ func (m *GroupManager) AllocNodesForKeyspaceGroup(id uint32, desiredReplicaCount if kg.IsMerging() { return ErrKeyspaceGroupInMerging(id) } - exists := make(map[string]struct{}) - for _, member := range kg.Members { - exists[member.Address] = struct{}{} - nodes = append(nodes, member) - } - if len(exists) >= desiredReplicaCount { - return nil + + for addr := range existMembers { + nodes = append(nodes, endpoint.KeyspaceGroupMember{ + Address: addr, + Priority: utils.DefaultKeyspaceGroupReplicaPriority, + }) } - for len(exists) < desiredReplicaCount { + + for len(existMembers) < desiredReplicaCount { select { case <-ctx.Done(): return nil case <-ticker.C: } - countOfNodes := m.GetNodesCount() - if countOfNodes < desiredReplicaCount || countOfNodes == 0 { // double check + if m.GetNodesCount() == 0 { // double check return ErrNoAvailableNode } + if len(existMembers) == m.GetNodesCount() { + break + } addr := m.nodesBalancer.Next() if addr == "" { return ErrNoAvailableNode } - if _, ok := exists[addr]; ok { + if _, ok := existMembers[addr]; ok { continue } - exists[addr] = struct{}{} + existMembers[addr] = struct{}{} nodes = append(nodes, endpoint.KeyspaceGroupMember{ Address: addr, Priority: utils.DefaultKeyspaceGroupReplicaPriority, @@ -894,14 +896,14 @@ func (m *GroupManager) SetPriorityForKeyspaceGroup(id uint32, node string, prior } // IsExistNode checks if the node exists. -func (m *GroupManager) IsExistNode(addr string) bool { +func (m *GroupManager) IsExistNode(addr string) (bool, string) { nodes := m.nodesBalancer.GetAll() for _, node := range nodes { - if node == addr { - return true + if typeutil.EqualBaseURLs(node, addr) { + return true, node } } - return false + return false, "" } // MergeKeyspaceGroups merges the keyspace group in the list into the target keyspace group. 
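A minimal caller-side sketch of the new allocation contract introduced above: the set of members that are still registered with the balancer is computed first (using the normalized address returned by `IsExistNode`) and then handed to `AllocNodesForKeyspaceGroup` together with the desired replica count. The variable names `m` (a `*GroupManager`) and `group` (a loaded keyspace group) are placeholders, not part of the patch.

    existMembers := make(map[string]struct{})
    for _, member := range group.Members {
        if exist, addr := m.IsExistNode(member.Address); exist {
            // Key by the balancer's own address form so later lookups match.
            existMembers[addr] = struct{}{}
        }
    }
    members, err := m.AllocNodesForKeyspaceGroup(group.ID, existMembers, utils.DefaultKeyspaceGroupReplicaCount)
    if err != nil {
        log.Error("failed to alloc nodes for keyspace group",
            zap.Uint32("keyspace-group-id", group.ID), zap.Error(err))
        return
    }
    group.Members = members
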
diff --git a/server/apiv2/handlers/tso_keyspace_group.go b/server/apiv2/handlers/tso_keyspace_group.go index d11cc2adab81..ed3d37c27a9c 100644 --- a/server/apiv2/handlers/tso_keyspace_group.go +++ b/server/apiv2/handlers/tso_keyspace_group.go @@ -413,8 +413,16 @@ func AllocNodesForKeyspaceGroup(c *gin.Context) { c.AbortWithStatusJSON(http.StatusBadRequest, "existed replica is larger than the new replica") return } + + // check if nodes exist + existMembers := make(map[string]struct{}) + for _, member := range keyspaceGroup.Members { + if exist, addr := manager.IsExistNode(member.Address); exist { + existMembers[addr] = struct{}{} + } + } // get the nodes - nodes, err := manager.AllocNodesForKeyspaceGroup(id, allocParams.Replica) + nodes, err := manager.AllocNodesForKeyspaceGroup(id, existMembers, allocParams.Replica) if err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, err.Error()) return @@ -455,7 +463,7 @@ func SetNodesForKeyspaceGroup(c *gin.Context) { } // check if node exists for _, node := range setParams.Nodes { - if !manager.IsExistNode(node) { + if exist, _ := manager.IsExistNode(node); !exist { c.AbortWithStatusJSON(http.StatusBadRequest, "node does not exist") return } diff --git a/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go b/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go index 0c7683b569c9..7c95b99bcc73 100644 --- a/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go +++ b/tests/integrations/mcs/keyspace/tso_keyspace_group_test.go @@ -335,6 +335,102 @@ func (suite *keyspaceGroupTestSuite) TestDefaultKeyspaceGroup() { } } +func (suite *keyspaceGroupTestSuite) TestAllocNodes() { + re := suite.Require() + // add three nodes. + nodes := make(map[string]bs.Server) + var cleanups []func() + defer func() { + for _, cleanup := range cleanups { + cleanup() + } + }() + for i := 0; i < utils.DefaultKeyspaceGroupReplicaCount+1; i++ { + s, cleanup := tests.StartSingleTSOTestServer(suite.ctx, re, suite.backendEndpoints, tempurl.Alloc()) + cleanups = append(cleanups, cleanup) + nodes[s.GetAddr()] = s + } + tests.WaitForPrimaryServing(re, nodes) + + // create a keyspace group. 
+ kgs := &handlers.CreateKeyspaceGroupParams{KeyspaceGroups: []*endpoint.KeyspaceGroup{ + { + ID: uint32(1), + UserKind: endpoint.Standard.String(), + }, + }} + code := suite.tryCreateKeyspaceGroup(re, kgs) + re.Equal(http.StatusOK, code) + + // alloc nodes for the keyspace group + var kg *endpoint.KeyspaceGroup + testutil.Eventually(re, func() bool { + kg, code = suite.tryGetKeyspaceGroup(re, utils.DefaultKeyspaceGroupID) + return code == http.StatusOK && kg != nil && len(kg.Members) == utils.DefaultKeyspaceGroupReplicaCount + }) + stopNode := kg.Members[0].Address + // close one of members + nodes[stopNode].Close() + + // the member list will be updated + testutil.Eventually(re, func() bool { + kg, code = suite.tryGetKeyspaceGroup(re, utils.DefaultKeyspaceGroupID) + for _, member := range kg.Members { + if member.Address == stopNode { + return false + } + } + return code == http.StatusOK && kg != nil && len(kg.Members) == utils.DefaultKeyspaceGroupReplicaCount + }) +} + +func (suite *keyspaceGroupTestSuite) TestAllocOneNode() { + re := suite.Require() + // add one tso server + nodes := make(map[string]bs.Server) + oldTSOServer, cleanupOldTSOserver := tests.StartSingleTSOTestServer(suite.ctx, re, suite.backendEndpoints, tempurl.Alloc()) + defer cleanupOldTSOserver() + nodes[oldTSOServer.GetAddr()] = oldTSOServer + + tests.WaitForPrimaryServing(re, nodes) + + // create a keyspace group. + kgs := &handlers.CreateKeyspaceGroupParams{KeyspaceGroups: []*endpoint.KeyspaceGroup{ + { + ID: uint32(1), + UserKind: endpoint.Standard.String(), + }, + }} + code := suite.tryCreateKeyspaceGroup(re, kgs) + re.Equal(http.StatusOK, code) + + // alloc nodes for the keyspace group + var kg *endpoint.KeyspaceGroup + testutil.Eventually(re, func() bool { + kg, code = suite.tryGetKeyspaceGroup(re, utils.DefaultKeyspaceGroupID) + return code == http.StatusOK && kg != nil && len(kg.Members) == 1 + }) + stopNode := kg.Members[0].Address + // close old tso server + nodes[stopNode].Close() + + // create a new tso server + newTSOServer, cleanupNewTSOServer := tests.StartSingleTSOTestServer(suite.ctx, re, suite.backendEndpoints, tempurl.Alloc()) + defer cleanupNewTSOServer() + nodes[newTSOServer.GetAddr()] = newTSOServer + + tests.WaitForPrimaryServing(re, nodes) + + // the member list will be updated + testutil.Eventually(re, func() bool { + kg, code = suite.tryGetKeyspaceGroup(re, utils.DefaultKeyspaceGroupID) + if len(kg.Members) != 0 && kg.Members[0].Address == stopNode { + return false + } + return code == http.StatusOK && kg != nil && len(kg.Members) == 1 + }) +} + func (suite *keyspaceGroupTestSuite) tryAllocNodesForKeyspaceGroup(re *require.Assertions, id int, request *handlers.AllocNodesForKeyspaceGroupParams) ([]endpoint.KeyspaceGroupMember, int) { data, err := json.Marshal(request) re.NoError(err) From 98047022dedf2b1a4e826baecc2d5da21cea8607 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 19 Jun 2024 18:01:18 +0800 Subject: [PATCH 38/47] pd-ctl: fix the output trim (#8309) ref tikv/pd#4399 Fix the output trim of the pd-ctl command. 
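The trimming can be illustrated with a short, self-contained sketch; the raw response body used here (a quoted JSON string plus a trailing newline) is an assumption for demonstration, not taken from the patch:

    package main

    import (
        "bytes"
        "fmt"
    )

    func main() {
        // Hypothetical raw body handed to requestJSON.
        msg := []byte("\"The scheduler is created.\"\n")
        // Same trimming as the fixed requestJSON: drop surrounding whitespace
        // first, then the JSON string quotes.
        msg = bytes.Trim(bytes.TrimSpace(msg), "\"")
        fmt.Printf("Success! %s\n", string(msg)) // Success! The scheduler is created.
    }
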
Signed-off-by: JmPotato --- tools/pd-ctl/pdctl/command/global.go | 5 +++-- tools/pd-ctl/tests/scheduler/scheduler_test.go | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index b29e2b632785..7438345127ea 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -294,10 +294,11 @@ func requestJSON(cmd *cobra.Command, method, prefix string, input map[string]any return nil }) if err != nil { - cmd.Printf("Failed! %s\n", err) + cmd.Printf("Failed! %s\n", strings.TrimSpace(err.Error())) return } - cmd.Printf("Success! %s\n", strings.Trim(string(msg), "\"")) + msg = bytes.Trim(bytes.TrimSpace(msg), "\"") + cmd.Printf("Success! %s\n", string(msg)) } func postJSON(cmd *cobra.Command, prefix string, input map[string]any) { diff --git a/tools/pd-ctl/tests/scheduler/scheduler_test.go b/tools/pd-ctl/tests/scheduler/scheduler_test.go index 96a1f5557f9f..3ea819610152 100644 --- a/tools/pd-ctl/tests/scheduler/scheduler_test.go +++ b/tools/pd-ctl/tests/scheduler/scheduler_test.go @@ -307,9 +307,9 @@ func (suite *schedulerTestSuite) checkScheduler(cluster *pdTests.TestCluster) { echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-region-scheduler"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-leader-scheduler", "1"}, nil) - re.Contains(echo, "Success! The scheduler is created.") + re.Equal("Success! The scheduler is created.\n", echo) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-leader-scheduler", "2"}, nil) - re.Contains(echo, "Success! The scheduler has been applied to the store.") + re.Equal("Success! The scheduler has been applied to the store.\n", echo) echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) re.Contains(echo, "Success!") echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-2"}, nil) From 8c2bb020328c1b80966681759dc8b2723a1e86f5 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 20 Jun 2024 17:22:18 +0800 Subject: [PATCH 39/47] tests/real_cluster: refine reboot cluster (#8311) close tikv/pd#8310 Signed-off-by: husharp --- tests/integrations/client/http_client_test.go | 24 ++++++---- tests/integrations/realcluster/deploy.sh | 2 + .../realcluster/reboot_pd_test.go | 48 +++++++++---------- 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index f4a48dcd63e1..229a658639d0 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -569,22 +569,26 @@ func (suite *httpClientTestSuite) TestSetStoreLabels() { defer cancel() resp, err := client.GetStores(ctx) re.NoError(err) - setStore := resp.Stores[0] - re.Empty(setStore.Store.Labels, nil) + re.NotEmpty(resp.Stores) + firstStore := resp.Stores[0] + re.Empty(firstStore.Store.Labels, nil) storeLabels := map[string]string{ "zone": "zone1", } - err = client.SetStoreLabels(ctx, 1, storeLabels) + err = client.SetStoreLabels(ctx, firstStore.Store.ID, storeLabels) re.NoError(err) - resp, err = client.GetStores(ctx) + getResp, err := client.GetStore(ctx, uint64(firstStore.Store.ID)) re.NoError(err) - for _, store := range resp.Stores { - if store.Store.ID == setStore.Store.ID { - for _, label := range store.Store.Labels { - re.Equal(label.Value, 
storeLabels[label.Key]) - } - } + + labelsMap := make(map[string]string) + for _, label := range getResp.Store.Labels { + re.NotNil(label) + labelsMap[label.Key] = label.Value + } + + for key, value := range storeLabels { + re.Equal(value, labelsMap[key]) } } diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index 8cce60e8ee60..31bf17655f88 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -23,6 +23,8 @@ if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && --pd.binpath ./bin/pd-server \ > $CUR_PATH/playground.log 2>&1 & else + # CI will download the binaries in the prepare phase. + # ref https://github.com/PingCAP-QE/ci/blob/387e9e533b365174962ccb1959442a7070f9cd66/pipelines/tikv/pd/latest/pull_integration_realcluster_test.groovy#L55-L68 color-green "using existing binaries..." $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor \ --pd.binpath ./bin/pd-server --kv.binpath ./bin/tikv-server --db.binpath ./bin/tidb-server --tiflash.binpath ./bin/tiflash --tag pd_test \ diff --git a/tests/integrations/realcluster/reboot_pd_test.go b/tests/integrations/realcluster/reboot_pd_test.go index 8e99b0822f09..b8914e87bd86 100644 --- a/tests/integrations/realcluster/reboot_pd_test.go +++ b/tests/integrations/realcluster/reboot_pd_test.go @@ -38,37 +38,37 @@ func TestReloadLabel(t *testing.T) { re := require.New(t) ctx := context.Background() - resp, _ := pdHTTPCli.GetStores(ctx) - setStore := resp.Stores[0] + resp, err := pdHTTPCli.GetStores(ctx) + re.NoError(err) + re.NotEmpty(resp.Stores) + firstStore := resp.Stores[0] // TiFlash labels will be ["engine": "tiflash"] - storeLabel := map[string]string{ + // So we need to merge the labels + storeLabels := map[string]string{ "zone": "zone1", } - for _, label := range setStore.Store.Labels { - storeLabel[label.Key] = label.Value + for _, label := range firstStore.Store.Labels { + storeLabels[label.Key] = label.Value } - err := pdHTTPCli.SetStoreLabels(ctx, setStore.Store.ID, storeLabel) - re.NoError(err) + re.NoError(pdHTTPCli.SetStoreLabels(ctx, firstStore.Store.ID, storeLabels)) - resp, err = pdHTTPCli.GetStores(ctx) - re.NoError(err) - for _, store := range resp.Stores { - if store.Store.ID == setStore.Store.ID { - for _, label := range store.Store.Labels { - re.Equal(label.Value, storeLabel[label.Key]) - } - } - } + checkLabelsAreEqual := func() { + resp, err := pdHTTPCli.GetStore(ctx, uint64(firstStore.Store.ID)) + re.NoError(err) - restartTiUP() + labelsMap := make(map[string]string) + for _, label := range resp.Store.Labels { + re.NotNil(label) + labelsMap[label.Key] = label.Value + } - resp, err = pdHTTPCli.GetStores(ctx) - re.NoError(err) - for _, store := range resp.Stores { - if store.Store.ID == setStore.Store.ID { - for _, label := range store.Store.Labels { - re.Equal(label.Value, storeLabel[label.Key]) - } + for key, value := range storeLabels { + re.Equal(value, labelsMap[key]) } } + // Check the label is set + checkLabelsAreEqual() + // Restart TiUP to reload the label + restartTiUP() + checkLabelsAreEqual() } From 049de1761e5623827932864235cd2fbb5d2698ba Mon Sep 17 00:00:00 2001 From: you06 Date: Thu, 20 Jun 2024 20:50:49 +0900 Subject: [PATCH 40/47] api: client and server support `BatchScanRegions` (#8300) close tikv/pd#8307, ref pingcap/tidb#53850 Add `BatchScanRegions` interface for pd-client. 
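A minimal usage sketch of the new interface, assuming a reachable PD at the placeholder endpoint `127.0.0.1:2379` and illustrative, non-overlapping key ranges; error handling is kept deliberately short:

    package main

    import (
        "context"
        "fmt"
        "log"

        pd "github.com/tikv/pd/client"
    )

    func main() {
        cli, err := pd.NewClient([]string{"127.0.0.1:2379"}, pd.SecurityOption{})
        if err != nil {
            log.Fatal(err)
        }
        defer cli.Close()

        // The key ranges must be non-overlapping; the keys here are made up.
        ranges := []pd.KeyRange{
            {StartKey: []byte("a"), EndKey: []byte("b")},
            {StartKey: []byte("c"), EndKey: []byte("d")},
        }
        // limit <= 0 means no limit; a region covering several ranges is
        // returned only once because the result is flattened.
        regions, err := cli.BatchScanRegions(context.Background(), ranges, 0)
        if err != nil {
            log.Fatal(err)
        }
        for _, r := range regions {
            fmt.Println(r.Meta.GetId())
        }
    }
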
Signed-off-by: you06 Co-authored-by: JmPotato --- client/client.go | 114 +++++++++++++++++++- client/go.mod | 2 +- client/go.sum | 4 +- client/http/types.go | 37 +------ client/metrics.go | 4 + go.mod | 3 +- go.sum | 8 +- pkg/core/basic_cluster.go | 8 ++ pkg/core/region.go | 36 +++++++ server/grpc_service.go | 79 ++++++++++++++ tests/integrations/client/client_test.go | 132 ++++++++++++++++++++++- tests/integrations/go.mod | 2 +- tests/integrations/go.sum | 8 +- tools/go.mod | 2 +- tools/go.sum | 8 +- tools/pd-api-bench/cases/cases.go | 1 + 16 files changed, 392 insertions(+), 56 deletions(-) diff --git a/client/client.go b/client/client.go index 1c8ef3cafe84..92cbd3d523fc 100644 --- a/client/client.go +++ b/client/client.go @@ -17,7 +17,9 @@ package pd import ( "context" "crypto/tls" + "encoding/hex" "fmt" + "net/url" "runtime/trace" "strings" "sync" @@ -85,11 +87,18 @@ type RPCClient interface { GetPrevRegion(ctx context.Context, key []byte, opts ...GetRegionOption) (*Region, error) // GetRegionByID gets a region and its leader Peer from PD by id. GetRegionByID(ctx context.Context, regionID uint64, opts ...GetRegionOption) (*Region, error) + // Deprecated: use BatchScanRegions instead. // ScanRegions gets a list of regions, starts from the region that contains key. - // Limit limits the maximum number of regions returned. + // Limit limits the maximum number of regions returned. It returns all the regions in the given range if limit <= 0. // If a region has no leader, corresponding leader will be placed by a peer // with empty value (PeerID is 0). ScanRegions(ctx context.Context, key, endKey []byte, limit int, opts ...GetRegionOption) ([]*Region, error) + // BatchScanRegions gets a list of regions, starts from the region that contains key. + // Limit limits the maximum number of regions returned. It returns all the regions in the given ranges if limit <= 0. + // If a region has no leader, corresponding leader will be placed by a peer + // with empty value (PeerID is 0). + // The returned regions are flattened, even there are key ranges located in the same region, only one region will be returned. + BatchScanRegions(ctx context.Context, keyRanges []KeyRange, limit int, opts ...GetRegionOption) ([]*Region, error) // GetStore gets a store from PD by store id. // The store may expire later. Caller is responsible for caching and taking care // of store change. @@ -337,6 +346,38 @@ type SecurityOption struct { SSLKEYBytes []byte } +// KeyRange defines a range of keys in bytes. +type KeyRange struct { + StartKey []byte + EndKey []byte +} + +// NewKeyRange creates a new key range structure with the given start key and end key bytes. +// Notice: the actual encoding of the key range is not specified here. It should be either UTF-8 or hex. +// - UTF-8 means the key has already been encoded into a string with UTF-8 encoding, like: +// []byte{52 56 54 53 54 99 54 99 54 102 50 48 53 55 54 102 55 50 54 99 54 52}, which will later be converted to "48656c6c6f20576f726c64" +// by using `string()` method. +// - Hex means the key is just a raw hex bytes without encoding to a UTF-8 string, like: +// []byte{72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100}, which will later be converted to "48656c6c6f20576f726c64" +// by using `hex.EncodeToString()` method. +func NewKeyRange(startKey, endKey []byte) *KeyRange { + return &KeyRange{startKey, endKey} +} + +// EscapeAsUTF8Str returns the URL escaped key strings as they are UTF-8 encoded. 
+func (r *KeyRange) EscapeAsUTF8Str() (startKeyStr, endKeyStr string) { + startKeyStr = url.QueryEscape(string(r.StartKey)) + endKeyStr = url.QueryEscape(string(r.EndKey)) + return +} + +// EscapeAsHexStr returns the URL escaped key strings as they are hex encoded. +func (r *KeyRange) EscapeAsHexStr() (startKeyStr, endKeyStr string) { + startKeyStr = url.QueryEscape(hex.EncodeToString(r.StartKey)) + endKeyStr = url.QueryEscape(hex.EncodeToString(r.EndKey)) + return +} + // NewClient creates a PD client. func NewClient( svrAddrs []string, security SecurityOption, opts ...ClientOption, @@ -1094,6 +1135,7 @@ func (c *client) ScanRegions(ctx context.Context, key, endKey []byte, limit int, if serviceClient == nil { return nil, errs.ErrClientGetProtoClient } + //nolint:staticcheck resp, err := pdpb.NewPDClient(serviceClient.GetClientConn()).ScanRegions(cctx, req) failpoint.Inject("responseNil", func() { resp = nil @@ -1103,6 +1145,7 @@ func (c *client) ScanRegions(ctx context.Context, key, endKey []byte, limit int, if protoClient == nil { return nil, errs.ErrClientGetProtoClient } + //nolint:staticcheck resp, err = protoClient.ScanRegions(cctx, req) } @@ -1113,6 +1156,74 @@ func (c *client) ScanRegions(ctx context.Context, key, endKey []byte, limit int, return handleRegionsResponse(resp), nil } +func (c *client) BatchScanRegions(ctx context.Context, ranges []KeyRange, limit int, opts ...GetRegionOption) ([]*Region, error) { + if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil { + span = span.Tracer().StartSpan("pdclient.BatchScanRegions", opentracing.ChildOf(span.Context())) + defer span.Finish() + } + start := time.Now() + defer func() { cmdDurationBatchScanRegions.Observe(time.Since(start).Seconds()) }() + + var cancel context.CancelFunc + scanCtx := ctx + if _, ok := ctx.Deadline(); !ok { + scanCtx, cancel = context.WithTimeout(ctx, c.option.timeout) + defer cancel() + } + options := &GetRegionOp{} + for _, opt := range opts { + opt(options) + } + pbRanges := make([]*pdpb.KeyRange, 0, len(ranges)) + for _, r := range ranges { + pbRanges = append(pbRanges, &pdpb.KeyRange{StartKey: r.StartKey, EndKey: r.EndKey}) + } + req := &pdpb.BatchScanRegionsRequest{ + Header: c.requestHeader(), + NeedBuckets: options.needBuckets, + Ranges: pbRanges, + Limit: int32(limit), + } + serviceClient, cctx := c.getRegionAPIClientAndContext(scanCtx, options.allowFollowerHandle && c.option.getEnableFollowerHandle()) + if serviceClient == nil { + return nil, errs.ErrClientGetProtoClient + } + resp, err := pdpb.NewPDClient(serviceClient.GetClientConn()).BatchScanRegions(cctx, req) + failpoint.Inject("responseNil", func() { + resp = nil + }) + if serviceClient.NeedRetry(resp.GetHeader().GetError(), err) { + protoClient, cctx := c.getClientAndContext(scanCtx) + if protoClient == nil { + return nil, errs.ErrClientGetProtoClient + } + resp, err = protoClient.BatchScanRegions(cctx, req) + } + + if err = c.respForErr(cmdFailedDurationBatchScanRegions, start, err, resp.GetHeader()); err != nil { + return nil, err + } + + return handleBatchRegionsResponse(resp), nil +} + +func handleBatchRegionsResponse(resp *pdpb.BatchScanRegionsResponse) []*Region { + regions := make([]*Region, 0, len(resp.GetRegions())) + for _, r := range resp.GetRegions() { + region := &Region{ + Meta: r.Region, + Leader: r.Leader, + PendingPeers: r.PendingPeers, + Buckets: r.Buckets, + } + for _, p := range r.DownPeers { + region.DownPeers = append(region.DownPeers, p.Peer) + } + regions = append(regions, region) + } + 
return regions +} + func handleRegionsResponse(resp *pdpb.ScanRegionsResponse) []*Region { var regions []*Region if len(resp.GetRegions()) == 0 { @@ -1131,6 +1242,7 @@ func handleRegionsResponse(resp *pdpb.ScanRegionsResponse) []*Region { Meta: r.Region, Leader: r.Leader, PendingPeers: r.PendingPeers, + Buckets: r.Buckets, } for _, p := range r.DownPeers { region.DownPeers = append(region.DownPeers, p.Peer) diff --git a/client/go.mod b/client/go.mod index 6baa2f112f40..475cf716125d 100644 --- a/client/go.mod +++ b/client/go.mod @@ -10,7 +10,7 @@ require ( github.com/opentracing/opentracing-go v1.2.0 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20231222062942-c0c73f41d0b2 + github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/prometheus/client_golang v1.18.0 github.com/stretchr/testify v1.8.2 diff --git a/client/go.sum b/client/go.sum index 54942bb0bb84..620f70007a77 100644 --- a/client/go.sum +++ b/client/go.sum @@ -46,8 +46,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c h1:xpW9bvK+HuuTm github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= -github.com/pingcap/kvproto v0.0.0-20231222062942-c0c73f41d0b2 h1:364A6VCS+l0oHBKZKotX9LzmfEtIO/NTccTIQcPp3Ug= -github.com/pingcap/kvproto v0.0.0-20231222062942-c0c73f41d0b2/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 h1:6aIKNB2YGAec4IUDLw6G2eDECiGiufZcgEbZSCELBx0= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= diff --git a/client/http/types.go b/client/http/types.go index ab6240494360..55f9b65caade 100644 --- a/client/http/types.go +++ b/client/http/types.go @@ -15,15 +15,14 @@ package http import ( - "encoding/hex" "encoding/json" "fmt" - "net/url" "time" "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/kvproto/pkg/keyspacepb" "github.com/pingcap/kvproto/pkg/pdpb" + pd "github.com/tikv/pd/client" ) // ClusterState saves some cluster state information. @@ -43,37 +42,11 @@ type State struct { StartTimestamp int64 `json:"start_timestamp"` } -// KeyRange defines a range of keys in bytes. -type KeyRange struct { - startKey []byte - endKey []byte -} - -// NewKeyRange creates a new key range structure with the given start key and end key bytes. -// Notice: the actual encoding of the key range is not specified here. It should be either UTF-8 or hex. -// - UTF-8 means the key has already been encoded into a string with UTF-8 encoding, like: -// []byte{52 56 54 53 54 99 54 99 54 102 50 48 53 55 54 102 55 50 54 99 54 52}, which will later be converted to "48656c6c6f20576f726c64" -// by using `string()` method. 
-// - Hex means the key is just a raw hex bytes without encoding to a UTF-8 string, like: -// []byte{72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100}, which will later be converted to "48656c6c6f20576f726c64" -// by using `hex.EncodeToString()` method. -func NewKeyRange(startKey, endKey []byte) *KeyRange { - return &KeyRange{startKey, endKey} -} +// KeyRange alias pd.KeyRange to avoid break client compatibility. +type KeyRange = pd.KeyRange -// EscapeAsUTF8Str returns the URL escaped key strings as they are UTF-8 encoded. -func (r *KeyRange) EscapeAsUTF8Str() (startKeyStr, endKeyStr string) { - startKeyStr = url.QueryEscape(string(r.startKey)) - endKeyStr = url.QueryEscape(string(r.endKey)) - return -} - -// EscapeAsHexStr returns the URL escaped key strings as they are hex encoded. -func (r *KeyRange) EscapeAsHexStr() (startKeyStr, endKeyStr string) { - startKeyStr = url.QueryEscape(hex.EncodeToString(r.startKey)) - endKeyStr = url.QueryEscape(hex.EncodeToString(r.endKey)) - return -} +// NewKeyRange alias pd.NewKeyRange to avoid break client compatibility. +var NewKeyRange = pd.NewKeyRange // NOTICE: the structures below are copied from the PD API definitions. // Please make sure the consistency if any change happens to the PD API. diff --git a/client/metrics.go b/client/metrics.go index 1895306eca24..e0b29fb8bcc6 100644 --- a/client/metrics.go +++ b/client/metrics.go @@ -128,6 +128,7 @@ var ( cmdDurationGetPrevRegion prometheus.Observer cmdDurationGetRegionByID prometheus.Observer cmdDurationScanRegions prometheus.Observer + cmdDurationBatchScanRegions prometheus.Observer cmdDurationGetStore prometheus.Observer cmdDurationGetAllStores prometheus.Observer cmdDurationUpdateGCSafePoint prometheus.Observer @@ -151,6 +152,7 @@ var ( cmdFailDurationGetPrevRegion prometheus.Observer cmdFailedDurationGetRegionByID prometheus.Observer cmdFailedDurationScanRegions prometheus.Observer + cmdFailedDurationBatchScanRegions prometheus.Observer cmdFailedDurationGetStore prometheus.Observer cmdFailedDurationGetAllStores prometheus.Observer cmdFailedDurationUpdateGCSafePoint prometheus.Observer @@ -174,6 +176,7 @@ func initCmdDurations() { cmdDurationGetPrevRegion = cmdDuration.WithLabelValues("get_prev_region") cmdDurationGetRegionByID = cmdDuration.WithLabelValues("get_region_byid") cmdDurationScanRegions = cmdDuration.WithLabelValues("scan_regions") + cmdDurationBatchScanRegions = cmdDuration.WithLabelValues("batch_scan_regions") cmdDurationGetStore = cmdDuration.WithLabelValues("get_store") cmdDurationGetAllStores = cmdDuration.WithLabelValues("get_all_stores") cmdDurationUpdateGCSafePoint = cmdDuration.WithLabelValues("update_gc_safe_point") @@ -197,6 +200,7 @@ func initCmdDurations() { cmdFailDurationGetPrevRegion = cmdFailedDuration.WithLabelValues("get_prev_region") cmdFailedDurationGetRegionByID = cmdFailedDuration.WithLabelValues("get_region_byid") cmdFailedDurationScanRegions = cmdFailedDuration.WithLabelValues("scan_regions") + cmdFailedDurationBatchScanRegions = cmdFailedDuration.WithLabelValues("batch_scan_regions") cmdFailedDurationGetStore = cmdFailedDuration.WithLabelValues("get_store") cmdFailedDurationGetAllStores = cmdFailedDuration.WithLabelValues("get_all_stores") cmdFailedDurationUpdateGCSafePoint = cmdFailedDuration.WithLabelValues("update_gc_safe_point") diff --git a/go.mod b/go.mod index 90c5639c9367..35e064a59b07 100644 --- a/go.mod +++ b/go.mod @@ -34,7 +34,7 @@ require ( github.com/pingcap/errcode v0.3.0 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c 
github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 + github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 @@ -113,7 +113,6 @@ require ( github.com/goccy/go-json v0.10.2 // indirect github.com/golang-jwt/jwt v3.2.2+incompatible // indirect github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 // indirect - github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/pprof v0.0.0-20211122183932-1daafda22083 // indirect diff --git a/go.sum b/go.sum index 6ec1baa72c47..69a7ffc51878 100644 --- a/go.sum +++ b/go.sum @@ -189,8 +189,8 @@ github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EO github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v0.0.0-20180814211427-aa810b61a9c7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -371,8 +371,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 h1:vDWWJKU6ztczn24XixahtLwcnJ15DOtSRIRM3jVtZNU= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 h1:6aIKNB2YGAec4IUDLw6G2eDECiGiufZcgEbZSCELBx0= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/pkg/core/basic_cluster.go b/pkg/core/basic_cluster.go index 2392b7ddac69..ea78c4ccf9c3 100644 --- a/pkg/core/basic_cluster.go +++ b/pkg/core/basic_cluster.go @@ -97,6 +97,7 
@@ type RegionSetInformer interface { GetAdjacentRegions(region *RegionInfo) (*RegionInfo, *RegionInfo) ScanRegions(startKey, endKey []byte, limit int) []*RegionInfo GetRegionByKey(regionKey []byte) *RegionInfo + BatchScanRegions(keyRanges *KeyRanges, limit int) []*RegionInfo } // StoreSetInformer provides access to a shared informer of stores. @@ -140,6 +141,13 @@ type KeyRanges struct { krs []*KeyRange } +// NewKeyRangesWithSize creates a KeyRanges with the hint size. +func NewKeyRangesWithSize(size int) *KeyRanges { + return &KeyRanges{ + krs: make([]*KeyRange, 0, size), + } +} + // Append appends a KeyRange. func (rs *KeyRanges) Append(startKey, endKey []byte) { rs.krs = append(rs.krs, &KeyRange{ diff --git a/pkg/core/region.go b/pkg/core/region.go index df4cfc17be2b..2a2e02fafe80 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -1816,6 +1816,42 @@ func (r *RegionsInfo) ScanRegions(startKey, endKey []byte, limit int) []*RegionI return res } +// BatchScanRegions scans regions in given key pairs, returns at most `limit` regions. +// limit <= 0 means no limit. +// The given key pairs should be non-overlapping. +func (r *RegionsInfo) BatchScanRegions(keyRanges *KeyRanges, limit int) []*RegionInfo { + r.t.RLock() + defer r.t.RUnlock() + + krs := keyRanges.Ranges() + res := make([]*RegionInfo, 0, len(krs)) + var lastRegion *RegionInfo + for _, keyRange := range krs { + if limit > 0 && len(res) >= limit { + return res + } + if lastRegion != nil { + if lastRegion.Contains(keyRange.EndKey) { + continue + } else if lastRegion.Contains(keyRange.StartKey) { + keyRange.StartKey = lastRegion.GetEndKey() + } + } + r.tree.scanRange(keyRange.StartKey, func(region *RegionInfo) bool { + if len(keyRange.EndKey) > 0 && bytes.Compare(region.GetStartKey(), keyRange.EndKey) >= 0 { + return false + } + if limit > 0 && len(res) >= limit { + return false + } + lastRegion = region + res = append(res, region) + return true + }) + } + return res +} + // ScanRegionWithIterator scans from the first region containing or behind start key, // until iterator returns false. func (r *RegionsInfo) ScanRegionWithIterator(startKey []byte, iterator func(region *RegionInfo) bool) { diff --git a/server/grpc_service.go b/server/grpc_service.go index e16fa4a8d4fa..d3f58dfe1abe 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -15,6 +15,7 @@ package server import ( + "bytes" "context" "fmt" "io" @@ -1569,6 +1570,7 @@ func (s *GrpcServer) GetRegionByID(ctx context.Context, request *pdpb.GetRegionB }, nil } +// Deprecated: use BatchScanRegions instead. // ScanRegions implements gRPC PDServer. func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsRequest) (*pdpb.ScanRegionsResponse, error) { if s.GetServiceMiddlewarePersistOptions().IsGRPCRateLimitEnabled() { @@ -1627,6 +1629,83 @@ func (s *GrpcServer) ScanRegions(ctx context.Context, request *pdpb.ScanRegionsR return resp, nil } +// BatchScanRegions implements gRPC PDServer. 
+func (s *GrpcServer) BatchScanRegions(ctx context.Context, request *pdpb.BatchScanRegionsRequest) (*pdpb.BatchScanRegionsResponse, error) { + if s.GetServiceMiddlewarePersistOptions().IsGRPCRateLimitEnabled() { + fName := currentFunction() + limiter := s.GetGRPCRateLimiter() + if done, err := limiter.Allow(fName); err == nil { + defer done() + } else { + return &pdpb.BatchScanRegionsResponse{ + Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, err.Error()), + }, nil + } + } + fn := func(ctx context.Context, client *grpc.ClientConn) (any, error) { + return pdpb.NewPDClient(client).BatchScanRegions(ctx, request) + } + followerHandle := new(bool) + if rsp, err := s.unaryFollowerMiddleware(ctx, request, fn, followerHandle); err != nil { + return nil, err + } else if rsp != nil { + return rsp.(*pdpb.BatchScanRegionsResponse), nil + } + + var rc *cluster.RaftCluster + if *followerHandle { + rc = s.cluster + if !rc.GetRegionSyncer().IsRunning() { + return &pdpb.BatchScanRegionsResponse{Header: s.regionNotFound()}, nil + } + } else { + rc = s.GetRaftCluster() + if rc == nil { + return &pdpb.BatchScanRegionsResponse{Header: s.notBootstrappedHeader()}, nil + } + } + needBucket := request.GetNeedBuckets() && !*followerHandle && rc.GetStoreConfig().IsEnableRegionBucket() + limit := request.GetLimit() + // cast to core.KeyRanges and check the validation. + keyRanges := core.NewKeyRangesWithSize(len(request.GetRanges())) + reqRanges := request.GetRanges() + for i, reqRange := range reqRanges { + if i > 0 { + if bytes.Compare(reqRange.StartKey, reqRanges[i-1].EndKey) < 0 { + return &pdpb.BatchScanRegionsResponse{Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, "invalid key range, ranges overlapped")}, nil + } + } + if len(reqRange.EndKey) > 0 && bytes.Compare(reqRange.StartKey, reqRange.EndKey) > 0 { + return &pdpb.BatchScanRegionsResponse{Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, "invalid key range, start key > end key")}, nil + } + keyRanges.Append(reqRange.StartKey, reqRange.EndKey) + } + res := rc.BatchScanRegions(keyRanges, int(limit)) + regions := make([]*pdpb.Region, 0, len(res)) + for _, r := range res { + leader := r.GetLeader() + if leader == nil { + leader = &metapb.Peer{} + } + var buckets *metapb.Buckets + if needBucket { + buckets = r.GetBuckets() + } + regions = append(regions, &pdpb.Region{ + Region: r.GetMeta(), + Leader: leader, + DownPeers: r.GetDownPeers(), + PendingPeers: r.GetPendingPeers(), + Buckets: buckets, + }) + } + if *followerHandle && len(regions) == 0 { + return &pdpb.BatchScanRegionsResponse{Header: s.regionNotFound()}, nil + } + resp := &pdpb.BatchScanRegionsResponse{Header: s.header(), Regions: regions} + return resp, nil +} + // AskSplit implements gRPC PDServer. 
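A note on the handler added above: it rejects malformed range lists before touching the region tree — ranges must arrive in ascending order, must not overlap, and a non-empty end key must not precede its start key (an empty end key means "unbounded"). The standalone Go sketch below isolates just that validation step; the keyRange type and validateKeyRanges helper are hypothetical names introduced only for illustration and are not part of the server code.

package main

import (
	"bytes"
	"errors"
	"fmt"
)

// keyRange is a simplified stand-in for the request ranges (pdpb.KeyRange in
// the proto); an empty EndKey means "scan to the end of the keyspace".
type keyRange struct {
	StartKey, EndKey []byte
}

// validateKeyRanges mirrors the checks performed by the handler above: each
// range must start at or after the previous range's end key, and a non-empty
// end key must not be smaller than its own start key.
func validateKeyRanges(ranges []keyRange) error {
	for i, r := range ranges {
		if i > 0 && bytes.Compare(r.StartKey, ranges[i-1].EndKey) < 0 {
			return errors.New("invalid key range, ranges overlapped")
		}
		if len(r.EndKey) > 0 && bytes.Compare(r.StartKey, r.EndKey) > 0 {
			return errors.New("invalid key range, start key > end key")
		}
	}
	return nil
}

func main() {
	ok := []keyRange{{StartKey: []byte{0}, EndKey: []byte{1}}, {StartKey: []byte{2}, EndKey: []byte{3}}}
	bad := []keyRange{{StartKey: []byte{0}, EndKey: []byte{2}}, {StartKey: []byte{1}, EndKey: []byte{3}}}
	fmt.Println(validateKeyRanges(ok))  // <nil>
	fmt.Println(validateKeyRanges(bad)) // invalid key range, ranges overlapped
}
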
func (s *GrpcServer) AskSplit(ctx context.Context, request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) { if s.GetServiceMiddlewarePersistOptions().IsGRPCRateLimitEnabled() { diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index 65acd8977262..e2d34f2c96fa 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -736,11 +736,11 @@ func (suite *followerForwardAndHandleTestSuite) TestGetTsoByFollowerForwarding1( checkTS(re, cli, lastTS) re.NoError(failpoint.Enable("github.com/tikv/pd/client/responseNil", "return(true)")) - regions, err := cli.ScanRegions(ctx, []byte(""), []byte(""), 100) + regions, err := cli.BatchScanRegions(ctx, []pd.KeyRange{{StartKey: []byte(""), EndKey: []byte("")}}, 100) re.NoError(err) re.Empty(regions) re.NoError(failpoint.Disable("github.com/tikv/pd/client/responseNil")) - regions, err = cli.ScanRegions(ctx, []byte(""), []byte(""), 100) + regions, err = cli.BatchScanRegions(ctx, []pd.KeyRange{{StartKey: []byte(""), EndKey: []byte("")}}, 100) re.NoError(err) re.Len(regions, 1) } @@ -1412,7 +1412,7 @@ func (suite *clientTestSuite) TestScanRegions() { // Wait for region heartbeats. testutil.Eventually(re, func() bool { - scanRegions, err := suite.client.ScanRegions(context.Background(), []byte{0}, nil, 10) + scanRegions, err := suite.client.BatchScanRegions(context.Background(), []pd.KeyRange{{StartKey: []byte{0}, EndKey: nil}}, 10) return err == nil && len(scanRegions) == 10 }) @@ -1430,7 +1430,7 @@ func (suite *clientTestSuite) TestScanRegions() { t := suite.T() check := func(start, end []byte, limit int, expect []*metapb.Region) { - scanRegions, err := suite.client.ScanRegions(context.Background(), start, end, limit) + scanRegions, err := suite.client.BatchScanRegions(context.Background(), []pd.KeyRange{{StartKey: start, EndKey: end}}, limit) re.NoError(err) re.Len(scanRegions, len(expect)) t.Log("scanRegions", scanRegions) @@ -1999,3 +1999,127 @@ func waitLeaderChange(re *require.Assertions, cluster *tests.TestCluster, old st }) return leader } + +func (suite *clientTestSuite) TestBatchScanRegions() { + re := suite.Require() + regionLen := 10 + regions := make([]*metapb.Region, 0, regionLen) + for i := 0; i < regionLen; i++ { + regionID := regionIDAllocator.alloc() + r := &metapb.Region{ + Id: regionID, + RegionEpoch: &metapb.RegionEpoch{ + ConfVer: 1, + Version: 1, + }, + StartKey: []byte{byte(i)}, + EndKey: []byte{byte(i + 1)}, + Peers: peers, + } + regions = append(regions, r) + req := &pdpb.RegionHeartbeatRequest{ + Header: newHeader(suite.srv), + Region: r, + Leader: peers[0], + } + err := suite.regionHeartbeat.Send(req) + re.NoError(err) + } + + // Wait for region heartbeats. + testutil.Eventually(re, func() bool { + scanRegions, err := suite.client.BatchScanRegions(context.Background(), []pd.KeyRange{{StartKey: []byte{0}, EndKey: nil}}, 10) + return err == nil && len(scanRegions) == 10 + }) + + // Set leader of region3 to nil. + region3 := core.NewRegionInfo(regions[3], nil) + suite.srv.GetRaftCluster().HandleRegionHeartbeat(region3) + + // Add down peer for region4. + region4 := core.NewRegionInfo(regions[4], regions[4].Peers[0], core.WithDownPeers([]*pdpb.PeerStats{{Peer: regions[4].Peers[1]}})) + suite.srv.GetRaftCluster().HandleRegionHeartbeat(region4) + + // Add pending peers for region5. 
+ region5 := core.NewRegionInfo(regions[5], regions[5].Peers[0], core.WithPendingPeers([]*metapb.Peer{regions[5].Peers[1], regions[5].Peers[2]})) + suite.srv.GetRaftCluster().HandleRegionHeartbeat(region5) + + // Add buckets for region6. + region6 := core.NewRegionInfo(regions[6], regions[6].Peers[0], core.SetBuckets(&metapb.Buckets{RegionId: regions[6].Id, Version: 2})) + suite.srv.GetRaftCluster().HandleRegionHeartbeat(region6) + + t := suite.T() + check := func(ranges []pd.KeyRange, limit int, expect []*metapb.Region) { + for _, bucket := range []bool{false, true} { + var opts []pd.GetRegionOption + if bucket { + opts = append(opts, pd.WithBuckets()) + } + scanRegions, err := suite.client.BatchScanRegions(context.Background(), ranges, limit, opts...) + re.NoError(err) + re.Len(scanRegions, len(expect)) + t.Log("scanRegions", scanRegions) + t.Log("expect", expect) + for i := range expect { + re.Equal(expect[i], scanRegions[i].Meta) + + if scanRegions[i].Meta.GetId() == region3.GetID() { + re.Equal(&metapb.Peer{}, scanRegions[i].Leader) + } else { + re.Equal(expect[i].Peers[0], scanRegions[i].Leader) + } + + if scanRegions[i].Meta.GetId() == region4.GetID() { + re.Equal([]*metapb.Peer{expect[i].Peers[1]}, scanRegions[i].DownPeers) + } + + if scanRegions[i].Meta.GetId() == region5.GetID() { + re.Equal([]*metapb.Peer{expect[i].Peers[1], expect[i].Peers[2]}, scanRegions[i].PendingPeers) + } + + if scanRegions[i].Meta.GetId() == region6.GetID() { + if !bucket { + re.Nil(scanRegions[i].Buckets) + } else { + re.Equal(scanRegions[i].Buckets, region6.GetBuckets()) + } + } + } + } + } + + // valid ranges + check([]pd.KeyRange{{StartKey: []byte{0}, EndKey: nil}}, 10, regions) + check([]pd.KeyRange{{StartKey: []byte{1}, EndKey: nil}}, 5, regions[1:6]) + check([]pd.KeyRange{ + {StartKey: []byte{0}, EndKey: []byte{1}}, + {StartKey: []byte{2}, EndKey: []byte{3}}, + {StartKey: []byte{4}, EndKey: []byte{5}}, + {StartKey: []byte{6}, EndKey: []byte{7}}, + {StartKey: []byte{8}, EndKey: []byte{9}}, + }, 10, []*metapb.Region{regions[0], regions[2], regions[4], regions[6], regions[8]}) + check([]pd.KeyRange{ + {StartKey: []byte{0}, EndKey: []byte{1}}, + {StartKey: []byte{2}, EndKey: []byte{3}}, + {StartKey: []byte{4}, EndKey: []byte{5}}, + {StartKey: []byte{6}, EndKey: []byte{7}}, + {StartKey: []byte{8}, EndKey: []byte{9}}, + }, 3, []*metapb.Region{regions[0], regions[2], regions[4]}) + check([]pd.KeyRange{ + {StartKey: []byte{0}, EndKey: []byte{0, 1}}, // non-continuous ranges in a region + {StartKey: []byte{0, 2}, EndKey: []byte{0, 3}}, + {StartKey: []byte{0, 3}, EndKey: []byte{0, 4}}, + {StartKey: []byte{0, 5}, EndKey: []byte{0, 6}}, + {StartKey: []byte{0, 7}, EndKey: []byte{3}}, + {StartKey: []byte{4}, EndKey: []byte{5}}, + }, 2, []*metapb.Region{regions[0], regions[1]}) + + // invalid ranges + _, err := suite.client.BatchScanRegions(context.Background(), []pd.KeyRange{{StartKey: []byte{1}, EndKey: []byte{0}}}, 10) + re.Error(err, "invalid key range, start key > end key") + _, err = suite.client.BatchScanRegions(context.Background(), []pd.KeyRange{ + {StartKey: []byte{0}, EndKey: []byte{2}}, + {StartKey: []byte{1}, EndKey: []byte{3}}, + }, 10) + re.Error(err, "invalid key range, ranges overlapped") +} diff --git a/tests/integrations/go.mod b/tests/integrations/go.mod index 7d07b668c800..3ad8e602a1ca 100644 --- a/tests/integrations/go.mod +++ b/tests/integrations/go.mod @@ -14,7 +14,7 @@ require ( github.com/go-sql-driver/mysql v1.7.0 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c 
github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c - github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 + github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/client_model v0.6.0 diff --git a/tests/integrations/go.sum b/tests/integrations/go.sum index 0701b42aea71..8e63acdb8cb3 100644 --- a/tests/integrations/go.sum +++ b/tests/integrations/go.sum @@ -184,8 +184,8 @@ github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EO github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v0.0.0-20180814211427-aa810b61a9c7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -368,8 +368,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c h1:CgbKAHto5CQgWM9fSBIvaxsJHuGP0uM74HXtv3MyyGQ= github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 h1:vDWWJKU6ztczn24XixahtLwcnJ15DOtSRIRM3jVtZNU= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 h1:6aIKNB2YGAec4IUDLw6G2eDECiGiufZcgEbZSCELBx0= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tools/go.mod b/tools/go.mod index eb2c279e7fa9..85b1559b9526 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -22,7 +22,7 @@ require ( github.com/mattn/go-shellwords v1.0.12 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 + github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.19.0 diff --git a/tools/go.sum 
b/tools/go.sum index 535ea668b970..e5692fa43122 100644 --- a/tools/go.sum +++ b/tools/go.sum @@ -182,8 +182,8 @@ github.com/golang-sql/sqlexp v0.1.0/go.mod h1:J4ad9Vo8ZCWQ2GMrC4UCQy1JpCbwU9m3EO github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0 h1:DACJavvAHhabrF08vX0COfcOBJRhZ8lUbR+ZWIs0Y5g= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= -github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v0.0.0-20180814211427-aa810b61a9c7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -365,8 +365,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1 h1:vDWWJKU6ztczn24XixahtLwcnJ15DOtSRIRM3jVtZNU= -github.com/pingcap/kvproto v0.0.0-20240403065636-c699538f7aa1/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 h1:6aIKNB2YGAec4IUDLw6G2eDECiGiufZcgEbZSCELBx0= +github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4/go.mod h1:rXxWk2UnwfUhLXha1jxRWPADw9eMZGWEWCg92Tgmb/8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tools/pd-api-bench/cases/cases.go b/tools/pd-api-bench/cases/cases.go index 72986df5ed8d..18d5c8732e6d 100644 --- a/tools/pd-api-bench/cases/cases.go +++ b/tools/pd-api-bench/cases/cases.go @@ -352,6 +352,7 @@ func (c *scanRegions) Unary(ctx context.Context, cli pd.Client) error { random := rand.Intn(upperBound) startID := c.regionSample*random*4 + 1 endID := c.regionSample*(random+1)*4 + 1 + //nolint:staticcheck _, err := cli.ScanRegions(ctx, generateKeyForSimulator(startID), generateKeyForSimulator(endID), c.regionSample) if err != nil { return err From 3b051d727d02dae34d3a8b7f7b373165ef621e5f Mon Sep 17 00:00:00 2001 From: Sparkle <1284531+baurine@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:34:18 +0800 Subject: [PATCH 41/47] chore(dashboard): update TiDB Dashboard to v8.2.0-91f6c281 [master] (#8315) ref tikv/pd#4257 Signed-off-by: baurine <2008.hbl@gmail.com> --- go.mod | 2 +- go.sum | 4 ++-- scripts/dashboard-version | 2 +- tests/integrations/go.mod | 2 +- 
tests/integrations/go.sum | 4 ++-- tools/go.mod | 2 +- tools/go.sum | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 35e064a59b07..1ef14f416e8a 100644 --- a/go.mod +++ b/go.mod @@ -37,7 +37,7 @@ require ( github.com/pingcap/kvproto v0.0.0-20240620063548-118a4cab53e4 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 - github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 + github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/common v0.51.1 github.com/sasha-s/go-deadlock v0.2.0 diff --git a/go.sum b/go.sum index 69a7ffc51878..659cd116e9c1 100644 --- a/go.sum +++ b/go.sum @@ -378,8 +378,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 h1:eFu98FbfJB7PKWOtkaV6YNXXJWqDhczQX56j/iucgU4= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 h1:01flLztcoWBeT5pe69Q8LAB2Hty0s9Rqc3RvHU4AQK8= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/scripts/dashboard-version b/scripts/dashboard-version index 9b2a38982561..08a22137df5a 100644 --- a/scripts/dashboard-version +++ b/scripts/dashboard-version @@ -1,3 +1,3 @@ # This file is updated by running scripts/update-dashboard.sh # Don't edit it manullay -8.0.0-9768844f +8.2.0-91f6c281 diff --git a/tests/integrations/go.mod b/tests/integrations/go.mod index 3ad8e602a1ca..8a570d52458e 100644 --- a/tests/integrations/go.mod +++ b/tests/integrations/go.mod @@ -125,7 +125,7 @@ require ( github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d // indirect github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect diff --git a/tests/integrations/go.sum b/tests/integrations/go.sum index 8e63acdb8cb3..c88919f6571c 100644 --- a/tests/integrations/go.sum +++ b/tests/integrations/go.sum @@ -375,8 +375,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 
h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 h1:eFu98FbfJB7PKWOtkaV6YNXXJWqDhczQX56j/iucgU4= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 h1:01flLztcoWBeT5pe69Q8LAB2Hty0s9Rqc3RvHU4AQK8= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= diff --git a/tools/go.mod b/tools/go.mod index 85b1559b9526..f424f12458e7 100644 --- a/tools/go.mod +++ b/tools/go.mod @@ -127,7 +127,7 @@ require ( github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d // indirect github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 // indirect + github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect diff --git a/tools/go.sum b/tools/go.sum index e5692fa43122..c2656b3e6565 100644 --- a/tools/go.sum +++ b/tools/go.sum @@ -372,8 +372,8 @@ github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 h1:QV6jqlfOkh8hqvEAgwBZa+4bSgO0EeKC7s5c6Luam2I= github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21/go.mod h1:QYnjfA95ZaMefyl1NO8oPtKeb8pYUdnDVhQgf+qdpjM= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7 h1:eFu98FbfJB7PKWOtkaV6YNXXJWqDhczQX56j/iucgU4= -github.com/pingcap/tidb-dashboard v0.0.0-20240326110213-9768844ff5d7/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441 h1:01flLztcoWBeT5pe69Q8LAB2Hty0s9Rqc3RvHU4AQK8= +github.com/pingcap/tidb-dashboard v0.0.0-20240612100141-91f6c281e441/go.mod h1:ucZBRz52icb23T/5Z4CsuUHmarYiin7p2MeiVBe+o8c= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e h1:FBaTXU8C3xgt/drM58VHxojHo/QoG1oPsgWTGvaSpO4= github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e/go.mod h1:A7mrd7WHBl1o63LE2bIBGEJMTNWXqhgmYiOvMLxozfs= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= From 41e63768870fd86aaf78f499503a601249a7db39 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 21 Jun 2024 14:31:18 +0800 Subject: [PATCH 42/47] tools/simulator: support deleting a specified store (#8246) ref tikv/pd#8135 Signed-off-by: husharp Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- conf/simconfig.toml | 8 +- .../simulator/cases/balance_leader.go | 21 ++++- .../simulator/cases/balance_region.go | 28 ++++-- 
tools/pd-simulator/simulator/cases/cases.go | 2 +- .../cases/diagnose_label_isolation.go | 93 ++++++++++++------- .../simulator/cases/diagnose_rule.go | 52 +++++++---- .../pd-simulator/simulator/cases/hot_read.go | 38 ++++++-- .../pd-simulator/simulator/cases/hot_write.go | 49 +++++++--- .../simulator/cases/import_data.go | 22 +++-- .../simulator/cases/makeup_down_replica.go | 2 +- .../simulator/cases/region_merge.go | 17 +++- .../simulator/cases/region_split.go | 11 ++- tools/pd-simulator/simulator/drive.go | 6 +- tools/pd-simulator/simulator/event.go | 26 ++++-- 14 files changed, 258 insertions(+), 117 deletions(-) diff --git a/conf/simconfig.toml b/conf/simconfig.toml index 428ee61e5089..c0edb182652b 100644 --- a/conf/simconfig.toml +++ b/conf/simconfig.toml @@ -1,8 +1,10 @@ # PD Simulator Configuration -[tick] ## the tick interval when starting PD inside (default: "100ms") sim-tick-interval = "100ms" +total-store = 10 +total-region = 10000 +case-name = "balance-leader" [store] ## the capacity size of a new store in GB (default: 1024) @@ -11,8 +13,8 @@ store-capacity = 1024 store-available = 1024 ## the io rate of a new store in MB/s (default: 40) store-io-per-second = 40 -## the version of a new store (default: "2.1.0") -store-version = "2.1.0" +## the version of a new store (default: "8.1.0") +store-version = "8.1.0" ## the meaning of these configurations below are similar with config.toml [server] diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index fd9028bc91af..1dad09850a5a 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -28,12 +28,15 @@ func newBalanceLeader(config *sc.SimConfig) *Case { totalStore := config.TotalStore totalRegion := config.TotalRegion + allStores := make(map[uint64]struct{}, totalStore) replica := int(config.ServerConfig.Replication.MaxReplicas) for i := 0; i < totalStore; i++ { + id := simutil.IDAllocator.NextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: simutil.IDAllocator.NextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } leaderStoreID := simCase.Stores[totalStore-1].ID @@ -58,10 +61,18 @@ func newBalanceLeader(config *sc.SimConfig) *Case { }) } - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - for i := 1; i <= totalStore; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - if !isUniform(leaderCount, totalRegion/totalStore) { + simCase.Checker = func(stores []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + if len(allStores) == 0 { + return false + } + for storeID := range allStores { + leaderCount := regions.GetStoreLeaderCount(storeID) + if !isUniform(leaderCount, totalRegion/len(allStores)) { return false } } diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index 82a7ac2d7042..8798a656fd75 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -30,6 +30,7 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { totalStore := config.TotalStore totalRegion := config.TotalRegion replica := int(config.ServerConfig.Replication.MaxReplicas) + allStores := make(map[uint64]struct{}, totalStore) for i := 0; i < totalStore; i++ { s 
:= &Store{ @@ -40,6 +41,7 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { s.HasExtraUsedSpace = true } simCase.Stores = append(simCase.Stores, s) + allStores[s.ID] = struct{}{} } for i := 0; i < totalRegion; i++ { @@ -57,21 +59,27 @@ func newRedundantBalanceRegion(config *sc.SimConfig) *Case { }) } - storesLastUpdateTime := make([]int64, totalStore+1) - storeLastAvailable := make([]uint64, totalStore+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + storesLastUpdateTime := make(map[uint64]int64, totalStore) + storeLastAvailable := make(map[uint64]uint64, totalStore) + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + curTime := time.Now().Unix() - for i := 1; i <= totalStore; i++ { - available := stats[i].GetAvailable() - if curTime-storesLastUpdateTime[i] > 60 { - if storeLastAvailable[i] != available { + for storeID := range allStores { + available := stats[storeID].GetAvailable() + if curTime-storesLastUpdateTime[storeID] > 60 { + if storeLastAvailable[storeID] != available { return false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { return false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { return false } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index c4e2f9999785..238b54c935ae 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -45,7 +45,7 @@ type Region struct { } // CheckerFunc checks if the scheduler is finished. -type CheckerFunc func(*core.RegionsInfo, []info.StoreStats) bool +type CheckerFunc func([]*metapb.Store, *core.RegionsInfo, []info.StoreStats) bool // Case represents a test suite for simulator. 
type Case struct { diff --git a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go index 090371366083..9fe65a3d56a0 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go +++ b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go @@ -33,6 +33,7 @@ func newLabelNotMatch1(_ *sc.SimConfig) *Case { num1, num2 := 3, 1 storeNum, regionNum := num1+num2, 200 + allStores := make(map[uint64]struct{}, storeNum+1) for i := 0; i < num1; i++ { id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ @@ -40,11 +41,14 @@ func newLabelNotMatch1(_ *sc.SimConfig) *Case { Status: metapb.StoreState_Up, Labels: []*metapb.StoreLabel{{Key: "host", Value: fmt.Sprintf("host%d", id)}}, }) + allStores[id] = struct{}{} } + id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} for i := 0; i < regionNum; i++ { peers := []*metapb.Peer{ @@ -61,24 +65,30 @@ func newLabelNotMatch1(_ *sc.SimConfig) *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + storesLastUpdateTime := make(map[uint64]int64, storeNum+1) + storeLastAvailable := make(map[uint64]uint64, storeNum+1) + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + res := true curTime := time.Now().Unix() storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { - available := stats[i].GetAvailable() + for storeID := range allStores { + available := stats[storeID].GetAvailable() storesAvailable = append(storesAvailable, available) - if curTime-storesLastUpdateTime[i] > 360 { - if storeLastAvailable[i] != available { + if curTime-storesLastUpdateTime[storeID] > 360 { + if storeLastAvailable[storeID] != available { res = false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { res = false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { res = false } @@ -95,6 +105,7 @@ func newLabelIsolation1(_ *sc.SimConfig) *Case { num1, num2 := 2, 2 storeNum, regionNum := num1+num2, 300 + allStores := make(map[uint64]struct{}, storeNum+1) for i := 0; i < num1; i++ { id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ @@ -102,14 +113,16 @@ func newLabelIsolation1(_ *sc.SimConfig) *Case { Status: metapb.StoreState_Up, Labels: []*metapb.StoreLabel{{Key: "host", Value: fmt.Sprintf("host%d", id)}}, }) + allStores[id] = struct{}{} } - id := IDAllocator.GetID() + 1 for i := 0; i < num2; i++ { + id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: id, Status: metapb.StoreState_Up, Labels: []*metapb.StoreLabel{{Key: "host", Value: fmt.Sprintf("host%d", id)}}, }) + allStores[id] = struct{}{} } for i := 0; i < regionNum; i++ { @@ -127,24 +140,30 @@ func newLabelIsolation1(_ *sc.SimConfig) *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + 
storesLastUpdateTime := make(map[uint64]int64, storeNum) + storeLastAvailable := make(map[uint64]uint64, storeNum) + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + res := true curTime := time.Now().Unix() storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { - available := stats[i].GetAvailable() + for storeID := range allStores { + available := stats[storeID].GetAvailable() storesAvailable = append(storesAvailable, available) - if curTime-storesLastUpdateTime[i] > 360 { - if storeLastAvailable[i] != available { + if curTime-storesLastUpdateTime[storeID] > 360 { + if storeLastAvailable[storeID] != available { res = false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { res = false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { res = false } @@ -160,12 +179,14 @@ func newLabelIsolation2(_ *sc.SimConfig) *Case { simCase.Labels = []string{"dc", "zone", "host"} storeNum, regionNum := 5, 200 + allStores := make(map[uint64]struct{}, storeNum) for i := 0; i < storeNum; i++ { id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } simCase.Stores[0].Labels = []*metapb.StoreLabel{{Key: "dc", Value: "dc1"}, {Key: "zone", Value: "zone1"}, {Key: "host", Value: "host1"}} simCase.Stores[1].Labels = []*metapb.StoreLabel{{Key: "dc", Value: "dc1"}, {Key: "zone", Value: "zone1"}, {Key: "host", Value: "host2"}} @@ -188,24 +209,30 @@ func newLabelIsolation2(_ *sc.SimConfig) *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + storesLastUpdateTime := make(map[uint64]int64, storeNum) + storeLastAvailable := make(map[uint64]uint64, storeNum) + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + res := true curTime := time.Now().Unix() storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { - available := stats[i].GetAvailable() + for storeID := range allStores { + available := stats[storeID].GetAvailable() storesAvailable = append(storesAvailable, available) - if curTime-storesLastUpdateTime[i] > 360 { - if storeLastAvailable[i] != available { + if curTime-storesLastUpdateTime[storeID] > 360 { + if storeLastAvailable[storeID] != available { res = false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { res = false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { res = false } diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 2cd11b9624a0..26f563297aef 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -65,12 +65,14 @@ func newRule1(_ *sc.SimConfig) *Case { }) storeNum, regionNum := 9, 300 + allStores := 
make(map[uint64]struct{}, storeNum) for i := 0; i < storeNum; i++ { id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } simCase.Stores[0].Labels = []*metapb.StoreLabel{{Key: "region", Value: "region2"}, {Key: "idc", Value: "idc1"}} simCase.Stores[1].Labels = []*metapb.StoreLabel{{Key: "region", Value: "region2"}, {Key: "idc", Value: "idc1"}} @@ -100,24 +102,30 @@ func newRule1(_ *sc.SimConfig) *Case { }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + storesLastUpdateTime := make(map[uint64]int64, storeNum) + storeLastAvailable := make(map[uint64]uint64, storeNum) + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + res := true curTime := time.Now().Unix() storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { - available := stats[i].GetAvailable() + for storeID := range allStores { + available := stats[storeID].GetAvailable() storesAvailable = append(storesAvailable, available) - if curTime-storesLastUpdateTime[i] > 360 { - if storeLastAvailable[i] != available { + if curTime-storesLastUpdateTime[storeID] > 360 { + if storeLastAvailable[storeID] != available { res = false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { res = false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { res = false } @@ -150,12 +158,14 @@ func newRule2(_ *sc.SimConfig) *Case { }) storeNum, regionNum := 6, 300 + allStores := make(map[uint64]struct{}, storeNum) for i := 0; i < storeNum; i++ { id := IDAllocator.nextID() simCase.Stores = append(simCase.Stores, &Store{ ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } simCase.Stores[0].Labels = []*metapb.StoreLabel{{Key: "region", Value: "region1"}} simCase.Stores[1].Labels = []*metapb.StoreLabel{{Key: "region", Value: "region1"}} @@ -181,22 +191,28 @@ func newRule2(_ *sc.SimConfig) *Case { storesLastUpdateTime := make([]int64, storeNum+1) storeLastAvailable := make([]uint64, storeNum+1) - simCase.Checker = func(_ *core.RegionsInfo, stats []info.StoreStats) bool { + simCase.Checker = func(stores []*metapb.Store, _ *core.RegionsInfo, stats []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + res := true curTime := time.Now().Unix() storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { - available := stats[i].GetAvailable() + for storeID := range allStores { + available := stats[storeID].GetAvailable() storesAvailable = append(storesAvailable, available) - if curTime-storesLastUpdateTime[i] > 360 { - if storeLastAvailable[i] != available { + if curTime-storesLastUpdateTime[storeID] > 360 { + if storeLastAvailable[storeID] != available { res = false } - if stats[i].ToCompactionSize != 0 { + if stats[storeID].ToCompactionSize != 0 { res = false } - storesLastUpdateTime[i] = curTime - storeLastAvailable[i] = available + storesLastUpdateTime[storeID] = curTime + storeLastAvailable[storeID] = available } else { res = false } diff --git 
a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index d154886b0a47..7f4d93fb43b8 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -15,6 +15,8 @@ package cases import ( + "fmt" + "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" @@ -23,18 +25,22 @@ import ( "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) +var hotReadStore uint64 = 1 + func newHotRead(config *sc.SimConfig) *Case { var simCase Case totalStore := config.TotalStore totalRegion := config.TotalRegion replica := int(config.ServerConfig.Replication.MaxReplicas) - + allStores := make(map[uint64]struct{}, totalStore) // Initialize the cluster for i := 0; i < totalStore; i++ { + id := simutil.IDAllocator.NextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: simutil.IDAllocator.NextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } for i := 0; i < totalRegion; i++ { @@ -54,12 +60,18 @@ func newHotRead(config *sc.SimConfig) *Case { }) } + // select the first store as hot read store + for store := range allStores { + hotReadStore = store + break + } + // Events description - // select regions on store 1 as hot read regions. + // select regions on `hotReadStore` as hot read regions. selectRegionNum := 4 * totalStore readFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { - if r.Leader.GetStoreId() == 1 { + if r.Leader.GetStoreId() == hotReadStore { readFlow[r.ID] = 128 * units.MiB if len(readFlow) == selectRegionNum { break @@ -72,15 +84,25 @@ func newHotRead(config *sc.SimConfig) *Case { } simCase.Events = []EventDescriptor{e} // Checker description - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, totalStore) + simCase.Checker = func(stores []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + if store.Id == hotReadStore { + simutil.Logger.Error(fmt.Sprintf("hot store %d is removed", hotReadStore)) + return true + } + delete(allStores, store.GetId()) + } + } + + leaderCount := make(map[uint64]int, len(allStores)) for id := range readFlow { leaderStore := regions.GetRegion(id).GetLeader().GetStoreId() - leaderCount[int(leaderStore-1)]++ + leaderCount[leaderStore]++ } // check count diff < 2. 
- var min, max int + var min, max uint64 for i := range leaderCount { if leaderCount[i] > leaderCount[max] { max = i diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index e73ca6f3ce3b..8f08264590d2 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -15,6 +15,7 @@ package cases import ( + "fmt" "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" @@ -23,18 +24,22 @@ import ( "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) +var hotWriteStore uint64 = 1 + func newHotWrite(config *sc.SimConfig) *Case { var simCase Case totalStore := config.TotalStore totalRegion := config.TotalRegion replica := int(config.ServerConfig.Replication.MaxReplicas) - + allStores := make(map[uint64]struct{}, totalStore) // Initialize the cluster for i := 0; i < totalStore; i++ { + id := simutil.IDAllocator.NextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: simutil.IDAllocator.NextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } for i := 0; i < totalRegion; i++ { @@ -54,14 +59,20 @@ func newHotWrite(config *sc.SimConfig) *Case { }) } + // select the first store as hot write store. + for store := range allStores { + hotWriteStore = store + break + } + // Events description - // select regions on store 1 as hot write regions. - selectStoreNum := totalStore - writeFlow := make(map[uint64]int64, selectStoreNum) + // select regions on `hotWriteStore` as hot write regions. + selectRegionNum := totalStore + writeFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { - if r.Leader.GetStoreId() == 1 { + if r.Leader.GetStoreId() == hotWriteStore { writeFlow[r.ID] = 2 * units.MiB - if len(writeFlow) == selectStoreNum { + if len(writeFlow) == selectRegionNum { break } } @@ -70,23 +81,31 @@ func newHotWrite(config *sc.SimConfig) *Case { e.Step = func(int64) map[uint64]int64 { return writeFlow } - simCase.Events = []EventDescriptor{e} - // Checker description - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, totalStore) - peerCount := make([]int, totalStore) + simCase.Checker = func(stores []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + if store.Id == hotWriteStore { + simutil.Logger.Error(fmt.Sprintf("hot store %d is removed", hotReadStore)) + return true + } + delete(allStores, store.GetId()) + } + } + + leaderCount := make(map[uint64]int, len(allStores)) + peerCount := make(map[uint64]int, totalStore) for id := range writeFlow { region := regions.GetRegion(id) - leaderCount[int(region.GetLeader().GetStoreId()-1)]++ + leaderCount[region.GetLeader().GetStoreId()]++ for _, p := range region.GetPeers() { - peerCount[int(p.GetStoreId()-1)]++ + peerCount[p.GetStoreId()]++ } } // check count diff <= 2. 
- var minLeader, maxLeader, minPeer, maxPeer int + var minLeader, maxLeader, minPeer, maxPeer uint64 for i := range leaderCount { if leaderCount[i] > leaderCount[maxLeader] { maxLeader = i diff --git a/tools/pd-simulator/simulator/cases/import_data.go b/tools/pd-simulator/simulator/cases/import_data.go index b9f448a6cf6e..e37aadcfeba2 100644 --- a/tools/pd-simulator/simulator/cases/import_data.go +++ b/tools/pd-simulator/simulator/cases/import_data.go @@ -36,13 +36,15 @@ func newImportData(config *sc.SimConfig) *Case { totalStore := config.TotalStore totalRegion := config.TotalRegion replica := int(config.ServerConfig.Replication.MaxReplicas) - + allStores := make(map[uint64]struct{}, totalStore) // Initialize the cluster for i := 0; i < totalStore; i++ { + id := simutil.IDAllocator.NextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } for i := 0; i < totalRegion; i++ { @@ -83,7 +85,13 @@ func newImportData(config *sc.SimConfig) *Case { checkCount := uint64(0) var newRegionCount [][3]int var allRegionCount [][3]int - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { + simCase.Checker = func(stores []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + leaderDist := make(map[uint64]int) peerDist := make(map[uint64]int) leaderTotal := 0 @@ -115,9 +123,9 @@ func newImportData(config *sc.SimConfig) *Case { tableLeaderLog = fmt.Sprintf("%s [store %d]:%.2f%%", tableLeaderLog, storeID, float64(leaderCount)/float64(leaderTotal)*100) } } - for storeID := 1; storeID <= 10; storeID++ { - if peerCount, ok := peerDist[uint64(storeID)]; ok { - newRegionCount = append(newRegionCount, [3]int{storeID, int(checkCount), peerCount}) + for storeID := range allStores { + if peerCount, ok := peerDist[storeID]; ok { + newRegionCount = append(newRegionCount, [3]int{int(storeID), int(checkCount), peerCount}) tablePeerLog = fmt.Sprintf("%s [store %d]:%.2f%%", tablePeerLog, storeID, float64(peerCount)/float64(peerTotal)*100) } } @@ -126,7 +134,7 @@ func newImportData(config *sc.SimConfig) *Case { totalPeerLog := fmt.Sprintf("%d peer:", regionTotal*3) isEnd := false var regionProps []float64 - for storeID := uint64(1); storeID <= 10; storeID++ { + for storeID := range allStores { totalLeaderLog = fmt.Sprintf("%s [store %d]:%.2f%%", totalLeaderLog, storeID, float64(regions.GetStoreLeaderCount(storeID))/float64(regionTotal)*100) regionProp := float64(regions.GetStoreRegionCount(storeID)) / float64(regionTotal*3) * 100 regionProps = append(regionProps, regionProp) diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index a5ee63e71a0b..ede3c4ba083f 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -69,7 +69,7 @@ func newMakeupDownReplicas(config *sc.SimConfig) *Case { } simCase.Events = []EventDescriptor{e} - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { + simCase.Checker = func(_ []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { if !down { return false } diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 8097565d1a72..3d050070203d 100644 --- 
a/tools/pd-simulator/simulator/cases/region_merge.go +++ b/tools/pd-simulator/simulator/cases/region_merge.go @@ -28,12 +28,15 @@ func newRegionMerge(config *sc.SimConfig) *Case { totalStore := config.TotalStore totalRegion := config.TotalRegion replica := int(config.ServerConfig.Replication.MaxReplicas) + allStores := make(map[uint64]struct{}, totalStore) for i := 0; i < totalStore; i++ { + id := simutil.IDAllocator.NextID() simCase.Stores = append(simCase.Stores, &Store{ - ID: simutil.IDAllocator.NextID(), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } for i := 0; i < totalRegion; i++ { @@ -54,10 +57,16 @@ func newRegionMerge(config *sc.SimConfig) *Case { } // Checker description mergeRatio := 4 // when max-merge-region-size is 20, per region will reach 40MB - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { + simCase.Checker = func(stores []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for _, store := range stores { + if store.NodeState == metapb.NodeState_Removed { + delete(allStores, store.GetId()) + } + } + currentPeerCount := 0 - for i := 1; i <= totalStore; i++ { - currentPeerCount += regions.GetStoreRegionCount(uint64(i)) + for storeID := range allStores { + currentPeerCount += regions.GetStoreRegionCount(storeID) } return isUniform(currentPeerCount, totalRegion*replica/mergeRatio) } diff --git a/tools/pd-simulator/simulator/cases/region_split.go b/tools/pd-simulator/simulator/cases/region_split.go index 7b712f4dc483..b158541e5cc8 100644 --- a/tools/pd-simulator/simulator/cases/region_split.go +++ b/tools/pd-simulator/simulator/cases/region_split.go @@ -25,12 +25,15 @@ import ( func newRegionSplit(config *sc.SimConfig) *Case { var simCase Case totalStore := config.TotalStore + allStores := make(map[uint64]struct{}, totalStore) for i := 0; i < totalStore; i++ { + id := uint64(i) simCase.Stores = append(simCase.Stores, &Store{ - ID: uint64(i), + ID: id, Status: metapb.StoreState_Up, }) + allStores[id] = struct{}{} } peers := []*metapb.Peer{ {Id: 4, StoreId: 1}, @@ -55,9 +58,9 @@ func newRegionSplit(config *sc.SimConfig) *Case { simCase.Events = []EventDescriptor{e} // Checker description - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - for i := 1; i <= totalStore; i++ { - peerCount := regions.GetStoreRegionCount(uint64(i)) + simCase.Checker = func(_ []*metapb.Store, regions *core.RegionsInfo, _ []info.StoreStats) bool { + for storeID := range allStores { + peerCount := regions.GetStoreRegionCount(storeID) if peerCount < 5 { return false } diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index 0296710b7050..e6c953dab87c 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -179,16 +179,18 @@ func (d *Driver) Tick() { // Check checks if the simulation is completed. func (d *Driver) Check() bool { length := uint64(len(d.conn.Nodes) + 1) - for index := range d.conn.Nodes { + var stores []*metapb.Store + for index, s := range d.conn.Nodes { if index >= length { length = index + 1 } + stores = append(stores, s.Store) } stats := make([]info.StoreStats, length) for index, node := range d.conn.Nodes { stats[index] = *node.stats } - return d.simCase.Checker(d.raftEngine.regionsInfo, stats) + return d.simCase.Checker(stores, d.raftEngine.regionsInfo, stats) } // Start starts all nodes. 
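The thread running through these simulator changes is the new CheckerFunc signature: each case now receives the live store list and drops removed stores from its own bookkeeping before judging balance, instead of iterating over store IDs 1..N. The minimal sketch below shows that pattern in isolation; simStore and roughlyUniform are simplified placeholders for *metapb.Store and the cases' isUniform helper, not the simulator's own types.

package main

import "fmt"

// simStore is a simplified stand-in for *metapb.Store: only the ID and the
// removed flag matter for this pattern.
type simStore struct {
	ID      uint64
	Removed bool
}

// pruneRemoved mirrors the bookkeeping repeated in the updated cases: any
// store reported as removed is dropped from the tracked set before the
// per-store checks run, so retired stores never skew the result.
func pruneRemoved(tracked map[uint64]struct{}, stores []simStore) {
	for _, s := range stores {
		if s.Removed {
			delete(tracked, s.ID)
		}
	}
}

// roughlyUniform is a placeholder for the cases' isUniform helper; the real
// tolerance lives in cases.go and is not reproduced here.
func roughlyUniform(count, mean int) bool {
	return float64(count) >= 0.7*float64(mean) && float64(count) <= 1.3*float64(mean)
}

func main() {
	tracked := map[uint64]struct{}{1: {}, 2: {}, 3: {}}
	leaderCount := map[uint64]int{1: 52, 2: 48, 3: 0} // store 3 has been removed

	pruneRemoved(tracked, []simStore{{ID: 3, Removed: true}})

	balanced := true
	for id := range tracked {
		if !roughlyUniform(leaderCount[id], 100/len(tracked)) {
			balanced = false
		}
	}
	fmt.Println("balanced:", balanced) // true: only surviving stores are counted
}
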
diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 8e01a8f5f40e..7b818a27b229 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -19,6 +19,7 @@ import ( "fmt" "math/rand" "net/http" + "strconv" "sync" "github.com/pingcap/kvproto/pkg/metapb" @@ -74,7 +75,12 @@ func (e *eventHandler) createEvent(w http.ResponseWriter, r *http.Request) { e.er.addEvent(&AddNode{}) return case "down-node": - e.er.addEvent(&DownNode{}) + id := r.URL.Query().Get("node-id") + var ID int + if len(id) != 0 { + ID, _ = strconv.Atoi(id) + } + e.er.addEvent(&DownNode{ID: ID}) return default: } @@ -202,17 +208,25 @@ func (*AddNode) Run(raft *RaftEngine, _ int64) bool { } // DownNode deletes nodes. -type DownNode struct{} +type DownNode struct { + ID int +} // Run implements the event interface. -func (*DownNode) Run(raft *RaftEngine, _ int64) bool { - nodes := raft.conn.getNodes() +func (e *DownNode) Run(raft *RaftEngine, _ int64) bool { + nodes := raft.conn.Nodes if len(nodes) == 0 { simutil.Logger.Error("can not find any node") return false } - i := rand.Intn(len(nodes)) - node := nodes[i] + var node *Node + if e.ID == 0 { + arrNodes := raft.conn.getNodes() + i := rand.Intn(len(arrNodes)) + node = nodes[arrNodes[i].Store.GetId()] + } else { + node = nodes[uint64(e.ID)] + } if node == nil { simutil.Logger.Error("node is not existed", zap.Uint64("node-id", node.Id)) return false From 8215f3088c7373512909d60c9d0ae7d9a577091b Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 21 Jun 2024 18:01:19 +0800 Subject: [PATCH 43/47] tools/tso: add option `useTSOServerProxy` in pd-tso-bench (#7756) ref tikv/pd#8135 Signed-off-by: lhy1024 Co-authored-by: Hu# Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- client/client.go | 14 ++++++++++++++ client/option.go | 13 +++++++------ tools/pd-tso-bench/main.go | 4 ++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/client/client.go b/client/client.go index 92cbd3d523fc..8c8299daeabb 100644 --- a/client/client.go +++ b/client/client.go @@ -268,6 +268,15 @@ func WithForwardingOption(enableForwarding bool) ClientOption { } } +// WithTSOServerProxyOption configures the client to use TSO server proxy, +// i.e., the client will send TSO requests to the API leader (the TSO server +// proxy) which will forward the requests to the TSO servers. +func WithTSOServerProxyOption(useTSOServerProxy bool) ClientOption { + return func(c *client) { + c.option.useTSOServerProxy = useTSOServerProxy + } +} + // WithMaxErrorRetry configures the client max retry times when connect meets error. func WithMaxErrorRetry(count int) ClientOption { return func(c *client) { @@ -648,6 +657,11 @@ func (c *client) setServiceMode(newMode pdpb.ServiceMode) { c.Lock() defer c.Unlock() + if c.option.useTSOServerProxy { + // If we are using TSO server proxy, we always use PD_SVC_MODE. + newMode = pdpb.ServiceMode_PD_SVC_MODE + } + if newMode == c.serviceMode { return } diff --git a/client/option.go b/client/option.go index 2a6c285cfb74..0109bfc4ed09 100644 --- a/client/option.go +++ b/client/option.go @@ -51,12 +51,13 @@ const ( // It provides the ability to change some PD client's options online from the outside. type option struct { // Static options. 
- gRPCDialOptions []grpc.DialOption - timeout time.Duration - maxRetryTimes int - enableForwarding bool - metricsLabels prometheus.Labels - initMetrics bool + gRPCDialOptions []grpc.DialOption + timeout time.Duration + maxRetryTimes int + enableForwarding bool + useTSOServerProxy bool + metricsLabels prometheus.Labels + initMetrics bool // Dynamic options. dynamicOptions [dynamicOptionCount]atomic.Value diff --git a/tools/pd-tso-bench/main.go b/tools/pd-tso-bench/main.go index b4101bda2702..3726373779e5 100644 --- a/tools/pd-tso-bench/main.go +++ b/tools/pd-tso-bench/main.go @@ -62,6 +62,7 @@ var ( maxTSOSendIntervalMilliseconds = flag.Int("max-send-interval-ms", 0, "max tso send interval in milliseconds, 60s by default") keyspaceID = flag.Uint("keyspace-id", 0, "the id of the keyspace to access") keyspaceName = flag.String("keyspace-name", "", "the name of the keyspace to access") + useTSOServerProxy = flag.Bool("use-tso-server-proxy", false, "whether send tso requests to tso server proxy instead of tso service directly") wg sync.WaitGroup ) @@ -424,6 +425,9 @@ func createPDClient(ctx context.Context) (pd.Client, error) { ) opts := make([]pd.ClientOption, 0) + if *useTSOServerProxy { + opts = append(opts, pd.WithTSOServerProxyOption(true)) + } opts = append(opts, pd.WithGRPCDialOptions( grpc.WithKeepaliveParams(keepalive.ClientParameters{ Time: keepaliveTime, From debb5febb83e032da5a0c0162b1bff656a8fe931 Mon Sep 17 00:00:00 2001 From: Hu# Date: Mon, 24 Jun 2024 11:20:50 +0800 Subject: [PATCH 44/47] readme: update go version in readme (#8321) ref tikv/pd#4399 update go version in readme Signed-off-by: husharp --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cc4a6781b018..564e7adedd24 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ If you're interested in contributing to PD, see [CONTRIBUTING.md](./CONTRIBUTING ## Build -1. Make sure [*Go*](https://golang.org/) (version 1.20) is installed. -2. Use `make` to install PD. PD is installed in the `bin` directory. +1. Make sure [*Go*](https://golang.org/) (version 1.21) is installed. +2. Use `make` to install PD. `pd-server` will be installed in the `bin` directory. 
## Usage From 26e90e9fffdd3a89ea796d2f7ed3b70cb98ae870 Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Mon, 24 Jun 2024 16:52:21 +0800 Subject: [PATCH 45/47] scheduler: skip evict-leader-scheduler when setting schedule deny label (#8303) ref tikv/pd#7300, close tikv/pd#7853 - add a real cluster test to test `skip evict-leader-scheduler when setting schedule deny label` - add `DeleteStoreLabel` API and `DeleteScheduler` API Signed-off-by: okJiang <819421878@qq.com> --- .gitignore | 1 + Makefile | 1 + client/http/interface.go | 27 +++ client/http/request_info.go | 2 + .../schedulers/scheduler_controller.go | 6 +- server/cluster/cluster.go | 3 + tests/integrations/client/http_client_test.go | 12 +- tests/integrations/realcluster/Makefile | 9 +- tests/integrations/realcluster/deploy.sh | 9 +- tests/integrations/realcluster/pd.toml | 5 + .../realcluster/reboot_pd_test.go | 3 + .../realcluster/scheduler_test.go | 188 ++++++++++++++++++ .../realcluster/transfer_leader_test.go | 73 ------- tests/integrations/realcluster/wait_tiup.sh | 2 +- 14 files changed, 261 insertions(+), 80 deletions(-) create mode 100644 tests/integrations/realcluster/pd.toml create mode 100644 tests/integrations/realcluster/scheduler_test.go delete mode 100644 tests/integrations/realcluster/transfer_leader_test.go diff --git a/.gitignore b/.gitignore index b9be6099e244..fb9f0424418f 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ coverage *.txt go.work* embedded_assets_handler.go +*.log diff --git a/Makefile b/Makefile index dca000121148..5f5ac871f18b 100644 --- a/Makefile +++ b/Makefile @@ -280,6 +280,7 @@ test-tso-consistency: install-tools REAL_CLUSTER_TEST_PATH := $(ROOT_PATH)/tests/integrations/realcluster test-real-cluster: + @ rm -rf ~/.tiup/data/pd_real_cluster_test # testing with the real cluster... cd $(REAL_CLUSTER_TEST_PATH) && $(MAKE) check diff --git a/client/http/interface.go b/client/http/interface.go index 3684e19b1f50..f90ab19624fa 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -51,6 +51,7 @@ type Client interface { GetStore(context.Context, uint64) (*StoreInfo, error) DeleteStore(context.Context, uint64) error SetStoreLabels(context.Context, int64, map[string]string) error + DeleteStoreLabel(ctx context.Context, storeID int64, labelKey string) error GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ GetConfig(context.Context) (map[string]any, error) @@ -65,6 +66,7 @@ type Client interface { /* Scheduler-related interfaces */ GetSchedulers(context.Context) ([]string, error) CreateScheduler(ctx context.Context, name string, storeID uint64) error + DeleteScheduler(ctx context.Context, name string) error SetSchedulerDelay(context.Context, string, int64) error /* Rule-related interfaces */ GetAllPlacementRuleBundles(context.Context) ([]*GroupBundle, error) @@ -81,6 +83,10 @@ type Client interface { DeletePlacementRuleGroupByID(context.Context, string) error GetAllRegionLabelRules(context.Context) ([]*LabelRule, error) GetRegionLabelRulesByIDs(context.Context, []string) ([]*LabelRule, error) + // `SetRegionLabelRule` sets the label rule for a region. + // When a label rule (deny scheduler) is set, + // 1. All schedulers will be disabled except for the evict-leader-scheduler. + // 2. The merge-checker will be disabled, preventing these regions from being merged. 
SetRegionLabelRule(context.Context, *LabelRule) error PatchRegionLabelRules(context.Context, *LabelRulePatch) error /* Scheduling-related interfaces */ @@ -339,6 +345,19 @@ func (c *client) SetStoreLabels(ctx context.Context, storeID int64, storeLabels WithBody(jsonInput)) } +// DeleteStoreLabel deletes the labels of a store. +func (c *client) DeleteStoreLabel(ctx context.Context, storeID int64, labelKey string) error { + jsonInput, err := json.Marshal(labelKey) + if err != nil { + return errors.Trace(err) + } + return c.request(ctx, newRequestInfo(). + WithName(deleteStoreLabelName). + WithURI(LabelByStoreID(storeID)). + WithMethod(http.MethodDelete). + WithBody(jsonInput)) +} + // GetHealthStatus gets the health status of the cluster. func (c *client) GetHealthStatus(ctx context.Context) ([]Health, error) { var healths []Health @@ -762,6 +781,14 @@ func (c *client) CreateScheduler(ctx context.Context, name string, storeID uint6 WithBody(inputJSON)) } +// DeleteScheduler deletes a scheduler from PD cluster. +func (c *client) DeleteScheduler(ctx context.Context, name string) error { + return c.request(ctx, newRequestInfo(). + WithName(deleteSchedulerName). + WithURI(SchedulerByName(name)). + WithMethod(http.MethodDelete)) +} + // AccelerateSchedule accelerates the scheduling of the regions within the given key range. // The keys in the key range should be encoded in the hex bytes format (without encoding to the UTF-8 bytes). func (c *client) AccelerateSchedule(ctx context.Context, keyRange *KeyRange) error { diff --git a/client/http/request_info.go b/client/http/request_info.go index 40bd03682504..783220bcc60f 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -41,6 +41,7 @@ const ( getStoreName = "GetStore" deleteStoreName = "DeleteStore" setStoreLabelsName = "SetStoreLabels" + deleteStoreLabelName = "DeleteStoreLabel" getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" setConfigName = "SetConfig" @@ -53,6 +54,7 @@ const ( getReplicateConfigName = "GetReplicateConfig" getSchedulersName = "GetSchedulers" createSchedulerName = "CreateScheduler" + deleteSchedulerName = "DeleteScheduler" setSchedulerDelayName = "SetSchedulerDelay" getAllPlacementRuleBundlesName = "GetAllPlacementRuleBundles" getPlacementRuleBundleByGroupName = "GetPlacementRuleBundleByGroup" diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index ea480a06845e..04c74af7964f 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -456,6 +456,7 @@ func (s *ScheduleController) Stop() { // Schedule tries to create some operators. func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator { + _, isEvictLeaderScheduler := s.Scheduler.(*evictLeaderScheduler) retry: for i := 0; i < maxScheduleRetries; i++ { // no need to retry if schedule should stop to speed exit @@ -486,7 +487,10 @@ retry: if labelMgr == nil { continue } - if labelMgr.ScheduleDisabled(region) { + + // If the evict-leader-scheduler is disabled, it will obstruct the restart operation of tikv by the operator. 
+ // Refer: https://docs.pingcap.com/tidb-in-kubernetes/stable/restart-a-tidb-cluster#perform-a-graceful-restart-to-a-single-tikv-pod + if labelMgr.ScheduleDisabled(region) && !isEvictLeaderScheduler { denySchedulersByLabelerCounter.Inc() continue retry } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 534d8361b2a2..5c15856cec6f 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1206,6 +1206,9 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error { if store == nil { return errs.ErrInvalidStoreID.FastGenByArgs(storeID) } + if len(store.GetLabels()) == 0 { + return errors.Errorf("the label key %s does not exist", labelKey) + } newStore := typeutil.DeepClone(store.GetMeta(), core.StoreFactory) labels := make([]*metapb.StoreLabel, 0, len(newStore.GetLabels())-1) for _, label := range newStore.GetLabels() { diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index 229a658639d0..1d7d44886927 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -560,9 +560,14 @@ func (suite *httpClientTestSuite) TestSchedulers() { re.NoError(err) err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message + + re.NoError(client.DeleteScheduler(ctx, schedulerName)) + schedulers, err = client.GetSchedulers(ctx) + re.NoError(err) + re.NotContains(schedulers, schedulerName) } -func (suite *httpClientTestSuite) TestSetStoreLabels() { +func (suite *httpClientTestSuite) TestStoreLabels() { re := suite.Require() client := suite.client ctx, cancel := context.WithCancel(suite.ctx) @@ -590,6 +595,11 @@ func (suite *httpClientTestSuite) TestSetStoreLabels() { for key, value := range storeLabels { re.Equal(value, labelsMap[key]) } + + re.NoError(client.DeleteStoreLabel(ctx, firstStore.Store.ID, "zone")) + store, err := client.GetStore(ctx, uint64(firstStore.Store.ID)) + re.NoError(err) + re.Empty(store.Store.Labels) } func (suite *httpClientTestSuite) TestTransferLeader() { diff --git a/tests/integrations/realcluster/Makefile b/tests/integrations/realcluster/Makefile index e161d52a86e4..28c918ec2bf0 100644 --- a/tests/integrations/realcluster/Makefile +++ b/tests/integrations/realcluster/Makefile @@ -50,7 +50,14 @@ kill_cluster: fi test: - CGO_ENABLED=1 go test ./... -v -tags deadlock -race -cover || { exit 1; } + CGO_ENABLED=1 go test ./... -v -tags deadlock -race -cover || (\ + echo "follow is pd-0 log\n" ; \ + cat ~/.tiup/data/pd_real_cluster_test/pd-0/pd.log ; \ + echo "follow is pd-1 log\n" ; \ + cat ~/.tiup/data/pd_real_cluster_test/pd-1/pd.log ; \ + echo "follow is pd-2 log\n" ; \ + cat ~/.tiup/data/pd_real_cluster_test/pd-2/pd.log ; \ + exit 1) install-tools: cd $(ROOT_PATH) && $(MAKE) install-tools diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index 31bf17655f88..f6f567314f0d 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -1,6 +1,8 @@ #!/bin/bash # deploy `tiup playground` +set -x + TIUP_BIN_DIR=$HOME/.tiup/bin/tiup CUR_PATH=$(pwd) @@ -19,15 +21,16 @@ if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && color-green "downloading binaries..." color-green "this may take a few minutes, you can also download them manually and put them in the bin directory." 
make pd-server WITH_RACE=1 - $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \ - --pd.binpath ./bin/pd-server \ + $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_real_cluster_test \ + --pd.binpath ./bin/pd-server --pd.config ./tests/integrations/realcluster/pd.toml \ > $CUR_PATH/playground.log 2>&1 & else # CI will download the binaries in the prepare phase. # ref https://github.com/PingCAP-QE/ci/blob/387e9e533b365174962ccb1959442a7070f9cd66/pipelines/tikv/pd/latest/pull_integration_realcluster_test.groovy#L55-L68 color-green "using existing binaries..." $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor \ - --pd.binpath ./bin/pd-server --kv.binpath ./bin/tikv-server --db.binpath ./bin/tidb-server --tiflash.binpath ./bin/tiflash --tag pd_test \ + --pd.binpath ./bin/pd-server --kv.binpath ./bin/tikv-server --db.binpath ./bin/tidb-server \ + --tiflash.binpath ./bin/tiflash --tag pd_real_cluster_test --pd.config ./tests/integrations/realcluster/pd.toml \ > $CUR_PATH/playground.log 2>&1 & fi diff --git a/tests/integrations/realcluster/pd.toml b/tests/integrations/realcluster/pd.toml new file mode 100644 index 000000000000..876c7f13af2a --- /dev/null +++ b/tests/integrations/realcluster/pd.toml @@ -0,0 +1,5 @@ +[schedule] +patrol-region-interval = "100ms" + +[log] +level = "debug" diff --git a/tests/integrations/realcluster/reboot_pd_test.go b/tests/integrations/realcluster/reboot_pd_test.go index b8914e87bd86..14c86f2dedb8 100644 --- a/tests/integrations/realcluster/reboot_pd_test.go +++ b/tests/integrations/realcluster/reboot_pd_test.go @@ -51,6 +51,9 @@ func TestReloadLabel(t *testing.T) { storeLabels[label.Key] = label.Value } re.NoError(pdHTTPCli.SetStoreLabels(ctx, firstStore.Store.ID, storeLabels)) + defer func() { + re.NoError(pdHTTPCli.DeleteStoreLabel(ctx, firstStore.Store.ID, "zone")) + }() checkLabelsAreEqual := func() { resp, err := pdHTTPCli.GetStore(ctx, uint64(firstStore.Store.ID)) diff --git a/tests/integrations/realcluster/scheduler_test.go b/tests/integrations/realcluster/scheduler_test.go new file mode 100644 index 000000000000..0ed6f6c6b764 --- /dev/null +++ b/tests/integrations/realcluster/scheduler_test.go @@ -0,0 +1,188 @@ +// Copyright 2024 TiKV Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package realcluster + +import ( + "context" + "fmt" + "sort" + "testing" + "time" + + "github.com/stretchr/testify/require" + pd "github.com/tikv/pd/client/http" + "github.com/tikv/pd/client/testutil" + "github.com/tikv/pd/pkg/schedule/labeler" + "github.com/tikv/pd/pkg/schedule/schedulers" +) + +// https://github.com/tikv/pd/issues/6988#issuecomment-1694924611 +// https://github.com/tikv/pd/issues/6897 +func TestTransferLeader(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + resp, err := pdHTTPCli.GetLeader(ctx) + re.NoError(err) + oldLeader := resp.Name + + var newLeader string + for i := 0; i < 2; i++ { + if resp.Name != fmt.Sprintf("pd-%d", i) { + newLeader = fmt.Sprintf("pd-%d", i) + } + } + + // record scheduler + re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, 1)) + defer func() { + re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName)) + }() + res, err := pdHTTPCli.GetSchedulers(ctx) + re.NoError(err) + oldSchedulersLen := len(res) + + re.NoError(pdHTTPCli.TransferLeader(ctx, newLeader)) + // wait for transfer leader to new leader + time.Sleep(1 * time.Second) + resp, err = pdHTTPCli.GetLeader(ctx) + re.NoError(err) + re.Equal(newLeader, resp.Name) + + res, err = pdHTTPCli.GetSchedulers(ctx) + re.NoError(err) + re.Len(res, oldSchedulersLen) + + // transfer leader to old leader + re.NoError(pdHTTPCli.TransferLeader(ctx, oldLeader)) + // wait for transfer leader + time.Sleep(1 * time.Second) + resp, err = pdHTTPCli.GetLeader(ctx) + re.NoError(err) + re.Equal(oldLeader, resp.Name) + + res, err = pdHTTPCli.GetSchedulers(ctx) + re.NoError(err) + re.Len(res, oldSchedulersLen) +} + +func TestRegionLabelDenyScheduler(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + regions, err := pdHTTPCli.GetRegions(ctx) + re.NoError(err) + re.GreaterOrEqual(len(regions.Regions), 1) + region1 := regions.Regions[0] + + err = pdHTTPCli.DeleteScheduler(ctx, schedulers.BalanceLeaderName) + if err == nil { + defer func() { + pdHTTPCli.CreateScheduler(ctx, schedulers.BalanceLeaderName, 0) + }() + } + + re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.GrantLeaderName, uint64(region1.Leader.StoreID))) + defer func() { + pdHTTPCli.DeleteScheduler(ctx, schedulers.GrantLeaderName) + }() + + // wait leader transfer + testutil.Eventually(re, func() bool { + regions, err := pdHTTPCli.GetRegions(ctx) + re.NoError(err) + for _, region := range regions.Regions { + if region.Leader.StoreID != region1.Leader.StoreID { + return false + } + } + return true + }, testutil.WithWaitFor(time.Minute)) + + // disable schedule for region1 + labelRule := &pd.LabelRule{ + ID: "rule1", + Labels: []pd.RegionLabel{{Key: "schedule", Value: "deny"}}, + RuleType: "key-range", + Data: labeler.MakeKeyRanges(region1.StartKey, region1.EndKey), + } + re.NoError(pdHTTPCli.SetRegionLabelRule(ctx, labelRule)) + defer func() { + pdHTTPCli.PatchRegionLabelRules(ctx, &pd.LabelRulePatch{DeleteRules: []string{labelRule.ID}}) + }() + labelRules, err := pdHTTPCli.GetAllRegionLabelRules(ctx) + re.NoError(err) + re.Len(labelRules, 2) + sort.Slice(labelRules, func(i, j int) bool { + return labelRules[i].ID < labelRules[j].ID + }) + re.Equal(labelRule.ID, labelRules[1].ID) + re.Equal(labelRule.Labels, labelRules[1].Labels) + re.Equal(labelRule.RuleType, labelRules[1].RuleType) + + // enable evict leader scheduler, and check it works + re.NoError(pdHTTPCli.DeleteScheduler(ctx, 
schedulers.GrantLeaderName)) + re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, uint64(region1.Leader.StoreID))) + defer func() { + pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName) + }() + testutil.Eventually(re, func() bool { + regions, err := pdHTTPCli.GetRegions(ctx) + re.NoError(err) + for _, region := range regions.Regions { + if region.Leader.StoreID == region1.Leader.StoreID { + return false + } + } + return true + }, testutil.WithWaitFor(time.Minute)) + + re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName)) + re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.GrantLeaderName, uint64(region1.Leader.StoreID))) + defer func() { + pdHTTPCli.DeleteScheduler(ctx, schedulers.GrantLeaderName) + }() + testutil.Eventually(re, func() bool { + regions, err := pdHTTPCli.GetRegions(ctx) + re.NoError(err) + for _, region := range regions.Regions { + if region.ID == region1.ID { + continue + } + if region.Leader.StoreID != region1.Leader.StoreID { + return false + } + } + return true + }, testutil.WithWaitFor(time.Minute)) + + pdHTTPCli.PatchRegionLabelRules(ctx, &pd.LabelRulePatch{DeleteRules: []string{labelRule.ID}}) + labelRules, err = pdHTTPCli.GetAllRegionLabelRules(ctx) + re.NoError(err) + re.Len(labelRules, 1) + + testutil.Eventually(re, func() bool { + regions, err := pdHTTPCli.GetRegions(ctx) + re.NoError(err) + for _, region := range regions.Regions { + if region.Leader.StoreID != region1.Leader.StoreID { + return false + } + } + return true + }, testutil.WithWaitFor(time.Minute)) +} diff --git a/tests/integrations/realcluster/transfer_leader_test.go b/tests/integrations/realcluster/transfer_leader_test.go deleted file mode 100644 index 0000f7e14a54..000000000000 --- a/tests/integrations/realcluster/transfer_leader_test.go +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 TiKV Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package realcluster - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/stretchr/testify/require" -) - -// https://github.com/tikv/pd/issues/6988#issuecomment-1694924611 -// https://github.com/tikv/pd/issues/6897 -func TestTransferLeader(t *testing.T) { - re := require.New(t) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - resp, err := pdHTTPCli.GetLeader(ctx) - re.NoError(err) - oldLeader := resp.Name - - var newLeader string - for i := 0; i < 2; i++ { - if resp.Name != fmt.Sprintf("pd-%d", i) { - newLeader = fmt.Sprintf("pd-%d", i) - } - } - - // record scheduler - err = pdHTTPCli.CreateScheduler(ctx, "evict-leader-scheduler", 1) - re.NoError(err) - res, err := pdHTTPCli.GetSchedulers(ctx) - re.NoError(err) - oldSchedulersLen := len(res) - - re.NoError(pdHTTPCli.TransferLeader(ctx, newLeader)) - // wait for transfer leader to new leader - time.Sleep(1 * time.Second) - resp, err = pdHTTPCli.GetLeader(ctx) - re.NoError(err) - re.Equal(newLeader, resp.Name) - - res, err = pdHTTPCli.GetSchedulers(ctx) - re.NoError(err) - re.Len(res, oldSchedulersLen) - - // transfer leader to old leader - re.NoError(pdHTTPCli.TransferLeader(ctx, oldLeader)) - // wait for transfer leader - time.Sleep(1 * time.Second) - resp, err = pdHTTPCli.GetLeader(ctx) - re.NoError(err) - re.Equal(oldLeader, resp.Name) - - res, err = pdHTTPCli.GetSchedulers(ctx) - re.NoError(err) - re.Len(res, oldSchedulersLen) -} diff --git a/tests/integrations/realcluster/wait_tiup.sh b/tests/integrations/realcluster/wait_tiup.sh index 497774f9e969..3a8c02a969e1 100755 --- a/tests/integrations/realcluster/wait_tiup.sh +++ b/tests/integrations/realcluster/wait_tiup.sh @@ -12,7 +12,7 @@ fi for ((i=0; i<${MAX_TIMES}; i++)); do sleep ${INTERVAL} - $TIUP_BIN_DIR playground display --tag pd_test + $TIUP_BIN_DIR playground display --tag pd_real_cluster_test if [ $? 
-eq 0 ]; then exit 0 fi From 0ec084bacdd1c5e8221fbe9aa04547b5f30b6217 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 24 Jun 2024 17:47:51 +0800 Subject: [PATCH 46/47] mcs: fix tso server address compare (#8289) close tikv/pd#8284 Signed-off-by: lhy1024 --- pkg/keyspace/tso_keyspace_group.go | 2 +- pkg/mcs/tso/server/grpc_service.go | 3 +-- pkg/storage/endpoint/tso_keyspace_group.go | 4 ++-- pkg/tso/keyspace_group_manager.go | 8 ++++---- pkg/tso/keyspace_group_manager_test.go | 2 +- pkg/utils/typeutil/comparison.go | 10 ++++++---- server/apiv2/handlers/tso_keyspace_group.go | 2 +- 7 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pkg/keyspace/tso_keyspace_group.go b/pkg/keyspace/tso_keyspace_group.go index 29b8add740cc..a04d73924260 100644 --- a/pkg/keyspace/tso_keyspace_group.go +++ b/pkg/keyspace/tso_keyspace_group.go @@ -876,7 +876,7 @@ func (m *GroupManager) SetPriorityForKeyspaceGroup(id uint32, node string, prior inKeyspaceGroup := false members := make([]endpoint.KeyspaceGroupMember, 0, len(kg.Members)) for _, member := range kg.Members { - if member.CompareAddress(node) { + if member.IsAddressEquivalent(node) { inKeyspaceGroup = true member.Priority = priority } diff --git a/pkg/mcs/tso/server/grpc_service.go b/pkg/mcs/tso/server/grpc_service.go index 03250d9ed37f..0a075a45c050 100644 --- a/pkg/mcs/tso/server/grpc_service.go +++ b/pkg/mcs/tso/server/grpc_service.go @@ -19,7 +19,6 @@ import ( "io" "net/http" "strconv" - "strings" "time" "github.com/pingcap/errors" @@ -164,7 +163,7 @@ func (s *Service) FindGroupByKeyspaceID( Address: member.Address, // TODO: watch the keyspace groups' primary serving address changes // to get the latest primary serving addresses of all keyspace groups. - IsPrimary: strings.EqualFold(member.Address, am.GetLeaderAddr()), + IsPrimary: member.IsAddressEquivalent(am.GetLeaderAddr()), }) } diff --git a/pkg/storage/endpoint/tso_keyspace_group.go b/pkg/storage/endpoint/tso_keyspace_group.go index d24b6e0dd1a4..5b6a7481176b 100644 --- a/pkg/storage/endpoint/tso_keyspace_group.go +++ b/pkg/storage/endpoint/tso_keyspace_group.go @@ -81,11 +81,11 @@ type KeyspaceGroupMember struct { Priority int `json:"priority"` } -// CompareAddress compares the address with the given address. +// IsAddressEquivalent compares the address with the given address. // It compares the address without the scheme. // Otherwise, it will not work when we update the scheme from http to https. 
// Issue: https://github.com/tikv/pd/issues/8284 -func (m *KeyspaceGroupMember) CompareAddress(addr string) bool { +func (m *KeyspaceGroupMember) IsAddressEquivalent(addr string) bool { return typeutil.EqualBaseURLs(m.Address, addr) } diff --git a/pkg/tso/keyspace_group_manager.go b/pkg/tso/keyspace_group_manager.go index ae4cca833746..83a1369d2f21 100644 --- a/pkg/tso/keyspace_group_manager.go +++ b/pkg/tso/keyspace_group_manager.go @@ -290,7 +290,7 @@ func (s *state) getNextPrimaryToReset( if member.Priority > maxPriority { maxPriority = member.Priority } - if member.CompareAddress(localAddress) { + if member.IsAddressEquivalent(localAddress) { localPriority = member.Priority } } @@ -625,7 +625,7 @@ func (kgm *KeyspaceGroupManager) primaryPriorityCheckLoop() { if member != nil { aliveTSONodes := make(map[string]struct{}) kgm.tsoNodes.Range(func(key, _ any) bool { - aliveTSONodes[key.(string)] = struct{}{} + aliveTSONodes[typeutil.TrimScheme(key.(string))] = struct{}{} return true }) if len(aliveTSONodes) == 0 { @@ -638,7 +638,7 @@ func (kgm *KeyspaceGroupManager) primaryPriorityCheckLoop() { if member.Priority <= localPriority { continue } - if _, ok := aliveTSONodes[member.Address]; ok { + if _, ok := aliveTSONodes[typeutil.TrimScheme(member.Address)]; ok { resetLeader = true break } @@ -667,7 +667,7 @@ func (kgm *KeyspaceGroupManager) primaryPriorityCheckLoop() { func (kgm *KeyspaceGroupManager) isAssignedToMe(group *endpoint.KeyspaceGroup) bool { return slice.AnyOf(group.Members, func(i int) bool { - return group.Members[i].CompareAddress(kgm.tsoServiceID.ServiceAddr) + return group.Members[i].IsAddressEquivalent(kgm.tsoServiceID.ServiceAddr) }) } diff --git a/pkg/tso/keyspace_group_manager_test.go b/pkg/tso/keyspace_group_manager_test.go index 0e237fb32f0f..fc057409c2aa 100644 --- a/pkg/tso/keyspace_group_manager_test.go +++ b/pkg/tso/keyspace_group_manager_test.go @@ -891,7 +891,7 @@ func collectAssignedKeyspaceGroupIDs(re *require.Assertions, kgm *KeyspaceGroupM re.Equal(i, int(am.kgID)) re.Equal(i, int(kg.ID)) for _, m := range kg.Members { - if m.CompareAddress(kgm.tsoServiceID.ServiceAddr) { + if m.IsAddressEquivalent(kgm.tsoServiceID.ServiceAddr) { ids = append(ids, uint32(i)) break } diff --git a/pkg/utils/typeutil/comparison.go b/pkg/utils/typeutil/comparison.go index d86e50543294..f4fb602a2f7d 100644 --- a/pkg/utils/typeutil/comparison.go +++ b/pkg/utils/typeutil/comparison.go @@ -82,8 +82,10 @@ func Float64Equal(a, b float64) bool { // EqualBaseURLs compares two URLs without scheme. func EqualBaseURLs(url1, url2 string) bool { - trimScheme := func(s string) string { - return strings.TrimPrefix(strings.TrimPrefix(s, "https://"), "http://") - } - return trimScheme(url1) == trimScheme(url2) + return TrimScheme(url1) == TrimScheme(url2) +} + +// TrimScheme trims the scheme from the URL. 
+func TrimScheme(s string) string { + return strings.TrimPrefix(strings.TrimPrefix(s, "https://"), "http://") } diff --git a/server/apiv2/handlers/tso_keyspace_group.go b/server/apiv2/handlers/tso_keyspace_group.go index ed3d37c27a9c..6dafc98e603e 100644 --- a/server/apiv2/handlers/tso_keyspace_group.go +++ b/server/apiv2/handlers/tso_keyspace_group.go @@ -515,7 +515,7 @@ func SetPriorityForKeyspaceGroup(c *gin.Context) { // check if node exists members := kg.Members if slice.NoneOf(members, func(i int) bool { - return members[i].CompareAddress(node) + return members[i].IsAddressEquivalent(node) }) { c.AbortWithStatusJSON(http.StatusBadRequest, "tso node does not exist in the keyspace group") } From 49c3b5a819bc477b1672655dd6d157406a8d4acf Mon Sep 17 00:00:00 2001 From: Hu# Date: Tue, 25 Jun 2024 13:35:51 +0800 Subject: [PATCH 47/47] test/cluster_test: make TestRegionStatistics more stable (#8320) close tikv/pd#8319 Because this test used only 2 PDs and resigned the leader five times, any network or disk hiccup could expire the lease and trigger an extra leader resignation, which in turn was likely to be flagged as campaigning too frequently and make the leader transfer fail. Signed-off-by: husharp --- tests/server/cluster/cluster_test.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index e03ef2fe3184..dc972988b6b3 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -186,7 +186,7 @@ func TestRegionStatistics(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - tc, err := tests.NewTestCluster(ctx, 2) + tc, err := tests.NewTestCluster(ctx, 3) defer tc.Destroy() re.NoError(err) @@ -223,9 +223,9 @@ func TestRegionStatistics(t *testing.T) { time.Sleep(1000 * time.Millisecond) leaderServer.ResignLeader() - newLeaderName := tc.WaitLeader() - re.NotEqual(newLeaderName, leaderName) + re.NotEqual(tc.WaitLeader(), leaderName) leaderServer = tc.GetLeaderServer() + leaderName = leaderServer.GetServer().Name() rc = leaderServer.GetRaftCluster() r := rc.GetRegion(region.Id) re.NotNil(r) @@ -238,9 +238,9 @@ func TestRegionStatistics(t *testing.T) { re.Len(regions, 1) leaderServer.ResignLeader() - newLeaderName = tc.WaitLeader() - re.Equal(newLeaderName, leaderName) + re.NotEqual(tc.WaitLeader(), leaderName) leaderServer = tc.GetLeaderServer() + leaderName = leaderServer.GetServer().Name() rc = leaderServer.GetRaftCluster() re.NotNil(r) re.True(r.LoadedFromStorage() || r.LoadedFromSync()) @@ -255,13 +255,12 @@ func TestRegionStatistics(t *testing.T) { re.False(r.LoadedFromStorage() && r.LoadedFromSync()) leaderServer.ResignLeader() - newLeaderName = tc.WaitLeader() - re.NotEqual(newLeaderName, leaderName) - leaderServer.ResignLeader() - newLeaderName = tc.WaitLeader() - re.Equal(newLeaderName, leaderName) + re.NotEqual(tc.WaitLeader(), leaderName) leaderServer = tc.GetLeaderServer() - rc = leaderServer.GetRaftCluster() + leaderName = leaderServer.GetServer().Name() + leaderServer.ResignLeader() + re.NotEqual(tc.WaitLeader(), leaderName) + rc = tc.GetLeaderServer().GetRaftCluster() r = rc.GetRegion(region.Id) re.NotNil(r) re.False(r.LoadedFromStorage() && r.LoadedFromSync())
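The reworked TestRegionStatistics drops the old assumption that leadership alternates between exactly two members; with three PDs it only asserts that each resign produced a different leader, then refreshes the tracked name. A hypothetical helper expressing that repeated sequence (the name resignAndWait and its placement are assumptions, not part of the patch) could look like:

package cluster_test

import (
	"github.com/stretchr/testify/require"
	"github.com/tikv/pd/tests"
)

// resignAndWait resigns the current leader, waits for a new election, checks
// that leadership actually moved, and returns the refreshed leader and name.
func resignAndWait(re *require.Assertions, tc *tests.TestCluster, oldName string) (*tests.TestServer, string) {
	tc.GetLeaderServer().ResignLeader()
	re.NotEqual(tc.WaitLeader(), oldName)
	leader := tc.GetLeaderServer()
	return leader, leader.GetServer().Name()
}

Centralizing the resign-wait-verify step this way keeps the test from encoding any particular leader rotation order, which is exactly the brittleness the patch removes.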