diff --git a/client/client.go b/client/client.go index 74cb7adf2a58..067872d2d391 100644 --- a/client/client.go +++ b/client/client.go @@ -272,14 +272,6 @@ func WithInitMetricsOption(initMetrics bool) ClientOption { } } -// WithAllowTSOFallback configures the client with `allowTSOFallback` option. -// NOTICE: This should only be used for testing. -func WithAllowTSOFallback() ClientOption { - return func(c *client) { - c.option.allowTSOFallback = true - } -} - var _ Client = (*client)(nil) // serviceModeKeeper is for service mode switching. diff --git a/client/option.go b/client/option.go index 9d46c7b1a708..d6a6d61d2f93 100644 --- a/client/option.go +++ b/client/option.go @@ -54,7 +54,6 @@ type option struct { enableForwarding bool metricsLabels prometheus.Labels initMetrics bool - allowTSOFallback bool // Dynamic options. dynamicOptions [dynamicOptionCount]atomic.Value diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index 6791344eaafe..e4c5bf3c77ab 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -796,22 +796,7 @@ func (c *tsoClient) compareAndSwapTS( // all TSOs we get will be [6, 7, 8, 9, 10]. lastTSOInfo.logical stores the logical part of the largest ts returned // last time. if tsoutil.TSLessEqual(physical, firstLogical, lastTSOInfo.physical, lastTSOInfo.logical) { - if !c.option.allowTSOFallback { - log.Panic("[tso] timestamp fallback", - zap.String("dc-location", dcLocation), - zap.Uint32("keyspace", c.svcDiscovery.GetKeyspaceID()), - zap.String("last-ts", fmt.Sprintf("(%d, %d)", lastTSOInfo.physical, lastTSOInfo.logical)), - zap.String("cur-ts", fmt.Sprintf("(%d, %d)", physical, firstLogical)), - zap.String("last-tso-server", lastTSOInfo.tsoServer), - zap.String("cur-tso-server", curTSOInfo.tsoServer), - zap.Uint32("last-keyspace-group-in-request", lastTSOInfo.reqKeyspaceGroupID), - zap.Uint32("cur-keyspace-group-in-request", curTSOInfo.reqKeyspaceGroupID), - zap.Uint32("last-keyspace-group-in-response", lastTSOInfo.respKeyspaceGroupID), - zap.Uint32("cur-keyspace-group-in-response", curTSOInfo.respKeyspaceGroupID), - zap.Time("last-response-received-at", lastTSOInfo.respReceivedAt), - zap.Time("cur-response-received-at", curTSOInfo.respReceivedAt)) - } - log.Error("[tso] timestamp fallback", + log.Panic("[tso] timestamp fallback", zap.String("dc-location", dcLocation), zap.Uint32("keyspace", c.svcDiscovery.GetKeyspaceID()), zap.String("last-ts", fmt.Sprintf("(%d, %d)", lastTSOInfo.physical, lastTSOInfo.logical)), diff --git a/errors.toml b/errors.toml index 06848a79d1e3..95324df18889 100644 --- a/errors.toml +++ b/errors.toml @@ -821,6 +821,11 @@ error = ''' sync max ts failed, %s ''' +["PD:tso:ErrUpdateTimestamp"] +error = ''' +update timestamp failed, %s +''' + ["PD:typeutil:ErrBytesToUint64"] error = ''' invalid data, must 8 bytes, but %d diff --git a/go.mod b/go.mod index 27182c0e1d32..503ac53f33fa 100644 --- a/go.mod +++ b/go.mod @@ -30,7 +30,7 @@ require ( github.com/pingcap/errcode v0.3.0 github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 + github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 github.com/pingcap/tidb-dashboard v0.0.0-20230705095454-5e220f970f27 @@ -145,7 +145,7 @@ require ( github.com/pelletier/go-toml/v2 v2.0.1 // indirect 
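
The client change above drops the `allowTSOFallback` escape hatch, so any timestamp regression detected in `compareAndSwapTS` now panics unconditionally. Below is a minimal, self-contained sketch of the (physical, logical) comparison that the fallback check relies on; `tsLessEqual` is a local stand-in for `tsoutil.TSLessEqual`, written under the assumption that a TSO is ordered first by its physical (millisecond) part and then by its logical counter.

```go
package main

import "fmt"

// tsLessEqual is a local stand-in for tsoutil.TSLessEqual: a TSO is ordered
// first by its physical part and then by its logical counter.
func tsLessEqual(physical, logical, thatPhysical, thatLogical int64) bool {
	if physical == thatPhysical {
		return logical <= thatLogical
	}
	return physical < thatPhysical
}

func main() {
	// The largest TSO handed out last time vs. the first TSO of the new batch.
	lastPhysical, lastLogical := int64(10_000), int64(9)
	curPhysical, curLogical := int64(10_000), int64(5)
	if tsLessEqual(curPhysical, curLogical, lastPhysical, lastLogical) {
		// After this change the real client calls log.Panic here, with no opt-out.
		fmt.Println("timestamp fallback detected")
	}
}
```
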
github.com/petermattis/goid v0.0.0-20211229010228-4d14c490ee36 // indirect github.com/pingcap/tipb v0.0.0-20220718022156-3e2483c20a9e // indirect - github.com/pkg/errors v0.9.1 + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect github.com/prometheus/client_model v0.2.0 // indirect diff --git a/go.sum b/go.sum index e2da3969165d..e826e53af373 100644 --- a/go.sum +++ b/go.sum @@ -439,8 +439,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 h1:EvqKcDT7ceGLW0mXqM8Cp5Z8DfgQRnwj2YTnlCLj2QI= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 h1:Gn8rf2Mb3QDifUQHdtcopqKclc9L11hjhZFYBE65lcw= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/pd.code-workspace b/pd.code-workspace index c4603084a93b..d6110b56a09a 100644 --- a/pd.code-workspace +++ b/pd.code-workspace @@ -23,6 +23,10 @@ { "name": "pd-tso-bench", "path": "tools/pd-tso-bench" + }, + { + "name": "pd-api-bench", + "path": "tools/pd-api-bench" } ], "settings": {} diff --git a/pkg/audit/audit_test.go b/pkg/audit/audit_test.go index 3cb43ceead0a..20f8c9344f7e 100644 --- a/pkg/audit/audit_test.go +++ b/pkg/audit/audit_test.go @@ -24,11 +24,11 @@ import ( "testing" "time" - "github.com/pingcap/log" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/requestutil" + "github.com/tikv/pd/pkg/utils/testutil" ) func TestLabelMatcher(t *testing.T) { @@ -93,8 +93,8 @@ func TestLocalLogBackendUsingFile(t *testing.T) { t.Parallel() re := require.New(t) backend := NewLocalLogBackend(true) - fname := initLog() - defer os.Remove(fname) + fname := testutil.InitTempFileLogger("info") + defer os.RemoveAll(fname) req, _ := http.NewRequest(http.MethodGet, "http://127.0.0.1:2379/test?test=test", strings.NewReader("testBody")) re.False(backend.ProcessHTTPRequest(req)) info := requestutil.GetRequestInfo(req) @@ -125,8 +125,8 @@ func BenchmarkLocalLogAuditUsingTerminal(b *testing.B) { func BenchmarkLocalLogAuditUsingFile(b *testing.B) { b.StopTimer() backend := NewLocalLogBackend(true) - fname := initLog() - defer os.Remove(fname) + fname := testutil.InitTempFileLogger("info") + defer os.RemoveAll(fname) req, _ := http.NewRequest(http.MethodGet, "http://127.0.0.1:2379/test?test=test", strings.NewReader("testBody")) b.StartTimer() for i := 0; i < b.N; i++ { @@ -135,15 +135,3 @@ func BenchmarkLocalLogAuditUsingFile(b *testing.B) { 
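
The audit tests above switch from a local `initLog()` helper to the shared `testutil.InitTempFileLogger`. The deleted `initLog` body (shown just below) suggests what that helper does; the following is a sketch modeled on it, with the log level parameterized — the exact placement and signature of the real helper are assumptions taken from the call sites in this diff.

```go
package testutil // hypothetical placement, mirroring pkg/utils/testutil in this repo

import (
	"os"

	"github.com/pingcap/log"
)

// InitTempFileLogger redirects the global logger to a fresh temp file at the
// given level and returns the file name so callers can read it back and
// remove it when the test finishes.
func InitTempFileLogger(level string) (fname string) {
	cfg := &log.Config{}
	f, _ := os.CreateTemp("", "pd_tests")
	fname = f.Name()
	f.Close()
	cfg.File.Filename = fname
	cfg.Level = level
	lg, p, _ := log.InitLogger(cfg)
	log.ReplaceGlobals(lg, p)
	return fname
}
```
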
backend.ProcessHTTPRequest(req) } } - -func initLog() string { - cfg := &log.Config{} - f, _ := os.CreateTemp("/tmp", "pd_tests") - fname := f.Name() - f.Close() - cfg.File.Filename = fname - cfg.Level = "info" - lg, p, _ := log.InitLogger(cfg) - log.ReplaceGlobals(lg, p) - return fname -} diff --git a/pkg/basicserver/basic_server.go b/pkg/basicserver/basic_server.go index afb56c2edd91..28ba3ad08de3 100644 --- a/pkg/basicserver/basic_server.go +++ b/pkg/basicserver/basic_server.go @@ -44,5 +44,5 @@ type Server interface { // IsServing returns whether the server is the leader, if there is embedded etcd, or the primary otherwise. IsServing() bool // AddServiceReadyCallback adds callbacks when the server becomes the leader, if there is embedded etcd, or the primary otherwise. - AddServiceReadyCallback(callbacks ...func(context.Context)) + AddServiceReadyCallback(callbacks ...func(context.Context) error) } diff --git a/pkg/core/region.go b/pkg/core/region.go index 3123c4704f60..e17e3932084e 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -147,8 +147,9 @@ const ( func RegionFromHeartbeat(heartbeat *pdpb.RegionHeartbeatRequest, opts ...RegionCreateOption) *RegionInfo { // Convert unit to MB. // If region isn't empty and less than 1MB, use 1MB instead. - // The size of empty region will be correct by the previous RegionInfo. regionSize := heartbeat.GetApproximateSize() / units.MiB + // Due to https://github.com/tikv/tikv/pull/11170, if region size is not initialized, + // approximate size will be zero, and region size is zero not EmptyRegionApproximateSize if heartbeat.GetApproximateSize() > 0 && regionSize < EmptyRegionApproximateSize { regionSize = EmptyRegionApproximateSize } @@ -193,19 +194,9 @@ func RegionFromHeartbeat(heartbeat *pdpb.RegionHeartbeatRequest, opts ...RegionC return region } -// Inherit inherits the buckets and region size from the parent region if bucket enabled. -// correct approximate size and buckets by the previous size if here exists a reported RegionInfo. -// See https://github.com/tikv/tikv/issues/11114 -func (r *RegionInfo) Inherit(origin *RegionInfo, bucketEnable bool) { - // regionSize should not be zero if region is not empty. - if r.GetApproximateSize() == 0 { - if origin != nil { - r.approximateSize = origin.approximateSize - } else { - r.approximateSize = EmptyRegionApproximateSize - } - } - if bucketEnable && origin != nil && r.buckets == nil { +// InheritBuckets inherits the buckets from the parent region if bucket enabled. +func (r *RegionInfo) InheritBuckets(origin *RegionInfo) { + if origin != nil && r.buckets == nil { r.buckets = origin.buckets } } @@ -515,6 +506,13 @@ func (r *RegionInfo) GetApproximateSize() int64 { return r.approximateSize } +// IsEmptyRegion returns whether the region is empty. +func (r *RegionInfo) IsEmptyRegion() bool { + // When cluster resumes, the region size may be not initialized, but region heartbeat is send. + // So use `==` here. + return r.approximateSize == EmptyRegionApproximateSize +} + // GetStorePeerApproximateKeys returns the approximate keys of the peer on the specified store. 
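
The heartbeat conversion above rounds a non-zero but sub-1MiB approximate size up to `EmptyRegionApproximateSize`, while a reported size of zero is kept as zero (TiKV may simply not have initialized it yet), which is why the new `IsEmptyRegion` compares with `==` rather than `<=`. A minimal sketch of that clamp, with `EmptyRegionApproximateSize` assumed to be 1 (MB) as in PD:

```go
package main

import "fmt"

// emptyRegionApproximateSize mirrors core.EmptyRegionApproximateSize (assumed 1 MB).
const emptyRegionApproximateSize = 1

const mib = 1 << 20

// clampRegionSize reproduces the conversion in RegionFromHeartbeat: sizes are
// reported in bytes and stored in MB; a non-empty region smaller than 1 MB is
// rounded up, while an uninitialized (zero) size stays zero so that
// IsEmptyRegion, which uses ==, does not treat it as empty.
func clampRegionSize(approximateSizeBytes uint64) int64 {
	regionSize := int64(approximateSizeBytes / mib)
	if approximateSizeBytes > 0 && regionSize < emptyRegionApproximateSize {
		regionSize = emptyRegionApproximateSize
	}
	return regionSize
}

func main() {
	fmt.Println(clampRegionSize(0))          // 0: size not initialized yet
	fmt.Println(clampRegionSize(512 * 1024)) // 1: small but non-empty region
	fmt.Println(clampRegionSize(96 * mib))   // 96
}
```
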
func (r *RegionInfo) GetStorePeerApproximateKeys(storeID uint64) int64 { peer := r.GetStorePeer(storeID) diff --git a/pkg/core/region_test.go b/pkg/core/region_test.go index c65d7628b0f9..3b58f5ee15a3 100644 --- a/pkg/core/region_test.go +++ b/pkg/core/region_test.go @@ -186,35 +186,9 @@ func TestSortedEqual(t *testing.T) { } } -func TestInherit(t *testing.T) { +func TestInheritBuckets(t *testing.T) { re := require.New(t) - // size in MB - // case for approximateSize - testCases := []struct { - originExists bool - originSize uint64 - size uint64 - expect uint64 - }{ - {false, 0, 0, 1}, - {false, 0, 2, 2}, - {true, 0, 2, 2}, - {true, 1, 2, 2}, - {true, 2, 0, 2}, - } - for _, testCase := range testCases { - var origin *RegionInfo - if testCase.originExists { - origin = NewRegionInfo(&metapb.Region{Id: 100}, nil) - origin.approximateSize = int64(testCase.originSize) - } - r := NewRegionInfo(&metapb.Region{Id: 100}, nil) - r.approximateSize = int64(testCase.size) - r.Inherit(origin, false) - re.Equal(int64(testCase.expect), r.approximateSize) - } - // bucket data := []struct { originBuckets *metapb.Buckets buckets *metapb.Buckets @@ -227,12 +201,11 @@ func TestInherit(t *testing.T) { for _, d := range data { origin := NewRegionInfo(&metapb.Region{Id: 100}, nil, SetBuckets(d.originBuckets)) r := NewRegionInfo(&metapb.Region{Id: 100}, nil) - r.Inherit(origin, true) + r.InheritBuckets(origin) re.Equal(d.originBuckets, r.GetBuckets()) // region will not inherit bucket keys. if origin.GetBuckets() != nil { newRegion := NewRegionInfo(&metapb.Region{Id: 100}, nil) - newRegion.Inherit(origin, false) re.NotEqual(d.originBuckets, newRegion.GetBuckets()) } } diff --git a/pkg/core/store_option.go b/pkg/core/store_option.go index c6038b5a824c..4d8864ea4788 100644 --- a/pkg/core/store_option.go +++ b/pkg/core/store_option.go @@ -74,33 +74,26 @@ func SetStoreDeployPath(deployPath string) StoreCreateOption { } } -// OfflineStore offline a store -func OfflineStore(physicallyDestroyed bool) StoreCreateOption { +// SetStoreState sets the state for the store. +func SetStoreState(state metapb.StoreState, physicallyDestroyed ...bool) StoreCreateOption { return func(store *StoreInfo) { meta := typeutil.DeepClone(store.meta, StoreFactory) - meta.State = metapb.StoreState_Offline - meta.NodeState = metapb.NodeState_Removing - meta.PhysicallyDestroyed = physicallyDestroyed - store.meta = meta - } -} - -// UpStore up a store -func UpStore() StoreCreateOption { - return func(store *StoreInfo) { - meta := typeutil.DeepClone(store.meta, StoreFactory) - meta.State = metapb.StoreState_Up - meta.NodeState = metapb.NodeState_Serving - store.meta = meta - } -} - -// TombstoneStore set a store to tombstone. 
-func TombstoneStore() StoreCreateOption { - return func(store *StoreInfo) { - meta := typeutil.DeepClone(store.meta, StoreFactory) - meta.State = metapb.StoreState_Tombstone - meta.NodeState = metapb.NodeState_Removed + switch state { + case metapb.StoreState_Up: + meta.State = metapb.StoreState_Up + meta.NodeState = metapb.NodeState_Serving + case metapb.StoreState_Offline: + if len(physicallyDestroyed) != 0 { + meta.State = metapb.StoreState_Offline + meta.NodeState = metapb.NodeState_Removing + meta.PhysicallyDestroyed = physicallyDestroyed[0] + } else { + panic("physicallyDestroyed should be set when set store state to offline") + } + case metapb.StoreState_Tombstone: + meta.State = metapb.StoreState_Tombstone + meta.NodeState = metapb.NodeState_Removed + } store.meta = meta } } diff --git a/pkg/core/store_test.go b/pkg/core/store_test.go index be0fd0f94185..67618a63ea98 100644 --- a/pkg/core/store_test.go +++ b/pkg/core/store_test.go @@ -84,7 +84,7 @@ func TestCloneStore(t *testing.T) { break } store.Clone( - UpStore(), + SetStoreState(metapb.StoreState_Up), SetLastHeartbeatTS(time.Now()), ) } diff --git a/pkg/election/leadership_test.go b/pkg/election/leadership_test.go index 422f583575fc..c259476e44e5 100644 --- a/pkg/election/leadership_test.go +++ b/pkg/election/leadership_test.go @@ -16,14 +16,12 @@ package election import ( "context" - "fmt" "os" "strings" "testing" "time" "github.com/pingcap/failpoint" - "github.com/pingcap/log" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/etcdutil" "github.com/tikv/pd/pkg/utils/testutil" @@ -35,27 +33,15 @@ const defaultLeaseTimeout = 1 func TestLeadership(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() // Campaign the same leadership leadership1 := NewLeadership(client, "/test_leader", "test_leader_1") leadership2 := NewLeadership(client, "/test_leader", "test_leader_2") // leadership1 starts first and get the leadership - err = leadership1.Campaign(defaultLeaseTimeout, "test_leader_1") + err := leadership1.Campaign(defaultLeaseTimeout, "test_leader_1") re.NoError(err) // leadership2 starts then and can not get the leadership err = leadership2.Campaign(defaultLeaseTimeout, "test_leader_2") @@ -168,60 +154,24 @@ func TestExitWatch(t *testing.T) { // Case6: transfer leader without client reconnection. 
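
`OfflineStore`, `UpStore` and `TombstoneStore` are folded into a single `SetStoreState` option that keeps the node state in sync and requires the `physicallyDestroyed` flag only for the offline transition. A toy, self-contained sketch of that mapping, with local enums standing in for the `metapb` store and node states:

```go
package main

import "fmt"

// Local stand-ins for the metapb store/node states used by SetStoreState.
type storeState int
type nodeState int

const (
	storeUp storeState = iota
	storeOffline
	storeTombstone
)

const (
	nodeServing nodeState = iota
	nodeRemoving
	nodeRemoved
)

// applyStoreState mirrors the switch in the new SetStoreState option:
// each store state implies a node state, and the offline transition must
// carry an explicit physically-destroyed flag.
func applyStoreState(state storeState, physicallyDestroyed ...bool) (nodeState, bool) {
	switch state {
	case storeUp:
		return nodeServing, false
	case storeOffline:
		if len(physicallyDestroyed) == 0 {
			panic("physicallyDestroyed should be set when set store state to offline")
		}
		return nodeRemoving, physicallyDestroyed[0]
	case storeTombstone:
		return nodeRemoved, false
	}
	return nodeServing, false
}

func main() {
	ns, destroyed := applyStoreState(storeOffline, true)
	fmt.Println(ns == nodeRemoving, destroyed) // true true
}
```
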
checkExitWatch(t, leaderKey, func(server *embed.Etcd, client *clientv3.Client) func() { cfg1 := server.Config() - cfg2 := etcdutil.NewTestSingleConfig(t) - cfg2.InitialCluster = cfg1.InitialCluster + fmt.Sprintf(",%s=%s", cfg2.Name, &cfg2.LPUrls[0]) - cfg2.ClusterState = embed.ClusterStateFlagExisting - peerURL := cfg2.LPUrls[0].String() - addResp, err := etcdutil.AddEtcdMember(client, []string{peerURL}) - re.NoError(err) - etcd2, err := embed.StartEtcd(cfg2) + etcd2 := etcdutil.MustAddEtcdMember(t, &cfg1, client) + client2, err := etcdutil.CreateEtcdClient(nil, etcd2.Config().LCUrls) re.NoError(err) - re.Equal(uint64(etcd2.Server.ID()), addResp.Member.ID) - <-etcd2.Server.ReadyNotify() - ep := cfg2.LCUrls[0].String() - client1, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - + // close the original leader server.Server.HardStop() - client1.Delete(context.Background(), leaderKey) + // delete the leader key with the new client + client2.Delete(context.Background(), leaderKey) return func() { etcd2.Close() + client2.Close() } }) // Case7: loss the quorum when the watch loop is running checkExitWatch(t, leaderKey, func(server *embed.Etcd, client *clientv3.Client) func() { - tempStdoutFile, _ := os.CreateTemp("/tmp", "pd_tests") - defer os.Remove(tempStdoutFile.Name()) - logCfg := &log.Config{} - logCfg.File.Filename = tempStdoutFile.Name() - logCfg.Level = "info" - lg, p, _ := log.InitLogger(logCfg) - log.ReplaceGlobals(lg, p) - cfg1 := server.Config() - cfg2 := etcdutil.NewTestSingleConfig(t) - cfg2.InitialCluster = cfg1.InitialCluster + fmt.Sprintf(",%s=%s", cfg2.Name, &cfg2.LPUrls[0]) - cfg2.ClusterState = embed.ClusterStateFlagExisting - peerURL := cfg2.LPUrls[0].String() - addResp, err := etcdutil.AddEtcdMember(client, []string{peerURL}) - re.NoError(err) - etcd2, err := embed.StartEtcd(cfg2) - re.NoError(err) - re.Equal(uint64(etcd2.Server.ID()), addResp.Member.ID) - <-etcd2.Server.ReadyNotify() - - cfg3 := etcdutil.NewTestSingleConfig(t) - cfg3.InitialCluster = cfg2.InitialCluster + fmt.Sprintf(",%s=%s", cfg3.Name, &cfg3.LPUrls[0]) - cfg3.ClusterState = embed.ClusterStateFlagExisting - peerURL = cfg3.LPUrls[0].String() - addResp, err = etcdutil.AddEtcdMember(client, []string{peerURL}) - re.NoError(err) - etcd3, err := embed.StartEtcd(cfg3) - re.NoError(err) - re.Equal(uint64(etcd3.Server.ID()), addResp.Member.ID) - <-etcd3.Server.ReadyNotify() + etcd2 := etcdutil.MustAddEtcdMember(t, &cfg1, client) + cfg2 := etcd2.Config() + etcd3 := etcdutil.MustAddEtcdMember(t, &cfg2, client) resp2, err := client.MemberList(context.Background()) re.NoError(err) @@ -237,24 +187,11 @@ func TestExitWatch(t *testing.T) { func checkExitWatch(t *testing.T, leaderKey string, injectFunc func(server *embed.Etcd, client *clientv3.Client) func()) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client1, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - client2, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) + servers, client1, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + client2, err := etcdutil.CreateEtcdClient(nil, servers[0].Config().LCUrls) re.NoError(err) - - <-etcd.Server.ReadyNotify() + defer client2.Close() leadership1 := NewLeadership(client1, leaderKey, "test_leader_1") leadership2 := NewLeadership(client2, leaderKey, "test_leader_2") @@ -268,7 
+205,7 @@ func checkExitWatch(t *testing.T, leaderKey string, injectFunc func(server *embe done <- struct{}{} }() - cleanFunc := injectFunc(etcd, client2) + cleanFunc := injectFunc(servers[0], client2) defer cleanFunc() testutil.Eventually(re, func() bool { @@ -283,33 +220,14 @@ func checkExitWatch(t *testing.T, leaderKey string, injectFunc func(server *embe func TestRequestProgress(t *testing.T) { checkWatcherRequestProgress := func(injectWatchChanBlock bool) { - tempStdoutFile, _ := os.CreateTemp("/tmp", "pd_tests") - defer os.Remove(tempStdoutFile.Name()) - logCfg := &log.Config{} - logCfg.File.Filename = tempStdoutFile.Name() - logCfg.Level = "debug" - lg, p, _ := log.InitLogger(logCfg) - log.ReplaceGlobals(lg, p) - re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() + fname := testutil.InitTempFileLogger("debug") + defer os.RemoveAll(fname) + servers, client1, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + client2, err := etcdutil.CreateEtcdClient(nil, servers[0].Config().LCUrls) re.NoError(err) - - ep := cfg.LCUrls[0].String() - client1, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - client2, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + defer client2.Close() leaderKey := "/test_leader" leadership1 := NewLeadership(client1, leaderKey, "test_leader_1") @@ -328,14 +246,14 @@ func TestRequestProgress(t *testing.T) { if injectWatchChanBlock { failpoint.Enable("github.com/tikv/pd/pkg/election/watchChanBlock", "return(true)") testutil.Eventually(re, func() bool { - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) l := string(b) return strings.Contains(l, "watch channel is blocked for a long time") }) failpoint.Disable("github.com/tikv/pd/pkg/election/watchChanBlock") } else { testutil.Eventually(re, func() bool { - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) l := string(b) return strings.Contains(l, "watcher receives progress notify in watch loop") }) diff --git a/pkg/election/lease_test.go b/pkg/election/lease_test.go index 70f552302937..3d8515eadb27 100644 --- a/pkg/election/lease_test.go +++ b/pkg/election/lease_test.go @@ -22,25 +22,12 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/etcdutil" "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) func TestLease(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() // Create the lease. lease1 := &lease{ @@ -104,20 +91,8 @@ func TestLease(t *testing.T) { func TestLeaseKeepAlive(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() // Create the lease. 
lease := &lease{ diff --git a/pkg/encryption/key_manager_test.go b/pkg/encryption/key_manager_test.go index 59fab0940a6c..3134e714543b 100644 --- a/pkg/encryption/key_manager_test.go +++ b/pkg/encryption/key_manager_test.go @@ -17,8 +17,6 @@ package encryption import ( "context" "encoding/hex" - "fmt" - "net/url" "os" "path/filepath" "sync/atomic" @@ -31,10 +29,8 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/election" "github.com/tikv/pd/pkg/utils/etcdutil" - "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/typeutil" "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) // #nosec G101 @@ -50,35 +46,11 @@ func getTestDataKey() []byte { return key } -func newTestEtcd(t *testing.T, re *require.Assertions) (client *clientv3.Client) { - cfg := embed.NewConfig() - cfg.Name = "test_etcd" - cfg.Dir = t.TempDir() - cfg.Logger = "zap" - pu, err := url.Parse(tempurl.Alloc()) - re.NoError(err) - cfg.LPUrls = []url.URL{*pu} - cfg.APUrls = cfg.LPUrls - cu, err := url.Parse(tempurl.Alloc()) - re.NoError(err) - cfg.LCUrls = []url.URL{*cu} - cfg.ACUrls = cfg.LCUrls - cfg.InitialCluster = fmt.Sprintf("%s=%s", cfg.Name, &cfg.LPUrls[0]) - cfg.ClusterState = embed.ClusterStateFlagNew - server, err := embed.StartEtcd(cfg) - re.NoError(err) - <-server.Server.ReadyNotify() - - client, err = clientv3.New(clientv3.Config{ - Endpoints: []string{cfg.LCUrls[0].String()}, - }) - re.NoError(err) - +func newTestEtcd(t *testing.T) (client *clientv3.Client) { + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) t.Cleanup(func() { - client.Close() - server.Close() + clean() }) - return client } @@ -114,7 +86,7 @@ func checkMasterKeyMeta(re *require.Assertions, value []byte, meta *encryptionpb func TestNewKeyManagerBasic(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) // Use default config. config := &Config{} err := config.Adjust() @@ -136,7 +108,7 @@ func TestNewKeyManagerBasic(t *testing.T) { func TestNewKeyManagerWithCustomConfig(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) // Custom config rotatePeriod, err := time.ParseDuration("100h") @@ -174,7 +146,7 @@ func TestNewKeyManagerWithCustomConfig(t *testing.T) { func TestNewKeyManagerLoadKeys(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Use default config. @@ -215,7 +187,7 @@ func TestNewKeyManagerLoadKeys(t *testing.T) { func TestGetCurrentKey(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) // Use default config. config := &Config{} err := config.Adjust() @@ -258,7 +230,7 @@ func TestGetCurrentKey(t *testing.T) { func TestGetKey(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Store initial keys in etcd. @@ -313,7 +285,7 @@ func TestGetKey(t *testing.T) { func TestLoadKeyEmpty(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Store initial keys in etcd. @@ -349,7 +321,7 @@ func TestWatcher(t *testing.T) { // Initialize. 
ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -425,7 +397,7 @@ func TestWatcher(t *testing.T) { func TestSetLeadershipWithEncryptionOff(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) // Use default config. config := &Config{} err := config.Adjust() @@ -450,7 +422,7 @@ func TestSetLeadershipWithEncryptionEnabling(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -503,7 +475,7 @@ func TestSetLeadershipWithEncryptionMethodChanged(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -579,7 +551,7 @@ func TestSetLeadershipWithCurrentKeyExposed(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -650,7 +622,7 @@ func TestSetLeadershipWithCurrentKeyExpired(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -725,7 +697,7 @@ func TestSetLeadershipWithMasterKeyChanged(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) keyFile2 := newTestKeyFile(t, re, testMasterKey2) leadership := newTestLeader(re, client) @@ -790,7 +762,7 @@ func TestSetLeadershipWithMasterKeyChanged(t *testing.T) { func TestSetLeadershipMasterKeyWithCiphertextKey(t *testing.T) { re := require.New(t) // Initialize. - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -868,7 +840,7 @@ func TestSetLeadershipWithEncryptionDisabling(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -924,7 +896,7 @@ func TestKeyRotation(t *testing.T) { // Initialize. ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper @@ -1020,7 +992,7 @@ func TestKeyRotationConflict(t *testing.T) { // Initialize. 
ctx, cancel := context.WithCancel(context.Background()) defer cancel() - client := newTestEtcd(t, re) + client := newTestEtcd(t) keyFile := newTestKeyFile(t, re) leadership := newTestLeader(re, client) // Setup helper diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index a5e05219dfaa..91cd4a78c4f6 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -45,6 +45,7 @@ var ( ErrSyncMaxTS = errors.Normalize("sync max ts failed, %s", errors.RFCCodeText("PD:tso:ErrSyncMaxTS")) ErrResetUserTimestamp = errors.Normalize("reset user timestamp failed, %s", errors.RFCCodeText("PD:tso:ErrResetUserTimestamp")) ErrGenerateTimestamp = errors.Normalize("generate timestamp failed, %s", errors.RFCCodeText("PD:tso:ErrGenerateTimestamp")) + ErrUpdateTimestamp = errors.Normalize("update timestamp failed, %s", errors.RFCCodeText("PD:tso:ErrUpdateTimestamp")) ErrLogicOverflow = errors.Normalize("logic part overflow", errors.RFCCodeText("PD:tso:ErrLogicOverflow")) ErrProxyTSOTimeout = errors.Normalize("proxy tso timeout", errors.RFCCodeText("PD:tso:ErrProxyTSOTimeout")) ErrKeyspaceGroupIDInvalid = errors.Normalize("the keyspace group id is invalid, %s", errors.RFCCodeText("PD:tso:ErrKeyspaceGroupIDInvalid")) diff --git a/pkg/gc/safepoint_v2.go b/pkg/gc/safepoint_v2.go index f936601d2d99..665249bcab01 100644 --- a/pkg/gc/safepoint_v2.go +++ b/pkg/gc/safepoint_v2.go @@ -18,10 +18,10 @@ import ( "context" "time" + "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/keyspacepb" "github.com/pingcap/log" - "github.com/pkg/errors" "github.com/tikv/pd/pkg/keyspace" "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/storage/endpoint" diff --git a/pkg/id/id_test.go b/pkg/id/id_test.go index 1b1632cc7637..94f0670b9792 100644 --- a/pkg/id/id_test.go +++ b/pkg/id/id_test.go @@ -22,8 +22,6 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/etcdutil" - "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) const ( @@ -39,23 +37,11 @@ const ( // share rootPath and member val update their ids concurrently. func TestMultipleAllocator(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() // Put memberValue to leaderPath to simulate an election success. - _, err = client.Put(context.Background(), leaderPath, memberVal) + _, err := client.Put(context.Background(), leaderPath, memberVal) re.NoError(err) wg := sync.WaitGroup{} diff --git a/pkg/keyspace/tso_keyspace_group.go b/pkg/keyspace/tso_keyspace_group.go index 22c265140fd9..ac5b035ae3f8 100644 --- a/pkg/keyspace/tso_keyspace_group.go +++ b/pkg/keyspace/tso_keyspace_group.go @@ -167,8 +167,16 @@ func (m *GroupManager) allocNodesToAllKeyspaceGroups(ctx context.Context) { log.Info("start to alloc nodes to all keyspace groups") for { select { + case <-m.ctx.Done(): + // When the group manager is closed, we should stop to alloc nodes to all keyspace groups. + // Note: If raftcluster is created failed but the group manager has been bootstrapped, + // we need to close this goroutine by m.cancel() rather than ctx.Done() from the raftcluster. + // because the ctx.Done() from the raftcluster will be triggered after raftcluster is created successfully. 
+ log.Info("server is closed, stop to alloc nodes to all keyspace groups") + return case <-ctx.Done(): - log.Info("stop to alloc nodes to all keyspace groups") + // When the API leader is changed, we should stop to alloc nodes to all keyspace groups. + log.Info("the raftcluster is closed, stop to alloc nodes to all keyspace groups") return case <-ticker.C: } diff --git a/pkg/mcs/discovery/discover_test.go b/pkg/mcs/discovery/discover_test.go index fed1d7844a01..2894dfa8d2dd 100644 --- a/pkg/mcs/discovery/discover_test.go +++ b/pkg/mcs/discovery/discover_test.go @@ -21,28 +21,14 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/etcdutil" - "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) func TestDiscover(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - re.NoError(err) - - client, err := clientv3.NewFromURL(ep) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() sr1 := NewServiceRegister(context.Background(), client, "12345", "test_service", "127.0.0.1:1", "127.0.0.1:1", 1) - err = sr1.Register() + err := sr1.Register() re.NoError(err) sr2 := NewServiceRegister(context.Background(), client, "12345", "test_service", "127.0.0.1:2", "127.0.0.1:2", 1) err = sr2.Register() @@ -64,20 +50,8 @@ func TestDiscover(t *testing.T) { func TestServiceRegistryEntry(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - re.NoError(err) - - client, err := clientv3.NewFromURL(ep) - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() entry1 := &ServiceRegistryEntry{ServiceAddr: "127.0.0.1:1"} s1, err := entry1.Serialize() re.NoError(err) diff --git a/pkg/mcs/discovery/register_test.go b/pkg/mcs/discovery/register_test.go index f27f3160afef..032b0558a79a 100644 --- a/pkg/mcs/discovery/register_test.go +++ b/pkg/mcs/discovery/register_test.go @@ -28,18 +28,13 @@ import ( func TestRegister(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - ep := cfg.LCUrls[0].String() - client, err := clientv3.NewFromURL(ep) - re.NoError(err) - <-etcd.Server.ReadyNotify() + servers, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + etcd, cfg := servers[0], servers[0].Config() // Test register with http prefix. 
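
The alloc-nodes loop above now distinguishes the group manager's own context (server shutdown, canceled via `m.cancel()`) from the caller's context handed in by the raft cluster (API leadership change). A minimal self-contained sketch of a worker loop with that shape; the names are illustrative only:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// allocLoop must tell two cancellation sources apart: the server-scoped
// context (shut the whole component down) and the caller-scoped context
// (e.g. the raft cluster losing leadership).
func allocLoop(serverCtx, callerCtx context.Context) {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-serverCtx.Done():
			fmt.Println("server is closed, stop the loop")
			return
		case <-callerCtx.Done():
			fmt.Println("caller is closed, stop the loop")
			return
		case <-ticker.C:
			// do one round of work here
		}
	}
}

func main() {
	serverCtx, cancelServer := context.WithCancel(context.Background())
	callerCtx := context.Background()
	go func() {
		time.Sleep(50 * time.Millisecond)
		cancelServer()
	}()
	allocLoop(serverCtx, callerCtx)
}
```
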
sr := NewServiceRegister(context.Background(), client, "12345", "test_service", "http://127.0.0.1:1", "http://127.0.0.1:1", 10) - re.NoError(err) - err = sr.Register() + err := sr.Register() re.NoError(err) re.Equal("/ms/12345/test_service/registry/http://127.0.0.1:1", sr.key) resp, err := client.Get(context.Background(), sr.key) @@ -69,14 +64,13 @@ func TestRegister(t *testing.T) { etcd.Server.HardStop() // close the etcd to make the keepalive failed time.Sleep(etcdutil.DefaultDialTimeout) // ensure that the request is timeout etcd.Close() - etcd, err = embed.StartEtcd(cfg) + etcd, err = embed.StartEtcd(&cfg) re.NoError(err) <-etcd.Server.ReadyNotify() testutil.Eventually(re, func() bool { return getKeyAfterLeaseExpired(re, client, sr.key) == "127.0.0.1:2" }) } - etcd.Close() } func getKeyAfterLeaseExpired(re *require.Assertions, client *clientv3.Client, key string) string { diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index a9e53f347fa0..6d1b872575bf 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -101,7 +101,7 @@ func (m *Manager) GetBasicServer() bs.Server { } // Init initializes the resource group manager. -func (m *Manager) Init(ctx context.Context) { +func (m *Manager) Init(ctx context.Context) error { // Todo: If we can modify following configs in the future, we should reload these configs. // Store the controller config into the storage. m.storage.SaveControllerConfig(m.controllerConfig) @@ -156,6 +156,7 @@ func (m *Manager) Init(ctx context.Context) { m.persistLoop(ctx) }() log.Info("resource group manager finishes initialization") + return nil } // AddResourceGroup puts a resource group. diff --git a/pkg/mcs/resourcemanager/server/server.go b/pkg/mcs/resourcemanager/server/server.go index 5cb924599929..78685850e86a 100644 --- a/pkg/mcs/resourcemanager/server/server.go +++ b/pkg/mcs/resourcemanager/server/server.go @@ -16,11 +16,9 @@ package server import ( "context" - "fmt" "net/http" "os" "os/signal" - "path" "strconv" "sync" "sync/atomic" @@ -30,6 +28,7 @@ import ( grpcprometheus "github.com/grpc-ecosystem/go-grpc-prometheus" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/diagnosticspb" + "github.com/pingcap/kvproto/pkg/resource_manager" "github.com/pingcap/log" "github.com/pingcap/sysutil" "github.com/spf13/cobra" @@ -72,7 +71,7 @@ type Server struct { service *Service // primaryCallbacks will be called after the server becomes leader. - primaryCallbacks []func(context.Context) + primaryCallbacks []func(context.Context) error serviceRegister *discovery.ServiceRegister } @@ -233,7 +232,7 @@ func (s *Server) IsClosed() bool { } // AddServiceReadyCallback adds callbacks when the server becomes the leader, if there is embedded etcd, or the primary otherwise. -func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context)) { +func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context) error) { s.primaryCallbacks = append(s.primaryCallbacks, callbacks...) 
} @@ -284,10 +283,13 @@ func (s *Server) startServer() (err error) { uniqueName := s.cfg.ListenAddr uniqueID := memberutil.GenerateUniqueID(uniqueName) log.Info("joining primary election", zap.String("participant-name", uniqueName), zap.Uint64("participant-id", uniqueID)) - resourceManagerPrimaryPrefix := endpoint.ResourceManagerSvcRootPath(s.clusterID) - s.participant = member.NewParticipant(s.GetClient()) - s.participant.InitInfo(uniqueName, uniqueID, path.Join(resourceManagerPrimaryPrefix, fmt.Sprintf("%05d", 0)), - utils.PrimaryKey, "primary election", s.cfg.AdvertiseListenAddr) + s.participant = member.NewParticipant(s.GetClient(), utils.ResourceManagerServiceName) + p := &resource_manager.Participant{ + Name: uniqueName, + Id: uniqueID, // id is unique among all participants + ListenUrls: []string{s.cfg.AdvertiseListenAddr}, + } + s.participant.InitInfo(p, endpoint.ResourceManagerSvcRootPath(s.clusterID), utils.PrimaryKey, "primary election") s.service = &Service{ ctx: s.Context(), diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 18cc55fbc163..19bd891196fa 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -143,12 +143,11 @@ func (c *Cluster) GetStoreConfig() sc.StoreConfigProvider { return c.persistConf // AllocID allocates a new ID. func (c *Cluster) AllocID() (uint64, error) { - cli := c.apiServerLeader.Load().(pdpb.PDClient) - if cli == nil { - c.checkMembershipCh <- struct{}{} - return 0, errors.New("API server leader is not found") + client, err := c.getAPIServerLeaderClient() + if err != nil { + return 0, err } - resp, err := cli.AllocID(c.ctx, &pdpb.AllocIDRequest{Header: &pdpb.RequestHeader{ClusterId: c.clusterID}}) + resp, err := client.AllocID(c.ctx, &pdpb.AllocIDRequest{Header: &pdpb.RequestHeader{ClusterId: c.clusterID}}) if err != nil { c.checkMembershipCh <- struct{}{} return 0, err @@ -156,6 +155,15 @@ func (c *Cluster) AllocID() (uint64, error) { return resp.GetId(), nil } +func (c *Cluster) getAPIServerLeaderClient() (pdpb.PDClient, error) { + cli := c.apiServerLeader.Load() + if cli == nil { + c.checkMembershipCh <- struct{}{} + return nil, errors.New("API server leader is not found") + } + return cli.(pdpb.PDClient), nil +} + // SwitchAPIServerLeader switches the API server leader. func (c *Cluster) SwitchAPIServerLeader(new pdpb.PDClient) bool { old := c.apiServerLeader.Load() diff --git a/pkg/mcs/scheduling/server/config/config.go b/pkg/mcs/scheduling/server/config/config.go index c6b5a5506975..e0e5bb9b6615 100644 --- a/pkg/mcs/scheduling/server/config/config.go +++ b/pkg/mcs/scheduling/server/config/config.go @@ -34,6 +34,7 @@ import ( "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/mcs/utils" sc "github.com/tikv/pd/pkg/schedule/config" + "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/grpcutil" @@ -239,6 +240,18 @@ func (o *PersistConfig) SetScheduleConfig(cfg *sc.ScheduleConfig) { o.schedule.Store(cfg) } +// AdjustScheduleCfg adjusts the schedule config. +func (o *PersistConfig) AdjustScheduleCfg(scheduleCfg *sc.ScheduleConfig) { + // In case we add new default schedulers. 
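
The old `AllocID` asserted `c.apiServerLeader.Load().(pdpb.PDClient)` and only then compared the result to nil; if nothing has ever been stored, `Load` returns a nil interface and the type assertion itself panics, so the nil check never fires. The new `getAPIServerLeaderClient` checks `Load()` before asserting. A small self-contained sketch of that gotcha:

```go
package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

type leaderClient interface{ AllocID() uint64 }

var apiServerLeader atomic.Value

// getLeaderClient mirrors the new getAPIServerLeaderClient: check the raw
// value for nil before the type assertion, because asserting on a nil
// interface (a never-stored atomic.Value) panics instead of yielding nil.
func getLeaderClient() (leaderClient, error) {
	cli := apiServerLeader.Load()
	if cli == nil {
		return nil, errors.New("API server leader is not found")
	}
	return cli.(leaderClient), nil
}

func main() {
	if _, err := getLeaderClient(); err != nil {
		fmt.Println(err) // reached safely; the old one-line assertion would panic here
	}
}
```
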
+ for _, ps := range sc.DefaultSchedulers { + if slice.NoneOf(scheduleCfg.Schedulers, func(i int) bool { + return scheduleCfg.Schedulers[i].Type == ps.Type + }) { + scheduleCfg.Schedulers = append(scheduleCfg.Schedulers, ps) + } + } +} + // GetReplicationConfig returns replication configurations. func (o *PersistConfig) GetReplicationConfig() *sc.ReplicationConfig { return o.replication.Load().(*sc.ReplicationConfig) diff --git a/pkg/mcs/scheduling/server/config/watcher.go b/pkg/mcs/scheduling/server/config/watcher.go index c9010db69a3e..e6e204b86312 100644 --- a/pkg/mcs/scheduling/server/config/watcher.go +++ b/pkg/mcs/scheduling/server/config/watcher.go @@ -94,6 +94,7 @@ func (cw *Watcher) initializeConfigWatcher() error { zap.String("event-kv-key", string(kv.Key)), zap.Error(err)) return err } + cw.AdjustScheduleCfg(&cfg.Schedule) cw.SetClusterVersion(&cfg.ClusterVersion) cw.SetScheduleConfig(&cfg.Schedule) cw.SetReplicationConfig(&cfg.Replication) diff --git a/pkg/mcs/scheduling/server/meta/watcher.go b/pkg/mcs/scheduling/server/meta/watcher.go new file mode 100644 index 000000000000..1e58bbd845f0 --- /dev/null +++ b/pkg/mcs/scheduling/server/meta/watcher.go @@ -0,0 +1,117 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package meta + +import ( + "context" + "sync" + + "github.com/gogo/protobuf/proto" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/log" + "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/storage/endpoint" + "github.com/tikv/pd/pkg/utils/etcdutil" + "go.etcd.io/etcd/clientv3" + "go.etcd.io/etcd/mvcc/mvccpb" + "go.uber.org/zap" +) + +// Watcher is used to watch the PD API server for any meta changes. +type Watcher struct { + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + clusterID uint64 + // storePathPrefix is the path of the store in etcd: + // - Key: /pd/{cluster_id}/raft/s/ + // - Value: meta store proto. + storePathPrefix string + + etcdClient *clientv3.Client + basicCluster *core.BasicCluster + storeWatcher *etcdutil.LoopWatcher +} + +// NewWatcher creates a new watcher to watch the meta change from PD API server. 
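
`AdjustScheduleCfg` backfills any newly introduced default scheduler that an older persisted config does not mention yet. A self-contained sketch of the same "append what's missing" pattern; `noneOf` here is a local stand-in for `slice.NoneOf`:

```go
package main

import "fmt"

type schedulerConfig struct{ Type string }

// noneOf is a local stand-in for slice.NoneOf: true if no index satisfies p.
func noneOf(n int, p func(i int) bool) bool {
	for i := 0; i < n; i++ {
		if p(i) {
			return false
		}
	}
	return true
}

// adjustSchedulers mirrors AdjustScheduleCfg: any default scheduler type that
// is absent from the persisted list gets appended, so configs written by an
// older version still pick up newly added defaults.
func adjustSchedulers(persisted, defaults []schedulerConfig) []schedulerConfig {
	for _, d := range defaults {
		if noneOf(len(persisted), func(i int) bool { return persisted[i].Type == d.Type }) {
			persisted = append(persisted, d)
		}
	}
	return persisted
}

func main() {
	persisted := []schedulerConfig{{Type: "balance-region"}}
	defaults := []schedulerConfig{{Type: "balance-region"}, {Type: "balance-leader"}}
	fmt.Println(adjustSchedulers(persisted, defaults)) // [{balance-region} {balance-leader}]
}
```
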
+func NewWatcher( + ctx context.Context, + etcdClient *clientv3.Client, + clusterID uint64, + basicCluster *core.BasicCluster, +) (*Watcher, error) { + ctx, cancel := context.WithCancel(ctx) + w := &Watcher{ + ctx: ctx, + cancel: cancel, + clusterID: clusterID, + storePathPrefix: endpoint.StorePathPrefix(clusterID), + etcdClient: etcdClient, + basicCluster: basicCluster, + } + err := w.initializeStoreWatcher() + if err != nil { + return nil, err + } + return w, nil +} + +func (w *Watcher) initializeStoreWatcher() error { + putFn := func(kv *mvccpb.KeyValue) error { + store := &metapb.Store{} + if err := proto.Unmarshal(kv.Value, store); err != nil { + log.Warn("failed to unmarshal store entry", + zap.String("event-kv-key", string(kv.Key)), zap.Error(err)) + return err + } + origin := w.basicCluster.GetStore(store.GetId()) + if origin == nil { + w.basicCluster.PutStore(core.NewStoreInfo(store)) + return nil + } + w.basicCluster.PutStore(origin.Clone(core.SetStoreState(store.GetState(), store.GetPhysicallyDestroyed()))) + return nil + } + deleteFn := func(kv *mvccpb.KeyValue) error { + key := string(kv.Key) + storeID, err := endpoint.ExtractStoreIDFromPath(w.clusterID, key) + if err != nil { + return err + } + origin := w.basicCluster.GetStore(storeID) + if origin != nil { + w.basicCluster.DeleteStore(origin) + } + return nil + } + postEventFn := func() error { + return nil + } + w.storeWatcher = etcdutil.NewLoopWatcher( + w.ctx, &w.wg, + w.etcdClient, + "scheduling-store-watcher", w.storePathPrefix, + putFn, deleteFn, postEventFn, + clientv3.WithPrefix(), + ) + w.storeWatcher.StartWatchLoop() + return w.storeWatcher.WaitLoad() +} + +// Close closes the watcher. +func (w *Watcher) Close() { + w.cancel() + w.wg.Wait() +} diff --git a/pkg/mcs/scheduling/server/server.go b/pkg/mcs/scheduling/server/server.go index 0b2ad56a0e40..1e3aea41aa54 100644 --- a/pkg/mcs/scheduling/server/server.go +++ b/pkg/mcs/scheduling/server/server.go @@ -20,7 +20,6 @@ import ( "net/http" "os" "os/signal" - "path" "strconv" "sync" "sync/atomic" @@ -31,6 +30,7 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/diagnosticspb" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" "github.com/pingcap/sysutil" "github.com/spf13/cobra" @@ -39,12 +39,14 @@ import ( "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/mcs/discovery" "github.com/tikv/pd/pkg/mcs/scheduling/server/config" + "github.com/tikv/pd/pkg/mcs/scheduling/server/meta" "github.com/tikv/pd/pkg/mcs/scheduling/server/rule" "github.com/tikv/pd/pkg/mcs/server" "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/member" "github.com/tikv/pd/pkg/schedule" "github.com/tikv/pd/pkg/schedule/hbstream" + "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/storage/kv" "github.com/tikv/pd/pkg/utils/apiutil" @@ -76,6 +78,7 @@ type Server struct { cfg *config.Config clusterID uint64 persistConfig *config.PersistConfig + basicCluster *core.BasicCluster // for the primary election of scheduling participant *member.Participant @@ -84,7 +87,8 @@ type Server struct { checkMembershipCh chan struct{} // primaryCallbacks will be called after the server becomes leader. 
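
The new meta watcher wires a put handler (unmarshal the store proto and upsert it into the basic cluster) and a delete handler (extract the store ID from the key and drop it) into a generic etcd watch loop. The toy dispatcher below shows only that callback shape; the event and KV types are simplified stand-ins for `mvccpb`, not the real `LoopWatcher`:

```go
package main

import "fmt"

// Simplified stand-ins for the etcd mvccpb event/KV types the watcher consumes.
type kv struct{ Key, Value string }
type event struct {
	isDelete bool
	kv       kv
}

// dispatch is a toy version of the watch-loop core: every event is routed to
// either the put or the delete callback, and a post-event hook runs afterwards.
func dispatch(events []event, putFn, deleteFn func(kv) error, postEventFn func() error) error {
	for _, e := range events {
		var err error
		if e.isDelete {
			err = deleteFn(e.kv)
		} else {
			err = putFn(e.kv)
		}
		if err != nil {
			return err
		}
	}
	return postEventFn()
}

func main() {
	stores := map[string]string{}
	putFn := func(e kv) error { stores[e.Key] = e.Value; return nil }  // upsert the store
	deleteFn := func(e kv) error { delete(stores, e.Key); return nil } // drop the store
	postEventFn := func() error { fmt.Println("stores:", stores); return nil }
	events := []event{
		{kv: kv{Key: "/pd/1/raft/s/00000001", Value: "store-1"}},
		{kv: kv{Key: "/pd/1/raft/s/00000001"}, isDelete: true},
	}
	_ = dispatch(events, putFn, deleteFn, postEventFn)
}
```
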
- primaryCallbacks []func(context.Context) + primaryCallbacks []func(context.Context) error + primaryExitCallbacks []func() // for service registry serviceID *discovery.ServiceRegistryEntry @@ -97,6 +101,7 @@ type Server struct { // for watching the PD API server meta info updates that are related to the scheduling. configWatcher *config.Watcher ruleWatcher *rule.Watcher + metaWatcher *meta.Watcher } // Name returns the unique name for this server in the scheduling cluster. @@ -151,6 +156,7 @@ func (s *Server) updateAPIServerMemberLoop() { ticker = time.NewTicker(100 * time.Millisecond) }) defer ticker.Stop() + var curLeader uint64 for { select { case <-ctx.Done(): @@ -159,9 +165,13 @@ func (s *Server) updateAPIServerMemberLoop() { case <-ticker.C: case <-s.checkMembershipCh: } + if !s.IsServing() { + continue + } members, err := s.GetClient().MemberList(ctx) if err != nil { log.Warn("failed to list members", errs.ZapError(err)) + continue } for _, ep := range members.Members { status, err := s.GetClient().Status(ctx, ep.ClientURLs[0]) @@ -175,7 +185,10 @@ func (s *Server) updateAPIServerMemberLoop() { log.Info("failed to get delegate client", errs.ZapError(err)) } if s.cluster.SwitchAPIServerLeader(pdpb.NewPDClient(cc)) { - log.Info("switch leader", zap.String("leader-id", fmt.Sprintf("%x", ep.ID)), zap.String("endpoint", ep.ClientURLs[0])) + if status.Leader != curLeader { + log.Info("switch leader", zap.String("leader-id", fmt.Sprintf("%x", ep.ID)), zap.String("endpoint", ep.ClientURLs[0])) + } + curLeader = ep.ID break } } @@ -238,9 +251,16 @@ func (s *Server) campaignLeader() { log.Info("triggering the primary callback functions") for _, cb := range s.primaryCallbacks { - cb(ctx) + if err := cb(ctx); err != nil { + log.Error("failed to trigger the primary callback functions", errs.ZapError(err)) + return + } } - + defer func() { + for _, cb := range s.primaryExitCallbacks { + cb() + } + }() s.participant.EnableLeader() log.Info("scheduling primary is ready to serve", zap.String("scheduling-primary-name", s.participant.Name())) @@ -274,9 +294,6 @@ func (s *Server) Close() { utils.StopHTTPServer(s) utils.StopGRPCServer(s) s.GetListener().Close() - s.GetCoordinator().Stop() - s.ruleWatcher.Close() - s.configWatcher.Close() s.serverLoopCancel() s.serverLoopWg.Wait() @@ -303,10 +320,15 @@ func (s *Server) IsClosed() bool { } // AddServiceReadyCallback adds callbacks when the server becomes the leader, if there is embedded etcd, or the primary otherwise. -func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context)) { +func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context) error) { s.primaryCallbacks = append(s.primaryCallbacks, callbacks...) } +// AddServiceExitCallback adds callbacks when the server becomes the leader, if there is embedded etcd, or the primary otherwise. +func (s *Server) AddServiceExitCallback(callbacks ...func()) { + s.primaryExitCallbacks = append(s.primaryExitCallbacks, callbacks...) +} + // GetTLSConfig gets the security config. func (s *Server) GetTLSConfig() *grpcutil.TLSConfig { return &s.cfg.Security.TLSConfig @@ -317,6 +339,11 @@ func (s *Server) GetCluster() *Cluster { return s.cluster } +// GetBasicCluster returns the basic cluster. +func (s *Server) GetBasicCluster() *core.BasicCluster { + return s.basicCluster +} + // GetCoordinator returns the coordinator. 
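
With ready callbacks now returning an error, `campaignLeader` aborts the term if any of them fails and only then registers the deferred exit callbacks. A minimal sketch of that control flow, using illustrative names rather than the server's actual fields:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// servePrimary sketches the new campaign flow: ready callbacks may fail, in
// which case the campaign is abandoned; exit callbacks are deferred only once
// readiness succeeds, so they run on any later exit from the term.
func servePrimary(ctx context.Context, ready []func(context.Context) error, exit []func()) error {
	for _, cb := range ready {
		if err := cb(ctx); err != nil {
			return err // give up the leadership instead of serving half-initialized
		}
	}
	defer func() {
		for _, cb := range exit {
			cb()
		}
	}()
	fmt.Println("primary is ready to serve")
	// ... block here until the leadership is lost ...
	return nil
}

func main() {
	ready := []func(context.Context) error{
		func(context.Context) error { return errors.New("failed to start cluster") },
	}
	exit := []func(){func() { fmt.Println("cluster stopped") }}
	if err := servePrimary(context.Background(), ready, exit); err != nil {
		fmt.Println("campaign aborted:", err)
	}
}
```
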
func (s *Server) GetCoordinator() *schedule.Coordinator { return s.GetCluster().GetCoordinator() @@ -359,24 +386,17 @@ func (s *Server) startServer() (err error) { uniqueName := s.cfg.ListenAddr uniqueID := memberutil.GenerateUniqueID(uniqueName) log.Info("joining primary election", zap.String("participant-name", uniqueName), zap.Uint64("participant-id", uniqueID)) - schedulingPrimaryPrefix := endpoint.SchedulingSvcRootPath(s.clusterID) - s.participant = member.NewParticipant(s.GetClient()) - s.participant.InitInfo(uniqueName, uniqueID, path.Join(schedulingPrimaryPrefix, fmt.Sprintf("%05d", 0)), - utils.PrimaryKey, "primary election", s.cfg.AdvertiseListenAddr) - err = s.startWatcher() - if err != nil { - return err - } - s.storage = endpoint.NewStorageEndpoint( - kv.NewEtcdKVBase(s.GetClient(), endpoint.PDRootPath(s.clusterID)), nil) - basicCluster := core.NewBasicCluster() - s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), s.clusterID, basicCluster) - s.cluster, err = NewCluster(s.Context(), s.persistConfig, s.storage, basicCluster, s.hbStreams, s.clusterID, s.checkMembershipCh) - if err != nil { - return err + s.participant = member.NewParticipant(s.GetClient(), utils.SchedulingServiceName) + p := &schedulingpb.Participant{ + Name: uniqueName, + Id: uniqueID, // id is unique among all participants + ListenUrls: []string{s.cfg.AdvertiseListenAddr}, } + s.participant.InitInfo(p, endpoint.SchedulingSvcRootPath(s.clusterID), utils.PrimaryKey, "primary election") s.service = &Service{Server: s} + s.AddServiceReadyCallback(s.startCluster) + s.AddServiceExitCallback(s.stopCluster) if err := s.InitListener(s.GetTLSConfig(), s.cfg.ListenAddr); err != nil { return err } @@ -388,7 +408,6 @@ func (s *Server) startServer() (err error) { go utils.StartGRPCAndHTTPServers(s, serverReadyChan, s.GetListener()) s.checkMembershipCh <- struct{}{} <-serverReadyChan - go s.GetCoordinator().RunUntilStop() // Run callbacks log.Info("triggering the start callback functions") @@ -411,16 +430,39 @@ func (s *Server) startServer() (err error) { return nil } +func (s *Server) startCluster(context.Context) error { + s.basicCluster = core.NewBasicCluster() + err := s.startWatcher() + if err != nil { + return err + } + s.storage = endpoint.NewStorageEndpoint(kv.NewMemoryKV(), nil) + s.hbStreams = hbstream.NewHeartbeatStreams(s.Context(), s.clusterID, s.basicCluster) + s.cluster, err = NewCluster(s.Context(), s.persistConfig, s.storage, s.basicCluster, s.hbStreams, s.clusterID, s.checkMembershipCh) + if err != nil { + return err + } + go s.GetCoordinator().RunUntilStop() + return nil +} + +func (s *Server) stopCluster() { + s.GetCoordinator().Stop() + s.ruleWatcher.Close() + s.configWatcher.Close() + s.metaWatcher.Close() +} + func (s *Server) startWatcher() (err error) { - s.configWatcher, err = config.NewWatcher( - s.Context(), s.GetClient(), s.clusterID, s.persistConfig, - ) + s.metaWatcher, err = meta.NewWatcher(s.Context(), s.GetClient(), s.clusterID, s.basicCluster) + if err != nil { + return err + } + s.configWatcher, err = config.NewWatcher(s.Context(), s.GetClient(), s.clusterID, s.persistConfig) if err != nil { return err } - s.ruleWatcher, err = rule.NewWatcher( - s.Context(), s.GetClient(), s.clusterID, - ) + s.ruleWatcher, err = rule.NewWatcher(s.Context(), s.GetClient(), s.clusterID) return err } @@ -438,6 +480,7 @@ func CreateServer(ctx context.Context, cfg *config.Config) *Server { // CreateServerWrapper encapsulates the configuration/log/metrics initialization and create the server func 
CreateServerWrapper(cmd *cobra.Command, args []string) { + schedulers.Register() cmd.Flags().Parse(args) cfg := config.NewConfig() flagSet := cmd.Flags() diff --git a/pkg/mcs/tso/server/config.go b/pkg/mcs/tso/server/config.go index 23bff5582396..eedf3a2f1b1a 100644 --- a/pkg/mcs/tso/server/config.go +++ b/pkg/mcs/tso/server/config.go @@ -22,8 +22,8 @@ import ( "time" "github.com/BurntSushi/toml" + "github.com/pingcap/errors" "github.com/pingcap/log" - "github.com/pkg/errors" "github.com/spf13/pflag" "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/tso" diff --git a/pkg/mcs/tso/server/grpc_service.go b/pkg/mcs/tso/server/grpc_service.go index ce46a11b0d3a..40a308c72f8b 100644 --- a/pkg/mcs/tso/server/grpc_service.go +++ b/pkg/mcs/tso/server/grpc_service.go @@ -18,12 +18,13 @@ import ( "context" "io" "net/http" + "strconv" "strings" "time" + "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/tsopb" "github.com/pingcap/log" - "github.com/pkg/errors" bs "github.com/tikv/pd/pkg/basicserver" "github.com/tikv/pd/pkg/mcs/registry" "github.com/tikv/pd/pkg/utils/apiutil" @@ -135,18 +136,26 @@ func (s *Service) Tso(stream tsopb.TSO_TsoServer) error { if s.IsClosed() { return status.Errorf(codes.Unknown, "server not started") } - if request.GetHeader().GetClusterId() != s.clusterID { + header := request.GetHeader() + clusterID := header.GetClusterId() + if clusterID != s.clusterID { return status.Errorf( codes.FailedPrecondition, "mismatch cluster id, need %d but got %d", - s.clusterID, request.GetHeader().GetClusterId()) + s.clusterID, clusterID) } + keyspaceID := header.GetKeyspaceId() + keyspaceGroupID := header.GetKeyspaceGroupId() + dcLocation := request.GetDcLocation() count := request.GetCount() ts, keyspaceGroupBelongTo, err := s.keyspaceGroupManager.HandleTSORequest( - request.Header.KeyspaceId, request.Header.KeyspaceGroupId, request.GetDcLocation(), count) + ctx, + keyspaceID, keyspaceGroupID, + dcLocation, count) if err != nil { return status.Errorf(codes.Unknown, err.Error()) } - tsoHandleDuration.Observe(time.Since(start).Seconds()) + keyspaceGroupIDStr := strconv.FormatUint(uint64(keyspaceGroupID), 10) + tsoHandleDuration.WithLabelValues(keyspaceGroupIDStr).Observe(time.Since(start).Seconds()) response := &tsopb.TsoResponse{ Header: s.header(keyspaceGroupBelongTo), Timestamp: &ts, diff --git a/pkg/mcs/tso/server/metrics.go b/pkg/mcs/tso/server/metrics.go index afd0d47ef133..288d650e1e7b 100644 --- a/pkg/mcs/tso/server/metrics.go +++ b/pkg/mcs/tso/server/metrics.go @@ -16,12 +16,9 @@ package server import "github.com/prometheus/client_golang/prometheus" -const ( - namespace = "tso" -) +const namespace = "tso" var ( - // TODO: pre-allocate gauge metrics timeJumpBackCounter = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: namespace, @@ -30,7 +27,7 @@ var ( Help: "Counter of system time jumps backward.", }) - metadataGauge = prometheus.NewGaugeVec( + metaDataGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, Subsystem: "cluster", @@ -46,39 +43,19 @@ var ( Help: "Indicate the tso server info, and the value is the start timestamp (s).", }, []string{"version", "hash"}) - tsoProxyHandleDuration = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: "server", - Name: "handle_tso_proxy_duration_seconds", - Help: "Bucketed histogram of processing time (s) of handled tso proxy requests.", - Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), - }) - - tsoProxyBatchSize = prometheus.NewHistogram( - 
prometheus.HistogramOpts{ - Namespace: namespace, - Subsystem: "server", - Name: "handle_tso_proxy_batch_size", - Help: "Bucketed histogram of the batch size of handled tso proxy requests.", - Buckets: prometheus.ExponentialBuckets(1, 2, 13), - }) - - tsoHandleDuration = prometheus.NewHistogram( + tsoHandleDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: namespace, Subsystem: "server", Name: "handle_tso_duration_seconds", Help: "Bucketed histogram of processing time (s) of handled tso requests.", Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), - }) + }, []string{"group"}) ) func init() { prometheus.MustRegister(timeJumpBackCounter) - prometheus.MustRegister(metadataGauge) + prometheus.MustRegister(metaDataGauge) prometheus.MustRegister(serverInfo) - prometheus.MustRegister(tsoProxyHandleDuration) - prometheus.MustRegister(tsoProxyBatchSize) prometheus.MustRegister(tsoHandleDuration) } diff --git a/pkg/mcs/tso/server/server.go b/pkg/mcs/tso/server/server.go index 01b1d03ef03e..40958ca463ce 100644 --- a/pkg/mcs/tso/server/server.go +++ b/pkg/mcs/tso/server/server.go @@ -236,7 +236,7 @@ func (s *Server) ResignPrimary(keyspaceID, keyspaceGroupID uint32) error { // AddServiceReadyCallback implements basicserver. // It adds callbacks when it's ready for providing tso service. -func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context)) { +func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context) error) { // Do nothing here. The primary of each keyspace group assigned to this host // will respond to the requests accordingly. } @@ -345,7 +345,7 @@ func (s *Server) startServer() (err error) { log.Info("init cluster id", zap.Uint64("cluster-id", s.clusterID)) // It may lose accuracy if use float64 to store uint64. So we store the cluster id in label. - metadataGauge.WithLabelValues(fmt.Sprintf("cluster%d", s.clusterID)).Set(0) + metaDataGauge.WithLabelValues(fmt.Sprintf("cluster%d", s.clusterID)).Set(0) // The independent TSO service still reuses PD version info since PD and TSO are just // different service modes provided by the same pd-server binary serverInfo.WithLabelValues(versioninfo.PDReleaseVersion, versioninfo.PDGitHash).Set(float64(time.Now().Unix())) diff --git a/pkg/mcs/utils/util.go b/pkg/mcs/utils/util.go index 2390f5be4f04..682e73f20ae3 100644 --- a/pkg/mcs/utils/util.go +++ b/pkg/mcs/utils/util.go @@ -24,10 +24,10 @@ import ( "time" "github.com/gin-gonic/gin" + "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/diagnosticspb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/soheilhy/cmux" diff --git a/pkg/member/election_leader.go b/pkg/member/election_leader.go index 8c0496f670e1..24520bfbe649 100644 --- a/pkg/member/election_leader.go +++ b/pkg/member/election_leader.go @@ -18,7 +18,6 @@ import ( "context" "github.com/pingcap/kvproto/pkg/pdpb" - "github.com/pingcap/kvproto/pkg/tsopb" ) // ElectionLeader defines the common interface of the leader, which is the pdpb.Member @@ -64,14 +63,14 @@ func (l *EmbeddedEtcdLeader) Watch(ctx context.Context) { // EtcdLeader is the leader in the election group backed by the etcd, but it's // decoupled from the embedded etcd. 
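The switch from a plain Histogram to a HistogramVec in pkg/mcs/tso/server/metrics.go means every observation now carries the keyspace group ID as a label value. A minimal, self-contained sketch of that pattern follows; the metric name and the registration wiring are illustrative, not the exact ones registered by the TSO server.

package example

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// handleDuration is labeled by keyspace group ID so that slow groups show up
// as separate series on the same panel.
var handleDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "tso",
		Subsystem: "server",
		Name:      "example_handle_tso_duration_seconds",
		Help:      "Bucketed histogram of processing time (s) of handled tso requests.",
		Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
	}, []string{"group"})

func init() {
	prometheus.MustRegister(handleDuration)
}

// observeHandle records the handling duration of one request for the given
// keyspace group.
func observeHandle(keyspaceGroupID uint32, start time.Time) {
	groupLabel := strconv.FormatUint(uint64(keyspaceGroupID), 10)
	handleDuration.WithLabelValues(groupLabel).Observe(time.Since(start).Seconds())
}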
type EtcdLeader struct { - wrapper *Participant - pariticipant *tsopb.Participant - revision int64 + wrapper *Participant + participant participant + revision int64 } // GetListenUrls returns current leader's client urls func (l *EtcdLeader) GetListenUrls() []string { - return l.pariticipant.GetListenUrls() + return l.participant.GetListenUrls() } // GetRevision the revision of the leader in etcd @@ -81,10 +80,10 @@ func (l *EtcdLeader) GetRevision() int64 { // String declares fmt.Stringer func (l *EtcdLeader) String() string { - return l.pariticipant.String() + return l.participant.String() } // Watch on the leader func (l *EtcdLeader) Watch(ctx context.Context) { - l.wrapper.WatchLeader(ctx, l.pariticipant, l.revision) + l.wrapper.WatchLeader(ctx, l.participant, l.revision) } diff --git a/pkg/member/participant.go b/pkg/member/participant.go index 102bfcbce5f3..27dced577912 100644 --- a/pkg/member/participant.go +++ b/pkg/member/participant.go @@ -22,10 +22,13 @@ import ( "sync/atomic" "time" + "github.com/pingcap/kvproto/pkg/resource_manager" + "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/kvproto/pkg/tsopb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/election" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/utils/etcdutil" "go.etcd.io/etcd/clientv3" "go.uber.org/zap" @@ -33,17 +36,28 @@ import ( type leadershipCheckFunc func(*election.Leadership) bool +type participant interface { + GetName() string + GetId() uint64 + GetListenUrls() []string + String() string + Marshal() ([]byte, error) + Reset() + ProtoMessage() +} + // Participant is used for the election related logic. Compared to its counterpart // EmbeddedEtcdMember, Participant relies on etcd for election, but it's decoupled // from the embedded etcd. It implements Member interface. type Participant struct { leadership *election.Leadership // stored as member type - leader atomic.Value - client *clientv3.Client - rootPath string - leaderPath string - member *tsopb.Participant + leader atomic.Value + client *clientv3.Client + rootPath string + leaderPath string + member participant + serviceName string // memberValue is the serialized string of `member`. It will be saved in the // leader key when this participant is successfully elected as the leader of // the group. Every write will use it to check the leadership. @@ -56,42 +70,37 @@ type Participant struct { } // NewParticipant create a new Participant. -func NewParticipant(client *clientv3.Client) *Participant { +func NewParticipant(client *clientv3.Client, serviceName string) *Participant { return &Participant{ - client: client, + client: client, + serviceName: serviceName, } } // InitInfo initializes the member info. The leader key is path.Join(rootPath, leaderName) -func (m *Participant) InitInfo(name string, id uint64, rootPath string, leaderName string, purpose string, advertiseListenAddr string) { - leader := &tsopb.Participant{ - Name: name, - Id: id, // id is unique among all participants - ListenUrls: []string{advertiseListenAddr}, - } - - data, err := leader.Marshal() +func (m *Participant) InitInfo(p participant, rootPath string, leaderName string, purpose string) { + data, err := p.Marshal() if err != nil { // can't fail, so panic here. 
- log.Fatal("marshal leader meet error", zap.Stringer("leader-name", leader), errs.ZapError(errs.ErrMarshalLeader, err)) + log.Fatal("marshal leader meet error", zap.String("member-name", p.String()), errs.ZapError(errs.ErrMarshalLeader, err)) } - m.member = leader + m.member = p m.memberValue = string(data) m.rootPath = rootPath m.leaderPath = path.Join(rootPath, leaderName) m.leadership = election.NewLeadership(m.client, m.GetLeaderPath(), purpose) m.lastLeaderUpdatedTime.Store(time.Now()) - log.Info("participant joining election", zap.Stringer("participant-info", m.member), zap.String("leader-path", m.leaderPath)) + log.Info("participant joining election", zap.String("participant-info", p.String()), zap.String("leader-path", m.leaderPath)) } // ID returns the unique ID for this participant in the election group func (m *Participant) ID() uint64 { - return m.member.Id + return m.member.GetId() } // Name returns the unique name in the election group. func (m *Participant) Name() string { - return m.member.Name + return m.member.GetName() } // GetMember returns the member. @@ -112,6 +121,9 @@ func (m *Participant) Client() *clientv3.Client { // IsLeader returns whether the participant is the leader or not by checking its leadership's // lease and leader info. func (m *Participant) IsLeader() bool { + if m.GetLeader() == nil { + return false + } return m.leadership.Check() && m.GetLeader().GetId() == m.member.GetId() && m.campaignCheck() } @@ -122,6 +134,9 @@ func (m *Participant) IsLeaderElected() bool { // GetLeaderListenUrls returns current leader's listen urls func (m *Participant) GetLeaderListenUrls() []string { + if m.GetLeader() == nil { + return nil + } return m.GetLeader().GetListenUrls() } @@ -131,12 +146,12 @@ func (m *Participant) GetLeaderID() uint64 { } // GetLeader returns current leader of the election group. -func (m *Participant) GetLeader() *tsopb.Participant { +func (m *Participant) GetLeader() participant { leader := m.leader.Load() if leader == nil { return nil } - member := leader.(*tsopb.Participant) + member := leader.(participant) if member.GetId() == 0 { return nil } @@ -144,14 +159,23 @@ func (m *Participant) GetLeader() *tsopb.Participant { } // setLeader sets the member's leader. -func (m *Participant) setLeader(member *tsopb.Participant) { +func (m *Participant) setLeader(member participant) { m.leader.Store(member) m.lastLeaderUpdatedTime.Store(time.Now()) } // unsetLeader unsets the member's leader. func (m *Participant) unsetLeader() { - m.leader.Store(&tsopb.Participant{}) + var leader participant + switch m.serviceName { + case utils.TSOServiceName: + leader = &tsopb.Participant{} + case utils.SchedulingServiceName: + leader = &schedulingpb.Participant{} + case utils.ResourceManagerServiceName: + leader = &resource_manager.Participant{} + } + m.leader.Store(leader) m.lastLeaderUpdatedTime.Store(time.Now()) } @@ -200,8 +224,16 @@ func (m *Participant) PreCheckLeader() error { } // getPersistentLeader gets the corresponding leader from etcd by given leaderPath (as the key). 
-func (m *Participant) getPersistentLeader() (*tsopb.Participant, int64, error) { - leader := &tsopb.Participant{} +func (m *Participant) getPersistentLeader() (participant, int64, error) { + var leader participant + switch m.serviceName { + case utils.TSOServiceName: + leader = &tsopb.Participant{} + case utils.SchedulingServiceName: + leader = &schedulingpb.Participant{} + case utils.ResourceManagerServiceName: + leader = &resource_manager.Participant{} + } ok, rev, err := etcdutil.GetProtoMsgWithModRev(m.client, m.GetLeaderPath(), leader) if err != nil { return nil, 0, err @@ -248,14 +280,14 @@ func (m *Participant) CheckLeader() (ElectionLeader, bool) { } return &EtcdLeader{ - wrapper: m, - pariticipant: leader, - revision: revision, + wrapper: m, + participant: leader, + revision: revision, }, false } // WatchLeader is used to watch the changes of the leader. -func (m *Participant) WatchLeader(ctx context.Context, leader *tsopb.Participant, revision int64) { +func (m *Participant) WatchLeader(ctx context.Context, leader participant, revision int64) { m.setLeader(leader) m.leadership.Watch(ctx, revision) m.unsetLeader() @@ -269,7 +301,7 @@ func (m *Participant) ResetLeader() { } // IsSameLeader checks whether a server is the leader itself. -func (m *Participant) IsSameLeader(leader *tsopb.Participant) bool { +func (m *Participant) IsSameLeader(leader participant) bool { return leader.GetId() == m.ID() } diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index ce392d26a391..1ed7ab4eb9ff 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -21,10 +21,10 @@ import ( "time" "github.com/docker/go-units" + "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" - "github.com/pkg/errors" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/errs" @@ -231,7 +231,7 @@ func (mc *Cluster) GetRegionLabeler() *labeler.RegionLabeler { func (mc *Cluster) SetStoreUp(storeID uint64) { store := mc.GetStore(storeID) newStore := store.Clone( - core.UpStore(), + core.SetStoreState(metapb.StoreState_Up), core.SetLastHeartbeatTS(time.Now()), ) mc.PutStore(newStore) @@ -241,7 +241,7 @@ func (mc *Cluster) SetStoreUp(storeID uint64) { func (mc *Cluster) SetStoreDisconnect(storeID uint64) { store := mc.GetStore(storeID) newStore := store.Clone( - core.UpStore(), + core.SetStoreState(metapb.StoreState_Up), core.SetLastHeartbeatTS(time.Now().Add(-time.Second*30)), ) mc.PutStore(newStore) @@ -251,7 +251,7 @@ func (mc *Cluster) SetStoreDisconnect(storeID uint64) { func (mc *Cluster) SetStoreDown(storeID uint64) { store := mc.GetStore(storeID) newStore := store.Clone( - core.UpStore(), + core.SetStoreState(metapb.StoreState_Up), core.SetLastHeartbeatTS(typeutil.ZeroTime), ) mc.PutStore(newStore) @@ -260,7 +260,7 @@ func (mc *Cluster) SetStoreDown(storeID uint64) { // SetStoreOffline sets store state to be offline. 
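The mock cluster hunks above and below replace the per-state clone options with a single SetStoreState option that takes the target metapb state, plus the physically-destroyed flag for the offline transition. A small sketch of the new call style, using only the option calls visible in this patch:

package example

import (
	"time"

	"github.com/pingcap/kvproto/pkg/metapb"
	"github.com/tikv/pd/pkg/core"
)

// markStoreUp clones the store with the consolidated state option instead of
// the old UpStore helper, refreshing the heartbeat as the mock cluster does.
func markStoreUp(store *core.StoreInfo) *core.StoreInfo {
	return store.Clone(
		core.SetStoreState(metapb.StoreState_Up),
		core.SetLastHeartbeatTS(time.Now()),
	)
}

// markStoreOffline passes the extra physically-destroyed flag that only the
// offline transition needs.
func markStoreOffline(store *core.StoreInfo) *core.StoreInfo {
	return store.Clone(core.SetStoreState(metapb.StoreState_Offline, false))
}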
func (mc *Cluster) SetStoreOffline(storeID uint64) { store := mc.GetStore(storeID) - newStore := store.Clone(core.OfflineStore(false)) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) mc.PutStore(newStore) } @@ -287,7 +287,7 @@ func (mc *Cluster) BuryStore(storeID uint64, forceBury bool) error { } } - newStore := store.Clone(core.TombstoneStore()) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) mc.PutStore(newStore) return nil } diff --git a/pkg/schedule/config/config.go b/pkg/schedule/config/config.go index f87b702e0b2d..c8fa62b8aff8 100644 --- a/pkg/schedule/config/config.go +++ b/pkg/schedule/config/config.go @@ -17,8 +17,8 @@ package config import ( "time" + "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" - "github.com/pkg/errors" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/syncutil" diff --git a/pkg/schedule/coordinator.go b/pkg/schedule/coordinator.go index 0e6c2281f2f6..7e21919b2140 100644 --- a/pkg/schedule/coordinator.go +++ b/pkg/schedule/coordinator.go @@ -379,7 +379,7 @@ func (c *Coordinator) Run() { } } log.Info("Coordinator starts to run schedulers") - c.initSchedulers() + c.InitSchedulers(true) c.wg.Add(4) // Starts to patrol regions. @@ -391,7 +391,8 @@ func (c *Coordinator) Run() { go c.driveSlowNodeScheduler() } -func (c *Coordinator) initSchedulers() { +// InitSchedulers initializes schedulers. +func (c *Coordinator) InitSchedulers(needRun bool) { var ( scheduleNames []string configs []string @@ -401,7 +402,7 @@ func (c *Coordinator) initSchedulers() { scheduleNames, configs, err = c.cluster.GetStorage().LoadAllScheduleConfig() select { case <-c.ctx.Done(): - log.Info("Coordinator stops running") + log.Info("init schedulers has been stopped") return default: } @@ -439,8 +440,10 @@ func (c *Coordinator) initSchedulers() { continue } log.Info("create scheduler with independent configuration", zap.String("scheduler-name", s.GetName())) - if err = c.schedulers.AddScheduler(s); err != nil { - log.Error("can not add scheduler with independent configuration", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", cfg.Args), errs.ZapError(err)) + if needRun { + if err = c.schedulers.AddScheduler(s); err != nil { + log.Error("can not add scheduler with independent configuration", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", cfg.Args), errs.ZapError(err)) + } } } @@ -461,12 +464,14 @@ func (c *Coordinator) initSchedulers() { } log.Info("create scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", schedulerCfg.Args)) - if err = c.schedulers.AddScheduler(s, schedulerCfg.Args...); err != nil && !errors.ErrorEqual(err, errs.ErrSchedulerExisted.FastGenByArgs()) { - log.Error("can not add scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", schedulerCfg.Args), errs.ZapError(err)) - } else { - // Only records the valid scheduler config. - scheduleCfg.Schedulers[k] = schedulerCfg - k++ + if needRun { + if err = c.schedulers.AddScheduler(s, schedulerCfg.Args...); err != nil && !errors.ErrorEqual(err, errs.ErrSchedulerExisted.FastGenByArgs()) { + log.Error("can not add scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", schedulerCfg.Args), errs.ZapError(err)) + } else { + // Only records the valid scheduler config. 
+ scheduleCfg.Schedulers[k] = schedulerCfg + k++ + } } } diff --git a/pkg/schedule/scatter/region_scatterer_test.go b/pkg/schedule/scatter/region_scatterer_test.go index bc5fd9c9e292..5bdbdcbf159d 100644 --- a/pkg/schedule/scatter/region_scatterer_test.go +++ b/pkg/schedule/scatter/region_scatterer_test.go @@ -696,7 +696,35 @@ func TestSelectedStoresTooManyPeers(t *testing.T) { } } -// TestBalanceRegion tests whether region peers and leaders are balanced after scatter. +// TestBalanceLeader only tests whether region leaders are balanced after scatter. +func TestBalanceLeader(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + opt := mockconfig.NewTestOptions() + tc := mockcluster.NewCluster(ctx, opt) + stream := hbstream.NewTestHeartbeatStreams(ctx, tc.ID, tc, false) + oc := operator.NewController(ctx, tc.GetBasicCluster(), tc.GetSharedConfig(), stream) + // Add 3 stores + for i := uint64(2); i <= 4; i++ { + tc.AddLabelsStore(i, 0, nil) + // prevent store from being disconnected + tc.SetStoreLastHeartbeatInterval(i, -10*time.Minute) + } + group := "group" + scatterer := NewRegionScatterer(ctx, tc, oc, tc.AddSuspectRegions) + for i := uint64(1001); i <= 1300; i++ { + region := tc.AddLeaderRegion(i, 2, 3, 4) + op := scatterer.scatterRegion(region, group, false) + re.False(isPeerCountChanged(op)) + } + // all leader will be balanced in three stores. + for i := uint64(2); i <= 4; i++ { + re.Equal(uint64(100), scatterer.ordinaryEngine.selectedLeader.Get(i, group)) + } +} + +// TestBalanceRegion tests whether region peers are balanced after scatter. // ref https://github.com/tikv/pd/issues/6017 func TestBalanceRegion(t *testing.T) { re := require.New(t) @@ -722,7 +750,6 @@ func TestBalanceRegion(t *testing.T) { } for i := uint64(2); i <= 7; i++ { re.Equal(uint64(150), scatterer.ordinaryEngine.selectedPeer.Get(i, group)) - re.Equal(uint64(50), scatterer.ordinaryEngine.selectedLeader.Get(i, group)) } // Test for unhealthy region // ref https://github.com/tikv/pd/issues/6099 diff --git a/pkg/statistics/region.go b/pkg/statistics/region.go index f39c58ed81cc..f78c8c6add38 100644 --- a/pkg/statistics/region.go +++ b/pkg/statistics/region.go @@ -59,10 +59,12 @@ func (s *RegionStats) Observe(r *core.RegionInfo) { approximateKeys := r.GetApproximateKeys() approximateSize := r.GetApproximateSize() approximateKvSize := r.GetApproximateKvSize() - if approximateSize <= core.EmptyRegionApproximateSize { + if approximateSize == core.EmptyRegionApproximateSize { s.EmptyCount++ } - s.StorageSize += approximateSize + if !r.IsEmptyRegion() { + s.StorageSize += approximateSize + } s.UserStorageSize += approximateKvSize s.StorageKeys += approximateKeys leader := r.GetLeader() diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index 6d3ac3f17608..c24a1581d266 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -198,7 +198,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store return false }(), LearnerPeer: len(region.GetLearners()) > 0, - EmptyRegion: region.GetApproximateSize() <= core.EmptyRegionApproximateSize, + EmptyRegion: region.IsEmptyRegion(), OversizedRegion: region.IsOversized( int64(r.conf.GetRegionMaxSize()), int64(r.conf.GetRegionMaxKeys()), @@ -206,7 +206,7 @@ func (r *RegionStatistics) Observe(region *core.RegionInfo, stores []*core.Store UndersizedRegion: region.NeedMerge( int64(r.conf.GetMaxMergeRegionSize()), 
int64(r.conf.GetMaxMergeRegionKeys()), - ), + ) && region.GetApproximateSize() >= core.EmptyRegionApproximateSize, WitnessLeader: region.GetLeader().GetIsWitness(), } // Check if the region meets any of the conditions and update the corresponding info. diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index bc15c648598b..232fb8b73d8b 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -59,11 +59,11 @@ func TestRegionStatistics(t *testing.T) { {Peer: peers[1], DownSeconds: 3608}, } - store3 := stores[3].Clone(core.OfflineStore(false)) + store3 := stores[3].Clone(core.SetStoreState(metapb.StoreState_Offline, false)) stores[3] = store3 r1 := &metapb.Region{Id: 1, Peers: peers, StartKey: []byte("aa"), EndKey: []byte("bb")} r2 := &metapb.Region{Id: 2, Peers: peers[0:2], StartKey: []byte("cc"), EndKey: []byte("dd")} - region1 := core.NewRegionInfo(r1, peers[0]) + region1 := core.NewRegionInfo(r1, peers[0], core.SetApproximateSize(1)) region2 := core.NewRegionInfo(r2, peers[0]) regionStats := NewRegionStatistics(nil, opt, manager) regionStats.Observe(region1, stores) @@ -97,7 +97,8 @@ func TestRegionStatistics(t *testing.T) { re.Len(regionStats.stats[PendingPeer], 1) re.Len(regionStats.stats[LearnerPeer], 1) re.Len(regionStats.stats[OversizedRegion], 1) - re.Len(regionStats.stats[UndersizedRegion], 1) + re.Len(regionStats.stats[UndersizedRegion], 0) + re.Len(regionStats.stats[EmptyRegion], 0) re.Len(regionStats.stats[OfflinePeer], 1) region1 = region1.Clone(core.WithRemoveStorePeer(7)) @@ -109,7 +110,7 @@ func TestRegionStatistics(t *testing.T) { re.Empty(regionStats.stats[LearnerPeer]) re.Empty(regionStats.stats[OfflinePeer]) - store3 = stores[3].Clone(core.UpStore()) + store3 = stores[3].Clone(core.SetStoreState(metapb.StoreState_Up)) stores[3] = store3 regionStats.Observe(region1, stores) re.Empty(regionStats.stats[OfflinePeer]) diff --git a/pkg/statistics/store_collection.go b/pkg/statistics/store_collection.go index 2b695ed29230..74008014ddb2 100644 --- a/pkg/statistics/store_collection.go +++ b/pkg/statistics/store_collection.go @@ -61,7 +61,7 @@ func newStoreStatistics(opt *config.PersistOptions) *storeStatistics { } } -func (s *storeStatistics) Observe(store *core.StoreInfo, stats *StoresStats) { +func (s *storeStatistics) Observe(store *core.StoreInfo) { for _, k := range s.opt.GetLocationLabels() { v := store.GetLabelValue(k) if v == "" { @@ -146,8 +146,12 @@ func (s *storeStatistics) Observe(store *core.StoreInfo, stats *StoresStats) { storeStatusGauge.WithLabelValues(storeAddress, id, "store_slow_trend_result_value").Set(slowTrend.ResultValue) storeStatusGauge.WithLabelValues(storeAddress, id, "store_slow_trend_result_rate").Set(slowTrend.ResultRate) } +} +func (s *storeStatistics) ObserveHotStat(store *core.StoreInfo, stats *StoresStats) { // Store flows. 
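Splitting Observe into Observe and ObserveHotStat (here and in the map wrapper below) lets callers that only need label and state gauges skip the rolling flow statistics. The concrete collector type is unexported, so the sketch below models the post-split call contract with a local interface; it is an illustration of the calling convention, not the package's own API.

package example

import (
	"github.com/tikv/pd/pkg/core"
	"github.com/tikv/pd/pkg/statistics"
)

// storeObserver models the two-method surface after the split; the real
// collector in pkg/statistics is unexported.
type storeObserver interface {
	Observe(store *core.StoreInfo)
	ObserveHotStat(store *core.StoreInfo, stats *statistics.StoresStats)
}

// collect mirrors the updated call sites: state and label gauges for every
// store, flow statistics only when rolling store stats are available.
func collect(obs storeObserver, stores []*core.StoreInfo, stats *statistics.StoresStats) {
	for _, store := range stores {
		obs.Observe(store)
		if stats != nil {
			obs.ObserveHotStat(store, stats)
		}
	}
}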
+ storeAddress := store.GetAddress() + id := strconv.FormatUint(store.GetID(), 10) storeFlowStats := stats.GetRollingStoreStats(store.GetID()) if storeFlowStats == nil { return @@ -298,8 +302,12 @@ func NewStoreStatisticsMap(opt *config.PersistOptions) *storeStatisticsMap { } } -func (m *storeStatisticsMap) Observe(store *core.StoreInfo, stats *StoresStats) { - m.stats.Observe(store, stats) +func (m *storeStatisticsMap) Observe(store *core.StoreInfo) { + m.stats.Observe(store) +} + +func (m *storeStatisticsMap) ObserveHotStat(store *core.StoreInfo, stats *StoresStats) { + m.stats.ObserveHotStat(store, stats) } func (m *storeStatisticsMap) Collect() { diff --git a/pkg/statistics/store_collection_test.go b/pkg/statistics/store_collection_test.go index 229339cb4c4c..054e55a9fdad 100644 --- a/pkg/statistics/store_collection_test.go +++ b/pkg/statistics/store_collection_test.go @@ -55,7 +55,7 @@ func TestStoreStatistics(t *testing.T) { stores = append(stores, s) } - store3 := stores[3].Clone(core.OfflineStore(false)) + store3 := stores[3].Clone(core.SetStoreState(metapb.StoreState_Offline, false)) stores[3] = store3 store4 := stores[4].Clone(core.SetLastHeartbeatTS(stores[4].GetLastHeartbeatTS().Add(-time.Hour))) stores[4] = store4 @@ -67,7 +67,8 @@ func TestStoreStatistics(t *testing.T) { stores[5] = store5 storeStats := NewStoreStatisticsMap(opt) for _, store := range stores { - storeStats.Observe(store, storesStats) + storeStats.Observe(store) + storeStats.ObserveHotStat(store, storesStats) } stats := storeStats.stats diff --git a/pkg/statistics/store_test.go b/pkg/statistics/store_test.go index 9f5d9a1cc42e..a0e7140a8823 100644 --- a/pkg/statistics/store_test.go +++ b/pkg/statistics/store_test.go @@ -35,7 +35,7 @@ func TestFilterUnhealtyStore(t *testing.T) { re.Len(stats.GetStoresLoads(), 5) cluster.PutStore(cluster.GetStore(1).Clone(core.SetLastHeartbeatTS(time.Now().Add(-24 * time.Hour)))) - cluster.PutStore(cluster.GetStore(2).Clone(core.TombstoneStore())) + cluster.PutStore(cluster.GetStore(2).Clone(core.SetStoreState(metapb.StoreState_Tombstone))) cluster.DeleteStore(cluster.GetStore(3)) stats.FilterUnhealthyStore(cluster) diff --git a/pkg/storage/endpoint/key_path.go b/pkg/storage/endpoint/key_path.go index 872d994a270c..85af79203a43 100644 --- a/pkg/storage/endpoint/key_path.go +++ b/pkg/storage/endpoint/key_path.go @@ -121,6 +121,17 @@ func StorePath(storeID uint64) string { return path.Join(clusterPath, "s", fmt.Sprintf("%020d", storeID)) } +// StorePathPrefix returns the store meta info key path prefix. +func StorePathPrefix(clusterID uint64) string { + return path.Join(PDRootPath(clusterID), clusterPath, "s") + "/" +} + +// ExtractStoreIDFromPath extracts the store ID from the given path. 
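StorePathPrefix and ExtractStoreIDFromPath boil down to a prefix-strip-and-parse step over the %020d-padded store keys. A standalone sketch of that step; the prefix argument here is illustrative, since the real prefix is derived from the cluster ID.

package example

import (
	"strconv"
	"strings"
)

// extractID strips a known key prefix and the zero padding of a %020d-encoded
// ID, then parses the remaining digits. For example,
// extractID("example/s/00000000000000000042", "example/s/") returns 42.
func extractID(key, prefix string) (uint64, error) {
	idStr := strings.TrimLeft(strings.TrimPrefix(key, prefix), "0")
	return strconv.ParseUint(idStr, 10, 64)
}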
+func ExtractStoreIDFromPath(clusterID uint64, path string) (uint64, error) { + idStr := strings.TrimLeft(strings.TrimPrefix(path, StorePathPrefix(clusterID)), "0") + return strconv.ParseUint(idStr, 10, 64) +} + func storeLeaderWeightPath(storeID uint64) string { return path.Join(schedulePath, "store_weight", fmt.Sprintf("%020d", storeID), "leader") } diff --git a/pkg/storage/kv/kv_test.go b/pkg/storage/kv/kv_test.go index d2db558a7488..93359934da19 100644 --- a/pkg/storage/kv/kv_test.go +++ b/pkg/storage/kv/kv_test.go @@ -24,21 +24,12 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/etcdutil" "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) func TestEtcd(t *testing.T) { re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - defer etcd.Close() - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() rootPath := path.Join("/pd", strconv.FormatUint(100, 10)) kv := NewEtcdKVBase(client, rootPath) diff --git a/pkg/storage/storage_tso_test.go b/pkg/storage/storage_tso_test.go index 1dbba2895120..9718565d78f2 100644 --- a/pkg/storage/storage_tso_test.go +++ b/pkg/storage/storage_tso_test.go @@ -23,28 +23,14 @@ import ( "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/utils/etcdutil" - "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) func TestSaveLoadTimestamp(t *testing.T) { re := require.New(t) - - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - defer etcd.Close() - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - rootPath := path.Join("/pd", strconv.FormatUint(100, 10)) - storage := NewStorageWithEtcdBackend(client, rootPath) - + storage, clean := newTestStorage(t) + defer clean() expectedTS := time.Now().Round(0) - err = storage.SaveTimestamp(endpoint.TimestampKey, expectedTS) + err := storage.SaveTimestamp(endpoint.TimestampKey, expectedTS) re.NoError(err) ts, err := storage.LoadTimestamp("") re.NoError(err) @@ -53,27 +39,15 @@ func TestSaveLoadTimestamp(t *testing.T) { func TestGlobalLocalTimestamp(t *testing.T) { re := require.New(t) - - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - defer etcd.Close() - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - rootPath := path.Join("/pd", strconv.FormatUint(100, 10)) - storage := NewStorageWithEtcdBackend(client, rootPath) - + storage, clean := newTestStorage(t) + defer clean() ltaKey := "lta" dc1LocationKey, dc2LocationKey := "dc1", "dc2" localTS1 := time.Now().Round(0) l1 := path.Join(ltaKey, dc1LocationKey, endpoint.TimestampKey) l2 := path.Join(ltaKey, dc2LocationKey, endpoint.TimestampKey) - err = storage.SaveTimestamp(l1, localTS1) + err := storage.SaveTimestamp(l1, localTS1) re.NoError(err) globalTS := time.Now().Round(0) err = storage.SaveTimestamp(endpoint.TimestampKey, globalTS) @@ -93,22 +67,10 @@ func TestGlobalLocalTimestamp(t *testing.T) { func TestTimestampTxn(t *testing.T) { re := require.New(t) - - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - defer etcd.Close() - - ep := cfg.LCUrls[0].String() - client, err := 
clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - re.NoError(err) - rootPath := path.Join("/pd", strconv.FormatUint(100, 10)) - storage := NewStorageWithEtcdBackend(client, rootPath) - + storage, clean := newTestStorage(t) + defer clean() globalTS1 := time.Now().Round(0) - err = storage.SaveTimestamp(endpoint.TimestampKey, globalTS1) + err := storage.SaveTimestamp(endpoint.TimestampKey, globalTS1) re.NoError(err) globalTS2 := globalTS1.Add(-time.Millisecond).Round(0) @@ -119,3 +81,9 @@ func TestTimestampTxn(t *testing.T) { re.NoError(err) re.Equal(globalTS1, ts) } + +func newTestStorage(t *testing.T) (Storage, func()) { + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + rootPath := path.Join("/pd", strconv.FormatUint(100, 10)) + return NewStorageWithEtcdBackend(client, rootPath), clean +} diff --git a/pkg/tso/allocator_manager.go b/pkg/tso/allocator_manager.go index b4a05f88d6c3..55632f9f5ba9 100644 --- a/pkg/tso/allocator_manager.go +++ b/pkg/tso/allocator_manager.go @@ -1198,6 +1198,9 @@ func (am *AllocatorManager) getAllocatorGroup(dcLocation string) (*allocatorGrou func (am *AllocatorManager) GetAllocator(dcLocation string) (Allocator, error) { am.mu.RLock() defer am.mu.RUnlock() + if len(dcLocation) == 0 { + dcLocation = GlobalDCLocation + } allocatorGroup, exist := am.mu.allocatorGroups[dcLocation] if !exist { return nil, errs.ErrGetAllocator.FastGenByArgs(fmt.Sprintf("%s allocator not found", dcLocation)) diff --git a/pkg/tso/keyspace_group_manager.go b/pkg/tso/keyspace_group_manager.go index 7ed22163c1e9..c6d2323aa4bc 100644 --- a/pkg/tso/keyspace_group_manager.go +++ b/pkg/tso/keyspace_group_manager.go @@ -29,6 +29,7 @@ import ( perrors "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/kvproto/pkg/tsopb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/election" "github.com/tikv/pd/pkg/errs" @@ -73,16 +74,23 @@ type state struct { keyspaceLookupTable map[uint32]uint32 // splittingGroups is the cache of splitting keyspace group related information. // The key is the keyspace group ID, and the value is the time when the keyspace group - // is created as the split target. + // is created as the split target. Once the split is finished, the keyspace group will + // be removed from this map. splittingGroups map[uint32]time.Time // deletedGroups is the cache of deleted keyspace group related information. + // Being merged will cause the group to be added to this map and finally be deleted after the merge. deletedGroups map[uint32]struct{} + // requestedGroups is the cache of requested keyspace group related information. + // Once a group receives its first TSO request and pass the certain check, it will be added to this map. + // Being merged will cause the group to be removed from this map eventually if the merge is successful. + requestedGroups map[uint32]struct{} } func (s *state) initialize() { s.keyspaceLookupTable = make(map[uint32]uint32) s.splittingGroups = make(map[uint32]time.Time) s.deletedGroups = make(map[uint32]struct{}) + s.requestedGroups = make(map[uint32]struct{}) } func (s *state) deInitialize() { @@ -146,7 +154,44 @@ func (s *state) getDeletedGroupNum() int { return len(s.deletedGroups) } -func (s *state) checkTSOSplit( +// cleanKeyspaceGroup cleans the given keyspace group from the state. +// NOTICE: currently the only legal way to delete a keyspace group is +// to merge it into another one. 
This function is used to clean up the +// remaining info after the merge has been finished. +func (s *state) cleanKeyspaceGroup(groupID uint32) { + s.Lock() + defer s.Unlock() + delete(s.deletedGroups, groupID) + delete(s.requestedGroups, groupID) +} + +// markGroupRequested checks if the given keyspace group has been requested and should be marked. +// If yes, it will do nothing and return nil directly. +// If not, it will try to mark the keyspace group as requested inside a critical section, which +// will call the checker passed in to check if the keyspace group is qualified to be marked as requested. +// Any error encountered during the check will be returned to the caller. +func (s *state) markGroupRequested(groupID uint32, checker func() error) error { + // Fast path to check if the keyspace group has been marked as requested. + s.RLock() + _, ok := s.requestedGroups[groupID] + s.RUnlock() + if ok { + return nil + } + s.Lock() + defer s.Unlock() + // Double check if the keyspace group has been marked as requested. + if _, ok := s.requestedGroups[groupID]; ok { + return nil + } + if err := checker(); err != nil { + return err + } + s.requestedGroups[groupID] = struct{}{} + return nil +} + +func (s *state) checkGroupSplit( targetGroupID uint32, ) (splitTargetAM, splitSourceAM *AllocatorManager, err error) { s.RLock() @@ -168,7 +213,7 @@ func (s *state) checkTSOSplit( // Reject any request if the keyspace group is in merging state, // we need to wait for the merging checker to finish the TSO merging. -func (s *state) checkTSOMerge( +func (s *state) checkGroupMerge( groupID uint32, ) error { s.RLock() @@ -692,10 +737,13 @@ func (kgm *KeyspaceGroupManager) updateKeyspaceGroup(group *endpoint.KeyspaceGro zap.String("participant-name", uniqueName), zap.Uint64("participant-id", uniqueID)) // Initialize the participant info to join the primary election. - participant := member.NewParticipant(kgm.etcdClient) - participant.InitInfo( - uniqueName, uniqueID, endpoint.KeyspaceGroupsElectionPath(kgm.tsoSvcRootPath, group.ID), - mcsutils.PrimaryKey, "keyspace group primary election", kgm.cfg.GetAdvertiseListenAddr()) + participant := member.NewParticipant(kgm.etcdClient, mcsutils.TSOServiceName) + p := &tsopb.Participant{ + Name: uniqueName, + Id: uniqueID, // id is unique among all participants + ListenUrls: []string{kgm.cfg.GetAdvertiseListenAddr()}, + } + participant.InitInfo(p, endpoint.KeyspaceGroupsElectionPath(kgm.tsoSvcRootPath, group.ID), mcsutils.PrimaryKey, "keyspace group primary election") // If the keyspace group is in split, we should ensure that the primary elected by the new keyspace group // is always on the same TSO Server node as the primary of the old keyspace group, and this constraint cannot // be broken until the entire split process is completed. @@ -1007,6 +1055,7 @@ func (kgm *KeyspaceGroupManager) GetKeyspaceGroups() map[uint32]*endpoint.Keyspa // HandleTSORequest forwards TSO allocation requests to correct TSO Allocators of the given keyspace group. 
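markGroupRequested, used below by HandleTSORequest, is a classic double-checked membership guard: a read-locked fast path for the common case, then a write lock with a re-check so the potentially expensive checker runs at most once per keyspace group even under concurrent first requests. The same pattern in isolation:

package example

import "sync"

// requestedSet is a standalone version of the double-checked guard; the real
// state struct embeds the lock together with the other keyspace group maps.
type requestedSet struct {
	sync.RWMutex
	requested map[uint32]struct{}
}

func newRequestedSet() *requestedSet {
	return &requestedSet{requested: make(map[uint32]struct{})}
}

func (s *requestedSet) markRequested(id uint32, checker func() error) error {
	// Fast path: most requests hit an already-marked group.
	s.RLock()
	_, ok := s.requested[id]
	s.RUnlock()
	if ok {
		return nil
	}
	// Slow path: re-check under the write lock before running the checker.
	s.Lock()
	defer s.Unlock()
	if _, ok := s.requested[id]; ok {
		return nil
	}
	if err := checker(); err != nil {
		return err
	}
	s.requested[id] = struct{}{}
	return nil
}

Keeping the checker inside the write lock is what gives the one-time semantics: running it outside would let two concurrent first requests both trigger the extra timestamp sync.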
func (kgm *KeyspaceGroupManager) HandleTSORequest( + ctx context.Context, keyspaceID, keyspaceGroupID uint32, dcLocation string, count uint32, ) (ts pdpb.Timestamp, curKeyspaceGroupID uint32, err error) { @@ -1021,11 +1070,26 @@ func (kgm *KeyspaceGroupManager) HandleTSORequest( if err != nil { return pdpb.Timestamp{}, curKeyspaceGroupID, err } - err = kgm.state.checkTSOMerge(curKeyspaceGroupID) + err = kgm.state.checkGroupMerge(curKeyspaceGroupID) if err != nil { return pdpb.Timestamp{}, curKeyspaceGroupID, err } - ts, err = am.HandleRequest(context.Background(), dcLocation, count) + // If this is the first time to request the keyspace group, we need to sync the + // timestamp one more time before serving the TSO request to make sure that the + // TSO is the latest one from the storage, which could prevent the potential + // fallback caused by the rolling update of the mixed old PD and TSO service deployment. + err = kgm.markGroupRequested(curKeyspaceGroupID, func() error { + allocator, err := am.GetAllocator(dcLocation) + if err != nil { + return err + } + // TODO: support the Local TSO Allocator. + return allocator.Initialize(0) + }) + if err != nil { + return pdpb.Timestamp{}, curKeyspaceGroupID, err + } + ts, err = am.HandleRequest(ctx, dcLocation, count) return ts, curKeyspaceGroupID, err } @@ -1096,7 +1160,7 @@ func (kgm *KeyspaceGroupManager) checkTSOSplit( keyspaceGroupID uint32, dcLocation string, ) error { - splitTargetAM, splitSourceAM, err := kgm.state.checkTSOSplit(keyspaceGroupID) + splitTargetAM, splitSourceAM, err := kgm.state.checkGroupSplit(keyspaceGroupID) if err != nil || splitTargetAM == nil { return err } @@ -1149,6 +1213,7 @@ const keyspaceGroupsAPIPrefix = "/pd/api/v2/tso/keyspace-groups" // Put the code below into the critical section to prevent from sending too many HTTP requests. func (kgm *KeyspaceGroupManager) finishSplitKeyspaceGroup(id uint32) error { + start := time.Now() kgm.Lock() defer kgm.Unlock() // Check if the keyspace group is in split state. @@ -1160,6 +1225,7 @@ func (kgm *KeyspaceGroupManager) finishSplitKeyspaceGroup(id uint32) error { if kgm.httpClient == nil { return nil } + startRequest := time.Now() statusCode, err := apiutil.DoDelete( kgm.httpClient, kgm.cfg.GeBackendEndpoints()+keyspaceGroupsAPIPrefix+fmt.Sprintf("/%d/split", id)) @@ -1172,6 +1238,7 @@ func (kgm *KeyspaceGroupManager) finishSplitKeyspaceGroup(id uint32) error { zap.Int("status-code", statusCode)) return errs.ErrSendRequest.FastGenByArgs() } + kgm.metrics.finishSplitSendDuration.Observe(time.Since(startRequest).Seconds()) // Pre-update the split keyspace group's split state in memory. // Note: to avoid data race with state read APIs, we always replace the group in memory as a whole. // For now, we only have scenarios to update split state/merge state, and the other fields are always @@ -1179,10 +1246,12 @@ func (kgm *KeyspaceGroupManager) finishSplitKeyspaceGroup(id uint32) error { newSplitGroup := *splitGroup newSplitGroup.SplitState = nil kgm.kgs[id] = &newSplitGroup + kgm.metrics.finishSplitDuration.Observe(time.Since(start).Seconds()) return nil } func (kgm *KeyspaceGroupManager) finishMergeKeyspaceGroup(id uint32) error { + start := time.Now() kgm.Lock() defer kgm.Unlock() // Check if the keyspace group is in the merging state. 
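finishSplitKeyspaceGroup above (and finishMergeKeyspaceGroup below) now time two scopes: the whole locked operation and just the backend HTTP round trip, so lock wait and network latency can be told apart on a dashboard. A sketch of that pattern with the observers passed in; they are assumed to be pre-labeled entries of a HistogramVec, as in pkg/tso/metrics.go, and the control flow is simplified.

package example

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// timedFinish records the full operation duration with `total` and only the
// backend call duration with `send`.
func timedFinish(total, send prometheus.Observer, doSend func() error) error {
	start := time.Now()
	defer func() {
		total.Observe(time.Since(start).Seconds())
	}()

	startRequest := time.Now()
	if err := doSend(); err != nil {
		return err
	}
	send.Observe(time.Since(startRequest).Seconds())
	return nil
}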
@@ -1194,6 +1263,7 @@ func (kgm *KeyspaceGroupManager) finishMergeKeyspaceGroup(id uint32) error { if kgm.httpClient == nil { return nil } + startRequest := time.Now() statusCode, err := apiutil.DoDelete( kgm.httpClient, kgm.cfg.GeBackendEndpoints()+keyspaceGroupsAPIPrefix+fmt.Sprintf("/%d/merge", id)) @@ -1206,7 +1276,7 @@ func (kgm *KeyspaceGroupManager) finishMergeKeyspaceGroup(id uint32) error { zap.Int("status-code", statusCode)) return errs.ErrSendRequest.FastGenByArgs() } - + kgm.metrics.finishMergeSendDuration.Observe(time.Since(startRequest).Seconds()) // Pre-update the merge target keyspace group's merge state in memory. // Note: to avoid data race with state read APIs, we always replace the group in memory as a whole. // For now, we only have scenarios to update split state/merge state, and the other fields are always @@ -1214,6 +1284,7 @@ func (kgm *KeyspaceGroupManager) finishMergeKeyspaceGroup(id uint32) error { newTargetGroup := *mergeTarget newTargetGroup.MergeState = nil kgm.kgs[id] = &newTargetGroup + kgm.metrics.finishMergeDuration.Observe(time.Since(start).Seconds()) return nil } @@ -1400,7 +1471,7 @@ func (kgm *KeyspaceGroupManager) groupSplitPatroller() { zap.Uint32("keyspace-group-id", groupID), zap.Uint32("keyspace-id", group.Keyspaces[0])) // Request the TSO manually to speed up the split process. - _, _, err := kgm.HandleTSORequest(group.Keyspaces[0], groupID, GlobalDCLocation, 1) + _, _, err := kgm.HandleTSORequest(kgm.ctx, group.Keyspaces[0], groupID, GlobalDCLocation, 1) if err != nil { log.Warn("failed to request tso for the splitting keyspace group", zap.Uint32("keyspace-group-id", groupID), @@ -1465,9 +1536,7 @@ func (kgm *KeyspaceGroupManager) deletedGroupCleaner() { zap.Error(err)) continue } - kgm.Lock() - delete(kgm.deletedGroups, groupID) - kgm.Unlock() + kgm.cleanKeyspaceGroup(groupID) lastDeletedGroupID = groupID lastDeletedGroupNum += 1 } diff --git a/pkg/tso/keyspace_group_manager_test.go b/pkg/tso/keyspace_group_manager_test.go index a4c02f1c6965..91c68a5a2685 100644 --- a/pkg/tso/keyspace_group_manager_test.go +++ b/pkg/tso/keyspace_group_manager_test.go @@ -35,6 +35,7 @@ import ( "github.com/tikv/pd/pkg/mcs/discovery" mcsutils "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/storage/endpoint" + "github.com/tikv/pd/pkg/utils/etcdutil" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/utils/tsoutil" @@ -67,7 +68,8 @@ func (suite *keyspaceGroupManagerTestSuite) SetupSuite() { t := suite.T() suite.ctx, suite.cancel = context.WithCancel(context.Background()) suite.ClusterID = rand.Uint64() - suite.backendEndpoints, suite.etcdClient, suite.clean = startEmbeddedEtcd(t) + servers, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + suite.backendEndpoints, suite.etcdClient, suite.clean = servers[0].Config().LCUrls[0].String(), client, clean suite.cfg = suite.createConfig() } @@ -665,19 +667,19 @@ func (suite *keyspaceGroupManagerTestSuite) TestHandleTSORequestWithWrongMembers // Should succeed because keyspace 0 is actually in keyspace group 0, which is served // by the current keyspace group manager, instead of keyspace group 1 in ask, and // keyspace group 0 is returned in the response. 
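The test suites in this patch keep replacing the hand-rolled embed.StartEtcd / clientv3.New / ReadyNotify boilerplate with etcdutil.NewTestEtcdCluster, which hands back the embedded servers, a ready client, and a cleanup function. A minimal usage sketch; the key and value are arbitrary.

package example

import (
	"context"
	"testing"

	"github.com/stretchr/testify/require"
	"github.com/tikv/pd/pkg/utils/etcdutil"
)

// TestEtcdHelperSketch starts a single-member embedded etcd cluster and uses
// the returned client directly.
func TestEtcdHelperSketch(t *testing.T) {
	re := require.New(t)
	servers, client, clean := etcdutil.NewTestEtcdCluster(t, 1)
	defer clean()

	// The backend endpoint is available from the embedded server config.
	endpoint := servers[0].Config().LCUrls[0].String()
	re.NotEmpty(endpoint)

	_, err := client.Put(context.Background(), "test/key", "value")
	re.NoError(err)
	resp, err := client.Get(context.Background(), "test/key")
	re.NoError(err)
	re.Len(resp.Kvs, 1)
}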
- _, keyspaceGroupBelongTo, err := mgr.HandleTSORequest(0, 1, GlobalDCLocation, 1) + _, keyspaceGroupBelongTo, err := mgr.HandleTSORequest(suite.ctx, 0, 1, GlobalDCLocation, 1) re.NoError(err) re.Equal(uint32(0), keyspaceGroupBelongTo) // Should succeed because keyspace 100 doesn't belong to any keyspace group, so it will // be served by the default keyspace group 0, and keyspace group 0 is returned in the response. - _, keyspaceGroupBelongTo, err = mgr.HandleTSORequest(100, 0, GlobalDCLocation, 1) + _, keyspaceGroupBelongTo, err = mgr.HandleTSORequest(suite.ctx, 100, 0, GlobalDCLocation, 1) re.NoError(err) re.Equal(uint32(0), keyspaceGroupBelongTo) // Should fail because keyspace 100 doesn't belong to any keyspace group, and the keyspace group // 1 in ask doesn't exist. - _, keyspaceGroupBelongTo, err = mgr.HandleTSORequest(100, 1, GlobalDCLocation, 1) + _, keyspaceGroupBelongTo, err = mgr.HandleTSORequest(suite.ctx, 100, 1, GlobalDCLocation, 1) re.Error(err) re.Equal(uint32(1), keyspaceGroupBelongTo) } @@ -1107,7 +1109,7 @@ func (suite *keyspaceGroupManagerTestSuite) TestPrimaryPriorityChange() { // And the primaries on TSO Server 1 should continue to serve TSO requests without any failures. for i := 0; i < 100; i++ { for _, id := range ids { - _, keyspaceGroupBelongTo, err := mgr1.HandleTSORequest(id, id, GlobalDCLocation, 1) + _, keyspaceGroupBelongTo, err := mgr1.HandleTSORequest(suite.ctx, id, id, GlobalDCLocation, 1) re.NoError(err) re.Equal(id, keyspaceGroupBelongTo) } @@ -1203,7 +1205,7 @@ func checkTSO( return default: } - respTS, respGroupID, err := mgr.HandleTSORequest(id, id, GlobalDCLocation, 1) + respTS, respGroupID, err := mgr.HandleTSORequest(ctx, id, id, GlobalDCLocation, 1) // omit the error check since there are many kinds of errors during primaries movement if err != nil { continue @@ -1226,7 +1228,7 @@ func waitForPrimariesServing( if member, err := mgrs[j].GetElectionMember(id, id); err != nil || !member.IsLeader() { return false } - if _, _, err := mgrs[j].HandleTSORequest(id, id, GlobalDCLocation, 1); err != nil { + if _, _, err := mgrs[j].HandleTSORequest(mgrs[j].ctx, id, id, GlobalDCLocation, 1); err != nil { return false } } diff --git a/pkg/tso/metrics.go b/pkg/tso/metrics.go index 754610aa22e6..02e72ebb376f 100644 --- a/pkg/tso/metrics.go +++ b/pkg/tso/metrics.go @@ -17,16 +17,18 @@ package tso import "github.com/prometheus/client_golang/prometheus" const ( - dcLabel = "dc" - typeLabel = "type" - groupLabel = "group" + pdNamespace = "pd" + tsoNamespace = "tso" + dcLabel = "dc" + typeLabel = "type" + groupLabel = "group" ) var ( // TSO metrics tsoCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ - Namespace: "pd", + Namespace: pdNamespace, Subsystem: "tso", Name: "events", Help: "Counter of tso events", @@ -34,7 +36,7 @@ var ( tsoGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: "pd", + Namespace: pdNamespace, Subsystem: "cluster", Name: "tso", Help: "Record of tso metadata.", @@ -42,15 +44,24 @@ var ( tsoGap = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: "pd", + Namespace: pdNamespace, Subsystem: "cluster", Name: "tso_gap_millionseconds", Help: "The minimal (non-zero) TSO gap for each DC.", }, []string{groupLabel, dcLabel}) + tsoOpDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: pdNamespace, + Subsystem: "cluster", + Name: "tso_operation_duration_seconds", + Help: "Bucketed histogram of processing time(s) of the TSO operations.", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + 
}, []string{typeLabel, groupLabel, dcLabel}) + tsoAllocatorRole = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: "pd", + Namespace: pdNamespace, Subsystem: "tso", Name: "role", Help: "Indicate the PD server role info, whether it's a TSO allocator.", @@ -59,7 +70,7 @@ var ( // Keyspace Group metrics keyspaceGroupStateGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Namespace: "pd", + Namespace: tsoNamespace, Subsystem: "keyspace_group", Name: "state", Help: "Gauge of the Keyspace Group states.", @@ -67,7 +78,7 @@ var ( keyspaceGroupOpDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ - Namespace: "pd", + Namespace: tsoNamespace, Subsystem: "keyspace_group", Name: "operation_duration_seconds", Help: "Bucketed histogram of processing time(s) of the Keyspace Group operations.", @@ -79,6 +90,7 @@ func init() { prometheus.MustRegister(tsoCounter) prometheus.MustRegister(tsoGauge) prometheus.MustRegister(tsoGap) + prometheus.MustRegister(tsoOpDuration) prometheus.MustRegister(tsoAllocatorRole) prometheus.MustRegister(keyspaceGroupStateGauge) prometheus.MustRegister(keyspaceGroupOpDuration) @@ -87,6 +99,7 @@ func init() { type tsoMetrics struct { // timestampOracle event counter syncEvent prometheus.Counter + skipSyncEvent prometheus.Counter syncOKEvent prometheus.Counter errSaveSyncTSEvent prometheus.Counter errLeaseResetTSEvent prometheus.Counter @@ -103,6 +116,10 @@ type tsoMetrics struct { notLeaderAnymoreEvent prometheus.Counter logicalOverflowEvent prometheus.Counter exceededMaxRetryEvent prometheus.Counter + // timestampOracle operation duration + syncSaveDuration prometheus.Observer + resetSaveDuration prometheus.Observer + updateSaveDuration prometheus.Observer // allocator event counter notLeaderEvent prometheus.Counter globalTSOSyncEvent prometheus.Counter @@ -119,6 +136,7 @@ type tsoMetrics struct { func newTSOMetrics(groupID, dcLocation string) *tsoMetrics { return &tsoMetrics{ syncEvent: tsoCounter.WithLabelValues("sync", groupID, dcLocation), + skipSyncEvent: tsoCounter.WithLabelValues("skip_sync", groupID, dcLocation), syncOKEvent: tsoCounter.WithLabelValues("sync_ok", groupID, dcLocation), errSaveSyncTSEvent: tsoCounter.WithLabelValues("err_save_sync_ts", groupID, dcLocation), errLeaseResetTSEvent: tsoCounter.WithLabelValues("err_lease_reset_ts", groupID, dcLocation), @@ -135,6 +153,9 @@ func newTSOMetrics(groupID, dcLocation string) *tsoMetrics { notLeaderAnymoreEvent: tsoCounter.WithLabelValues("not_leader_anymore", groupID, dcLocation), logicalOverflowEvent: tsoCounter.WithLabelValues("logical_overflow", groupID, dcLocation), exceededMaxRetryEvent: tsoCounter.WithLabelValues("exceeded_max_retry", groupID, dcLocation), + syncSaveDuration: tsoOpDuration.WithLabelValues("sync_save", groupID, dcLocation), + resetSaveDuration: tsoOpDuration.WithLabelValues("reset_save", groupID, dcLocation), + updateSaveDuration: tsoOpDuration.WithLabelValues("update_save", groupID, dcLocation), notLeaderEvent: tsoCounter.WithLabelValues("not_leader", groupID, dcLocation), globalTSOSyncEvent: tsoCounter.WithLabelValues("global_tso_sync", groupID, dcLocation), globalTSOEstimateEvent: tsoCounter.WithLabelValues("global_tso_estimate", groupID, dcLocation), @@ -148,21 +169,29 @@ func newTSOMetrics(groupID, dcLocation string) *tsoMetrics { } type keyspaceGroupMetrics struct { - splitSourceGauge prometheus.Gauge - splitTargetGauge prometheus.Gauge - mergeSourceGauge prometheus.Gauge - mergeTargetGauge prometheus.Gauge - splitDuration prometheus.Observer - mergeDuration 
prometheus.Observer + splitSourceGauge prometheus.Gauge + splitTargetGauge prometheus.Gauge + mergeSourceGauge prometheus.Gauge + mergeTargetGauge prometheus.Gauge + splitDuration prometheus.Observer + mergeDuration prometheus.Observer + finishSplitSendDuration prometheus.Observer + finishSplitDuration prometheus.Observer + finishMergeSendDuration prometheus.Observer + finishMergeDuration prometheus.Observer } func newKeyspaceGroupMetrics() *keyspaceGroupMetrics { return &keyspaceGroupMetrics{ - splitSourceGauge: keyspaceGroupStateGauge.WithLabelValues("split-source"), - splitTargetGauge: keyspaceGroupStateGauge.WithLabelValues("split-target"), - mergeSourceGauge: keyspaceGroupStateGauge.WithLabelValues("merge-source"), - mergeTargetGauge: keyspaceGroupStateGauge.WithLabelValues("merge-target"), - splitDuration: keyspaceGroupOpDuration.WithLabelValues("split"), - mergeDuration: keyspaceGroupOpDuration.WithLabelValues("merge"), + splitSourceGauge: keyspaceGroupStateGauge.WithLabelValues("split-source"), + splitTargetGauge: keyspaceGroupStateGauge.WithLabelValues("split-target"), + mergeSourceGauge: keyspaceGroupStateGauge.WithLabelValues("merge-source"), + mergeTargetGauge: keyspaceGroupStateGauge.WithLabelValues("merge-target"), + splitDuration: keyspaceGroupOpDuration.WithLabelValues("split"), + mergeDuration: keyspaceGroupOpDuration.WithLabelValues("merge"), + finishSplitSendDuration: keyspaceGroupOpDuration.WithLabelValues("finish-split-send"), + finishSplitDuration: keyspaceGroupOpDuration.WithLabelValues("finish-split"), + finishMergeSendDuration: keyspaceGroupOpDuration.WithLabelValues("finish-merge-send"), + finishMergeDuration: keyspaceGroupOpDuration.WithLabelValues("finish-merge"), } } diff --git a/pkg/tso/testutil.go b/pkg/tso/testutil.go index 9225b21dfac4..6fc1ccc98a7c 100644 --- a/pkg/tso/testutil.go +++ b/pkg/tso/testutil.go @@ -15,14 +15,9 @@ package tso import ( - "testing" "time" - "github.com/stretchr/testify/require" - "github.com/tikv/pd/pkg/utils/etcdutil" "github.com/tikv/pd/pkg/utils/grpcutil" - "go.etcd.io/etcd/clientv3" - "go.etcd.io/etcd/embed" ) var _ ServiceConfig = (*TestServiceConfig)(nil) @@ -90,23 +85,3 @@ func (c *TestServiceConfig) GetMaxResetTSGap() time.Duration { func (c *TestServiceConfig) GetTLSConfig() *grpcutil.TLSConfig { return c.TLSConfig } - -func startEmbeddedEtcd(t *testing.T) (backendEndpoint string, etcdClient *clientv3.Client, clean func()) { - re := require.New(t) - cfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - re.NoError(err) - clean = func() { - etcd.Close() - } - - backendEndpoint = cfg.LCUrls[0].String() - re.NoError(err) - - etcdClient, err = clientv3.NewFromURL(backendEndpoint) - re.NoError(err) - - <-etcd.Server.ReadyNotify() - - return -} diff --git a/pkg/tso/tso.go b/pkg/tso/tso.go index dd8465735623..93392cecfcf6 100644 --- a/pkg/tso/tso.go +++ b/pkg/tso/tso.go @@ -91,14 +91,10 @@ func (t *timestampOracle) setTSOPhysical(next time.Time, force bool) { if typeutil.SubTSOPhysicalByWallClock(next, t.tsoMux.physical) > 0 { t.tsoMux.physical = next t.tsoMux.logical = 0 - t.setTSOUpdateTimeLocked(time.Now()) + t.tsoMux.updateTime = time.Now() } } -func (t *timestampOracle) setTSOUpdateTimeLocked(updateTime time.Time) { - t.tsoMux.updateTime = updateTime -} - func (t *timestampOracle) getTSO() (time.Time, int64) { t.tsoMux.RLock() defer t.tsoMux.RUnlock() @@ -124,10 +120,18 @@ func (t *timestampOracle) generateTSO(ctx context.Context, count int64, suffixBi } // Return the last update time lastUpdateTime = 
t.tsoMux.updateTime - t.setTSOUpdateTimeLocked(time.Now()) + t.tsoMux.updateTime = time.Now() return physical, logical, lastUpdateTime } +func (t *timestampOracle) getLastSavedTime() time.Time { + last := t.lastSavedTime.Load() + if last == nil { + return typeutil.ZeroTime + } + return last.(time.Time) +} + // Because the Local TSO in each Local TSO Allocator is independent, so they are possible // to be the same at sometimes, to avoid this case, we need to use the logical part of the // Local TSO to do some differentiating work. @@ -162,6 +166,13 @@ func (t *timestampOracle) SyncTimestamp(leadership *election.Leadership) error { if err != nil { return err } + lastSavedTime := t.getLastSavedTime() + // If `lastSavedTime` is not zero, it means that the `timestampOracle` has already been initialized + // before, so we could safely skip the sync if `lastSavedTime` is equal to `last`. + if lastSavedTime != typeutil.ZeroTime && typeutil.SubRealTimeByWallClock(lastSavedTime, last) == 0 { + t.metrics.skipSyncEvent.Inc() + return nil + } next := time.Now() failpoint.Inject("fallBackSync", func() { @@ -181,10 +192,12 @@ func (t *timestampOracle) SyncTimestamp(leadership *election.Leadership) error { failpoint.Return(errs.ErrEtcdTxnInternal) }) save := next.Add(t.saveInterval) + start := time.Now() if err = t.storage.SaveTimestamp(t.GetTimestampPath(), save); err != nil { t.metrics.errSaveSyncTSEvent.Inc() return err } + t.metrics.syncSaveDuration.Observe(time.Since(start).Seconds()) t.lastSavedTime.Store(save) t.metrics.syncOKEvent.Inc() @@ -247,18 +260,20 @@ func (t *timestampOracle) resetUserTimestampInner(leadership *election.Leadershi return errs.ErrResetUserTimestamp.FastGenByArgs("the specified ts is too larger than now") } // save into etcd only if nextPhysical is close to lastSavedTime - if typeutil.SubRealTimeByWallClock(t.lastSavedTime.Load().(time.Time), nextPhysical) <= UpdateTimestampGuard { + if typeutil.SubRealTimeByWallClock(t.getLastSavedTime(), nextPhysical) <= UpdateTimestampGuard { save := nextPhysical.Add(t.saveInterval) + start := time.Now() if err := t.storage.SaveTimestamp(t.GetTimestampPath(), save); err != nil { t.metrics.errSaveResetTSEvent.Inc() return err } + t.metrics.resetSaveDuration.Observe(time.Since(start).Seconds()) t.lastSavedTime.Store(save) } // save into memory only if nextPhysical or nextLogical is greater. t.tsoMux.physical = nextPhysical t.tsoMux.logical = int64(nextLogical) - t.setTSOUpdateTimeLocked(time.Now()) + t.tsoMux.updateTime = time.Now() t.metrics.resetTSOOKEvent.Inc() return nil } @@ -278,6 +293,9 @@ func (t *timestampOracle) resetUserTimestampInner(leadership *election.Leadershi // NOTICE: this function should be called after the TSO in memory has been initialized // and should not be called when the TSO in memory has been reset anymore. func (t *timestampOracle) UpdateTimestamp(leadership *election.Leadership) error { + if !t.isInitialized() { + return errs.ErrUpdateTimestamp.FastGenByArgs("timestamp in memory has not been initialized") + } prevPhysical, prevLogical := t.getTSO() t.metrics.tsoPhysicalGauge.Set(float64(prevPhysical.UnixNano() / int64(time.Millisecond))) t.metrics.tsoPhysicalGapGauge.Set(float64(time.Since(prevPhysical).Milliseconds())) @@ -323,8 +341,9 @@ func (t *timestampOracle) UpdateTimestamp(leadership *election.Leadership) error // It is not safe to increase the physical time to `next`. // The time window needs to be updated and saved to etcd. 
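The lastSavedTime handling above relies on two small pieces: a nil-safe accessor around the atomic.Value, and an early return in SyncTimestamp when the timestamp loaded from storage is exactly the one this oracle saved last, meaning it has already been initialized. A simplified sketch of both, using plain time comparison in place of the typeutil wall-clock helpers:

package example

import (
	"sync/atomic"
	"time"
)

// savedTime wraps the atomic.Value so that an unset value reads as the zero
// time instead of a nil interface.
type savedTime struct {
	v atomic.Value // stores time.Time
}

func (s *savedTime) get() time.Time {
	last := s.v.Load()
	if last == nil {
		return time.Time{}
	}
	return last.(time.Time)
}

func (s *savedTime) set(t time.Time) {
	s.v.Store(t)
}

// shouldSkipSync reports whether the sync (and its etcd write) can be skipped
// because the persisted timestamp was written by this oracle already.
func (s *savedTime) shouldSkipSync(persisted time.Time) bool {
	saved := s.get()
	return !saved.IsZero() && saved.Equal(persisted)
}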
- if typeutil.SubRealTimeByWallClock(t.lastSavedTime.Load().(time.Time), next) <= UpdateTimestampGuard { + if typeutil.SubRealTimeByWallClock(t.getLastSavedTime(), next) <= UpdateTimestampGuard { save := next.Add(t.saveInterval) + start := time.Now() if err := t.storage.SaveTimestamp(t.GetTimestampPath(), save); err != nil { log.Warn("save timestamp failed", zap.String("dc-location", t.dcLocation), @@ -333,6 +352,7 @@ func (t *timestampOracle) UpdateTimestamp(leadership *election.Leadership) error t.metrics.errSaveUpdateTSEvent.Inc() return err } + t.metrics.updateSaveDuration.Observe(time.Since(start).Seconds()) t.lastSavedTime.Store(save) } // save into memory @@ -392,5 +412,6 @@ func (t *timestampOracle) ResetTimestamp() { log.Info("reset the timestamp in memory") t.tsoMux.physical = typeutil.ZeroTime t.tsoMux.logical = 0 - t.setTSOUpdateTimeLocked(typeutil.ZeroTime) + t.tsoMux.updateTime = typeutil.ZeroTime + t.lastSavedTime.Store(typeutil.ZeroTime) } diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index 675f76b91a99..110b7277d9e7 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -890,10 +890,11 @@ func (t *regionTree) insert(item *regionItem) (bool, error) { return false, errors.Errorf("region %v shouldn't be updated twice", item.Region().GetId()) } - for _, old := range overlaps { + for _, newer := range overlaps { + log.Info("Unsafe recovery found overlap regions", logutil.ZapRedactStringer("newer-region-meta", core.RegionToHexMeta(newer.Region())), logutil.ZapRedactStringer("older-region-meta", core.RegionToHexMeta(item.Region()))) // it's ensured by the `buildUpFromReports` that peers are inserted in epoch descending order. 
- if old.IsEpochStale(item) { - return false, errors.Errorf("region %v's epoch shouldn't be staler than old ones %v", item, old) + if newer.IsEpochStale(item) { + return false, errors.Errorf("region %v's epoch shouldn't be staler than old ones %v", item, newer) } } if len(overlaps) != 0 { diff --git a/pkg/utils/configutil/configutil.go b/pkg/utils/configutil/configutil.go index 4349721ba78d..978edce77640 100644 --- a/pkg/utils/configutil/configutil.go +++ b/pkg/utils/configutil/configutil.go @@ -21,7 +21,7 @@ import ( "time" "github.com/BurntSushi/toml" - "github.com/pkg/errors" + "github.com/pingcap/errors" "github.com/spf13/pflag" "github.com/tikv/pd/pkg/encryption" "github.com/tikv/pd/pkg/utils/grpcutil" diff --git a/pkg/utils/etcdutil/etcdutil_test.go b/pkg/utils/etcdutil/etcdutil_test.go index e9731de2329a..80194a6287e3 100644 --- a/pkg/utils/etcdutil/etcdutil_test.go +++ b/pkg/utils/etcdutil/etcdutil_test.go @@ -29,7 +29,6 @@ import ( "time" "github.com/pingcap/failpoint" - "github.com/pingcap/log" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/utils/tempurl" @@ -48,23 +47,9 @@ func TestMain(m *testing.M) { func TestMemberHelpers(t *testing.T) { re := require.New(t) - cfg1 := NewTestSingleConfig(t) - etcd1, err := embed.StartEtcd(cfg1) - defer func() { - etcd1.Close() - }() - re.NoError(err) - - ep1 := cfg1.LCUrls[0].String() - client1, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep1}, - }) - defer func() { - client1.Close() - }() - re.NoError(err) - - <-etcd1.Server.ReadyNotify() + servers, client1, clean := NewTestEtcdCluster(t, 1) + defer clean() + etcd1, cfg1 := servers[0], servers[0].Config() // Test ListEtcdMembers listResp1, err := ListEtcdMembers(client1) @@ -74,21 +59,12 @@ func TestMemberHelpers(t *testing.T) { re.Equal(uint64(etcd1.Server.ID()), listResp1.Members[0].ID) // Test AddEtcdMember - etcd2 := checkAddEtcdMember(t, cfg1, client1) - cfg2 := etcd2.Config() + etcd2 := MustAddEtcdMember(t, &cfg1, client1) defer etcd2.Close() - ep2 := cfg2.LCUrls[0].String() - client2, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep2}, - }) - defer func() { - client2.Close() - }() - re.NoError(err) - checkMembers(re, client2, []*embed.Etcd{etcd1, etcd2}) + checkMembers(re, client1, []*embed.Etcd{etcd1, etcd2}) // Test CheckClusterID - urlsMap, err := types.NewURLsMap(cfg2.InitialCluster) + urlsMap, err := types.NewURLsMap(etcd2.Config().InitialCluster) re.NoError(err) err = CheckClusterID(etcd1.Server.Cluster().ID(), urlsMap, &tls.Config{MinVersion: tls.VersionTLS12}) re.NoError(err) @@ -105,30 +81,15 @@ func TestMemberHelpers(t *testing.T) { func TestEtcdKVGet(t *testing.T) { re := require.New(t) - cfg := NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - defer func() { - client.Close() - }() - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := NewTestEtcdCluster(t, 1) + defer clean() keys := []string{"test/key1", "test/key2", "test/key3", "test/key4", "test/key5"} vals := []string{"val1", "val2", "val3", "val4", "val5"} kv := clientv3.NewKV(client) for i := range keys { - _, err = kv.Put(context.TODO(), keys[i], vals[i]) + _, err := kv.Put(context.TODO(), keys[i], vals[i]) re.NoError(err) } @@ -158,25 +119,10 @@ func TestEtcdKVGet(t *testing.T) { func TestEtcdKVPutWithTTL(t *testing.T) { re := require.New(t) - 
cfg := NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - defer func() { - client.Close() - }() - re.NoError(err) - - <-etcd.Server.ReadyNotify() + _, client, clean := NewTestEtcdCluster(t, 1) + defer clean() - _, err = EtcdKVPutWithTTL(context.TODO(), client, "test/ttl1", "val1", 2) + _, err := EtcdKVPutWithTTL(context.TODO(), client, "test/ttl1", "val1", 2) re.NoError(err) _, err = EtcdKVPutWithTTL(context.TODO(), client, "test/ttl2", "val2", 4) re.NoError(err) @@ -201,24 +147,8 @@ func TestEtcdKVPutWithTTL(t *testing.T) { func TestInitClusterID(t *testing.T) { re := require.New(t) - cfg := NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(cfg) - defer func() { - etcd.Close() - }() - re.NoError(err) - - ep := cfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - defer func() { - client.Close() - }() - re.NoError(err) - - <-etcd.Server.ReadyNotify() - + _, client, clean := NewTestEtcdCluster(t, 1) + defer clean() pdClusterIDPath := "test/TestInitClusterID/pd/cluster_id" // Get any cluster key to parse the cluster ID. resp, err := EtcdKVGet(client, pdClusterIDPath) @@ -238,24 +168,12 @@ func TestEtcdClientSync(t *testing.T) { re := require.New(t) re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/utils/etcdutil/fastTick", "return(true)")) - // Start a etcd server. - cfg1 := NewTestSingleConfig(t) - etcd1, err := embed.StartEtcd(cfg1) - defer func() { - etcd1.Close() - }() - re.NoError(err) - - // Create a etcd client with etcd1 as endpoint. - client1, err := CreateEtcdClient(nil, cfg1.LCUrls) - defer func() { - client1.Close() - }() - re.NoError(err) - <-etcd1.Server.ReadyNotify() + servers, client1, clean := NewTestEtcdCluster(t, 1) + defer clean() + etcd1, cfg1 := servers[0], servers[0].Config() // Add a new member. - etcd2 := checkAddEtcdMember(t, cfg1, client1) + etcd2 := MustAddEtcdMember(t, &cfg1, client1) defer etcd2.Close() checkMembers(re, client1, []*embed.Etcd{etcd1, etcd2}) testutil.Eventually(re, func() bool { @@ -264,7 +182,7 @@ func TestEtcdClientSync(t *testing.T) { }) // Remove the first member and close the etcd1. - _, err = RemoveEtcdMember(client1, uint64(etcd1.Server.ID())) + _, err := RemoveEtcdMember(client1, uint64(etcd1.Server.ID())) re.NoError(err) etcd1.Close() @@ -280,31 +198,21 @@ func TestEtcdClientSync(t *testing.T) { func TestEtcdScaleInAndOut(t *testing.T) { re := require.New(t) // Start a etcd server. - cfg1 := NewTestSingleConfig(t) - etcd1, err := embed.StartEtcd(cfg1) - defer func() { - etcd1.Close() - }() - re.NoError(err) - <-etcd1.Server.ReadyNotify() + servers, _, clean := NewTestEtcdCluster(t, 1) + defer clean() + etcd1, cfg1 := servers[0], servers[0].Config() // Create two etcd clients with etcd1 as endpoint. 
client1, err := CreateEtcdClient(nil, cfg1.LCUrls) // execute member change operation with this client - defer func() { - client1.Close() - }() re.NoError(err) + defer client1.Close() client2, err := CreateEtcdClient(nil, cfg1.LCUrls) // check member change with this client - defer func() { - client2.Close() - }() re.NoError(err) + defer client2.Close() // Add a new member and check members - etcd2 := checkAddEtcdMember(t, cfg1, client1) - defer func() { - etcd2.Close() - }() + etcd2 := MustAddEtcdMember(t, &cfg1, client1) + defer etcd2.Close() checkMembers(re, client2, []*embed.Etcd{etcd1, etcd2}) // scale in etcd1 @@ -317,29 +225,14 @@ func TestRandomKillEtcd(t *testing.T) { re := require.New(t) re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/utils/etcdutil/fastTick", "return(true)")) // Start a etcd server. - cfg1 := NewTestSingleConfig(t) - etcd1, err := embed.StartEtcd(cfg1) - re.NoError(err) - <-etcd1.Server.ReadyNotify() - client1, err := CreateEtcdClient(nil, cfg1.LCUrls) - re.NoError(err) - defer func() { - client1.Close() - }() - - etcd2 := checkAddEtcdMember(t, cfg1, client1) - cfg2 := etcd2.Config() - <-etcd2.Server.ReadyNotify() - - etcd3 := checkAddEtcdMember(t, &cfg2, client1) - <-etcd3.Server.ReadyNotify() - - time.Sleep(1 * time.Second) - re.Len(client1.Endpoints(), 3) + etcds, client1, clean := NewTestEtcdCluster(t, 3) + defer clean() + testutil.Eventually(re, func() bool { + return len(client1.Endpoints()) == 3 + }) // Randomly kill an etcd server and restart it - etcds := []*embed.Etcd{etcd1, etcd2, etcd3} - cfgs := []embed.Config{etcd1.Config(), etcd2.Config(), etcd3.Config()} + cfgs := []embed.Config{etcds[0].Config(), etcds[1].Config(), etcds[2].Config()} for i := 0; i < 10; i++ { killIndex := rand.Intn(len(etcds)) etcds[killIndex].Close() @@ -381,31 +274,24 @@ func TestEtcdWithHangLeaderEnableCheck(t *testing.T) { func checkEtcdWithHangLeader(t *testing.T) error { re := require.New(t) // Start a etcd server. - cfg1 := NewTestSingleConfig(t) - etcd1, err := embed.StartEtcd(cfg1) - defer func() { - etcd1.Close() - }() - re.NoError(err) - ep1 := cfg1.LCUrls[0].String() - <-etcd1.Server.ReadyNotify() + servers, _, clean := NewTestEtcdCluster(t, 1) + defer clean() + etcd1, cfg1 := servers[0], servers[0].Config() // Create a proxy to etcd1. proxyAddr := tempurl.Alloc() var enableDiscard atomic.Bool - go proxyWithDiscard(re, ep1, proxyAddr, &enableDiscard) + go proxyWithDiscard(re, cfg1.LCUrls[0].String(), proxyAddr, &enableDiscard) // Create a etcd client with etcd1 as endpoint. 
urls, err := types.NewURLs([]string{proxyAddr}) re.NoError(err) client1, err := CreateEtcdClient(nil, urls) - defer func() { - client1.Close() - }() re.NoError(err) + defer client1.Close() // Add a new member - etcd2 := checkAddEtcdMember(t, cfg1, client1) + etcd2 := MustAddEtcdMember(t, &cfg1, client1) defer etcd2.Close() checkMembers(re, client1, []*embed.Etcd{etcd1, etcd2}) time.Sleep(1 * time.Second) // wait for etcd client sync endpoints @@ -417,40 +303,6 @@ func checkEtcdWithHangLeader(t *testing.T) error { return err } -func checkAddEtcdMember(t *testing.T, cfg1 *embed.Config, client *clientv3.Client) *embed.Etcd { - re := require.New(t) - cfg2 := NewTestSingleConfig(t) - cfg2.Name = genRandName() - cfg2.InitialCluster = cfg1.InitialCluster + fmt.Sprintf(",%s=%s", cfg2.Name, &cfg2.LPUrls[0]) - cfg2.ClusterState = embed.ClusterStateFlagExisting - peerURL := cfg2.LPUrls[0].String() - addResp, err := AddEtcdMember(client, []string{peerURL}) - re.NoError(err) - etcd2, err := embed.StartEtcd(cfg2) - re.NoError(err) - re.Equal(uint64(etcd2.Server.ID()), addResp.Member.ID) - <-etcd2.Server.ReadyNotify() - return etcd2 -} - -func checkMembers(re *require.Assertions, client *clientv3.Client, etcds []*embed.Etcd) { - // Check the client can get the new member. - listResp, err := ListEtcdMembers(client) - re.NoError(err) - re.Len(listResp.Members, len(etcds)) - inList := func(m *etcdserverpb.Member) bool { - for _, etcd := range etcds { - if m.ID == uint64(etcd.Server.ID()) { - return true - } - } - return false - } - for _, m := range listResp.Members { - re.True(inList(m)) - } -} - func proxyWithDiscard(re *require.Assertions, server, proxy string, enableDiscard *atomic.Bool) { server = strings.TrimPrefix(server, "http://") proxy = strings.TrimPrefix(proxy, "http://") @@ -527,7 +379,7 @@ func (suite *loopWatcherTestSuite) SetupSuite() { suite.ctx, suite.cancel = context.WithCancel(context.Background()) suite.cleans = make([]func(), 0) // Start a etcd server and create a client with etcd1 as endpoint. 
- suite.config = NewTestSingleConfig(t) + suite.config = newTestSingleConfig(t) suite.startEtcd() suite.client, err = CreateEtcdClient(nil, suite.config.LCUrls) suite.NoError(err) @@ -771,13 +623,8 @@ func (suite *loopWatcherTestSuite) TestWatcherBreak() { func (suite *loopWatcherTestSuite) TestWatcherRequestProgress() { checkWatcherRequestProgress := func(injectWatchChanBlock bool) { - tempStdoutFile, _ := os.CreateTemp("/tmp", "pd_tests") - defer os.Remove(tempStdoutFile.Name()) - cfg := &log.Config{} - cfg.File.Filename = tempStdoutFile.Name() - cfg.Level = "debug" - lg, p, _ := log.InitLogger(cfg) - log.ReplaceGlobals(lg, p) + fname := testutil.InitTempFileLogger("debug") + defer os.RemoveAll(fname) watcher := NewLoopWatcher( suite.ctx, @@ -799,14 +646,14 @@ func (suite *loopWatcherTestSuite) TestWatcherRequestProgress() { if injectWatchChanBlock { failpoint.Enable("github.com/tikv/pd/pkg/utils/etcdutil/watchChanBlock", "return(true)") testutil.Eventually(suite.Require(), func() bool { - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) l := string(b) return strings.Contains(l, "watch channel is blocked for a long time") }) failpoint.Disable("github.com/tikv/pd/pkg/utils/etcdutil/watchChanBlock") } else { testutil.Eventually(suite.Require(), func() bool { - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) l := string(b) return strings.Contains(l, "watcher receives progress notify in watch loop") }) diff --git a/pkg/utils/etcdutil/testutil.go b/pkg/utils/etcdutil/testutil.go index 971e93e1ed67..54ba38b93b6c 100644 --- a/pkg/utils/etcdutil/testutil.go +++ b/pkg/utils/etcdutil/testutil.go @@ -21,12 +21,16 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/tempurl" + "github.com/tikv/pd/pkg/utils/testutil" + "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/embed" + "go.etcd.io/etcd/etcdserver/etcdserverpb" ) -// NewTestSingleConfig is used to create a etcd config for the unit test purpose. -func NewTestSingleConfig(t *testing.T) *embed.Config { +// newTestSingleConfig is used to create a etcd config for the unit test purpose. +func newTestSingleConfig(t *testing.T) *embed.Config { cfg := embed.NewConfig() cfg.Name = genRandName() cfg.Dir = t.TempDir() @@ -50,3 +54,85 @@ func NewTestSingleConfig(t *testing.T) *embed.Config { func genRandName() string { return "test_etcd_" + strconv.FormatInt(time.Now().UnixNano()%10000, 10) } + +// NewTestEtcdCluster is used to create a etcd cluster for the unit test purpose. +func NewTestEtcdCluster(t *testing.T, count int) (servers []*embed.Etcd, etcdClient *clientv3.Client, clean func()) { + re := require.New(t) + servers = make([]*embed.Etcd, 0, count) + + cfg := newTestSingleConfig(t) + etcd, err := embed.StartEtcd(cfg) + re.NoError(err) + etcdClient, err = CreateEtcdClient(nil, cfg.LCUrls) + re.NoError(err) + <-etcd.Server.ReadyNotify() + servers = append(servers, etcd) + + for i := 1; i < count; i++ { + // Check the client can get the new member. + listResp, err := ListEtcdMembers(etcdClient) + re.NoError(err) + re.Len(listResp.Members, i) + // Add a new member. 
+ etcd2 := MustAddEtcdMember(t, cfg, etcdClient) + cfg2 := etcd2.Config() + cfg = &cfg2 + <-etcd2.Server.ReadyNotify() + servers = append(servers, etcd2) + } + + checkMembers(re, etcdClient, servers) + + clean = func() { + etcdClient.Close() + for _, server := range servers { + if server != nil { + server.Close() + } + } + } + + return +} + +// MustAddEtcdMember is used to add a new etcd member to the cluster for test. +func MustAddEtcdMember(t *testing.T, cfg1 *embed.Config, client *clientv3.Client) *embed.Etcd { + re := require.New(t) + cfg2 := newTestSingleConfig(t) + cfg2.Name = genRandName() + cfg2.InitialCluster = cfg1.InitialCluster + fmt.Sprintf(",%s=%s", cfg2.Name, &cfg2.LPUrls[0]) + cfg2.ClusterState = embed.ClusterStateFlagExisting + peerURL := cfg2.LPUrls[0].String() + addResp, err := AddEtcdMember(client, []string{peerURL}) + re.NoError(err) + // Check the client can get the new member. + testutil.Eventually(re, func() bool { + members, err := ListEtcdMembers(client) + re.NoError(err) + return len(addResp.Members) == len(members.Members) + }) + // Start the new etcd member. + etcd2, err := embed.StartEtcd(cfg2) + re.NoError(err) + re.Equal(uint64(etcd2.Server.ID()), addResp.Member.ID) + <-etcd2.Server.ReadyNotify() + return etcd2 +} + +func checkMembers(re *require.Assertions, client *clientv3.Client, etcds []*embed.Etcd) { + // Check the client can get the new member. + listResp, err := ListEtcdMembers(client) + re.NoError(err) + re.Len(listResp.Members, len(etcds)) + inList := func(m *etcdserverpb.Member) bool { + for _, etcd := range etcds { + if m.ID == uint64(etcd.Server.ID()) { + return true + } + } + return false + } + for _, m := range listResp.Members { + re.True(inList(m)) + } +} diff --git a/pkg/utils/grpcutil/grpcutil.go b/pkg/utils/grpcutil/grpcutil.go index 6b5290bf31c5..ee9d85a4ee19 100644 --- a/pkg/utils/grpcutil/grpcutil.go +++ b/pkg/utils/grpcutil/grpcutil.go @@ -21,8 +21,8 @@ import ( "net/url" "time" + "github.com/pingcap/errors" "github.com/pingcap/log" - "github.com/pkg/errors" "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/utils/logutil" "go.etcd.io/etcd/pkg/transport" diff --git a/pkg/utils/testutil/testutil.go b/pkg/utils/testutil/testutil.go index 7d31f2263c6b..a48db0bd60ff 100644 --- a/pkg/utils/testutil/testutil.go +++ b/pkg/utils/testutil/testutil.go @@ -20,6 +20,7 @@ import ( "time" "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/log" "github.com/stretchr/testify/require" "google.golang.org/grpc" ) @@ -86,3 +87,16 @@ func CleanServer(dataDir string) { // Clean data directory os.RemoveAll(dataDir) } + +// InitTempFileLogger initializes the logger and redirects the log output to a temporary file. +func InitTempFileLogger(level string) (fname string) { + cfg := &log.Config{} + f, _ := os.CreateTemp("/tmp", "pd_tests") + fname = f.Name() + f.Close() + cfg.File.Filename = fname + cfg.Level = level + lg, p, _ := log.InitLogger(cfg) + log.ReplaceGlobals(lg, p) + return fname +} diff --git a/server/api/min_resolved_ts.go b/server/api/min_resolved_ts.go index ef05e91b9f78..1edf924370f5 100644 --- a/server/api/min_resolved_ts.go +++ b/server/api/min_resolved_ts.go @@ -53,7 +53,7 @@ type minResolvedTS struct { // @Failure 500 {string} string "PD server failed to proceed the request." 
// @Router /min-resolved-ts/{store_id} [get] func (h *minResolvedTSHandler) GetStoreMinResolvedTS(w http.ResponseWriter, r *http.Request) { - c := h.svr.GetRaftCluster() + c := getCluster(r) idStr := mux.Vars(r)["store_id"] storeID, err := strconv.ParseUint(idStr, 10, 64) if err != nil { @@ -84,7 +84,7 @@ func (h *minResolvedTSHandler) GetStoreMinResolvedTS(w http.ResponseWriter, r *h // @Failure 500 {string} string "PD server failed to proceed the request." // @Router /min-resolved-ts [get] func (h *minResolvedTSHandler) GetMinResolvedTS(w http.ResponseWriter, r *http.Request) { - c := h.svr.GetRaftCluster() + c := getCluster(r) scopeMinResolvedTS := c.GetMinResolvedTS() persistInterval := c.GetPDServerConfig().MinResolvedTSPersistenceInterval diff --git a/server/api/stats_test.go b/server/api/stats_test.go index 4003f53ed9e4..7281f8ec9858 100644 --- a/server/api/stats_test.go +++ b/server/api/stats_test.go @@ -142,7 +142,7 @@ func (suite *statsTestSuite) TestRegionStats() { statsAll := &statistics.RegionStats{ Count: 4, EmptyCount: 1, - StorageSize: 351, + StorageSize: 350, UserStorageSize: 291, StorageKeys: 221, StoreLeaderCount: map[uint64]int{1: 1, 4: 2, 5: 1}, @@ -156,7 +156,7 @@ func (suite *statsTestSuite) TestRegionStats() { stats23 := &statistics.RegionStats{ Count: 2, EmptyCount: 1, - StorageSize: 201, + StorageSize: 200, UserStorageSize: 181, StorageKeys: 151, StoreLeaderCount: map[uint64]int{4: 1, 5: 1}, diff --git a/server/api/version_test.go b/server/api/version_test.go index 804704b084c7..46bc80537a65 100644 --- a/server/api/version_test.go +++ b/server/api/version_test.go @@ -74,7 +74,7 @@ func TestGetVersion(t *testing.T) { func() { temp.Close() os.Stdout = old - os.Remove(fname) + os.RemoveAll(fname) svr.Close() cancel() testutil.CleanServer(cfg.DataDir) diff --git a/server/apiv2/handlers/tso_keyspace_group.go b/server/apiv2/handlers/tso_keyspace_group.go index d16cd333e871..1e64e1250387 100644 --- a/server/apiv2/handlers/tso_keyspace_group.go +++ b/server/apiv2/handlers/tso_keyspace_group.go @@ -21,7 +21,7 @@ import ( "sync" "github.com/gin-gonic/gin" - "github.com/pkg/errors" + "github.com/pingcap/errors" "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/mcs/utils" "github.com/tikv/pd/pkg/slice" diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index f250df56630a..14e4b6301b1e 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -143,11 +143,12 @@ type RaftCluster struct { etcdClient *clientv3.Client httpClient *http.Client - running bool - meta *metapb.Cluster - storage storage.Storage - minResolvedTS uint64 - externalTS uint64 + running bool + isAPIServiceMode bool + meta *metapb.Cluster + storage storage.Storage + minResolvedTS uint64 + externalTS uint64 // Keep the previous store limit settings when removing a store. 
prevStoreLimit map[uint64]map[storelimit.Type]float64 @@ -287,6 +288,7 @@ func (c *RaftCluster) Start(s Server) error { return nil } + c.isAPIServiceMode = s.IsAPIServiceMode() c.InitCluster(s.GetAllocator(), s.GetPersistOptions(), s.GetStorage(), s.GetBasicCluster(), s.GetKeyspaceGroupManager()) cluster, err := c.LoadClusterInfo() if err != nil { @@ -296,13 +298,6 @@ func (c *RaftCluster) Start(s Server) error { return nil } - if s.IsAPIServiceMode() { - err = c.keyspaceGroupManager.Bootstrap(c.ctx) - if err != nil { - return err - } - } - c.ruleManager = placement.NewRuleManager(c.storage, c, c.GetOpts()) if c.opt.IsPlacementRulesEnabled() { err = c.ruleManager.Initialize(c.opt.GetMaxReplicas(), c.opt.GetLocationLabels()) @@ -319,6 +314,7 @@ func (c *RaftCluster) Start(s Server) error { if err != nil { return err } + c.coordinator = schedule.NewCoordinator(c.ctx, cluster, s.GetHBStreams()) c.regionStats = statistics.NewRegionStatistics(c.core, c.opt, c.ruleManager) c.limiter = NewStoreLimiter(s.GetPersistOptions()) @@ -327,11 +323,23 @@ func (c *RaftCluster) Start(s Server) error { log.Error("load external timestamp meets error", zap.Error(err)) } - c.wg.Add(10) - go c.runCoordinator() + if s.IsAPIServiceMode() { + // bootstrap keyspace group manager after starting other parts successfully. + // This order avoids a stuck goroutine in keyspaceGroupManager when it fails to create raftcluster. + err = c.keyspaceGroupManager.Bootstrap(c.ctx) + if err != nil { + return err + } + c.initSchedulers() + } else { + c.wg.Add(2) + go c.runCoordinator() + go c.runStatsBackgroundJobs() + } + + c.wg.Add(8) go c.runMetricsCollectionJob() go c.runNodeStateCheckJob() - go c.runStatsBackgroundJobs() go c.syncRegions() go c.runReplicationMode() go c.runMinResolvedTSJob() @@ -586,10 +594,12 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { zap.Int("count", c.core.GetTotalRegionCount()), zap.Duration("cost", time.Since(start)), ) - for _, store := range c.GetStores() { - storeID := store.GetID() - c.hotStat.GetOrCreateRollingStoreStats(storeID) - c.slowStat.ObserveSlowStoreStatus(storeID, store.IsSlow()) + if !c.isAPIServiceMode { + for _, store := range c.GetStores() { + storeID := store.GetID() + c.hotStat.GetOrCreateRollingStoreStats(storeID) + c.slowStat.ObserveSlowStoreStatus(storeID, store.IsSlow()) + } } return c, nil } @@ -600,6 +610,7 @@ func (c *RaftCluster) runMetricsCollectionJob() { ticker := time.NewTicker(metricsCollectionJobInterval) failpoint.Inject("highFrequencyClusterJobs", func() { + ticker.Stop() ticker = time.NewTicker(time.Microsecond) }) @@ -624,6 +635,7 @@ func (c *RaftCluster) runNodeStateCheckJob() { ticker := time.NewTicker(nodeStateCheckJobInterval) failpoint.Inject("highFrequencyClusterJobs", func() { + ticker.Stop() ticker = time.NewTicker(2 * time.Second) }) defer ticker.Stop() @@ -711,7 +723,9 @@ func (c *RaftCluster) Stop() { return } c.running = false - c.coordinator.Stop() + if !c.isAPIServiceMode { + c.coordinator.Stop() + } c.cancel() c.Unlock() @@ -837,6 +851,10 @@ func (c *RaftCluster) GetOpts() sc.ConfProvider { return c.opt } +func (c *RaftCluster) initSchedulers() { + c.coordinator.InitSchedulers(false) +} + // GetScheduleConfig returns scheduling configurations. 
func (c *RaftCluster) GetScheduleConfig() *sc.ScheduleConfig { return c.opt.GetScheduleConfig() @@ -932,11 +950,15 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest nowTime := time.Now() var newStore *core.StoreInfo // If this cluster has slow stores, we should awaken hibernated regions in other stores. - if needAwaken, slowStoreIDs := c.NeedAwakenAllRegionsInStore(storeID); needAwaken { - log.Info("forcely awaken hibernated regions", zap.Uint64("store-id", storeID), zap.Uint64s("slow-stores", slowStoreIDs)) - newStore = store.Clone(core.SetStoreStats(stats), core.SetLastHeartbeatTS(nowTime), core.SetLastAwakenTime(nowTime), opt) - resp.AwakenRegions = &pdpb.AwakenRegions{ - AbnormalStores: slowStoreIDs, + if !c.isAPIServiceMode { + if needAwaken, slowStoreIDs := c.NeedAwakenAllRegionsInStore(storeID); needAwaken { + log.Info("forcely awaken hibernated regions", zap.Uint64("store-id", storeID), zap.Uint64s("slow-stores", slowStoreIDs)) + newStore = store.Clone(core.SetStoreStats(stats), core.SetLastHeartbeatTS(nowTime), core.SetLastAwakenTime(nowTime), opt) + resp.AwakenRegions = &pdpb.AwakenRegions{ + AbnormalStores: slowStoreIDs, + } + } else { + newStore = store.Clone(core.SetStoreStats(stats), core.SetLastHeartbeatTS(nowTime), opt) } } else { newStore = store.Clone(core.SetStoreStats(stats), core.SetLastHeartbeatTS(nowTime), opt) @@ -959,41 +981,47 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest statistics.UpdateStoreHeartbeatMetrics(store) } c.core.PutStore(newStore) - c.hotStat.Observe(storeID, newStore.GetStoreStats()) - c.hotStat.FilterUnhealthyStore(c) - c.slowStat.ObserveSlowStoreStatus(storeID, newStore.IsSlow()) - reportInterval := stats.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - - regions := make(map[uint64]*core.RegionInfo, len(stats.GetPeerStats())) - for _, peerStat := range stats.GetPeerStats() { - regionID := peerStat.GetRegionId() - region := c.GetRegion(regionID) - regions[regionID] = region - if region == nil { - log.Warn("discard hot peer stat for unknown region", - zap.Uint64("region-id", regionID), - zap.Uint64("store-id", storeID)) - continue - } - peer := region.GetStorePeer(storeID) - if peer == nil { - log.Warn("discard hot peer stat for unknown region peer", - zap.Uint64("region-id", regionID), - zap.Uint64("store-id", storeID)) - continue - } - readQueryNum := core.GetReadQueryNum(peerStat.GetQueryStats()) - loads := []float64{ - utils.RegionReadBytes: float64(peerStat.GetReadBytes()), - utils.RegionReadKeys: float64(peerStat.GetReadKeys()), - utils.RegionReadQueryNum: float64(readQueryNum), - utils.RegionWriteBytes: 0, - utils.RegionWriteKeys: 0, - utils.RegionWriteQueryNum: 0, + var ( + regions map[uint64]*core.RegionInfo + interval uint64 + ) + if !c.isAPIServiceMode { + c.hotStat.Observe(storeID, newStore.GetStoreStats()) + c.hotStat.FilterUnhealthyStore(c) + c.slowStat.ObserveSlowStoreStatus(storeID, newStore.IsSlow()) + reportInterval := stats.GetInterval() + interval = reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + + regions = make(map[uint64]*core.RegionInfo, len(stats.GetPeerStats())) + for _, peerStat := range stats.GetPeerStats() { + regionID := peerStat.GetRegionId() + region := c.GetRegion(regionID) + regions[regionID] = region + if region == nil { + log.Warn("discard hot peer stat for unknown region", + zap.Uint64("region-id", regionID), + zap.Uint64("store-id", storeID)) + continue + } + peer := 
region.GetStorePeer(storeID) + if peer == nil { + log.Warn("discard hot peer stat for unknown region peer", + zap.Uint64("region-id", regionID), + zap.Uint64("store-id", storeID)) + continue + } + readQueryNum := core.GetReadQueryNum(peerStat.GetQueryStats()) + loads := []float64{ + utils.RegionReadBytes: float64(peerStat.GetReadBytes()), + utils.RegionReadKeys: float64(peerStat.GetReadKeys()), + utils.RegionReadQueryNum: float64(readQueryNum), + utils.RegionWriteBytes: 0, + utils.RegionWriteKeys: 0, + utils.RegionWriteQueryNum: 0, + } + peerInfo := core.NewPeerInfo(peer, loads, interval) + c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) } for _, stat := range stats.GetSnapshotStats() { // the duration of snapshot is the sum between to send and generate snapshot. @@ -1013,8 +1041,10 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest e := int64(dur)*2 - int64(stat.GetTotalDurationSec()) store.Feedback(float64(e)) } - // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. - c.hotStat.CheckReadAsync(statistics.NewCollectUnReportedPeerTask(storeID, regions, interval)) + if !c.isAPIServiceMode { + // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. + c.hotStat.CheckReadAsync(statistics.NewCollectUnReportedPeerTask(storeID, regions, interval)) + } return nil } @@ -1060,17 +1090,21 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { if err != nil { return err } - region.Inherit(origin, c.GetStoreConfig().IsEnableRegionBucket()) + if c.GetStoreConfig().IsEnableRegionBucket() { + region.InheritBuckets(origin) + } - c.hotStat.CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) - c.hotStat.CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - reportInterval := region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) - c.hotStat.CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) + if !c.isAPIServiceMode { + c.hotStat.CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) + c.hotStat.CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) + reportInterval := region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + for _, peer := range region.GetPeers() { + peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) + c.hotStat.CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) + } + c.coordinator.GetSchedulersController().CheckTransferWitnessLeader(region) } - c.coordinator.GetSchedulersController().CheckTransferWitnessLeader(region) // Save to cache if meta or leader is updated, or contains any down/pending peer. // Mark isNew if the region in cache does not have leader. @@ -1081,7 +1115,7 @@ func (c *RaftCluster) processRegionHeartbeat(region *core.RegionInfo) error { // SaveRegion saves region info into cache and PD storage. 
func (c *RaftCluster) SaveRegion(region *core.RegionInfo, changed *core.RegionChanged) (err error) { hasRegionStats := c.regionStats != nil - if !changed.SaveKV && !changed.SaveCache && !changed.IsNew { + if !c.isAPIServiceMode && !changed.SaveKV && !changed.SaveCache && !changed.IsNew { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { @@ -1109,23 +1143,25 @@ func (c *RaftCluster) SaveRegion(region *core.RegionInfo, changed *core.RegionCh } for _, item := range overlaps { - if c.regionStats != nil { - c.regionStats.ClearDefunctRegion(item.GetID()) + if !c.isAPIServiceMode { + if c.regionStats != nil { + c.regionStats.ClearDefunctRegion(item.GetID()) + } + c.labelLevelStats.ClearDefunctRegion(item.GetID()) } - c.labelLevelStats.ClearDefunctRegion(item.GetID()) c.ruleManager.InvalidCache(item.GetID()) } regionUpdateCacheEventCounter.Inc() } - if hasRegionStats { - c.regionStats.Observe(region, c.getRegionStoresLocked(region)) + if !c.isAPIServiceMode { + if hasRegionStats { + c.regionStats.Observe(region, c.getRegionStoresLocked(region)) + } } - if !c.IsPrepared() && changed.IsNew { c.coordinator.GetPrepareChecker().Collect(region) } - if c.storage != nil { // If there are concurrent heartbeats from the same region, the last write will win even if // writes to storage in the critical area. So don't use mutex to protect it. @@ -1458,27 +1494,22 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) } - // Remove an offline store should be OK, nothing to do. if store.IsRemoving() && store.IsPhysicallyDestroyed() == physicallyDestroyed { return nil } - if store.IsRemoved() { return errs.ErrStoreRemoved.FastGenByArgs(storeID) } - if store.IsPhysicallyDestroyed() { return errs.ErrStoreDestroyed.FastGenByArgs(storeID) } - if (store.IsPreparing() || store.IsServing()) && !physicallyDestroyed { if err := c.checkReplicaBeforeOfflineStore(storeID); err != nil { return err } } - - newStore := store.Clone(core.OfflineStore(physicallyDestroyed)) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, physicallyDestroyed)) log.Warn("store has been offline", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), @@ -1587,7 +1618,7 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { } } - newStore := store.Clone(core.TombstoneStore()) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) log.Warn("store has been Tombstone", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), @@ -1600,8 +1631,10 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { delete(c.prevStoreLimit, storeID) c.RemoveStoreLimit(storeID) c.resetProgress(storeID, store.GetAddress()) - c.hotStat.RemoveRollingStoreStats(storeID) - c.slowStat.RemoveSlowStoreStatus(storeID) + if !c.isAPIServiceMode { + c.hotStat.RemoveRollingStoreStats(storeID) + c.slowStat.RemoveSlowStoreStatus(storeID) + } } return err } @@ -1691,7 +1724,7 @@ func (c *RaftCluster) UpStore(storeID uint64) error { return nil } - options := []core.StoreCreateOption{core.UpStore()} + options := []core.StoreCreateOption{core.SetStoreState(metapb.StoreState_Up)} // get the previous store limit recorded in memory limiter, exist := c.prevStoreLimit[storeID] if exist { @@ -1738,7 +1771,7 @@ func (c *RaftCluster) 
ReadyToServe(storeID uint64) error { return errs.ErrStoreServing.FastGenByArgs(storeID) } - newStore := store.Clone(core.UpStore()) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Up)) log.Info("store has changed to serving", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) @@ -1775,8 +1808,10 @@ func (c *RaftCluster) putStoreLocked(store *core.StoreInfo) error { } } c.core.PutStore(store) - c.hotStat.GetOrCreateRollingStoreStats(store.GetID()) - c.slowStat.ObserveSlowStoreStatus(store.GetID(), store.IsSlow()) + if !c.isAPIServiceMode { + c.hotStat.GetOrCreateRollingStoreStats(store.GetID()) + c.slowStat.ObserveSlowStoreStatus(store.GetID(), store.IsSlow()) + } return nil } @@ -2126,13 +2161,18 @@ func (c *RaftCluster) collectMetrics() { statsMap := statistics.NewStoreStatisticsMap(c.opt) stores := c.GetStores() for _, s := range stores { - statsMap.Observe(s, c.hotStat.StoresStats) + statsMap.Observe(s) + if !c.isAPIServiceMode { + statsMap.ObserveHotStat(s, c.hotStat.StoresStats) + } } statsMap.Collect() - c.coordinator.GetSchedulersController().CollectSchedulerMetrics() - c.coordinator.CollectHotSpotMetrics() - c.collectClusterMetrics() + if !c.isAPIServiceMode { + c.coordinator.GetSchedulersController().CollectSchedulerMetrics() + c.coordinator.CollectHotSpotMetrics() + c.collectClusterMetrics() + } c.collectHealthStatus() } @@ -2140,9 +2180,11 @@ func (c *RaftCluster) resetMetrics() { statsMap := statistics.NewStoreStatisticsMap(c.opt) statsMap.Reset() - c.coordinator.GetSchedulersController().ResetSchedulerMetrics() - c.coordinator.ResetHotSpotMetrics() - c.resetClusterMetrics() + if !c.isAPIServiceMode { + c.coordinator.GetSchedulersController().ResetSchedulerMetrics() + c.coordinator.ResetHotSpotMetrics() + c.resetClusterMetrics() + } c.resetHealthStatus() c.resetProgressIndicator() } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index e5bf862174b2..605fd2225020 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -225,7 +225,7 @@ func TestFilterUnhealthyStore(t *testing.T) { Available: 50, RegionCount: 1, } - newStore := store.Clone(core.TombstoneStore()) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) re.NoError(cluster.putStoreLocked(newStore)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.Nil(cluster.hotStat.GetRollingStoreStats(store.GetID())) @@ -2351,7 +2351,7 @@ func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { func (c *testCluster) setStoreDown(storeID uint64) error { store := c.GetStore(storeID) newStore := store.Clone( - core.UpStore(), + core.SetStoreState(metapb.StoreState_Up), core.SetLastHeartbeatTS(typeutil.ZeroTime), ) c.Lock() @@ -2361,7 +2361,7 @@ func (c *testCluster) setStoreDown(storeID uint64) error { func (c *testCluster) setStoreOffline(storeID uint64) error { store := c.GetStore(storeID) - newStore := store.Clone(core.OfflineStore(false)) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) c.Lock() defer c.Unlock() return c.putStoreLocked(newStore) diff --git a/server/cluster/cluster_worker.go b/server/cluster/cluster_worker.go index f194c8014a5f..3036fe95b3ea 100644 --- a/server/cluster/cluster_worker.go +++ b/server/cluster/cluster_worker.go @@ -285,6 +285,8 @@ func (c *RaftCluster) HandleReportBuckets(b *metapb.Buckets) error { if err := c.processReportBuckets(b); err != nil { return err } - c.hotStat.CheckAsync(buckets.NewCheckPeerTask(b)) + if 
!c.isAPIServiceMode { + c.hotStat.CheckAsync(buckets.NewCheckPeerTask(b)) + } return nil } diff --git a/server/grpc_service.go b/server/grpc_service.go index 2dd5fdc76225..601b09d4791f 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1628,7 +1628,6 @@ func (s *GrpcServer) ReportBatchSplit(ctx context.Context, request *pdpb.ReportB if rc == nil { return &pdpb.ReportBatchSplitResponse{Header: s.notBootstrappedHeader()}, nil } - _, err := rc.HandleBatchReportSplit(request) if err != nil { return &pdpb.ReportBatchSplitResponse{ @@ -2117,6 +2116,9 @@ func (s *GrpcServer) SplitAndScatterRegions(ctx context.Context, request *pdpb.S return rsp.(*pdpb.SplitAndScatterRegionsResponse), err } rc := s.GetRaftCluster() + if rc == nil { + return &pdpb.SplitAndScatterRegionsResponse{Header: s.notBootstrappedHeader()}, nil + } splitFinishedPercentage, newRegionIDs := rc.GetRegionSplitter().SplitRegions(ctx, request.GetSplitKeys(), int(request.GetRetryLimit())) scatterFinishedPercentage, err := scatterRegions(rc, newRegionIDs, request.GetGroup(), int(request.GetRetryLimit()), false) if err != nil { diff --git a/server/handler.go b/server/handler.go index 02ec6da48081..adc1e8ecd318 100644 --- a/server/handler.go +++ b/server/handler.go @@ -236,14 +236,20 @@ func (h *Handler) AddScheduler(name string, args ...string) error { return err } log.Info("create scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", args)) - if err = c.AddScheduler(s, args...); err != nil { - log.Error("can not add scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", args), errs.ZapError(err)) - } else if err = h.opt.Persist(c.GetStorage()); err != nil { - log.Error("can not persist scheduler config", errs.ZapError(err)) + if !h.s.IsAPIServiceMode() { + if err = c.AddScheduler(s, args...); err != nil { + log.Error("can not add scheduler", zap.String("scheduler-name", s.GetName()), zap.Strings("scheduler-args", args), errs.ZapError(err)) + return err + } } else { - log.Info("add scheduler successfully", zap.String("scheduler-name", name), zap.Strings("scheduler-args", args)) + c.GetSchedulerConfig().AddSchedulerCfg(s.GetType(), args) } - return err + if err = h.opt.Persist(c.GetStorage()); err != nil { + log.Error("can not persist scheduler config", errs.ZapError(err)) + return err + } + log.Info("add scheduler successfully", zap.String("scheduler-name", name), zap.Strings("scheduler-args", args)) + return nil } // RemoveScheduler removes a scheduler by name. 
@@ -252,10 +258,24 @@ func (h *Handler) RemoveScheduler(name string) error { if err != nil { return err } - if err = c.RemoveScheduler(name); err != nil { - log.Error("can not remove scheduler", zap.String("scheduler-name", name), errs.ZapError(err)) + if !h.s.IsAPIServiceMode() { + if err = c.RemoveScheduler(name); err != nil { + log.Error("can not remove scheduler", zap.String("scheduler-name", name), errs.ZapError(err)) + } else { + log.Info("remove scheduler successfully", zap.String("scheduler-name", name)) + } } else { - log.Info("remove scheduler successfully", zap.String("scheduler-name", name)) + conf := c.GetSchedulerConfig() + c.GetSchedulerConfig().RemoveSchedulerCfg(schedulers.FindSchedulerTypeByName(name)) + if err := conf.Persist(c.GetStorage()); err != nil { + log.Error("the option can not persist scheduler config", errs.ZapError(err)) + return err + } + + if err := c.GetStorage().RemoveScheduleConfig(name); err != nil { + log.Error("can not remove the scheduler config", errs.ZapError(err)) + return err + } } return err } @@ -941,14 +961,20 @@ func (h *Handler) ResetTS(ts uint64, ignoreSmaller, skipUpperBoundCheck bool, _ // SetStoreLimitScene sets the limit values for different scenes func (h *Handler) SetStoreLimitScene(scene *storelimit.Scene, limitType storelimit.Type) { - cluster := h.s.GetRaftCluster() - cluster.GetStoreLimiter().ReplaceStoreLimitScene(scene, limitType) + rc := h.s.GetRaftCluster() + if rc == nil { + return + } + rc.GetStoreLimiter().ReplaceStoreLimitScene(scene, limitType) } // GetStoreLimitScene returns the limit values for different scenes func (h *Handler) GetStoreLimitScene(limitType storelimit.Type) *storelimit.Scene { - cluster := h.s.GetRaftCluster() - return cluster.GetStoreLimiter().StoreLimitScene(limitType) + rc := h.s.GetRaftCluster() + if rc == nil { + return nil + } + return rc.GetStoreLimiter().StoreLimitScene(limitType) } // GetProgressByID returns the progress details for a given store ID. diff --git a/server/server.go b/server/server.go index 55fc8b886f8e..b74ca5b57b39 100644 --- a/server/server.go +++ b/server/server.go @@ -17,6 +17,7 @@ package server import ( "bytes" "context" + errorspkg "errors" "fmt" "math/rand" "net/http" @@ -189,7 +190,7 @@ type Server struct { // startCallbacks will be called after the server is started. startCallbacks []func() // leaderCallbacks will be called after the server becomes leader. - leaderCallbacks []func(context.Context) + leaderCallbacks []func(context.Context) error // closeCallbacks will be called before the server is closed. closeCallbacks []func() @@ -1022,18 +1023,18 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { } old := s.persistOptions.GetReplicationConfig() if cfg.EnablePlacementRules != old.EnablePlacementRules { - raftCluster := s.GetRaftCluster() - if raftCluster == nil { + rc := s.GetRaftCluster() + if rc == nil { return errs.ErrNotBootstrapped.GenWithStackByArgs() } if cfg.EnablePlacementRules { // initialize rule manager. - if err := raftCluster.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels); err != nil { + if err := rc.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels); err != nil { return err } } else { // NOTE: can be removed after placement rules feature is enabled by default. 
- for _, s := range raftCluster.GetStores() { + for _, s := range rc.GetStores() { if !s.IsRemoved() && s.IsTiFlash() { return errors.New("cannot disable placement rules with TiFlash nodes") } @@ -1043,8 +1044,12 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { var rule *placement.Rule if cfg.EnablePlacementRules { + rc := s.GetRaftCluster() + if rc == nil { + return errs.ErrNotBootstrapped.GenWithStackByArgs() + } // replication.MaxReplicas won't work when placement rule is enabled and not only have one default rule. - defaultRule := s.GetRaftCluster().GetRuleManager().GetRule("pd", "default") + defaultRule := rc.GetRuleManager().GetRule("pd", "default") CheckInDefaultRule := func() error { // replication config won't work when placement rule is enabled and exceeds one default rule @@ -1070,7 +1075,11 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { if rule != nil { rule.Count = int(cfg.MaxReplicas) rule.LocationLabels = cfg.LocationLabels - if err := s.GetRaftCluster().GetRuleManager().SetRule(rule); err != nil { + rc := s.GetRaftCluster() + if rc == nil { + return errs.ErrNotBootstrapped.GenWithStackByArgs() + } + if err := rc.GetRuleManager().SetRule(rule); err != nil { log.Error("failed to update rule count", errs.ZapError(err)) return err @@ -1082,7 +1091,11 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { s.persistOptions.SetReplicationConfig(old) if rule != nil { rule.Count = int(old.MaxReplicas) - if e := s.GetRaftCluster().GetRuleManager().SetRule(rule); e != nil { + rc := s.GetRaftCluster() + if rc == nil { + return errs.ErrNotBootstrapped.GenWithStackByArgs() + } + if e := rc.GetRuleManager().SetRule(rule); e != nil { log.Error("failed to roll back count of rule when update replication config", errs.ZapError(e)) } } @@ -1370,18 +1383,18 @@ func (s *Server) GetServerOption() *config.PersistOptions { // GetMetaRegions gets meta regions from cluster. func (s *Server) GetMetaRegions() []*metapb.Region { - cluster := s.GetRaftCluster() - if cluster != nil { - return cluster.GetMetaRegions() + rc := s.GetRaftCluster() + if rc != nil { + return rc.GetMetaRegions() } return nil } // GetRegions gets regions from cluster. func (s *Server) GetRegions() []*core.RegionInfo { - cluster := s.GetRaftCluster() - if cluster != nil { - return cluster.GetRegions() + rc := s.GetRaftCluster() + if rc != nil { + return rc.GetRegions() } return nil } @@ -1518,9 +1531,9 @@ func (s *Server) SetReplicationModeConfig(cfg config.ReplicationModeConfig) erro } log.Info("replication mode config is updated", zap.Reflect("new", cfg), zap.Reflect("old", old)) - cluster := s.GetRaftCluster() - if cluster != nil { - err := cluster.GetReplicationMode().UpdateConfig(cfg) + rc := s.GetRaftCluster() + if rc != nil { + err := rc.GetReplicationMode().UpdateConfig(cfg) if err != nil { log.Warn("failed to update replication mode", errs.ZapError(err)) // revert to old config @@ -1546,7 +1559,7 @@ func (s *Server) IsServing() bool { } // AddServiceReadyCallback adds callbacks when the server becomes the leader if there is embedded etcd, or the primary otherwise. -func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context)) { +func (s *Server) AddServiceReadyCallback(callbacks ...func(context.Context) error) { s.leaderCallbacks = append(s.leaderCallbacks, callbacks...) 
} @@ -1669,8 +1682,11 @@ func (s *Server) campaignLeader() { defer func() { s.tsoAllocatorManager.ResetAllocatorGroup(tso.GlobalDCLocation) failpoint.Inject("updateAfterResetTSO", func() { - if err = allocator.UpdateTSO(); err != nil { - panic(err) + if err = allocator.UpdateTSO(); !errorspkg.Is(err, errs.ErrUpdateTimestamp) { + log.Panic("the tso update after reset should return ErrUpdateTimestamp as expected", zap.Error(err)) + } + if allocator.IsInitialize() { + log.Panic("the allocator should be uninitialized after reset") } }) }() @@ -1988,7 +2004,11 @@ func (s *Server) RecoverAllocID(ctx context.Context, id uint64) error { // GetExternalTS returns external timestamp. func (s *Server) GetExternalTS() uint64 { - return s.GetRaftCluster().GetExternalTS() + rc := s.GetRaftCluster() + if rc == nil { + return 0 + } + return rc.GetExternalTS() } // SetExternalTS returns external timestamp. @@ -1998,14 +2018,18 @@ func (s *Server) SetExternalTS(externalTS, globalTS uint64) error { log.Error(desc, zap.Uint64("request timestamp", externalTS), zap.Uint64("global ts", globalTS)) return errors.New(desc) } - currentExternalTS := s.GetRaftCluster().GetExternalTS() + c := s.GetRaftCluster() + if c == nil { + return errs.ErrNotBootstrapped.FastGenByArgs() + } + currentExternalTS := c.GetExternalTS() if tsoutil.CompareTimestampUint64(externalTS, currentExternalTS) != 1 { desc := "the external timestamp should be larger than current external timestamp" log.Error(desc, zap.Uint64("request", externalTS), zap.Uint64("current", currentExternalTS)) return errors.New(desc) } - s.GetRaftCluster().SetExternalTS(externalTS) - return nil + + return c.SetExternalTS(externalTS) } // IsLocalTSOEnabled returns if the local TSO is enabled. diff --git a/tests/integrations/client/go.mod b/tests/integrations/client/go.mod index d9e37dc9d168..72eac02d3411 100644 --- a/tests/integrations/client/go.mod +++ b/tests/integrations/client/go.mod @@ -13,7 +13,7 @@ replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( github.com/docker/go-units v0.4.0 github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 + github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/stretchr/testify v1.8.2 github.com/tikv/pd v0.0.0-00010101000000-000000000000 diff --git a/tests/integrations/client/go.sum b/tests/integrations/client/go.sum index 5e8061733aac..2e3d63298861 100644 --- a/tests/integrations/client/go.sum +++ b/tests/integrations/client/go.sum @@ -402,8 +402,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 h1:EvqKcDT7ceGLW0mXqM8Cp5Z8DfgQRnwj2YTnlCLj2QI= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 h1:Gn8rf2Mb3QDifUQHdtcopqKclc9L11hjhZFYBE65lcw= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log 
v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= diff --git a/tests/integrations/mcs/go.mod b/tests/integrations/mcs/go.mod index 284810c5f1a6..62b4b022eadf 100644 --- a/tests/integrations/mcs/go.mod +++ b/tests/integrations/mcs/go.mod @@ -12,7 +12,7 @@ replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 - github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 + github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 github.com/stretchr/testify v1.8.2 github.com/tikv/pd v0.0.0-00010101000000-000000000000 diff --git a/tests/integrations/mcs/go.sum b/tests/integrations/mcs/go.sum index b30af0f68fcb..a759c68761d8 100644 --- a/tests/integrations/mcs/go.sum +++ b/tests/integrations/mcs/go.sum @@ -407,8 +407,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 h1:EvqKcDT7ceGLW0mXqM8Cp5Z8DfgQRnwj2YTnlCLj2QI= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 h1:Gn8rf2Mb3QDifUQHdtcopqKclc9L11hjhZFYBE65lcw= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tests/integrations/mcs/resourcemanager/resource_manager_test.go b/tests/integrations/mcs/resourcemanager/resource_manager_test.go index e0d295c825ab..926484cea1e0 100644 --- a/tests/integrations/mcs/resourcemanager/resource_manager_test.go +++ b/tests/integrations/mcs/resourcemanager/resource_manager_test.go @@ -767,7 +767,7 @@ func (suite *resourceManagerClientTestSuite) TestBasicResourceGroupCURD() { }, {"test2", rmpb.GroupMode_RUMode, true, true, - `{"name":"test2","mode":1,"r_u_settings":{"r_u":{"settings":{"fill_rate":20000},"state":{"initialized":false}}},"priority":0,"runaway_settings":{"rule":{"exec_elapsed_time_ms":10000},"action":1},"background_settings":{"job_types":["test"]}}`, + `{"name":"test2","mode":1,"r_u_settings":{"r_u":{"settings":{"fill_rate":20000},"state":{"initialized":false}}},"priority":0,"runaway_settings":{"rule":{"exec_elapsed_time_ms":10000},"action":2},"background_settings":{"job_types":["test"]}}`, func(gs *rmpb.ResourceGroup) { gs.RUSettings = &rmpb.GroupRequestUnitSettings{ RU: &rmpb.TokenBucket{ @@ -788,7 +788,7 @@ func (suite *resourceManagerClientTestSuite) TestBasicResourceGroupCURD() { }, }, {"test2", 
rmpb.GroupMode_RUMode, false, true, - `{"name":"test2","mode":1,"r_u_settings":{"r_u":{"settings":{"fill_rate":30000,"burst_limit":-1},"state":{"initialized":false}}},"priority":0,"runaway_settings":{"rule":{"exec_elapsed_time_ms":1000},"action":2,"watch":{"lasting_duration_ms":100000,"type":2}},"background_settings":{"job_types":["br","lightning"]}}`, + `{"name":"test2","mode":1,"r_u_settings":{"r_u":{"settings":{"fill_rate":30000,"burst_limit":-1},"state":{"initialized":false}}},"priority":0,"runaway_settings":{"rule":{"exec_elapsed_time_ms":1000},"action":3,"watch":{"lasting_duration_ms":100000,"type":2}},"background_settings":{"job_types":["br","lightning"]}}`, func(gs *rmpb.ResourceGroup) { gs.RUSettings = &rmpb.GroupRequestUnitSettings{ RU: &rmpb.TokenBucket{ diff --git a/tests/integrations/mcs/scheduling/config_test.go b/tests/integrations/mcs/scheduling/config_test.go index ef52203e349d..032cb4ad7aeb 100644 --- a/tests/integrations/mcs/scheduling/config_test.go +++ b/tests/integrations/mcs/scheduling/config_test.go @@ -136,18 +136,10 @@ func (suite *configTestSuite) TestSchedulerConfigWatch() { ) re.NoError(err) // Get all default scheduler names. - var ( - schedulerNames []string - schedulerController = suite.pdLeaderServer.GetRaftCluster().GetCoordinator().GetSchedulersController() - ) + var schedulerNames, _, _ = suite.pdLeaderServer.GetRaftCluster().GetStorage().LoadAllScheduleConfig() + testutil.Eventually(re, func() bool { - schedulerNames = schedulerController.GetSchedulerNames() targetCount := len(sc.DefaultSchedulers) - // In the previous case, StoreConfig of raft-kv2 has been persisted. So, it might - // have EvictSlowTrendName. - if exists, _ := schedulerController.IsSchedulerExisted(schedulers.EvictSlowTrendName); exists { - targetCount += 1 - } return len(schedulerNames) == targetCount }) // Check all default schedulers' configs. diff --git a/tests/integrations/mcs/scheduling/meta_test.go b/tests/integrations/mcs/scheduling/meta_test.go new file mode 100644 index 000000000000..74497e0b5523 --- /dev/null +++ b/tests/integrations/mcs/scheduling/meta_test.go @@ -0,0 +1,102 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scheduling + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/stretchr/testify/suite" + "github.com/tikv/pd/pkg/core" + "github.com/tikv/pd/pkg/mcs/scheduling/server/meta" + "github.com/tikv/pd/pkg/utils/testutil" + "github.com/tikv/pd/tests" +) + +type metaTestSuite struct { + suite.Suite + + ctx context.Context + cancel context.CancelFunc + + // The PD cluster. + cluster *tests.TestCluster + // pdLeaderServer is the leader server of the PD cluster. 
+ pdLeaderServer *tests.TestServer +} + +func TestMeta(t *testing.T) { + suite.Run(t, &metaTestSuite{}) +} + +func (suite *metaTestSuite) SetupSuite() { + re := suite.Require() + re.NoError(failpoint.Enable("github.com/tikv/pd/server/cluster/highFrequencyClusterJobs", `return(true)`)) + var err error + suite.ctx, suite.cancel = context.WithCancel(context.Background()) + suite.cluster, err = tests.NewTestAPICluster(suite.ctx, 1) + re.NoError(err) + err = suite.cluster.RunInitialServers() + re.NoError(err) + leaderName := suite.cluster.WaitLeader() + suite.pdLeaderServer = suite.cluster.GetServer(leaderName) + re.NoError(suite.pdLeaderServer.BootstrapCluster()) +} + +func (suite *metaTestSuite) TearDownSuite() { + suite.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/highFrequencyClusterJobs")) + suite.cancel() + suite.cluster.Destroy() +} + +func (suite *metaTestSuite) TestStoreWatch() { + re := suite.Require() + + cluster := core.NewBasicCluster() + // Create a meta watcher. + _, err := meta.NewWatcher( + suite.ctx, + suite.pdLeaderServer.GetEtcdClient(), + suite.cluster.GetCluster().GetId(), + cluster, + ) + re.NoError(err) + for i := uint64(1); i <= 4; i++ { + suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + &metapb.Store{Id: i, Address: fmt.Sprintf("mock-%d", i), State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano()}, + ) + } + + suite.pdLeaderServer.GetRaftCluster().RemoveStore(2, false) + testutil.Eventually(re, func() bool { + s := cluster.GetStore(2) + if s == nil { + return false + } + return s.GetState() == metapb.StoreState_Offline + }) + re.Len(cluster.GetStores(), 4) + testutil.Eventually(re, func() bool { + return cluster.GetStore(2).GetState() == metapb.StoreState_Tombstone + }) + re.NoError(suite.pdLeaderServer.GetRaftCluster().RemoveTombStoneRecords()) + testutil.Eventually(re, func() bool { + return cluster.GetStore(2) == nil + }) +} diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index ab612869893b..9b5371deb628 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -67,13 +67,16 @@ func (suite *serverTestSuite) TearDownSuite() { func (suite *serverTestSuite) TestAllocID() { re := suite.Require() + re.NoError(failpoint.Enable("github.com/tikv/pd/pkg/mcs/scheduling/server/fastUpdateMember", `return(true)`)) tc, err := tests.NewTestSchedulingCluster(suite.ctx, 1, suite.backendEndpoints) re.NoError(err) defer tc.Destroy() tc.WaitForPrimaryServing(re) + time.Sleep(200 * time.Millisecond) id, err := tc.GetPrimaryServer().GetCluster().AllocID() re.NoError(err) re.NotEqual(uint64(0), id) + re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/mcs/scheduling/server/fastUpdateMember")) } func (suite *serverTestSuite) TestAllocIDAfterLeaderChange() { @@ -83,15 +86,32 @@ func (suite *serverTestSuite) TestAllocIDAfterLeaderChange() { re.NoError(err) defer tc.Destroy() tc.WaitForPrimaryServing(re) + time.Sleep(200 * time.Millisecond) cluster := tc.GetPrimaryServer().GetCluster() id, err := cluster.AllocID() re.NoError(err) re.NotEqual(uint64(0), id) suite.cluster.ResignLeader() suite.cluster.WaitLeader() - time.Sleep(time.Second) + time.Sleep(200 * time.Millisecond) id1, err := cluster.AllocID() re.NoError(err) re.Greater(id1, id) re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/mcs/scheduling/server/fastUpdateMember")) } + +func (suite *serverTestSuite) TestPrimaryChange() { + re 
:= suite.Require() + tc, err := tests.NewTestSchedulingCluster(suite.ctx, 2, suite.backendEndpoints) + re.NoError(err) + defer tc.Destroy() + tc.WaitForPrimaryServing(re) + primary := tc.GetPrimaryServer() + addr := primary.GetAddr() + re.Len(primary.GetCluster().GetCoordinator().GetSchedulersController().GetSchedulerNames(), 5) + primary.Close() + tc.WaitForPrimaryServing(re) + primary = tc.GetPrimaryServer() + re.NotEqual(addr, primary.GetAddr()) + re.Len(primary.GetCluster().GetCoordinator().GetSchedulersController().GetSchedulerNames(), 5) +} diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index 834985edafc9..3d3fe25b3729 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -289,7 +289,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) requestTSO( primary := suite.tsoCluster.WaitForPrimaryServing(re, keyspaceID, keyspaceGroupID) kgm := primary.GetKeyspaceGroupManager() re.NotNil(kgm) - ts, _, err := kgm.HandleTSORequest(keyspaceID, keyspaceGroupID, tsopkg.GlobalDCLocation, 1) + ts, _, err := kgm.HandleTSORequest(suite.ctx, keyspaceID, keyspaceGroupID, tsopkg.GlobalDCLocation, 1) return ts, err } @@ -648,14 +648,16 @@ func waitFinishMerge( mergeTargetID uint32, keyspaces []uint32, ) { + var kg *endpoint.KeyspaceGroup testutil.Eventually(re, func() bool { - kg := handlersutil.MustLoadKeyspaceGroupByID(re, server, mergeTargetID) - re.Equal(mcsutils.DefaultKeyspaceGroupID, kg.ID) - for _, keyspaceID := range keyspaces { - re.Contains(kg.Keyspaces, keyspaceID) - } + kg = handlersutil.MustLoadKeyspaceGroupByID(re, server, mergeTargetID) + re.Equal(mergeTargetID, kg.ID) return !kg.IsMergeTarget() }) + // If the merge is finished, the target keyspace group should contain all the keyspaces. + for _, keyspaceID := range keyspaces { + re.Contains(kg.Keyspaces, keyspaceID) + } } func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeBeforeInitTSO() { diff --git a/tests/integrations/mcs/tso/proxy_test.go b/tests/integrations/mcs/tso/proxy_test.go index 625a702ad39f..fc33a6a41be3 100644 --- a/tests/integrations/mcs/tso/proxy_test.go +++ b/tests/integrations/mcs/tso/proxy_test.go @@ -317,7 +317,6 @@ func (s *tsoProxyTestSuite) verifyTSOProxy( for j := 0; j < requestsPerClient; j++ { select { case <-ctx.Done(): - respErr.Store(ctx.Err()) s.cleanupGRPCStream(streams, cleanupFuncs, i) return default: diff --git a/tests/integrations/mcs/tso/server_test.go b/tests/integrations/mcs/tso/server_test.go index 72430c507c7b..d87f15421799 100644 --- a/tests/integrations/mcs/tso/server_test.go +++ b/tests/integrations/mcs/tso/server_test.go @@ -136,7 +136,7 @@ func (suite *tsoServerTestSuite) TestParticipantStartWithAdvertiseListenAddr() { re.NoError(err) // Setup the logger. 
- err = tests.InitTSOLogger(cfg) + err = tests.InitLogger(cfg.Log, cfg.Logger, cfg.LogProps, cfg.Security.RedactInfoLog) re.NoError(err) s, cleanup, err := tests.NewTSOTestServer(suite.ctx, cfg) diff --git a/tests/integrations/tso/client_test.go b/tests/integrations/tso/client_test.go index d7f1bd3b5d21..1d2f437e8498 100644 --- a/tests/integrations/tso/client_test.go +++ b/tests/integrations/tso/client_test.go @@ -436,7 +436,7 @@ func TestMixedTSODeployment(t *testing.T) { ctx1, cancel1 := context.WithCancel(context.Background()) var wg sync.WaitGroup - checkTSO(ctx1, re, &wg, backendEndpoints, pd.WithAllowTSOFallback() /* It's expected that the timestamp fallback happens here */) + checkTSO(ctx1, re, &wg, backendEndpoints) wg.Add(1) go func() { defer wg.Done() @@ -498,14 +498,13 @@ func TestUpgradingAPIandTSOClusters(t *testing.T) { } func checkTSO( - ctx context.Context, re *require.Assertions, wg *sync.WaitGroup, - backendEndpoints string, opts ...pd.ClientOption, + ctx context.Context, re *require.Assertions, wg *sync.WaitGroup, backendEndpoints string, ) { wg.Add(tsoRequestConcurrencyNumber) for i := 0; i < tsoRequestConcurrencyNumber; i++ { go func() { defer wg.Done() - cli := mcs.SetupClientWithAPIContext(ctx, re, pd.NewAPIContextV1(), strings.Split(backendEndpoints, ","), opts...) + cli := mcs.SetupClientWithAPIContext(ctx, re, pd.NewAPIContextV1(), strings.Split(backendEndpoints, ",")) defer cli.Close() var ts, lastTS uint64 for { diff --git a/tests/integrations/tso/go.mod b/tests/integrations/tso/go.mod index d559e573ec74..84825b5a4660 100644 --- a/tests/integrations/tso/go.mod +++ b/tests/integrations/tso/go.mod @@ -13,7 +13,7 @@ replace google.golang.org/grpc v1.54.0 => google.golang.org/grpc v1.26.0 require ( github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c - github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 + github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 github.com/stretchr/testify v1.8.4 github.com/tikv/pd v0.0.0-00010101000000-000000000000 github.com/tikv/pd/client v0.0.0-00010101000000-000000000000 diff --git a/tests/integrations/tso/go.sum b/tests/integrations/tso/go.sum index 0503d79d0668..6f9c7f8f2cdb 100644 --- a/tests/integrations/tso/go.sum +++ b/tests/integrations/tso/go.sum @@ -401,8 +401,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c h1:CgbKAHto5CQgWM9fSBIvaxsJHuGP0uM74HXtv3MyyGQ= github.com/pingcap/failpoint v0.0.0-20220801062533-2eaa32854a6c/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 h1:EvqKcDT7ceGLW0mXqM8Cp5Z8DfgQRnwj2YTnlCLj2QI= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 h1:Gn8rf2Mb3QDifUQHdtcopqKclc9L11hjhZFYBE65lcw= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git 
a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index 087ec2ca39e4..3d0146589d56 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -321,6 +321,7 @@ func TestPlacementRules(t *testing.T) { f, _ := os.CreateTemp("/tmp", "pd_tests") fname := f.Name() f.Close() + defer os.RemoveAll(fname) // test load _, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "load", "--out="+fname) @@ -493,9 +494,7 @@ func TestPlacementRuleBundle(t *testing.T) { re.NoError(err) fname := f.Name() f.Close() - defer func() { - os.RemoveAll(fname) - }() + defer os.RemoveAll(fname) // test load var bundles []placement.GroupBundle diff --git a/tests/pdctl/scheduler/scheduler_test.go b/tests/pdctl/scheduler/scheduler_test.go index e1c0a210c01c..a0447642cb67 100644 --- a/tests/pdctl/scheduler/scheduler_test.go +++ b/tests/pdctl/scheduler/scheduler_test.go @@ -21,7 +21,9 @@ import ( "time" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/spf13/cobra" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/core" sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/pkg/versioninfo" @@ -66,25 +68,6 @@ func TestScheduler(t *testing.T) { }, } - mustExec := func(args []string, v interface{}) string { - output, err := pdctl.ExecuteCommand(cmd, args...) - re.NoError(err) - if v == nil { - return string(output) - } - re.NoError(json.Unmarshal(output, v)) - return "" - } - - mightExec := func(args []string, v interface{}) { - output, err := pdctl.ExecuteCommand(cmd, args...) - re.NoError(err) - if v == nil { - return - } - json.Unmarshal(output, v) - } - mustUsage := func(args []string) { output, err := pdctl.ExecuteCommand(cmd, args...) re.NoError(err) @@ -93,10 +76,10 @@ func TestScheduler(t *testing.T) { checkSchedulerCommand := func(args []string, expected map[string]bool) { if args != nil { - mustExec(args, nil) + mustExec(re, cmd, args, nil) } var schedulers []string - mustExec([]string{"-u", pdAddr, "scheduler", "show"}, &schedulers) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show"}, &schedulers) for _, scheduler := range schedulers { re.True(expected[scheduler]) } @@ -104,30 +87,21 @@ func TestScheduler(t *testing.T) { checkSchedulerConfigCommand := func(args []string, expectedConfig map[string]interface{}, schedulerName string) { if args != nil { - mustExec(args, nil) + mustExec(re, cmd, args, nil) } configInfo := make(map[string]interface{}) - mustExec([]string{"-u", pdAddr, "scheduler", "config", schedulerName}, &configInfo) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", schedulerName}, &configInfo) re.Equal(expectedConfig, configInfo) } - checkSchedulerDescribeCommand := func(schedulerName, expectedStatus, expectedSummary string) { - result := make(map[string]interface{}) - testutil.Eventually(re, func() bool { - mightExec([]string{"-u", pdAddr, "scheduler", "describe", schedulerName}, &result) - return len(result) != 0 - }, testutil.WithTickInterval(50*time.Millisecond)) - re.Equal(expectedStatus, result["status"]) - re.Equal(expectedSummary, result["summary"]) - } - leaderServer := cluster.GetServer(cluster.GetLeader()) re.NoError(leaderServer.BootstrapCluster()) for _, store := range stores { pdctl.MustPutStore(re, leaderServer.GetServer(), store) } - pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b")) + // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. 
+ pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) time.Sleep(3 * time.Second) // scheduler show command @@ -135,29 +109,21 @@ func TestScheduler(t *testing.T) { "balance-region-scheduler": true, "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, } checkSchedulerCommand(nil, expected) - echo := mustExec([]string{"-u", pdAddr, "config", "set", "enable-diagnostic", "true"}, nil) - re.Contains(echo, "Success!") - checkSchedulerDescribeCommand("balance-region-scheduler", "pending", "1 store(s) RegionNotMatchRule; ") - // scheduler delete command args := []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"} expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, } checkSchedulerCommand(args, expected) - checkSchedulerDescribeCommand("balance-region-scheduler", "disabled", "") - schedulers := []string{"evict-leader-scheduler", "grant-leader-scheduler"} for idx := range schedulers { @@ -166,7 +132,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, schedulers[idx]: true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, @@ -183,7 +148,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, schedulers[idx]: true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, @@ -199,7 +163,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, } @@ -210,7 +173,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, schedulers[idx]: true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, @@ -222,7 +184,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, schedulers[idx]: true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, @@ -238,7 +199,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, schedulers[idx]: true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, @@ -254,7 +214,6 @@ func TestScheduler(t *testing.T) { expected = map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, } @@ -265,25 +224,23 @@ func TestScheduler(t *testing.T) { checkSchedulerCommand([]string{"-u", pdAddr, "scheduler", "add", "shuffle-region-scheduler"}, map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "shuffle-region-scheduler": true, 
"transfer-witness-leader-scheduler": true, "balance-witness-scheduler": true, }) var roles []string - mustExec([]string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "show-roles"}, &roles) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "show-roles"}, &roles) re.Equal([]string{"leader", "follower", "learner"}, roles) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "set-roles", "learner"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "show-roles"}, &roles) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "set-roles", "learner"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler", "show-roles"}, &roles) re.Equal([]string{"learner"}, roles) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler"}, &roles) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "shuffle-region-scheduler"}, &roles) re.Equal([]string{"learner"}, roles) // test grant hot region scheduler config checkSchedulerCommand([]string{"-u", pdAddr, "scheduler", "add", "grant-hot-region-scheduler", "1", "1,2,3"}, map[string]bool{ "balance-leader-scheduler": true, "balance-hot-region-scheduler": true, - "split-bucket-scheduler": true, "shuffle-region-scheduler": true, "grant-hot-region-scheduler": true, "transfer-witness-leader-scheduler": true, @@ -294,30 +251,30 @@ func TestScheduler(t *testing.T) { "store-id": []interface{}{float64(1), float64(2), float64(3)}, "store-leader-id": float64(1), } - mustExec([]string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) re.Equal(expected3, conf3) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler", "set", "2", "1,2,3"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler", "set", "2", "1,2,3"}, nil) expected3["store-leader-id"] = float64(2) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "grant-hot-region-scheduler"}, &conf3) re.Equal(expected3, conf3) // test balance region config - echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "balance-region-scheduler"}, nil) + echo := mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-region-scheduler"}, nil) re.Contains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) re.Contains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) re.NotContains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "evict-leader-scheduler", "1"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "evict-leader-scheduler", "1"}, nil) re.Contains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", 
"evict-leader-scheduler-1"}, nil) re.Contains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "evict-leader-scheduler-1"}, nil) re.Contains(echo, "404") // test hot region config - echo = mustExec([]string{"-u", pdAddr, "scheduler", "config", "evict-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "evict-leader-scheduler"}, nil) re.Contains(echo, "[404] scheduler not found") expected1 := map[string]interface{}{ "min-hot-byte-rate": float64(100), @@ -342,63 +299,63 @@ func TestScheduler(t *testing.T) { "split-thresholds": 0.2, } var conf map[string]interface{} - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "list"}, &conf) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "list"}, &conf) re.Equal(expected1, conf) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "show"}, &conf) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "show"}, &conf) re.Equal(expected1, conf) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "src-tolerance-ratio", "1.02"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "src-tolerance-ratio", "1.02"}, nil) expected1["src-tolerance-ratio"] = 1.02 var conf1 map[string]interface{} - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,key"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,key"}, nil) expected1["read-priorities"] = []interface{}{"byte", "key"} - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,byte"}, nil) expected1["read-priorities"] = []interface{}{"key", "byte"} - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", 
"read-priorities", "foo,bar"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "foo,bar"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", ""}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", ""}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,byte"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "byte,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key,byte"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "read-priorities", "key,key,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) // write-priorities is divided into write-leader-priorities and write-peer-priorities - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-priorities", "key,byte"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-priorities", "key,byte"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v0"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v0"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) expected1["rank-formula-version"] = "v2" - 
mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v2"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v2"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) expected1["rank-formula-version"] = "v1" - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v1"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "rank-formula-version", "v1"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) expected1["forbid-rw-type"] = "read" - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "forbid-rw-type", "read"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "forbid-rw-type", "read"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(expected1, conf1) // test compatibility @@ -410,69 +367,68 @@ func TestScheduler(t *testing.T) { } re.Equal("5.2.0", leaderServer.GetClusterVersion().String()) // After upgrading, we should not use query. - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(conf1["read-priorities"], []interface{}{"key", "byte"}) // cannot set qps as write-peer-priorities - echo = mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-peer-priorities", "query,byte"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler", "set", "write-peer-priorities", "query,byte"}, nil) re.Contains(echo, "query is not allowed to be set in priorities for write-peer-priorities") - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-hot-region-scheduler"}, &conf1) re.Equal(conf1["write-peer-priorities"], []interface{}{"byte", "key"}) // test remove and add - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "balance-hot-region-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-hot-region-scheduler"}, nil) re.Contains(echo, "Success") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "balance-hot-region-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-hot-region-scheduler"}, nil) re.Contains(echo, "Success") // test balance leader config conf = make(map[string]interface{}) conf1 = make(map[string]interface{}) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", "show"}, &conf) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", 
"show"}, &conf) re.Equal(4., conf["batch"]) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", "set", "batch", "3"}, nil) - mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, &conf1) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler", "set", "batch", "3"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, &conf1) re.Equal(3., conf1["batch"]) - echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) re.NotContains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "balance-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-leader-scheduler"}, nil) re.Contains(echo, "Success!") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "remove", "balance-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-leader-scheduler"}, nil) re.Contains(echo, "404") re.Contains(echo, "PD:scheduler:ErrSchedulerNotFound]scheduler not found") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "config", "balance-leader-scheduler"}, nil) re.Contains(echo, "404") re.Contains(echo, "scheduler not found") - echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "balance-leader-scheduler"}, nil) re.Contains(echo, "Success!") // test show scheduler with paused and disabled status. checkSchedulerWithStatusCommand := func(args []string, status string, expected []string) { if args != nil { - mustExec(args, nil) + mustExec(re, cmd, args, nil) } var schedulers []string - mustExec([]string{"-u", pdAddr, "scheduler", "show", "--status", status}, &schedulers) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "show", "--status", status}, &schedulers) re.Equal(expected, schedulers) } mustUsage([]string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler"}) - mustExec([]string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler", "60"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler", "60"}, nil) checkSchedulerWithStatusCommand(nil, "paused", []string{ "balance-leader-scheduler", }) result := make(map[string]interface{}) testutil.Eventually(re, func() bool { - mightExec([]string{"-u", pdAddr, "scheduler", "describe", "balance-leader-scheduler"}, &result) + mightExec(re, cmd, []string{"-u", pdAddr, "scheduler", "describe", "balance-leader-scheduler"}, &result) return len(result) != 0 && result["status"] == "paused" && result["summary"] == "" }, testutil.WithWaitFor(30*time.Second)) mustUsage([]string{"-u", pdAddr, "scheduler", "resume", "balance-leader-scheduler", "60"}) - mustExec([]string{"-u", pdAddr, "scheduler", "resume", "balance-leader-scheduler"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "resume", "balance-leader-scheduler"}, nil) checkSchedulerWithStatusCommand(nil, "paused", nil) - checkSchedulerDescribeCommand("balance-leader-scheduler", "normal", "") // set label scheduler to disabled manually. 
- echo = mustExec([]string{"-u", pdAddr, "scheduler", "add", "label-scheduler"}, nil) + echo = mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "add", "label-scheduler"}, nil) re.Contains(echo, "Success!") cfg := leaderServer.GetServer().GetScheduleConfig() origin := cfg.Schedulers @@ -486,3 +442,91 @@ func TestScheduler(t *testing.T) { re.NoError(err) checkSchedulerWithStatusCommand(nil, "disabled", nil) } + +func TestSchedulerDiagnostic(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + cluster, err := tests.NewTestCluster(ctx, 1) + re.NoError(err) + defer cluster.Destroy() + err = cluster.RunInitialServers() + re.NoError(err) + cluster.WaitLeader() + pdAddr := cluster.GetConfig().GetClientURL() + cmd := pdctlCmd.GetRootCmd() + + checkSchedulerDescribeCommand := func(schedulerName, expectedStatus, expectedSummary string) { + result := make(map[string]interface{}) + testutil.Eventually(re, func() bool { + mightExec(re, cmd, []string{"-u", pdAddr, "scheduler", "describe", schedulerName}, &result) + return len(result) != 0 + }, testutil.WithTickInterval(50*time.Millisecond)) + re.Equal(expectedStatus, result["status"]) + re.Equal(expectedSummary, result["summary"]) + } + + stores := []*metapb.Store{ + { + Id: 1, + State: metapb.StoreState_Up, + LastHeartbeat: time.Now().UnixNano(), + }, + { + Id: 2, + State: metapb.StoreState_Up, + LastHeartbeat: time.Now().UnixNano(), + }, + { + Id: 3, + State: metapb.StoreState_Up, + LastHeartbeat: time.Now().UnixNano(), + }, + { + Id: 4, + State: metapb.StoreState_Up, + LastHeartbeat: time.Now().UnixNano(), + }, + } + leaderServer := cluster.GetServer(cluster.GetLeader()) + re.NoError(leaderServer.BootstrapCluster()) + for _, store := range stores { + pdctl.MustPutStore(re, leaderServer.GetServer(), store) + } + + // note: because pdqsort is a unstable sort algorithm, set ApproximateSize for this region. + pdctl.MustPutRegion(re, cluster, 1, 1, []byte("a"), []byte("b"), core.SetApproximateSize(10)) + time.Sleep(3 * time.Second) + + echo := mustExec(re, cmd, []string{"-u", pdAddr, "config", "set", "enable-diagnostic", "true"}, nil) + re.Contains(echo, "Success!") + checkSchedulerDescribeCommand("balance-region-scheduler", "pending", "1 store(s) RegionNotMatchRule; ") + + // scheduler delete command + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "remove", "balance-region-scheduler"}, nil) + + checkSchedulerDescribeCommand("balance-region-scheduler", "disabled", "") + + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "pause", "balance-leader-scheduler", "60"}, nil) + mustExec(re, cmd, []string{"-u", pdAddr, "scheduler", "resume", "balance-leader-scheduler"}, nil) + checkSchedulerDescribeCommand("balance-leader-scheduler", "normal", "") +} + +func mustExec(re *require.Assertions, cmd *cobra.Command, args []string, v interface{}) string { + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + if v == nil { + return string(output) + } + re.NoError(json.Unmarshal(output, v)) + return "" +} + +func mightExec(re *require.Assertions, cmd *cobra.Command, args []string, v interface{}) { + output, err := pdctl.ExecuteCommand(cmd, args...) 
+ re.NoError(err) + if v == nil { + return + } + json.Unmarshal(output, v) +} diff --git a/tests/scheduling_cluster.go b/tests/scheduling_cluster.go index 209371d197cd..1768c4128cc4 100644 --- a/tests/scheduling_cluster.go +++ b/tests/scheduling_cluster.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/require" scheduling "github.com/tikv/pd/pkg/mcs/scheduling/server" sc "github.com/tikv/pd/pkg/mcs/scheduling/server/config" + "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/testutil" ) @@ -36,6 +37,7 @@ type TestSchedulingCluster struct { // NewTestSchedulingCluster creates a new scheduling test cluster. func NewTestSchedulingCluster(ctx context.Context, initialServerCount int, backendEndpoints string) (tc *TestSchedulingCluster, err error) { + schedulers.Register() tc = &TestSchedulingCluster{ ctx: ctx, backendEndpoints: backendEndpoints, @@ -61,7 +63,7 @@ func (tc *TestSchedulingCluster) AddServer(addr string) error { if err != nil { return err } - err = InitSchedulingLogger(generatedCfg) + err = InitLogger(generatedCfg.Log, generatedCfg.Logger, generatedCfg.LogProps, generatedCfg.Security.RedactInfoLog) if err != nil { return err } diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index 375a0cf7c80b..61d47d7790c2 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -30,7 +30,6 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" - "github.com/pingcap/log" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" "github.com/tikv/pd/pkg/core" @@ -453,13 +452,8 @@ func (suite *middlewareTestSuite) TestAuditPrometheusBackend() { } func (suite *middlewareTestSuite) TestAuditLocalLogBackend() { - tempStdoutFile, _ := os.CreateTemp("/tmp", "pd_tests") - defer os.Remove(tempStdoutFile.Name()) - cfg := &log.Config{} - cfg.File.Filename = tempStdoutFile.Name() - cfg.Level = "info" - lg, p, _ := log.InitLogger(cfg) - log.ReplaceGlobals(lg, p) + fname := testutil.InitTempFileLogger("info") + defer os.RemoveAll(fname) leader := suite.cluster.GetServer(suite.cluster.GetLeader()) input := map[string]interface{}{ "enable-audit": "true", @@ -477,7 +471,7 @@ func (suite *middlewareTestSuite) TestAuditLocalLogBackend() { suite.NoError(err) _, err = io.ReadAll(resp.Body) resp.Body.Close() - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) suite.Contains(string(b), "audit log") suite.NoError(err) suite.Equal(http.StatusOK, resp.StatusCode) @@ -667,13 +661,8 @@ func (suite *redirectorTestSuite) TestNotLeader() { func (suite *redirectorTestSuite) TestXForwardedFor() { leader := suite.cluster.GetServer(suite.cluster.GetLeader()) suite.NoError(leader.BootstrapCluster()) - tempStdoutFile, _ := os.CreateTemp("/tmp", "pd_tests") - defer os.Remove(tempStdoutFile.Name()) - cfg := &log.Config{} - cfg.File.Filename = tempStdoutFile.Name() - cfg.Level = "info" - lg, p, _ := log.InitLogger(cfg) - log.ReplaceGlobals(lg, p) + fname := testutil.InitTempFileLogger("info") + defer os.RemoveAll(fname) follower := suite.cluster.GetServer(suite.cluster.GetFollower()) addr := follower.GetAddr() + "/pd/api/v1/regions" @@ -684,7 +673,7 @@ func (suite *redirectorTestSuite) TestXForwardedFor() { defer resp.Body.Close() suite.Equal(http.StatusOK, resp.StatusCode) time.Sleep(1 * time.Second) - b, _ := os.ReadFile(tempStdoutFile.Name()) + b, _ := os.ReadFile(fname) l := string(b) suite.Contains(l, 
"/pd/api/v1/regions") suite.NotContains(l, suite.cluster.GetConfig().GetClientURLs()) @@ -818,6 +807,46 @@ func TestRemovingProgress(t *testing.T) { re.NoError(failpoint.Disable("github.com/tikv/pd/server/cluster/highFrequencyClusterJobs")) } +func TestSendApiWhenRestartRaftCluster(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + cluster, err := tests.NewTestCluster(ctx, 3, func(conf *config.Config, serverName string) { + conf.Replication.MaxReplicas = 1 + }) + re.NoError(err) + defer cluster.Destroy() + + err = cluster.RunInitialServers() + re.NoError(err) + leader := cluster.GetServer(cluster.WaitLeader()) + + grpcPDClient := testutil.MustNewGrpcClient(re, leader.GetAddr()) + clusterID := leader.GetClusterID() + req := &pdpb.BootstrapRequest{ + Header: testutil.NewRequestHeader(clusterID), + Store: &metapb.Store{Id: 1, Address: "127.0.0.1:0"}, + Region: &metapb.Region{Id: 2, Peers: []*metapb.Peer{{Id: 3, StoreId: 1, Role: metapb.PeerRole_Voter}}}, + } + resp, err := grpcPDClient.Bootstrap(context.Background(), req) + re.NoError(err) + re.Nil(resp.GetHeader().GetError()) + + // Mock restart raft cluster + rc := leader.GetRaftCluster() + re.NotNil(rc) + rc.Stop() + + // Mock client-go will still send request + output := sendRequest(re, leader.GetAddr()+"/pd/api/v1/min-resolved-ts", http.MethodGet, http.StatusInternalServerError) + re.Contains(string(output), "TiKV cluster not bootstrapped, please start TiKV first") + + err = rc.Start(leader.GetServer()) + re.NoError(err) + rc = leader.GetRaftCluster() + re.NotNil(rc) +} + func TestPreparingProgress(t *testing.T) { re := require.New(t) re.NoError(failpoint.Enable("github.com/tikv/pd/server/cluster/highFrequencyClusterJobs", `return(true)`)) diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 87acdf897fdc..f22a754b8bf4 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -351,11 +351,11 @@ func getTestDeployPath(storeID uint64) string { func resetStoreState(re *require.Assertions, rc *cluster.RaftCluster, storeID uint64, state metapb.StoreState) { store := rc.GetStore(storeID) re.NotNil(store) - newStore := store.Clone(core.OfflineStore(false)) + newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) if state == metapb.StoreState_Up { - newStore = newStore.Clone(core.UpStore()) + newStore = newStore.Clone(core.SetStoreState(metapb.StoreState_Up)) } else if state == metapb.StoreState_Tombstone { - newStore = newStore.Clone(core.TombstoneStore()) + newStore = newStore.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) } rc.GetBasicCluster().PutStore(newStore) diff --git a/tests/testutil.go b/tests/testutil.go index 9d7b06322c2f..53efcff76580 100644 --- a/tests/testutil.go +++ b/tests/testutil.go @@ -29,34 +29,20 @@ import ( tso "github.com/tikv/pd/pkg/mcs/tso/server" "github.com/tikv/pd/pkg/utils/logutil" "github.com/tikv/pd/pkg/utils/testutil" + "go.uber.org/zap" ) var once sync.Once -// InitSchedulingLogger initializes the logger for test. -func InitSchedulingLogger(cfg *sc.Config) (err error) { +// InitLogger initializes the logger for test. +func InitLogger(logConfig log.Config, logger *zap.Logger, logProps *log.ZapProperties, isRedactInfoLogEnabled bool) (err error) { once.Do(func() { // Setup the logger. 
- err = logutil.SetupLogger(cfg.Log, &cfg.Logger, &cfg.LogProps, cfg.Security.RedactInfoLog) + err = logutil.SetupLogger(logConfig, &logger, &logProps, isRedactInfoLogEnabled) if err != nil { return } - log.ReplaceGlobals(cfg.Logger, cfg.LogProps) - // Flushing any buffered log entries. - log.Sync() - }) - return err -} - -// InitTSOLogger initializes the logger for test. -func InitTSOLogger(cfg *tso.Config) (err error) { - once.Do(func() { - // Setup the logger. - err = logutil.SetupLogger(cfg.Log, &cfg.Logger, &cfg.LogProps, cfg.Security.RedactInfoLog) - if err != nil { - return - } - log.ReplaceGlobals(cfg.Logger, cfg.LogProps) + log.ReplaceGlobals(logger, logProps) // Flushing any buffered log entries. log.Sync() }) @@ -88,7 +74,7 @@ func StartSingleTSOTestServerWithoutCheck(ctx context.Context, re *require.Asser cfg, err := tso.GenerateConfig(cfg) re.NoError(err) // Setup the logger. - err = InitTSOLogger(cfg) + err = InitLogger(cfg.Log, cfg.Logger, cfg.LogProps, cfg.Security.RedactInfoLog) re.NoError(err) return NewTSOTestServer(ctx, cfg) } diff --git a/tests/tso_cluster.go b/tests/tso_cluster.go index 1264cc8aed83..2f80f7ff9701 100644 --- a/tests/tso_cluster.go +++ b/tests/tso_cluster.go @@ -114,7 +114,7 @@ func (tc *TestTSOCluster) AddServer(addr string) error { if err != nil { return err } - err = InitTSOLogger(generatedCfg) + err = InitLogger(generatedCfg.Log, generatedCfg.Logger, generatedCfg.LogProps, generatedCfg.Security.RedactInfoLog) if err != nil { return err } diff --git a/tools/pd-api-bench/Makefile b/tools/pd-api-bench/Makefile new file mode 100644 index 000000000000..7e517ef92b9f --- /dev/null +++ b/tools/pd-api-bench/Makefile @@ -0,0 +1,34 @@ +# Copyright 2023 TiKV Project Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ROOT_PATH := ../.. +GO_TOOLS_BIN_PATH := $(ROOT_PATH)/.tools/bin +PATH := $(GO_TOOLS_BIN_PATH):$(PATH) +SHELL := env PATH='$(PATH)' GOBIN='$(GO_TOOLS_BIN_PATH)' $(shell which bash) + +install-tools: + cd $(ROOT_PATH) && $(MAKE) install-tools + +static: install-tools + @ echo "gofmt ..." + @ gofmt -s -l -d . 2>&1 | awk '{ print } END { if (NR > 0) { exit 1 } }' + @ echo "golangci-lint ..." + @ golangci-lint run -c $(ROOT_PATH)/.golangci.yml --verbose ./... --allow-parallel-runners + @ echo "revive ..." + @ revive -formatter friendly -config $(ROOT_PATH)/revive.toml ./... + +tidy: + @ go mod tidy + git diff go.mod go.sum | cat + git diff --quiet go.mod go.sum diff --git a/tools/pd-api-bench/cases/cases.go b/tools/pd-api-bench/cases/cases.go index 2b770805cd8e..d431b6f325c4 100644 --- a/tools/pd-api-bench/cases/cases.go +++ b/tools/pd-api-bench/cases/cases.go @@ -29,8 +29,10 @@ import ( ) var ( + // PDAddress is the address of PD server. PDAddress string - Debug bool + // Debug is the flag to print the output of api response for debug. + Debug bool ) var ( @@ -39,6 +41,7 @@ var ( storesID []uint64 ) +// InitCluster initializes the cluster. 
func InitCluster(ctx context.Context, cli pd.Client, httpClit *http.Client) error { req, _ := http.NewRequestWithContext(ctx, http.MethodGet, PDAddress+"/pd/api/v1/stats/region?start_key=&end_key=&count", nil) @@ -67,6 +70,7 @@ func InitCluster(ctx context.Context, cli pd.Client, httpClit *http.Client) erro return nil } +// Case is the interface for all cases. type Case interface { Name() string SetQPS(int64) @@ -101,11 +105,13 @@ func (c *baseCase) GetBurst() int64 { return c.burst } +// GRPCCase is the interface for all gRPC cases. type GRPCCase interface { Case Unary(context.Context, pd.Client) error } +// GRPCCaseMap is the map for all gRPC cases. var GRPCCaseMap = map[string]GRPCCase{ "GetRegion": newGetRegion(), "GetStore": newGetStore(), @@ -113,12 +119,14 @@ var GRPCCaseMap = map[string]GRPCCase{ "ScanRegions": newScanRegions(), } +// HTTPCase is the interface for all HTTP cases. type HTTPCase interface { Case Do(context.Context, *http.Client) error Params(string) } +// HTTPCaseMap is the map for all HTTP cases. var HTTPCaseMap = map[string]HTTPCase{ "GetRegionStatus": newRegionStats(), "GetMinResolvedTS": newMinResolvedTS(), @@ -319,6 +327,7 @@ func (c *getStores) Unary(ctx context.Context, cli pd.Client) error { return nil } +// nolint func generateKeyForSimulator(id int, keyLen int) []byte { k := make([]byte, keyLen) copy(k, fmt.Sprintf("%010d", id)) diff --git a/tools/pd-api-bench/go.mod b/tools/pd-api-bench/go.mod index 6893f87a7950..1cef635e20b9 100644 --- a/tools/pd-api-bench/go.mod +++ b/tools/pd-api-bench/go.mod @@ -70,10 +70,9 @@ require ( github.com/pingcap/errcode v0.3.0 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 // indirect - github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 // indirect + github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 // indirect github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3 // indirect github.com/pingcap/sysutil v1.0.1-0.20230407040306-fb007c5aff21 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20221212215047-62379fc7944b // indirect github.com/prometheus/client_golang v1.11.1 // indirect diff --git a/tools/pd-api-bench/go.sum b/tools/pd-api-bench/go.sum index 1fe19220558f..2cdd7ca7a381 100644 --- a/tools/pd-api-bench/go.sum +++ b/tools/pd-api-bench/go.sum @@ -263,8 +263,8 @@ github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c/go.mod h1:X2r9ue github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 h1:C3N3itkduZXDZFh4N3vQ5HEtld3S+Y+StULhWVvumU0= github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00/go.mod h1:4qGtCB0QK0wBzKtFEGDhxXnSnbQApw1gc9siScUl8ew= github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30 h1:EvqKcDT7ceGLW0mXqM8Cp5Z8DfgQRnwj2YTnlCLj2QI= -github.com/pingcap/kvproto v0.0.0-20230727073445-53e1f8730c30/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974 h1:Gn8rf2Mb3QDifUQHdtcopqKclc9L11hjhZFYBE65lcw= +github.com/pingcap/kvproto v0.0.0-20230905082026-5336fac26974/go.mod h1:r0q/CFcwvyeRhKtoqzmWMBebrtpIziQQ9vR+JKh1knc= github.com/pingcap/log v0.0.0-20210625125904-98ed8e2eb1c7/go.mod h1:8AanEdAHATuRurdGxZXBz0At+9avep+ub7U1AGYLIMM= github.com/pingcap/log 
v1.1.1-0.20221110025148-ca232912c9f3 h1:HR/ylkkLmGdSSDaD8IDP+SZrdhV1Kibl9KrHxJ9eciw= github.com/pingcap/log v1.1.1-0.20221110025148-ca232912c9f3/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= diff --git a/tools/pd-api-bench/main.go b/tools/pd-api-bench/main.go index a891f7d23182..3b413d4dd768 100644 --- a/tools/pd-api-bench/main.go +++ b/tools/pd-api-bench/main.go @@ -160,7 +160,7 @@ func main() { } httpClis := make([]*http.Client, 0) for i := 0; i < *client; i++ { - httpClis = append(httpClis, newHttpClient()) + httpClis = append(httpClis, newHTTPClient()) } err = cases.InitCluster(ctx, pdClis[0], httpClis[0]) if err != nil { @@ -248,8 +248,8 @@ func exit(code int) { os.Exit(code) } -// newHttpClient returns an HTTP(s) client. -func newHttpClient() *http.Client { +// newHTTPClient returns an HTTP(s) client. +func newHTTPClient() *http.Client { // defaultTimeout for non-context requests. const defaultTimeout = 30 * time.Second cli := &http.Client{Timeout: defaultTimeout} diff --git a/tools/pd-backup/pdbackup/backup_test.go b/tools/pd-backup/pdbackup/backup_test.go index d93fd77a336f..40e4190f5d4a 100644 --- a/tools/pd-backup/pdbackup/backup_test.go +++ b/tools/pd-backup/pdbackup/backup_test.go @@ -14,7 +14,6 @@ import ( "testing" "time" - "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" sc "github.com/tikv/pd/pkg/schedule/config" "github.com/tikv/pd/pkg/storage/endpoint" @@ -48,14 +47,12 @@ type backupTestSuite struct { } func TestBackupTestSuite(t *testing.T) { - re := require.New(t) - - etcd, etcdClient, err := setupEtcd(t) - re.NoError(err) + servers, etcdClient, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() server, serverConfig := setupServer() testSuite := &backupTestSuite{ - etcd: etcd, + etcd: servers[0], etcdClient: etcdClient, server: server, serverConfig: serverConfig, @@ -64,24 +61,6 @@ func TestBackupTestSuite(t *testing.T) { suite.Run(t, testSuite) } -func setupEtcd(t *testing.T) (*embed.Etcd, *clientv3.Client, error) { - etcdCfg := etcdutil.NewTestSingleConfig(t) - etcd, err := embed.StartEtcd(etcdCfg) - if err != nil { - return nil, nil, err - } - - ep := etcdCfg.LCUrls[0].String() - client, err := clientv3.New(clientv3.Config{ - Endpoints: []string{ep}, - }) - if err != nil { - return nil, nil, err - } - - return etcd, client, nil -} - func setupServer() (*httptest.Server, *config.Config) { serverConfig := &config.Config{ ClientUrls: "example.com:2379", @@ -120,10 +99,6 @@ func setupServer() (*httptest.Server, *config.Config) { } func (s *backupTestSuite) BeforeTest(suiteName, testName string) { - // the etcd server is set up in TestBackupTestSuite() before the test suite - // runs - <-s.etcd.Server.ReadyNotify() - ctx, cancel := context.WithTimeout(context.Background(), time.Second*3) defer cancel() @@ -166,7 +141,7 @@ func (s *backupTestSuite) TestGetBackupInfo() { tmpFile, err := os.CreateTemp(os.TempDir(), "pd_backup_info_test.json") s.NoError(err) - defer os.Remove(tmpFile.Name()) + defer os.RemoveAll(tmpFile.Name()) s.NoError(OutputToFile(actual, tmpFile)) _, err = tmpFile.Seek(0, 0) diff --git a/tools/pd-tso-bench/Makefile b/tools/pd-tso-bench/Makefile new file mode 100644 index 000000000000..7e517ef92b9f --- /dev/null +++ b/tools/pd-tso-bench/Makefile @@ -0,0 +1,34 @@ +# Copyright 2023 TiKV Project Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ROOT_PATH := ../.. +GO_TOOLS_BIN_PATH := $(ROOT_PATH)/.tools/bin +PATH := $(GO_TOOLS_BIN_PATH):$(PATH) +SHELL := env PATH='$(PATH)' GOBIN='$(GO_TOOLS_BIN_PATH)' $(shell which bash) + +install-tools: + cd $(ROOT_PATH) && $(MAKE) install-tools + +static: install-tools + @ echo "gofmt ..." + @ gofmt -s -l -d . 2>&1 | awk '{ print } END { if (NR > 0) { exit 1 } }' + @ echo "golangci-lint ..." + @ golangci-lint run -c $(ROOT_PATH)/.golangci.yml --verbose ./... --allow-parallel-runners + @ echo "revive ..." + @ revive -formatter friendly -config $(ROOT_PATH)/revive.toml ./... + +tidy: + @ go mod tidy + git diff go.mod go.sum | cat + git diff --quiet go.mod go.sum
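For reviewers who want to poke at the new pd-api-bench tool added in this patch, the exported case maps in tools/pd-api-bench/cases can be driven directly. The following is only a minimal sketch, not part of the patch: the cases import path, and the pre-built pd.Client and *http.Client (plus a prior call to cases.InitCluster, as the tool's main.go does), are assumptions for illustration.

package main

import (
	"context"
	"log"
	"net/http"

	pd "github.com/tikv/pd/client"
	"github.com/tikv/pd/tools/pd-api-bench/cases" // assumed import path for the new tool
)

// runAllOnce exercises every registered case a single time. It assumes the
// caller has already constructed a pd.Client and an *http.Client (for example
// via the tool's newHTTPClient) and has called cases.InitCluster beforehand.
func runAllOnce(ctx context.Context, cli pd.Client, httpCli *http.Client) {
	// GRPCCaseMap and GRPCCase.Unary come from the patch above.
	for name, c := range cases.GRPCCaseMap {
		if err := c.Unary(ctx, cli); err != nil {
			log.Printf("gRPC case %s failed: %v", name, err)
		}
	}
	// HTTPCaseMap and HTTPCase.Do likewise come from the patch above.
	for name, c := range cases.HTTPCaseMap {
		if err := c.Do(ctx, httpCli); err != nil {
			log.Printf("HTTP case %s failed: %v", name, err)
		}
	}
}

In the tool itself, main.go additionally applies the per-case QPS and burst settings (SetQPS/GetBurst in the Case interface) before running the cases in a loop; the sketch above deliberately skips that rate-limiting logic.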