From 692d0c2f0ef672f6aea35a08d57bf3fbd9d30b2e Mon Sep 17 00:00:00 2001
From: ShuNing
Date: Wed, 12 Dec 2018 16:48:10 +0800
Subject: [PATCH] server: Fix the issue about RaftCluster cannot be stopped
 (#1370) (#1375)

* server: fix the issue about RaftCluster cannot be stopped

Signed-off-by: nolouch
---
 server/cluster.go      | 11 +++++++----
 server/cluster_test.go | 31 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/server/cluster.go b/server/cluster.go
index b195a27df46..9983a6f344e 100644
--- a/server/cluster.go
+++ b/server/cluster.go
@@ -29,9 +29,7 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-const (
-	backgroundJobInterval = time.Minute
-)
+var backgroundJobInterval = time.Minute
 
 // RaftCluster is used for cluster config management.
 // Raft cluster key format:
@@ -117,6 +115,10 @@ func (c *RaftCluster) start() error {
 
 	c.wg.Add(2)
 	go c.runCoordinator()
+	// gofail: var highFrequencyClusterJobs bool
+	// if highFrequencyClusterJobs {
+	// 	backgroundJobInterval = 100 * time.Microsecond
+	// }
 	go c.runBackgroundJobs(backgroundJobInterval)
 
 	c.running = true
@@ -135,9 +137,9 @@ func (c *RaftCluster) runCoordinator() {
 
 func (c *RaftCluster) stop() {
 	c.Lock()
-	defer c.Unlock()
 
 	if !c.running {
+		c.Unlock()
 		return
 	}
 
@@ -145,6 +147,7 @@
 
 	close(c.quit)
 	c.coordinator.stop()
+	c.Unlock()
 	c.wg.Wait()
 }
 
diff --git a/server/cluster_test.go b/server/cluster_test.go
index 542d81884d9..12bb71c1db3 100644
--- a/server/cluster_test.go
+++ b/server/cluster_test.go
@@ -18,9 +18,11 @@ import (
 	"fmt"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/coreos/etcd/clientv3"
 	. "github.com/pingcap/check"
+	gofail "github.com/pingcap/gofail/runtime"
 	"github.com/pingcap/kvproto/pkg/metapb"
 	"github.com/pingcap/kvproto/pkg/pdpb"
 	"github.com/pingcap/pd/server/core"
@@ -426,6 +428,35 @@ func (s *testClusterSuite) TestRaftClusterRestart(c *C) {
 	cluster.stop()
 }
 
+// Make sure PD will not deadlock if it starts and stops again and again.
+func (s *testClusterSuite) TestRaftClusterMultipleRestart(c *C) {
+	var err error
+	_, s.svr, s.cleanup, err = NewTestServer()
+	defer s.cleanup()
+	c.Assert(err, IsNil)
+	mustWaitLeader(c, []*Server{s.svr})
+	_, err = s.svr.bootstrapCluster(s.newBootstrapRequest(c, s.svr.clusterID, "127.0.0.1:0"))
+	c.Assert(err, IsNil)
+	// add an offline store
+	store := s.newStore(c, s.allocID(c), "127.0.0.1:4")
+	store.State = metapb.StoreState_Offline
+	cluster := s.svr.GetRaftCluster()
+	err = cluster.putStore(store)
+	c.Assert(err, IsNil)
+	c.Assert(cluster, NotNil)
+
+	// let the background jobs run at a small interval
+	gofail.Enable("github.com/pingcap/pd/server/highFrequencyClusterJobs", `return(true)`)
+	for i := 0; i < 100; i++ {
+		err = s.svr.createRaftCluster()
+		c.Assert(err, IsNil)
+		time.Sleep(time.Millisecond)
+		cluster = s.svr.GetRaftCluster()
+		c.Assert(cluster, NotNil)
+		cluster.stop()
+	}
+}
+
 func (s *testClusterSuite) TestGetPDMembers(c *C) {
 	req := &pdpb.GetMembersRequest{
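
The bug this patch fixes is a classic Go deadlock: stop() held the cluster mutex across c.wg.Wait(), while the background goroutines it was waiting for needed that same mutex before they could observe the closed quit channel and exit. The sketch below is a minimal, self-contained reproduction of the pattern and of the fix (unlock before Wait); it is illustrative code, not PD's actual implementation, and all names in it are made up for the example.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cluster mirrors the shape of RaftCluster that matters here: a mutex,
// a quit channel, and background work tracked by a WaitGroup.
type cluster struct {
	sync.Mutex
	wg      sync.WaitGroup
	quit    chan struct{}
	running bool
}

func (c *cluster) start() {
	c.Lock()
	defer c.Unlock()
	c.quit = make(chan struct{})
	c.running = true
	c.wg.Add(1)
	go c.runBackgroundJobs(100 * time.Microsecond)
}

func (c *cluster) runBackgroundJobs(interval time.Duration) {
	defer c.wg.Done()
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-c.quit:
			return
		case <-ticker.C:
			c.Lock() // a periodic job that needs the cluster lock
			c.Unlock()
		}
	}
}

// Pre-patch, stop() did `defer c.Unlock()` and then called c.wg.Wait()
// while still holding the lock: a job already blocked in c.Lock() above
// could never loop back to see the closed quit channel, so Wait() hung
// forever. Post-patch, the lock is released before waiting.
func (c *cluster) stop() {
	c.Lock()
	if !c.running {
		c.Unlock()
		return
	}
	c.running = false
	close(c.quit)
	c.Unlock() // the fix: release before c.wg.Wait()
	c.wg.Wait()
}

func main() {
	// Mirrors TestRaftClusterMultipleRestart: rapid start/stop cycles
	// with a tiny job interval make the race easy to hit.
	c := &cluster{}
	for i := 0; i < 100; i++ {
		c.start()
		time.Sleep(time.Millisecond)
		c.stop()
	}
	fmt.Println("stopped cleanly, no deadlock")
}

Restore the pre-patch ordering (a `defer c.Unlock()` before c.wg.Wait()) and the loop typically hangs within a few iterations, which is exactly the race the new test exercises by shrinking the job interval to 100 microseconds.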
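The commented-out block added in start() is a gofail failpoint rather than dead code: running the gofail generator over the package rewrites such comments into real code that executes only while the failpoint is enabled at runtime, and a failpoint's path is the enclosing package's import path plus the variable name, which is why the test enables "github.com/pingcap/pd/server/highFrequencyClusterJobs". A rough sketch of the convention follows, using made-up names (fastTick, github.com/example/app/server); Enable's two-argument form is taken from the diff above, while the matching Disable call assumes pingcap/gofail keeps coreos/gofail's runtime API.

package server

import "time"

// jobInterval is patched by the failpoint below when a test enables it;
// in normal builds the commented block stays inert.
func jobInterval() time.Duration {
	interval := time.Minute
	// gofail: var fastTick bool
	// if fastTick {
	// 	interval = time.Millisecond
	// }
	return interval
}

And on the test side, after building with the generator enabled:

// Enable the failpoint for this test, then restore the default.
gofail.Enable("github.com/example/app/server/fastTick", `return(true)`)
defer gofail.Disable("github.com/example/app/server/fastTick")

Pairing Enable with a deferred Disable keeps an aggressive failpoint from leaking into later tests; note that TestRaftClusterMultipleRestart above leaves its failpoint enabled for the rest of the suite run.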