From 692d0c2f0ef672f6aea35a08d57bf3fbd9d30b2e Mon Sep 17 00:00:00 2001
From: ShuNing
Date: Wed, 12 Dec 2018 16:48:10 +0800
Subject: [PATCH] server: Fix the issue about RaftCluster cannot be stopped
 (#1370) (#1375)

* server: fix the issue about RaftCluster cannot be stopped

Signed-off-by: nolouch
---
 server/cluster.go      | 11 +++++++----
 server/cluster_test.go | 31 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/server/cluster.go b/server/cluster.go
index b195a27df46..9983a6f344e 100644
--- a/server/cluster.go
+++ b/server/cluster.go
@@ -29,9 +29,7 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-const (
-	backgroundJobInterval = time.Minute
-)
+var backgroundJobInterval = time.Minute
 
 // RaftCluster is used for cluster config management.
 // Raft cluster key format:
@@ -117,6 +115,10 @@ func (c *RaftCluster) start() error {
 
 	c.wg.Add(2)
 	go c.runCoordinator()
+	// gofail: var highFrequencyClusterJobs bool
+	// if highFrequencyClusterJobs {
+	// 	backgroundJobInterval = 100 * time.Microsecond
+	// }
 	go c.runBackgroundJobs(backgroundJobInterval)
 
 	c.running = true
@@ -135,9 +137,9 @@ func (c *RaftCluster) runCoordinator() {
 
 func (c *RaftCluster) stop() {
 	c.Lock()
-	defer c.Unlock()
 
 	if !c.running {
+		c.Unlock()
 		return
 	}
 
@@ -145,6 +147,7 @@
 
 	close(c.quit)
 	c.coordinator.stop()
+	c.Unlock()
 	c.wg.Wait()
 }
 
diff --git a/server/cluster_test.go b/server/cluster_test.go
index 542d81884d9..12bb71c1db3 100644
--- a/server/cluster_test.go
+++ b/server/cluster_test.go
@@ -18,9 +18,11 @@ import (
 	"fmt"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/coreos/etcd/clientv3"
 	. "github.com/pingcap/check"
+	gofail "github.com/pingcap/gofail/runtime"
 	"github.com/pingcap/kvproto/pkg/metapb"
 	"github.com/pingcap/kvproto/pkg/pdpb"
 	"github.com/pingcap/pd/server/core"
@@ -426,6 +428,35 @@ func (s *testClusterSuite) TestRaftClusterRestart(c *C) {
 	cluster.stop()
 }
 
+// Make sure PD will not deadlock if it starts and stops again and again.
+func (s *testClusterSuite) TestRaftClusterMultipleRestart(c *C) {
+	var err error
+	_, s.svr, s.cleanup, err = NewTestServer()
+	defer s.cleanup()
+	c.Assert(err, IsNil)
+	mustWaitLeader(c, []*Server{s.svr})
+	_, err = s.svr.bootstrapCluster(s.newBootstrapRequest(c, s.svr.clusterID, "127.0.0.1:0"))
+	c.Assert(err, IsNil)
+	// add an offline store
+	store := s.newStore(c, s.allocID(c), "127.0.0.1:4")
+	store.State = metapb.StoreState_Offline
+	cluster := s.svr.GetRaftCluster()
+	err = cluster.putStore(store)
+	c.Assert(err, IsNil)
+	c.Assert(cluster, NotNil)
+
+	// let the background jobs run at a small interval
+	gofail.Enable("github.com/pingcap/pd/server/highFrequencyClusterJobs", `return(true)`)
+	for i := 0; i < 100; i++ {
+		err = s.svr.createRaftCluster()
+		c.Assert(err, IsNil)
+		time.Sleep(time.Millisecond)
+		cluster = s.svr.GetRaftCluster()
+		c.Assert(cluster, NotNil)
+		cluster.stop()
+	}
+}
+
 func (s *testClusterSuite) TestGetPDMembers(c *C) {
 	req := &pdpb.GetMembersRequest{
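
The bug this patch fixes is a classic Go deadlock: stop() held the cluster mutex across c.wg.Wait(), while the background goroutines it was waiting for needed that same mutex before they could observe the closed quit channel and exit. The sketch below is a minimal, self-contained reproduction of the pattern and of the fix (unlock before Wait); it is illustrative code, not PD's actual implementation, and all names in it are made up for the example.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cluster mirrors the shape of RaftCluster that matters here: a mutex,
// a quit channel, and background work tracked by a WaitGroup.
type cluster struct {
	sync.Mutex
	wg      sync.WaitGroup
	quit    chan struct{}
	running bool
}

func (c *cluster) start() {
	c.Lock()
	defer c.Unlock()
	c.quit = make(chan struct{})
	c.running = true
	c.wg.Add(1)
	go c.runBackgroundJobs(100 * time.Microsecond)
}

func (c *cluster) runBackgroundJobs(interval time.Duration) {
	defer c.wg.Done()
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-c.quit:
			return
		case <-ticker.C:
			c.Lock() // a periodic job that needs the cluster lock
			c.Unlock()
		}
	}
}

// Pre-patch, stop() did `defer c.Unlock()` and then called c.wg.Wait()
// while still holding the lock: a job already blocked in c.Lock() above
// could never loop back to see the closed quit channel, so Wait() hung
// forever. Post-patch, the lock is released before waiting.
func (c *cluster) stop() {
	c.Lock()
	if !c.running {
		c.Unlock()
		return
	}
	c.running = false
	close(c.quit)
	c.Unlock() // the fix: release before c.wg.Wait()
	c.wg.Wait()
}

func main() {
	// Mirrors TestRaftClusterMultipleRestart: rapid start/stop cycles
	// with a tiny job interval make the race easy to hit.
	c := &cluster{}
	for i := 0; i < 100; i++ {
		c.start()
		time.Sleep(time.Millisecond)
		c.stop()
	}
	fmt.Println("stopped cleanly, no deadlock")
}

Restore the pre-patch ordering (a `defer c.Unlock()` before c.wg.Wait()) and the loop typically hangs within a few iterations, which is exactly the race the new test exercises by shrinking the job interval to 100 microseconds.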
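The commented-out block added in start() is a gofail failpoint rather than dead code: running the gofail generator over the package rewrites such comments into real code that executes only while the failpoint is enabled at runtime, and a failpoint's path is the enclosing package's import path plus the variable name, which is why the test enables "github.com/pingcap/pd/server/highFrequencyClusterJobs". A rough sketch of the convention follows, using made-up names (fastTick, github.com/example/app/server); Enable's two-argument form is taken from the diff above, while the matching Disable call assumes pingcap/gofail keeps coreos/gofail's runtime API.

package server

import "time"

// jobInterval is patched by the failpoint below when a test enables it;
// in normal builds the commented block stays inert.
func jobInterval() time.Duration {
	interval := time.Minute
	// gofail: var fastTick bool
	// if fastTick {
	// 	interval = time.Millisecond
	// }
	return interval
}

And on the test side, after building with the generator enabled:

// Enable the failpoint for this test, then restore the default.
gofail.Enable("github.com/example/app/server/fastTick", `return(true)`)
defer gofail.Disable("github.com/example/app/server/fastTick")

Pairing Enable with a deferred Disable keeps an aggressive failpoint from leaking into later tests; note that TestRaftClusterMultipleRestart above leaves its failpoint enabled for the rest of the suite run.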