From 00a8e594fa31374a0806afa73491825eb280aefa Mon Sep 17 00:00:00 2001 From: zyguan Date: Mon, 13 May 2019 11:49:57 +0800 Subject: [PATCH 1/6] stability: retry truncating sst files upon failure sst files may be deleted after compaction --- tests/pkg/ops/tikv.go | 68 ++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/tests/pkg/ops/tikv.go b/tests/pkg/ops/tikv.go index 0e06e264cd..f17d32a5ab 100644 --- a/tests/pkg/ops/tikv.go +++ b/tests/pkg/ops/tikv.go @@ -13,6 +13,7 @@ package ops import ( + "strconv" "strings" "github.com/golang/glog" @@ -20,6 +21,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const retryLimit = 15 + type TruncateOptions struct { Namespace string Cluster string @@ -53,41 +56,52 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { }) } - stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") - if err != nil { - glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "list sst files") - } + retryCount := 0 + for ; retryCount < retryLimit; retryCount++ { + stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") + if err != nil { + glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error()) + return errors.Annotate(err, "list sst files") + } - sstCandidates := make(map[string]bool) + sstCandidates := make(map[string]bool) - for _, f := range strings.Split(stdout, "\n") { - f = strings.TrimSpace(f) - if len(f) > 0 { - sstCandidates[f] = true + for _, f := range strings.Split(stdout, "\n") { + f = strings.TrimSpace(f) + if len(f) > 0 { + sstCandidates[f] = true + } } - } - sst := "" - for k := range sstCandidates { - if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] { - sst = k + sst := "" + for k := range sstCandidates { + if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] { + sst = k + } + } + if len(sst) == 0 { + return errors.New("cannot find a sst file") } - } - if len(sst) == 0 { - return errors.New("cannot find a sst file") - } - _, stderr, err = exec("cp", sst, sst+".save") - if err != nil { - glog.Errorf("backup sst file: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "backup sst file") + _, stderr, err = exec("cp", sst, sst+".save") + if err != nil { + glog.Warningf("backup sst file: stderr=%s err=%s", stderr, err.Error()) + //return errors.Annotate(err, "backup sst file") + continue + } + + _, stderr, err = exec("truncate", "-s", "0", sst) + if err != nil { + glog.Warningf("truncate sst file: stderr=%s err=%s", stderr, err.Error()) + //return errors.Annotate(err, "truncate sst file") + continue + } + + break } - _, stderr, err = exec("truncate", "-s", "0", sst) - if err != nil { - glog.Errorf("truncate sst file: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "truncate sst file") + if retryCount == retryLimit { + return errors.New("failed to truncate sst file after " + strconv.Itoa(retryLimit) + " trials") } return nil From b34437a051931a89afb6eaf686899dc4f06950f7 Mon Sep 17 00:00:00 2001 From: zyguan Date: Tue, 14 May 2019 19:53:10 +0800 Subject: [PATCH 2/6] stability: address comments --- tests/pkg/ops/tikv.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/pkg/ops/tikv.go b/tests/pkg/ops/tikv.go index f17d32a5ab..35b2995c56 100644 --- a/tests/pkg/ops/tikv.go +++ b/tests/pkg/ops/tikv.go @@ -15,6 +15,7 @@ package ops import ( "strconv" "strings" + "time" "github.com/golang/glog" "github.com/pingcap/errors" @@ -58,10 +59,13 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { retryCount := 0 for ; retryCount < retryLimit; retryCount++ { + if retryCount > 0 { + time.Sleep(10 * time.Second) + } stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") if err != nil { - glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "list sst files") + glog.Warningf("list sst files: stderr=%s err=%s", stderr, err.Error()) + continue } sstCandidates := make(map[string]bool) @@ -80,20 +84,19 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { } } if len(sst) == 0 { - return errors.New("cannot find a sst file") + glog.Warning("cannot find a sst file") + continue } _, stderr, err = exec("cp", sst, sst+".save") if err != nil { glog.Warningf("backup sst file: stderr=%s err=%s", stderr, err.Error()) - //return errors.Annotate(err, "backup sst file") continue } _, stderr, err = exec("truncate", "-s", "0", sst) if err != nil { glog.Warningf("truncate sst file: stderr=%s err=%s", stderr, err.Error()) - //return errors.Annotate(err, "truncate sst file") continue } From 30dca2cee00829bc3bd3e3ed174767699746514b Mon Sep 17 00:00:00 2001 From: zyguan Date: Tue, 14 May 2019 22:43:15 +0800 Subject: [PATCH 3/6] stability: add annotation to each log in truncate case func --- tests/pkg/ops/tikv.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/pkg/ops/tikv.go b/tests/pkg/ops/tikv.go index 35b2995c56..6b0a79b380 100644 --- a/tests/pkg/ops/tikv.go +++ b/tests/pkg/ops/tikv.go @@ -13,6 +13,7 @@ package ops import ( + "fmt" "strconv" "strings" "time" @@ -35,7 +36,7 @@ type TiKVOps struct { } func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { - glog.Infof("truncate sst option: %+v", opts) + logHdr := fmt.Sprintf("store: %s cluster: [%s/%s] ", opts.Store, opts.Namespace, opts.Cluster) tc, err := ops.PingcapV1alpha1().TidbClusters(opts.Namespace).Get(opts.Cluster, metav1.GetOptions{}) if err != nil { @@ -64,7 +65,7 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { } stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") if err != nil { - glog.Warningf("list sst files: stderr=%s err=%s", stderr, err.Error()) + glog.Warningf(logHdr+"list sst files: stderr=%s err=%s", stderr, err.Error()) continue } @@ -84,19 +85,19 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { } } if len(sst) == 0 { - glog.Warning("cannot find a sst file") + glog.Warning(logHdr + "cannot find a sst file") continue } _, stderr, err = exec("cp", sst, sst+".save") if err != nil { - glog.Warningf("backup sst file: stderr=%s err=%s", stderr, err.Error()) + glog.Warningf(logHdr+"backup sst file: stderr=%s err=%s", stderr, err.Error()) continue } _, stderr, err = exec("truncate", "-s", "0", sst) if err != nil { - glog.Warningf("truncate sst file: stderr=%s err=%s", stderr, err.Error()) + glog.Warningf(logHdr+"truncate sst file: stderr=%s err=%s", stderr, err.Error()) continue } From f9c1cb9c9bb1dc743ea80f09412024e3687d43a7 Mon Sep 17 00:00:00 2001 From: zyguan Date: Fri, 17 May 2019 01:08:38 +0800 Subject: [PATCH 4/6] stability: add error logs to sst file corruption case --- tests/failover.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/failover.go b/tests/failover.go index 2e59734d87..d871913665 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -65,6 +65,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon glog.Infof("deleting pod: [%s/%s] and wait 1 minute for the pod to terminate", info.Namespace, store.PodName) err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil) if err != nil { + glog.Errorf("failed to get delete the pod: ns=%s tc=%s pod=%s err=%s", + info.Namespace, info.ClusterName, store.PodName, err.Error()) return err } @@ -77,6 +79,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon Store: store.ID, }) if err != nil { + glog.Errorf("failed to truncate the sst file: ns=%s tc=%s store=%s err=%s", + info.Namespace, info.ClusterName, store.ID, err.Error()) return err } oa.EmitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s/%s", info.Namespace, store.PodName)) From 4bd7fbadc31736e10aa25e2c53a7be82ad663b3d Mon Sep 17 00:00:00 2001 From: zyguan Date: Mon, 20 May 2019 12:39:56 +0800 Subject: [PATCH 5/6] stability: truncate multiple sst files to avoid #501 --- tests/pkg/ops/tikv.go | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/pkg/ops/tikv.go b/tests/pkg/ops/tikv.go index cb1627a675..9e399449c3 100644 --- a/tests/pkg/ops/tikv.go +++ b/tests/pkg/ops/tikv.go @@ -24,7 +24,10 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -const retryLimit = 15 +const ( + retryLimit = 15 + maxSSTFilesToTruncate = 20 +) type TruncateOptions struct { Namespace string @@ -79,29 +82,36 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { } } - sst := "" + ssts := make([]string, 0, maxSSTFilesToTruncate) for k := range sstCandidates { + if len(ssts) >= maxSSTFilesToTruncate { + break + } if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] { - sst = k + ssts = append(ssts, k) } } - if len(sst) == 0 { + if len(ssts) == 0 { glog.Warning(logHdr + "cannot find a sst file") continue } - _, stderr, err = exec("cp", sst, sst+".save") - if err != nil { - glog.Warningf(logHdr+"backup sst file: stderr=%s err=%s", stderr, err.Error()) - continue + truncated := 0 + for _, sst := range ssts { + _, stderr, err = exec("sh", "-c", + fmt.Sprintf("cp %s %s.save && truncate -s 0 %s", sst, sst, sst)) + if err != nil { + glog.Warningf(logHdr+"truncate sst file: sst=%s stderr=%s err=%s", sst, stderr, err.Error()) + continue + } + truncated++ } - - _, stderr, err = exec("truncate", "-s", "0", sst) - if err != nil { - glog.Warningf(logHdr+"truncate sst file: stderr=%s err=%s", stderr, err.Error()) + if truncated == 0 { + glog.Warningf(logHdr + "no sst file has been truncated") continue } + glog.Infof(logHdr+"%d sst files got truncated", truncated) break } From fe1172a006c79f69c7c2029e78bb7d9cd72152da Mon Sep 17 00:00:00 2001 From: zyguan Date: Mon, 27 May 2019 12:53:41 +0800 Subject: [PATCH 6/6] stability: format code --- tests/failover.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index b790266960..f6ae2544e7 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -7,8 +7,6 @@ import ( "strings" "time" - "github.com/pingcap/tidb-operator/tests/slack" - // To register MySQL driver _ "github.com/go-sql-driver/mysql" "github.com/golang/glog" @@ -17,6 +15,7 @@ import ( "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/tests/pkg/client" "github.com/pingcap/tidb-operator/tests/pkg/ops" + "github.com/pingcap/tidb-operator/tests/slack" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels"