diff --git a/Makefile b/Makefile index 1a82b17fba8..831c2bcb3c9 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,15 @@ FAILPOINT := bin/failpoint-ctl FAILPOINT_ENABLE := $$(echo $(FAILPOINT_DIR) | xargs $(FAILPOINT) enable >/dev/null) FAILPOINT_DISABLE := $$(find $(FAILPOINT_DIR) | xargs $(FAILPOINT) disable >/dev/null) -RELEASE_VERSION ?= $(shell git describe --tags --dirty="-dev") +RELEASE_VERSION := v5.0.0-master +ifneq ($(shell git rev-parse --abbrev-ref HEAD | egrep '^release-[0-9]\.[0-9].*$$|^HEAD$$'),) + # If we are in release branch, use tag version. + RELEASE_VERSION := $(shell git describe --tags --dirty="-dirty") +else ifneq ($(shell git status --porcelain),) + # Add -dirty if the working tree is dirty for non release branch. + RELEASE_VERSION := $(RELEASE_VERSION)-dirty +endif + LDFLAGS += -X "$(CDC_PKG)/pkg/version.ReleaseVersion=$(RELEASE_VERSION)" LDFLAGS += -X "$(CDC_PKG)/pkg/version.BuildTS=$(shell date -u '+%Y-%m-%d %H:%M:%S')" LDFLAGS += -X "$(CDC_PKG)/pkg/version.GitHash=$(shell git rev-parse HEAD)" diff --git a/cdc/capture.go b/cdc/capture.go index 0bbdc505ae9..2ffef74e42a 100644 --- a/cdc/capture.go +++ b/cdc/capture.go @@ -18,6 +18,8 @@ import ( "sync" "time" + "github.com/pingcap/ticdc/pkg/version" + "github.com/google/uuid" "github.com/pingcap/errors" "github.com/pingcap/failpoint" @@ -41,13 +43,10 @@ import ( "google.golang.org/grpc/backoff" ) -const ( - captureSessionTTL = 3 -) - -// processorOpts records options for processor -type processorOpts struct { +// captureOpts records options for capture +type captureOpts struct { flushCheckpointInterval time.Duration + captureSessionTTL int } // Capture represents a Capture server, it monitors the changefeed information in etcd and schedules Task on it. @@ -67,7 +66,7 @@ type Capture struct { session *concurrency.Session election *concurrency.Election - opts *processorOpts + opts *captureOpts closed chan struct{} } @@ -78,7 +77,7 @@ func NewCapture( pdCli pd.Client, credential *security.Credential, advertiseAddr string, - opts *processorOpts, + opts *captureOpts, ) (c *Capture, err error) { tlsConfig, err := credential.ToTLSConfig() if err != nil { @@ -114,7 +113,7 @@ func NewCapture( return nil, errors.Annotate(cerror.WrapError(cerror.ErrNewCaptureFailed, err), "new etcd client") } sess, err := concurrency.NewSession(etcdCli, - concurrency.WithTTL(captureSessionTTL)) + concurrency.WithTTL(opts.captureSessionTTL)) if err != nil { return nil, errors.Annotate(cerror.WrapError(cerror.ErrNewCaptureFailed, err), "create capture session") } @@ -124,6 +123,7 @@ func NewCapture( info := &model.CaptureInfo{ ID: id, AdvertiseAddr: advertiseAddr, + Version: version.ReleaseVersion, } processorManager := processor.NewManager(pdCli, credential, info) log.Info("creating capture", zap.String("capture-id", id), util.ZapFieldCapture(ctx)) diff --git a/cdc/capture_test.go b/cdc/capture_test.go index 907badb6cab..f3f6d53781e 100644 --- a/cdc/capture_test.go +++ b/cdc/capture_test.go @@ -81,7 +81,7 @@ func (s *captureSuite) TestCaptureSuicide(c *check.C) { defer cancel() capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, &security.Credential{}, "127.0.0.1:12034", - &processorOpts{flushCheckpointInterval: time.Millisecond * 200}) + &captureOpts{flushCheckpointInterval: time.Millisecond * 200}) c.Assert(err, check.IsNil) var wg sync.WaitGroup @@ -114,7 +114,7 @@ func (s *captureSuite) TestCaptureSessionDoneDuringHandleTask(c *check.C) { defer cancel() capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, 
&security.Credential{}, "127.0.0.1:12034", - &processorOpts{flushCheckpointInterval: time.Millisecond * 200}) + &captureOpts{flushCheckpointInterval: time.Millisecond * 200}) c.Assert(err, check.IsNil) runProcessorCount := 0 diff --git a/cdc/changefeed.go b/cdc/changefeed.go index d0141877a4b..17eefa27f30 100644 --- a/cdc/changefeed.go +++ b/cdc/changefeed.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/ticdc/pkg/filter" "github.com/pingcap/ticdc/pkg/scheduler" "github.com/pingcap/tidb/sessionctx/binloginfo" + "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/mvcc/mvccpb" "go.uber.org/zap" ) @@ -71,8 +72,18 @@ type ChangeFeedRWriter interface { // GetChangeFeedStatus queries the checkpointTs and resovledTs of a given changefeed GetChangeFeedStatus(ctx context.Context, id string) (*model.ChangeFeedStatus, int64, error) + // PutAllChangeFeedStatus the changefeed info to storage such as etcd. PutAllChangeFeedStatus(ctx context.Context, infos map[model.ChangeFeedID]*model.ChangeFeedStatus) error + + // LeaseGuardRemoveAllTaskStatus wraps RemoveAllTaskStatus with a context restricted by lease TTL. + LeaseGuardRemoveAllTaskStatus(ctx context.Context, changefeedID string, leaseID clientv3.LeaseID) error + + // LeaseGuardRemoveAllTaskPositions wraps RemoveAllTaskPositions with a context restricted by lease TTL. + LeaseGuardRemoveAllTaskPositions(ctx context.Context, changefeedID string, leaseID clientv3.LeaseID) error + + // LeaseGuardPutAllChangeFeedStatus wraps PutAllChangeFeedStatus with a context restricted by lease TTL. + LeaseGuardPutAllChangeFeedStatus(ctx context.Context, infos map[model.ChangeFeedID]*model.ChangeFeedStatus, leaseID clientv3.LeaseID) error } type changeFeed struct { @@ -119,6 +130,7 @@ type changeFeed struct { lastRebalanceTime time.Time etcdCli kv.CDCEtcdClient + leaseID clientv3.LeaseID // context cancel function for all internal goroutines cancel context.CancelFunc @@ -321,7 +333,7 @@ func (c *changeFeed) balanceOrphanTables(ctx context.Context, captures map[model id := id targetTs := targetTs updateFuncs[captureID] = append(updateFuncs[captureID], func(_ int64, status *model.TaskStatus) (bool, error) { - status.RemoveTable(id, targetTs) + status.RemoveTable(id, targetTs, false /*isMoveTable*/) return true, nil }) cleanedTables[id] = struct{}{} @@ -366,7 +378,7 @@ func (c *changeFeed) balanceOrphanTables(ctx context.Context, captures map[model } for captureID, funcs := range updateFuncs { - newStatus, _, err := c.etcdCli.AtomicPutTaskStatus(ctx, c.id, captureID, funcs...) + newStatus, _, err := c.etcdCli.LeaseGuardAtomicPutTaskStatus(ctx, c.id, captureID, c.leaseID, funcs...) 
if err != nil { return errors.Trace(err) } @@ -391,15 +403,17 @@ func (c *changeFeed) balanceOrphanTables(ctx context.Context, captures map[model func (c *changeFeed) updateTaskStatus(ctx context.Context, taskStatus map[model.CaptureID]*model.TaskStatus) error { for captureID, status := range taskStatus { - newStatus, _, err := c.etcdCli.AtomicPutTaskStatus(ctx, c.id, captureID, func(modRevision int64, taskStatus *model.TaskStatus) (bool, error) { - if taskStatus.SomeOperationsUnapplied() { - log.Error("unexpected task status, there are operations unapplied in this status", zap.Any("status", taskStatus)) - return false, cerror.ErrWaitHandleOperationTimeout.GenWithStackByArgs() - } - taskStatus.Tables = status.Tables - taskStatus.Operation = status.Operation - return true, nil - }) + newStatus, _, err := c.etcdCli.LeaseGuardAtomicPutTaskStatus( + ctx, c.id, captureID, c.leaseID, + func(modRevision int64, taskStatus *model.TaskStatus) (bool, error) { + if taskStatus.SomeOperationsUnapplied() { + log.Error("unexpected task status, there are operations unapplied in this status", zap.Any("status", taskStatus)) + return false, cerror.ErrWaitHandleOperationTimeout.GenWithStackByArgs() + } + taskStatus.Tables = status.Tables + taskStatus.Operation = status.Operation + return true, nil + }) if err != nil { return errors.Trace(err) } @@ -530,7 +544,7 @@ func (c *changeFeed) handleMoveTableJobs(ctx context.Context, captures map[model // To ensure that the replication pipeline stops exactly at the boundary TS, // The boundary TS specified by Remove Table Operation MUST greater or equal to the checkpoint TS of this table. // So the global resolved TS is a reasonable values. - replicaInfo, exist := status.RemoveTable(tableID, c.status.ResolvedTs) + replicaInfo, exist := status.RemoveTable(tableID, c.status.ResolvedTs, true /*isMoveTable*/) if !exist { delete(c.moveTableJobs, tableID) log.Warn("ignored the move job, the table is not exist in the source capture", zap.Reflect("job", job)) diff --git a/cdc/entry/mounter.go b/cdc/entry/mounter.go index 00d0818f472..af50a6185e6 100644 --- a/cdc/entry/mounter.go +++ b/cdc/entry/mounter.go @@ -239,7 +239,9 @@ func (m *mounterImpl) unmarshalAndMountRowChanged(ctx context.Context, raw *mode PhysicalTableID: physicalTableID, Delete: raw.OpType == model.OpTypeDelete, } - snap, err := m.schemaStorage.GetSnapshot(ctx, raw.CRTs) + // when async commit is enabled, the commitTs of DMLs may be equals with DDL finishedTs + // a DML whose commitTs is equal to a DDL finishedTs using the schema info before the DDL + snap, err := m.schemaStorage.GetSnapshot(ctx, raw.CRTs-1) if err != nil { return nil, errors.Trace(err) } @@ -582,7 +584,8 @@ func formatColVal(datum types.Datum, tp byte) (value interface{}, warn string, e // Encode bits as integers to avoid pingcap/tidb#10988 (which also affects MySQL itself) v, err := datum.GetBinaryLiteral().ToInt(nil) return v, "", err - case mysql.TypeString, mysql.TypeVarString, mysql.TypeVarchar: + case mysql.TypeString, mysql.TypeVarString, mysql.TypeVarchar, + mysql.TypeTinyBlob, mysql.TypeMediumBlob, mysql.TypeLongBlob, mysql.TypeBlob: b := datum.GetBytes() if b == nil { b = emptyBytes diff --git a/cdc/entry/mounter_test.go b/cdc/entry/mounter_test.go index f82786d65cf..d26c015db8c 100644 --- a/cdc/entry/mounter_test.go +++ b/cdc/entry/mounter_test.go @@ -28,6 +28,7 @@ import ( tidbkv "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/store/mockstore" + "github.com/pingcap/tidb/store/tikv/oracle" 
"github.com/pingcap/tidb/util/testkit" "go.uber.org/zap" ) @@ -254,7 +255,7 @@ func testMounterDisableOldValue(c *check.C, tc struct { tk.MustExec(insertSQL, params...) } - ver, err := store.CurrentVersion() + ver, err := store.CurrentVersion(oracle.GlobalTxnScope) c.Assert(err, check.IsNil) scheamStorage.AdvanceResolvedTs(ver.Ver) mounter := NewMounter(scheamStorage, 1, false).(*mounterImpl) diff --git a/cdc/entry/schema_storage_test.go b/cdc/entry/schema_storage_test.go index ea5f590026e..698681049e0 100644 --- a/cdc/entry/schema_storage_test.go +++ b/cdc/entry/schema_storage_test.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/store/mockstore" + "github.com/pingcap/tidb/store/tikv/oracle" "github.com/pingcap/tidb/types" "github.com/pingcap/tidb/util/testkit" ) @@ -681,7 +682,7 @@ func (t *schemaSuite) TestCreateSnapFromMeta(c *check.C) { tk.MustExec("create table test2.simple_test3 (id bigint primary key)") tk.MustExec("create table test2.simple_test4 (id bigint primary key)") tk.MustExec("create table test2.simple_test5 (a bigint)") - ver, err := store.CurrentVersion() + ver, err := store.CurrentVersion(oracle.GlobalTxnScope) c.Assert(err, check.IsNil) meta, err := kv.GetSnapshotMeta(store, ver.Ver) c.Assert(err, check.IsNil) @@ -717,7 +718,7 @@ func (t *schemaSuite) TestSnapshotClone(c *check.C) { tk.MustExec("create table test2.simple_test3 (id bigint primary key)") tk.MustExec("create table test2.simple_test4 (id bigint primary key)") tk.MustExec("create table test2.simple_test5 (a bigint)") - ver, err := store.CurrentVersion() + ver, err := store.CurrentVersion(oracle.GlobalTxnScope) c.Assert(err, check.IsNil) meta, err := kv.GetSnapshotMeta(store, ver.Ver) c.Assert(err, check.IsNil) @@ -753,7 +754,7 @@ func (t *schemaSuite) TestExplicitTables(c *check.C) { defer domain.Close() domain.SetStatsUpdating(true) tk := testkit.NewTestKit(c, store) - ver1, err := store.CurrentVersion() + ver1, err := store.CurrentVersion(oracle.GlobalTxnScope) c.Assert(err, check.IsNil) tk.MustExec("create database test2") tk.MustExec("create table test.simple_test1 (id bigint primary key)") @@ -761,7 +762,7 @@ func (t *schemaSuite) TestExplicitTables(c *check.C) { tk.MustExec("create table test2.simple_test3 (a bigint)") tk.MustExec("create table test2.simple_test4 (a varchar(20) unique key)") tk.MustExec("create table test2.simple_test5 (a varchar(20))") - ver2, err := store.CurrentVersion() + ver2, err := store.CurrentVersion(oracle.GlobalTxnScope) c.Assert(err, check.IsNil) meta1, err := kv.GetSnapshotMeta(store, ver1.Ver) c.Assert(err, check.IsNil) diff --git a/cdc/http_status.go b/cdc/http_status.go index c642f49e2f7..40de9235883 100644 --- a/cdc/http_status.go +++ b/cdc/http_status.go @@ -28,7 +28,6 @@ import ( "github.com/pingcap/ticdc/cdc/kv" "github.com/pingcap/ticdc/pkg/config" cerror "github.com/pingcap/ticdc/pkg/errors" - "github.com/pingcap/ticdc/pkg/security" "github.com/pingcap/ticdc/pkg/version" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -57,27 +56,22 @@ func (s *Server) startStatusHTTP() error { prometheus.DefaultGatherer = registry serverMux.Handle("/metrics", promhttp.Handler()) - - credential := &security.Credential{} - if s.opts.credential != nil { - credential = s.opts.credential - } - tlsConfig, err := credential.ToTLSConfigWithVerify() + conf := config.GetGlobalServerConfig() + tlsConfig, err := conf.Security.ToTLSConfigWithVerify() if 
err != nil { log.Error("status server get tls config failed", zap.Error(err)) return errors.Trace(err) } - addr := s.opts.addr - s.statusServer = &http.Server{Addr: addr, Handler: serverMux, TLSConfig: tlsConfig} + s.statusServer = &http.Server{Addr: conf.Addr, Handler: serverMux, TLSConfig: tlsConfig} - ln, err := net.Listen("tcp", addr) + ln, err := net.Listen("tcp", conf.Addr) if err != nil { return cerror.WrapError(cerror.ErrServeHTTP, err) } go func() { - log.Info("status http server is running", zap.String("addr", addr)) + log.Info("status http server is running", zap.String("addr", conf.Addr)) if tlsConfig != nil { - err = s.statusServer.ServeTLS(ln, credential.CertPath, credential.KeyPath) + err = s.statusServer.ServeTLS(ln, conf.Security.CertPath, conf.Security.KeyPath) } else { err = s.statusServer.Serve(ln) } diff --git a/cdc/http_status_test.go b/cdc/http_status_test.go index 152a2ef6a32..cc4ea6a8379 100644 --- a/cdc/http_status_test.go +++ b/cdc/http_status_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/pingcap/check" + "github.com/pingcap/ticdc/pkg/config" cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/util/testleak" "go.etcd.io/etcd/clientv3/concurrency" @@ -32,16 +33,10 @@ var _ = check.Suite(&httpStatusSuite{}) const retryTime = 20 -var testingServerOptions = options{ - pdEndpoints: "http://127.0.0.1:2379", - addr: "127.0.0.1:8300", - advertiseAddr: "127.0.0.1:8300", - timezone: nil, - gcTTL: DefaultCDCGCSafePointTTL, -} +var advertiseAddr4Test = "127.0.0.1:8300" func (s *httpStatusSuite) waitUntilServerOnline(c *check.C) { - statusURL := fmt.Sprintf("http://%s/status", testingServerOptions.advertiseAddr) + statusURL := fmt.Sprintf("http://%s/status", advertiseAddr4Test) for i := 0; i < retryTime; i++ { resp, err := http.Get(statusURL) if err == nil { @@ -57,8 +52,13 @@ func (s *httpStatusSuite) waitUntilServerOnline(c *check.C) { func (s *httpStatusSuite) TestHTTPStatus(c *check.C) { defer testleak.AfterTest(c)() - server := &Server{opts: testingServerOptions} - err := server.startStatusHTTP() + conf := config.GetDefaultServerConfig() + conf.Addr = advertiseAddr4Test + conf.AdvertiseAddr = advertiseAddr4Test + config.StoreGlobalServerConfig(conf) + server, err := NewServer([]string{"http://127.0.0.1:2379"}) + c.Assert(err, check.IsNil) + err = server.startStatusHTTP() c.Assert(err, check.IsNil) defer func() { c.Assert(server.statusServer.Close(), check.IsNil) @@ -75,7 +75,7 @@ func (s *httpStatusSuite) TestHTTPStatus(c *check.C) { } func testPprof(c *check.C) { - resp, err := http.Get(fmt.Sprintf("http://%s/debug/pprof/cmdline", testingServerOptions.advertiseAddr)) + resp, err := http.Get(fmt.Sprintf("http://%s/debug/pprof/cmdline", advertiseAddr4Test)) c.Assert(err, check.IsNil) defer resp.Body.Close() c.Assert(resp.StatusCode, check.Equals, 200) @@ -84,31 +84,31 @@ func testPprof(c *check.C) { } func testReisgnOwner(c *check.C) { - uri := fmt.Sprintf("http://%s/capture/owner/resign", testingServerOptions.advertiseAddr) + uri := fmt.Sprintf("http://%s/capture/owner/resign", advertiseAddr4Test) testHTTPPostOnly(c, uri) testRequestNonOwnerFailed(c, uri) } func testHandleChangefeedAdmin(c *check.C) { - uri := fmt.Sprintf("http://%s/capture/owner/admin", testingServerOptions.advertiseAddr) + uri := fmt.Sprintf("http://%s/capture/owner/admin", advertiseAddr4Test) testHTTPPostOnly(c, uri) testRequestNonOwnerFailed(c, uri) } func testHandleRebalance(c *check.C) { - uri := fmt.Sprintf("http://%s/capture/owner/rebalance_trigger", 
testingServerOptions.advertiseAddr) + uri := fmt.Sprintf("http://%s/capture/owner/rebalance_trigger", advertiseAddr4Test) testHTTPPostOnly(c, uri) testRequestNonOwnerFailed(c, uri) } func testHandleMoveTable(c *check.C) { - uri := fmt.Sprintf("http://%s/capture/owner/move_table", testingServerOptions.advertiseAddr) + uri := fmt.Sprintf("http://%s/capture/owner/move_table", advertiseAddr4Test) testHTTPPostOnly(c, uri) testRequestNonOwnerFailed(c, uri) } func testHandleChangefeedQuery(c *check.C) { - uri := fmt.Sprintf("http://%s/capture/owner/changefeed/query", testingServerOptions.advertiseAddr) + uri := fmt.Sprintf("http://%s/capture/owner/changefeed/query", advertiseAddr4Test) testHTTPPostOnly(c, uri) testRequestNonOwnerFailed(c, uri) } diff --git a/cdc/kv/client.go b/cdc/kv/client.go index 9c223368b29..c7232627af5 100644 --- a/cdc/kv/client.go +++ b/cdc/kv/client.go @@ -56,9 +56,9 @@ const ( dialTimeout = 10 * time.Second maxRetry = 100 tikvRequestMaxBackoff = 20000 // Maximum total sleep time(in ms) - grpcInitialWindowSize = 1 << 30 // The value for initial window size on a stream - grpcInitialConnWindowSize = 1 << 30 // The value for initial window size on a connection - grpcMaxCallRecvMsgSize = 1 << 30 // The maximum message size the client can receive + grpcInitialWindowSize = 1 << 27 // 128 MB The value for initial window size on a stream + grpcInitialConnWindowSize = 1 << 27 // 128 MB The value for initial window size on a connection + grpcMaxCallRecvMsgSize = 1 << 30 // 1024 MB The maximum message size the client can receive grpcConnCount = 10 // The threshold of warning a message is too large. TiKV split events into 6MB per-message. @@ -68,13 +68,13 @@ const ( // failed region will be reloaded via `BatchLoadRegionsWithKeyRange` API. So we // don't need to force reload region any more. regionScheduleReload = false - - // hard code switch - // true: use kv client v2, which has a region worker for each stream - // false: use kv client v1, which runs a goroutine for every single region - enableKVClientV2 = true ) +// hard code switch +// true: use kv client v2, which has a region worker for each stream +// false: use kv client v1, which runs a goroutine for every single region +var enableKVClientV2 = true + type singleRegionInfo struct { verID tikv.RegionVerID span regionspan.ComparableSpan @@ -344,7 +344,7 @@ type CDCClient struct { } regionCache *tikv.RegionCache - kvStorage tikv.Storage + kvStorage TiKVStorage regionLimiters *regionEventFeedLimiters } @@ -354,11 +354,21 @@ func NewCDCClient(ctx context.Context, pd pd.Client, kvStorage tikv.Storage, cre clusterID := pd.GetClusterID(ctx) log.Info("get clusterID", zap.Uint64("id", clusterID)) + var store TiKVStorage + if kvStorage != nil { + // wrap to TiKVStorage if need. 
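+		// A bare tikv.Storage does not implement GetCachedCurrentVersion, which
+		// singleEventFeed relies on to obtain the current version, so it is wrapped
+		// in a StorageWithCurVersionCache keyed by the store UUID. A storage that
+		// already satisfies TiKVStorage (e.g. one injected by tests) is used as is.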
+ if s, ok := kvStorage.(TiKVStorage); ok { + store = s + } else { + store = newStorageWithCurVersionCache(kvStorage, kvStorage.UUID()) + } + } + c = &CDCClient{ clusterID: clusterID, pd: pd, + kvStorage: store, credential: credential, - kvStorage: kvStorage, regionCache: tikv.NewRegionCache(pd), mu: struct { sync.Mutex @@ -471,7 +481,7 @@ func currentRequestID() uint64 { type eventFeedSession struct { client *CDCClient regionCache *tikv.RegionCache - kvStorage tikv.Storage + kvStorage TiKVStorage lockResolver txnutil.LockResolver isPullerInit PullerInitialization @@ -514,7 +524,7 @@ type rangeRequestTask struct { func newEventFeedSession( client *CDCClient, regionCache *tikv.RegionCache, - kvStorage tikv.Storage, + kvStorage TiKVStorage, totalSpan regionspan.ComparableSpan, lockResolver txnutil.LockResolver, isPullerInit PullerInitialization, @@ -628,12 +638,14 @@ func (s *eventFeedSession) scheduleRegionRequest(ctx context.Context, sri single // goroutine, it won't block the caller of `schedulerRegionRequest`. s.scheduleDivideRegionAndRequest(ctx, r, sri.ts) } + case regionspan.LockRangeStatusCancel: + return default: panic("unreachable") } } - res := s.rangeLock.LockRange(sri.span.Start, sri.span.End, sri.verID.GetID(), sri.verID.GetVer()) + res := s.rangeLock.LockRange(ctx, sri.span.Start, sri.span.End, sri.verID.GetID(), sri.verID.GetVer()) if res.Status == regionspan.LockRangeStatusWait { res = res.WaitFn() @@ -755,6 +767,7 @@ MainLoop: state := newRegionFeedState(sri, requestID) pendingRegions.insert(requestID, state) + failpoint.Inject("kvClientPendingRegionDelay", nil) stream, ok := s.getStream(rpcCtx.Addr) // Establish the stream if it has not been connected yet. @@ -783,8 +796,12 @@ MainLoop: } bo := tikv.NewBackoffer(ctx, tikvRequestMaxBackoff) s.client.regionCache.OnSendFail(bo, rpcCtx, regionScheduleReload, err) - // Delete the pendingRegion info from `pendingRegions` and retry connecting and sending the request. - pendingRegions.take(requestID) + // Take the pendingRegion from `pendingRegions`, if the region + // is deleted already, we don't retry for this region. Otherwise, + // retry to connect and send request for this region. + if _, exists := pendingRegions.take(requestID); !exists { + continue MainLoop + } continue } s.addStream(rpcCtx.Addr, stream) @@ -1053,6 +1070,7 @@ func (s *eventFeedSession) handleError(ctx context.Context, errInfo regionErrorI } } + failpoint.Inject("kvClientRegionReentrantErrorDelay", nil) s.scheduleRegionRequest(ctx, errInfo.singleRegionInfo) return nil } @@ -1379,7 +1397,7 @@ func (s *eventFeedSession) singleEventFeed( log.Warn("region not receiving event from tikv for too long time", zap.Uint64("regionID", regionID), zap.Stringer("span", span), zap.Duration("duration", sinceLastEvent)) } - version, err := s.kvStorage.(*StorageWithCurVersionCache).GetCachedCurrentVersion() + version, err := s.kvStorage.GetCachedCurrentVersion() if err != nil { log.Warn("failed to get current version from PD", zap.Error(err)) continue diff --git a/cdc/kv/client_bench_test.go b/cdc/kv/client_bench_test.go new file mode 100644 index 00000000000..d347eb66c1b --- /dev/null +++ b/cdc/kv/client_bench_test.go @@ -0,0 +1,498 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package kv + +import ( + "context" + "fmt" + "net" + "sync" + "testing" + "time" + + "github.com/pingcap/errors" + "github.com/pingcap/kvproto/pkg/cdcpb" + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cdc/model" + "github.com/pingcap/ticdc/pkg/regionspan" + "github.com/pingcap/ticdc/pkg/retry" + "github.com/pingcap/ticdc/pkg/security" + "github.com/pingcap/ticdc/pkg/txnutil" + "github.com/pingcap/tidb/store/mockstore/mocktikv" + "github.com/pingcap/tidb/store/tikv" + "github.com/pingcap/tidb/store/tikv/oracle" + "go.uber.org/zap/zapcore" + "google.golang.org/grpc" +) + +const batchResolvedSize = 100 + +type mockChangeDataService2 struct { + b *testing.B + ch chan *cdcpb.ChangeDataEvent + recvLoop func(server cdcpb.ChangeData_EventFeedServer) +} + +func newMockChangeDataService2(b *testing.B, ch chan *cdcpb.ChangeDataEvent) *mockChangeDataService2 { + s := &mockChangeDataService2{ + b: b, + ch: ch, + } + return s +} + +func (s *mockChangeDataService2) EventFeed(server cdcpb.ChangeData_EventFeedServer) error { + if s.recvLoop != nil { + go func() { + s.recvLoop(server) + }() + } + for e := range s.ch { + if e == nil { + break + } + err := server.Send(e) + if err != nil { + s.b.Error(err) + } + } + return nil +} + +func newMockService2( + ctx context.Context, + b *testing.B, + srv cdcpb.ChangeDataServer, + wg *sync.WaitGroup, +) (grpcServer *grpc.Server, addr string) { + lc := &net.ListenConfig{} + listenAddr := "127.0.0.1:0" + lis, err := lc.Listen(ctx, "tcp", listenAddr) + if err != nil { + b.Error(err) + } + addr = lis.Addr().String() + grpcServer = grpc.NewServer() + cdcpb.RegisterChangeDataServer(grpcServer, srv) + wg.Add(1) + go func() { + err := grpcServer.Serve(lis) + if err != nil { + b.Error(err) + } + wg.Done() + }() + return +} + +func prepareBenchMultiStore(b *testing.B, storeNum, regionNum int) ( + *sync.Map, /* regionID -> requestID/storeID */ + *sync.WaitGroup, /* ensure eventfeed routine exit */ + context.CancelFunc, /* cancle both mock server and cdc kv client */ + chan *model.RegionFeedEvent, /* kv client output channel */ + []chan *cdcpb.ChangeDataEvent, /* mock server data channels */ +) { + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + requestIDs := new(sync.Map) + + servers := make([]*grpc.Server, storeNum) + inputs := make([]chan *cdcpb.ChangeDataEvent, storeNum) + addrs := make([]string, storeNum) + for i := 0; i < storeNum; i++ { + mockSrvCh := make(chan *cdcpb.ChangeDataEvent, 100000) + srv := newMockChangeDataService2(b, mockSrvCh) + srv.recvLoop = func(server cdcpb.ChangeData_EventFeedServer) { + for { + req, err := server.Recv() + if err != nil { + return + } + requestIDs.Store(req.RegionId, req.RequestId) + } + } + server, addr := newMockService2(ctx, b, srv, wg) + servers[i] = server + inputs[i] = mockSrvCh + addrs[i] = addr + } + + for i := 0; i < storeNum; i++ { + wg.Add(1) + i := i + go func() { + defer wg.Done() + <-ctx.Done() + close(inputs[i]) + servers[i].Stop() + }() + } + + rpcClient, cluster, pdClient, err := mocktikv.NewTiKVAndPDClient("") + if err != nil { + b.Error(err) + } + pdClient = &mockPDClient{Client: pdClient, versionGen: defaultVersionGen} + tiStore, 
err := tikv.NewTestTiKVStore(rpcClient, pdClient, nil, nil, 0) + if err != nil { + b.Error(err) + } + kvStorage := newStorageWithCurVersionCache(tiStore, addrs[0]) + defer kvStorage.Close() //nolint:errcheck + + // we set each region has `storeNum` peers + regionID := uint64(1_000_000) + peers := make([]uint64, storeNum) + stores := make([]uint64, storeNum) + for i := 0; i < storeNum; i++ { + peers[i] = uint64(100_000_000*i) + regionID + stores[i] = uint64(i + 1) + cluster.AddStore(uint64(i+1), addrs[i]) + } + // bootstrap cluster with the first region + cluster.Bootstrap(regionID, stores, peers, peers[0]) + for i := 0; i < storeNum; i++ { + // first region of stores except for the first store + if i > 0 { + regionID += 1 + peers := make([]uint64, storeNum) + for j := 0; j < storeNum; j++ { + peers[j] = uint64(100_000_000*j) + regionID + } + // peers[i] is the leader peer, locates in store with index i(storeID=i+1) + cluster.SplitRaw(regionID-1, regionID, []byte(fmt.Sprintf("a%d", regionID)), peers, peers[i]) + } + // regions following, split from its previous region + for j := 1; j < regionNum; j++ { + regionID += 1 + peers := make([]uint64, storeNum) + for k := 0; k < storeNum; k++ { + peers[k] = uint64(100_000_000*k) + regionID + } + // peers[i] is the leader peer, locates in store with index i(storeID=i+1) + cluster.SplitRaw(regionID-1, regionID, []byte(fmt.Sprintf("a%d", regionID)), peers, peers[i]) + } + } + + lockresolver := txnutil.NewLockerResolver(kvStorage) + isPullInit := &mockPullerInit{} + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) + eventCh := make(chan *model.RegionFeedEvent, 1000000) + wg.Add(1) + go func() { + err := cdcClient.EventFeed(ctx, regionspan.ComparableSpan{Start: []byte("a"), End: []byte("b")}, 100, false, lockresolver, isPullInit, eventCh) + if errors.Cause(err) != context.Canceled { + b.Error(err) + } + cdcClient.Close() //nolint:errcheck + wg.Done() + }() + + // wait all regions requested from cdc kv client + err = retry.Run(time.Millisecond*500, 20, func() error { + count := 0 + requestIDs.Range(func(_, _ interface{}) bool { + count++ + return true + }) + if count == regionNum*storeNum { + return nil + } + return errors.Errorf("region number %d is not as expected %d", count, regionNum) + }) + if err != nil { + b.Error(err) + } + + return requestIDs, wg, cancel, eventCh, inputs +} + +func prepareBench(b *testing.B, regionNum int) ( + *sync.Map, /* regionID -> requestID */ + *sync.WaitGroup, /* ensure eventfeed routine exit */ + context.CancelFunc, /* cancle both mock server and cdc kv client */ + chan *model.RegionFeedEvent, /* kv client output channel */ + chan *cdcpb.ChangeDataEvent, /* mock server data channel */ +) { + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + + requestIDs := new(sync.Map) + mockSrvCh := make(chan *cdcpb.ChangeDataEvent, 100000) + srv1 := newMockChangeDataService2(b, mockSrvCh) + srv1.recvLoop = func(server cdcpb.ChangeData_EventFeedServer) { + for { + req, err := server.Recv() + if err != nil { + return + } + requestIDs.Store(req.RegionId, req.RequestId) + } + } + server1, addr1 := newMockService2(ctx, b, srv1, wg) + + wg.Add(1) + go func() { + defer wg.Done() + <-ctx.Done() + close(mockSrvCh) + server1.Stop() + }() + + rpcClient, cluster, pdClient, err := mocktikv.NewTiKVAndPDClient("") + if err != nil { + b.Error(err) + } + pdClient = &mockPDClient{Client: pdClient, versionGen: defaultVersionGen} + tiStore, err := tikv.NewTestTiKVStore(rpcClient, pdClient, nil, 
nil, 0) + if err != nil { + b.Error(err) + } + kvStorage := newStorageWithCurVersionCache(tiStore, addr1) + defer kvStorage.Close() //nolint:errcheck + + storeID := uint64(1) + cluster.AddStore(storeID, addr1) + // bootstrap with region 100_000(100k) + cluster.Bootstrap(uint64(100_000), []uint64{storeID}, []uint64{100_001}, 100_001) + for i := 1; i < regionNum; i++ { + regionID := uint64(i + 100_000) + peerID := regionID + 1 + // split regions to [min, b100_001), [b100_001, b100_002), ... [bN, max) + cluster.SplitRaw(regionID-1, regionID, []byte(fmt.Sprintf("b%d", regionID)), []uint64{peerID}, peerID) + } + + lockresolver := txnutil.NewLockerResolver(kvStorage) + isPullInit := &mockPullerInit{} + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) + eventCh := make(chan *model.RegionFeedEvent, 1000000) + wg.Add(1) + go func() { + err := cdcClient.EventFeed(ctx, regionspan.ComparableSpan{Start: []byte("a"), End: []byte("z")}, 100, false, lockresolver, isPullInit, eventCh) + if errors.Cause(err) != context.Canceled { + b.Error(err) + } + cdcClient.Close() //nolint:errcheck + wg.Done() + }() + + // wait all regions requested from cdc kv client + err = retry.Run(time.Millisecond*500, 20, func() error { + count := 0 + requestIDs.Range(func(_, _ interface{}) bool { + count++ + return true + }) + if count == regionNum { + return nil + } + return errors.Errorf("region number %d is not as expected %d", count, regionNum) + }) + if err != nil { + b.Error(err) + } + + return requestIDs, wg, cancel, eventCh, mockSrvCh +} + +func benchmarkSingleWorkerResolvedTs(b *testing.B, clientV2 bool) { + enableKVClientV2 = clientV2 + log.SetLevel(zapcore.ErrorLevel) + tests := []struct { + name string + regionNum int + }{ + {name: "10", regionNum: 10}, + {name: "100", regionNum: 100}, + {name: "1k", regionNum: 1000}, + {name: "10k", regionNum: 10_000}, + {name: "20k", regionNum: 20_000}, + } + + for _, test := range tests { + requestIDs, wg, cancel, eventCh, mockSrvCh := prepareBench(b, test.regionNum) + + // copy to a normal map to reduce access latency + copyReqIDs := make(map[uint64]uint64, test.regionNum) + requestIDs.Range(func(key, value interface{}) bool { + regionID := key.(uint64) + requestID := value.(uint64) + initialized := mockInitializedEvent(regionID, requestID) + mockSrvCh <- initialized + copyReqIDs[regionID] = requestID + return true + }) + + b.Run(test.name, func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + batch := 0 + regions := make([]uint64, 0, batchResolvedSize) + rts := oracle.ComposeTS(oracle.GetPhysical(time.Now()), 0) + for regionID := range copyReqIDs { + batch++ + regions = append(regions, regionID) + if batch == batchResolvedSize { + eventResolvedBatch := &cdcpb.ChangeDataEvent{ + ResolvedTs: &cdcpb.ResolvedTs{ + Regions: regions, + Ts: rts, + }, + } + mockSrvCh <- eventResolvedBatch + batch = 0 + regions = regions[:0] + } + } + if len(regions) > 0 { + eventResolvedBatch := &cdcpb.ChangeDataEvent{ + ResolvedTs: &cdcpb.ResolvedTs{ + Regions: regions, + Ts: rts, + }, + } + mockSrvCh <- eventResolvedBatch + } + count := 0 + for range eventCh { + count++ + if count == test.regionNum { + break + } + } + } + }) + + err := retry.Run(time.Millisecond*500, 20, func() error { + if len(mockSrvCh) == 0 { + return nil + } + return errors.New("not all events are sent yet") + }) + if err != nil { + b.Error(err) + } + + cancel() + wg.Wait() + } +} + +func BenchmarkResolvedTsClientV1(b *testing.B) { + benchmarkSingleWorkerResolvedTs(b, false /* clientV1 */) 
+} + +func BenchmarkResolvedTsClientV2(b *testing.B) { + benchmarkSingleWorkerResolvedTs(b, true /* clientV2 */) +} + +func benchmarkMultipleStoreResolvedTs(b *testing.B, clientV2 bool) { + enableKVClientV2 = clientV2 + log.SetLevel(zapcore.ErrorLevel) + tests := []struct { + name string + storeNum int + regionNum int + }{ + {name: "10", storeNum: 10, regionNum: 1}, + {name: "100", storeNum: 10, regionNum: 10}, + {name: "1k", storeNum: 10, regionNum: 100}, + {name: "10k", storeNum: 10, regionNum: 1_000}, + {name: "20k", storeNum: 10, regionNum: 2_000}, + } + + for _, test := range tests { + requestIDs, wg, cancel, eventCh, inputs := prepareBenchMultiStore(b, test.storeNum, test.regionNum) + + // copy to a normal map to reduce access latency, mapping from store index to region id list + copyReqIDs := make(map[int][]uint64, test.regionNum*test.storeNum) + requestIDs.Range(func(key, value interface{}) bool { + regionID := key.(uint64) + requestID := value.(uint64) + initialized := mockInitializedEvent(regionID, requestID) + index := int(regionID-1_000_000) / test.regionNum + inputs[index] <- initialized + if _, ok := copyReqIDs[index]; !ok { + copyReqIDs[index] = make([]uint64, 0, test.regionNum) + } + copyReqIDs[index] = append(copyReqIDs[index], regionID) + return true + }) + + b.Run(test.name, func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + rts := oracle.ComposeTS(oracle.GetPhysical(time.Now()), 0) + for storeID, regionIDs := range copyReqIDs { + batch := 0 + regions := make([]uint64, 0, batchResolvedSize) + for _, regionID := range regionIDs { + batch++ + regions = append(regions, regionID) + if batch == batchResolvedSize { + eventResolvedBatch := &cdcpb.ChangeDataEvent{ + ResolvedTs: &cdcpb.ResolvedTs{ + Regions: regions, + Ts: rts, + }, + } + inputs[storeID] <- eventResolvedBatch + batch = 0 + regions = regions[:0] + } + } + if len(regions) > 0 { + eventResolvedBatch := &cdcpb.ChangeDataEvent{ + ResolvedTs: &cdcpb.ResolvedTs{ + Regions: regions, + Ts: rts, + }, + } + inputs[storeID] <- eventResolvedBatch + } + } + count := 0 + for range eventCh { + count++ + if count == test.regionNum*test.storeNum { + break + } + } + } + }) + + err := retry.Run(time.Millisecond*500, 1000, func() error { + for _, input := range inputs { + if len(input) != 0 { + return errors.New("not all events are sent yet") + } + } + return nil + }) + if err != nil { + b.Error(err) + } + + cancel() + wg.Wait() + } +} + +func BenchmarkMultiStoreResolvedTsClientV1(b *testing.B) { + benchmarkMultipleStoreResolvedTs(b, false /* clientV1 */) +} + +func BenchmarkMultiStoreResolvedTsClientV2(b *testing.B) { + benchmarkMultipleStoreResolvedTs(b, true /* clientV2 */) +} diff --git a/cdc/kv/client_mock_test.go b/cdc/kv/client_mock_test.go new file mode 100644 index 00000000000..d98961c2de7 --- /dev/null +++ b/cdc/kv/client_mock_test.go @@ -0,0 +1,67 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +/* +This file provides some common struct for unit tests an benchmark tests. 
+*/ +package kv + +import ( + "context" + + "github.com/pingcap/kvproto/pkg/cdcpb" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/ticdc/pkg/version" + pd "github.com/tikv/pd/client" +) + +type mockPDClient struct { + pd.Client + versionGen func() string +} + +var _ pd.Client = &mockPDClient{} + +func (m *mockPDClient) GetStore(ctx context.Context, storeID uint64) (*metapb.Store, error) { + s, err := m.Client.GetStore(ctx, storeID) + if err != nil { + return nil, err + } + s.Version = m.versionGen() + return s, nil +} + +var defaultVersionGen = func() string { + return version.MinTiKVVersion.String() +} + +func mockInitializedEvent(regionID, requestID uint64) *cdcpb.ChangeDataEvent { + initialized := &cdcpb.ChangeDataEvent{ + Events: []*cdcpb.Event{ + { + RegionId: regionID, + RequestId: requestID, + Event: &cdcpb.Event_Entries_{ + Entries: &cdcpb.Event_Entries{ + Entries: []*cdcpb.Event_Row{ + { + Type: cdcpb.Event_INITIALIZED, + }, + }, + }, + }, + }, + }, + } + return initialized +} diff --git a/cdc/kv/client_test.go b/cdc/kv/client_test.go index 04778cd8c57..10fff65b07f 100644 --- a/cdc/kv/client_test.go +++ b/cdc/kv/client_test.go @@ -36,11 +36,10 @@ import ( "github.com/pingcap/ticdc/pkg/security" "github.com/pingcap/ticdc/pkg/txnutil" "github.com/pingcap/ticdc/pkg/util/testleak" - "github.com/pingcap/ticdc/pkg/version" + "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/store/mockstore/mocktikv" "github.com/pingcap/tidb/store/tikv" "github.com/pingcap/tidb/store/tikv/oracle" - pd "github.com/tikv/pd/client" "go.uber.org/zap" "google.golang.org/grpc" ) @@ -182,27 +181,6 @@ func (s *clientSuite) TestAssembleRowEvent(c *check.C) { } } -func mockInitializedEvent(regionID, requestID uint64) *cdcpb.ChangeDataEvent { - initialized := &cdcpb.ChangeDataEvent{ - Events: []*cdcpb.Event{ - { - RegionId: regionID, - RequestId: requestID, - Event: &cdcpb.Event_Entries_{ - Entries: &cdcpb.Event_Entries{ - Entries: []*cdcpb.Event_Row{ - { - Type: cdcpb.Event_INITIALIZED, - }, - }, - }, - }, - }, - }, - } - return initialized -} - type mockChangeDataService struct { c *check.C ch chan *cdcpb.ChangeDataEvent @@ -297,26 +275,6 @@ func newMockServiceSpecificAddr( return } -type mockPDClient struct { - pd.Client - versionGen func() string -} - -var _ pd.Client = &mockPDClient{} - -func (m *mockPDClient) GetStore(ctx context.Context, storeID uint64) (*metapb.Store, error) { - s, err := m.Client.GetStore(ctx, storeID) - if err != nil { - return nil, err - } - s.Version = m.versionGen() - return s, nil -} - -var defaultVersionGen = func() string { - return version.MinTiKVVersion.String() -} - // waitRequestID waits request ID larger than the given allocated ID func waitRequestID(c *check.C, allocatedID uint64) { err := retry.Run(time.Millisecond*20, 10, func() error { @@ -357,9 +315,9 @@ func (s *etcdSuite) TestConnectOfflineTiKV(c *check.C) { cluster.Bootstrap(3, []uint64{1, 2}, []uint64{4, 5}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(context.Background(), pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(context.Background(), pdClient, kvStorage, &security.Credential{}) defer cdcClient.Close() //nolint:errcheck eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) @@ -395,9 +353,10 @@ func (s *etcdSuite) TestConnectOfflineTiKV(c *check.C) { 
cluster.ChangeLeader(3, 5) - ts, err := kvStorage.CurrentVersion() + ts, err := kvStorage.CurrentTimestamp(oracle.GlobalTxnScope) + ver := kv.NewVersion(ts) c.Assert(err, check.IsNil) - ch2 <- makeEvent(ts.Ver) + ch2 <- makeEvent(ver.Ver) var event *model.RegionFeedEvent select { case event = <-eventCh: @@ -411,7 +370,7 @@ func (s *etcdSuite) TestConnectOfflineTiKV(c *check.C) { case <-time.After(time.Second): c.Fatalf("reconnection not succeed in 1 second") } - checkEvent(event, ts.Ver) + checkEvent(event, ver.Ver) cancel() } @@ -444,9 +403,9 @@ func (s *etcdSuite) TestRecvLargeMessageSize(c *check.C) { cluster.Bootstrap(3, []uint64{2}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -533,9 +492,9 @@ func (s *etcdSuite) TestHandleError(c *check.C) { cluster.Bootstrap(3, []uint64{1, 2}, []uint64{4, 5}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -694,9 +653,9 @@ func (s *etcdSuite) TestCompatibilityWithSameConn(c *check.C) { cluster.Bootstrap(3, []uint64{1}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) var wg2 sync.WaitGroup wg2.Add(1) @@ -755,9 +714,9 @@ func (s *etcdSuite) TestHandleFeedEvent(c *check.C) { cluster.Bootstrap(3, []uint64{1}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1155,9 +1114,9 @@ func (s *etcdSuite) TestStreamSendWithError(c *check.C) { cluster.Bootstrap(regionID3, []uint64{1}, []uint64{4}, 4) cluster.SplitRaw(regionID3, regionID4, []byte("b"), []uint64{5}, 5) - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1261,9 +1220,9 @@ func (s *etcdSuite) TestStreamRecvWithError(c *check.C) { _ = 
failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientStreamRecvError") }() baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1414,9 +1373,9 @@ func (s *etcdSuite) TestIncompatibleTiKV(c *check.C) { defer func() { _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientDelayWhenIncompatible") }() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1506,9 +1465,9 @@ func (s *etcdSuite) TestNoPendingRegionError(c *check.C) { cluster.Bootstrap(3, []uint64{1}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) var wg2 sync.WaitGroup wg2.Add(1) @@ -1563,9 +1522,9 @@ func (s *etcdSuite) TestDropStaleRequest(c *check.C) { cluster.Bootstrap(regionID, []uint64{1}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1670,9 +1629,9 @@ func (s *etcdSuite) TestResolveLock(c *check.C) { _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientResolveLockInterval") }() baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -1767,9 +1726,9 @@ func (s *etcdSuite) testEventCommitTsFallback(c *check.C, events []*cdcpb.Change _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientErrUnreachable") }() baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) var clientWg sync.WaitGroup clientWg.Add(1) @@ -1902,9 +1861,9 @@ func (s *etcdSuite) 
TestEventAfterFeedStop(c *check.C) { _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientSingleFeedProcessDelay") }() baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -2046,9 +2005,9 @@ func (s *etcdSuite) TestOutOfRegionRangeEvent(c *check.C) { cluster.Bootstrap(3, []uint64{1}, []uint64{4}, 4) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -2274,9 +2233,9 @@ func (s *etcdSuite) TestResolveLockNoCandidate(c *check.C) { cluster.Bootstrap(regionID, []uint64{storeID}, []uint64{peerID}, peerID) baseAllocatedID := currentRequestID() - lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(kvStorage) isPullInit := &mockPullerInit{} - cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) eventCh := make(chan *model.RegionFeedEvent, 10) wg.Add(1) go func() { @@ -2321,3 +2280,154 @@ func (s *etcdSuite) TestResolveLockNoCandidate(c *check.C) { wg2.Wait() cancel() } + +// TestFailRegionReentrant tests one region could be failover multiple times, +// kv client must avoid duplicated `onRegionFail` call for the same region. +// In this test +// 1. An `unknownErr` is sent to kv client first to trigger `handleSingleRegionError` in region worker. +// 2. We delay the kv client to re-create a new region request by 500ms via failpoint. +// 3. Before new region request is fired, simulate kv client `stream.Recv` returns an error, the stream +// handler will signal region worker to exit, which will evict all active region states then. 
+func (s *etcdSuite) TestFailRegionReentrant(c *check.C) { + defer testleak.AfterTest(c)() + defer s.TearDownTest(c) + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + + ch1 := make(chan *cdcpb.ChangeDataEvent, 10) + srv1 := newMockChangeDataService(c, ch1) + server1, addr1 := newMockService(ctx, c, srv1, wg) + + defer func() { + close(ch1) + server1.Stop() + wg.Wait() + }() + + rpcClient, cluster, pdClient, err := mocktikv.NewTiKVAndPDClient("") + c.Assert(err, check.IsNil) + pdClient = &mockPDClient{Client: pdClient, versionGen: defaultVersionGen} + tiStore, err := tikv.NewTestTiKVStore(rpcClient, pdClient, nil, nil, 0) + c.Assert(err, check.IsNil) + kvStorage := newStorageWithCurVersionCache(tiStore, addr1) + defer kvStorage.Close() //nolint:errcheck + + regionID := uint64(3) + cluster.AddStore(1, addr1) + cluster.Bootstrap(regionID, []uint64{1}, []uint64{4}, 4) + + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/kv/kvClientRegionReentrantError", "1*return(\"ok\")->1*return(\"error\")") + c.Assert(err, check.IsNil) + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/kv/kvClientRegionReentrantErrorDelay", "sleep(500)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientStreamRecvError") + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientStreamRecvErrorDelay") + }() + baseAllocatedID := currentRequestID() + lockresolver := txnutil.NewLockerResolver(kvStorage.(tikv.Storage)) + isPullInit := &mockPullerInit{} + cdcClient := NewCDCClient(ctx, pdClient, kvStorage.(tikv.Storage), &security.Credential{}) + eventCh := make(chan *model.RegionFeedEvent, 10) + wg.Add(1) + go func() { + err := cdcClient.EventFeed(ctx, regionspan.ComparableSpan{Start: []byte("a"), End: []byte("b")}, 100, false, lockresolver, isPullInit, eventCh) + c.Assert(errors.Cause(err), check.Equals, context.Canceled) + cdcClient.Close() //nolint:errcheck + wg.Done() + }() + + // wait request id allocated with: new session, new request + waitRequestID(c, baseAllocatedID+1) + unknownErr := &cdcpb.ChangeDataEvent{Events: []*cdcpb.Event{ + { + RegionId: 3, + RequestId: currentRequestID(), + Event: &cdcpb.Event_Error{ + Error: &cdcpb.Error{}, + }, + }, + }} + ch1 <- unknownErr + // use a fake event to trigger one more stream.Recv + initialized := mockInitializedEvent(regionID, currentRequestID()) + ch1 <- initialized + // since re-establish new region request is delayed by `kvClientRegionReentrantErrorDelay` + // there will be reentrant region failover, the kv client should not panic. + time.Sleep(time.Second) + cancel() +} + +// TestClientV1UnlockRangeReentrant tests clientV1 can handle region reconnection +// with unstable TiKV store correctly. The test workflow is as follows: +// 1. kv client establishes two regions request, naming region-1, region-2, they +// belong to the same TiKV store. +// 2. The region-1 is firstly established, yet region-2 has some delay after its +// region state is inserted into `pendingRegions` +// 3. At this time the TiKV store crashes and `stream.Recv` returns error. In the +// defer function of `receiveFromStream`, all pending regions will be cleaned +// up, which means the region lock will be unlocked once for these regions. +// 4. 
In step-2, the region-2 continues to run, it can't get store stream which +// has been deleted in step-3, so it will create new stream but fails because +// of unstable TiKV store, at this point, the kv client should handle with the +// pending region correctly. +func (s *etcdSuite) TestClientV1UnlockRangeReentrant(c *check.C) { + defer testleak.AfterTest(c)() + defer s.TearDownTest(c) + + clientv2 := enableKVClientV2 + enableKVClientV2 = false + defer func() { + enableKVClientV2 = clientv2 + }() + + ctx, cancel := context.WithCancel(context.Background()) + wg := &sync.WaitGroup{} + + ch1 := make(chan *cdcpb.ChangeDataEvent, 10) + srv1 := newMockChangeDataService(c, ch1) + server1, addr1 := newMockService(ctx, c, srv1, wg) + + rpcClient, cluster, pdClient, err := mocktikv.NewTiKVAndPDClient("") + c.Assert(err, check.IsNil) + pdClient = &mockPDClient{Client: pdClient, versionGen: defaultVersionGen} + tiStore, err := tikv.NewTestTiKVStore(rpcClient, pdClient, nil, nil, 0) + c.Assert(err, check.IsNil) + kvStorage := newStorageWithCurVersionCache(tiStore, addr1) + defer kvStorage.Close() //nolint:errcheck + + regionID3 := uint64(3) + regionID4 := uint64(4) + cluster.AddStore(1, addr1) + cluster.Bootstrap(regionID3, []uint64{1}, []uint64{4}, 4) + cluster.SplitRaw(regionID3, regionID4, []byte("b"), []uint64{5}, 5) + + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/kv/kvClientStreamRecvError", "1*return(true)") + c.Assert(err, check.IsNil) + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/kv/kvClientPendingRegionDelay", "1*sleep(0)->1*sleep(2000)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientStreamRecvError") + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/kv/kvClientPendingRegionDelay") + }() + lockresolver := txnutil.NewLockerResolver(kvStorage) + isPullInit := &mockPullerInit{} + cdcClient := NewCDCClient(ctx, pdClient, kvStorage, &security.Credential{}) + eventCh := make(chan *model.RegionFeedEvent, 10) + wg.Add(1) + go func() { + err := cdcClient.EventFeed(ctx, regionspan.ComparableSpan{Start: []byte("a"), End: []byte("c")}, 100, false, lockresolver, isPullInit, eventCh) + c.Assert(errors.Cause(err), check.Equals, context.Canceled) + cdcClient.Close() //nolint:errcheck + wg.Done() + }() + + // wait the second region is scheduled + time.Sleep(time.Millisecond * 500) + close(ch1) + server1.Stop() + // wait the kvClientPendingRegionDelay ends, and the second region is processed + time.Sleep(time.Second * 2) + cancel() + wg.Wait() +} diff --git a/cdc/kv/client_v2.go b/cdc/kv/client_v2.go index 5f600317bd6..625ff1ce8db 100644 --- a/cdc/kv/client_v2.go +++ b/cdc/kv/client_v2.go @@ -205,6 +205,11 @@ func (s *eventFeedSession) receiveFromStreamV2( for { cevent, err := stream.Recv() + failpoint.Inject("kvClientRegionReentrantError", func(op failpoint.Value) { + if op.(string) == "error" { + worker.inputCh <- nil + } + }) failpoint.Inject("kvClientStreamRecvError", func() { err = errors.New("injected stream recv error") }) diff --git a/cdc/kv/etcd.go b/cdc/kv/etcd.go index 9b53649117f..19f44b3f2c9 100644 --- a/cdc/kv/etcd.go +++ b/cdc/kv/etcd.go @@ -131,6 +131,18 @@ func (c CDCEtcdClient) Close() error { return c.Client.Unwrap().Close() } +func (c CDCEtcdClient) contextWithSafeLease(ctx context.Context, leaseID clientv3.LeaseID) (context.Context, context.CancelFunc, error) { + lease, err := c.Client.TimeToLive(ctx, leaseID) + if err != nil { + return nil, nil, cerror.WrapError(cerror.ErrPDEtcdAPIError, err) + } + if 
lease.TTL == int64(-1) { + return nil, nil, cerror.ErrLeaseTimeout.GenWithStackByArgs() + } + ctx, cancel := context.WithTimeout(ctx, time.Duration(lease.TTL)*time.Second) + return ctx, cancel, nil +} + // ClearAllCDCInfo delete all keys created by CDC func (c CDCEtcdClient) ClearAllCDCInfo(ctx context.Context) error { _, err := c.Client.Delete(ctx, EtcdKeyBase, clientv3.WithPrefix()) @@ -857,3 +869,172 @@ func (c CDCEtcdClient) GetOwnerID(ctx context.Context, key string) (string, erro } return string(resp.Kvs[0].Value), nil } + +// LeaseGuardDeleteTaskStatus is a wrapper to DeleteTaskStatus, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardDeleteTaskStatus( + ctx context.Context, + cfID string, + captureID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.DeleteTaskStatus(ctx, cfID, captureID) +} + +// LeaseGuardDeleteTaskPosition is a wrapper to DeleteTaskPosition, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardDeleteTaskPosition( + ctx context.Context, + cfID string, + captureID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.DeleteTaskPosition(ctx, cfID, captureID) +} + +// LeaseGuardDeleteTaskWorkload is a wrapper to DeleteTaskWorkload, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardDeleteTaskWorkload( + ctx context.Context, + cfID string, + captureID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.DeleteTaskWorkload(ctx, cfID, captureID) +} + +// LeaseGuardSaveChangeFeedInfo is a wrapper to SaveChangeFeedInfo, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardSaveChangeFeedInfo( + ctx context.Context, + info *model.ChangeFeedInfo, + changefeedID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.SaveChangeFeedInfo(ctx, info, changefeedID) +} + +// LeaseGuardDeleteChangeFeedInfo is a wrapper to DeleteChangeFeedInfo, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardDeleteChangeFeedInfo( + ctx context.Context, + changefeedID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.DeleteChangeFeedInfo(ctx, changefeedID) +} + +// LeaseGuardRemoveChangeFeedStatus is a wrapper to RemoveChangeFeedStatus, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardRemoveChangeFeedStatus( + ctx context.Context, + changefeedID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.RemoveChangeFeedStatus(ctx, changefeedID) +} + +// LeaseGuardPutChangeFeedStatus is a wrapper to PutChangeFeedStatus, +// with a context restricted by lease TTL. 
+func (c CDCEtcdClient) LeaseGuardPutChangeFeedStatus( + ctx context.Context, + changefeedID string, + status *model.ChangeFeedStatus, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.PutChangeFeedStatus(ctx, changefeedID, status) +} + +// LeaseGuardRemoveAllTaskStatus wraps RemoveAllTaskStatus, +// with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardRemoveAllTaskStatus( + ctx context.Context, + changefeedID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.RemoveAllTaskStatus(ctx, changefeedID) +} + +// LeaseGuardRemoveAllTaskPositions wraps RemoveAllTaskPositions with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardRemoveAllTaskPositions( + ctx context.Context, + changefeedID string, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.RemoveAllTaskPositions(ctx, changefeedID) +} + +// LeaseGuardPutAllChangeFeedStatus wraps PutAllChangeFeedStatus with a context restricted by lease TTL. +func (c CDCEtcdClient) LeaseGuardPutAllChangeFeedStatus( + ctx context.Context, + infos map[model.ChangeFeedID]*model.ChangeFeedStatus, + leaseID clientv3.LeaseID, +) error { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return errors.Trace(err) + } + defer cancel() + return c.PutAllChangeFeedStatus(ctx, infos) +} + +// LeaseGuardAtomicPutTaskStatus puts task status into etcd atomically. +func (c CDCEtcdClient) LeaseGuardAtomicPutTaskStatus( + ctx context.Context, + changefeedID string, + captureID string, + leaseID clientv3.LeaseID, + updateFuncs ...UpdateTaskStatusFunc, +) (*model.TaskStatus, int64, error) { + ctx, cancel, err := c.contextWithSafeLease(ctx, leaseID) + if err != nil { + return nil, 0, errors.Trace(err) + } + defer cancel() + return c.AtomicPutTaskStatus(ctx, changefeedID, captureID, updateFuncs...) 
+} diff --git a/cdc/kv/etcd_test.go b/cdc/kv/etcd_test.go index d07ec699eac..5fe823ecc9f 100644 --- a/cdc/kv/etcd_test.go +++ b/cdc/kv/etcd_test.go @@ -154,7 +154,9 @@ func (s *etcdSuite) TestDeleteTaskStatus(c *check.C) { err := s.client.PutTaskStatus(ctx, feedID, captureID, info) c.Assert(err, check.IsNil) - err = s.client.DeleteTaskStatus(ctx, feedID, captureID) + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardDeleteTaskStatus(ctx, feedID, captureID, sess.Lease()) c.Assert(err, check.IsNil) _, _, err = s.client.GetTaskStatus(ctx, feedID, captureID) c.Assert(cerror.ErrTaskStatusNotExists.Equal(err), check.IsTrue) @@ -209,7 +211,9 @@ func (s *etcdSuite) TestDeleteTaskPosition(c *check.C) { _, err := s.client.PutTaskPositionOnChange(ctx, feedID, captureID, info) c.Assert(err, check.IsNil) - err = s.client.DeleteTaskPosition(ctx, feedID, captureID) + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardDeleteTaskPosition(ctx, feedID, captureID, sess.Lease()) c.Assert(err, check.IsNil) _, _, err = s.client.GetTaskPosition(ctx, feedID, captureID) c.Assert(cerror.ErrTaskPositionNotExists.Equal(err), check.IsTrue) @@ -224,20 +228,57 @@ func (s *etcdSuite) TestOpChangeFeedDetail(c *check.C) { } cfID := "test-op-cf" - err := s.client.SaveChangeFeedInfo(ctx, detail, cfID) + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardSaveChangeFeedInfo(ctx, detail, cfID, sess.Lease()) c.Assert(err, check.IsNil) d, err := s.client.GetChangeFeedInfo(ctx, cfID) c.Assert(err, check.IsNil) c.Assert(d.SinkURI, check.Equals, detail.SinkURI) - err = s.client.DeleteChangeFeedInfo(ctx, cfID) + err = s.client.LeaseGuardDeleteChangeFeedInfo(ctx, cfID, sess.Lease()) c.Assert(err, check.IsNil) _, err = s.client.GetChangeFeedInfo(ctx, cfID) c.Assert(cerror.ErrChangeFeedNotExists.Equal(err), check.IsTrue) } +func (s *etcdSuite) TestRemoveAllTaskXXX(c *check.C) { + defer testleak.AfterTest(c)() + defer s.TearDownTest(c) + ctx := context.Background() + status := &model.TaskStatus{ + Tables: map[model.TableID]*model.TableReplicaInfo{ + 1: {StartTs: 100}, + }, + } + position := &model.TaskPosition{ + ResolvedTs: 100, + CheckPointTs: 100, + } + + feedID := "feedid" + captureID := "captureid" + + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + + err = s.client.PutTaskStatus(ctx, feedID, captureID, status) + c.Assert(err, check.IsNil) + _, err = s.client.PutTaskPositionOnChange(ctx, feedID, captureID, position) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardRemoveAllTaskStatus(ctx, feedID, sess.Lease()) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardRemoveAllTaskPositions(ctx, feedID, sess.Lease()) + c.Assert(err, check.IsNil) + + _, _, err = s.client.GetTaskStatus(ctx, feedID, captureID) + c.Assert(cerror.ErrTaskStatusNotExists.Equal(err), check.IsTrue) + _, _, err = s.client.GetTaskPosition(ctx, feedID, captureID) + c.Assert(cerror.ErrTaskPositionNotExists.Equal(err), check.IsTrue) +} + func (s *etcdSuite) TestPutAllChangeFeedStatus(c *check.C) { defer testleak.AfterTest(c)() defer s.TearDownTest(c) @@ -272,7 +313,9 @@ func (s *etcdSuite) TestPutAllChangeFeedStatus(c *check.C) { c.Assert(err, check.IsNil) } - err = s.client.PutAllChangeFeedStatus(context.Background(), 
tc.infos) + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardPutAllChangeFeedStatus(context.Background(), tc.infos, sess.Lease()) c.Assert(err, check.IsNil) for changefeedID, info := range tc.infos { @@ -314,12 +357,16 @@ func (s *etcdSuite) TestRemoveChangeFeedStatus(c *check.C) { status := &model.ChangeFeedStatus{ ResolvedTs: 1, } - err := s.client.PutChangeFeedStatus(ctx, changefeedID, status) + + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardPutChangeFeedStatus(ctx, changefeedID, status, sess.Lease()) c.Assert(err, check.IsNil) status, _, err = s.client.GetChangeFeedStatus(ctx, changefeedID) c.Assert(err, check.IsNil) c.Assert(status, check.DeepEquals, status) - err = s.client.RemoveChangeFeedStatus(ctx, changefeedID) + + err = s.client.LeaseGuardRemoveChangeFeedStatus(ctx, changefeedID, sess.Lease()) c.Assert(err, check.IsNil) _, _, err = s.client.GetChangeFeedStatus(ctx, changefeedID) c.Assert(cerror.ErrChangeFeedNotExists.Equal(err), check.IsTrue) @@ -372,7 +419,9 @@ func (s *etcdSuite) TestDeleteTaskWorkload(c *check.C) { err := s.client.PutTaskWorkload(ctx, feedID, captureID, workload) c.Assert(err, check.IsNil) - err = s.client.DeleteTaskWorkload(ctx, feedID, captureID) + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.LeaseGuardDeleteTaskWorkload(ctx, feedID, captureID, sess.Lease()) c.Assert(err, check.IsNil) tw, err := s.client.GetTaskWorkload(ctx, feedID, captureID) @@ -517,3 +566,58 @@ func (s *etcdSuite) TestGetAllCDCInfo(c *check.C) { c.Assert(string(kv.Value), check.Equals, expected[i].value) } } + +func (s *etcdSuite) TestAtomicPutTaskStatus(c *check.C) { + defer testleak.AfterTest(c)() + defer s.TearDownTest(c) + ctx := context.Background() + status := &model.TaskStatus{ + Tables: map[model.TableID]*model.TableReplicaInfo{ + 1: {StartTs: 100}, + }, + } + feedID := "feedid" + captureID := "captureid" + + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + err = s.client.PutTaskStatus(ctx, feedID, captureID, status) + c.Assert(err, check.IsNil) + + status.Tables[2] = &model.TableReplicaInfo{StartTs: 120} + _, revision, err := s.client.LeaseGuardAtomicPutTaskStatus( + ctx, feedID, captureID, sess.Lease(), + func(modRevision int64, taskStatus *model.TaskStatus) (bool, error) { + taskStatus.Tables = status.Tables + taskStatus.Operation = status.Operation + return true, nil + }, + ) + c.Assert(err, check.IsNil) + modRevision, newStatus, err := s.client.GetTaskStatus(ctx, feedID, captureID) + c.Assert(err, check.IsNil) + c.Assert(modRevision, check.Equals, revision) + c.Assert(newStatus, check.DeepEquals, status) +} + +func (s *etcdSuite) TestLeaseGuardWorks(c *check.C) { + defer testleak.AfterTest(c)() + defer s.TearDownTest(c) + + // embed etcd election timeout is 1s, minimum session ttl is 2s + sess, err := concurrency.NewSession(s.client.Client.Unwrap(), concurrency.WithTTL(2)) + c.Assert(err, check.IsNil) + ctx, _, err := s.client.contextWithSafeLease(context.Background(), sess.Lease()) + c.Assert(err, check.IsNil) + time.Sleep(time.Second * 2) + select { + case <-ctx.Done(): + case <-time.After(time.Second): + c.Errorf("context is not done as expected") + } + + _, err = s.client.Client.Revoke(context.Background(), sess.Lease()) + 
c.Assert(err, check.IsNil) + _, _, err = s.client.contextWithSafeLease(context.Background(), sess.Lease()) + c.Assert(cerror.ErrLeaseTimeout.Equal(err), check.IsTrue) +} diff --git a/cdc/kv/region_worker.go b/cdc/kv/region_worker.go index 7384f1aaca1..ba014d74128 100644 --- a/cdc/kv/region_worker.go +++ b/cdc/kv/region_worker.go @@ -15,6 +15,7 @@ package kv import ( "context" + "runtime" "sync" "time" @@ -33,6 +34,55 @@ import ( "golang.org/x/time/rate" ) +const ( + minRegionStateBucket = 4 + maxRegionStateBucket = 16 +) + +// regionStateManager provides the get/put way like a sync.Map, and it is divided +// into several buckets to reduce lock contention +type regionStateManager struct { + bucket int + states []*sync.Map +} + +func newRegionStateManager(bucket int) *regionStateManager { + if bucket <= 0 { + bucket = runtime.NumCPU() + if bucket > maxRegionStateBucket { + bucket = maxRegionStateBucket + } + if bucket < minRegionStateBucket { + bucket = minRegionStateBucket + } + } + rsm := &regionStateManager{ + bucket: bucket, + states: make([]*sync.Map, bucket), + } + for i := range rsm.states { + rsm.states[i] = new(sync.Map) + } + return rsm +} + +func (rsm *regionStateManager) getBucket(regionID uint64) int { + return int(regionID) % rsm.bucket +} + +func (rsm *regionStateManager) getState(regionID uint64) (*regionFeedState, bool) { + bucket := rsm.getBucket(regionID) + if val, ok := rsm.states[bucket].Load(regionID); ok { + return val.(*regionFeedState), true + } + return nil, false +} + +func (rsm *regionStateManager) setState(regionID uint64, state *regionFeedState) { + bucket := rsm.getBucket(regionID) + rsm.states[bucket].Store(regionID, state) +} + /* `regionWorker` maintains N regions, it runs in background for each gRPC stream, corresponding to one TiKV store. It receives `regionStatefulEvent` in a channel @@ -47,15 +97,18 @@ lock for each region state(each region state has one lock). for event processing to increase throughput. 
*/ type regionWorker struct { - session *eventFeedSession - limiter *rate.Limiter - inputCh chan *regionStatefulEvent - outputCh chan<- *model.RegionFeedEvent - regionStates map[uint64]*regionFeedState - statesLock sync.RWMutex + session *eventFeedSession + limiter *rate.Limiter + + inputCh chan *regionStatefulEvent + outputCh chan<- *model.RegionFeedEvent + + statesManager *regionStateManager + + rtsManager *resolvedTsManager + rtsUpdateCh chan *regionResolvedTs + enableOldValue bool - rtsManager *resolvedTsManager - rtsUpdateCh chan *regionResolvedTs } func newRegionWorker(s *eventFeedSession, limiter *rate.Limiter) *regionWorker { @@ -64,25 +117,20 @@ func newRegionWorker(s *eventFeedSession, limiter *rate.Limiter) *regionWorker { limiter: limiter, inputCh: make(chan *regionStatefulEvent, 1024), outputCh: s.eventCh, - regionStates: make(map[uint64]*regionFeedState), - enableOldValue: s.enableOldValue, + statesManager: newRegionStateManager(-1), rtsManager: newResolvedTsManager(), rtsUpdateCh: make(chan *regionResolvedTs, 1024), + enableOldValue: s.enableOldValue, } return worker } func (w *regionWorker) getRegionState(regionID uint64) (*regionFeedState, bool) { - w.statesLock.RLock() - defer w.statesLock.RUnlock() - state, ok := w.regionStates[regionID] - return state, ok + return w.statesManager.getState(regionID) } func (w *regionWorker) setRegionState(regionID uint64, state *regionFeedState) { - w.statesLock.Lock() - defer w.statesLock.Unlock() - w.regionStates[regionID] = state + w.statesManager.setState(regionID, state) } func (w *regionWorker) handleSingleRegionError(ctx context.Context, err error, state *regionFeedState) error { @@ -96,6 +144,10 @@ func (w *regionWorker) handleSingleRegionError(ctx context.Context, err error, s zap.Stringer("span", state.sri.span), zap.Uint64("checkpoint", state.sri.ts), zap.String("error", err.Error())) + // if state is already marked stopped, it must have been or would be processed by `onRegionFail` + if state.isStopped() { + return nil + } // We need to ensure when the error is handled, `isStopped` must be set. So set it before sending the error. state.markStopped() failpoint.Inject("kvClientSingleFeedProcessDelay", nil) @@ -145,7 +197,7 @@ func (w *regionWorker) resolveLock(ctx context.Context) error { // Initializing a puller may take a long time, skip resolved lock to save unnecessary overhead. continue } - version, err := w.session.kvStorage.(*StorageWithCurVersionCache).GetCachedCurrentVersion() + version, err := w.session.kvStorage.GetCachedCurrentVersion() if err != nil { log.Warn("failed to get current version from PD", zap.Error(err)) continue @@ -166,9 +218,8 @@ func (w *regionWorker) resolveLock(ctx context.Context) error { continue } maxVersion := oracle.ComposeTS(oracle.GetPhysical(currentTimeFromPD.Add(-10*time.Second)), 0) - w.statesLock.RLock() for _, rts := range expired { - state, ok := w.regionStates[rts.regionID] + state, ok := w.getRegionState(rts.regionID) if !ok || state.isStopped() { // state is already deleted or stoppped, just continue, // and don't need to push resolved ts back to heap. 
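Editor's note: the regionStateManager introduced in this file shards region state across several sync.Map buckets, choosing a bucket by regionID modulo the bucket count, so concurrent lookups and updates for different regions rarely contend on the same map. A minimal standalone sketch of that sharding idea follows; the shardedStates type and its string payload are illustrative stand-ins, not ticdc types.

package main

import (
	"fmt"
	"sync"
)

// shardedStates is a hypothetical, simplified analogue of regionStateManager:
// values are spread over several sync.Map buckets so that readers and writers
// of different keys usually touch different buckets.
type shardedStates struct {
	bucket int
	states []*sync.Map
}

func newShardedStates(bucket int) *shardedStates {
	s := &shardedStates{bucket: bucket, states: make([]*sync.Map, bucket)}
	for i := range s.states {
		s.states[i] = new(sync.Map)
	}
	return s
}

// getBucket picks a bucket with a cheap modulo hash of the region ID.
func (s *shardedStates) getBucket(regionID uint64) int {
	return int(regionID % uint64(s.bucket))
}

func (s *shardedStates) get(regionID uint64) (string, bool) {
	v, ok := s.states[s.getBucket(regionID)].Load(regionID)
	if !ok {
		return "", false
	}
	return v.(string), true
}

func (s *shardedStates) set(regionID uint64, state string) {
	s.states[s.getBucket(regionID)].Store(regionID, state)
}

func main() {
	m := newShardedStates(4)
	m.set(3, "initialized")
	if state, ok := m.get(3); ok {
		fmt.Println(state) // prints "initialized"
	}
}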
@@ -193,7 +244,6 @@ func (w *regionWorker) resolveLock(ctx context.Context) error { w.rtsManager.Upsert(rts) state.lock.RUnlock() } - w.statesLock.RUnlock() } } } @@ -440,21 +490,27 @@ func (w *regionWorker) handleResolvedTs( // evictAllRegions is used when gRPC stream meets error and re-establish, notify // all existing regions to re-establish func (w *regionWorker) evictAllRegions(ctx context.Context) error { - w.statesLock.Lock() - defer w.statesLock.Unlock() - for _, state := range w.regionStates { - state.lock.RLock() - singleRegionInfo := state.sri.partialClone() - state.lock.RUnlock() - err := w.session.onRegionFail(ctx, regionErrorInfo{ - singleRegionInfo: singleRegionInfo, - err: &rpcCtxUnavailableErr{ - verID: singleRegionInfo.verID, - }, + var err error + for _, states := range w.statesManager.states { + states.Range(func(_, value interface{}) bool { + state := value.(*regionFeedState) + state.lock.Lock() + // if state is marked as stopped, it must have been or would be processed by `onRegionFail` + if state.isStopped() { + state.lock.Unlock() + return true + } + state.markStopped() + singleRegionInfo := state.sri.partialClone() + state.lock.Unlock() + err = w.session.onRegionFail(ctx, regionErrorInfo{ + singleRegionInfo: singleRegionInfo, + err: &rpcCtxUnavailableErr{ + verID: singleRegionInfo.verID, + }, + }) + return err == nil }) - if err != nil { - return err - } } - return nil + return err } diff --git a/cdc/kv/region_worker_test.go b/cdc/kv/region_worker_test.go new file mode 100644 index 00000000000..aa8e1dda8a6 --- /dev/null +++ b/cdc/kv/region_worker_test.go @@ -0,0 +1,111 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package kv + +import ( + "math/rand" + "sync" + + "github.com/pingcap/check" + "github.com/pingcap/ticdc/pkg/util/testleak" +) + +type regionWorkerSuite struct{} + +var _ = check.Suite(&regionWorkerSuite{}) + +func (s *regionWorkerSuite) TestRegionStateManager(c *check.C) { + defer testleak.AfterTest(c)() + rsm := newRegionStateManager(4) + + regionID := uint64(1000) + _, ok := rsm.getState(regionID) + c.Assert(ok, check.IsFalse) + + rsm.setState(regionID, &regionFeedState{requestID: 2}) + state, ok := rsm.getState(regionID) + c.Assert(ok, check.IsTrue) + c.Assert(state.requestID, check.Equals, uint64(2)) +} + +func (s *regionWorkerSuite) TestRegionStateManagerThreadSafe(c *check.C) { + defer testleak.AfterTest(c)() + rsm := newRegionStateManager(4) + regionCount := 100 + regionIDs := make([]uint64, regionCount) + for i := 0; i < regionCount; i++ { + regionID := uint64(1000 + i) + regionIDs[i] = regionID + rsm.setState(regionID, &regionFeedState{requestID: uint64(i + 1), lastResolvedTs: uint64(1000)}) + } + + var wg sync.WaitGroup + concurrency := 20 + wg.Add(concurrency * 2) + for j := 0; j < concurrency; j++ { + go func() { + defer wg.Done() + for i := 0; i < 10000; i++ { + idx := rand.Intn(regionCount) + regionID := regionIDs[idx] + s, ok := rsm.getState(regionID) + c.Assert(ok, check.IsTrue) + s.lock.RLock() + c.Assert(s.requestID, check.Equals, uint64(idx+1)) + s.lock.RUnlock() + } + }() + } + for j := 0; j < concurrency; j++ { + go func() { + defer wg.Done() + for i := 0; i < 10000; i++ { + // simulate write less than read + if i%5 != 0 { + continue + } + regionID := regionIDs[rand.Intn(regionCount)] + s, ok := rsm.getState(regionID) + c.Assert(ok, check.IsTrue) + s.lock.Lock() + s.lastResolvedTs += 10 + s.lock.Unlock() + rsm.setState(regionID, s) + } + }() + } + wg.Wait() + + totalResolvedTs := uint64(0) + for _, regionID := range regionIDs { + s, ok := rsm.getState(regionID) + c.Assert(ok, check.IsTrue) + c.Assert(s.lastResolvedTs, check.Greater, uint64(1000)) + totalResolvedTs += s.lastResolvedTs + } + // 100 regions, initial resolved ts 1000; + // 2000 * resolved ts forward, increased by 10 each time, routine number is `concurrency`. + c.Assert(totalResolvedTs, check.Equals, uint64(100*1000+2000*10*concurrency)) +} + +func (s *regionWorkerSuite) TestRegionStateManagerBucket(c *check.C) { + defer testleak.AfterTest(c)() + rsm := newRegionStateManager(-1) + c.Assert(rsm.bucket, check.GreaterEqual, minRegionStateBucket) + c.Assert(rsm.bucket, check.LessEqual, maxRegionStateBucket) + + bucket := rsm.bucket * 2 + rsm = newRegionStateManager(bucket) + c.Assert(rsm.bucket, check.Equals, bucket) +} diff --git a/cdc/kv/store_op.go b/cdc/kv/store_op.go index dda13d6f21a..d7583f221bb 100644 --- a/cdc/kv/store_op.go +++ b/cdc/kv/store_op.go @@ -25,13 +25,22 @@ import ( "github.com/pingcap/ticdc/pkg/flags" "github.com/pingcap/ticdc/pkg/security" tidbconfig "github.com/pingcap/tidb/config" + "github.com/pingcap/tidb/kv" tidbkv "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/meta" "github.com/pingcap/tidb/store" + "github.com/pingcap/tidb/store/driver" "github.com/pingcap/tidb/store/tikv" + "github.com/pingcap/tidb/store/tikv/oracle" "go.uber.org/zap" ) +// TiKVStorage is the tikv storage interface used by CDC. 
+type TiKVStorage interface { + tikv.Storage + GetCachedCurrentVersion() (version tidbkv.Version, err error) +} + const ( storageVersionCacheUpdateInterval = time.Second * 2 ) @@ -53,7 +62,7 @@ var ( curVersionCacheMu sync.Mutex ) -func newStorageWithCurVersionCache(storage tidbkv.Storage, cacheKey string) tidbkv.Storage { +func newStorageWithCurVersionCache(storage tikv.Storage, cacheKey string) TiKVStorage { curVersionCacheMu.Lock() defer curVersionCacheMu.Unlock() @@ -65,7 +74,7 @@ func newStorageWithCurVersionCache(storage tidbkv.Storage, cacheKey string) tidb } return &StorageWithCurVersionCache{ - Storage: storage.(tikv.Storage), + Storage: storage, cacheKey: cacheKey, } } @@ -85,11 +94,12 @@ func (s *StorageWithCurVersionCache) GetCachedCurrentVersion() (version tidbkv.V defer entry.mu.Unlock() if time.Now().After(entry.lastUpdated.Add(storageVersionCacheUpdateInterval)) { - var ver tidbkv.Version - ver, err = s.CurrentVersion() + var ts uint64 + ts, err = s.CurrentTimestamp(oracle.GlobalTxnScope) if err != nil { return } + ver := kv.NewVersion(ts) entry.ts = ver.Ver entry.lastUpdated = time.Now() } @@ -106,14 +116,14 @@ func GetSnapshotMeta(tiStore tidbkv.Storage, ts uint64) (*meta.Meta, error) { } // CreateTiStore creates a new tikv storage client -func CreateTiStore(urls string, credential *security.Credential) (tidbkv.Storage, error) { +func CreateTiStore(urls string, credential *security.Credential) (kv.Storage, error) { urlv, err := flags.NewURLsValue(urls) if err != nil { return nil, errors.Trace(err) } // Ignore error if it is already registered. - _ = store.Register("tikv", tikv.Driver{}) + _ = store.Register("tikv", driver.TiKVDriver{}) if credential.CAPath != "" { conf := tidbconfig.GetGlobalConfig() @@ -128,7 +138,5 @@ func CreateTiStore(urls string, credential *security.Credential) (tidbkv.Storage if err != nil { return nil, cerror.WrapError(cerror.ErrNewStore, err) } - - tiStore = newStorageWithCurVersionCache(tiStore, tiPath) return tiStore, nil } diff --git a/cdc/kv/testing.go b/cdc/kv/testing.go index e48a0934886..ef57b8a2ee0 100644 --- a/cdc/kv/testing.go +++ b/cdc/kv/testing.go @@ -28,6 +28,7 @@ import ( "github.com/pingcap/ticdc/pkg/txnutil" "github.com/pingcap/tidb/kv" "github.com/pingcap/tidb/store" + "github.com/pingcap/tidb/store/driver" "github.com/pingcap/tidb/store/tikv" "github.com/stretchr/testify/require" pd "github.com/tikv/pd/client" @@ -105,7 +106,7 @@ func (ec *eventChecker) stop() { // CreateStorage creates a tikv Storage instance. func CreateStorage(pdAddr string) (storage kv.Storage, err error) { tiPath := fmt.Sprintf("tikv://%s?disableGC=true", pdAddr) - err = store.Register("tikv", tikv.Driver{}) + err = store.Register("tikv", driver.TiKVDriver{}) if err != nil && !strings.Contains(err.Error(), "already registered") { return } @@ -113,7 +114,7 @@ func CreateStorage(pdAddr string) (storage kv.Storage, err error) { return } -func mustGetTimestamp(t require.TestingT, storage kv.Storage) uint64 { +func mustGetTimestamp(t require.TestingT, storage tikv.Storage) uint64 { ts, err := storage.GetOracle().GetTimestamp(context.Background(), nil) require.NoError(t, err) @@ -143,8 +144,8 @@ func (*mockPullerInit) IsInitialized() bool { // TestSplit try split on every region, and test can get value event from // every region after split. 
-func TestSplit(t require.TestingT, pdCli pd.Client, storage kv.Storage) { - cli := NewCDCClient(context.Background(), pdCli, storage.(tikv.Storage), &security.Credential{}) +func TestSplit(t require.TestingT, pdCli pd.Client, storage tikv.Storage, kvStore kv.Storage) { + cli := NewCDCClient(context.Background(), pdCli, storage, &security.Credential{}) defer cli.Close() eventCh := make(chan *model.RegionFeedEvent, 1<<20) @@ -153,7 +154,7 @@ func TestSplit(t require.TestingT, pdCli pd.Client, storage kv.Storage) { startTS := mustGetTimestamp(t, storage) - lockresolver := txnutil.NewLockerResolver(storage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(storage) isPullInit := &mockPullerInit{} go func() { err := cli.EventFeed(ctx, regionspan.ComparableSpan{Start: nil, End: nil}, startTS, false, lockresolver, isPullInit, eventCh) @@ -201,7 +202,7 @@ func TestSplit(t require.TestingT, pdCli pd.Client, storage kv.Storage) { value := genValue() var tx kv.Transaction - tx, err = storage.Begin() + tx, err = kvStore.Begin() require.NoError(t, err) err = tx.Set(key, value) require.NoError(t, err) @@ -232,8 +233,8 @@ func mustDeleteKey(t require.TestingT, storage kv.Storage, key []byte) { } // TestGetKVSimple test simple KV operations -func TestGetKVSimple(t require.TestingT, pdCli pd.Client, storage kv.Storage) { - cli := NewCDCClient(context.Background(), pdCli, storage.(tikv.Storage), &security.Credential{}) +func TestGetKVSimple(t require.TestingT, pdCli pd.Client, storage tikv.Storage, kvStore kv.Storage) { + cli := NewCDCClient(context.Background(), pdCli, storage, &security.Credential{}) defer cli.Close() checker := newEventChecker(t) @@ -241,7 +242,7 @@ func TestGetKVSimple(t require.TestingT, pdCli pd.Client, storage kv.Storage) { defer cancel() startTS := mustGetTimestamp(t, storage) - lockresolver := txnutil.NewLockerResolver(storage.(tikv.Storage)) + lockresolver := txnutil.NewLockerResolver(storage) isPullInit := &mockPullerInit{} go func() { err := cli.EventFeed(ctx, regionspan.ComparableSpan{Start: nil, End: nil}, startTS, false, lockresolver, isPullInit, checker.eventCh) @@ -252,13 +253,13 @@ func TestGetKVSimple(t require.TestingT, pdCli pd.Client, storage kv.Storage) { value := []byte("s1v") // set - mustSetKey(t, storage, key, value) + mustSetKey(t, kvStore, key, value) // delete - mustDeleteKey(t, storage, key) + mustDeleteKey(t, kvStore, key) // set again - mustSetKey(t, storage, key, value) + mustSetKey(t, kvStore, key, value) for i := 0; i < 2; i++ { // start a new EventFeed with the startTS before the kv operations should also get the same events. 
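Editor's note: the store_op.go change above keeps GetCachedCurrentVersion from hitting PD on every call; it only fetches a fresh timestamp when the cached one is older than storageVersionCacheUpdateInterval, and otherwise serves the cached value under a mutex. Below is a self-contained sketch of that time-bounded caching pattern; the cachedTSO type, its 2-second interval, and the fetch callback are hypothetical stand-ins, not the StorageWithCurVersionCache API.

package main

import (
	"fmt"
	"sync"
	"time"
)

// cachedTSO caches the result of an expensive "current timestamp" call
// (e.g. asking PD/TSO); callers within updateInterval share the cached value.
type cachedTSO struct {
	mu             sync.Mutex
	ts             uint64
	lastUpdated    time.Time
	updateInterval time.Duration
	fetch          func() (uint64, error) // the expensive upstream call
}

func (c *cachedTSO) current() (uint64, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	// refresh only when the cached value has aged past the interval
	if time.Since(c.lastUpdated) > c.updateInterval {
		ts, err := c.fetch()
		if err != nil {
			return 0, err
		}
		c.ts = ts
		c.lastUpdated = time.Now()
	}
	return c.ts, nil
}

func main() {
	calls := 0
	cache := &cachedTSO{
		updateInterval: 2 * time.Second,
		fetch: func() (uint64, error) {
			calls++
			return uint64(time.Now().UnixNano()), nil
		},
	}
	for i := 0; i < 3; i++ {
		ts, _ := cache.current()
		fmt.Println(ts)
	}
	fmt.Println("fetched from upstream", calls, "time(s)") // 1: the later calls hit the cache
}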
diff --git a/cdc/metrics_owner.go b/cdc/metrics_owner.go index ca97e4ece24..2807ecff7c1 100644 --- a/cdc/metrics_owner.go +++ b/cdc/metrics_owner.go @@ -37,6 +37,20 @@ var ( Name: "ownership_counter", Help: "The counter of ownership increases every 5 seconds on a owner capture", }) + ownerMaintainTableNumGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "ticdc", + Subsystem: "owner", + Name: "maintain_table_num", + Help: "number of replicated tables maintained in owner", + }, []string{"changefeed", "capture", "type"}) +) + +const ( + // total tables that have been dispatched to a single processor + maintainTableTypeTotal string = "total" + // tables that are dispatched to a processor and have not been finished yet + maintainTableTypeWip string = "wip" ) // initOwnerMetrics registers all metrics used in owner @@ -44,4 +58,5 @@ func initOwnerMetrics(registry *prometheus.Registry) { registry.MustRegister(changefeedCheckpointTsGauge) registry.MustRegister(changefeedCheckpointTsLagGauge) registry.MustRegister(ownershipCounter) + registry.MustRegister(ownerMaintainTableNumGauge) } diff --git a/cdc/model/capture.go b/cdc/model/capture.go index 9be06fc2cac..f6594786211 100644 --- a/cdc/model/capture.go +++ b/cdc/model/capture.go @@ -24,6 +24,7 @@ import ( type CaptureInfo struct { ID CaptureID `json:"id"` AdvertiseAddr string `json:"address"` + Version string `json:"version"` } // Marshal using json.Marshal. diff --git a/cdc/model/capture_test.go b/cdc/model/capture_test.go index fb68c9431b6..688f0bf7726 100644 --- a/cdc/model/capture_test.go +++ b/cdc/model/capture_test.go @@ -27,11 +27,12 @@ func (s *captureSuite) TestMarshalUnmarshal(c *check.C) { info := &CaptureInfo{ ID: "9ff52aca-aea6-4022-8ec4-fbee3f2c7890", AdvertiseAddr: "127.0.0.1:8300", + Version: "dev", } - expected := []byte(`{"id":"9ff52aca-aea6-4022-8ec4-fbee3f2c7890","address":"127.0.0.1:8300"}`) + expected := `{"id":"9ff52aca-aea6-4022-8ec4-fbee3f2c7890","address":"127.0.0.1:8300","version":"dev"}` data, err := info.Marshal() c.Assert(err, check.IsNil) - c.Assert(data, check.DeepEquals, expected) + c.Assert(string(data), check.Equals, expected) decodedInfo := &CaptureInfo{} err = decodedInfo.Unmarshal(data) c.Assert(err, check.IsNil) diff --git a/cdc/model/changefeed.go b/cdc/model/changefeed.go index d325a4ac584..6985333267b 100644 --- a/cdc/model/changefeed.go +++ b/cdc/model/changefeed.go @@ -30,7 +30,7 @@ import ( ) // SortEngine is the sorter engine -type SortEngine string +type SortEngine = string // sort engines const ( @@ -85,6 +85,7 @@ type ChangeFeedInfo struct { SyncPointEnabled bool `json:"sync-point-enabled"` SyncPointInterval time.Duration `json:"sync-point-interval"` + CreatorVersion string `json:"creator-version"` } var changeFeedIDRe *regexp.Regexp = regexp.MustCompile(`^[a-zA-Z0-9]+(\-[a-zA-Z0-9]+)*$`) @@ -170,6 +171,17 @@ func (info *ChangeFeedInfo) Unmarshal(data []byte) error { return nil } +// Clone returns a cloned ChangeFeedInfo +func (info *ChangeFeedInfo) Clone() (*ChangeFeedInfo, error) { + s, err := info.Marshal() + if err != nil { + return nil, err + } + cloned := new(ChangeFeedInfo) + err = cloned.Unmarshal([]byte(s)) + return cloned, err +} + // VerifyAndFix verifies changefeed info and may fillin some fields. // If a must field is not provided, return an error. // If some necessary filed is missing but can use a default value, fillin it. 
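Editor's note: the new ChangeFeedInfo.Clone above deep-copies the struct by marshalling it to JSON and unmarshalling into a fresh value, so nested maps and pointers are not shared with the original. The sketch below shows the same round-trip clone pattern on a hypothetical feedInfo type; it is not the real model.ChangeFeedInfo.

package main

import (
	"encoding/json"
	"fmt"
)

// feedInfo is a hypothetical stand-in for a config struct with nested state.
type feedInfo struct {
	SinkURI string            `json:"sink-uri"`
	Opts    map[string]string `json:"opts"`
}

// clone deep-copies the struct via a JSON round trip; the copy shares no
// maps or pointers with the receiver, so mutating it cannot leak back.
func (f *feedInfo) clone() (*feedInfo, error) {
	data, err := json.Marshal(f)
	if err != nil {
		return nil, err
	}
	cloned := new(feedInfo)
	err = json.Unmarshal(data, cloned)
	return cloned, err
}

func main() {
	info := &feedInfo{SinkURI: "blackhole://", Opts: map[string]string{"safe-mode": "true"}}
	cloned, err := info.clone()
	if err != nil {
		panic(err)
	}
	cloned.Opts["safe-mode"] = "false"
	fmt.Println(info.Opts["safe-mode"])   // still "true"
	fmt.Println(cloned.Opts["safe-mode"]) // "false"
}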
diff --git a/cdc/model/changefeed_test.go b/cdc/model/changefeed_test.go index 88d11a5e260..9e22c35cb55 100644 --- a/cdc/model/changefeed_test.go +++ b/cdc/model/changefeed_test.go @@ -190,6 +190,30 @@ func (s *configSuite) TestVerifyAndFix(c *check.C) { c.Assert(marshalConfig1, check.Equals, marshalConfig2) } +func (s *configSuite) TestChangeFeedInfoClone(c *check.C) { + defer testleak.AfterTest(c)() + info := &ChangeFeedInfo{ + SinkURI: "blackhole://", + Opts: map[string]string{}, + StartTs: 417257993615179777, + Config: &config.ReplicaConfig{ + CaseSensitive: true, + EnableOldValue: true, + CheckGCSafePoint: true, + }, + } + + cloned, err := info.Clone() + c.Assert(err, check.IsNil) + sinkURI := "mysql://unix:/var/run/tidb.sock" + cloned.SinkURI = sinkURI + cloned.Config.EnableOldValue = false + c.Assert(cloned.SinkURI, check.Equals, sinkURI) + c.Assert(cloned.Config.EnableOldValue, check.IsFalse) + c.Assert(info.SinkURI, check.Equals, "blackhole://") + c.Assert(info.Config.EnableOldValue, check.IsTrue) +} + type changefeedSuite struct{} var _ = check.Suite(&changefeedSuite{}) diff --git a/cdc/model/owner.go b/cdc/model/owner.go index 64faa7e21bc..7b14568a181 100644 --- a/cdc/model/owner.go +++ b/cdc/model/owner.go @@ -123,6 +123,14 @@ type MoveTableJob struct { Status MoveTableStatus } +// All TableOperation flags +const ( + // Move means after the delete operation, the table will be re added. + // This field is necessary since we must persist enough information to + // restore complete table operation in case of processor or owner crashes. + OperFlagMoveTable uint64 = 1 << iota +) + // All TableOperation status const ( OperDispatched uint64 = iota @@ -132,11 +140,12 @@ const ( // TableOperation records the current information of a table migration type TableOperation struct { - Delete bool `json:"delete"` + Done bool `json:"done"` // deprecated, will be removed in the next version + Delete bool `json:"delete"` + Flag uint64 `json:"flag,omitempty"` // if the operation is a delete operation, BoundaryTs is checkpoint ts // if the operation is a add operation, BoundaryTs is start ts BoundaryTs uint64 `json:"boundary_ts"` - Done bool `json:"done"` // deprecated, will be removed in the next version Status uint64 `json:"status,omitempty"` } @@ -220,7 +229,7 @@ func (ts *TaskStatus) String() string { } // RemoveTable remove the table in TableInfos and add a remove table operation. 
-func (ts *TaskStatus) RemoveTable(id TableID, boundaryTs Ts) (*TableReplicaInfo, bool) { +func (ts *TaskStatus) RemoveTable(id TableID, boundaryTs Ts, isMoveTable bool) (*TableReplicaInfo, bool) { if ts.Tables == nil { return nil, false } @@ -232,10 +241,14 @@ func (ts *TaskStatus) RemoveTable(id TableID, boundaryTs Ts) (*TableReplicaInfo, if ts.Operation == nil { ts.Operation = make(map[TableID]*TableOperation) } - ts.Operation[id] = &TableOperation{ + op := &TableOperation{ Delete: true, BoundaryTs: boundaryTs, } + if isMoveTable { + op.Flag |= OperFlagMoveTable + } + ts.Operation[id] = op return table, true } diff --git a/cdc/model/owner_test.go b/cdc/model/owner_test.go index 3deac5d3db1..ea811233cc1 100644 --- a/cdc/model/owner_test.go +++ b/cdc/model/owner_test.go @@ -289,6 +289,31 @@ type removeTableSuite struct{} var _ = check.Suite(&removeTableSuite{}) +func (s *removeTableSuite) TestMoveTable(c *check.C) { + defer testleak.AfterTest(c)() + info := TaskStatus{ + Tables: map[TableID]*TableReplicaInfo{ + 1: {StartTs: 100}, + 2: {StartTs: 200}, + }, + } + + replicaInfo, found := info.RemoveTable(2, 300, true) + c.Assert(found, check.IsTrue) + c.Assert(replicaInfo, check.DeepEquals, &TableReplicaInfo{StartTs: 200}) + c.Assert(info.Tables, check.HasKey, int64(1)) + c.Assert(info.Tables, check.Not(check.HasKey), int64(2)) + expectedFlag := uint64(1) // OperFlagMoveTable + c.Assert(info.Operation, check.DeepEquals, map[int64]*TableOperation{ + 2: { + Delete: true, + Flag: expectedFlag, + BoundaryTs: 300, + Status: OperDispatched, + }, + }) +} + func (s *removeTableSuite) TestShouldReturnRemovedTable(c *check.C) { defer testleak.AfterTest(c)() info := TaskStatus{ @@ -300,7 +325,7 @@ func (s *removeTableSuite) TestShouldReturnRemovedTable(c *check.C) { }, } - replicaInfo, found := info.RemoveTable(2, 666) + replicaInfo, found := info.RemoveTable(2, 666, false) c.Assert(found, check.IsTrue) c.Assert(replicaInfo, check.DeepEquals, &TableReplicaInfo{StartTs: 200}) } @@ -308,7 +333,7 @@ func (s *removeTableSuite) TestShouldReturnRemovedTable(c *check.C) { func (s *removeTableSuite) TestShouldHandleTableNotFound(c *check.C) { defer testleak.AfterTest(c)() info := TaskStatus{} - _, found := info.RemoveTable(404, 666) + _, found := info.RemoveTable(404, 666, false) c.Assert(found, check.IsFalse) info = TaskStatus{ @@ -316,6 +341,6 @@ func (s *removeTableSuite) TestShouldHandleTableNotFound(c *check.C) { 1: {StartTs: 100}, }, } - _, found = info.RemoveTable(404, 666) + _, found = info.RemoveTable(404, 666, false) c.Assert(found, check.IsFalse) } diff --git a/cdc/model/sink.go b/cdc/model/sink.go index 5219544f52b..1b9212f3d4b 100644 --- a/cdc/model/sink.go +++ b/cdc/model/sink.go @@ -16,6 +16,7 @@ package model import ( "fmt" "strconv" + "sync" "github.com/pingcap/log" "github.com/pingcap/parser/model" @@ -412,11 +413,17 @@ func (d *DDLEvent) fillPreTableInfo(preTableInfo *TableInfo) { // SingleTableTxn represents a transaction which includes many row events in a single table type SingleTableTxn struct { + // data fields of SingleTableTxn Table *TableName StartTs uint64 CommitTs uint64 Rows []*RowChangedEvent ReplicaID uint64 + + // control fields of SingleTableTxn + // FinishWg is a barrier txn, after this txn is received, the worker must + // flush cached txns and call FinishWg.Done() to mark txns have been flushed. 
+ FinishWg *sync.WaitGroup } // Append adds a row changed event into SingleTableTxn diff --git a/cdc/owner.go b/cdc/owner.go index a5e4d1b2b0b..6b2aa2e3a15 100644 --- a/cdc/owner.go +++ b/cdc/owner.go @@ -158,7 +158,7 @@ func NewOwner( return owner, nil } -func (o *Owner) addCapture(info *model.CaptureInfo) { +func (o *Owner) addCapture(_ context.Context, info *model.CaptureInfo) { o.l.Lock() o.captures[info.ID] = info o.l.Unlock() @@ -167,7 +167,36 @@ func (o *Owner) addCapture(info *model.CaptureInfo) { o.rebalanceMu.Unlock() } -func (o *Owner) removeCapture(info *model.CaptureInfo) { +// When a table is moved from one capture to another, the workflow is as follows +// 1. Owner deletes the table from the original capture (we call it capture-1), +// and adds an table operation record in the task status +// 2. The processor in capture-1 reads the operation record, and waits the table +// checkpoint ts reaches the boundary ts in operation, which often equals to +// the global resovled ts, larger the current checkpoint ts of this table. +// 3. After table checkpoint ts reaches boundary ts, capture-1 marks the table +// operation as finished. +// 4. Owner reads the finished mark and re-dispatches this table to another capture. +// +// When capture-1 crashes between step-2 and step-3, this function should be +// called to let owner re dispatch the table. Besides owner could also crash at +// the same time, in that case this function should also be called. In addtition, +// this function only handles move table job: 1) the add table job persists both +// table replicaInfo and operation, we can recover enough information from table +// replicaInfo; 2) if a table is deleted from a capture and that capture crashes, +// we just ignore this table. +func (o *Owner) rebuildTableFromOperations(cf *changeFeed, taskStatus *model.TaskStatus, startTs uint64) { + for tableID, op := range taskStatus.Operation { + if op.Delete && op.Flag&model.OperFlagMoveTable > 0 { + cf.orphanTables[tableID] = startTs + if job, ok := cf.moveTableJobs[tableID]; ok { + log.Info("remove outdated move table job", zap.Reflect("job", job), zap.Uint64("start-ts", startTs)) + delete(cf.moveTableJobs, tableID) + } + } + } +} + +func (o *Owner) removeCapture(ctx context.Context, info *model.CaptureInfo) { o.l.Lock() defer o.l.Unlock() @@ -202,19 +231,22 @@ func (o *Owner) removeCapture(info *model.CaptureInfo) { } } - ctx := context.TODO() - if err := o.etcdClient.DeleteTaskStatus(ctx, feed.id, info.ID); err != nil { + o.rebuildTableFromOperations(feed, task, startTs) + + if err := o.etcdClient.LeaseGuardDeleteTaskStatus(ctx, feed.id, info.ID, o.session.Lease()); err != nil { log.Warn("failed to delete task status", zap.String("capture-id", info.ID), zap.String("changefeed", feed.id), zap.Error(err)) } - if err := o.etcdClient.DeleteTaskPosition(ctx, feed.id, info.ID); err != nil { + if err := o.etcdClient.LeaseGuardDeleteTaskPosition(ctx, feed.id, info.ID, o.session.Lease()); err != nil { log.Warn("failed to delete task position", zap.String("capture-id", info.ID), zap.String("changefeed", feed.id), zap.Error(err)) } - if err := o.etcdClient.DeleteTaskWorkload(ctx, feed.id, info.ID); err != nil { + if err := o.etcdClient.LeaseGuardDeleteTaskWorkload(ctx, feed.id, info.ID, o.session.Lease()); err != nil { log.Warn("failed to delete task workload", zap.String("capture-id", info.ID), zap.String("changefeed", feed.id), zap.Error(err)) } + ownerMaintainTableNumGauge.DeleteLabelValues(feed.id, info.AdvertiseAddr, maintainTableTypeTotal) + 
ownerMaintainTableNumGauge.DeleteLabelValues(feed.id, info.AdvertiseAddr, maintainTableTypeWip) } } @@ -267,6 +299,7 @@ func (o *Owner) newChangeFeed( return nil, errors.Trace(err) } + // TODO delete if info.Engine == model.SortInFile { err = os.MkdirAll(info.SortDir, 0o755) if err != nil { @@ -439,6 +472,7 @@ func (o *Owner) newChangeFeed( taskStatus: processorsInfos, taskPositions: taskPositions, etcdCli: o.etcdClient, + leaseID: o.session.Lease(), filter: filter, sink: primarySink, cyclicEnabled: info.Config.Cyclic.IsEnabled(), @@ -463,7 +497,7 @@ func (o *Owner) checkAndCleanTasksInfo(ctx context.Context) error { if cerror.ErrDecodeFailed.NotEqual(err) { return errors.Trace(err) } - err := o.cfRWriter.RemoveAllTaskStatus(ctx, changefeedID) + err := o.cfRWriter.LeaseGuardRemoveAllTaskStatus(ctx, changefeedID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -524,7 +558,7 @@ func (o *Owner) loadChangeFeeds(ctx context.Context) error { } needSave, canInit := cfInfo.CheckErrorHistory() if needSave { - err := o.etcdClient.SaveChangeFeedInfo(ctx, cfInfo, changeFeedID) + err := o.etcdClient.LeaseGuardSaveChangeFeedInfo(ctx, cfInfo, changeFeedID, o.session.Lease()) if err != nil { return err } @@ -589,14 +623,14 @@ func (o *Owner) loadChangeFeeds(ctx context.Context) error { log.Error("create changefeed with fast fail error, mark changefeed as failed", zap.Error(err), zap.String("changefeed", changeFeedID)) cfInfo.State = model.StateFailed - err := o.etcdClient.SaveChangeFeedInfo(ctx, cfInfo, changeFeedID) + err := o.etcdClient.LeaseGuardSaveChangeFeedInfo(ctx, cfInfo, changeFeedID, o.session.Lease()) if err != nil { return err } continue } - err2 := o.etcdClient.SaveChangeFeedInfo(ctx, cfInfo, changeFeedID) + err2 := o.etcdClient.LeaseGuardSaveChangeFeedInfo(ctx, cfInfo, changeFeedID, o.session.Lease()) if err2 != nil { return err2 } @@ -698,7 +732,7 @@ func (o *Owner) flushChangeFeedInfos(ctx context.Context) error { changefeedCheckpointTsLagGauge.WithLabelValues(id).Set(float64(oracle.GetPhysical(time.Now())-phyTs) / 1e3) } if time.Since(o.lastFlushChangefeeds) > o.flushChangefeedInterval { - err := o.cfRWriter.PutAllChangeFeedStatus(ctx, snapshot) + err := o.cfRWriter.LeaseGuardPutAllChangeFeedStatus(ctx, snapshot, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -807,16 +841,20 @@ func (o *Owner) handleSyncPoint(ctx context.Context) error { } // dispatchJob dispatches job to processors +// Note job type in this function contains pause, remove and finish func (o *Owner) dispatchJob(ctx context.Context, job model.AdminJob) error { cf, ok := o.changeFeeds[job.CfID] if !ok { return cerror.ErrOwnerChangefeedNotFound.GenWithStackByArgs(job.CfID) } for captureID := range cf.taskStatus { - newStatus, _, err := cf.etcdCli.AtomicPutTaskStatus(ctx, cf.id, captureID, func(modRevision int64, taskStatus *model.TaskStatus) (bool, error) { - taskStatus.AdminJobType = job.Type - return true, nil - }) + newStatus, _, err := cf.etcdCli.LeaseGuardAtomicPutTaskStatus( + ctx, cf.id, captureID, o.session.Lease(), + func(modRevision int64, taskStatus *model.TaskStatus) (bool, error) { + taskStatus.AdminJobType = job.Type + return true, nil + }, + ) if err != nil { return errors.Trace(err) } @@ -825,7 +863,7 @@ func (o *Owner) dispatchJob(ctx context.Context, job model.AdminJob) error { // record admin job in changefeed status cf.status.AdminJobType = job.Type infos := map[model.ChangeFeedID]*model.ChangeFeedStatus{job.CfID: cf.status} - err := 
o.cfRWriter.PutAllChangeFeedStatus(ctx, infos) + err := o.cfRWriter.LeaseGuardPutAllChangeFeedStatus(ctx, infos, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -836,6 +874,15 @@ func (o *Owner) dispatchJob(ctx context.Context, job model.AdminJob) error { if job.Type == model.AdminStop { o.stoppedFeeds[job.CfID] = cf.status } + for captureID := range cf.taskStatus { + capture, ok := o.captures[captureID] + if !ok { + log.Warn("capture not found", zap.String("capture-id", captureID)) + continue + } + ownerMaintainTableNumGauge.DeleteLabelValues(cf.id, capture.AdvertiseAddr, maintainTableTypeTotal) + ownerMaintainTableNumGauge.DeleteLabelValues(cf.id, capture.AdvertiseAddr, maintainTableTypeWip) + } delete(o.changeFeeds, job.CfID) return nil } @@ -899,6 +946,17 @@ func (o *Owner) checkClusterHealth(_ context.Context) error { } } } + for _, cf := range o.changeFeeds { + for captureID, pinfo := range cf.taskStatus { + capture, ok := o.captures[captureID] + if !ok { + log.Warn("capture not found", zap.String("capture-id", captureID)) + continue + } + ownerMaintainTableNumGauge.WithLabelValues(cf.id, capture.AdvertiseAddr, maintainTableTypeTotal).Set(float64(len(pinfo.Tables))) + ownerMaintainTableNumGauge.WithLabelValues(cf.id, capture.AdvertiseAddr, maintainTableTypeWip).Set(float64(len(pinfo.Operation))) + } + } // TODO: check processor normal exited return nil } @@ -922,7 +980,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { if feedState == model.StateFailed && job.Type == model.AdminRemove { // changefeed in failed state, but changefeed status has not // been created yet. Try to remove changefeed info only. - err := o.etcdClient.DeleteChangeFeedInfo(ctx, job.CfID) + err := o.etcdClient.LeaseGuardDeleteChangeFeedInfo(ctx, job.CfID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -955,7 +1013,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { cf.info.ErrorHis = append(cf.info.ErrorHis, time.Now().UnixNano()/1e6) } - err := o.etcdClient.SaveChangeFeedInfo(ctx, cf.info, job.CfID) + err := o.etcdClient.LeaseGuardSaveChangeFeedInfo(ctx, cf.info, job.CfID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -977,7 +1035,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { case model.StateRemoved, model.StateFinished: // remove a removed or finished changefeed if job.Opts != nil && job.Opts.ForceRemove { - err := o.etcdClient.RemoveChangeFeedStatus(ctx, job.CfID) + err := o.etcdClient.LeaseGuardRemoveChangeFeedStatus(ctx, job.CfID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -988,7 +1046,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { case model.StateStopped, model.StateFailed: // remove a paused or failed changefeed status.AdminJobType = model.AdminRemove - err = o.etcdClient.PutChangeFeedStatus(ctx, job.CfID, status) + err = o.etcdClient.LeaseGuardPutChangeFeedStatus(ctx, job.CfID, status, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -1004,7 +1062,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { } if job.Opts != nil && job.Opts.ForceRemove { // if `ForceRemove` is enabled, remove all information related to this changefeed - err := o.etcdClient.RemoveChangeFeedStatus(ctx, job.CfID) + err := o.etcdClient.LeaseGuardRemoveChangeFeedStatus(ctx, job.CfID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -1032,7 +1090,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { // set admin job in changefeed status to tell 
owner resume changefeed status.AdminJobType = model.AdminResume - err = o.etcdClient.PutChangeFeedStatus(ctx, job.CfID, status) + err = o.etcdClient.LeaseGuardPutChangeFeedStatus(ctx, job.CfID, status, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -1042,7 +1100,7 @@ func (o *Owner) handleAdminJob(ctx context.Context) error { // clear last running error cfInfo.State = model.StateNormal cfInfo.Error = nil - err = o.etcdClient.SaveChangeFeedInfo(ctx, cfInfo, job.CfID) + err = o.etcdClient.LeaseGuardSaveChangeFeedInfo(ctx, cfInfo, job.CfID, o.session.Lease()) if err != nil { return errors.Trace(err) } @@ -1143,7 +1201,14 @@ loop: err = o.run(ctx) if err != nil { - if errors.Cause(err) != context.Canceled { + switch errors.Cause(err) { + case context.DeadlineExceeded: + // context timeout means the o.run doesn't finish in a safe owner + // lease cycle, it is safe to retry. If the lease is revoked, + // another run loop will detect it. + continue loop + case context.Canceled: + default: log.Error("owner exited with error", zap.Error(err)) } break loop @@ -1393,15 +1458,18 @@ func (o *Owner) cleanUpStaleTasks(ctx context.Context) error { } o.addOrphanTable(changeFeedID, tableID, startTs) } + if cf, ok := o.changeFeeds[changeFeedID]; ok { + o.rebuildTableFromOperations(cf, status, cf.status.CheckpointTs) + } } - if err := o.etcdClient.DeleteTaskStatus(ctx, changeFeedID, captureID); err != nil { + if err := o.etcdClient.LeaseGuardDeleteTaskStatus(ctx, changeFeedID, captureID, o.session.Lease()); err != nil { return errors.Trace(err) } - if err := o.etcdClient.DeleteTaskPosition(ctx, changeFeedID, captureID); err != nil { + if err := o.etcdClient.LeaseGuardDeleteTaskPosition(ctx, changeFeedID, captureID, o.session.Lease()); err != nil { return errors.Trace(err) } - if err := o.etcdClient.DeleteTaskWorkload(ctx, changeFeedID, captureID); err != nil { + if err := o.etcdClient.LeaseGuardDeleteTaskWorkload(ctx, changeFeedID, captureID, o.session.Lease()); err != nil { return errors.Trace(err) } log.Info("cleanup stale task", zap.String("capture-id", captureID), zap.String("changefeed", changeFeedID)) @@ -1471,7 +1539,7 @@ func (o *Owner) watchCapture(ctx context.Context) error { log.Info("delete capture", zap.String("capture-id", c.ID), zap.String("capture", c.AdvertiseAddr)) - o.removeCapture(c) + o.removeCapture(ctx, c) case clientv3.EventTypePut: if !ev.IsCreate() { continue @@ -1482,7 +1550,7 @@ func (o *Owner) watchCapture(ctx context.Context) error { log.Info("add capture", zap.String("capture-id", c.ID), zap.String("capture", c.AdvertiseAddr)) - o.addCapture(c) + o.addCapture(ctx, c) } } } @@ -1491,11 +1559,11 @@ func (o *Owner) watchCapture(ctx context.Context) error { func (o *Owner) rebuildCaptureEvents(ctx context.Context, captures map[model.CaptureID]*model.CaptureInfo) error { for _, c := range captures { - o.addCapture(c) + o.addCapture(ctx, c) } for _, c := range o.captures { if _, ok := captures[c.ID]; !ok { - o.removeCapture(c) + o.removeCapture(ctx, c) } } // captureLoaded is used to check whether the owner can execute cleanup stale tasks job. 
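Editor's note: most of the owner.go changes above route etcd writes through the LeaseGuard* helpers, which first check that the owner's session lease is still alive and then bound the write with a context whose timeout equals the remaining TTL, so an owner that has already lost its lease cannot keep mutating shared state. The sketch below captures the shape of that guard against a small leaseAPI interface; the interface, fakeLease, and errLeaseExpired are illustrative assumptions, not the ticdc or etcd client API.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// leaseAPI is a hypothetical abstraction over an etcd-style lease API: it
// reports the remaining TTL of a lease in seconds, or -1 if it has expired.
type leaseAPI interface {
	TimeToLive(ctx context.Context, leaseID int64) (ttlSeconds int64, err error)
}

var errLeaseExpired = errors.New("lease expired")

// withSafeLease mirrors the lease-guard idea: fail fast if the lease is gone,
// otherwise return a context that cannot outlive the remaining TTL.
func withSafeLease(ctx context.Context, api leaseAPI, leaseID int64) (context.Context, context.CancelFunc, error) {
	ttl, err := api.TimeToLive(ctx, leaseID)
	if err != nil {
		return nil, nil, err
	}
	if ttl == -1 {
		return nil, nil, errLeaseExpired
	}
	guarded, cancel := context.WithTimeout(ctx, time.Duration(ttl)*time.Second)
	return guarded, cancel, nil
}

// fakeLease always reports 2 seconds of TTL left.
type fakeLease struct{}

func (fakeLease) TimeToLive(context.Context, int64) (int64, error) { return 2, nil }

func main() {
	ctx, cancel, err := withSafeLease(context.Background(), fakeLease{}, 42)
	if err != nil {
		panic(err)
	}
	defer cancel()
	deadline, _ := ctx.Deadline()
	fmt.Println("guarded write must finish before", deadline)
}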
diff --git a/cdc/owner_test.go b/cdc/owner_test.go index 60216871dae..3867457292a 100644 --- a/cdc/owner_test.go +++ b/cdc/owner_test.go @@ -43,6 +43,7 @@ import ( "github.com/pingcap/tidb/store/mockstore" pd "github.com/tikv/pd/client" "go.etcd.io/etcd/clientv3" + "go.etcd.io/etcd/clientv3/concurrency" "go.etcd.io/etcd/embed" "golang.org/x/sync/errgroup" ) @@ -105,13 +106,18 @@ func (m *mockPDClient) UpdateServiceGCSafePoint(ctx context.Context, serviceID s func (s *ownerSuite) TestOwnerFlushChangeFeedInfos(c *check.C) { defer testleak.AfterTest(c)() + session, err := concurrency.NewSession(s.client.Client.Unwrap(), + concurrency.WithTTL(config.GetDefaultServerConfig().CaptureSessionTTL)) + c.Assert(err, check.IsNil) mockPDCli := &mockPDClient{} mockOwner := Owner{ + session: session, + etcdClient: s.client, pdClient: mockPDCli, gcSafepointLastUpdate: time.Now(), } - err := mockOwner.flushChangeFeedInfos(s.ctx) + err = mockOwner.flushChangeFeedInfos(s.ctx) c.Assert(err, check.IsNil) c.Assert(mockPDCli.invokeCounter, check.Equals, 1) s.TearDownTest(c) @@ -142,7 +148,11 @@ func (s *ownerSuite) TestOwnerFlushChangeFeedInfosFailed(c *check.C) { }, } + session, err := concurrency.NewSession(s.client.Client.Unwrap(), + concurrency.WithTTL(config.GetDefaultServerConfig().CaptureSessionTTL)) + c.Assert(err, check.IsNil) mockOwner := Owner{ + session: session, pdClient: mockPDCli, etcdClient: s.client, lastFlushChangefeeds: time.Now(), @@ -153,7 +163,7 @@ func (s *ownerSuite) TestOwnerFlushChangeFeedInfosFailed(c *check.C) { } time.Sleep(3 * time.Second) - err := mockOwner.flushChangeFeedInfos(s.ctx) + err = mockOwner.flushChangeFeedInfos(s.ctx) c.Assert(err, check.IsNil) c.Assert(mockPDCli.invokeCounter, check.Equals, 1) @@ -208,8 +218,12 @@ func (s *ownerSuite) TestOwnerUploadGCSafePointOutdated(c *check.C) { }, } + session, err := concurrency.NewSession(s.client.Client.Unwrap(), + concurrency.WithTTL(config.GetDefaultServerConfig().CaptureSessionTTL)) + c.Assert(err, check.IsNil) mockOwner := Owner{ pdClient: mockPDCli, + session: session, etcdClient: s.client, lastFlushChangefeeds: time.Now(), flushChangefeedInterval: 1 * time.Hour, @@ -218,7 +232,7 @@ func (s *ownerSuite) TestOwnerUploadGCSafePointOutdated(c *check.C) { stoppedFeeds: make(map[model.ChangeFeedID]*model.ChangeFeedStatus), } - err := mockOwner.flushChangeFeedInfos(s.ctx) + err = mockOwner.flushChangeFeedInfos(s.ctx) c.Assert(err, check.IsNil) c.Assert(mockPDCli.invokeCounter, check.Equals, 1) @@ -588,6 +602,7 @@ func (s *ownerSuite) TestDDL(c *check.C) { c.Assert(errors.Cause(err), check.DeepEquals, context.Canceled) } */ +var cdcGCSafePointTTL4Test = int64(24 * 60 * 60) func (s *ownerSuite) TestHandleAdmin(c *check.C) { defer testleak.AfterTest(c)() @@ -629,12 +644,12 @@ func (s *ownerSuite) TestHandleAdmin(c *check.C) { sampleCF.sink = sink capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, - &security.Credential{}, "127.0.0.1:12034", &processorOpts{flushCheckpointInterval: time.Millisecond * 200}) + &security.Credential{}, "127.0.0.1:12034", &captureOpts{flushCheckpointInterval: time.Millisecond * 200}) c.Assert(err, check.IsNil) err = capture.Campaign(ctx) c.Assert(err, check.IsNil) - owner, err := NewOwner(ctx, nil, &security.Credential{}, capture.session, DefaultCDCGCSafePointTTL, time.Millisecond*200) + owner, err := NewOwner(ctx, nil, &security.Credential{}, capture.session, cdcGCSafePointTTL4Test, time.Millisecond*200) c.Assert(err, check.IsNil) sampleCF.etcdCli = owner.etcdClient @@ -931,14 +946,14 @@ 
func (s *ownerSuite) TestWatchCampaignKey(c *check.C) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, - &security.Credential{}, "127.0.0.1:12034", &processorOpts{}) + &security.Credential{}, "127.0.0.1:12034", &captureOpts{}) c.Assert(err, check.IsNil) err = capture.Campaign(ctx) c.Assert(err, check.IsNil) ctx1, cancel1 := context.WithCancel(ctx) owner, err := NewOwner(ctx1, nil, &security.Credential{}, capture.session, - DefaultCDCGCSafePointTTL, time.Millisecond*200) + cdcGCSafePointTTL4Test, time.Millisecond*200) c.Assert(err, check.IsNil) // check campaign key deleted can be detected @@ -989,7 +1004,7 @@ func (s *ownerSuite) TestCleanUpStaleTasks(c *check.C) { addr := "127.0.0.1:12034" ctx = util.PutCaptureAddrInCtx(ctx, addr) capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, - &security.Credential{}, addr, &processorOpts{}) + &security.Credential{}, addr, &captureOpts{}) c.Assert(err, check.IsNil) err = s.client.PutCaptureInfo(ctx, capture.info, capture.session.Lease()) c.Assert(err, check.IsNil) @@ -1020,7 +1035,7 @@ func (s *ownerSuite) TestCleanUpStaleTasks(c *check.C) { captures[c.ID] = c } owner, err := NewOwner(ctx, nil, &security.Credential{}, capture.session, - DefaultCDCGCSafePointTTL, time.Millisecond*200) + cdcGCSafePointTTL4Test, time.Millisecond*200) c.Assert(err, check.IsNil) // It is better to update changefeed information by `loadChangeFeeds`, however // `loadChangeFeeds` is too overweight, just mock enough information here. @@ -1028,6 +1043,9 @@ func (s *ownerSuite) TestCleanUpStaleTasks(c *check.C) { changefeed: { id: changefeed, orphanTables: make(map[model.TableID]model.Ts), + status: &model.ChangeFeedStatus{ + CheckpointTs: 100, + }, }, } @@ -1073,10 +1091,10 @@ func (s *ownerSuite) TestWatchFeedChange(c *check.C) { addr := "127.0.0.1:12034" ctx = util.PutCaptureAddrInCtx(ctx, addr) capture, err := NewCapture(ctx, []string{s.clientURL.String()}, nil, - &security.Credential{}, addr, &processorOpts{}) + &security.Credential{}, addr, &captureOpts{}) c.Assert(err, check.IsNil) owner, err := NewOwner(ctx, nil, &security.Credential{}, capture.session, - DefaultCDCGCSafePointTTL, time.Millisecond*200) + cdcGCSafePointTTL4Test, time.Millisecond*200) c.Assert(err, check.IsNil) var ( diff --git a/cdc/processor.go b/cdc/processor.go index fa64f5e06b5..537882d8464 100644 --- a/cdc/processor.go +++ b/cdc/processor.go @@ -832,7 +832,7 @@ func (p *oldProcessor) addTable(ctx context.Context, tableID int64, replicaInfo switch p.changefeed.Engine { case model.SortInMemory: sorter = puller.NewEntrySorter() - case model.SortInFile, model.SortUnified: + case model.SortInFile: err := util.IsDirAndWritable(p.changefeed.SortDir) if err != nil { if os.IsNotExist(errors.Cause(err)) { @@ -847,16 +847,22 @@ func (p *oldProcessor) addTable(ctx context.Context, tableID int64, replicaInfo } } - if p.changefeed.Engine == model.SortInFile { - sorter = puller.NewFileSorter(p.changefeed.SortDir) - } else { - // Unified Sorter - sorter = psorter.NewUnifiedSorter(p.changefeed.SortDir, tableName, util.CaptureAddrFromCtx(ctx)) + sorter = puller.NewFileSorter(p.changefeed.SortDir) + case model.SortUnified: + err := psorter.UnifiedSorterCheckDir(p.changefeed.SortDir) + if err != nil { + p.sendError(errors.Trace(err)) + return nil } + sorter = psorter.NewUnifiedSorter(p.changefeed.SortDir, p.changefeedID, tableName, tableID, util.CaptureAddrFromCtx(ctx)) default: 
p.sendError(cerror.ErrUnknownSortEngine.GenWithStackByArgs(p.changefeed.Engine)) return nil } + failpoint.Inject("ProcessorAddTableError", func() { + p.sendError(errors.New("processor add table injected error")) + failpoint.Return(nil) + }) go func() { err := sorter.Run(ctx) if errors.Cause(err) != context.Canceled { @@ -909,6 +915,8 @@ func (p *oldProcessor) addTable(ctx context.Context, tableID int64, replicaInfo syncTableNumGauge.WithLabelValues(p.changefeedID, p.captureInfo.AdvertiseAddr).Inc() } +const maxLagWithCheckpointTs = (30 * 1000) << 18 // 30s + // sorterConsume receives sorted PolymorphicEvent from sorter of each table and // sends to processor's output chan func (p *oldProcessor) sorterConsume( @@ -921,7 +929,7 @@ func (p *oldProcessor) sorterConsume( replicaInfo *model.TableReplicaInfo, sink sink.Sink, ) { - var lastResolvedTs uint64 + var lastResolvedTs, lastCheckPointTs uint64 opDone := false resolvedTsGauge := tableResolvedTsGauge.WithLabelValues(p.changefeedID, p.captureInfo.AdvertiseAddr, tableName) checkDoneTicker := time.NewTicker(1 * time.Second) @@ -1001,7 +1009,7 @@ func (p *oldProcessor) sorterConsume( return nil } - globalResolvedTsReceiver, err := p.globalResolvedTsNotifier.NewReceiver(1 * time.Second) + globalResolvedTsReceiver, err := p.globalResolvedTsNotifier.NewReceiver(500 * time.Millisecond) if err != nil { if errors.Cause(err) != context.Canceled { p.errCh <- errors.Trace(err) @@ -1010,6 +1018,40 @@ func (p *oldProcessor) sorterConsume( } defer globalResolvedTsReceiver.Stop() + sendResolvedTs2Sink := func() error { + localResolvedTs := atomic.LoadUint64(&p.localResolvedTs) + globalResolvedTs := atomic.LoadUint64(&p.globalResolvedTs) + var minTs uint64 + if localResolvedTs < globalResolvedTs { + minTs = localResolvedTs + log.Warn("the local resolved ts is less than the global resolved ts", + zap.Uint64("localResolvedTs", localResolvedTs), zap.Uint64("globalResolvedTs", globalResolvedTs)) + } else { + minTs = globalResolvedTs + } + if minTs == 0 { + return nil + } + + checkpointTs, err := sink.FlushRowChangedEvents(ctx, minTs) + if err != nil { + if errors.Cause(err) != context.Canceled { + p.sendError(errors.Trace(err)) + } + return err + } + lastCheckPointTs = checkpointTs + + if checkpointTs < replicaInfo.StartTs { + checkpointTs = replicaInfo.StartTs + } + + if checkpointTs != 0 { + atomic.StoreUint64(pCheckpointTs, checkpointTs) + p.localCheckpointTsNotifier.Notify() + } + return nil + } for { select { case <-ctx.Done(): @@ -1022,6 +1064,28 @@ func (p *oldProcessor) sorterConsume( continue } + for lastResolvedTs > maxLagWithCheckpointTs+lastCheckPointTs { + log.Debug("the lag between local checkpoint Ts and local resolved Ts is too lang", + zap.Uint64("resolvedTs", lastResolvedTs), zap.Uint64("lastCheckPointTs", lastCheckPointTs), + zap.Int64("tableID", tableID), util.ZapFieldChangefeed(ctx)) + select { + case <-ctx.Done(): + if ctx.Err() != context.Canceled { + p.sendError(errors.Trace(ctx.Err())) + } + return + case <-globalResolvedTsReceiver.C: + if err := sendResolvedTs2Sink(); err != nil { + // error is already sent to processor, so we can just ignore it + return + } + case <-checkDoneTicker.C: + if !opDone { + checkDone() + } + } + } + pEvent.SetUpFinishedChan() select { case <-ctx.Done(): @@ -1073,36 +1137,10 @@ func (p *oldProcessor) sorterConsume( return } case <-globalResolvedTsReceiver.C: - localResolvedTs := atomic.LoadUint64(&p.localResolvedTs) - globalResolvedTs := atomic.LoadUint64(&p.globalResolvedTs) - var minTs uint64 - if 
localResolvedTs < globalResolvedTs { - minTs = localResolvedTs - log.Warn("the local resolved ts is less than the global resolved ts", - zap.Uint64("localResolvedTs", localResolvedTs), zap.Uint64("globalResolvedTs", globalResolvedTs)) - } else { - minTs = globalResolvedTs - } - if minTs == 0 { - continue - } - - checkpointTs, err := sink.FlushRowChangedEvents(ctx, minTs) - if err != nil { - if errors.Cause(err) != context.Canceled { - p.errCh <- errors.Trace(err) - } + if err := sendResolvedTs2Sink(); err != nil { + // error is already sent to processor, so we can just ignore it return } - - if checkpointTs < replicaInfo.StartTs { - checkpointTs = replicaInfo.StartTs - } - - if checkpointTs != 0 { - atomic.StoreUint64(pCheckpointTs, checkpointTs) - p.localCheckpointTsNotifier.Notify() - } case <-checkDoneTicker.C: if !opDone { checkDone() diff --git a/cdc/processor/pipeline/sorter.go b/cdc/processor/pipeline/sorter.go index 2c925a88b79..31c48f31f2b 100644 --- a/cdc/processor/pipeline/sorter.go +++ b/cdc/processor/pipeline/sorter.go @@ -18,6 +18,7 @@ import ( "os" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" "github.com/pingcap/ticdc/cdc/model" "github.com/pingcap/ticdc/cdc/puller" psorter "github.com/pingcap/ticdc/cdc/puller/sorter" @@ -31,16 +32,27 @@ type sorterNode struct { sortEngine model.SortEngine sortDir string sorter puller.EventSorter - tableName string // quoted schema and table, used in metircs only - wg errgroup.Group - cancel context.CancelFunc + + changeFeedID model.ChangeFeedID + tableID model.TableID + tableName string // quoted schema and table, used in metircs only + + wg errgroup.Group + cancel context.CancelFunc } -func newSorterNode(sortEngine model.SortEngine, sortDir string, tableName string) pipeline.Node { +func newSorterNode( + sortEngine model.SortEngine, + sortDir string, + changeFeedID model.ChangeFeedID, + tableName string, tableID model.TableID) pipeline.Node { return &sorterNode{ sortEngine: sortEngine, sortDir: sortDir, - tableName: tableName, + + changeFeedID: changeFeedID, + tableID: tableID, + tableName: tableName, } } @@ -51,7 +63,7 @@ func (n *sorterNode) Init(ctx pipeline.NodeContext) error { switch n.sortEngine { case model.SortInMemory: sorter = puller.NewEntrySorter() - case model.SortInFile, model.SortUnified: + case model.SortInFile: err := util.IsDirAndWritable(n.sortDir) if err != nil { if os.IsNotExist(errors.Cause(err)) { @@ -64,15 +76,19 @@ func (n *sorterNode) Init(ctx pipeline.NodeContext) error { } } - if n.sortEngine == model.SortInFile { - sorter = puller.NewFileSorter(n.sortDir) - } else { - // Unified Sorter - sorter = psorter.NewUnifiedSorter(n.sortDir, n.tableName, ctx.Vars().CaptureAddr) + sorter = puller.NewFileSorter(n.sortDir) + case model.SortUnified: + err := psorter.UnifiedSorterCheckDir(n.sortDir) + if err != nil { + return errors.Trace(err) } + sorter = psorter.NewUnifiedSorter(n.sortDir, n.changeFeedID, n.tableName, n.tableID, ctx.Vars().CaptureAddr) default: return cerror.ErrUnknownSortEngine.GenWithStackByArgs(n.sortEngine) } + failpoint.Inject("ProcessorAddTableError", func() { + failpoint.Return(errors.New("processor add table injected error")) + }) n.wg.Go(func() error { ctx.Throw(errors.Trace(sorter.Run(stdCtx))) return nil diff --git a/cdc/processor/pipeline/table.go b/cdc/processor/pipeline/table.go index 96ecb584257..f218ff65a94 100644 --- a/cdc/processor/pipeline/table.go +++ b/cdc/processor/pipeline/table.go @@ -156,7 +156,7 @@ func NewTablePipeline(ctx context.Context, ctx, p := 
pipeline.NewPipeline(ctx, 500*time.Millisecond) p.AppendNode(ctx, "puller", newPullerNode(changefeedID, credential, kvStorage, limitter, tableID, replicaInfo, tableName)) - p.AppendNode(ctx, "sorter", newSorterNode(sortEngine, sortDir, tableName)) + p.AppendNode(ctx, "sorter", newSorterNode(sortEngine, sortDir, changefeedID, tableName, tableID)) p.AppendNode(ctx, "mounter", newMounterNode(mounter)) config := ctx.Vars().Config if config.Cyclic != nil && config.Cyclic.IsEnabled() { diff --git a/cdc/processor/processor.go b/cdc/processor/processor.go index a72823a0b99..a47a939dd9f 100644 --- a/cdc/processor/processor.go +++ b/cdc/processor/processor.go @@ -161,7 +161,7 @@ func (p *processor) tick(ctx context.Context, state *changefeedState) (nextState if err := p.lazyInit(ctx); err != nil { return nil, errors.Trace(err) } - if skip := p.initPosition(); skip { + if skip := p.checkPosition(); skip { return p.changefeed, nil } if err := p.handleTableOperation(ctx); err != nil { @@ -185,9 +185,9 @@ func (p *processor) tick(ctx context.Context, state *changefeedState) (nextState return p.changefeed, nil } -// initPosition create a new task position, and put it into the etcd state. +// checkPosition create a new task position, and put it into the etcd state. // task position maybe be not exist only when the processor is running first time. -func (p *processor) initPosition() bool { +func (p *processor) checkPosition() bool { if p.changefeed.TaskPosition != nil { return false } @@ -322,12 +322,14 @@ func (p *processor) handleErrorCh(ctx context.Context) error { func (p *processor) handleTableOperation(ctx context.Context) error { patchOperation := func(tableID model.TableID, fn func(operation *model.TableOperation) error) { p.changefeed.PatchTaskStatus(func(status *model.TaskStatus) (*model.TaskStatus, error) { - if status.Operation == nil { - log.Panic("Operation not found, may be remove by other patch", zap.Int64("tableID", tableID), zap.Any("status", status)) + if status == nil || status.Operation == nil { + log.Error("Operation not found, may be remove by other patch", zap.Int64("tableID", tableID), zap.Any("status", status)) + return nil, cerror.ErrTaskStatusNotExists.GenWithStackByArgs() } opt := status.Operation[tableID] if opt == nil { - log.Panic("Operation not found, may be remove by other patch", zap.Int64("tableID", tableID), zap.Any("status", status)) + log.Error("Operation not found, may be remove by other patch", zap.Int64("tableID", tableID), zap.Any("status", status)) + return nil, cerror.ErrTaskStatusNotExists.GenWithStackByArgs() } if err := fn(opt); err != nil { return nil, errors.Trace(err) @@ -338,6 +340,10 @@ func (p *processor) handleTableOperation(ctx context.Context) error { // TODO: 👇👇 remove this six lines after the new owner is implemented, applied operation should be removed by owner if !p.changefeed.TaskStatus.SomeOperationsUnapplied() && len(p.changefeed.TaskStatus.Operation) != 0 { p.changefeed.PatchTaskStatus(func(status *model.TaskStatus) (*model.TaskStatus, error) { + if status == nil { + // for safety, status should never be nil + return nil, nil + } status.Operation = nil return status, nil }) @@ -562,7 +568,6 @@ func (p *processor) handlePosition() error { minCheckpointTs := minResolvedTs for _, table := range p.tables { ts := table.CheckpointTs() - if ts < minCheckpointTs { minCheckpointTs = ts } @@ -580,10 +585,17 @@ func (p *processor) handlePosition() error { p.metricCheckpointTsLagGauge.Set(float64(oracle.GetPhysical(time.Now())-checkpointPhyTs) / 1e3) 
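Both the checkpoint-lag gauge above (which converts a TSO physical part back into seconds) and the `maxLagWithCheckpointTs = (30 * 1000) << 18` guard added to `sorterConsume` depend on the TSO layout used throughout TiDB/TiKV: the low 18 bits of a timestamp hold a logical counter, the higher bits hold physical milliseconds. A minimal sketch of that arithmetic, with plain bit operations standing in for the `oracle` helpers:

package main

import "fmt"

// The low 18 bits of a TSO are a logical counter; the rest is physical time in ms.
const logicalBits = 18

// extractPhysical mirrors what oracle.ExtractPhysical computes.
func extractPhysical(ts uint64) int64 { return int64(ts >> logicalBits) }

// composeTS packs physical milliseconds and a logical counter into one TSO.
func composeTS(physicalMs, logical int64) uint64 {
	return uint64(physicalMs)<<logicalBits | uint64(logical)
}

func main() {
	// (30 * 1000) << 18 therefore reads as "30 000 ms of physical time in TSO units":
	// once a table's resolved ts runs more than ~30s ahead of its last checkpoint,
	// sorterConsume stops pulling events until the sink catches up.
	lag := uint64(30*1000) << logicalBits
	fmt.Println(extractPhysical(lag)) // 30000 ms

	ts := composeTS(1600000000000, 5)
	fmt.Println(extractPhysical(ts)) // 1600000000000
}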
p.metricCheckpointTsGauge.Set(float64(checkpointPhyTs)) - if minResolvedTs > p.changefeed.TaskPosition.ResolvedTs || - minCheckpointTs > p.changefeed.TaskPosition.CheckPointTs { + // minResolvedTs and minCheckpointTs may less than global resolved ts and global checkpoint ts when a new table added, the startTs of the new table is less than global checkpoint ts. + if minResolvedTs != p.changefeed.TaskPosition.ResolvedTs || + minCheckpointTs != p.changefeed.TaskPosition.CheckPointTs { p.changefeed.PatchTaskPosition(func(position *model.TaskPosition) (*model.TaskPosition, error) { failpoint.Inject("ProcessorUpdatePositionDelaying", nil) + if position == nil { + // when the captureInfo is deleted, the old owner will delete task status, task position, task workload in non-atomic + // so processor may see a intermediate state, for example the task status is exist but task position is deleted. + log.Warn("task position is not exist, skip to update position", zap.String("changefeed", p.changefeed.ID)) + return nil, nil + } position.CheckPointTs = minCheckpointTs position.ResolvedTs = minResolvedTs return position, nil diff --git a/cdc/processor/processor_test.go b/cdc/processor/processor_test.go index 4a13b5fa631..547170d0094 100644 --- a/cdc/processor/processor_test.go +++ b/cdc/processor/processor_test.go @@ -53,9 +53,11 @@ func newProcessor4Test() *processor { } p.createTablePipeline = func(ctx context.Context, tableID model.TableID, replicaInfo *model.TableReplicaInfo) (tablepipeline.TablePipeline, error) { return &mockTablePipeline{ - tableID: tableID, - name: fmt.Sprintf("`test`.`table%d`", tableID), - status: pipeline.TableStatusRunning, + tableID: tableID, + name: fmt.Sprintf("`test`.`table%d`", tableID), + status: pipeline.TableStatusRunning, + resolvedTs: replicaInfo.StartTs, + checkpointTs: replicaInfo.StartTs, }, nil } p.changefeed = newChangeFeedState(changefeedID, p.captureInfo.ID) @@ -237,7 +239,9 @@ func (s *processorSuite) TestHandleTableOperation4SingleTable(c *check.C) { c.Assert(err, check.IsNil) applyPatches(c, p.changefeed) p.changefeed.Status.CheckpointTs = 90 + p.changefeed.Status.ResolvedTs = 90 p.changefeed.TaskPosition.ResolvedTs = 100 + p.schemaStorage.AdvanceResolvedTs(200) // no operation _, err = p.Tick(ctx, p.changefeed) @@ -272,12 +276,26 @@ func (s *processorSuite) TestHandleTableOperation4SingleTable(c *check.C) { }, }) - // add table, finished + // add table, push the resolvedTs table66 := p.tables[66].(*mockTablePipeline) table66.resolvedTs = 101 _, err = p.Tick(ctx, p.changefeed) c.Assert(err, check.IsNil) applyPatches(c, p.changefeed) + c.Assert(p.changefeed.TaskStatus, check.DeepEquals, &model.TaskStatus{ + Tables: map[int64]*model.TableReplicaInfo{ + 66: {StartTs: 60}, + }, + Operation: map[int64]*model.TableOperation{ + 66: {Delete: false, BoundaryTs: 60, Done: false, Status: model.OperProcessed}, + }, + }) + c.Assert(p.changefeed.TaskPosition.ResolvedTs, check.Equals, uint64(101)) + + // finish the operation + _, err = p.Tick(ctx, p.changefeed) + c.Assert(err, check.IsNil) + applyPatches(c, p.changefeed) c.Assert(p.changefeed.TaskStatus, check.DeepEquals, &model.TaskStatus{ Tables: map[int64]*model.TableReplicaInfo{ 66: {StartTs: 60}, @@ -299,7 +317,7 @@ func (s *processorSuite) TestHandleTableOperation4SingleTable(c *check.C) { }) // remove table, in processing - p.changefeed.TaskStatus.RemoveTable(66, 120) + p.changefeed.TaskStatus.RemoveTable(66, 120, false) _, err = p.Tick(ctx, p.changefeed) c.Assert(err, check.IsNil) applyPatches(c, p.changefeed) @@ 
-347,8 +365,11 @@ func (s *processorSuite) TestHandleTableOperation4MultiTable(c *check.C) { _, err = p.Tick(ctx, p.changefeed) c.Assert(err, check.IsNil) applyPatches(c, p.changefeed) + p.schemaStorage.AdvanceResolvedTs(200) p.changefeed.Status.CheckpointTs = 90 + p.changefeed.Status.ResolvedTs = 90 p.changefeed.TaskPosition.ResolvedTs = 100 + p.changefeed.TaskPosition.CheckPointTs = 90 // no operation _, err = p.Tick(ctx, p.changefeed) @@ -378,6 +399,8 @@ func (s *processorSuite) TestHandleTableOperation4MultiTable(c *check.C) { }, }) c.Assert(p.tables, check.HasLen, 4) + c.Assert(p.changefeed.TaskPosition.CheckPointTs, check.Equals, uint64(30)) + c.Assert(p.changefeed.TaskPosition.ResolvedTs, check.Equals, uint64(30)) // add table, not finished _, err = p.Tick(ctx, p.changefeed) @@ -398,16 +421,17 @@ func (s *processorSuite) TestHandleTableOperation4MultiTable(c *check.C) { }) c.Assert(p.tables, check.HasLen, 4) - // add table, finished + // add table, push the resolvedTs table1 := p.tables[1].(*mockTablePipeline) table2 := p.tables[2].(*mockTablePipeline) table3 := p.tables[3].(*mockTablePipeline) table4 := p.tables[4].(*mockTablePipeline) - // table 1 and table 2 finished the add operation table1.resolvedTs = 101 table2.resolvedTs = 101 - // table 3 removed - p.changefeed.TaskStatus.RemoveTable(3, 60) + table3.resolvedTs = 102 + table4.resolvedTs = 103 + // removed table 3 + p.changefeed.TaskStatus.RemoveTable(3, 60, false) _, err = p.Tick(ctx, p.changefeed) c.Assert(err, check.IsNil) applyPatches(c, p.changefeed) @@ -418,16 +442,17 @@ func (s *processorSuite) TestHandleTableOperation4MultiTable(c *check.C) { 4: {StartTs: 30}, }, Operation: map[int64]*model.TableOperation{ - 1: {Delete: false, BoundaryTs: 60, Done: true, Status: model.OperFinished}, - 2: {Delete: false, BoundaryTs: 50, Done: true, Status: model.OperFinished}, + 1: {Delete: false, BoundaryTs: 60, Done: false, Status: model.OperProcessed}, + 2: {Delete: false, BoundaryTs: 50, Done: false, Status: model.OperProcessed}, 3: {Delete: true, BoundaryTs: 60, Done: false, Status: model.OperProcessed}, }, }) c.Assert(p.tables, check.HasLen, 4) c.Assert(table3.canceled, check.IsFalse) c.Assert(table3.stopTs, check.Equals, uint64(60)) + c.Assert(p.changefeed.TaskPosition.ResolvedTs, check.Equals, uint64(101)) - // finish remove table3 + // finish remove and add operations table3.status = pipeline.TableStatusStopped table3.checkpointTs = 65 _, err = p.Tick(ctx, p.changefeed) @@ -463,8 +488,8 @@ func (s *processorSuite) TestHandleTableOperation4MultiTable(c *check.C) { c.Assert(p.tables, check.HasLen, 3) // remove table, in processing - p.changefeed.TaskStatus.RemoveTable(1, 120) - p.changefeed.TaskStatus.RemoveTable(4, 120) + p.changefeed.TaskStatus.RemoveTable(1, 120, false) + p.changefeed.TaskStatus.RemoveTable(4, 120, false) delete(p.changefeed.TaskStatus.Tables, 2) _, err = p.Tick(ctx, p.changefeed) c.Assert(err, check.IsNil) @@ -669,3 +694,46 @@ func (s *processorSuite) TestProcessorClose(c *check.C) { c.Assert(p.tables[1].(*mockTablePipeline).canceled, check.IsTrue) c.Assert(p.tables[2].(*mockTablePipeline).canceled, check.IsTrue) } + +func (s *processorSuite) TestPositionDeleted(c *check.C) { + defer testleak.AfterTest(c)() + ctx := context.Background() + p := newProcessor4Test() + p.changefeed.TaskStatus.Tables[1] = &model.TableReplicaInfo{StartTs: 30} + p.changefeed.TaskStatus.Tables[2] = &model.TableReplicaInfo{StartTs: 40} + var err error + // init tick + _, err = p.Tick(ctx, p.changefeed) + c.Assert(err, check.IsNil) + 
applyPatches(c, p.changefeed) + p.schemaStorage.AdvanceResolvedTs(200) + + // cal position + _, err = p.Tick(ctx, p.changefeed) + c.Assert(err, check.IsNil) + applyPatches(c, p.changefeed) + c.Assert(p.changefeed.TaskPosition, check.DeepEquals, &model.TaskPosition{ + CheckPointTs: 30, + ResolvedTs: 30, + }) + + // some other delete the task position + p.changefeed.TaskPosition = nil + // position created again + _, err = p.Tick(ctx, p.changefeed) + c.Assert(err, check.IsNil) + applyPatches(c, p.changefeed) + c.Assert(p.changefeed.TaskPosition, check.DeepEquals, &model.TaskPosition{ + CheckPointTs: 0, + ResolvedTs: 0, + }) + + // cal position + _, err = p.Tick(ctx, p.changefeed) + c.Assert(err, check.IsNil) + applyPatches(c, p.changefeed) + c.Assert(p.changefeed.TaskPosition, check.DeepEquals, &model.TaskPosition{ + CheckPointTs: 30, + ResolvedTs: 30, + }) +} diff --git a/cdc/puller/puller.go b/cdc/puller/puller.go index 2387e5f0b69..d284b08869a 100644 --- a/cdc/puller/puller.go +++ b/cdc/puller/puller.go @@ -36,8 +36,8 @@ import ( ) const ( - defaultPullerEventChanSize = 128000 - defaultPullerOutputChanSize = 128000 + defaultPullerEventChanSize = 128 + defaultPullerOutputChanSize = 128 ) // Puller pull data from tikv and push changes into a buffer @@ -56,7 +56,6 @@ type pullerImpl struct { kvStorage tikv.Storage checkpointTs uint64 spans []regionspan.ComparableSpan - buffer *memBuffer outputCh chan *model.RawKVEntry tsTracker frontier.Frontier resolvedTs uint64 @@ -96,7 +95,6 @@ func NewPuller( kvStorage: tikvStorage, checkpointTs: checkpointTs, spans: comparableSpans, - buffer: makeMemBuffer(limitter), outputCh: make(chan *model.RawKVEntry, defaultPullerOutputChanSize), tsTracker: tsTracker, resolvedTs: checkpointTs, @@ -133,10 +131,7 @@ func (p *pullerImpl) Run(ctx context.Context) error { tableID, tableName := util.TableIDFromCtx(ctx) metricOutputChanSize := outputChanSizeGauge.WithLabelValues(captureAddr, changefeedID, tableName) metricEventChanSize := eventChanSizeGauge.WithLabelValues(captureAddr, changefeedID, tableName) - metricMemBufferSize := memBufferSizeGauge.WithLabelValues(captureAddr, changefeedID, tableName) metricPullerResolvedTs := pullerResolvedTsGauge.WithLabelValues(captureAddr, changefeedID, tableName) - metricEventCounterKv := kvEventCounter.WithLabelValues(captureAddr, changefeedID, "kv") - metricEventCounterResolved := kvEventCounter.WithLabelValues(captureAddr, changefeedID, "resolved") metricTxnCollectCounterKv := txnCollectCounter.WithLabelValues(captureAddr, changefeedID, tableName, "kv") metricTxnCollectCounterResolved := txnCollectCounter.WithLabelValues(captureAddr, changefeedID, tableName, "resolved") defer func() { @@ -156,37 +151,12 @@ func (p *pullerImpl) Run(ctx context.Context) error { return nil case <-time.After(15 * time.Second): metricEventChanSize.Set(float64(len(eventCh))) - metricMemBufferSize.Set(float64(p.buffer.Size())) metricOutputChanSize.Set(float64(len(p.outputCh))) metricPullerResolvedTs.Set(float64(oracle.ExtractPhysical(atomic.LoadUint64(&p.resolvedTs)))) } } }) - g.Go(func() error { - for { - select { - case e := <-eventCh: - // Note: puller will process anything received from event channel. - // If one key is not expected in given region, it must be filtered - // out before sending to puller. 
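With the staging `memBuffer` removed here, the puller's remaining goroutine receives region feed events straight from `eventCh` (the hunk just below swaps `p.buffer.Get(ctx)` for a `select` on the channel). Dropping the buffer also means a slow consumer now pushes back on the KV client directly instead of queueing entries in memory, which matches shrinking the channel sizes from 128000 to 128. The loop has the usual context-aware shape; the event type and handler below are placeholders for the sketch, not the puller's real types.

package main

import (
	"context"
	"fmt"
	"time"
)

type event struct{ ts uint64 }

// consume drains ch until the context is cancelled: one select per event,
// no intermediate buffer, mirroring the shape of the loop that replaced memBuffer.
func consume(ctx context.Context, ch <-chan *event, handle func(*event) error) error {
	for {
		var e *event
		select {
		case e = <-ch:
		case <-ctx.Done():
			return ctx.Err()
		}
		if e == nil {
			// A receive can yield a nil pointer (for example from a closed channel);
			// skip it, as the patched puller loop does.
			continue
		}
		if err := handle(e); err != nil {
			return err
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	ch := make(chan *event, 4)
	ch <- &event{ts: 1}
	ch <- &event{ts: 2}
	err := consume(ctx, ch, func(e *event) error {
		fmt.Println("got ts", e.ts)
		return nil
	})
	fmt.Println(err) // context deadline exceeded
}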
- if e.Val != nil { - metricEventCounterKv.Inc() - if err := p.buffer.AddEntry(ctx, *e); err != nil { - return errors.Trace(err) - } - } else if e.Resolved != nil { - metricEventCounterResolved.Inc() - if err := p.buffer.AddEntry(ctx, *e); err != nil { - return errors.Trace(err) - } - } - case <-ctx.Done(): - return ctx.Err() - } - } - }) - lastResolvedTs := p.checkpointTs g.Go(func() error { output := func(raw *model.RawKVEntry) error { @@ -208,9 +178,14 @@ func (p *pullerImpl) Run(ctx context.Context) error { start := time.Now() initialized := false for { - e, err := p.buffer.Get(ctx) - if err != nil { - return errors.Trace(err) + var e *model.RegionFeedEvent + select { + case e = <-eventCh: + case <-ctx.Done(): + return errors.Trace(ctx.Err()) + } + if e == nil { + continue } if e.Val != nil { metricTxnCollectCounterKv.Inc() diff --git a/cdc/puller/sorter/backend_pool.go b/cdc/puller/sorter/backend_pool.go index 7411bbaaeee..14f9f0f80ad 100644 --- a/cdc/puller/sorter/backend_pool.go +++ b/cdc/puller/sorter/backend_pool.go @@ -26,6 +26,8 @@ import ( "time" "unsafe" + "github.com/pingcap/ticdc/pkg/util" + "github.com/mackerelio/go-osstat/memory" "github.com/pingcap/errors" "github.com/pingcap/failpoint" @@ -36,7 +38,7 @@ import ( ) const ( - backgroundJobInterval = time.Second * 5 + backgroundJobInterval = time.Second * 15 ) var ( @@ -107,11 +109,11 @@ func newBackEndPool(dir string, captureAddr string) *backEndPool { zap.Int64("usedBySorter", ret.sorterMemoryUsage())) // Increase GC frequency to avoid unnecessary OOMs debug.SetGCPercent(10) - if memPressure > 95 { + if memPressure > 80 { runtime.GC() } } else { - debug.SetGCPercent(100) + debug.SetGCPercent(50) } // garbage collect temporary files in batches @@ -142,7 +144,7 @@ func newBackEndPool(dir string, captureAddr string) *backEndPool { } func (p *backEndPool) alloc(ctx context.Context) (backEnd, error) { - sorterConfig := config.GetSorterConfig() + sorterConfig := config.GetGlobalServerConfig().Sorter if p.sorterMemoryUsage() < int64(sorterConfig.MaxMemoryConsumption) && p.memoryPressure() < int32(sorterConfig.MaxMemoryPressure) { @@ -166,9 +168,11 @@ func (p *backEndPool) alloc(ctx context.Context) (backEnd, error) { } fname := fmt.Sprintf("%s%d.tmp", p.filePrefix, atomic.AddUint64(&p.fileNameCounter, 1)) + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Unified Sorter: trying to create file backEnd", zap.String("filename", fname), - zap.String("table", tableNameFromCtx(ctx))) + zap.Int64("table-id", tableID), + zap.String("table-name", tableName)) ret, err := newFileBackEnd(fname, &msgPackGenSerde{}) if err != nil { diff --git a/cdc/puller/sorter/backend_pool_test.go b/cdc/puller/sorter/backend_pool_test.go index 2c923142a9f..98d36ad7363 100644 --- a/cdc/puller/sorter/backend_pool_test.go +++ b/cdc/puller/sorter/backend_pool_test.go @@ -17,7 +17,6 @@ import ( "context" "os" "strconv" - "testing" "time" "github.com/pingcap/check" @@ -26,11 +25,9 @@ import ( "github.com/pingcap/ticdc/pkg/util/testleak" ) -func TestSuite(t *testing.T) { check.TestingT(t) } - type backendPoolSuite struct{} -var _ = check.Suite(&backendPoolSuite{}) +var _ = check.SerialSuites(&backendPoolSuite{}) func (s *backendPoolSuite) TestBasicFunction(c *check.C) { defer testleak.AfterTest(c)() @@ -38,10 +35,10 @@ func (s *backendPoolSuite) TestBasicFunction(c *check.C) { err := os.MkdirAll("/tmp/sorter", 0o755) c.Assert(err, check.IsNil) - config.SetSorterConfig(&config.SorterConfig{ - MaxMemoryPressure: 90, // 90% - MaxMemoryConsumption: 16 * 1024 
* 1024 * 1024, // 16G - }) + conf := config.GetDefaultServerConfig() + conf.Sorter.MaxMemoryPressure = 90 // 90% + conf.Sorter.MaxMemoryConsumption = 16 * 1024 * 1024 * 1024 // 16G + config.StoreGlobalServerConfig(conf) err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/memoryPressureInjectPoint", "return(100)") c.Assert(err, check.IsNil) @@ -104,10 +101,10 @@ func (s *backendPoolSuite) TestCleanUp(c *check.C) { err := os.MkdirAll("/tmp/sorter", 0o755) c.Assert(err, check.IsNil) - config.SetSorterConfig(&config.SorterConfig{ - MaxMemoryPressure: 90, // 90% - MaxMemoryConsumption: 16 * 1024 * 1024 * 1024, // 16G - }) + conf := config.GetDefaultServerConfig() + conf.Sorter.MaxMemoryPressure = 90 // 90% + conf.Sorter.MaxMemoryConsumption = 16 * 1024 * 1024 * 1024 // 16G + config.StoreGlobalServerConfig(conf) err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/memoryPressureInjectPoint", "return(100)") c.Assert(err, check.IsNil) diff --git a/cdc/puller/sorter/heap_sorter.go b/cdc/puller/sorter/heap_sorter.go index aa4ca45c1b4..70aae9251fb 100644 --- a/cdc/puller/sorter/heap_sorter.go +++ b/cdc/puller/sorter/heap_sorter.go @@ -1,4 +1,4 @@ -// Copyright 2020 PingCAP, Inc. +// Copyright 2021 PingCAP, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc/model" "github.com/pingcap/ticdc/pkg/config" + cerrors "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/util" "github.com/pingcap/ticdc/pkg/workerpool" "go.uber.org/zap" @@ -43,8 +44,10 @@ type flushTask struct { maxResolvedTs uint64 finished chan error dealloc func() error + isDeallocated int32 dataSize int64 lastTs uint64 // for debugging TODO remove + canceller *asyncCanceller } type heapSorter struct { @@ -53,6 +56,7 @@ type heapSorter struct { inputCh chan *model.PolymorphicEvent outputCh chan *flushTask heap sortHeap + canceller *asyncCanceller poolHandle workerpool.EventHandle internalState *heapSorterInternalState @@ -60,10 +64,11 @@ type heapSorter struct { func newHeapSorter(id int, out chan *flushTask) *heapSorter { return &heapSorter{ - id: id, - inputCh: make(chan *model.PolymorphicEvent, 1024*1024), - outputCh: out, - heap: make(sortHeap, 0, 65536), + id: id, + inputCh: make(chan *model.PolymorphicEvent, 1024*1024), + outputCh: out, + heap: make(sortHeap, 0, 65536), + canceller: new(asyncCanceller), } } @@ -115,12 +120,16 @@ func (h *heapSorter) flush(ctx context.Context, maxResolvedTs uint64) error { tsLowerBound: lowerBound, maxResolvedTs: maxResolvedTs, finished: finishCh, + canceller: h.canceller, } h.taskCounter++ var oldHeap sortHeap if !isEmptyFlush { task.dealloc = func() error { + if atomic.SwapInt32(&task.isDeallocated, 1) == 1 { + return nil + } if task.backend != nil { task.backend = nil return pool.dealloc(backEnd) @@ -135,8 +144,10 @@ func (h *heapSorter) flush(ctx context.Context, maxResolvedTs uint64) error { } } failpoint.Inject("sorterDebug", func() { + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Unified Sorter new flushTask", - zap.String("table", tableNameFromCtx(ctx)), + zap.Int64("table-id", tableID), + zap.String("table-name", tableName), zap.Int("heap-id", task.heapSorterID), zap.Uint64("resolvedTs", task.maxResolvedTs)) }) @@ -144,6 +155,21 @@ func (h *heapSorter) flush(ctx context.Context, maxResolvedTs uint64) error { if !isEmptyFlush { backEndFinal := backEnd err := 
heapSorterIOPool.Go(ctx, func() { + failpoint.Inject("asyncFlushStartDelay", func() { + log.Debug("asyncFlushStartDelay") + }) + + h.canceller.EnterAsyncOp() + defer h.canceller.FinishAsyncOp() + + if h.canceller.IsCanceled() { + if backEndFinal != nil { + _ = task.dealloc() + } + task.finished <- cerrors.ErrAsyncIOCancelled.GenWithStackByArgs() + return + } + writer, err := backEnd.writer() if err != nil { if backEndFinal != nil { @@ -164,7 +190,18 @@ func (h *heapSorter) flush(ctx context.Context, maxResolvedTs uint64) error { close(task.finished) }() + counter := 0 for oldHeap.Len() > 0 { + failpoint.Inject("asyncFlushInProcessDelay", func() { + log.Debug("asyncFlushInProcessDelay") + }) + // no need to check for cancellation so frequently. + if counter%10000 == 0 && h.canceller.IsCanceled() { + task.finished <- cerrors.ErrAsyncIOCancelled.GenWithStackByArgs() + return + } + counter++ + event := heap.Pop(&oldHeap).(*sortItem).entry err := writer.writeNext(event) if err != nil { @@ -188,9 +225,11 @@ func (h *heapSorter) flush(ctx context.Context, maxResolvedTs uint64) error { backEndFinal = nil failpoint.Inject("sorterDebug", func() { + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Unified Sorter flushTask finished", zap.Int("heap-id", task.heapSorterID), - zap.String("table", tableNameFromCtx(ctx)), + zap.Int64("table-id", tableID), + zap.String("table-name", tableName), zap.Uint64("resolvedTs", task.maxResolvedTs), zap.Uint64("data-size", dataSize), zap.Int("size", eventCount)) @@ -228,7 +267,7 @@ type heapSorterInternalState struct { func (h *heapSorter) init(ctx context.Context, onError func(err error)) { state := &heapSorterInternalState{ - sorterConfig: config.GetSorterConfig(), + sorterConfig: config.GetGlobalServerConfig().Sorter, } poolHandle := heapSorterPool.RegisterEvent(func(ctx context.Context, eventI interface{}) error { @@ -280,9 +319,45 @@ func (h *heapSorter) init(ctx context.Context, onError func(err error)) { h.internalState = state } +// asyncCanceller is a shared object used to cancel async IO operations. +// We do not use `context.Context` because (1) selecting on `ctx.Done()` is expensive +// especially if the context is shared by many goroutines, and (2) due to the complexity +// of managing contexts through the workerpools, using a special shared object seems more reasonable +// and readable. +type asyncCanceller struct { + exitRWLock sync.RWMutex // held when an asynchronous flush is taking place + hasExited int32 // this flag should be accessed atomically +} + +func (c *asyncCanceller) EnterAsyncOp() { + c.exitRWLock.RLock() +} + +func (c *asyncCanceller) FinishAsyncOp() { + c.exitRWLock.RUnlock() +} + +func (c *asyncCanceller) IsCanceled() bool { + return atomic.LoadInt32(&c.hasExited) == 1 +} + +func (c *asyncCanceller) Cancel() { + // Sets the flag + atomic.StoreInt32(&c.hasExited, 1) + + // By taking the lock, we are making sure that all IO operations that started before setting the flag have finished, + // so that by the returning of this function, no more IO operations will finish successfully. + // Since IO operations that are NOT successful will clean up themselves, the goroutine in which this + // function was called is responsible for releasing files written by only those IO operations that complete BEFORE + // this function returns. + // In short, we are creating a linearization point here. 
+ c.exitRWLock.Lock() + defer c.exitRWLock.Unlock() +} + func lazyInitWorkerPool() { poolOnce.Do(func() { - sorterConfig := config.GetSorterConfig() + sorterConfig := config.GetGlobalServerConfig().Sorter heapSorterPool = workerpool.NewDefaultWorkerPool(sorterConfig.NumWorkerPoolGoroutine) heapSorterIOPool = workerpool.NewDefaultAsyncPool(sorterConfig.NumWorkerPoolGoroutine * 2) }) diff --git a/cdc/puller/sorter/memory_backend.go b/cdc/puller/sorter/memory_backend.go index 521175bc7f4..e853758850b 100644 --- a/cdc/puller/sorter/memory_backend.go +++ b/cdc/puller/sorter/memory_backend.go @@ -90,7 +90,9 @@ func (r *memoryBackEndReader) resetAndClose() error { atomic.StoreInt32(&r.backEnd.borrowed, 0) }) - atomic.AddInt64(&pool.memoryUseEstimate, -r.backEnd.estimatedSize) + if pool != nil { + atomic.AddInt64(&pool.memoryUseEstimate, -r.backEnd.estimatedSize) + } r.backEnd.estimatedSize = 0 return nil @@ -134,7 +136,9 @@ func (w *memoryBackEndWriter) flushAndClose() error { }) w.backEnd.estimatedSize = w.bytesWritten - atomic.AddInt64(&pool.memoryUseEstimate, w.bytesWritten) + if pool != nil { + atomic.AddInt64(&pool.memoryUseEstimate, w.bytesWritten) + } return nil } diff --git a/cdc/puller/sorter/merger.go b/cdc/puller/sorter/merger.go index 4c4f6ac8793..4b4dd6bc78c 100644 --- a/cdc/puller/sorter/merger.go +++ b/cdc/puller/sorter/merger.go @@ -18,19 +18,31 @@ import ( "context" "math" "strings" + "sync" + "sync/atomic" "time" - "github.com/pingcap/tidb/store/tikv/oracle" - + "github.com/edwingeng/deque" "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc/model" + cerrors "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/notify" "github.com/pingcap/ticdc/pkg/util" + "github.com/pingcap/tidb/store/tikv/oracle" "go.uber.org/zap" + "golang.org/x/sync/errgroup" ) -func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out chan *model.PolymorphicEvent) error { +// TODO refactor this into a struct Merger. +func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out chan *model.PolymorphicEvent, onExit func(), bufLen *int64) error { + // TODO remove bufLenPlaceholder when refactoring + if bufLen == nil { + var bufLenPlaceholder int64 + bufLen = &bufLenPlaceholder + } + captureAddr := util.CaptureAddrFromCtx(ctx) changefeedID := util.ChangefeedIDFromCtx(ctx) _, tableName := util.TableIDFromCtx(ctx) @@ -46,25 +58,72 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch lastResolvedTs := make([]uint64, numSorters) minResolvedTs := uint64(0) - + taskBuf := newTaskBuffer(bufLen) + var workingSet map[*flushTask]struct{} pendingSet := make(map[*flushTask]*model.PolymorphicEvent) + defer func() { log.Info("Unified Sorter: merger exiting, cleaning up resources", zap.Int("pending-set-size", len(pendingSet))) - // clean up resources - for task := range pendingSet { + taskBuf.setClosed() + // cancel pending async IO operations. + onExit() + cleanUpTask := func(task *flushTask) { + select { + case err := <-task.finished: + _ = printError(err) + default: + // The task has not finished, so we give up. 
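The merger cleanup being rewritten here leans on the `asyncCanceller` defined above: once `onExit` has cancelled pending IO, unfinished flush tasks can be abandoned safely because any cancelled operation cleans up after itself. Below is a self-contained illustration of that flag-plus-RWMutex pattern; the names are local to the sketch, not the sorter's own type. Compared with threading a `context.Context` through every pooled task, the hot path is one atomic load plus a read lock, and `cancel` gets a clear point after which no operation can still succeed.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// canceller: async operations hold the read lock for their whole duration;
// cancel sets a flag and then takes the write lock, so once cancel returns,
// every operation that managed to start has already finished and no new one
// will get past the flag check.
type canceller struct {
	mu       sync.RWMutex
	canceled int32
}

func (c *canceller) enter() bool {
	c.mu.RLock()
	if atomic.LoadInt32(&c.canceled) == 1 {
		c.mu.RUnlock()
		return false
	}
	return true
}

func (c *canceller) leave() { c.mu.RUnlock() }

func (c *canceller) cancel() {
	atomic.StoreInt32(&c.canceled, 1)
	c.mu.Lock() // wait for in-flight operations to call leave()
	c.mu.Unlock()
}

func main() {
	c := new(canceller)
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if !c.enter() {
				fmt.Println("op", i, "skipped: already cancelled")
				return
			}
			defer c.leave()
			time.Sleep(10 * time.Millisecond) // stand-in for async IO
			fmt.Println("op", i, "completed before cancel returned")
		}(i)
	}
	c.cancel()
	wg.Wait()
}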
+ // It does not matter because: + // 1) if the async workerpool has exited, it means the CDC process is exiting, UnifiedSorterCleanUp will + // take care of the temp files, + // 2) if the async workerpool is not exiting, the unfinished tasks will eventually be executed, + // and by that time, since the `onExit` have canceled them, they will not do any IO and clean up themselves. + return + } + if task.reader != nil { _ = printError(task.reader.resetAndClose()) task.reader = nil } _ = printError(task.dealloc()) } + + for { + task, err := taskBuf.get(ctx) + if err != nil { + _ = printError(err) + break + } + + if task == nil { + log.Debug("Merger exiting, taskBuf is exhausted") + break + } + + cleanUpTask(task) + } + + for task := range pendingSet { + cleanUpTask(task) + } + for task := range workingSet { + cleanUpTask(task) + } + + taskBuf.close() + log.Info("Merger has exited") }() lastOutputTs := uint64(0) + lastOutputResolvedTs := uint64(0) var lastEvent *model.PolymorphicEvent var lastTask *flushTask sendResolvedEvent := func(ts uint64) error { + lastOutputResolvedTs = ts + if ts == 0 { + return nil + } select { case <-ctx.Done(): return ctx.Err() @@ -77,29 +136,9 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch onMinResolvedTsUpdate := func() error { metricSorterMergerStartTsGauge.Set(float64(oracle.ExtractPhysical(minResolvedTs))) - - workingSet := make(map[*flushTask]struct{}) + workingSet = make(map[*flushTask]struct{}) sortHeap := new(sortHeap) - defer func() { - // clean up - for task := range workingSet { - select { - case <-ctx.Done(): - break - case err := <-task.finished: - _ = printError(err) - } - - if task.reader != nil { - err := task.reader.resetAndClose() - task.reader = nil - _ = printError(err) - } - _ = printError(task.dealloc()) - } - }() - for task, cache := range pendingSet { if task.tsLowerBound > minResolvedTs { // the condition above implies that for any event in task.backend, CRTs > minResolvedTs. 
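That skip is what keeps the merge cheap: a flushTask whose `tsLowerBound` is above the current `minResolvedTs` cannot contribute an event yet, so only the remaining tasks are loaded into the sort heap. The heap itself performs an ordinary k-way merge of sorted streams; a stripped-down sketch with bare uint64 timestamps and none of the flushTask plumbing:

package main

import (
	"container/heap"
	"fmt"
)

// item is one head-of-stream entry: its timestamp and which input it came from.
type item struct {
	ts  uint64
	src int
}

type mergeHeap []item

func (h mergeHeap) Len() int            { return len(h) }
func (h mergeHeap) Less(i, j int) bool  { return h[i].ts < h[j].ts }
func (h mergeHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(item)) }
func (h *mergeHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[:n-1]
	return x
}

// mergeSorted merges already-sorted inputs into one ascending stream, the same
// shape as the merger's sortHeap loop: each pop is followed by pushing the next
// event from the same source.
func mergeSorted(inputs [][]uint64) []uint64 {
	h := &mergeHeap{}
	next := make([]int, len(inputs))
	for i, in := range inputs {
		if len(in) > 0 {
			heap.Push(h, item{ts: in[0], src: i})
			next[i] = 1
		}
	}
	var out []uint64
	for h.Len() > 0 {
		top := heap.Pop(h).(item)
		out = append(out, top.ts)
		if next[top.src] < len(inputs[top.src]) {
			heap.Push(h, item{ts: inputs[top.src][next[top.src]], src: top.src})
			next[top.src]++
		}
	}
	return out
}

func main() {
	fmt.Println(mergeSorted([][]uint64{{1, 4, 9}, {2, 3, 10}, {5, 6, 7}}))
	// Output: [1 2 3 4 5 6 7 9 10]
}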
@@ -183,7 +222,9 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch } else { pendingSet[task] = nextEvent if nextEvent.CRTs < minResolvedTs { - log.Panic("remaining event CRTs too small", zap.Uint64("next-ts", nextEvent.CRTs), zap.Uint64("minResolvedTs", minResolvedTs)) + log.Panic("remaining event CRTs too small", + zap.Uint64("next-ts", nextEvent.CRTs), + zap.Uint64("minResolvedTs", minResolvedTs)) } } return nil @@ -191,14 +232,18 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch failpoint.Inject("sorterDebug", func() { if sortHeap.Len() > 0 { + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Unified Sorter: start merging", - zap.String("table", tableNameFromCtx(ctx)), + zap.Int64("table-id", tableID), + zap.String("table-name", tableName), zap.Uint64("minResolvedTs", minResolvedTs)) } }) counter := 0 for sortHeap.Len() > 0 { + failpoint.Inject("sorterMergeDelay", func() {}) + item := heap.Pop(sortHeap).(*sortItem) task := item.data.(*flushTask) event := item.entry @@ -231,6 +276,11 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch zap.Uint64("last-ts", lastOutputTs), zap.Int("sort-heap-len", sortHeap.Len())) } + + if event.CRTs <= lastOutputResolvedTs { + log.Panic("unified sorter: output ts smaller than resolved ts, bug?", zap.Uint64("minResolvedTs", minResolvedTs), + zap.Uint64("lastOutputResolvedTs", lastOutputResolvedTs), zap.Uint64("event-crts", event.CRTs)) + } lastOutputTs = event.CRTs lastEvent = event lastTask = task @@ -290,8 +340,10 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch failpoint.Inject("sorterDebug", func() { if counter%10 == 0 { + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Merging progress", - zap.String("table", tableNameFromCtx(ctx)), + zap.Int64("table-id", tableID), + zap.String("table-name", tableName), zap.Int("counter", counter)) } }) @@ -308,8 +360,10 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch failpoint.Inject("sorterDebug", func() { if counter > 0 { + tableID, tableName := util.TableIDFromCtx(ctx) log.Debug("Unified Sorter: merging ended", - zap.String("table", tableNameFromCtx(ctx)), + zap.Int64("table-id", tableID), + zap.String("table-name", tableName), zap.Uint64("resolvedTs", minResolvedTs), zap.Int("count", counter)) } }) @@ -329,14 +383,45 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch resolveTicker := time.NewTicker(1 * time.Second) defer resolveTicker.Stop() - for { - select { - case <-ctx.Done(): - return ctx.Err() - case task := <-in: + errg, ctx := errgroup.WithContext(ctx) + errg.Go(func() error { + for { + var task *flushTask + select { + case <-ctx.Done(): + return ctx.Err() + case task = <-in: + } + + if task == nil { + tableID, tableName := util.TableIDFromCtx(ctx) + log.Debug("Merger input channel closed, exiting", + zap.Int64("table-id", tableID), + zap.String("table-name", tableName)) + return nil + } + + taskBuf.put(task) + } + }) + + errg.Go(func() error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + task, err := taskBuf.get(ctx) + if err != nil { + return errors.Trace(err) + } + if task == nil { - log.Info("Merger input channel closed, exiting", - zap.String("table", tableNameFromCtx(ctx)), + tableID, tableName := util.TableIDFromCtx(ctx) + log.Debug("Merger buffer exhausted and is closed, exiting", + zap.Int64("table-id", tableID), + zap.String("table-name", 
tableName), zap.Uint64("max-output", minResolvedTs)) return nil } @@ -363,17 +448,21 @@ func runMerger(ctx context.Context, numSorters int, in <-chan *flushTask, out ch return errors.Trace(err) } } - case <-resolveTicker.C: - err := sendResolvedEvent(minResolvedTs) - if err != nil { - return errors.Trace(err) - } } - } + }) + + return errg.Wait() } func mergerCleanUp(in <-chan *flushTask) { for task := range in { + select { + case err := <-task.finished: + _ = printError(err) + default: + break + } + if task.reader != nil { _ = printError(task.reader.resetAndClose()) } @@ -386,9 +475,98 @@ func printError(err error) error { if err != nil && errors.Cause(err) != context.Canceled && errors.Cause(err) != context.DeadlineExceeded && !strings.Contains(err.Error(), "context canceled") && - !strings.Contains(err.Error(), "context deadline exceeded") { + !strings.Contains(err.Error(), "context deadline exceeded") && + cerrors.ErrAsyncIOCancelled.NotEqual(errors.Cause(err)) { log.Warn("Unified Sorter: Error detected", zap.Error(err)) } return err } + +// taskBuffer is used to store pending flushTasks. +// The design purpose is to reduce the backpressure caused by a congested output chan of the merger, +// so that heapSorter does not block. +type taskBuffer struct { + mu sync.Mutex // mu only protects queue + queue deque.Deque + + notifier notify.Notifier + len *int64 + isClosed int32 +} + +func newTaskBuffer(len *int64) *taskBuffer { + return &taskBuffer{ + queue: deque.NewDeque(), + notifier: notify.Notifier{}, + len: len, + } +} + +func (b *taskBuffer) put(task *flushTask) { + b.mu.Lock() + defer b.mu.Unlock() + + b.queue.PushBack(task) + prevCount := atomic.AddInt64(b.len, 1) + + if prevCount == 1 { + b.notifier.Notify() + } +} + +func (b *taskBuffer) get(ctx context.Context) (*flushTask, error) { + if atomic.LoadInt32(&b.isClosed) == 1 && atomic.LoadInt64(b.len) == 0 { + return nil, nil + } + + if atomic.LoadInt64(b.len) == 0 { + recv, err := b.notifier.NewReceiver(time.Millisecond * 50) + if err != nil { + return nil, errors.Trace(err) + } + defer recv.Stop() + + startTime := time.Now() + for atomic.LoadInt64(b.len) == 0 { + select { + case <-ctx.Done(): + return nil, errors.Trace(ctx.Err()) + case <-recv.C: + // Note that there can be spurious wake-ups + } + + if atomic.LoadInt32(&b.isClosed) == 1 && atomic.LoadInt64(b.len) == 0 { + return nil, nil + } + + if time.Since(startTime) > time.Second*5 { + log.Debug("taskBuffer reading blocked for too long", zap.Duration("duration", time.Since(startTime))) + } + } + } + + postCount := atomic.AddInt64(b.len, -1) + if postCount < 0 { + log.Panic("taskBuffer: len < 0, report a bug", zap.Int64("len", postCount)) + } + + b.mu.Lock() + defer b.mu.Unlock() + + ret := b.queue.PopFront() + if ret == nil { + log.Panic("taskBuffer: PopFront() returned nil, report a bug") + } + + return ret.(*flushTask), nil +} + +func (b *taskBuffer) setClosed() { + atomic.SwapInt32(&b.isClosed, 1) +} + +// Only call this when the taskBuffer is NEVER going to be accessed again. +func (b *taskBuffer) close() { + b.notifier.Close() +} diff --git a/cdc/puller/sorter/merger_test.go b/cdc/puller/sorter/merger_test.go new file mode 100644 index 00000000000..caa67db575d --- /dev/null +++ b/cdc/puller/sorter/merger_test.go @@ -0,0 +1,594 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package sorter + +import ( + "context" + "sync/atomic" + "time" + + "github.com/pingcap/errors" + + "github.com/pingcap/check" + "github.com/pingcap/failpoint" + "github.com/pingcap/log" + "github.com/pingcap/ticdc/cdc/model" + "github.com/pingcap/ticdc/pkg/util/testleak" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "golang.org/x/sync/errgroup" +) + +type mockFlushTaskBuilder struct { + task *flushTask + writer backEndWriter + totalCount int +} + +var backEndCounterForTest int64 + +func newMockFlushTaskBuilder() *mockFlushTaskBuilder { + backEnd := newMemoryBackEnd() + atomic.AddInt64(&backEndCounterForTest, 1) + + task := &flushTask{ + backend: backEnd, + tsLowerBound: 0, + maxResolvedTs: 0, + finished: make(chan error, 2), + } + + task.dealloc = func() error { + if task.backend != nil { + atomic.AddInt64(&backEndCounterForTest, -1) + task.backend = nil + return backEnd.free() + } + return nil + } + + writer, _ := backEnd.writer() + + return &mockFlushTaskBuilder{ + task: task, + writer: writer, + } +} + +func (b *mockFlushTaskBuilder) generateRowChanges(tsRangeBegin, tsRangeEnd uint64, count int) *mockFlushTaskBuilder { + if b.task.tsLowerBound == 0 { + b.task.tsLowerBound = tsRangeBegin + } + density := float64(tsRangeEnd-tsRangeBegin) / float64(count) + for fTs := float64(tsRangeBegin); fTs < float64(tsRangeEnd); fTs += density { + ts := uint64(fTs) + kvEntry := generateMockRawKV(ts) + _ = b.writer.writeNext(model.NewPolymorphicEvent(kvEntry)) + b.totalCount++ + } + return b +} + +func (b *mockFlushTaskBuilder) addResolved(ts uint64) *mockFlushTaskBuilder { + _ = b.writer.writeNext(model.NewResolvedPolymorphicEvent(0, ts)) + b.task.maxResolvedTs = ts + return b +} + +func (b *mockFlushTaskBuilder) build() *flushTask { + _ = b.writer.flushAndClose() + return b.task +} + +// TestMergerSingleHeap simulates a situation where there is only one data stream +// It tests the most basic scenario. 
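The consumers in the tests that follow all assert the same ordering contract on the merged stream: row events never go backwards and never fall behind the last resolved event, and resolved timestamps are non-decreasing. A compact checker expressing just that contract, with event shapes simplified for the sketch:

package main

import (
	"errors"
	"fmt"
)

type opType int

const (
	opPut opType = iota
	opResolved
)

type outEvent struct {
	op opType
	ts uint64 // commit ts for puts, resolved ts for resolved events
}

// checkMergedStream verifies the ordering contract the merger tests assert:
// puts are non-decreasing and never precede the last resolved ts, and
// resolved timestamps themselves never go backwards.
func checkMergedStream(events []outEvent) error {
	var lastPut, lastResolved uint64
	for i, e := range events {
		switch e.op {
		case opPut:
			if e.ts < lastPut || e.ts < lastResolved {
				return fmt.Errorf("event %d: put ts %d went backwards (lastPut=%d, lastResolved=%d)",
					i, e.ts, lastPut, lastResolved)
			}
			lastPut = e.ts
		case opResolved:
			if e.ts < lastResolved {
				return fmt.Errorf("event %d: resolved ts %d went backwards (lastResolved=%d)", i, e.ts, lastResolved)
			}
			lastResolved = e.ts
		default:
			return errors.New("unknown op type")
		}
	}
	return nil
}

func main() {
	ok := []outEvent{{opPut, 1}, {opPut, 3}, {opResolved, 3}, {opPut, 4}, {opResolved, 5}}
	bad := []outEvent{{opPut, 1}, {opResolved, 3}, {opPut, 2}}
	fmt.Println(checkMergedStream(ok))  // <nil>
	fmt.Println(checkMergedStream(bad)) // event 2: put ts 2 went backwards ...
}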
+func (s *sorterSuite) TestMergerSingleHeap(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + if err != nil { + log.Panic("Could not enable failpoint", zap.Error(err)) + } + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*10) + defer cancel() + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + totalCount := 0 + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 100000, 2048).addResolved(100001).build() + totalCount += builder.totalCount + builder = newMockFlushTaskBuilder() + task2 := builder.generateRowChanges(100002, 200000, 2048).addResolved(200001).build() + totalCount += builder.totalCount + builder = newMockFlushTaskBuilder() + task3 := builder.generateRowChanges(200002, 300000, 2048).addResolved(300001).build() + totalCount += builder.totalCount + + wg.Go(func() error { + inChan <- task1 + close(task1.finished) + inChan <- task2 + close(task2.finished) + inChan <- task3 + close(task3.finished) + + return nil + }) + + wg.Go(func() error { + count := 0 + lastTs := uint64(0) + lastResolved := uint64(0) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case event := <-outChan: + switch event.RawKV.OpType { + case model.OpTypePut: + count++ + c.Assert(event.CRTs, check.GreaterEqual, lastTs) + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + lastTs = event.CRTs + case model.OpTypeResolved: + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + lastResolved = event.CRTs + } + if lastResolved >= 300001 { + c.Assert(count, check.Equals, totalCount) + cancel() + return nil + } + } + } + }) + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(0)) +} + +// TestMergerSingleHeapRetire simulates a situation where the resolved event is not the last event in a flushTask +func (s *sorterSuite) TestMergerSingleHeapRetire(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + if err != nil { + log.Panic("Could not enable failpoint", zap.Error(err)) + } + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*10) + defer cancel() + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + totalCount := 0 + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 100000, 2048).addResolved(100001).build() + totalCount += builder.totalCount + builder = newMockFlushTaskBuilder() + task2 := builder.generateRowChanges(100002, 200000, 2048).build() + totalCount += builder.totalCount + builder = newMockFlushTaskBuilder() + task3 := builder.generateRowChanges(200002, 300000, 2048).addResolved(300001).build() + totalCount += builder.totalCount + + wg.Go(func() error { + inChan <- task1 + close(task1.finished) + inChan <- task2 + close(task2.finished) + inChan <- task3 + close(task3.finished) + + return nil + }) + + wg.Go(func() error { + count := 0 + lastTs := uint64(0) + lastResolved := uint64(0) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case event := <-outChan: + switch 
event.RawKV.OpType { + case model.OpTypePut: + count++ + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + c.Assert(event.CRTs, check.GreaterEqual, lastTs) + lastTs = event.CRTs + case model.OpTypeResolved: + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + lastResolved = event.CRTs + } + if lastResolved >= 300001 { + c.Assert(count, check.Equals, totalCount) + cancel() + return nil + } + } + } + }) + + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(0)) +} + +// TestMergerSortDelay simulates a situation where merging takes a long time. +// Expects intermediate resolved events to be generated, so that the sink would not get stuck in a real life situation. +func (s *sorterSuite) TestMergerSortDelay(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + c.Assert(err, check.IsNil) + + // enable the failpoint to simulate delays + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterMergeDelay", "sleep(5)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterMergeDelay") + }() + + log.SetLevel(zapcore.DebugLevel) + defer log.SetLevel(zapcore.InfoLevel) + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*10) + defer cancel() + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + totalCount := 0 + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 1000000, 1024).addResolved(1000001).build() + totalCount += builder.totalCount + + wg.Go(func() error { + inChan <- task1 + close(task1.finished) + return nil + }) + + wg.Go(func() error { + var ( + count int + lastTs uint64 + lastResolved uint64 + lastResolvedTime time.Time + ) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case event := <-outChan: + switch event.RawKV.OpType { + case model.OpTypePut: + count++ + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + c.Assert(event.CRTs, check.GreaterEqual, lastTs) + lastTs = event.CRTs + case model.OpTypeResolved: + c.Assert(event.CRTs, check.GreaterEqual, lastResolved) + if !lastResolvedTime.IsZero() { + c.Assert(time.Since(lastResolvedTime), check.LessEqual, 2*time.Second) + } + log.Debug("resolved event received", zap.Uint64("ts", event.CRTs)) + lastResolvedTime = time.Now() + lastResolved = event.CRTs + } + if lastResolved >= 1000001 { + c.Assert(count, check.Equals, totalCount) + cancel() + return nil + } + } + } + }) + + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + close(inChan) + mergerCleanUp(inChan) + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(0)) +} + +// TestMergerCancel simulates a situation where the merger is cancelled with pending data. +// Expects proper clean-up of the data. 
+func (s *sorterSuite) TestMergerCancel(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + c.Assert(err, check.IsNil) + + // enable the failpoint to simulate delays + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterMergeDelay", "sleep(10)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterMergeDelay") + }() + + log.SetLevel(zapcore.DebugLevel) + defer log.SetLevel(zapcore.InfoLevel) + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*10) + defer cancel() + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 100000, 2048).addResolved(100001).build() + builder = newMockFlushTaskBuilder() + task2 := builder.generateRowChanges(100002, 200000, 2048).addResolved(200001).build() + builder = newMockFlushTaskBuilder() + task3 := builder.generateRowChanges(200002, 300000, 2048).addResolved(300001).build() + + wg.Go(func() error { + inChan <- task1 + close(task1.finished) + inChan <- task2 + close(task2.finished) + inChan <- task3 + close(task3.finished) + return nil + }) + + wg.Go(func() error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-outChan: + // We just drain the data here. We don't care about it. + } + } + }) + + time.Sleep(5 * time.Second) + cancel() + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + close(inChan) + mergerCleanUp(inChan) + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(0)) +} + +// TestMergerCancel simulates a situation where the merger is cancelled with pending data. +// Expects proper clean-up of the data. +func (s *sorterSuite) TestMergerCancelWithUnfinishedFlushTasks(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + c.Assert(err, check.IsNil) + + log.SetLevel(zapcore.DebugLevel) + defer log.SetLevel(zapcore.InfoLevel) + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*10) + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 100000, 2048).addResolved(100001).build() + builder = newMockFlushTaskBuilder() + task2 := builder.generateRowChanges(100002, 200000, 2048).addResolved(200001).build() + builder = newMockFlushTaskBuilder() + task3 := builder.generateRowChanges(200002, 300000, 2048).addResolved(300001).build() + + wg.Go(func() error { + inChan <- task1 + inChan <- task2 + inChan <- task3 + close(task2.finished) + close(task1.finished) + time.Sleep(1 * time.Second) + cancel() + return nil + }) + + wg.Go(func() error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-outChan: + // We just drain the data here. We don't care about it. 
+ } + } + }) + + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + close(inChan) + mergerCleanUp(inChan) + // Leaking one task is expected + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(1)) + atomic.StoreInt64(&backEndCounterForTest, 0) +} + +// TestMergerCancel simulates a situation where the input channel is abruptly closed. +// There is expected to be NO fatal error. +func (s *sorterSuite) TestMergerCloseChannel(c *check.C) { + defer testleak.AfterTest(c)() + err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") + c.Assert(err, check.IsNil) + + log.SetLevel(zapcore.DebugLevel) + defer log.SetLevel(zapcore.InfoLevel) + + ctx, cancel := context.WithTimeout(context.TODO(), time.Second*15) + defer cancel() + wg, ctx := errgroup.WithContext(ctx) + inChan := make(chan *flushTask, 1024) + outChan := make(chan *model.PolymorphicEvent, 1024) + + builder := newMockFlushTaskBuilder() + task1 := builder.generateRowChanges(1000, 100000, 2048).addResolved(100001).build() + + inChan <- task1 + close(task1.finished) + + wg.Go(func() error { + return runMerger(ctx, 1, inChan, outChan, func() {}, nil) + }) + + wg.Go(func() error { + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-outChan: + // We just drain the data here. We don't care about it. + } + } + }) + + time.Sleep(5 * time.Second) + close(inChan) + time.Sleep(5 * time.Second) + cancel() + c.Assert(wg.Wait(), check.ErrorMatches, ".*context canceled.*") + mergerCleanUp(inChan) + c.Assert(atomic.LoadInt64(&backEndCounterForTest), check.Equals, int64(0)) +} + +// TestTaskBufferBasic tests the basic functionality of TaskBuffer +func (s *sorterSuite) TestTaskBufferBasic(c *check.C) { + defer testleak.AfterTest(c)() + + ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) + defer cancel() + errg, ctx := errgroup.WithContext(ctx) + var bufLen int64 + taskBuf := newTaskBuffer(&bufLen) + + // run producer + errg.Go(func() error { + for i := 0; i < 10000; i++ { + select { + case <-ctx.Done(): + return errors.Trace(ctx.Err()) + default: + } + + var dummyTask flushTask + taskBuf.put(&dummyTask) + } + + taskBuf.setClosed() + return nil + }) + + // run consumer + errg.Go(func() error { + for i := 0; i < 10001; i++ { + select { + case <-ctx.Done(): + return errors.Trace(ctx.Err()) + default: + } + + task, err := taskBuf.get(ctx) + c.Assert(err, check.IsNil) + + if i == 10000 { + c.Assert(task, check.IsNil) + taskBuf.close() + return nil + } + + c.Assert(task, check.NotNil) + } + c.Fail() // unreachable + return nil + }) + + c.Assert(errg.Wait(), check.IsNil) + c.Assert(bufLen, check.Equals, int64(0)) +} + +// TestTaskBufferBasic tests the situation where the taskBuffer's consumer is +// first starved and then exit due to taskBuf shutdown. 
+func (s *sorterSuite) TestTaskBufferStarveAndClose(c *check.C) { + defer testleak.AfterTest(c)() + + ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) + defer cancel() + errg, ctx := errgroup.WithContext(ctx) + var bufLen int64 + taskBuf := newTaskBuffer(&bufLen) + + // run producer + errg.Go(func() error { + for i := 0; i < 1000; i++ { + select { + case <-ctx.Done(): + return errors.Trace(ctx.Err()) + default: + } + + var dummyTask flushTask + taskBuf.put(&dummyTask) + } + + // starve the consumer + time.Sleep(3 * time.Second) + taskBuf.setClosed() + return nil + }) + + // run consumer + errg.Go(func() error { + for i := 0; i < 1001; i++ { + select { + case <-ctx.Done(): + return errors.Trace(ctx.Err()) + default: + } + + task, err := taskBuf.get(ctx) + if i < 1000 { + c.Assert(task, check.NotNil) + c.Assert(err, check.IsNil) + } else { + c.Assert(task, check.IsNil) + c.Assert(err, check.IsNil) + taskBuf.close() + return nil + } + } + c.Fail() // unreachable + return nil + }) + + c.Assert(errg.Wait(), check.IsNil) + c.Assert(bufLen, check.Equals, int64(0)) +} diff --git a/cdc/puller/sorter_test.go b/cdc/puller/sorter/sorter_test.go similarity index 52% rename from cdc/puller/sorter_test.go rename to cdc/puller/sorter/sorter_test.go index f55d22ad01c..ad81e1c1e4f 100644 --- a/cdc/puller/sorter_test.go +++ b/cdc/puller/sorter/sorter_test.go @@ -11,13 +11,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -package puller +package sorter import ( "context" "math" "os" "sync/atomic" + "testing" "time" "github.com/pingcap/check" @@ -25,7 +26,7 @@ import ( "github.com/pingcap/failpoint" "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc/model" - sorter2 "github.com/pingcap/ticdc/cdc/puller/sorter" + "github.com/pingcap/ticdc/cdc/puller" "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/util/testleak" "go.uber.org/zap" @@ -39,7 +40,9 @@ const ( type sorterSuite struct{} -var _ = check.Suite(&sorterSuite{}) +var _ = check.SerialSuites(&sorterSuite{}) + +func Test(t *testing.T) { check.TestingT(t) } func generateMockRawKV(ts uint64) *model.RawKVEntry { return &model.RawKVEntry{ @@ -55,61 +58,65 @@ func generateMockRawKV(ts uint64) *model.RawKVEntry { func (s *sorterSuite) TestSorterBasic(c *check.C) { defer testleak.AfterTest(c)() - defer sorter2.UnifiedSorterCleanUp() + defer UnifiedSorterCleanUp() - config.SetSorterConfig(&config.SorterConfig{ + conf := config.GetDefaultServerConfig() + conf.Sorter = &config.SorterConfig{ NumConcurrentWorker: 8, ChunkSizeLimit: 1 * 1024 * 1024 * 1024, MaxMemoryPressure: 60, MaxMemoryConsumption: 16 * 1024 * 1024 * 1024, NumWorkerPoolGoroutine: 4, - }) + } + config.StoreGlobalServerConfig(conf) err := os.MkdirAll("/tmp/sorter", 0o755) c.Assert(err, check.IsNil) - sorter := sorter2.NewUnifiedSorter("/tmp/sorter", "test", "0.0.0.0:0") + sorter := NewUnifiedSorter("/tmp/sorter", "test-cf", "test", 0, "0.0.0.0:0") ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) defer cancel() - testSorter(ctx, c, sorter, 10000) + testSorter(ctx, c, sorter, 10000, true) } func (s *sorterSuite) TestSorterCancel(c *check.C) { defer testleak.AfterTest(c)() - defer sorter2.UnifiedSorterCleanUp() + defer UnifiedSorterCleanUp() - config.SetSorterConfig(&config.SorterConfig{ + conf := config.GetDefaultServerConfig() + conf.Sorter = &config.SorterConfig{ NumConcurrentWorker: 8, ChunkSizeLimit: 1 * 1024 * 1024 * 1024, MaxMemoryPressure: 60, MaxMemoryConsumption: 0, 
NumWorkerPoolGoroutine: 4, - }) + } + config.StoreGlobalServerConfig(conf) err := os.MkdirAll("/tmp/sorter", 0o755) c.Assert(err, check.IsNil) - sorter := sorter2.NewUnifiedSorter("/tmp/sorter", "test", "0.0.0.0:0") + sorter := NewUnifiedSorter("/tmp/sorter", "test-cf", "test", 0, "0.0.0.0:0") ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() finishedCh := make(chan struct{}) go func() { - testSorter(ctx, c, sorter, 10000000) + testSorter(ctx, c, sorter, 10000000, true) close(finishedCh) }() after := time.After(30 * time.Second) select { case <-after: - c.FailNow() + c.Fatal("TestSorterCancel timed out") case <-finishedCh: } log.Info("Sorter successfully cancelled") } -func testSorter(ctx context.Context, c *check.C, sorter EventSorter, count int) { +func testSorter(ctx context.Context, c *check.C, sorter puller.EventSorter, count int, needWorkerPool bool) { err := failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/sorterDebug", "return(true)") if err != nil { log.Panic("Could not enable failpoint", zap.Error(err)) @@ -121,9 +128,11 @@ func testSorter(ctx context.Context, c *check.C, sorter EventSorter, count int) return sorter.Run(ctx) }) - errg.Go(func() error { - return sorter2.RunWorkerPool(ctx) - }) + if needWorkerPool { + errg.Go(func() error { + return RunWorkerPool(ctx) + }) + } producerProgress := make([]uint64, numProducers) @@ -131,7 +140,7 @@ func testSorter(ctx context.Context, c *check.C, sorter EventSorter, count int) for i := 0; i < numProducers; i++ { finalI := i errg.Go(func() error { - for j := 0; j < count; j++ { + for j := 1; j <= count; j++ { select { case <-ctx.Done(): return ctx.Err() @@ -143,15 +152,15 @@ func testSorter(ctx context.Context, c *check.C, sorter EventSorter, count int) atomic.StoreUint64(&producerProgress[finalI], uint64(j)<<5) } } - sorter.AddEntry(ctx, model.NewPolymorphicEvent(generateMockRawKV(uint64(count)<<5))) - atomic.StoreUint64(&producerProgress[finalI], uint64(count)<<5) + sorter.AddEntry(ctx, model.NewPolymorphicEvent(generateMockRawKV(uint64(count+1)<<5))) + atomic.StoreUint64(&producerProgress[finalI], uint64(count+1)<<5) return nil }) } // launch the resolver errg.Go(func() error { - ticker := time.NewTicker(30 * time.Second) + ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() for { select { @@ -210,3 +219,98 @@ func testSorter(ctx context.Context, c *check.C, sorter EventSorter, count int) } c.Assert(err, check.IsNil) } + +func (s *sorterSuite) TestSortDirConfigLocal(c *check.C) { + defer testleak.AfterTest(c)() + defer UnifiedSorterCleanUp() + + poolMu.Lock() + // Clean up the back-end pool if one has been created + pool = nil + poolMu.Unlock() + + err := os.MkdirAll("/tmp/sorter", 0o755) + c.Assert(err, check.IsNil) + // We expect the local setting to override the changefeed setting + config.GetGlobalServerConfig().Sorter.SortDir = "/tmp/sorter_local" + + _ = NewUnifiedSorter("/tmp/sorter", /* the changefeed setting */ + "test-cf", + "test", + 0, + "0.0.0.0:0") + + poolMu.Lock() + defer poolMu.Unlock() + + c.Assert(pool, check.NotNil) + c.Assert(pool.dir, check.Equals, "/tmp/sorter_local") +} + +func (s *sorterSuite) TestSortDirConfigChangeFeed(c *check.C) { + defer testleak.AfterTest(c)() + defer UnifiedSorterCleanUp() + + poolMu.Lock() + // Clean up the back-end pool if one has been created + pool = nil + poolMu.Unlock() + + err := os.MkdirAll("/tmp/sorter", 0o755) + c.Assert(err, check.IsNil) + // We expect the changefeed setting to take effect + 
config.GetGlobalServerConfig().Sorter.SortDir = "" + + _ = NewUnifiedSorter("/tmp/sorter", /* the changefeed setting */ + "test-cf", + "test", + 0, + "0.0.0.0:0") + + poolMu.Lock() + defer poolMu.Unlock() + + c.Assert(pool, check.NotNil) + c.Assert(pool.dir, check.Equals, "/tmp/sorter") +} + +// TestSorterCancelRestart tests the situation where the Unified Sorter is repeatedly canceled and +// restarted. There should not be any problem, especially file corruptions. +func (s *sorterSuite) TestSorterCancelRestart(c *check.C) { + defer testleak.AfterTest(c)() + defer UnifiedSorterCleanUp() + + conf := config.GetDefaultServerConfig() + conf.Sorter = &config.SorterConfig{ + NumConcurrentWorker: 8, + ChunkSizeLimit: 1 * 1024 * 1024 * 1024, + MaxMemoryPressure: 0, // disable memory sort + MaxMemoryConsumption: 0, + NumWorkerPoolGoroutine: 4, + } + config.StoreGlobalServerConfig(conf) + + err := os.MkdirAll("/tmp/sorter", 0o755) + c.Assert(err, check.IsNil) + + // enable the failpoint to simulate delays + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/asyncFlushStartDelay", "sleep(100)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/puller/sorter/asyncFlushStartDelay") + }() + + // enable the failpoint to simulate delays + err = failpoint.Enable("github.com/pingcap/ticdc/cdc/puller/sorter/asyncFlushInProcessDelay", "1%sleep(1)") + c.Assert(err, check.IsNil) + defer func() { + _ = failpoint.Disable("github.com/pingcap/ticdc/cdc/puller/sorter/asyncFlushInProcessDelay") + }() + + for i := 0; i < 5; i++ { + sorter := NewUnifiedSorter("/tmp/sorter", "test-cf", "test", 0, "0.0.0.0:0") + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + testSorter(ctx, c, sorter, 100000000, true) + cancel() + } +} diff --git a/cdc/puller/sorter/unified_sorter.go b/cdc/puller/sorter/unified_sorter.go index f5ecd54856f..ac87373bc51 100644 --- a/cdc/puller/sorter/unified_sorter.go +++ b/cdc/puller/sorter/unified_sorter.go @@ -15,45 +15,99 @@ package sorter import ( "context" + "os" "sync" + "sync/atomic" + "time" "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc/model" "github.com/pingcap/ticdc/pkg/config" + cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/util" "golang.org/x/sync/errgroup" ) // UnifiedSorter provides both sorting in memory and in file. Memory pressure is used to determine which one to use. type UnifiedSorter struct { - inputCh chan *model.PolymorphicEvent - outputCh chan *model.PolymorphicEvent - dir string - pool *backEndPool - tableName string // used only for debugging and tracing + inputCh chan *model.PolymorphicEvent + outputCh chan *model.PolymorphicEvent + dir string + pool *backEndPool + metricsInfo *metricsInfo +} + +type metricsInfo struct { + changeFeedID model.ChangeFeedID + tableName string + tableID model.TableID + captureAddr string } type ctxKey struct { } +// UnifiedSorterCheckDir checks whether the directory needed exists and is writable. +// If it does not exist, we try to create one. +// parameter: cfSortDir - the directory designated in changefeed's setting, +// which will be overridden by a non-empty local setting of `sort-dir`. 
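+// For example (illustrative paths only): with cfSortDir = "/tmp/cdc_sort" and the server-level
+// `sort-dir` set to "/data/cdc/sort", the check runs against "/data/cdc/sort", creating it if it
+// does not exist.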
+// TODO better way to organize this function after we obsolete chanegfeed setting's `sort-dir` +func UnifiedSorterCheckDir(cfSortDir string) error { + dir := cfSortDir + sorterConfig := config.GetGlobalServerConfig().Sorter + if sorterConfig.SortDir != "" { + // Let the local setting override the changefeed setting + dir = sorterConfig.SortDir + } + + err := util.IsDirAndWritable(dir) + if err != nil { + if os.IsNotExist(errors.Cause(err)) { + err = os.MkdirAll(dir, 0o755) + if err != nil { + return errors.Annotate(cerror.WrapError(cerror.ErrProcessorSortDir, err), "create dir") + } + } else { + return errors.Annotate(cerror.WrapError(cerror.ErrProcessorSortDir, err), "sort dir check") + } + } + + return nil +} + // NewUnifiedSorter creates a new UnifiedSorter -func NewUnifiedSorter(dir string, tableName string, captureAddr string) *UnifiedSorter { +func NewUnifiedSorter( + dir string, + changeFeedID model.ChangeFeedID, + tableName string, + tableID model.TableID, + captureAddr string) *UnifiedSorter { poolMu.Lock() defer poolMu.Unlock() if pool == nil { + sorterConfig := config.GetGlobalServerConfig().Sorter + if sorterConfig.SortDir != "" { + // Let the local setting override the changefeed setting + dir = sorterConfig.SortDir + } pool = newBackEndPool(dir, captureAddr) } lazyInitWorkerPool() return &UnifiedSorter{ - inputCh: make(chan *model.PolymorphicEvent, 128000), - outputCh: make(chan *model.PolymorphicEvent, 128000), - dir: dir, - pool: pool, - tableName: tableName, + inputCh: make(chan *model.PolymorphicEvent, 128), + outputCh: make(chan *model.PolymorphicEvent, 128), + dir: dir, + pool: pool, + metricsInfo: &metricsInfo{ + changeFeedID: changeFeedID, + tableName: tableName, + tableID: tableID, + captureAddr: captureAddr, + }, } } @@ -78,12 +132,15 @@ func (s *UnifiedSorter) Run(ctx context.Context) error { finish := util.MonitorCancelLatency(ctx, "Unified Sorter") defer finish() - valueCtx := context.WithValue(ctx, ctxKey{}, s) + ctx = context.WithValue(ctx, ctxKey{}, s) + ctx = util.PutCaptureAddrInCtx(ctx, s.metricsInfo.captureAddr) + ctx = util.PutChangefeedIDInCtx(ctx, s.metricsInfo.changeFeedID) + ctx = util.PutTableInfoInCtx(ctx, s.metricsInfo.tableID, s.metricsInfo.tableName) - sorterConfig := config.GetSorterConfig() + sorterConfig := config.GetGlobalServerConfig().Sorter numConcurrentHeaps := sorterConfig.NumConcurrentWorker - errg, subctx := errgroup.WithContext(valueCtx) + errg, subctx := errgroup.WithContext(ctx) heapSorterCollectCh := make(chan *flushTask, 4096) // mergerCleanUp will consumer the remaining elements in heapSorterCollectCh to prevent any FD leak. 
defer mergerCleanUp(heapSorterCollectCh) @@ -93,15 +150,21 @@ func (s *UnifiedSorter) Run(ctx context.Context) error { heapSorterErrOnce := &sync.Once{} heapSorters := make([]*heapSorter, sorterConfig.NumConcurrentWorker) for i := range heapSorters { - finalI := i - heapSorters[finalI] = newHeapSorter(finalI, heapSorterCollectCh) - heapSorters[finalI].init(subctx, func(err error) { + heapSorters[i] = newHeapSorter(i, heapSorterCollectCh) + heapSorters[i].init(subctx, func(err error) { heapSorterErrOnce.Do(func() { heapSorterErrCh <- err }) }) } + ioCancelFunc := func() { + for _, heapSorter := range heapSorters { + // cancels async IO operations + heapSorter.canceller.Cancel() + } + } + errg.Go(func() error { defer func() { // cancelling the heapSorters from the outside @@ -112,18 +175,17 @@ func (s *UnifiedSorter) Run(ctx context.Context) error { close(heapSorterCollectCh) }() - for { - select { - case <-subctx.Done(): - return errors.Trace(subctx.Err()) - case err := <-heapSorterErrCh: - return errors.Trace(err) - } + select { + case <-subctx.Done(): + return errors.Trace(subctx.Err()) + case err := <-heapSorterErrCh: + return errors.Trace(err) } }) + var mergerBufLen int64 errg.Go(func() error { - return printError(runMerger(subctx, numConcurrentHeaps, heapSorterCollectCh, s.outputCh)) + return printError(runMerger(subctx, numConcurrentHeaps, heapSorterCollectCh, s.outputCh, ioCancelFunc, &mergerBufLen)) }) errg.Go(func() error { @@ -139,6 +201,15 @@ func (s *UnifiedSorter) Run(ctx context.Context) error { nextSorterID := 0 for { + // tentative value 1280000 + for atomic.LoadInt64(&mergerBufLen) > 1280000 { + after := time.After(1 * time.Second) + select { + case <-subctx.Done(): + return subctx.Err() + case <-after: + } + } select { case <-subctx.Done(): return subctx.Err() @@ -209,11 +280,3 @@ func RunWorkerPool(ctx context.Context) error { return errors.Trace(errg.Wait()) } - -// tableNameFromCtx is used for retrieving the table's name from a context within the Unified Sorter -func tableNameFromCtx(ctx context.Context) string { - if sorter, ok := ctx.Value(ctxKey{}).(*UnifiedSorter); ok { - return sorter.tableName - } - return "" -} diff --git a/cdc/server.go b/cdc/server.go index 4061dc584ba..0af202c7256 100644 --- a/cdc/server.go +++ b/cdc/server.go @@ -15,9 +15,7 @@ package cdc import ( "context" - "crypto/tls" "fmt" - "net" "net/http" "strings" "sync" @@ -30,7 +28,6 @@ import ( "github.com/pingcap/ticdc/pkg/config" cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/httputil" - "github.com/pingcap/ticdc/pkg/security" "github.com/pingcap/ticdc/pkg/util" "github.com/pingcap/ticdc/pkg/version" "github.com/prometheus/client_golang/prometheus" @@ -45,132 +42,10 @@ import ( const ( ownerRunInterval = time.Millisecond * 500 - - // DefaultCDCGCSafePointTTL is the default value of cdc gc safe-point ttl, specified in seconds. - DefaultCDCGCSafePointTTL = 24 * 60 * 60 ) -type options struct { - pdEndpoints string - credential *security.Credential - addr string - advertiseAddr string - gcTTL int64 - timezone *time.Location - ownerFlushInterval time.Duration - processorFlushInterval time.Duration -} - -func (o *options) validateAndAdjust() error { - if o.pdEndpoints == "" { - return cerror.ErrInvalidServerOption.GenWithStack("empty PD address") - } - if o.addr == "" { - return cerror.ErrInvalidServerOption.GenWithStack("empty address") - } - if o.advertiseAddr == "" { - o.advertiseAddr = o.addr - } - // Advertise address must be specified. 
- if idx := strings.LastIndex(o.advertiseAddr, ":"); idx >= 0 { - ip := net.ParseIP(o.advertiseAddr[:idx]) - // Skip nil as it could be a domain name. - if ip != nil && ip.IsUnspecified() { - return cerror.ErrInvalidServerOption.GenWithStack("advertise address must be specified as a valid IP") - } - } else { - return cerror.ErrInvalidServerOption.GenWithStack("advertise address or address does not contain a port") - } - if o.gcTTL == 0 { - return cerror.ErrInvalidServerOption.GenWithStack("empty GC TTL is not allowed") - } - var tlsConfig *tls.Config - if o.credential != nil { - var err error - tlsConfig, err = o.credential.ToTLSConfig() - if err != nil { - return errors.Annotate(err, "invalidate TLS config") - } - _, err = o.credential.ToGRPCDialOption() - if err != nil { - return errors.Annotate(err, "invalidate TLS config") - } - } - for _, ep := range strings.Split(o.pdEndpoints, ",") { - if tlsConfig != nil { - if strings.Index(ep, "http://") == 0 { - return cerror.ErrInvalidServerOption.GenWithStack("PD endpoint scheme should be https") - } - } else if strings.Index(ep, "http://") != 0 { - return cerror.ErrInvalidServerOption.GenWithStack("PD endpoint scheme should be http") - } - } - - return nil -} - -// PDEndpoints returns a ServerOption that sets the endpoints of PD for the server. -func PDEndpoints(s string) ServerOption { - return func(o *options) { - o.pdEndpoints = s - } -} - -// Address returns a ServerOption that sets the server listen address -func Address(s string) ServerOption { - return func(o *options) { - o.addr = s - } -} - -// AdvertiseAddress returns a ServerOption that sets the server advertise address -func AdvertiseAddress(s string) ServerOption { - return func(o *options) { - o.advertiseAddr = s - } -} - -// GCTTL returns a ServerOption that sets the gc ttl. -func GCTTL(t int64) ServerOption { - return func(o *options) { - o.gcTTL = t - } -} - -// Timezone returns a ServerOption that sets the timezone -func Timezone(tz *time.Location) ServerOption { - return func(o *options) { - o.timezone = tz - } -} - -// OwnerFlushInterval returns a ServerOption that sets the ownerFlushInterval -func OwnerFlushInterval(dur time.Duration) ServerOption { - return func(o *options) { - o.ownerFlushInterval = dur - } -} - -// ProcessorFlushInterval returns a ServerOption that sets the processorFlushInterval -func ProcessorFlushInterval(dur time.Duration) ServerOption { - return func(o *options) { - o.processorFlushInterval = dur - } -} - -// Credential returns a ServerOption that sets the TLS -func Credential(credential *security.Credential) ServerOption { - return func(o *options) { - o.credential = credential - } -} - -// A ServerOption sets options such as the addr of PD. -type ServerOption func(*options) - // Server is the capture server type Server struct { - opts options capture *Capture owner *Owner ownerLock sync.RWMutex @@ -180,39 +55,29 @@ type Server struct { } // NewServer creates a Server instance. 
-func NewServer(opt ...ServerOption) (*Server, error) { - opts := options{} - for _, o := range opt { - o(&opts) - } - if err := opts.validateAndAdjust(); err != nil { - return nil, err - } +func NewServer(pdEndpoints []string) (*Server, error) { + conf := config.GetGlobalServerConfig() log.Info("creating CDC server", - zap.String("pd-addr", opts.pdEndpoints), - zap.String("address", opts.addr), - zap.String("advertise-address", opts.advertiseAddr), - zap.Int64("gc-ttl", opts.gcTTL), - zap.Any("timezone", opts.timezone), - zap.Duration("owner-flush-interval", opts.ownerFlushInterval), - zap.Duration("processor-flush-interval", opts.processorFlushInterval), + zap.Strings("pd-addrs", pdEndpoints), + zap.Stringer("config", conf), ) s := &Server{ - opts: opts, + pdEndpoints: pdEndpoints, } return s, nil } // Run runs the server. func (s *Server) Run(ctx context.Context) error { - s.pdEndpoints = strings.Split(s.opts.pdEndpoints, ",") - grpcTLSOption, err := s.opts.credential.ToGRPCDialOption() + conf := config.GetGlobalServerConfig() + + grpcTLSOption, err := conf.Security.ToGRPCDialOption() if err != nil { return errors.Trace(err) } pdClient, err := pd.NewClientWithContext( - ctx, s.pdEndpoints, s.opts.credential.PDSecurityOption(), + ctx, s.pdEndpoints, conf.Security.PDSecurityOption(), pd.WithGRPCDialOptions( grpcTLSOption, grpc.WithBlock(), @@ -234,7 +99,7 @@ func (s *Server) Run(ctx context.Context) error { // To not block CDC server startup, we need to warn instead of error // when TiKV is incompatible. errorTiKVIncompatible := false - err = version.CheckClusterVersion(ctx, s.pdClient, s.pdEndpoints[0], s.opts.credential, errorTiKVIncompatible) + err = version.CheckClusterVersion(ctx, s.pdClient, s.pdEndpoints[0], conf.Security, errorTiKVIncompatible) if err != nil { return err } @@ -243,7 +108,7 @@ func (s *Server) Run(ctx context.Context) error { return err } - kvStore, err := kv.CreateTiStore(strings.Join(s.pdEndpoints, ","), s.opts.credential) + kvStore, err := kv.CreateTiStore(strings.Join(s.pdEndpoints, ","), conf.Security) if err != nil { return errors.Trace(err) } @@ -273,6 +138,7 @@ func (s *Server) campaignOwnerLoop(ctx context.Context) error { // In most failure cases, we don't return error directly, just run another // campaign loop. We treat campaign loop as a special background routine. 
+ conf := config.GetGlobalServerConfig() rl := rate.NewLimiter(0.05, 2) for { err := rl.Wait(ctx) @@ -296,7 +162,7 @@ func (s *Server) campaignOwnerLoop(ctx context.Context) error { } captureID := s.capture.info.ID log.Info("campaign owner successfully", zap.String("capture-id", captureID)) - owner, err := NewOwner(ctx, s.pdClient, s.opts.credential, s.capture.session, s.opts.gcTTL, s.opts.ownerFlushInterval) + owner, err := NewOwner(ctx, s.pdClient, conf.Security, s.capture.session, conf.GcTTL, time.Duration(conf.OwnerFlushInterval)) if err != nil { log.Warn("create new owner failed", zap.Error(err)) continue @@ -329,15 +195,16 @@ func (s *Server) campaignOwnerLoop(ctx context.Context) error { func (s *Server) etcdHealthChecker(ctx context.Context) error { ticker := time.NewTicker(time.Second * 3) defer ticker.Stop() + conf := config.GetGlobalServerConfig() - httpCli, err := httputil.NewClient(s.opts.credential) + httpCli, err := httputil.NewClient(conf.Security) if err != nil { return err } defer httpCli.CloseIdleConnections() metrics := make(map[string]prometheus.Observer) for _, pdEndpoint := range s.pdEndpoints { - metrics[pdEndpoint] = etcdHealthCheckDuration.WithLabelValues(s.opts.advertiseAddr, pdEndpoint) + metrics[pdEndpoint] = etcdHealthCheckDuration.WithLabelValues(conf.AdvertiseAddr, pdEndpoint) } for { @@ -368,11 +235,13 @@ func (s *Server) etcdHealthChecker(ctx context.Context) error { } func (s *Server) run(ctx context.Context) (err error) { - ctx = util.PutCaptureAddrInCtx(ctx, s.opts.advertiseAddr) - ctx = util.PutTimezoneInCtx(ctx, s.opts.timezone) + conf := config.GetGlobalServerConfig() - procOpts := &processorOpts{flushCheckpointInterval: s.opts.processorFlushInterval} - capture, err := NewCapture(ctx, s.pdEndpoints, s.pdClient, s.opts.credential, s.opts.advertiseAddr, procOpts) + opts := &captureOpts{ + flushCheckpointInterval: time.Duration(conf.ProcessorFlushInterval), + captureSessionTTL: conf.CaptureSessionTTL, + } + capture, err := NewCapture(ctx, s.pdEndpoints, s.pdClient, conf.Security, conf.AdvertiseAddr, opts) if err != nil { return err } diff --git a/cdc/server_test.go b/cdc/server_test.go index f4a909ced3d..f49fcc08475 100644 --- a/cdc/server_test.go +++ b/cdc/server_test.go @@ -16,7 +16,6 @@ package cdc import ( "context" "net/url" - "strings" "time" "github.com/pingcap/check" @@ -55,72 +54,20 @@ func (s *serverSuite) TearDownTest(c *check.C) { var _ = check.Suite(&serverSuite{}) -func (s *serverSuite) TestNewServer(c *check.C) { - defer testleak.AfterTest(c)() - defer s.TearDownTest(c) - svr, err := NewServer() - c.Assert(err, check.ErrorMatches, ".*empty PD address") - c.Assert(svr, check.IsNil) - - svr, err = NewServer(PDEndpoints("http://pd")) - c.Assert(err, check.ErrorMatches, ".*empty address") - c.Assert(svr, check.IsNil) - - svr, err = NewServer(PDEndpoints("http://pd"), Address("cdc:1234")) - c.Assert(err, check.ErrorMatches, ".*empty GC TTL is not allowed") - c.Assert(svr, check.IsNil) - - svr, err = NewServer(PDEndpoints("http://pd"), Address("cdc:1234"), GCTTL(DefaultCDCGCSafePointTTL)) - c.Assert(err, check.IsNil) - c.Assert(svr, check.NotNil) - c.Assert(svr.opts.advertiseAddr, check.Equals, "cdc:1234") - - svr, err = NewServer(PDEndpoints("http://pd"), Address("cdc:1234"), GCTTL(DefaultCDCGCSafePointTTL), - AdvertiseAddress("advertise:1234")) - c.Assert(err, check.IsNil) - c.Assert(svr, check.NotNil) - c.Assert(svr.opts.addr, check.Equals, "cdc:1234") - c.Assert(svr.opts.advertiseAddr, check.Equals, "advertise:1234") - - svr, err = 
NewServer(PDEndpoints("http://pd"), Address("0.0.0.0:1234"), GCTTL(DefaultCDCGCSafePointTTL), - AdvertiseAddress("advertise:1234")) - c.Assert(err, check.IsNil) - c.Assert(svr, check.NotNil) - c.Assert(svr.opts.addr, check.Equals, "0.0.0.0:1234") - c.Assert(svr.opts.advertiseAddr, check.Equals, "advertise:1234") - - svr, err = NewServer(PDEndpoints("http://pd"), Address("0.0.0.0:1234"), GCTTL(DefaultCDCGCSafePointTTL)) - c.Assert(err, check.ErrorMatches, ".*must be specified.*") - c.Assert(svr, check.IsNil) - - svr, err = NewServer(PDEndpoints("http://pd"), Address("cdc:1234"), GCTTL(DefaultCDCGCSafePointTTL), - AdvertiseAddress("0.0.0.0:1234")) - c.Assert(err, check.ErrorMatches, ".*must be specified.*") - c.Assert(svr, check.IsNil) - - svr, err = NewServer(PDEndpoints("http://pd"), Address("cdc:1234"), GCTTL(DefaultCDCGCSafePointTTL), - AdvertiseAddress("advertise")) - c.Assert(err, check.ErrorMatches, ".*does not contain a port") - c.Assert(svr, check.IsNil) -} - func (s *serverSuite) TestEtcdHealthChecker(c *check.C) { defer testleak.AfterTest(c)() defer s.TearDownTest(c) ctx, cancel := context.WithCancel(context.Background()) - pdEndpoints := strings.Join([]string{ + pdEndpoints := []string{ "http://" + s.clientURL.Host, "http://invalid-pd-host:2379", - }, ",") - server, err := NewServer( - PDEndpoints(pdEndpoints), Address("0.0.0.0:1234"), GCTTL(DefaultCDCGCSafePointTTL), - AdvertiseAddress("127.0.0.1:1234")) + } + server, err := NewServer(pdEndpoints) c.Assert(err, check.IsNil) c.Assert(server, check.NotNil) s.errg.Go(func() error { - server.pdEndpoints = strings.Split(server.opts.pdEndpoints, ",") err := server.etcdHealthChecker(ctx) c.Assert(err, check.Equals, context.Canceled) return nil diff --git a/cdc/sink/cdclog/s3.go b/cdc/sink/cdclog/s3.go index 632bc1eec31..4d3ec18811c 100644 --- a/cdc/sink/cdclog/s3.go +++ b/cdc/sink/cdclog/s3.go @@ -51,7 +51,7 @@ type tableBuffer struct { encoder codec.EventBatchEncoder uploadParts struct { - uploader storage.Uploader + writer storage.ExternalFileWriter uploadNum int byteSize int64 } @@ -132,15 +132,15 @@ func (tb *tableBuffer) flush(ctx context.Context, sink *logSink) error { // so, if this batch data size is greater than 5Mb or it has uploadPart already // we will use multi-upload this batch data if len(rowDatas) > 0 { - if hashPart.uploader == nil { - uploader, err := sink.storage().CreateUploader(ctx, newFileName) + if hashPart.writer == nil { + fileWriter, err := sink.storage().Create(ctx, newFileName) if err != nil { return cerror.WrapError(cerror.ErrS3SinkStorageAPI, err) } - hashPart.uploader = uploader + hashPart.writer = fileWriter } - err := hashPart.uploader.UploadPart(ctx, rowDatas) + _, err := hashPart.writer.Write(ctx, rowDatas) if err != nil { return cerror.WrapError(cerror.ErrS3SinkStorageAPI, err) } @@ -153,19 +153,19 @@ func (tb *tableBuffer) flush(ctx context.Context, sink *logSink) error { // we need do complete when total upload size is greater than 100Mb // or this part data is less than 5Mb to avoid meet EntityTooSmall error log.Info("[FlushRowChangedEvents] complete file", zap.Int64("tableID", tb.tableID)) - err := hashPart.uploader.CompleteUpload(ctx) + err := hashPart.writer.Close(ctx) if err != nil { return cerror.WrapError(cerror.ErrS3SinkStorageAPI, err) } hashPart.byteSize = 0 hashPart.uploadNum = 0 - hashPart.uploader = nil + hashPart.writer = nil tb.encoder = nil } } else { // generate normal file because S3 multi-upload need every part at least 5Mb. 
log.Info("[FlushRowChangedEvents] normal upload file", zap.Int64("tableID", tb.tableID)) - err := sink.storage().Write(ctx, newFileName, rowDatas) + err := sink.storage().WriteFile(ctx, newFileName, rowDatas) if err != nil { return cerror.WrapError(cerror.ErrS3SinkStorageAPI, err) } @@ -185,11 +185,11 @@ func newTableBuffer(tableID int64) logUnit { sendSize: atomic.NewInt64(0), sendEvents: atomic.NewInt64(0), uploadParts: struct { - uploader storage.Uploader + writer storage.ExternalFileWriter uploadNum int byteSize int64 }{ - uploader: nil, + writer: nil, uploadNum: 0, byteSize: 0, }, @@ -218,7 +218,7 @@ func (s *s3Sink) flushLogMeta(ctx context.Context) error { if err != nil { return cerror.WrapError(cerror.ErrMarshalFailed, err) } - return cerror.WrapError(cerror.ErrS3SinkWriteStorage, s.storage.Write(ctx, logMetaFile, data)) + return cerror.WrapError(cerror.ErrS3SinkWriteStorage, s.storage.WriteFile(ctx, logMetaFile, data)) } func (s *s3Sink) FlushRowChangedEvents(ctx context.Context, resolvedTs uint64) (uint64, error) { @@ -304,20 +304,20 @@ func (s *s3Sink) EmitDDLEvent(ctx context.Context, ddl *model.DDLEvent) error { // hack way: append data to old file log.Debug("[EmitDDLEvent] append ddl to origin log", zap.String("name", name), zap.Any("ddl", ddl)) - fileData, err = s.storage.Read(ctx, name) + fileData, err = s.storage.ReadFile(ctx, name) if err != nil { return cerror.WrapError(cerror.ErrS3SinkStorageAPI, err) } fileData = append(fileData, data...) } - return s.storage.Write(ctx, name, fileData) + return s.storage.WriteFile(ctx, name, fileData) } func (s *s3Sink) Initialize(ctx context.Context, tableInfo []*model.SimpleTableInfo) error { if tableInfo != nil { for _, table := range tableInfo { if table != nil { - err := s.storage.Write(ctx, makeTableDirectoryName(table.TableID), nil) + err := s.storage.WriteFile(ctx, makeTableDirectoryName(table.TableID), nil) if err != nil { return errors.Annotate( cerror.WrapError(cerror.ErrS3SinkStorageAPI, err), @@ -332,7 +332,7 @@ func (s *s3Sink) Initialize(ctx context.Context, tableInfo []*model.SimpleTableI if err != nil { return cerror.WrapError(cerror.ErrMarshalFailed, err) } - return s.storage.Write(ctx, logMetaFile, data) + return s.storage.WriteFile(ctx, logMetaFile, data) } return nil } diff --git a/cdc/sink/cdclog/utils.go b/cdc/sink/cdclog/utils.go index cf1c6d26ac4..52661293ad5 100644 --- a/cdc/sink/cdclog/utils.go +++ b/cdc/sink/cdclog/utils.go @@ -108,9 +108,9 @@ func (l *logSink) startFlush(ctx context.Context) error { uReplica := u eg.Go(func() error { log.Info("start Flush asynchronously to storage by caller", - zap.Int64("table id", u.TableID()), - zap.Int64("size", u.Size().Load()), - zap.Int64("event count", u.Events().Load()), + zap.Int64("table id", uReplica.TableID()), + zap.Int64("size", uReplica.Size().Load()), + zap.Int64("event count", uReplica.Events().Load()), ) return uReplica.flush(ectx, l) }) @@ -129,9 +129,9 @@ func (l *logSink) startFlush(ctx context.Context) error { if u.shouldFlush() { eg.Go(func() error { log.Info("start Flush asynchronously to storage", - zap.Int64("table id", u.TableID()), - zap.Int64("size", u.Size().Load()), - zap.Int64("event count", u.Events().Load()), + zap.Int64("table id", uReplica.TableID()), + zap.Int64("size", uReplica.Size().Load()), + zap.Int64("event count", uReplica.Events().Load()), ) return uReplica.flush(ectx, l) }) diff --git a/cdc/sink/manager.go b/cdc/sink/manager.go index 43041d4a80c..34d059fe5f9 100644 --- a/cdc/sink/manager.go +++ b/cdc/sink/manager.go @@ -21,13 
+21,11 @@ import ( "sync/atomic" "time" - "github.com/pingcap/ticdc/pkg/util" - - "github.com/pingcap/log" - "go.uber.org/zap" - "github.com/pingcap/errors" + "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc/model" + "github.com/pingcap/ticdc/pkg/util" + "go.uber.org/zap" ) const ( @@ -45,6 +43,8 @@ type Manager struct { checkpointTs model.Ts tableSinks map[model.TableID]*tableSink tableSinksMu sync.Mutex + + flushMu sync.Mutex } // NewManager creates a new Sink manager @@ -95,6 +95,8 @@ func (m *Manager) getMinEmittedTs() model.Ts { } func (m *Manager) flushBackendSink(ctx context.Context) (model.Ts, error) { + m.flushMu.Lock() + defer m.flushMu.Unlock() minEmittedTs := m.getMinEmittedTs() checkpointTs, err := m.backendSink.FlushRowChangedEvents(ctx, minEmittedTs) if err != nil { diff --git a/cdc/sink/manager_test.go b/cdc/sink/manager_test.go index c307ac7c93f..1bd12c34d98 100644 --- a/cdc/sink/manager_test.go +++ b/cdc/sink/manager_test.go @@ -142,7 +142,7 @@ func (s *managerSuite) TestManagerAddRemoveTable(c *check.C) { errCh := make(chan error, 16) manager := NewManager(ctx, &checkSink{C: c}, errCh, 0) defer manager.Close() - goroutineNum := 10 + goroutineNum := 100 var wg sync.WaitGroup const ExitSignal = uint64(math.MaxUint64) @@ -184,10 +184,10 @@ func (s *managerSuite) TestManagerAddRemoveTable(c *check.C) { go func() { defer wg.Done() // add three table and then remote one table - for i := 0; i < 10; i++ { + for i := 0; i < 200; i++ { if i%4 != 3 { // add table - table := manager.CreateTableSink(model.TableID(i), maxResolvedTs+1) + table := manager.CreateTableSink(model.TableID(i), maxResolvedTs) close := make(chan struct{}) tableSinks = append(tableSinks, table) closeChs = append(closeChs, close) @@ -202,7 +202,7 @@ func (s *managerSuite) TestManagerAddRemoveTable(c *check.C) { tableSinks = tableSinks[1:] closeChs = closeChs[1:] } - time.Sleep(100 * time.Millisecond) + time.Sleep(10 * time.Millisecond) } atomic.StoreUint64(&maxResolvedTs, ExitSignal) }() diff --git a/cdc/sink/mysql.go b/cdc/sink/mysql.go index 8ff972edc06..10008810de7 100644 --- a/cdc/sink/mysql.go +++ b/cdc/sink/mysql.go @@ -94,6 +94,7 @@ type mysqlSink struct { execWaitNotifier *notify.Notifier resolvedNotifier *notify.Notifier errCh chan error + flushSyncWg sync.WaitGroup statistics *Statistics @@ -644,12 +645,11 @@ func (s *mysqlSink) createSinkWorkers(ctx context.Context) error { } func (s *mysqlSink) notifyAndWaitExec(ctx context.Context) { + s.broadcastFinishTxn(ctx) s.execWaitNotifier.Notify() done := make(chan struct{}) go func() { - for _, w := range s.workers { - w.waitAllTxnsExecuted() - } + s.flushSyncWg.Wait() close(done) }() // This is a hack code to avoid io wait in some routine blocks others to exit. @@ -662,6 +662,15 @@ func (s *mysqlSink) notifyAndWaitExec(ctx context.Context) { } } +func (s *mysqlSink) broadcastFinishTxn(ctx context.Context) { + // Note all data txn is sent via channel, the control txn must come after all + // data txns in each worker. So after worker receives the control txn, it can + // flush txns immediately and call wait group done once. 
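+	// In effect each control txn acts as a per-worker flush barrier: the worker flushes all rows
+	// queued before the control txn and then calls FinishWg.Done, so waiting on flushSyncWg
+	// returns only after every worker has flushed.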
+ for _, worker := range s.workers { + worker.appendFinishTxn(&s.flushSyncWg) + } +} + func (s *mysqlSink) dispatchAndExecTxns(ctx context.Context, txnsGroup map[model.TableID][]*model.SingleTableTxn) { nWorkers := s.params.workerCount causality := newCausality() @@ -696,7 +705,6 @@ func (s *mysqlSink) dispatchAndExecTxns(ctx context.Context, txnsGroup map[model type mysqlSinkWorker struct { txnCh chan *model.SingleTableTxn - txnWg sync.WaitGroup maxTxnRow int bucket int execDMLs func(context.Context, []*model.RowChangedEvent, uint64, int) error @@ -722,22 +730,25 @@ func newMySQLSinkWorker( } } -func (w *mysqlSinkWorker) waitAllTxnsExecuted() { - w.txnWg.Wait() -} - func (w *mysqlSinkWorker) appendTxn(ctx context.Context, txn *model.SingleTableTxn) { if txn == nil { return } - w.txnWg.Add(1) select { case <-ctx.Done(): - w.txnWg.Done() case w.txnCh <- txn: } } +func (w *mysqlSinkWorker) appendFinishTxn(wg *sync.WaitGroup) { + // since worker will always fetch txns from txnCh, we don't need to worry the + // txnCh full and send is blocked. + wg.Add(1) + w.txnCh <- &model.SingleTableTxn{ + FinishWg: wg, + } +} + func (w *mysqlSinkWorker) run(ctx context.Context) (err error) { var ( toExecRows []*model.RowChangedEvent @@ -746,6 +757,20 @@ func (w *mysqlSinkWorker) run(ctx context.Context) (err error) { lastCommitTs uint64 ) + // mark FinishWg before worker exits, all data txns can be omitted. + defer func() { + for { + select { + case txn := <-w.txnCh: + if txn.FinishWg != nil { + txn.FinishWg.Done() + } + default: + return + } + } + }() + defer func() { if r := recover(); r != nil { buf := make([]byte, 4096) @@ -753,7 +778,6 @@ func (w *mysqlSinkWorker) run(ctx context.Context) (err error) { buf = buf[:stackSize] err = cerror.ErrMySQLWorkerPanic.GenWithStack("mysql sink concurrent execute panic, stack: %v", string(buf)) log.Error("mysql sink worker panic", zap.Reflect("r", r), zap.Stack("stack trace")) - w.txnWg.Add(-1 * txnNum) } }() @@ -765,14 +789,12 @@ func (w *mysqlSinkWorker) run(ctx context.Context) (err error) { copy(rows, toExecRows) err := w.execDMLs(ctx, rows, replicaID, w.bucket) if err != nil { - w.txnWg.Add(-1 * txnNum) txnNum = 0 return err } atomic.StoreUint64(&w.checkpointTs, lastCommitTs) toExecRows = toExecRows[:0] w.metricBucketSize.Add(float64(txnNum)) - w.txnWg.Add(-1 * txnNum) txnNum = 0 return nil } @@ -780,13 +802,21 @@ func (w *mysqlSinkWorker) run(ctx context.Context) (err error) { for { select { case <-ctx.Done(): - return errors.Trace(flushRows()) + return errors.Trace(ctx.Err()) case txn := <-w.txnCh: if txn == nil { return errors.Trace(flushRows()) } + if txn.FinishWg != nil { + if err := flushRows(); err != nil { + return errors.Trace(err) + } + txn.FinishWg.Done() + continue + } if txn.ReplicaID != replicaID || len(toExecRows)+len(txn.Rows) > w.maxTxnRow { if err := flushRows(); err != nil { + txnNum++ return errors.Trace(err) } } diff --git a/cdc/sink/mysql_test.go b/cdc/sink/mysql_test.go index 2bb98ca7801..92e63677643 100644 --- a/cdc/sink/mysql_test.go +++ b/cdc/sink/mysql_test.go @@ -205,12 +205,14 @@ func (s MySQLSinkSuite) TestMysqlSinkWorker(c *check.C) { for _, txn := range tc.txns { w.appendTxn(cctx, txn) } + var wg sync.WaitGroup + w.appendFinishTxn(&wg) // ensure all txns are fetched from txn channel in sink worker time.Sleep(time.Millisecond * 100) notifier.Notify() - w.waitAllTxnsExecuted() + wg.Wait() cancel() - c.Assert(errg.Wait(), check.IsNil) + c.Assert(errors.Cause(errg.Wait()), check.Equals, context.Canceled) c.Assert(outputRows, 
check.DeepEquals, tc.expectedOutputRows, check.Commentf("case %v, %s, %s", i, spew.Sdump(outputRows), spew.Sdump(tc.expectedOutputRows))) c.Assert(outputReplicaIDs, check.DeepEquals, tc.exportedOutputReplicaIDs, @@ -218,6 +220,71 @@ func (s MySQLSinkSuite) TestMysqlSinkWorker(c *check.C) { } } +func (s MySQLSinkSuite) TestMySQLSinkWorkerExitWithError(c *check.C) { + defer testleak.AfterTest(c)() + txns1 := []*model.SingleTableTxn{ + { + CommitTs: 1, + Rows: []*model.RowChangedEvent{{CommitTs: 1}}, + }, + { + CommitTs: 2, + Rows: []*model.RowChangedEvent{{CommitTs: 2}}, + }, + { + CommitTs: 3, + Rows: []*model.RowChangedEvent{{CommitTs: 3}}, + }, + { + CommitTs: 4, + Rows: []*model.RowChangedEvent{{CommitTs: 4}}, + }, + } + txns2 := []*model.SingleTableTxn{ + { + CommitTs: 5, + Rows: []*model.RowChangedEvent{{CommitTs: 5}}, + }, + { + CommitTs: 6, + Rows: []*model.RowChangedEvent{{CommitTs: 6}}, + }, + } + maxTxnRow := 1 + ctx := context.Background() + + errExecFailed := errors.New("sink worker exec failed") + notifier := new(notify.Notifier) + cctx, cancel := context.WithCancel(ctx) + receiver, err := notifier.NewReceiver(-1) + c.Assert(err, check.IsNil) + w := newMySQLSinkWorker(maxTxnRow, 1, /*bucket*/ + bucketSizeCounter.WithLabelValues("capture", "changefeed", "1"), + receiver, + func(ctx context.Context, events []*model.RowChangedEvent, replicaID uint64, bucket int) error { + return errExecFailed + }) + errg, cctx := errgroup.WithContext(cctx) + errg.Go(func() error { + return w.run(cctx) + }) + // txn in txns1 will be sent to worker txnCh + for _, txn := range txns1 { + w.appendTxn(cctx, txn) + } + var wg sync.WaitGroup + w.appendFinishTxn(&wg) + time.Sleep(time.Millisecond * 100) + // txn in txn2 will be blocked since the worker has exited + for _, txn := range txns2 { + w.appendTxn(cctx, txn) + } + notifier.Notify() + wg.Wait() + cancel() + c.Assert(errg.Wait(), check.Equals, errExecFailed) +} + func (s MySQLSinkSuite) TestPrepareDML(c *check.C) { defer testleak.AfterTest(c)() testCases := []struct { diff --git a/cmd/client.go b/cmd/client.go index 9c9a48fdd5d..62972e0cb97 100644 --- a/cmd/client.go +++ b/cmd/client.go @@ -59,6 +59,7 @@ var ( noConfirm bool sortEngine string sortDir string + timezone string cyclicReplicaID uint64 cyclicFilterReplicaIDs []uint @@ -149,9 +150,10 @@ func newCliCommand() *cobra.Command { } pdEndpoints := strings.Split(cliPdAddr, ",") - logConfig := etcdlogutil.DefaultZapLoggerConfig logConfig.Level = zap.NewAtomicLevelAt(zapcore.ErrorLevel) + + logHTTPProxies() etcdCli, err := clientv3.New(clientv3.Config{ Context: defaultContext, Endpoints: pdEndpoints, diff --git a/cmd/client_changefeed.go b/cmd/client_changefeed.go index aec0afa4845..118690cff82 100644 --- a/cmd/client_changefeed.go +++ b/cmd/client_changefeed.go @@ -32,14 +32,18 @@ import ( cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/security" "github.com/pingcap/ticdc/pkg/util" + "github.com/pingcap/ticdc/pkg/version" "github.com/pingcap/tidb/store/tikv/oracle" "github.com/r3labs/diff" "github.com/spf13/cobra" + "github.com/spf13/pflag" "go.uber.org/zap" ) const ( - defaultSortDir = "/tmp/cdc_sort" + // Use the empty string as the default to let the server local setting override the changefeed setting. + // TODO remove this when we change the changefeed `sort-dir` to no-op, which it currently is NOT. 
+ defaultSortDir = "" ) var forceEnableOldValueProtocols = []string{ @@ -67,6 +71,19 @@ func newChangefeedCommand() *cobra.Command { return command } +func resumeChangefeedCheck(ctx context.Context, cmd *cobra.Command) error { + resp, err := applyOwnerChangefeedQuery(ctx, changefeedID, getCredential()) + if err != nil { + return err + } + info := &cdc.ChangefeedResp{} + err = json.Unmarshal([]byte(resp), info) + if err != nil { + return err + } + return confirmLargeDataGap(ctx, cmd, info.TSO) +} + func newAdminChangefeedCommand() []*cobra.Command { cmds := []*cobra.Command{ { @@ -90,6 +107,9 @@ func newAdminChangefeedCommand() []*cobra.Command { CfID: changefeedID, Type: model.AdminResume, } + if err := resumeChangefeedCheck(ctx, cmd); err != nil { + return err + } return applyAdminChangefeed(ctx, job, getCredential()) }, }, @@ -116,6 +136,9 @@ func newAdminChangefeedCommand() []*cobra.Command { if cmd.Use == "remove" { cmd.PersistentFlags().BoolVarP(&optForceRemove, "force", "f", false, "remove all information of the changefeed") } + if cmd.Use == "resume" { + cmd.PersistentFlags().BoolVar(&noConfirm, "no-confirm", false, "Don't ask user whether to ignore ineligible table") + } } return cmds } @@ -220,7 +243,7 @@ func newQueryChangefeedCommand() *cobra.Command { return command } -func verifyChangefeedParamers(ctx context.Context, cmd *cobra.Command, isCreate bool, credential *security.Credential) (*model.ChangeFeedInfo, error) { +func verifyChangefeedParamers(ctx context.Context, cmd *cobra.Command, isCreate bool, credential *security.Credential, captureInfos []*model.CaptureInfo) (*model.ChangeFeedInfo, error) { if isCreate { if sinkURI == "" { return nil, errors.New("Creating chengfeed without a sink-uri") @@ -235,14 +258,29 @@ func verifyChangefeedParamers(ctx context.Context, cmd *cobra.Command, isCreate if err := verifyStartTs(ctx, startTs); err != nil { return nil, err } + if err := confirmLargeDataGap(ctx, cmd, startTs); err != nil { + return nil, err + } if err := verifyTargetTs(ctx, startTs, targetTs); err != nil { return nil, err } } - + cdcClusterVer, err := version.GetTiCDCClusterVersion(captureInfos) + if err != nil { + return nil, errors.Trace(err) + } cfg := config.GetDefaultReplicaConfig() + + sortEngineFlag := cmd.Flag("sort-engine") + if cdcClusterVer == version.TiCDCClusterVersion4_0 { + cfg.EnableOldValue = false + if !sortEngineFlag.Changed { + sortEngine = model.SortInMemory + } + log.Warn("The TiCDC cluster is built from 4.0-release branch, the old-value and unified-sorter are disabled by default.") + } if len(configFile) > 0 { - if err := strictDecodeFile(configFile, "cdc", cfg); err != nil { + if err := strictDecodeFile(configFile, "TiCDC changefeed", cfg); err != nil { return nil, err } } @@ -298,6 +336,11 @@ func verifyChangefeedParamers(ctx context.Context, cmd *cobra.Command, isCreate } } } + switch sortEngine { + case model.SortUnified, model.SortInMemory, model.SortInFile: + default: + return nil, errors.Errorf("Creating chengfeed with an invalid sort engine(%s), `%s`,`%s` and `%s` are optional.", sortEngine, model.SortUnified, model.SortInMemory, model.SortInFile) + } info := &model.ChangeFeedInfo{ SinkURI: sinkURI, Opts: make(map[string]string), @@ -305,11 +348,18 @@ func verifyChangefeedParamers(ctx context.Context, cmd *cobra.Command, isCreate StartTs: startTs, TargetTs: targetTs, Config: cfg, - Engine: model.SortEngine(sortEngine), + Engine: sortEngine, SortDir: sortDir, State: model.StateNormal, SyncPointEnabled: syncPointEnabled, SyncPointInterval: 
syncPointInterval, + CreatorVersion: version.ReleaseVersion, + } + + if info.SortDir != "" { + cmd.Printf("[WARN] --sort-dir is deprecated in changefeed settings. "+ + "Please use `cdc server --sort-dir` if possible. "+ + "If you wish to continue, make sure %s is writable ON EACH SERVER where you run TiCDC", info.SortDir) } if info.Engine != model.SortInMemory && (info.SortDir == ".") { @@ -390,7 +440,7 @@ func changefeedConfigVariables(command *cobra.Command) { command.PersistentFlags().StringVar(&sinkURI, "sink-uri", "", "sink uri") command.PersistentFlags().StringVar(&configFile, "config", "", "Path of the configuration file") command.PersistentFlags().StringSliceVar(&opts, "opts", nil, "Extra options, in the `key=value` format") - command.PersistentFlags().StringVar(&sortEngine, "sort-engine", "unified", "sort engine used for data sort") + command.PersistentFlags().StringVar(&sortEngine, "sort-engine", model.SortUnified, "sort engine used for data sort") command.PersistentFlags().StringVar(&sortDir, "sort-dir", defaultSortDir, "directory used for data sort") command.PersistentFlags().StringVar(&timezone, "tz", "SYSTEM", "timezone used when checking sink uri (changefeed timezone is determined by cdc server)") command.PersistentFlags().Uint64Var(&cyclicReplicaID, "cyclic-replica-id", 0, "(Expremental) Cyclic replication replica ID of changefeed") @@ -412,7 +462,11 @@ func newCreateChangefeedCommand() *cobra.Command { id = uuid.New().String() } - info, err := verifyChangefeedParamers(ctx, cmd, true /* isCreate */, getCredential()) + _, captureInfos, err := cdcEtcdCli.GetCaptures(ctx) + if err != nil { + return err + } + info, err := verifyChangefeedParamers(ctx, cmd, true /* isCreate */, getCredential(), captureInfos) if err != nil { return err } @@ -452,17 +506,62 @@ func newUpdateChangefeedCommand() *cobra.Command { if err != nil { return err } - - info, err := verifyChangefeedParamers(ctx, cmd, false /* isCreate */, getCredential()) + info, err := old.Clone() if err != nil { return err } - // Fix some fields that can't be updated. 
- info.CreateTime = old.CreateTime - info.AdminJobType = old.AdminJobType - info.StartTs = old.StartTs - info.ErrorHis = old.ErrorHis - info.Error = old.Error + + cmd.Flags().Visit(func(flag *pflag.Flag) { + switch flag.Name { + case "target-ts": + info.TargetTs = targetTs + case "sink-uri": + info.SinkURI = sinkURI + case "config": + cfg := info.Config + if err := strictDecodeFile(configFile, "TiCDC changefeed", cfg); err != nil { + log.Panic("decode config file error", zap.Error(err)) + } + case "opts": + for _, opt := range opts { + s := strings.SplitN(opt, "=", 2) + if len(s) <= 0 { + cmd.Printf("omit opt: %s", opt) + continue + } + + var key string + var value string + key = s[0] + if len(s) > 1 { + value = s[1] + } + info.Opts[key] = value + } + + case "sort-engine": + info.Engine = sortEngine + case "sort-dir": + info.SortDir = sortDir + case "cyclic-replica-id": + filter := make([]uint64, 0, len(cyclicFilterReplicaIDs)) + for _, id := range cyclicFilterReplicaIDs { + filter = append(filter, uint64(id)) + } + info.Config.Cyclic.FilterReplicaID = filter + case "cyclic-sync-ddl": + info.Config.Cyclic.SyncDDL = cyclicSyncDDL + case "sync-point": + info.SyncPointEnabled = syncPointEnabled + case "sync-interval": + info.SyncPointInterval = syncPointInterval + case "pd", "tz", "start-ts", "changefeed-id", "no-confirm": + // do nothing + default: + // use this default branch to prevent new added parameter is not added + log.Warn("unsupported flag, please report a bug", zap.String("flagName", flag.Name)) + } + }) resp, err := applyOwnerChangefeedQuery(ctx, changefeedID, getCredential()) // if no cdc owner exists, allow user to update changefeed config @@ -592,7 +691,7 @@ func newCreateChangefeedCyclicCommand() *cobra.Command { cfg := config.GetDefaultReplicaConfig() if len(configFile) > 0 { - if err := strictDecodeFile(configFile, "cdc", cfg); err != nil { + if err := strictDecodeFile(configFile, "TiCDC changefeed", cfg); err != nil { return err } } diff --git a/cmd/client_changefeed_test.go b/cmd/client_changefeed_test.go index 606146168de..be2bba2e7af 100644 --- a/cmd/client_changefeed_test.go +++ b/cmd/client_changefeed_test.go @@ -19,6 +19,7 @@ import ( "path/filepath" "github.com/pingcap/check" + "github.com/pingcap/ticdc/cdc/model" "github.com/pingcap/ticdc/pkg/util/testleak" "github.com/spf13/cobra" ) @@ -32,6 +33,7 @@ func (s *clientChangefeedSuite) TestVerifyChangefeedParams(c *check.C) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() cmd := &cobra.Command{} + changefeedConfigVariables(cmd) dir := c.MkDir() path := filepath.Join(dir, "config.toml") @@ -42,12 +44,18 @@ enable-old-value = false c.Assert(err, check.IsNil) sinkURI = "blackhole:///?protocol=maxwell" - info, err := verifyChangefeedParamers(ctx, cmd, false /* isCreate */, nil) + info, err := verifyChangefeedParamers(ctx, cmd, false /* isCreate */, nil, nil) c.Assert(err, check.IsNil) c.Assert(info.Config.EnableOldValue, check.IsTrue) c.Assert(info.SortDir, check.Equals, defaultSortDir) sinkURI = "" - _, err = verifyChangefeedParamers(ctx, cmd, true /* isCreate */, nil) + _, err = verifyChangefeedParamers(ctx, cmd, true /* isCreate */, nil, nil) c.Assert(err, check.NotNil) + + sinkURI = "blackhole:///" + info, err = verifyChangefeedParamers(ctx, cmd, false /* isCreate */, nil, []*model.CaptureInfo{{Version: "4.0.0"}}) + c.Assert(err, check.IsNil) + c.Assert(info.Config.EnableOldValue, check.IsFalse) + c.Assert(info.Engine, check.Equals, model.SortInMemory) } diff --git a/cmd/cmd_test.go 
b/cmd/cmd_test.go index ad3b3a36bf1..fab5dfa0eb2 100644 --- a/cmd/cmd_test.go +++ b/cmd/cmd_test.go @@ -14,15 +14,19 @@ package cmd import ( + "context" "io/ioutil" + "os" "path/filepath" "testing" + "github.com/pingcap/check" "github.com/pingcap/parser/model" "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/util/testleak" - - "github.com/pingcap/check" + "github.com/pingcap/tidb/store/tikv/oracle" + "github.com/spf13/cobra" + pd "github.com/tikv/pd/client" ) func TestSuite(t *testing.T) { check.TestingT(t) } @@ -100,66 +104,10 @@ polling-time = 5 }) } -func (s *decodeFileSuite) TestAndWriteExampleTOML(c *check.C) { +func (s *decodeFileSuite) TestAndWriteExampleReplicaTOML(c *check.C) { defer testleak.AfterTest(c)() - content := ` -# 指定配置文件中涉及的库名、表名是否为大小写敏感的 -# 该配置会同时影响 filter 和 sink 相关配置,默认为 true - -# Specify whether the schema name and table name in this configuration file are case sensitive -# This configuration will affect both filter and sink related configurations, the default is true -case-sensitive = true - -[filter] -# 忽略哪些 StartTs 的事务 -# Transactions with the following StartTs will be ignored -ignore-txn-start-ts = [1, 2] - -# 过滤器规则 -# 过滤规则语法:https://docs.pingcap.com/zh/tidb/stable/table-filter#%E8%A1%A8%E5%BA%93%E8%BF%87%E6%BB%A4%E8%AF%AD%E6%B3%95 -# The rules of the filter -# Filter rules syntax: https://docs.pingcap.com/tidb/stable/table-filter#syntax -rules = ['*.*', '!test.*'] - -[mounter] -# mounter 线程数 -# the thread number of the the mounter -worker-num = 16 - -[sink] -# 对于 MQ 类的 Sink,可以通过 dispatchers 配置 event 分发器 -# 分发器支持 default, ts, rowid, table 四种 -# For MQ Sinks, you can configure event distribution rules through dispatchers -# Dispatchers support default, ts, rowid and table -dispatchers = [ - {matcher = ['test1.*', 'test2.*'], dispatcher = "ts"}, - {matcher = ['test3.*', 'test4.*'], dispatcher = "rowid"}, -] -# 对于 MQ 类的 Sink,可以指定消息的协议格式 -# 协议目前支持 default, canal, avro 和 maxwell 四种,default 为 ticdc-open-protocol -# For MQ Sinks, you can configure the protocol of the messages sending to MQ -# Currently the protocol support default, canal, avro and maxwell. 
Default is ticdc-open-protocol -protocol = "default" - -[cyclic-replication] -# 是否开启环形复制 -# Whether to enable cyclic replication -enable = false -# 当前 CDC 的复制 ID -# The replica ID of this capture -replica-id = 1 -# 需要过滤掉的复制 ID -# The replica ID should be ignored -filter-replica-ids = [2,3] -# 是否同步 DDL -# Whether to replicate DDL -sync-ddl = true -` - err := ioutil.WriteFile("changefeed.toml", []byte(content), 0o644) - c.Assert(err, check.IsNil) - cfg := config.GetDefaultReplicaConfig() - err = strictDecodeFile("changefeed.toml", "cdc", &cfg) + err := strictDecodeFile("changefeed.toml", "cdc", &cfg) c.Assert(err, check.IsNil) c.Assert(cfg.CaseSensitive, check.IsTrue) @@ -185,6 +133,17 @@ sync-ddl = true }) } +func (s *decodeFileSuite) TestAndWriteExampleServerTOML(c *check.C) { + defer testleak.AfterTest(c)() + cfg := config.GetDefaultServerConfig() + err := strictDecodeFile("ticdc.toml", "cdc", &cfg) + c.Assert(err, check.IsNil) + defcfg := config.GetDefaultServerConfig() + defcfg.AdvertiseAddr = "127.0.0.1:8300" + defcfg.LogFile = "/tmp/ticdc/ticdc.log" + c.Assert(cfg, check.DeepEquals, defcfg) +} + func (s *decodeFileSuite) TestShouldReturnErrForUnknownCfgs(c *check.C) { defer testleak.AfterTest(c)() dir := c.MkDir() @@ -198,3 +157,64 @@ func (s *decodeFileSuite) TestShouldReturnErrForUnknownCfgs(c *check.C) { c.Assert(err, check.NotNil) c.Assert(err, check.ErrorMatches, ".*unknown config.*") } + +type mockPDClient struct { + pd.Client + ts uint64 +} + +func (m *mockPDClient) GetTS(ctx context.Context) (int64, int64, error) { + return oracle.ExtractPhysical(m.ts), 0, nil +} + +type commonUtilSuite struct{} + +var _ = check.Suite(&commonUtilSuite{}) + +func (s *commonUtilSuite) TestConfirmLargeDataGap(c *check.C) { + defer testleak.AfterTest(c)() + ctx := context.Background() + currentTs := uint64(423482306736160769) // 2021-03-11 17:59:57.547 + startTs := uint64(423450030227042420) // 2021-03-10 07:47:52.435 + pdCli = &mockPDClient{ts: currentTs} + cmd := &cobra.Command{} + + // check start ts more than 1 day before current ts, and type N when confirming + dir := c.MkDir() + path := filepath.Join(dir, "confirm.txt") + err := ioutil.WriteFile(path, []byte("n"), 0o644) + c.Assert(err, check.IsNil) + f, err := os.Open(path) + c.Assert(err, check.IsNil) + stdin := os.Stdin + os.Stdin = f + defer func() { + os.Stdin = stdin + }() + err = confirmLargeDataGap(ctx, cmd, startTs) + c.Assert(err, check.ErrorMatches, "abort changefeed create or resume") + + // check no confirm works + originNoConfirm := noConfirm + noConfirm = true + defer func() { + noConfirm = originNoConfirm + }() + err = confirmLargeDataGap(ctx, cmd, startTs) + c.Assert(err, check.IsNil) + noConfirm = false + + // check start ts more than 1 day before current ts, and type Y when confirming + err = ioutil.WriteFile(path, []byte("Y"), 0o644) + c.Assert(err, check.IsNil) + f, err = os.Open(path) + c.Assert(err, check.IsNil) + os.Stdin = f + err = confirmLargeDataGap(ctx, cmd, startTs) + c.Assert(err, check.IsNil) + + // check start ts does not exceed threshold + pdCli = &mockPDClient{ts: startTs} + err = confirmLargeDataGap(ctx, cmd, startTs) + c.Assert(err, check.IsNil) +} diff --git a/cmd/server.go b/cmd/server.go index 4563753d7d8..478c10ce06d 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -15,39 +15,29 @@ package cmd import ( "context" + "strings" "time" - "github.com/pingcap/ticdc/cdc/puller/sorter" - "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/ticdc/cdc" + 
"github.com/pingcap/ticdc/cdc/puller/sorter" "github.com/pingcap/ticdc/pkg/config" + cerror "github.com/pingcap/ticdc/pkg/errors" "github.com/pingcap/ticdc/pkg/logutil" "github.com/pingcap/ticdc/pkg/util" "github.com/pingcap/ticdc/pkg/version" ticonfig "github.com/pingcap/tidb/config" "github.com/spf13/cobra" + "github.com/spf13/pflag" "go.uber.org/zap" ) var ( - serverPdAddr string - address string - advertiseAddr string - timezone string - gcTTL int64 - logFile string - logLevel string - // variables for unified sorter - numConcurrentWorker int - chunkSizeLimit uint64 - maxMemoryPressure int - maxMemoryConsumption uint64 - numWorkerPoolGoroutine int - - ownerFlushInterval time.Duration - processorFlushInterval time.Duration + serverPdAddr string + serverConfigFilePath string + + serverConfig = config.GetDefaultServerConfig() serverCmd = &cobra.Command{ Use: "server", @@ -66,63 +56,62 @@ func patchTiDBConf() { func init() { patchTiDBConf() rootCmd.AddCommand(serverCmd) + initServerCmd(serverCmd) +} - serverCmd.Flags().StringVar(&serverPdAddr, "pd", "http://127.0.0.1:2379", "Set the PD endpoints to use. Use ',' to separate multiple PDs") - serverCmd.Flags().StringVar(&address, "addr", "127.0.0.1:8300", "Set the listening address") - serverCmd.Flags().StringVar(&advertiseAddr, "advertise-addr", "", "Set the advertise listening address for client communication") - serverCmd.Flags().StringVar(&timezone, "tz", "System", "Specify time zone of TiCDC cluster") - serverCmd.Flags().Int64Var(&gcTTL, "gc-ttl", cdc.DefaultCDCGCSafePointTTL, "CDC GC safepoint TTL duration, specified in seconds") - serverCmd.Flags().StringVar(&logFile, "log-file", "", "log file path") - serverCmd.Flags().StringVar(&logLevel, "log-level", "info", "log level (etc: debug|info|warn|error)") - serverCmd.Flags().DurationVar(&ownerFlushInterval, "owner-flush-interval", time.Millisecond*200, "owner flushes changefeed status interval") - serverCmd.Flags().DurationVar(&processorFlushInterval, "processor-flush-interval", time.Millisecond*100, "processor flushes task status interval") - - serverCmd.Flags().IntVar(&numWorkerPoolGoroutine, "sorter-num-workerpool-goroutine", 16, "sorter workerpool size") - serverCmd.Flags().IntVar(&numConcurrentWorker, "sorter-num-concurrent-worker", 4, "sorter concurrency level") - serverCmd.Flags().Uint64Var(&chunkSizeLimit, "sorter-chunk-size-limit", 1024*1024*1024, "size of heaps for sorting") +func initServerCmd(cmd *cobra.Command) { + defaultServerConfig := config.GetDefaultServerConfig() + cmd.Flags().StringVar(&serverPdAddr, "pd", "http://127.0.0.1:2379", "Set the PD endpoints to use. 
Use ',' to separate multiple PDs") + cmd.Flags().StringVar(&serverConfig.Addr, "addr", defaultServerConfig.Addr, "Set the listening address") + cmd.Flags().StringVar(&serverConfig.AdvertiseAddr, "advertise-addr", defaultServerConfig.AdvertiseAddr, "Set the advertise listening address for client communication") + cmd.Flags().StringVar(&serverConfig.TZ, "tz", defaultServerConfig.TZ, "Specify time zone of TiCDC cluster") + cmd.Flags().Int64Var(&serverConfig.GcTTL, "gc-ttl", defaultServerConfig.GcTTL, "CDC GC safepoint TTL duration, specified in seconds") + cmd.Flags().StringVar(&serverConfig.LogFile, "log-file", defaultServerConfig.LogFile, "log file path") + cmd.Flags().StringVar(&serverConfig.LogLevel, "log-level", defaultServerConfig.LogLevel, "log level (etc: debug|info|warn|error)") + cmd.Flags().DurationVar((*time.Duration)(&serverConfig.OwnerFlushInterval), "owner-flush-interval", time.Duration(defaultServerConfig.OwnerFlushInterval), "owner flushes changefeed status interval") + cmd.Flags().DurationVar((*time.Duration)(&serverConfig.ProcessorFlushInterval), "processor-flush-interval", time.Duration(defaultServerConfig.ProcessorFlushInterval), "processor flushes task status interval") + + cmd.Flags().IntVar(&serverConfig.Sorter.NumWorkerPoolGoroutine, "sorter-num-workerpool-goroutine", defaultServerConfig.Sorter.NumWorkerPoolGoroutine, "sorter workerpool size") + cmd.Flags().IntVar(&serverConfig.Sorter.NumConcurrentWorker, "sorter-num-concurrent-worker", defaultServerConfig.Sorter.NumConcurrentWorker, "sorter concurrency level") + cmd.Flags().Uint64Var(&serverConfig.Sorter.ChunkSizeLimit, "sorter-chunk-size-limit", defaultServerConfig.Sorter.ChunkSizeLimit, "size of heaps for sorting") // 80 is safe on most systems. - serverCmd.Flags().IntVar(&maxMemoryPressure, "sorter-max-memory-percentage", 80, "system memory usage threshold for forcing in-disk sort") + cmd.Flags().IntVar(&serverConfig.Sorter.MaxMemoryPressure, "sorter-max-memory-percentage", defaultServerConfig.Sorter.MaxMemoryPressure, "system memory usage threshold for forcing in-disk sort") // We use 8GB as a safe default before we support local configuration file. 
- serverCmd.Flags().Uint64Var(&maxMemoryConsumption, "sorter-max-memory-consumption", 8*1024*1024*1024, "maximum memory consumption of in-memory sort") + cmd.Flags().Uint64Var(&serverConfig.Sorter.MaxMemoryConsumption, "sorter-max-memory-consumption", defaultServerConfig.Sorter.MaxMemoryConsumption, "maximum memory consumption of in-memory sort") + cmd.Flags().StringVar(&serverConfig.Sorter.SortDir, "sort-dir", defaultServerConfig.Sorter.SortDir, "sorter's temporary file directory") + + addSecurityFlags(cmd.Flags(), true /* isServer */) - addSecurityFlags(serverCmd.Flags(), true /* isServer */) + cmd.Flags().StringVar(&serverConfigFilePath, "config", "", "Path of the configuration file") } func runEServer(cmd *cobra.Command, args []string) error { + conf, err := loadAndVerifyServerConfig(cmd) + if err != nil { + return errors.Trace(err) + } + cancel := initCmd(cmd, &logutil.Config{ - File: logFile, - Level: logLevel, + File: conf.LogFile, + Level: conf.LogLevel, }) defer cancel() - tz, err := util.GetTimezone(timezone) + tz, err := util.GetTimezone(conf.TZ) if err != nil { return errors.Annotate(err, "can not load timezone, Please specify the time zone through environment variable `TZ` or command line parameters `--tz`") } - - config.SetSorterConfig(&config.SorterConfig{ - NumConcurrentWorker: numConcurrentWorker, - ChunkSizeLimit: chunkSizeLimit, - MaxMemoryPressure: maxMemoryPressure, - MaxMemoryConsumption: maxMemoryConsumption, - NumWorkerPoolGoroutine: numWorkerPoolGoroutine, - }) + config.StoreGlobalServerConfig(conf) + ctx := util.PutTimezoneInCtx(defaultContext, tz) + ctx = util.PutCaptureAddrInCtx(ctx, conf.AdvertiseAddr) version.LogVersionInfo() - opts := []cdc.ServerOption{ - cdc.PDEndpoints(serverPdAddr), - cdc.Address(address), - cdc.AdvertiseAddress(advertiseAddr), - cdc.GCTTL(gcTTL), - cdc.Timezone(tz), - cdc.Credential(getCredential()), - cdc.OwnerFlushInterval(ownerFlushInterval), - cdc.ProcessorFlushInterval(processorFlushInterval), - } - server, err := cdc.NewServer(opts...) 
+ + logHTTPProxies() + server, err := cdc.NewServer(strings.Split(serverPdAddr, ",")) if err != nil { return errors.Annotate(err, "new server") } - err = server.Run(defaultContext) + err = server.Run(ctx) if err != nil && errors.Cause(err) != context.Canceled { log.Error("run server", zap.String("error", errors.ErrorStack(err))) return errors.Annotate(err, "run server") @@ -133,3 +122,75 @@ func runEServer(cmd *cobra.Command, args []string) error { return nil } + +func loadAndVerifyServerConfig(cmd *cobra.Command) (*config.ServerConfig, error) { + serverConfig.Security = getCredential() + + conf := config.GetDefaultServerConfig() + if len(serverConfigFilePath) > 0 { + if err := strictDecodeFile(serverConfigFilePath, "TiCDC server", conf); err != nil { + return nil, err + } + } + cmd.Flags().Visit(func(flag *pflag.Flag) { + switch flag.Name { + case "addr": + conf.Addr = serverConfig.Addr + case "advertise-addr": + conf.AdvertiseAddr = serverConfig.AdvertiseAddr + case "tz": + conf.TZ = serverConfig.TZ + case "gc-ttl": + conf.GcTTL = serverConfig.GcTTL + case "log-file": + conf.LogFile = serverConfig.LogFile + case "log-level": + conf.LogLevel = serverConfig.LogLevel + case "owner-flush-interval": + conf.OwnerFlushInterval = serverConfig.OwnerFlushInterval + case "processor-flush-interval": + conf.ProcessorFlushInterval = serverConfig.ProcessorFlushInterval + case "sorter-num-workerpool-goroutine": + conf.Sorter.NumWorkerPoolGoroutine = serverConfig.Sorter.NumWorkerPoolGoroutine + case "sorter-num-concurrent-worker": + conf.Sorter.NumConcurrentWorker = serverConfig.Sorter.NumConcurrentWorker + case "sorter-chunk-size-limit": + conf.Sorter.ChunkSizeLimit = serverConfig.Sorter.ChunkSizeLimit + case "sorter-max-memory-percentage": + conf.Sorter.MaxMemoryPressure = serverConfig.Sorter.MaxMemoryPressure + case "sorter-max-memory-consumption": + conf.Sorter.MaxMemoryConsumption = serverConfig.Sorter.MaxMemoryConsumption + case "ca": + conf.Security.CAPath = serverConfig.Security.CAPath + case "cert": + conf.Security.CertPath = serverConfig.Security.CertPath + case "key": + conf.Security.KeyPath = serverConfig.Security.KeyPath + case "cert-allowed-cn": + conf.Security.CertAllowedCN = serverConfig.Security.CertAllowedCN + case "sort-dir": + conf.Sorter.SortDir = serverConfig.Sorter.SortDir + case "pd", "config": + // do nothing + default: + log.Panic("unknown flag, please report a bug", zap.String("flagName", flag.Name)) + } + }) + if err := conf.ValidateAndAdjust(); err != nil { + return nil, errors.Trace(err) + } + if len(serverPdAddr) == 0 { + return nil, cerror.ErrInvalidServerOption.GenWithStack("empty PD address") + } + for _, ep := range strings.Split(serverPdAddr, ",") { + if conf.Security.IsTLSEnabled() { + if strings.Index(ep, "http://") == 0 { + return nil, cerror.ErrInvalidServerOption.GenWithStack("PD endpoint scheme should be https") + } + } else if strings.Index(ep, "http://") != 0 { + return nil, cerror.ErrInvalidServerOption.GenWithStack("PD endpoint scheme should be http") + } + } + + return conf, nil +} diff --git a/cmd/server_test.go b/cmd/server_test.go index e2593a35584..5dca436e276 100644 --- a/cmd/server_test.go +++ b/cmd/server_test.go @@ -14,9 +14,15 @@ package cmd import ( + "io/ioutil" + "path/filepath" + "time" + "github.com/pingcap/check" + "github.com/pingcap/ticdc/pkg/config" "github.com/pingcap/ticdc/pkg/util/testleak" - "github.com/pingcap/tidb/config" + ticonfig "github.com/pingcap/tidb/config" + "github.com/spf13/cobra" ) type serverSuite struct{} @@ -26,6 
+32,196 @@ var _ = check.Suite(&serverSuite{}) func (s *serverSuite) TestPatchTiDBConf(c *check.C) { defer testleak.AfterTest(c)() patchTiDBConf() - cfg := config.GetGlobalConfig() + cfg := ticonfig.GetGlobalConfig() c.Assert(cfg.TiKVClient.MaxBatchSize, check.Equals, uint(0)) } + +func (s *serverSuite) TestLoadAndVerifyServerConfig(c *check.C) { + defer testleak.AfterTest(c)() + // test default flag values + cmd := new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{}), check.IsNil) + cfg, err := loadAndVerifyServerConfig(cmd) + c.Assert(err, check.IsNil) + defcfg := config.GetDefaultServerConfig() + c.Assert(defcfg.ValidateAndAdjust(), check.IsNil) + c.Assert(cfg, check.DeepEquals, defcfg) + c.Assert(serverPdAddr, check.Equals, "http://127.0.0.1:2379") + + // test empty PD address + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{"--pd="}), check.IsNil) + _, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.ErrorMatches, ".*empty PD address.*") + + // test invalid PD address + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{"--pd=aa"}), check.IsNil) + _, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.ErrorMatches, ".*PD endpoint scheme should be http.*") + + // test undefined flag + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{"--PD="}), check.ErrorMatches, ".*unknown flag: --PD.*") + _, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.IsNil) + + // test flags without config file + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{ + "--addr", "127.5.5.1:8833", + "--advertise-addr", "127.5.5.1:7777", + "--log-file", "/root/cdc.log", + "--log-level", "debug", + "--gc-ttl", "10", + "--tz", "UTC", + "--owner-flush-interval", "150ms", + "--processor-flush-interval", "150ms", + "--cert", "bb", + "--key", "cc", + "--cert-allowed-cn", "dd,ee", + "--sorter-chunk-size-limit", "50000000", + "--sorter-max-memory-consumption", "60000", + "--sorter-max-memory-percentage", "70", + "--sorter-num-concurrent-worker", "80", + "--sorter-num-workerpool-goroutine", "90", + "--sort-dir", "/tmp/just_a_test", + }), check.IsNil) + cfg, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.IsNil) + c.Assert(cfg, check.DeepEquals, &config.ServerConfig{ + Addr: "127.5.5.1:8833", + AdvertiseAddr: "127.5.5.1:7777", + LogFile: "/root/cdc.log", + LogLevel: "debug", + GcTTL: 10, + TZ: "UTC", + CaptureSessionTTL: 10, + OwnerFlushInterval: config.TomlDuration(150 * time.Millisecond), + ProcessorFlushInterval: config.TomlDuration(150 * time.Millisecond), + Sorter: &config.SorterConfig{ + NumConcurrentWorker: 80, + ChunkSizeLimit: 50000000, + MaxMemoryPressure: 70, + MaxMemoryConsumption: 60000, + NumWorkerPoolGoroutine: 90, + SortDir: "/tmp/just_a_test", + }, + Security: &config.SecurityConfig{ + CertPath: "bb", + KeyPath: "cc", + CertAllowedCN: []string{"dd", "ee"}, + }, + }) + + // test decode config file + tmpDir := c.MkDir() + configPath := filepath.Join(tmpDir, "ticdc.toml") + configContent := ` +addr = "128.0.0.1:1234" +advertise-addr = "127.0.0.1:1111" + +log-file = "/root/cdc1.log" +log-level = "warn" + +gc-ttl = 500 +tz = "US" +capture-session-ttl = 10 + +owner-flush-interval = "600ms" +processor-flush-interval = "600ms" + +[sorter] +chunk-size-limit = 10000000 +max-memory-consumption = 2000000 +max-memory-percentage = 3 +num-concurrent-worker = 4 +num-workerpool-goroutine = 5 +sort-dir = "/tmp/just_a_test" +` + err = 
ioutil.WriteFile(configPath, []byte(configContent), 0o644) + c.Assert(err, check.IsNil) + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{"--config", configPath}), check.IsNil) + cfg, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.IsNil) + c.Assert(cfg, check.DeepEquals, &config.ServerConfig{ + Addr: "128.0.0.1:1234", + AdvertiseAddr: "127.0.0.1:1111", + LogFile: "/root/cdc1.log", + LogLevel: "warn", + GcTTL: 500, + TZ: "US", + CaptureSessionTTL: 10, + OwnerFlushInterval: config.TomlDuration(600 * time.Millisecond), + ProcessorFlushInterval: config.TomlDuration(600 * time.Millisecond), + Sorter: &config.SorterConfig{ + NumConcurrentWorker: 4, + ChunkSizeLimit: 10000000, + MaxMemoryPressure: 3, + MaxMemoryConsumption: 2000000, + NumWorkerPoolGoroutine: 5, + SortDir: "/tmp/just_a_test", + }, + Security: &config.SecurityConfig{}, + }) + + configContent = configContent + ` +[security] +ca-path = "aa" +cert-path = "bb" +key-path = "cc" +cert-allowed-cn = ["dd","ee"] +` + err = ioutil.WriteFile(configPath, []byte(configContent), 0o644) + c.Assert(err, check.IsNil) + cmd = new(cobra.Command) + initServerCmd(cmd) + c.Assert(cmd.ParseFlags([]string{ + "--addr", "127.5.5.1:8833", + "--log-file", "/root/cdc.log", + "--log-level", "debug", + "--gc-ttl", "10", + "--tz", "UTC", + "--owner-flush-interval", "150ms", + "--processor-flush-interval", "150ms", + "--ca", "", + "--sorter-chunk-size-limit", "50000000", + "--sorter-max-memory-consumption", "60000000", + "--sorter-max-memory-percentage", "70", + "--sorter-num-concurrent-worker", "3", + "--config", configPath, + }), check.IsNil) + cfg, err = loadAndVerifyServerConfig(cmd) + c.Assert(err, check.IsNil) + c.Assert(cfg, check.DeepEquals, &config.ServerConfig{ + Addr: "127.5.5.1:8833", + AdvertiseAddr: "127.0.0.1:1111", + LogFile: "/root/cdc.log", + LogLevel: "debug", + GcTTL: 10, + TZ: "UTC", + CaptureSessionTTL: 10, + OwnerFlushInterval: config.TomlDuration(150 * time.Millisecond), + ProcessorFlushInterval: config.TomlDuration(150 * time.Millisecond), + Sorter: &config.SorterConfig{ + NumConcurrentWorker: 3, + ChunkSizeLimit: 50000000, + MaxMemoryPressure: 70, + MaxMemoryConsumption: 60000000, + NumWorkerPoolGoroutine: 5, + SortDir: "/tmp/just_a_test", + }, + Security: &config.SecurityConfig{ + CertPath: "bb", + KeyPath: "cc", + CertAllowedCN: []string{"dd", "ee"}, + }, + }) +} diff --git a/cmd/test.go b/cmd/test.go index e7d4cf695db..1c0a341154e 100644 --- a/cmd/test.go +++ b/cmd/test.go @@ -19,6 +19,7 @@ import ( "strings" "github.com/pingcap/ticdc/cdc/kv" + "github.com/pingcap/tidb/store/tikv" "github.com/spf13/cobra" pd "github.com/tikv/pd/client" ) @@ -63,8 +64,10 @@ var testKVCmd = &cobra.Command{ return } + tikvStorage := storage.(tikv.Storage) // we know it is tikv. 
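+ // Note: the kv test helpers below now take the concrete tikv.Storage alongside the generic storage handle.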
+ t := new(testingT) - kv.TestGetKVSimple(t, cli, storage) - kv.TestSplit(t, cli, storage) + kv.TestGetKVSimple(t, cli, tikvStorage, storage) + kv.TestSplit(t, cli, tikvStorage, storage) }, } diff --git a/cmd/ticdc.toml b/cmd/ticdc.toml new file mode 100644 index 00000000000..24e6dc06932 --- /dev/null +++ b/cmd/ticdc.toml @@ -0,0 +1,30 @@ + +# TiCDC Server 监听的地址,默认:127.0.0.1:8300 +# the listening address of TiCDC server, default: 127.0.0.1:8300 +addr = "127.0.0.1:8300" + +# 用于客户端建立连接的对外公开的地址,这个地址会被注册到 PD, 默认:和 addr 相同 +# the advertised listening address for client communication, this address will be registered with the PD, default: same with the addr item +advertise-addr = "127.0.0.1:8300" + +# 日志文件路径 +# log file path +log-file = "/tmp/ticdc/ticdc.log" + +# 日志级别 (debug|info|warn|error) 默认:"info" +# log level (debug|info|warn|error) default: "info" +log-level = "info" + +# CDC GC safepoint TTL,以秒为单位,默认:86400(24H) +# CDC GC safepoint TTL duration, specified in seconds, default: 86400(24H) +# gc-ttl = 86400 + +# TiCDC 集群的时区,默认: "System" +# the time zone of TiCDC cluster, default: "System" +# tz = "System" + +[security] +# ca-path = "" +# cert-path = "" +# key-path = "" +# cert-allowed-cn = ["cn1","cn2"] diff --git a/cmd/util.go b/cmd/util.go index 49b72c2dc75..e8e516ace11 100644 --- a/cmd/util.go +++ b/cmd/util.go @@ -24,6 +24,9 @@ import ( "os/signal" "strings" "syscall" + "time" + + "golang.org/x/net/http/httpproxy" "github.com/BurntSushi/toml" "github.com/pingcap/errors" @@ -39,6 +42,7 @@ import ( "github.com/pingcap/ticdc/pkg/logutil" "github.com/pingcap/ticdc/pkg/security" "github.com/pingcap/ticdc/pkg/util" + "github.com/pingcap/tidb/store/tikv/oracle" "github.com/spf13/cobra" "github.com/spf13/pflag" "go.etcd.io/etcd/clientv3/concurrency" @@ -54,6 +58,8 @@ var ( var errOwnerNotFound = liberrors.New("owner not found") +var tsGapWarnning int64 = 86400 * 1000 // 1 day in milliseconds + func addSecurityFlags(flags *pflag.FlagSet, isServer bool) { flags.StringVar(&caPath, "ca", "", "CA certificate path for TLS connection") flags.StringVar(&certPath, "cert", "", "Certificate path for TLS connection") @@ -85,7 +91,7 @@ func initCmd(cmd *cobra.Command, logCfg *logutil.Config) context.CancelFunc { cmd.Printf("init logger error %v\n", errors.ErrorStack(err)) os.Exit(1) } - log.Info("init log", zap.String("file", logFile), zap.String("level", logCfg.Level)) + log.Info("init log", zap.String("file", logCfg.File), zap.String("level", logCfg.Level)) sc := make(chan os.Signal, 1) signal.Notify(sc, @@ -311,3 +317,52 @@ func strictDecodeFile(path, component string, cfg interface{}) error { return errors.Trace(err) } + +// logHTTPProxies logs HTTP proxy relative environment variables. +func logHTTPProxies() { + fields := proxyFields() + if len(fields) > 0 { + log.Info("using proxy config", fields...) 
+ } +} + +func proxyFields() []zap.Field { + proxyCfg := httpproxy.FromEnvironment() + fields := make([]zap.Field, 0, 3) + if proxyCfg.HTTPProxy != "" { + fields = append(fields, zap.String("http_proxy", proxyCfg.HTTPProxy)) + } + if proxyCfg.HTTPSProxy != "" { + fields = append(fields, zap.String("https_proxy", proxyCfg.HTTPSProxy)) + } + if proxyCfg.NoProxy != "" { + fields = append(fields, zap.String("no_proxy", proxyCfg.NoProxy)) + } + return fields +} + +func confirmLargeDataGap(ctx context.Context, cmd *cobra.Command, startTs uint64) error { + if noConfirm { + return nil + } + currentPhysical, _, err := pdCli.GetTS(ctx) + if err != nil { + return err + } + tsGap := currentPhysical - oracle.ExtractPhysical(startTs) + if tsGap > tsGapWarnning { + cmd.Printf("Replicate lag (%s) is larger than 1 days, "+ + "large data may cause OOM, confirm to continue at your own risk [Y/N]\n", + time.Duration(tsGap)*time.Millisecond, + ) + var yOrN string + _, err := fmt.Scan(&yOrN) + if err != nil { + return err + } + if strings.ToLower(strings.TrimSpace(yOrN)) != "y" { + return errors.NewNoStackError("abort changefeed create or resume") + } + } + return nil +} diff --git a/cmd/util_test.go b/cmd/util_test.go new file mode 100644 index 00000000000..703ecb6819d --- /dev/null +++ b/cmd/util_test.go @@ -0,0 +1,57 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "os" + + "github.com/pingcap/check" + "github.com/pingcap/ticdc/pkg/util/testleak" +) + +type utilsSuite struct{} + +var _ = check.Suite(&utilsSuite{}) + +func (s *utilsSuite) TestProxyFields(c *check.C) { + defer testleak.AfterTest(c)() + revIndex := map[string]int{ + "http_proxy": 0, + "https_proxy": 1, + "no_proxy": 2, + } + envs := [...]string{"http_proxy", "https_proxy", "no_proxy"} + envPreset := [...]string{"http://127.0.0.1:8080", "https://127.0.0.1:8443", "localhost,127.0.0.1"} + + // Exhaust all combinations of those environment variables' selection. + // Each bit of the mask decided whether this index of `envs` would be set. 
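+ // For example, mask 0b101 sets http_proxy (bit 0) and no_proxy (bit 2) while leaving https_proxy unset, so only those two proxies should be reported by proxyFields() in that iteration.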
+ for mask := 0; mask <= 0b111; mask++ { + for _, env := range envs { + c.Assert(os.Unsetenv(env), check.IsNil) + } + + for i := 0; i < 3; i++ { + if (1< 0", + "expr": "max(ticdc_processor_checkpoint_ts{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture,changefeed) > 0", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "changefeed checkpoint", + "legendFormat": "checkpoint-{{capture}}--{{changefeed}}", "refId": "B" } ], @@ -1057,7 +1057,7 @@ "description": "Internal resolved ts of captured tables", "fontSize": "100%", "gridPos": { - "h": 5, + "h": 10, "w": 10, "x": 14, "y": 2 @@ -1106,20 +1106,20 @@ ], "targets": [ { - "expr": "max(ticdc_processor_table_resolved_ts{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture,table)", + "expr": "max(ticdc_processor_table_resolved_ts{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture,changefeed,table)", "format": "time_series", "instant": true, "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}-{{table}}", + "legendFormat": "{{capture}}-{{changefeed}}-{{table}}", "refId": "A" }, { - "expr": "max(ticdc_processor_checkpoint_ts{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture) > 0", + "expr": "max(ticdc_processor_checkpoint_ts{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture,changefeed,table) > 0", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "changefeed checkpoint", + "legendFormat": "checkpoint-{{changefeed}}", "refId": "B" } ], @@ -1129,6 +1129,79 @@ "transform": "timeseries_aggregations", "type": "table" }, + { + "columns": [ + { + "text": "Current", + "value": "current" + } + ], + "datasource": "${DS_TEST-CLUSTER}", + "description": "The number of replicated tables maintained in owner", + "fontSize": "100%", + "gridPos": { + "h": 5, + "w": 7, + "x": 0, + "y": 7 + }, + "id": 138, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": null, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(ticdc_owner_maintain_table_num{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\",type=\"total\"}) by (capture)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-total", + "refId": "A" + }, + { + "expr": "sum(ticdc_owner_maintain_table_num{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\",type=\"wip\"}) by (capture)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{capture}}-wip", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Table count maintained by owner", + "transform": "timeseries_aggregations", + "type": "table" + }, { "aliasColors": {}, "bars": true, @@ -1142,7 +1215,7 @@ "h": 7, "w": 9, "x": 0, - "y": 7 + "y": 12 }, "id": 86, "legend": { @@ -1248,7 +1321,7 @@ "h": 7, "w": 8, "x": 9, - "y": 7 + "y": 12 }, "hiddenSeries": 
false, "id": 102, @@ -1344,7 +1417,7 @@ "h": 7, "w": 7, "x": 17, - "y": 7 + "y": 12 }, "id": 82, "legend": { @@ -1433,7 +1506,7 @@ "h": 7, "w": 12, "x": 0, - "y": 14 + "y": 19 }, "hiddenSeries": false, "id": 3, @@ -1528,7 +1601,7 @@ "h": 7, "w": 12, "x": 12, - "y": 14 + "y": 19 }, "hiddenSeries": false, "id": 2, @@ -1563,11 +1636,11 @@ "steppedLine": false, "targets": [ { - "expr": "sum(ticdc_processor_resolved_ts_lag{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture)", + "expr": "sum(ticdc_processor_resolved_ts_lag{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}) by (capture,changefeed)", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "{{capture}}", + "legendFormat": "{{capture}}--{{changefeed}}", "refId": "A" } ], @@ -1633,7 +1706,7 @@ "h": 7, "w": 12, "x": 0, - "y": 21 + "y": 26 }, "heatmap": {}, "hideZeroBuckets": true, @@ -1704,7 +1777,7 @@ "h": 7, "w": 12, "x": 12, - "y": 21 + "y": 26 }, "hiddenSeries": false, "id": 35, @@ -1812,7 +1885,7 @@ "h": 7, "w": 12, "x": 0, - "y": 28 + "y": 33 }, "hiddenSeries": false, "id": 34, @@ -1915,7 +1988,7 @@ "h": 7, "w": 12, "x": 12, - "y": 28 + "y": 33 }, "hiddenSeries": false, "id": 36, @@ -2032,7 +2105,7 @@ "h": 7, "w": 12, "x": 0, - "y": 35 + "y": 40 }, "heatmap": {}, "hideZeroBuckets": true, @@ -2101,7 +2174,7 @@ "h": 7, "w": 12, "x": 12, - "y": 35 + "y": 40 }, "hiddenSeries": false, "id": 98, @@ -2217,7 +2290,7 @@ "h": 7, "w": 8, "x": 0, - "y": 42 + "y": 47 }, "heatmap": {}, "hideZeroBuckets": true, @@ -2287,7 +2360,7 @@ "h": 7, "w": 8, "x": 8, - "y": 42 + "y": 47 }, "hiddenSeries": false, "id": 83, @@ -2397,7 +2470,7 @@ "h": 7, "w": 8, "x": 16, - "y": 42 + "y": 47 }, "hiddenSeries": false, "id": 95, @@ -2550,7 +2623,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 3 }, @@ -2647,7 +2720,7 @@ "gridPos": { "h": 7, "w": 8, - "x": 6, + "x": 8, "y": 3 }, "hiddenSeries": false, @@ -2747,8 +2820,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 10, - "x": 14, + "w": 8, + "x": 16, "y": 3 }, "hiddenSeries": false, @@ -2862,7 +2935,7 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, + "w": 8, "x": 0, "y": 10 }, @@ -2947,100 +3020,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of puller received events from kv client per second\n", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 6, - "x": 6, - "y": 10 - }, - "hiddenSeries": false, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "paceLength": 10, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum (rate(ticdc_puller_kv_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture, type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{capture}} - {{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Puller receive events/s", - "tooltip": { - "shared": true, 
- "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -3052,8 +3031,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 12, + "w": 8, + "x": 8, "y": 10 }, "hiddenSeries": false, @@ -3146,8 +3125,8 @@ "fillGradient": 0, "gridPos": { "h": 7, - "w": 6, - "x": 18, + "w": 8, + "x": 16, "y": 10 }, "hiddenSeries": false, @@ -4260,17 +4239,17 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_kvclient_send_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (capture, type)", + "expr": "sum(rate(ticdc_kvclient_send_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\"}[1m])) by (capture, changefeed, type)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}-{{type}}", + "legendFormat": "{{capture}}-{{changefeed}}-{{type}}", "refId": "A" }, { - "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (capture, table)", + "expr": "sum(rate(ticdc_kvclient_batch_resolved_event_size_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\",capture=~\"$capture\"}[1m])) by (capture, changefeed, table)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{capture}}-batch-resolved", + "legendFormat": "{{capture}}-{{changefeed}}-batch-resolved", "refId": "B" } ], @@ -4442,9 +4421,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_consume_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture)", + "expr": "sum(rate(ticdc_sorter_consume_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture,changefeed)", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{capture}}-{{changefeed}}", "refId": "A" } ], @@ -4526,9 +4506,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(ticdc_sorter_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture)", + "expr": "sum(rate(ticdc_sorter_event_count{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}[1m])) by (capture,changefeed)", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{capture}}-{{changefeed}}", "refId": "A" } ], @@ -4613,6 +4594,7 @@ "expr": "sum(ticdc_sorter_on_disk_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{capture}}", "refId": "A" } ], @@ -4697,6 +4679,7 @@ "expr": "sum(ticdc_sorter_in_memory_data_size_gauge{tidb_cluster=\"$tidb_cluster\", capture=~\"$capture\"}) by (capture)", "format": "time_series", "intervalFactor": 1, + "legendFormat": "{{capture}}", "refId": "A" } ], @@ -4870,90 +4853,6 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "fill": 1, - "gridPos": { - "h": 8, - "w": 12, - "x": 
12, - "y": 28 - }, - "id": 137, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "min(ticdc_sorter_resolved_ts_gauge{tidb_cluster=\"$tidb_cluster\", changefeed=~\"$changefeed\", capture=~\"$capture\"}) by (capture)", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Unified Sorter resolved ts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "dateTimeAsIso", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } } ], "title": "Unified Sorter", @@ -4981,7 +4880,7 @@ "h": 7, "w": 12, "x": 0, - "y": 4 + "y": 5 }, "id": 60, "legend": { @@ -5074,7 +4973,7 @@ "h": 7, "w": 12, "x": 12, - "y": 4 + "y": 5 }, "id": 62, "legend": { @@ -5185,7 +5084,7 @@ "h": 7, "w": 7, "x": 0, - "y": 11 + "y": 12 }, "hideTimeOverride": true, "id": 64, @@ -5267,7 +5166,7 @@ "h": 7, "w": 5, "x": 7, - "y": 11 + "y": 12 }, "hideTimeOverride": true, "id": 66, @@ -5318,7 +5217,7 @@ "h": 7, "w": 12, "x": 12, - "y": 11 + "y": 12 }, "id": 70, "legend": { @@ -5420,7 +5319,7 @@ "h": 7, "w": 12, "x": 0, - "y": 18 + "y": 19 }, "heatmap": {}, "hideZeroBuckets": true, @@ -5490,9 +5389,9 @@ "fill": 1, "gridPos": { "h": 7, - "w": 12, + "w": 6, "x": 12, - "y": 18 + "y": 19 }, "id": 72, "legend": { @@ -5503,7 +5402,7 @@ "hideZero": true, "max": true, "min": false, - "rightSide": true, + "rightSide": false, "show": true, "sideWidth": null, "sort": "current", @@ -5574,6 +5473,115 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The number of incremental scan task in different status.", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 19 + }, + "id": 140, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*ongoing/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (type, instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - {{type}}", + "refId": "A" + }, + { + "expr": "sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"total\"}) by (instance) - 
sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"finish\"}) by (instance) - sum(tikv_cdc_scan_tasks{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"abort\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}} - ongoing", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Initial scan tasks status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -5590,7 +5598,7 @@ "h": 7, "w": 12, "x": 0, - "y": 25 + "y": 26 }, "id": 74, "legend": { @@ -5698,7 +5706,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The memory usage per TiKV instance", + "description": "The speed of TiKV CDC incremental scan", "editable": true, "error": false, "fill": 0, @@ -5707,13 +5715,15 @@ "h": 7, "w": 12, "x": 12, - "y": 25 + "y": 26 }, "id": 76, "legend": { "alignAsTable": true, "avg": false, "current": true, + "hideEmpty": true, + "hideZero": true, "max": true, "min": false, "rightSide": true, @@ -5739,9 +5749,9 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_cdc_min_resolved_ts{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}) by (instance)", + "expr": "sum(rate(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}[30s])) by (instance)", "format": "time_series", - "hide": true, + "hide": false, "intervalFactor": 2, "legendFormat": "tikv-{{instance}}", "refId": "A", @@ -5752,7 +5762,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CDC pending bytes in memory", + "title": "CDC scan speed", "tooltip": { "msResolution": false, "shared": true, @@ -5773,7 +5783,7 @@ "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -5806,7 +5816,7 @@ "h": 7, "w": 12, "x": 0, - "y": 32 + "y": 33 }, "id": 78, "legend": { @@ -5842,9 +5852,18 @@ "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "tikv-{{instance}}", + "legendFormat": "tikv-{{instance}}-total", "refId": "A", "step": 10 + }, + { + "expr": "sum(tikv_cdc_region_resolve_status{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}) by (instance, status)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "tikv-{{instance}}-{{status}}", + "refId": "B", + "step": 10 } ], "thresholds": [], @@ -5888,6 +5907,107 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The total bytes of TiKV CDC incremental scan", + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 139, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + 
"show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "paceLength": 10, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(tikv_cdc_scan_bytes_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", job=\"tikv\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "tikv-{{instance}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CDC total scan bytes", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "TiKV", @@ -5902,25 +6022,22 @@ "list": [ { "allValue": null, - "current": { - }, + "current": {}, "datasource": "${DS_TEST-CLUSTER}", + "definition": "", "hide": 2, "includeAll": false, "label": "tidb_cluster", "multi": false, "name": "tidb_cluster", - "options": [ - - ], + "options": [], "query": "label_values(ticdc_processor_resolved_ts, tidb_cluster)", "refresh": 2, "regex": "", + "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [ - - ], + "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -6078,5 +6195,5 @@ "timezone": "browser", "title": "Test-Cluster-TiCDC", "uid": "YiGL8hBZ1", - "version": 13 + "version": 14 } diff --git a/pkg/config/config.go b/pkg/config/config.go index 4ce7f2aaebd..2d34d7820cd 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -16,11 +16,16 @@ package config import ( "encoding/json" "fmt" + "net" + "strings" + "sync/atomic" + "time" "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/ticdc/pkg/config/outdated" cerror "github.com/pingcap/ticdc/pkg/errors" + "github.com/pingcap/ticdc/pkg/security" "go.uber.org/zap" ) @@ -129,3 +134,198 @@ func (c *replicaConfig) fillFromV1(v1 *outdated.ReplicaConfigV1) { func GetDefaultReplicaConfig() *ReplicaConfig { return defaultReplicaConfig.Clone() } + +// SecurityConfig represents security config for server +type SecurityConfig = security.Credential + +var defaultServerConfig = &ServerConfig{ + Addr: "127.0.0.1:8300", + AdvertiseAddr: "", + LogFile: "", + LogLevel: "info", + GcTTL: 24 * 60 * 60, // 24H + TZ: "System", + // The default election-timeout in PD is 3s and minimum session TTL is 5s, + // which is calculated by `math.Ceil(3 * election-timeout / 2)`, we choose + // default capture session ttl to 10s to increase robust to PD jitter, + // however it will decrease RTO when single TiCDC node error happens. 
+ CaptureSessionTTL: 10, + OwnerFlushInterval: TomlDuration(200 * time.Millisecond), + ProcessorFlushInterval: TomlDuration(100 * time.Millisecond), + Sorter: &SorterConfig{ + NumConcurrentWorker: 4, + ChunkSizeLimit: 1024 * 1024 * 1024, // 1GB + MaxMemoryPressure: 80, + MaxMemoryConsumption: 8 * 1024 * 1024 * 1024, // 8GB + NumWorkerPoolGoroutine: 16, + SortDir: "/tmp/cdc_sort", + }, + Security: &SecurityConfig{}, +} + +// ServerConfig represents a config for server +type ServerConfig struct { + Addr string `toml:"addr" json:"addr"` + AdvertiseAddr string `toml:"advertise-addr" json:"advertise-addr"` + + LogFile string `toml:"log-file" json:"log-file"` + LogLevel string `toml:"log-level" json:"log-level"` + + GcTTL int64 `toml:"gc-ttl" json:"gc-ttl"` + TZ string `toml:"tz" json:"tz"` + + CaptureSessionTTL int `toml:"capture-session-ttl" json:"capture-session-ttl"` + + OwnerFlushInterval TomlDuration `toml:"owner-flush-interval" json:"owner-flush-interval"` + ProcessorFlushInterval TomlDuration `toml:"processor-flush-interval" json:"processor-flush-interval"` + + Sorter *SorterConfig `toml:"sorter" json:"sorter"` + Security *SecurityConfig `toml:"security" json:"security"` +} + +// Marshal returns the json marshal format of a ServerConfig +func (c *ServerConfig) Marshal() (string, error) { + cfg, err := json.Marshal(c) + if err != nil { + return "", cerror.WrapError(cerror.ErrEncodeFailed, errors.Annotatef(err, "Unmarshal data: %v", c)) + } + return string(cfg), nil +} + +// Unmarshal unmarshals into *ServerConfig from json marshal byte slice +func (c *ServerConfig) Unmarshal(data []byte) error { + err := json.Unmarshal(data, c) + if err != nil { + return cerror.WrapError(cerror.ErrDecodeFailed, err) + } + return nil +} + +// String implements the Stringer interface +func (c *ServerConfig) String() string { + s, _ := c.Marshal() + return s +} + +// Clone clones a replication +func (c *ServerConfig) Clone() *ServerConfig { + str, err := c.Marshal() + if err != nil { + log.Panic("failed to marshal replica config", + zap.Error(cerror.WrapError(cerror.ErrDecodeFailed, err))) + } + clone := new(ServerConfig) + err = clone.Unmarshal([]byte(str)) + if err != nil { + log.Panic("failed to unmarshal replica config", + zap.Error(cerror.WrapError(cerror.ErrDecodeFailed, err))) + } + return clone +} + +// ValidateAndAdjust validates and adjusts the server configuration +func (c *ServerConfig) ValidateAndAdjust() error { + if c.Addr == "" { + return cerror.ErrInvalidServerOption.GenWithStack("empty address") + } + if c.AdvertiseAddr == "" { + c.AdvertiseAddr = c.Addr + } + // Advertise address must be specified. + if idx := strings.LastIndex(c.AdvertiseAddr, ":"); idx >= 0 { + ip := net.ParseIP(c.AdvertiseAddr[:idx]) + // Skip nil as it could be a domain name. 
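+ // For example, "0.0.0.0:8300" is rejected here, while a host name such as "cdc-host:8300" parses to a nil IP and passes.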
+ if ip != nil && ip.IsUnspecified() { + return cerror.ErrInvalidServerOption.GenWithStack("advertise address must be specified as a valid IP") + } + } else { + return cerror.ErrInvalidServerOption.GenWithStack("advertise address or address does not contain a port") + } + if c.GcTTL == 0 { + return cerror.ErrInvalidServerOption.GenWithStack("empty GC TTL is not allowed") + } + // 5s is minimum lease ttl in etcd(PD) + if c.CaptureSessionTTL < 5 { + log.Warn("capture session ttl too small, set to default value 10s") + c.CaptureSessionTTL = 10 + } + + if c.Security != nil && c.Security.IsTLSEnabled() { + var err error + _, err = c.Security.ToTLSConfig() + if err != nil { + return errors.Annotate(err, "invalidate TLS config") + } + _, err = c.Security.ToGRPCDialOption() + if err != nil { + return errors.Annotate(err, "invalidate TLS config") + } + } + + if c.Sorter == nil { + c.Sorter = defaultServerConfig.Sorter + } + + if c.Sorter.ChunkSizeLimit < 1*1024*1024 { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("chunk-size-limit should be at least 1MB") + } + if c.Sorter.NumConcurrentWorker < 1 { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("num-concurrent-worker should be at least 1") + } + if c.Sorter.NumWorkerPoolGoroutine > 4096 { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("num-workerpool-goroutine should be at most 4096") + } + if c.Sorter.NumConcurrentWorker > c.Sorter.NumWorkerPoolGoroutine { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("num-concurrent-worker larger than num-workerpool-goroutine is useless") + } + if c.Sorter.NumWorkerPoolGoroutine < 1 { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("num-workerpool-goroutine should be at least 1, larger than 8 is recommended") + } + if c.Sorter.MaxMemoryPressure < 0 || c.Sorter.MaxMemoryPressure > 100 { + return cerror.ErrIllegalUnifiedSorterParameter.GenWithStackByArgs("max-memory-percentage should be a percentage") + } + + return nil +} + +// GetDefaultServerConfig returns the default server config +func GetDefaultServerConfig() *ServerConfig { + return defaultServerConfig.Clone() +} + +var globalServerConfig atomic.Value + +// GetGlobalServerConfig returns the global configuration for this server. +// It should store configuration from command line and configuration file. +// Other parts of the system can read the global configuration use this function. +func GetGlobalServerConfig() *ServerConfig { + return globalServerConfig.Load().(*ServerConfig) +} + +// StoreGlobalServerConfig stores a new config to the globalServerConfig. It mostly uses in the test to avoid some data races. 
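+// Outside tests the server command stores the validated configuration here once during startup.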
+func StoreGlobalServerConfig(config *ServerConfig) { + globalServerConfig.Store(config) +} + +// TomlDuration is a duration with a custom json decoder and toml decoder +type TomlDuration time.Duration + +// UnmarshalText is the toml decoder +func (d *TomlDuration) UnmarshalText(text []byte) error { + stdDuration, err := time.ParseDuration(string(text)) + if err != nil { + return err + } + *d = TomlDuration(stdDuration) + return nil +} + +// UnmarshalJSON is the json decoder +func (d *TomlDuration) UnmarshalJSON(b []byte) error { + var stdDuration time.Duration + if err := json.Unmarshal(b, &stdDuration); err != nil { + return err + } + *d = TomlDuration(stdDuration) + return nil +} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go new file mode 100644 index 00000000000..6f407abe44b --- /dev/null +++ b/pkg/config/config_test.go @@ -0,0 +1,127 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package config + +import ( + "testing" + + "github.com/pingcap/check" + "github.com/pingcap/ticdc/pkg/util/testleak" +) + +func Test(t *testing.T) { check.TestingT(t) } + +type replicaConfigSuite struct{} + +var _ = check.Suite(&replicaConfigSuite{}) + +func (s *replicaConfigSuite) TestMarshal(c *check.C) { + defer testleak.AfterTest(c)() + conf := GetDefaultReplicaConfig() + conf.CaseSensitive = false + conf.ForceReplicate = true + conf.Filter.Rules = []string{"1.1"} + conf.Mounter.WorkerNum = 3 + b, err := conf.Marshal() + c.Assert(err, check.IsNil) + c.Assert(b, check.Equals, `{"case-sensitive":false,"enable-old-value":true,"force-replicate":true,"check-gc-safe-point":true,"filter":{"rules":["1.1"],"ignore-txn-start-ts":null,"ddl-allow-list":null},"mounter":{"worker-num":3},"sink":{"dispatchers":null,"protocol":"default"},"cyclic-replication":{"enable":false,"replica-id":0,"filter-replica-ids":null,"id-buckets":0,"sync-ddl":false},"scheduler":{"type":"table-number","polling-time":-1}}`) + conf2 := new(ReplicaConfig) + err = conf2.Unmarshal([]byte(`{"case-sensitive":false,"enable-old-value":true,"force-replicate":true,"check-gc-safe-point":true,"filter":{"rules":["1.1"],"ignore-txn-start-ts":null,"ddl-allow-list":null},"mounter":{"worker-num":3},"sink":{"dispatchers":null,"protocol":"default"},"cyclic-replication":{"enable":false,"replica-id":0,"filter-replica-ids":null,"id-buckets":0,"sync-ddl":false},"scheduler":{"type":"table-number","polling-time":-1}}`)) + c.Assert(err, check.IsNil) + c.Assert(conf2, check.DeepEquals, conf) +} + +func (s *replicaConfigSuite) TestClone(c *check.C) { + defer testleak.AfterTest(c)() + conf := GetDefaultReplicaConfig() + conf.CaseSensitive = false + conf.ForceReplicate = true + conf.Filter.Rules = []string{"1.1"} + conf.Mounter.WorkerNum = 3 + conf2 := conf.Clone() + c.Assert(conf2, check.DeepEquals, conf) + conf2.Mounter.WorkerNum = 4 + c.Assert(conf.Mounter.WorkerNum, check.Equals, 3) +} + +func (s *replicaConfigSuite) TestOutDated(c *check.C) { + defer testleak.AfterTest(c)() + conf2 := new(ReplicaConfig) + err := 
conf2.Unmarshal([]byte(`{"case-sensitive":false,"enable-old-value":true,"force-replicate":true,"check-gc-safe-point":true,"filter":{"rules":["1.1"],"ignore-txn-start-ts":null,"ddl-allow-list":null},"mounter":{"worker-num":3},"sink":{"dispatch-rules":[{"db-name":"a","tbl-name":"b","rule":"r1"},{"db-name":"a","tbl-name":"c","rule":"r2"},{"db-name":"a","tbl-name":"d","rule":"r2"}],"protocol":"default"},"cyclic-replication":{"enable":false,"replica-id":0,"filter-replica-ids":null,"id-buckets":0,"sync-ddl":false},"scheduler":{"type":"table-number","polling-time":-1}}`)) + c.Assert(err, check.IsNil) + + conf := GetDefaultReplicaConfig() + conf.CaseSensitive = false + conf.ForceReplicate = true + conf.Filter.Rules = []string{"1.1"} + conf.Mounter.WorkerNum = 3 + conf.Sink.DispatchRules = []*DispatchRule{ + {Matcher: []string{"a.b"}, Dispatcher: "r1"}, + {Matcher: []string{"a.c"}, Dispatcher: "r2"}, + {Matcher: []string{"a.d"}, Dispatcher: "r2"}, + } + c.Assert(conf2, check.DeepEquals, conf) +} + +type serverConfigSuite struct{} + +var _ = check.Suite(&serverConfigSuite{}) + +func (s *serverConfigSuite) TestMarshal(c *check.C) { + defer testleak.AfterTest(c)() + conf := GetDefaultServerConfig() + conf.Addr = "192.155.22.33:8887" + conf.Sorter.ChunkSizeLimit = 999 + b, err := conf.Marshal() + c.Assert(err, check.IsNil) + + c.Assert(b, check.Equals, `{"addr":"192.155.22.33:8887","advertise-addr":"","log-file":"","log-level":"info","gc-ttl":86400,"tz":"System","capture-session-ttl":10,"owner-flush-interval":200000000,"processor-flush-interval":100000000,"sorter":{"num-concurrent-worker":4,"chunk-size-limit":999,"max-memory-percentage":80,"max-memory-consumption":8589934592,"num-workerpool-goroutine":16,"sort-dir":"/tmp/cdc_sort"},"security":{"ca-path":"","cert-path":"","key-path":"","cert-allowed-cn":null}}`) + conf2 := new(ServerConfig) + err = conf2.Unmarshal([]byte(`{"addr":"192.155.22.33:8887","advertise-addr":"","log-file":"","log-level":"info","gc-ttl":86400,"tz":"System","capture-session-ttl":10,"owner-flush-interval":200000000,"processor-flush-interval":100000000,"sorter":{"num-concurrent-worker":4,"chunk-size-limit":999,"max-memory-percentage":80,"max-memory-consumption":8589934592,"num-workerpool-goroutine":16,"sort-dir":"/tmp/cdc_sort"},"security":{"ca-path":"","cert-path":"","key-path":"","cert-allowed-cn":null}}`)) + c.Assert(err, check.IsNil) + c.Assert(conf2, check.DeepEquals, conf) +} + +func (s *serverConfigSuite) TestClone(c *check.C) { + defer testleak.AfterTest(c)() + conf := GetDefaultServerConfig() + conf.Addr = "192.155.22.33:8887" + conf.Sorter.ChunkSizeLimit = 999 + conf2 := conf.Clone() + c.Assert(conf2, check.DeepEquals, conf) + conf.Sorter.ChunkSizeLimit = 99 + c.Assert(conf.Sorter.ChunkSizeLimit, check.Equals, uint64(99)) +} + +func (s *serverConfigSuite) TestValidateAndAdjust(c *check.C) { + defer testleak.AfterTest(c)() + conf := new(ServerConfig) + + c.Assert(conf.ValidateAndAdjust(), check.ErrorMatches, ".*empty address") + conf.Addr = "cdc:1234" + c.Assert(conf.ValidateAndAdjust(), check.ErrorMatches, ".*empty GC TTL is not allowed") + conf.GcTTL = 60 + c.Assert(conf.ValidateAndAdjust(), check.IsNil) + c.Assert(conf.AdvertiseAddr, check.Equals, conf.Addr) + conf.AdvertiseAddr = "advertise:1234" + c.Assert(conf.ValidateAndAdjust(), check.IsNil) + c.Assert(conf.Addr, check.Equals, "cdc:1234") + c.Assert(conf.AdvertiseAddr, check.Equals, "advertise:1234") + conf.AdvertiseAddr = "0.0.0.0:1234" + c.Assert(conf.ValidateAndAdjust(), check.ErrorMatches, ".*must be 
specified.*") + conf.Addr = "0.0.0.0:1234" + c.Assert(conf.ValidateAndAdjust(), check.ErrorMatches, ".*must be specified.*") + conf.AdvertiseAddr = "advertise" + c.Assert(conf.ValidateAndAdjust(), check.ErrorMatches, ".*does not contain a port") +} diff --git a/pkg/config/sorter.go b/pkg/config/sorter.go index a1b81b4b375..a9e6ee8f8bf 100644 --- a/pkg/config/sorter.go +++ b/pkg/config/sorter.go @@ -13,37 +13,18 @@ package config -import "sync" - // SorterConfig represents sorter config for a changefeed type SorterConfig struct { // number of concurrent heap sorts - NumConcurrentWorker int `toml:"num-concurrent-workers" json:"num-concurrent-workers"` + NumConcurrentWorker int `toml:"num-concurrent-worker" json:"num-concurrent-worker"` // maximum size for a heap ChunkSizeLimit uint64 `toml:"chunk-size-limit" json:"chunk-size-limit"` // the maximum memory use percentage that allows in-memory sorting - MaxMemoryPressure int `toml:"max-memory-pressure" json:"max-memory-pressure"` + MaxMemoryPressure int `toml:"max-memory-percentage" json:"max-memory-percentage"` // the maximum memory consumption allowed for in-memory sorting MaxMemoryConsumption uint64 `toml:"max-memory-consumption" json:"max-memory-consumption"` // the size of workerpool NumWorkerPoolGoroutine int `toml:"num-workerpool-goroutine" json:"num-workerpool-goroutine"` -} - -var ( - sorterConfig *SorterConfig - mu sync.Mutex -) - -// GetSorterConfig returns the process-local sorter config -func GetSorterConfig() *SorterConfig { - mu.Lock() - defer mu.Unlock() - return sorterConfig -} - -// SetSorterConfig sets the process-local sorter config -func SetSorterConfig(config *SorterConfig) { - mu.Lock() - defer mu.Unlock() - sorterConfig = config + // the directory used to store the temporary files generated by the sorter + SortDir string `toml:"sort-dir" json:"sort-dir"` } diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index 934fc441c09..acdb6da8661 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -188,6 +188,7 @@ var ( ErrEtcdSessionDone = errors.Normalize("the etcd session is done", errors.RFCCodeText("CDC:ErrEtcdSessionDone")) // ErrReactorFinished is used by reactor to signal a **normal** exit. ErrReactorFinished = errors.Normalize("the reactor has done its job and should no longer be executed", errors.RFCCodeText("CDC:ErrReactorFinished")) + ErrLeaseTimeout = errors.Normalize("owner lease timeout", errors.RFCCodeText("CDC:ErrLeaseTimeout")) // pipeline errors ErrSendToClosedPipeline = errors.Normalize("pipeline is closed, cannot send message", errors.RFCCodeText("CDC:ErrSendToClosedPipeline")) @@ -199,6 +200,8 @@ var ( // unified sorter errors ErrUnifiedSorterBackendTerminating = errors.Normalize("unified sorter backend is terminating", errors.RFCCodeText("CDC:ErrUnifiedSorterBackendTerminating")) + ErrIllegalUnifiedSorterParameter = errors.Normalize("illegal parameter for unified sorter: %s", errors.RFCCodeText("CDC:ErrIllegalUnifiedSorterParameter")) + ErrAsyncIOCancelled = errors.Normalize("asynchronous IO operation is cancelled. 
Internal use only, report a bug if seen in log", errors.RFCCodeText("CDC:ErrAsyncIOCancelled")) // processor errors ErrTableProcessorStoppedSafely = errors.Normalize("table processor stopped safely", errors.RFCCodeText("CDC:ErrTableProcessorStoppedSafely")) diff --git a/pkg/regionspan/region_range_lock.go b/pkg/regionspan/region_range_lock.go index 8069e620dc6..5f5b97e3a7e 100644 --- a/pkg/regionspan/region_range_lock.go +++ b/pkg/regionspan/region_range_lock.go @@ -15,6 +15,7 @@ package regionspan import ( "bytes" + "context" "encoding/hex" "fmt" "math" @@ -286,7 +287,7 @@ func (l *RegionRangeLock) tryLockRange(startKey, endKey []byte, regionID, versio } // LockRange locks a range with specified version. -func (l *RegionRangeLock) LockRange(startKey, endKey []byte, regionID, version uint64) LockRangeResult { +func (l *RegionRangeLock) LockRange(ctx context.Context, startKey, endKey []byte, regionID, version uint64) LockRangeResult { res, signalChs := l.tryLockRange(startKey, endKey, regionID, version) if res.Status != LockRangeStatusWait { @@ -298,7 +299,11 @@ func (l *RegionRangeLock) LockRange(startKey, endKey []byte, regionID, version u var res1 LockRangeResult for { for _, ch := range signalChs1 { - <-ch + select { + case <-ctx.Done(): + return LockRangeResult{Status: LockRangeStatusCancel} + case <-ch: + } } res1, signalChs1 = l.tryLockRange(startKey, endKey, regionID, version) if res1.Status != LockRangeStatusWait { @@ -374,6 +379,8 @@ const ( LockRangeStatusWait = 1 // LockRangeStatusStale means a LockRange operation is rejected because of the range's version is stale. LockRangeStatusStale = 2 + // LockRangeStatusCancel means a LockRange operation is cancelled. + LockRangeStatusCancel = 3 ) // LockRangeResult represents the result of LockRange method of RegionRangeLock. diff --git a/pkg/regionspan/region_range_lock_test.go b/pkg/regionspan/region_range_lock_test.go index 847a3ea18e8..ec008461d35 100644 --- a/pkg/regionspan/region_range_lock_test.go +++ b/pkg/regionspan/region_range_lock_test.go @@ -14,6 +14,7 @@ package regionspan import ( + "context" "math" "time" @@ -40,13 +41,26 @@ func mustWaitFn(c *check.C, res LockRangeResult) func() LockRangeResult { return res.WaitFn } -func mustLockRangeSuccess(c *check.C, l *RegionRangeLock, startKey, endKey string, regionID, version uint64, expectedCheckpointTs uint64) { - res := l.LockRange([]byte(startKey), []byte(endKey), regionID, version) +func mustLockRangeSuccess( + ctx context.Context, + c *check.C, + l *RegionRangeLock, + startKey, endKey string, + regionID, version, expectedCheckpointTs uint64, +) { + res := l.LockRange(ctx, []byte(startKey), []byte(endKey), regionID, version) mustSuccess(c, res, expectedCheckpointTs) } -func mustLockRangeStale(c *check.C, l *RegionRangeLock, startKey, endKey string, regionID, version uint64, expectRetrySpans ...string) { - res := l.LockRange([]byte(startKey), []byte(endKey), regionID, version) +func mustLockRangeStale( + ctx context.Context, + c *check.C, + l *RegionRangeLock, + startKey, endKey string, + regionID, version uint64, + expectRetrySpans ...string, +) { + res := l.LockRange(ctx, []byte(startKey), []byte(endKey), regionID, version) spans := make([]ComparableSpan, 0) for i := 0; i < len(expectRetrySpans); i += 2 { spans = append(spans, ComparableSpan{Start: []byte(expectRetrySpans[i]), End: []byte(expectRetrySpans[i+1])}) @@ -54,8 +68,14 @@ func mustLockRangeStale(c *check.C, l *RegionRangeLock, startKey, endKey string, mustStale(c, res, spans...) 
} -func mustLockRangeWait(c *check.C, l *RegionRangeLock, startKey, endKey string, regionID, version uint64) func() LockRangeResult { - res := l.LockRange([]byte(startKey), []byte(endKey), regionID, version) +func mustLockRangeWait( + ctx context.Context, + c *check.C, + l *RegionRangeLock, + startKey, endKey string, + regionID, version uint64, +) func() LockRangeResult { + res := l.LockRange(ctx, []byte(startKey), []byte(endKey), regionID, version) return mustWaitFn(c, res) } @@ -65,13 +85,14 @@ func unlockRange(l *RegionRangeLock, startKey, endKey string, regionID, version func (s *regionRangeLockSuite) TestRegionRangeLock(c *check.C) { defer testleak.AfterTest(c)() + ctx := context.TODO() l := NewRegionRangeLock([]byte("a"), []byte("h"), math.MaxUint64) - mustLockRangeSuccess(c, l, "a", "e", 1, 1, math.MaxUint64) + mustLockRangeSuccess(ctx, c, l, "a", "e", 1, 1, math.MaxUint64) unlockRange(l, "a", "e", 1, 1, 100) - mustLockRangeSuccess(c, l, "a", "e", 1, 2, 100) - mustLockRangeStale(c, l, "a", "e", 1, 2) - wait := mustLockRangeWait(c, l, "a", "h", 1, 3) + mustLockRangeSuccess(ctx, c, l, "a", "e", 1, 2, 100) + mustLockRangeStale(ctx, c, l, "a", "e", 1, 2) + wait := mustLockRangeWait(ctx, c, l, "a", "h", 1, 3) unlockRange(l, "a", "e", 1, 2, 110) res := wait() @@ -82,37 +103,39 @@ func (s *regionRangeLockSuite) TestRegionRangeLock(c *check.C) { func (s *regionRangeLockSuite) TestRegionRangeLockStale(c *check.C) { defer testleak.AfterTest(c)() l := NewRegionRangeLock([]byte("a"), []byte("z"), math.MaxUint64) - mustLockRangeSuccess(c, l, "c", "g", 1, 10, math.MaxUint64) - mustLockRangeSuccess(c, l, "j", "n", 2, 8, math.MaxUint64) - - mustLockRangeStale(c, l, "c", "g", 1, 10) - mustLockRangeStale(c, l, "c", "i", 1, 9, "g", "i") - mustLockRangeStale(c, l, "a", "z", 1, 9, "a", "c", "g", "j", "n", "z") - mustLockRangeStale(c, l, "a", "e", 1, 9, "a", "c") - mustLockRangeStale(c, l, "e", "h", 1, 9, "g", "h") - mustLockRangeStale(c, l, "e", "k", 1, 9, "g", "j") - mustLockRangeSuccess(c, l, "g", "j", 3, 1, math.MaxUint64) + ctx := context.TODO() + mustLockRangeSuccess(ctx, c, l, "c", "g", 1, 10, math.MaxUint64) + mustLockRangeSuccess(ctx, c, l, "j", "n", 2, 8, math.MaxUint64) + + mustLockRangeStale(ctx, c, l, "c", "g", 1, 10) + mustLockRangeStale(ctx, c, l, "c", "i", 1, 9, "g", "i") + mustLockRangeStale(ctx, c, l, "a", "z", 1, 9, "a", "c", "g", "j", "n", "z") + mustLockRangeStale(ctx, c, l, "a", "e", 1, 9, "a", "c") + mustLockRangeStale(ctx, c, l, "e", "h", 1, 9, "g", "h") + mustLockRangeStale(ctx, c, l, "e", "k", 1, 9, "g", "j") + mustLockRangeSuccess(ctx, c, l, "g", "j", 3, 1, math.MaxUint64) unlockRange(l, "g", "j", 3, 1, 2) unlockRange(l, "c", "g", 1, 10, 5) unlockRange(l, "j", "n", 2, 8, 8) - mustLockRangeSuccess(c, l, "a", "z", 1, 11, 2) + mustLockRangeSuccess(ctx, c, l, "a", "z", 1, 11, 2) unlockRange(l, "a", "z", 1, 11, 2) } func (s *regionRangeLockSuite) TestRegionRangeLockLockingRegionID(c *check.C) { defer testleak.AfterTest(c)() + ctx := context.TODO() l := NewRegionRangeLock([]byte("a"), []byte("z"), math.MaxUint64) - mustLockRangeSuccess(c, l, "c", "d", 1, 10, math.MaxUint64) + mustLockRangeSuccess(ctx, c, l, "c", "d", 1, 10, math.MaxUint64) - mustLockRangeStale(c, l, "e", "f", 1, 5, "e", "f") - mustLockRangeStale(c, l, "e", "f", 1, 10, "e", "f") - wait := mustLockRangeWait(c, l, "e", "f", 1, 11) + mustLockRangeStale(ctx, c, l, "e", "f", 1, 5, "e", "f") + mustLockRangeStale(ctx, c, l, "e", "f", 1, 10, "e", "f") + wait := mustLockRangeWait(ctx, c, l, "e", "f", 1, 11) unlockRange(l, "c", 
"d", 1, 10, 10) mustSuccess(c, wait(), math.MaxUint64) // Now ["e", "f") is locked by region 1 at version 11 and ts 11. - mustLockRangeSuccess(c, l, "g", "h", 2, 10, math.MaxUint64) - wait = mustLockRangeWait(c, l, "g", "h", 1, 12) + mustLockRangeSuccess(ctx, c, l, "g", "h", 2, 10, math.MaxUint64) + wait = mustLockRangeWait(ctx, c, l, "g", "h", 1, 12) ch := make(chan LockRangeResult, 1) go func() { ch <- wait() @@ -133,6 +156,17 @@ func (s *regionRangeLockSuite) TestRegionRangeLockLockingRegionID(c *check.C) { unlockRange(l, "g", "h", 1, 12, 30) } +func (s *regionRangeLockSuite) TestRegionRangeLockCanBeCancelled(c *check.C) { + defer testleak.AfterTest(c)() + ctx, cancel := context.WithCancel(context.Background()) + l := NewRegionRangeLock([]byte("a"), []byte("z"), math.MaxUint64) + mustLockRangeSuccess(ctx, c, l, "g", "h", 1, 10, math.MaxUint64) + wait := mustLockRangeWait(ctx, c, l, "g", "h", 1, 12) + cancel() + lockResult := wait() + c.Assert(lockResult.Status, check.Equals, LockRangeStatusCancel) +} + func (s *regionRangeLockSuite) TestRangeTsMap(c *check.C) { defer testleak.AfterTest(c)() m := NewRangeTsMap([]byte("a"), []byte("z"), math.MaxUint64) diff --git a/pkg/version/check.go b/pkg/version/check.go index 4b3a0005659..54b1d33b8a3 100644 --- a/pkg/version/check.go +++ b/pkg/version/check.go @@ -22,6 +22,8 @@ import ( "regexp" "strings" + "github.com/pingcap/ticdc/cdc/model" + "github.com/coreos/go-semver/semver" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/log" @@ -135,3 +137,40 @@ func CheckStoreVersion(ctx context.Context, client pd.Client, storeID uint64) er } return nil } + +// TiCDCClusterVersion is the version of TiCDC cluster +type TiCDCClusterVersion string + +// ticdc cluster version +const ( + TiCDCClusterVersionUnknown TiCDCClusterVersion = "Unknown" + TiCDCClusterVersion4_0 TiCDCClusterVersion = "4.0.X" + TiCDCClusterVersion5_0 TiCDCClusterVersion = "5.0.X" +) + +// GetTiCDCClusterVersion returns the version of ticdc cluster +func GetTiCDCClusterVersion(captureInfos []*model.CaptureInfo) (TiCDCClusterVersion, error) { + if len(captureInfos) == 0 { + return TiCDCClusterVersionUnknown, nil + } + var minVer *semver.Version + for _, captureInfo := range captureInfos { + var ver *semver.Version + var err error + if captureInfo.Version != "" { + ver, err = semver.NewVersion(removeVAndHash(captureInfo.Version)) + } else { + ver = semver.New("4.0.1") + } + if err != nil { + return TiCDCClusterVersionUnknown, cerror.WrapError(cerror.ErrNewSemVersion, err) + } + if minVer == nil || ver.Compare(*minVer) < 0 { + minVer = ver + } + } + if minVer.Major < 5 { + return TiCDCClusterVersion4_0, nil + } + return TiCDCClusterVersion5_0, nil +} diff --git a/pkg/version/check_test.go b/pkg/version/check_test.go index a947f4c500e..6fe8e8e5ae7 100644 --- a/pkg/version/check_test.go +++ b/pkg/version/check_test.go @@ -24,6 +24,7 @@ import ( "github.com/coreos/go-semver/semver" "github.com/pingcap/check" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/ticdc/cdc/model" "github.com/pingcap/ticdc/pkg/util/testleak" pd "github.com/tikv/pd/client" "github.com/tikv/pd/pkg/tempurl" @@ -153,3 +154,61 @@ func (s *checkSuite) TestReleaseSemver(c *check.C) { c.Assert(ReleaseSemver(), check.Equals, cs.releaseSemver, check.Commentf("%v", cs)) } } + +func (s *checkSuite) TestGetTiCDCClusterVersion(c *check.C) { + defer testleak.AfterTest(c)() + testCases := []struct { + captureInfos []*model.CaptureInfo + expected TiCDCClusterVersion + }{ + { + captureInfos: 
[]*model.CaptureInfo{}, + expected: TiCDCClusterVersionUnknown, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: ""}, + {ID: "capture2", Version: ""}, + {ID: "capture3", Version: ""}, + }, + expected: TiCDCClusterVersion4_0, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: "5.0.1"}, + {ID: "capture2", Version: "4.0.7"}, + {ID: "capture3", Version: "5.0.0-rc"}, + }, + expected: TiCDCClusterVersion4_0, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: "5.0.0-rc"}, + }, + expected: TiCDCClusterVersion5_0, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: "5.0.0"}, + }, + expected: TiCDCClusterVersion5_0, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: "4.1.0"}, + }, + expected: TiCDCClusterVersion4_0, + }, + { + captureInfos: []*model.CaptureInfo{ + {ID: "capture1", Version: "4.0.10"}, + }, + expected: TiCDCClusterVersion4_0, + }, + } + for _, tc := range testCases { + ver, err := GetTiCDCClusterVersion(tc.captureInfos) + c.Assert(err, check.IsNil) + c.Assert(ver, check.Equals, tc.expected) + } +} diff --git a/testing_utils/many_sorters_test/many_sorters.go b/testing_utils/many_sorters_test/many_sorters.go index 8e634cc9707..25409170850 100644 --- a/testing_utils/many_sorters_test/many_sorters.go +++ b/testing_utils/many_sorters_test/many_sorters.go @@ -51,13 +51,15 @@ func main() { } log.SetLevel(zapcore.DebugLevel) - config.SetSorterConfig(&config.SorterConfig{ + conf := config.GetDefaultServerConfig() + conf.Sorter = &config.SorterConfig{ NumConcurrentWorker: 8, ChunkSizeLimit: 1 * 1024 * 1024 * 1024, MaxMemoryPressure: 60, MaxMemoryConsumption: 16 * 1024 * 1024 * 1024, NumWorkerPoolGoroutine: 16, - }) + } + config.StoreGlobalServerConfig(conf) go func() { _ = http.ListenAndServe("localhost:6060", nil) @@ -78,7 +80,11 @@ func main() { var finishCount int32 for i := 0; i < *numSorters; i++ { - sorters[i] = pullerSorter.NewUnifiedSorter(*sorterDir, fmt.Sprintf("test-%d", i), "0.0.0.0:0") + sorters[i] = pullerSorter.NewUnifiedSorter(*sorterDir, + "test-cf", + fmt.Sprintf("test-%d", i), + model.TableID(i), + "0.0.0.0:0") finalI := i // run sorter diff --git a/testing_utils/sorter_stress_test/sorter_stress.go b/testing_utils/sorter_stress_test/sorter_stress.go index b04dbc9fefd..a53d6ae6942 100644 --- a/testing_utils/sorter_stress_test/sorter_stress.go +++ b/testing_utils/sorter_stress_test/sorter_stress.go @@ -48,12 +48,14 @@ func main() { log.Fatal("Could not enable failpoint", zap.Error(err)) } - config.SetSorterConfig(&config.SorterConfig{ + conf := config.GetDefaultServerConfig() + conf.Sorter = &config.SorterConfig{ NumConcurrentWorker: 8, ChunkSizeLimit: 1 * 1024 * 1024 * 1024, MaxMemoryPressure: 60, MaxMemoryConsumption: 16 * 1024 * 1024 * 1024, - }) + } + config.StoreGlobalServerConfig(conf) go func() { _ = http.ListenAndServe("localhost:6060", nil) @@ -64,7 +66,7 @@ func main() { log.Error("sorter_stress_test:", zap.Error(err)) } - sorter := pullerSorter.NewUnifiedSorter(*sorterDir, "test", "0.0.0.0:0") + sorter := pullerSorter.NewUnifiedSorter(*sorterDir, "test-cf", "test", 0, "0.0.0.0:0") ctx1, cancel := context.WithCancel(context.Background()) diff --git a/tests/_utils/run_cdc_server b/tests/_utils/run_cdc_server index 691e3c17818..c06bb1ef47d 100755 --- a/tests/_utils/run_cdc_server +++ b/tests/_utils/run_cdc_server @@ -21,6 +21,7 @@ pwd=$pwd log_level=debug restart= failpoint=$GO_FAILPOINTS +config_path= while [[ ${1} ]]; do case "${1}" in @@ -64,6 
+65,10 @@ while [[ ${1} ]]; do failpoint=${2} shift ;; + --config) + config_path="--config ${2}" + shift + ;; *) echo "Unknown parameter: ${1}" >&2 exit 1 @@ -87,6 +92,7 @@ if [[ "$restart" == "true" ]]; then --log-file $workdir/cdc$logsuffix.log \ --log-level $log_level \ --sorter-num-workerpool-goroutine 4 \ + $config_path \ $tls \ $certcn \ $addr \ @@ -101,6 +107,7 @@ else --log-file $workdir/cdc$logsuffix.log \ --log-level $log_level \ --sorter-num-workerpool-goroutine 4 \ + $config_path \ $tls \ $certcn \ $addr \ diff --git a/tests/autorandom/data/test.sql b/tests/autorandom/data/test.sql index 41a4a83e675..e00d7ec8703 100644 --- a/tests/autorandom/data/test.sql +++ b/tests/autorandom/data/test.sql @@ -5,11 +5,11 @@ use `autorandom_test`; CREATE TABLE table_a ( id BIGINT AUTO_RANDOM, data int, - PRIMARY KEY(id) + PRIMARY KEY(id) clustered ); INSERT INTO table_a (data) value (1); INSERT INTO table_a (data) value (2); INSERT INTO table_a (data) value (3); INSERT INTO table_a (data) value (4); -INSERT INTO table_a (data) value (5); \ No newline at end of file +INSERT INTO table_a (data) value (5); diff --git a/tests/capture_suicide_while_balance_table/conf/diff_config.toml b/tests/capture_suicide_while_balance_table/conf/diff_config.toml new file mode 100644 index 00000000000..3f1741cbf5d --- /dev/null +++ b/tests/capture_suicide_while_balance_table/conf/diff_config.toml @@ -0,0 +1,27 @@ +# diff Configuration. + +log-level = "info" +chunk-size = 10 +check-thread-count = 4 +sample-percent = 100 +use-rowid = false +use-checksum = true +fix-sql-file = "fix.sql" + +# tables need to check. +[[check-tables]] + schema = "capture_suicide_while_balance_table" + tables = ["~t.*"] + +[[source-db]] + host = "127.0.0.1" + port = 4000 + user = "root" + password = "" + instance-id = "source-1" + +[target-db] + host = "127.0.0.1" + port = 3306 + user = "root" + password = "" diff --git a/tests/capture_suicide_while_balance_table/run.sh b/tests/capture_suicide_while_balance_table/run.sh new file mode 100644 index 00000000000..7459a927262 --- /dev/null +++ b/tests/capture_suicide_while_balance_table/run.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +set -e + +CUR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +source $CUR/../_utils/test_prepare +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +SINK_TYPE=$1 + + +# This test mainly verifies that CDC can handle the following scenario: +# 1. Two captures; capture-1 is the owner, and each capture replicates more than one table. +# 2. capture-2 replicates some DMLs but lags behind, e.g. because of a large amount of +# incremental scan data or a blocked sink; we call this slow table table-slow. +# 3. Before the checkpoint ts of table-slow on capture-2 reaches the global resolved ts, +# a rebalance operation is triggered, either manually or because a new capture +# joins the cluster. A delete table operation is then dispatched to capture-2 with +# the global resolved ts as the boundary ts; capture-2 keeps replicating +# table-slow until its checkpoint ts reaches the boundary ts. +# 4. However, before the checkpoint ts of table-slow reaches the boundary ts, capture-2 +# suicides because of a network issue or PD jitter. +# 5. After the cluster recovers, the data of table-slow in the downstream should be +# consistent with the upstream. +# +# In this test, step-2 is achieved by failpoint injection, step-3 is triggered by a +# manual rebalance, and step-4 is achieved by revoking the lease of the capture key.
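For context on step-4: the capture's registration key in etcd is bound to a lease-backed session, so revoking that lease closes the capture's etcd session, and the capture treats "session done" as a signal to exit. The minimal Go sketch below is an illustration only, not TiCDC code; its endpoint, TTL, key value, and messages are placeholders. It shows the lease/session mechanism that the etcdctl commands in run() below exercise.

// Illustration only (not TiCDC code): how a lease-backed etcd session reports
// "session done" when its lease is revoked externally.
package main

import (
	"context"
	"log"
	"time"

	"go.etcd.io/etcd/clientv3"
	"go.etcd.io/etcd/clientv3/concurrency"
)

func main() {
	// Placeholder endpoint; in the test the etcd endpoint comes from the cluster setup.
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"127.0.0.1:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	// A session keeps a lease alive in the background (the TTL here is illustrative).
	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(10))
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()

	// Register a capture-like key bound to the session's lease; the key vanishes
	// as soon as the lease is revoked or expires.
	_, err = cli.Put(context.Background(), "/tidb/cdc/capture/example-capture-id", "{}",
		clientv3.WithLease(sess.Lease()))
	if err != nil {
		log.Fatal(err)
	}

	// Blocks until the lease is gone, e.g. after an external "etcdctl lease revoke",
	// which is exactly the condition the test induces for capture-2.
	<-sess.Done()
	log.Println("etcd session done, exiting")
}

Revoking the lease with etcdctl, as run() does below, closes sess.Done() and the process exits; that is the "suicide" the test name refers to.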
+function run() { + # test with mysql sink only + if [ "$SINK_TYPE" != "mysql" ]; then + return + fi + + rm -rf $WORK_DIR && mkdir -p $WORK_DIR + start_tidb_cluster --workdir $WORK_DIR + cd $WORK_DIR + + pd_addr="http://$UP_PD_HOST_1:$UP_PD_PORT_1" + run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --pd $pd_addr --logsuffix 1 --addr "127.0.0.1:8300" + export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/sink/MySQLSinkHangLongTime=1*return(true)' + run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --pd $pd_addr --logsuffix 2 --addr "127.0.0.1:8301" + + SINK_URI="mysql://root@127.0.0.1:3306/?max-txn-row=1" + changefeed_id=$(cdc cli changefeed create --pd=$pd_addr --sink-uri="$SINK_URI" 2>&1|tail -n2|head -n1|awk '{print $2}') + + run_sql "CREATE DATABASE capture_suicide_while_balance_table;" ${UP_TIDB_HOST} ${UP_TIDB_PORT} + for i in $(seq 1 4); do + run_sql "CREATE table capture_suicide_while_balance_table.t$i (id int primary key auto_increment)" ${UP_TIDB_HOST} ${UP_TIDB_PORT} + done + + for i in $(seq 1 4); do + check_table_exists "capture_suicide_while_balance_table.t$i" ${DOWN_TIDB_HOST} ${DOWN_TIDB_PORT} + done + + capture1_id=$(cdc cli capture list|jq -r '.[]|select(.address=="127.0.0.1:8300")|.id') + capture2_id=$(cdc cli capture list|jq -r '.[]|select(.address=="127.0.0.1:8301")|.id') + one_table_id=$(cdc cli processor query -c $changefeed_id -p $capture2_id|jq -r '.status.tables|keys[0]') + table_query=$(mysql -h${UP_TIDB_HOST} -P${UP_TIDB_PORT} -uroot -e "select table_name from information_schema.tables where tidb_table_id = ${one_table_id}\G") + table_name=$(echo $table_query|tail -n 1|awk '{print $(NF)}') + run_sql "insert into capture_suicide_while_balance_table.${table_name} values (),(),(),(),()" + + # sleep some time to wait global resolved ts forwarded + sleep 2 + curl -X POST http://127.0.0.1:8300/capture/owner/move_table -d "cf-id=${changefeed_id}&target-cp-id=${capture1_id}&table-id=${one_table_id}" + # sleep some time to wait table balance job is written to etcd + sleep 2 + + # revoke lease of etcd capture key to simulate etcd session done + lease=$(ETCDCTL_API=3 etcdctl get /tidb/cdc/capture/${capture2_id} -w json|grep -o 'lease":[0-9]*'|awk -F: '{print $2}') + lease_hex=$(printf '%x\n' $lease) + ETCDCTL_API=3 etcdctl lease revoke $lease_hex + + check_sync_diff $WORK_DIR $CUR/conf/diff_config.toml + export GO_FAILPOINTS='' + cleanup_process $CDC_BINARY +} + +trap stop_tidb_cluster EXIT +run $* +check_logs $WORK_DIR +echo "[$(date)] <<<<<< run test case $TEST_NAME success! 
>>>>>>" diff --git a/tests/cli/run.sh b/tests/cli/run.sh index d561266626d..034802ed6e9 100644 --- a/tests/cli/run.sh +++ b/tests/cli/run.sh @@ -37,6 +37,7 @@ function run() { start_tidb_cluster --workdir $WORK_DIR --multiple-upstream-pd true cd $WORK_DIR + pd_addr="http://$UP_PD_HOST_1:$UP_PD_PORT_1" # record tso before we create tables to skip the system table DDLs start_ts=$(run_cdc_cli tso query --pd=http://$UP_PD_HOST_1:$UP_PD_PORT_1) @@ -52,7 +53,7 @@ function run() { esac uuid="custom-changefeed-name" - run_cdc_cli changefeed create --start-ts=$start_ts --sink-uri="$SINK_URI" --tz="Asia/Shanghai" -c="$uuid" + run_cdc_cli changefeed create --start-ts=$start_ts --sort-engine=memory --sink-uri="$SINK_URI" --tz="Asia/Shanghai" -c="$uuid" if [ "$SINK_TYPE" == "kafka" ]; then run_kafka_consumer $WORK_DIR "kafka://127.0.0.1:9092/$TOPIC_NAME?partition-num=4&version=${KAFKA_VERSION}" fi @@ -87,7 +88,7 @@ case-sensitive = false worker-num = 4 EOF set +e - update_result=$(cdc cli changefeed update --start-ts=$start_ts --sink-uri="$SINK_URI" --tz="Asia/Shanghai" --config="$WORK_DIR/changefeed.toml" --no-confirm --changefeed-id $uuid) + update_result=$(cdc cli changefeed update --pd=$pd_addr --config="$WORK_DIR/changefeed.toml" --no-confirm --changefeed-id $uuid) set -e if [[ ! $update_result == *"can only update changefeed config when it is stopped"* ]]; then echo "update changefeed config should fail when changefeed is running, got $update_result" @@ -103,7 +104,7 @@ EOF check_changefeed_state $uuid "stopped" # Update changefeed - run_cdc_cli changefeed update --start-ts=$start_ts --sink-uri="$SINK_URI" --tz="Asia/Shanghai" --config="$WORK_DIR/changefeed.toml" --no-confirm --changefeed-id $uuid + run_cdc_cli changefeed update --pd=$pd_addr --config="$WORK_DIR/changefeed.toml" --no-confirm --changefeed-id $uuid changefeed_info=$(run_cdc_cli changefeed query --changefeed-id $uuid 2>&1) if [[ ! $changefeed_info == *"\"case-sensitive\": false"* ]]; then echo "[$(date)] <<<<< changefeed info is not updated as expected ${changefeed_info} >>>>>" @@ -113,6 +114,10 @@ EOF echo "[$(date)] <<<<< changefeed info is not updated as expected ${changefeed_info} >>>>>" exit 1 fi + if [[ ! 
$changefeed_info == *"\"sort-engine\": \"memory\""* ]]; then + echo "[$(date)] <<<<< changefeed info is not updated as expected ${changefeed_info} >>>>>" + exit 1 + fi jobtype=$(run_cdc_cli changefeed --changefeed-id $uuid query 2>&1 | grep 'admin-job-type' | grep -oE '[0-9]' | head -1) if [[ $jobtype != 1 ]]; then diff --git a/tests/clustered_index/data/test.sql b/tests/clustered_index/data/test.sql index 6f8dbd43367..adeb694e79a 100644 --- a/tests/clustered_index/data/test.sql +++ b/tests/clustered_index/data/test.sql @@ -2,8 +2,6 @@ drop database if exists `clustered_index_test`; create database `clustered_index_test`; use `clustered_index_test`; -set @@tidb_enable_clustered_index=1; - CREATE TABLE t0 ( id VARCHAR(255), data INT, diff --git a/tests/clustered_index/run.sh b/tests/clustered_index/run.sh index c89903dd07f..8bb62001148 100755 --- a/tests/clustered_index/run.sh +++ b/tests/clustered_index/run.sh @@ -29,6 +29,9 @@ function run() { if [ "$SINK_TYPE" == "kafka" ]; then run_kafka_consumer $WORK_DIR "kafka://127.0.0.1:9092/$TOPIC_NAME?partition-num=4&version=${KAFKA_VERSION}" fi + run_sql "set global tidb_enable_clustered_index=1;" ${UP_TIDB_HOST} ${UP_TIDB_PORT} + # TiDB global variables cache 2 seconds at most + sleep 2 run_sql_file $CUR/data/test.sql ${UP_TIDB_HOST} ${UP_TIDB_PORT} # sync_diff can't check non-exist table, so we check expected tables are created in downstream first diff --git a/tests/cyclic_abc/run.sh b/tests/cyclic_abc/run.sh index 4d26aace544..65c8704229d 100644 --- a/tests/cyclic_abc/run.sh +++ b/tests/cyclic_abc/run.sh @@ -20,6 +20,14 @@ function run() { start_tidb_cluster --workdir $WORK_DIR start_tls_tidb_cluster --workdir $WORK_DIR --tlsdir $TLS_DIR + echo " \ +[security] + ca-path = \"$TLS_DIR/ca.pem\" + cert-path = \"$TLS_DIR/server.pem\" + key-path = \"$TLS_DIR/server-key.pem\" + cert-allowed-cn = [\"fake_cn\"] +" > $WORK_DIR/server.toml + cd $WORK_DIR # create table in all cluters. @@ -86,7 +94,7 @@ function run() { --logsuffix "_${TEST_NAME}_tls" \ --pd "https://${TLS_PD_HOST}:${TLS_PD_PORT}" \ --addr "127.0.0.1:8302" \ - --tlsdir $TLS_DIR \ + --config "$WORK_DIR/server.toml" \ --cert-allowed-cn "client" # The common name of client.pem run_cdc_cli changefeed create --start-ts=$start_ts \ diff --git a/tests/http_proxies/run-proxy.go b/tests/http_proxies/run-proxy.go new file mode 100644 index 00000000000..5e771f41871 --- /dev/null +++ b/tests/http_proxies/run-proxy.go @@ -0,0 +1,46 @@ +// Copyright 2021 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "flag" + "fmt" + + "github.com/pingcap/log" + "go.uber.org/zap" + + grpc_proxy "github.com/bradleyjkemp/grpc-tools/grpc-proxy" + "google.golang.org/grpc" +) + +func main() { + grpc_proxy.RegisterDefaultFlags() + flag.Parse() + proxy, err := grpc_proxy.New( + grpc_proxy.WithInterceptor(intercept), + grpc_proxy.DefaultFlags(), + ) + if err != nil { + log.Fatal("failed to create proxy", zap.Error(err)) + } + err = proxy.Start() + if err != nil { + log.Fatal("failed to start proxy", zap.Error(err)) + } +} + +func intercept(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { + fmt.Println(info.FullMethod) + return handler(srv, ss) +} diff --git a/tests/http_proxies/run.sh.skip b/tests/http_proxies/run.sh.skip new file mode 100644 index 00000000000..14d6c9f2a00 --- /dev/null +++ b/tests/http_proxies/run.sh.skip @@ -0,0 +1,87 @@ +#!/bin/bash + +set -e + +CUR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) +source $CUR/../_utils/test_prepare +WORK_DIR=$OUT_DIR/$TEST_NAME +CDC_BINARY=cdc.test +TEST_HOST_LIST=(UP_TIDB_HOST UP_PD_HOST_{1..3} UP_TIKV_HOST_{1..3}) +# FIXME: hostname in macOS doesn't support -I option. +lan_addrs=($(hostname -I)) +lan_addr=${lan_addrs[0]-"127.0.0.1"} +export UP_TIDB_HOST=$lan_addr \ + UP_PD_HOST_1=$lan_addr \ + UP_PD_HOST_2=$lan_addr \ + UP_PD_HOST_3=$lan_addr \ + UP_TIKV_HOST_1=$lan_addr \ + UP_TIKV_HOST_2=$lan_addr \ + UP_TIKV_HOST_3=$lan_addr + + +proxy_pid="" +function start_proxy() { + echo "dumping grpc packets to $WORK_DIR/packets.dump..." + GO111MODULE=on WORK_DIR=$WORK_DIR go run $CUR/run-proxy.go --port=8080 >$WORK_DIR/packets.dump & + proxy_pid=$! +} + +function stop_proxy() { + kill "$proxy_pid" || true +} + +function prepare() { + for host in ${TEST_HOST_LIST[@]}; do + echo "$host $(printenv $host)" + case $(printenv $host) in + # Should we handle ::1/128 here? + 127.*.*.* ) + echo "[WARNING] http_proxies: the host of component $host is loopback, hence proxies would be ignored, skipping this test..." + exit 0 + ;; + *) ;; + esac + done + + rm -rf "$WORK_DIR" + mkdir -p "$WORK_DIR" + stop_tidb_cluster + start_tidb_cluster --workdir $WORK_DIR + + start_proxy + echo started proxy at $proxy_pid + + cd $WORK_DIR + start_ts=$(run_cdc_cli tso query --pd=http://$UP_PD_HOST_1:$UP_PD_PORT_1) + + export http_proxy=http://127.0.0.1:8080 + export https_proxy=http://127.0.0.1:8080 + run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY + + SINK_URI="blackhole:///" + run_cdc_cli changefeed create --start-ts=$start_ts --sink-uri="$SINK_URI" +} + +function check() { + services=($(cat $WORK_DIR/packets.dump | xargs -L1 dirname | sort | uniq)) + service_type_count=${#services[@]} + echo "captured services: " + echo ${services[@]} + # at least two types: + # "pdpb.PD" + # "tikvpb.TiKV" + if ! [ $service_type_count -ge 2 ] ; then + echo "captured services didn't match expectations." + echo "[total count]: $service_type_count (expected >= 2)" + exit 1 + fi +} + +trap "stop_tidb_cluster && stop_proxy" EXIT + +prepare +sleep 5 +check + +check_logs $WORK_DIR +echo "[$(date)] <<<<<< run test case $TEST_NAME success! 
>>>>>>" diff --git a/tests/multi_source/main.go b/tests/multi_source/main.go index 3ca0715518a..aa5ffdd1c4a 100644 --- a/tests/multi_source/main.go +++ b/tests/multi_source/main.go @@ -62,6 +62,9 @@ func main() { log.S().Errorf("Failed to close source database: %s\n", err) } }() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go switchAsyncCommit(ctx, sourceDB0) util.MustExec(sourceDB0, "create database mark;") runDDLTest([]*sql.DB{sourceDB0, sourceDB1}) util.MustExec(sourceDB0, "create table mark.finish_mark(a int primary key);") @@ -109,6 +112,25 @@ func runDDLTest(srcs []*sql.DB) { } } +func switchAsyncCommit(ctx context.Context, db *sql.DB) { + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + enabled := false + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if enabled { + util.MustExec(db, "set global tidb_enable_async_commit = off") + } else { + util.MustExec(db, "set global tidb_enable_async_commit = on") + } + enabled = !enabled + } + } +} + func getFunctionName(i interface{}) string { strs := strings.Split(runtime.FuncForPC(reflect.ValueOf(i).Pointer()).Name(), ".") return strs[len(strs)-1] diff --git a/tests/processor_err_chan/run.sh b/tests/processor_err_chan/run.sh index a6d335bdae3..fd07dd0c703 100644 --- a/tests/processor_err_chan/run.sh +++ b/tests/processor_err_chan/run.sh @@ -50,11 +50,14 @@ function run() { run_sql "CREATE table processor_err_chan.t$i (id int primary key auto_increment)" ${DOWN_TIDB_HOST} ${DOWN_TIDB_PORT} done + # export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/ProcessorAddTableError=1*return(true)' # old processor + export GO_FAILPOINTS='github.com/pingcap/ticdc/cdc/processor/pipeline/ProcessorAddTableError=1*return(true)' # new processor run_cdc_server --workdir $WORK_DIR --binary $CDC_BINARY --addr "127.0.0.1:8300" --pd $pd_addr - changefeed_id=$(cdc cli changefeed create --pd=$pd_addr --sink-uri="$SINK_URI" --sort-engine=abc-engine 2>&1|tail -n2|head -n1|awk '{print $2}') + + changefeed_id=$(cdc cli changefeed create --pd=$pd_addr --sink-uri="$SINK_URI" 2>&1|tail -n2|head -n1|awk '{print $2}') retry_time=10 - ensure $retry_time check_changefeed_mark_stopped $pd_addr $changefeed_id "[CDC:ErrUnknownSortEngine]unknown sort engine abc-engine" + ensure $retry_time check_changefeed_mark_stopped $pd_addr $changefeed_id "processor add table injected error" cdc cli changefeed create --pd=$pd_addr --sink-uri="$SINK_URI" for i in $(seq 1 10); do diff --git a/tests/resolve_lock/main.go b/tests/resolve_lock/main.go index 20be7c9f7ca..e20e1de156b 100644 --- a/tests/resolve_lock/main.go +++ b/tests/resolve_lock/main.go @@ -34,6 +34,7 @@ import ( "github.com/pingcap/parser/model" "github.com/pingcap/ticdc/tests/util" "github.com/pingcap/tidb/kv" + "github.com/pingcap/tidb/store/driver" "github.com/pingcap/tidb/store/tikv" "github.com/pingcap/tidb/store/tikv/oracle" "github.com/pingcap/tidb/store/tikv/tikvrpc" @@ -120,7 +121,7 @@ func addLock(ctx context.Context, cfg *util.Config) error { } defer pdcli.Close() - driver := tikv.Driver{} + driver := driver.TiKVDriver{} store, err := driver.Open(fmt.Sprintf("tikv://%s?disableGC=true", cfg.PDAddr)) if err != nil { return errors.Trace(err) diff --git a/tools/check/go.mod b/tools/check/go.mod index 506eab58e44..fbdfba007b8 100644 --- a/tools/check/go.mod +++ b/tools/check/go.mod @@ -3,8 +3,8 @@ module github.com/pingcap/tidb-cdc/_tools go 1.13 require ( - github.com/golangci/golangci-lint v1.33.0 // indirect - github.com/mgechev/revive v1.0.2 // indirect 
+ github.com/golangci/golangci-lint v1.33.0 + github.com/mgechev/revive v1.0.2 github.com/pingcap/errors v0.11.5-0.20201126102027-b0a155152ca3 mvdan.cc/gofumpt v0.0.0-20201123090407-3077abae40c0 ) diff --git a/tools/check/go.sum b/tools/check/go.sum index 0db65a0d155..c825d2bf2e5 100644 --- a/tools/check/go.sum +++ b/tools/check/go.sum @@ -85,6 +85,7 @@ github.com/go-toolsmith/astfmt v1.0.0/go.mod h1:cnWmsOAuq4jJY6Ct5YWlVLmcmLMn1JUP github.com/go-toolsmith/astinfo v0.0.0-20180906194353-9809ff7efb21/go.mod h1:dDStQCHtmZpYOmjRP/8gHHnCCch3Zz3oEgCdZVdtweU= github.com/go-toolsmith/astp v1.0.0 h1:alXE75TXgcmupDsMK1fRAy0YUzLzqPVvBKoyWV+KPXg= github.com/go-toolsmith/astp v1.0.0/go.mod h1:RSyrtpVlfTFGDYRbrjyWP1pYu//tSFcvdYrA8meBmLI= +github.com/go-toolsmith/pkgload v1.0.0 h1:4DFWWMXVfbcN5So1sBNW9+yeiMqLFGl1wFLTL5R0Tgg= github.com/go-toolsmith/pkgload v1.0.0/go.mod h1:5eFArkbO80v7Z0kdngIxsRXRMTaX4Ilcwuh3clNrQJc= github.com/go-toolsmith/strparse v1.0.0 h1:Vcw78DnpCAKlM20kSbAyO4mPfJn/lyYA4BJUDxe2Jb4= github.com/go-toolsmith/strparse v1.0.0/go.mod h1:YI2nUKP9YGZnL/L1/DLFBfixrcjslWct4wyljWhSRy8= @@ -112,6 +113,7 @@ github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:x github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2 h1:+Z5KGCizgyZCbGh1KZqA0fcLLkwbsjIzS4aV2v7wJX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golangci/check v0.0.0-20180506172741-cfe4005ccda2 h1:23T5iq8rbUYlhpt5DB4XJkc6BU31uODLD1o1gKvZmD0= github.com/golangci/check v0.0.0-20180506172741-cfe4005ccda2/go.mod h1:k9Qvh+8juN+UKMCS/3jFtGICgW8O96FVaZsaxdzDkR4= @@ -160,7 +162,9 @@ github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+ github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/gookit/color v1.3.1/go.mod h1:R3ogXq2B9rTbXoSHJ1HyUVAZ3poOJHpd9nQmyGZsfvQ= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/gostaticanalysis/analysisutil v0.0.0-20190318220348-4088753ea4d3/go.mod h1:eEOZF4jCKGi+aprrirO9e7WKB3beBRtWgqGunKl6pKE= github.com/gostaticanalysis/analysisutil v0.0.3/go.mod h1:eEOZF4jCKGi+aprrirO9e7WKB3beBRtWgqGunKl6pKE= @@ -193,6 +197,7 @@ github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0m github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/jgautheron/goconst v0.0.0-20201117150253-ccae5bf973f3 
h1:7nkB9fLPMwtn/R6qfPcHileL/x9ydlhw8XyDrLI1ZXg= github.com/jgautheron/goconst v0.0.0-20201117150253-ccae5bf973f3/go.mod h1:aAosetZ5zaeC/2EfMeRswtxUFBpe2Hr7HzkgX4fanO4= @@ -205,6 +210,7 @@ github.com/jmoiron/sqlx v1.2.1-0.20190826204134-d7d95172beb5/go.mod h1:1FEQNm3xl github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= @@ -218,6 +224,7 @@ github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFB github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kunwardeep/paralleltest v1.0.2 h1:/jJRv0TiqPoEy/Y8dQxCFJhD56uS/pnvtatgTZBHokU= github.com/kunwardeep/paralleltest v1.0.2/go.mod h1:ZPqNm1fVHPllh5LPVujzbVz1JN2GhLxSfY+oqUsvG30= @@ -273,18 +280,22 @@ github.com/nakabonne/nestif v0.3.0 h1:+yOViDGhg8ygGrmII72nV9B/zGxY188TYpfolntsaP github.com/nakabonne/nestif v0.3.0/go.mod h1:dI314BppzXjJ4HsCnbo7XzrJHPszZsjnk5wEBSYHI2c= github.com/nbutton23/zxcvbn-go v0.0.0-20180912185939-ae427f1e4c1d h1:AREM5mwr4u1ORQBMvzfzBgpsctsbQikCVpvC+tX285E= github.com/nbutton23/zxcvbn-go v0.0.0-20180912185939-ae427f1e4c1d/go.mod h1:o96djdrsSGy3AWPyBgZMAGfxZNfgntdJG+11KU4QvbU= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nishanths/exhaustive v0.1.0 h1:kVlMw8h2LHPMGUVqUj6230oQjjTMFjwcZrnkhXzFfl8= github.com/nishanths/exhaustive v0.1.0/go.mod h1:S1j9110vxV1ECdCudXRkeMnFQ/DQk9ajLT0Uf2MYZQQ= +github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8= github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.14.1 h1:jMU0WaQrP0a/YAEq8eJmJKjBoMs+pClEr1vDMlM/Do4= github.com/onsi/ginkgo v1.14.1/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= +github.com/onsi/gomega v1.10.2 h1:aY/nuoWlKJud2J6U0E3NWsjlg+0GtwXxgEqthRdzlcs= github.com/onsi/gomega v1.10.2/go.mod 
h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= @@ -324,6 +335,7 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.5.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rogpeppe/go-internal v1.6.2 h1:aIihoIOHCiLZHxyoNQ+ABL4NKhFTgKLBdMLyEAh98m0= github.com/rogpeppe/go-internal v1.6.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryancurrah/gomodguard v1.1.0 h1:DWbye9KyMgytn8uYpuHkwf0RHqAYO6Ay/D0TbCpPtVU= @@ -345,7 +357,9 @@ github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= github.com/sonatard/noctx v0.0.1 h1:VC1Qhl6Oxx9vvWo3UDgrGXYCeKCe3Wbw7qAWL6FrmTY= @@ -411,6 +425,7 @@ go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/atomic v1.6.0 h1:Ezj3JGmsOnG1MoRWQkPBsKLe9DwWD9QeXzTRzzldNVk= go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= @@ -438,6 +453,7 @@ golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTk golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de h1:5hukYrvBGR8/eNkX5mdUezrA6JiaEZDtJb9Ei+1LlBs= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= @@ -467,6 +483,7 @@ golang.org/x/net 
v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/ golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974 h1:IX6qOQeG5uLjB/hjjwjedwfjND0hgjPMMyO1RoIXQNI= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -587,6 +604,7 @@ google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRn google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a h1:Ob5/580gVHBJZgXnff1cZDbG+xLtMVE5mDRTe+nIsX4= google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= @@ -596,18 +614,22 @@ google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.23.0 h1:4MY060fB1DLGMB/7MBTLnwQUY6+F09GEiz6SsrNqyzM= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0 h1:0vLT13EuvQ0hNvakwLuFZ/jYrLp5F3kcWHXdRggjCE8= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/ini.v1 v1.51.0 h1:AQvPpx3LzTDM0AjnIRlVFwFFGC+npRopjZxLJj6gdno= gopkg.in/ini.v1 v1.51.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 
h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=