Skip to content

Commit 5f32d79

Browse files
authored
ttl: fix ttl job manager will panic if the status cache doesn't contain table (#41069) (#43552)
close #41067, close #41068
1 parent f54e63b commit 5f32d79

File tree

4 files changed

+26
-12
lines changed

4 files changed

+26
-12
lines changed

ttl/ttlworker/BUILD.bazel

+3
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@ go_library(
2929
"//util/logutil",
3030
"//util/sqlexec",
3131
"//util/timeutil",
32+
"@com_github_google_uuid//:uuid",
3233
"@com_github_ngaut_pools//:pools",
3334
"@com_github_pingcap_errors//:errors",
35+
"@com_github_pingcap_failpoint//:failpoint",
3436
"@org_golang_x_time//rate",
3537
"@org_uber_go_multierr//:multierr",
3638
"@org_uber_go_zap//:zap",
@@ -66,6 +68,7 @@ go_test(
6668
"//util/logutil",
6769
"@com_github_ngaut_pools//:pools",
6870
"@com_github_pingcap_errors//:errors",
71+
"@com_github_pingcap_failpoint//:failpoint",
6972
"@com_github_stretchr_testify//assert",
7073
"@com_github_stretchr_testify//require",
7174
"@org_golang_x_time//rate",

ttl/ttlworker/job_manager.go

+14-8
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ import (
1818
"context"
1919
"time"
2020

21+
"github.com/google/uuid"
2122
"github.com/pingcap/errors"
23+
"github.com/pingcap/failpoint"
2224
"github.com/pingcap/tidb/kv"
2325
"github.com/pingcap/tidb/sessionctx/variable"
2426
"github.com/pingcap/tidb/ttl/cache"
@@ -32,7 +34,7 @@ import (
3234

3335
const insertNewTableIntoStatusTemplate = "INSERT INTO mysql.tidb_ttl_table_status (table_id,parent_table_id) VALUES (%?, %?)"
3436
const setTableStatusOwnerTemplate = `UPDATE mysql.tidb_ttl_table_status
35-
SET current_job_id = UUID(),
37+
SET current_job_id = %?,
3638
current_job_owner_id = %?,
3739
current_job_start_time = %?,
3840
current_job_status = 'waiting',
@@ -48,8 +50,8 @@ func insertNewTableIntoStatusSQL(tableID int64, parentTableID int64) (string, []
4850
return insertNewTableIntoStatusTemplate, []interface{}{tableID, parentTableID}
4951
}
5052

51-
func setTableStatusOwnerSQL(tableID int64, now time.Time, currentJobTTLExpire time.Time, id string) (string, []interface{}) {
52-
return setTableStatusOwnerTemplate, []interface{}{id, now.Format(timeFormat), now.Format(timeFormat), currentJobTTLExpire.Format(timeFormat), now.Format(timeFormat), tableID}
53+
func setTableStatusOwnerSQL(jobID string, tableID int64, now time.Time, currentJobTTLExpire time.Time, id string) (string, []interface{}) {
54+
return setTableStatusOwnerTemplate, []interface{}{jobID, id, now.Format(timeFormat), now.Format(timeFormat), currentJobTTLExpire.Format(timeFormat), now.Format(timeFormat), tableID}
5355
}
5456

5557
func updateHeartBeatSQL(tableID int64, now time.Time, id string) (string, []interface{}) {
@@ -508,6 +510,7 @@ func (m *JobManager) couldTrySchedule(table *cache.TableStatus, now time.Time) b
508510
// It could be nil, nil, if the table query doesn't return error but the job has been locked by other instances.
509511
func (m *JobManager) lockNewJob(ctx context.Context, se session.Session, table *cache.PhysicalTable, now time.Time) (*ttlJob, error) {
510512
var expireTime time.Time
513+
var jobID string
511514

512515
err := se.RunInTxn(ctx, func() error {
513516
sql, args := cache.SelectFromTTLTableStatusWithID(table.ID)
@@ -544,7 +547,12 @@ func (m *JobManager) lockNewJob(ctx context.Context, se session.Session, table *
544547
return err
545548
}
546549

547-
sql, args = setTableStatusOwnerSQL(table.ID, now, expireTime, m.id)
550+
jobID = uuid.New().String()
551+
failpoint.Inject("set-job-uuid", func(val failpoint.Value) {
552+
jobID = val.(string)
553+
})
554+
555+
sql, args = setTableStatusOwnerSQL(jobID, table.ID, now, expireTime, m.id)
548556
_, err = se.ExecuteSQL(ctx, sql, args...)
549557
return errors.Wrapf(err, "execute sql: %s", sql)
550558
})
@@ -561,12 +569,10 @@ func (m *JobManager) lockNewJob(ctx context.Context, se session.Session, table *
561569
if err != nil {
562570
return nil, err
563571
}
564-
return m.createNewJob(expireTime, now, table)
572+
return m.createNewJob(jobID, expireTime, now, table)
565573
}
566574

567-
func (m *JobManager) createNewJob(expireTime time.Time, now time.Time, table *cache.PhysicalTable) (*ttlJob, error) {
568-
id := m.tableStatusCache.Tables[table.ID].CurrentJobID
569-
575+
func (m *JobManager) createNewJob(id string, expireTime time.Time, now time.Time, table *cache.PhysicalTable) (*ttlJob, error) {
570576
statistics := &ttlStatistics{}
571577

572578
ranges, err := table.SplitScanRanges(m.ctx, m.store, splitScanCount)

ttl/ttlworker/job_manager_test.go

+8-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"time"
2121

2222
"github.com/pingcap/errors"
23+
"github.com/pingcap/failpoint"
2324
"github.com/pingcap/tidb/parser/model"
2425
"github.com/pingcap/tidb/parser/mysql"
2526
"github.com/pingcap/tidb/sessionctx/variable"
@@ -230,6 +231,10 @@ func TestLockNewTable(t *testing.T) {
230231
args,
231232
}
232233
}
234+
235+
failpoint.Enable("github.com/pingcap/tidb/ttl/ttlworker/set-job-uuid", `return("test-job-id")`)
236+
defer failpoint.Disable("github.com/pingcap/tidb/ttl/ttlworker/set-job-uuid")
237+
233238
type sqlExecute struct {
234239
executeInfo
235240

@@ -249,7 +254,7 @@ func TestLockNewTable(t *testing.T) {
249254
newTTLTableStatusRows(&cache.TableStatus{TableID: 1}), nil,
250255
},
251256
{
252-
getExecuteInfo(setTableStatusOwnerSQL(1, now, expireTime, "test-id")),
257+
getExecuteInfo(setTableStatusOwnerSQL("test-job-id", 1, now, expireTime, "test-id")),
253258
nil, nil,
254259
},
255260
{
@@ -271,7 +276,7 @@ func TestLockNewTable(t *testing.T) {
271276
newTTLTableStatusRows(&cache.TableStatus{TableID: 1}), nil,
272277
},
273278
{
274-
getExecuteInfo(setTableStatusOwnerSQL(1, now, expireTime, "test-id")),
279+
getExecuteInfo(setTableStatusOwnerSQL("test-job-id", 1, now, expireTime, "test-id")),
275280
nil, nil,
276281
},
277282
{
@@ -285,7 +290,7 @@ func TestLockNewTable(t *testing.T) {
285290
newTTLTableStatusRows(&cache.TableStatus{TableID: 1}), nil,
286291
},
287292
{
288-
getExecuteInfo(setTableStatusOwnerSQL(1, now, expireTime, "test-id")),
293+
getExecuteInfo(setTableStatusOwnerSQL("test-job-id", 1, now, expireTime, "test-id")),
289294
nil, errors.New("test error message"),
290295
},
291296
}, false, true},

ttl/ttlworker/scan.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ func (t *ttlScanTask) doScan(ctx context.Context, delCh chan<- *ttlDeleteTask, s
166166
zap.String("SQL", sql),
167167
zap.Int("retryTimes", retryTimes),
168168
zap.Bool("needRetry", needRetry),
169-
zap.Error(err),
169+
zap.Error(sqlErr),
170170
)
171171

172172
if !needRetry {

0 commit comments

Comments
 (0)