
Commit

change
xzhangxian1008 committed Oct 19, 2023
1 parent fcbb499 commit af2623e
Showing 11 changed files with 82 additions and 82 deletions.
18 changes: 9 additions & 9 deletions docs/design/2021-08-18-charsets.md
@@ -65,7 +65,7 @@ After receiving the non-utf-8 character set request, this solution will convert
- All ParseOneStmt/Parse usage needs to be checked
- For SQL strings that need to be temporarily saved, you need to bring character set information. For example, BindSQL, View Select Stmt, etc.
- For internally executed SQL statements, since they are already utf-8, they do not need to be processed. For example, the table creation statement in the perfschema package.

### Runtime
- Add a repertoire field to collationInfo to facilitate automatic character set conversion in expressions, so that many errors like "illegal mix of collations" can be avoided.
- The corresponding types of the Repertoire attribute are as follows:
@@ -75,11 +75,11 @@ After receiving the non-utf-8 character set request, this solution will convert

const (
// RepertoireASCII is pure ASCII and it’s Unicode range: U+0000..U+007F
RepertoireASCII Repertoire = 1
// RepertoireExtended is Extended characters and it’s Unicode range: U+0080..U+FFFF
RepertoireExtended Repertoire = 1 << 1
// RepertoireUnicode consists ASCII and EXTENDED, and it’s Unicode range: U+0000..U+FFFF
RepertoireUnicode Repertoire = ASCII | EXTENDED
)
```
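
A minimal sketch (not part of this patch; the merge helper and its name are hypothetical) of how these repertoire flags can be combined when deriving the repertoire of an expression result, which is what lets a pure-ASCII argument be converted implicitly instead of raising "illegal mix of collations":

```go
package main

import "fmt"

// Repertoire mirrors the flags from the design doc above.
type Repertoire int

const (
	RepertoireASCII    Repertoire = 1
	RepertoireExtended Repertoire = 1 << 1
	RepertoireUnicode  Repertoire = RepertoireASCII | RepertoireExtended
)

// mergeRepertoire is a hypothetical helper: the repertoire of an expression
// result is the union of the repertoires of its arguments.
func mergeRepertoire(args ...Repertoire) Repertoire {
	r := Repertoire(0)
	for _, a := range args {
		r |= a
	}
	return r
}

func main() {
	// CONCAT(ascii_col, utf8_col): the ASCII side fits into any character set,
	// so the result repertoire is simply RepertoireUnicode and no
	// "illegal mix of collations" error needs to be reported.
	fmt.Println(mergeRepertoire(RepertoireASCII, RepertoireUnicode) == RepertoireUnicode) // true
}
```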

@@ -90,7 +90,7 @@ After receiving the non-utf-8 character set request, this solution will convert

### Optimizer

- The statistics module may need special processing functions based on charset: AvgColSizeListInDisk, GetFixedLen, BucketToString and DecodedString, etc.
- The statistics module may need special processing functions based on charset: AvgColSizeDataInDiskByRows, GetFixedLen, BucketToString and DecodedString, etc.
- Ranger module
- The processing of prefix len needs to consider the charset.
- Functions such as BuildFromPatternLike and checkLikeFunc may need to consider charset.
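
For the prefix-length point above, a standalone illustration (not code from this patch; it uses golang.org/x/text for the GBK conversion) of why a prefix of N characters maps to different byte lengths under different charsets:

```go
package main

import (
	"fmt"

	"golang.org/x/text/encoding/simplifiedchinese"
)

func main() {
	s := "数据库db" // three Chinese characters followed by two ASCII characters
	prefixChars := 4

	// A 4-character prefix in utf8mb4: 3*3 + 1 = 10 bytes.
	utf8Prefix := string([]rune(s)[:prefixChars])
	fmt.Println(len(utf8Prefix)) // 10

	// The same 4-character prefix re-encoded as GBK: 3*2 + 1 = 7 bytes,
	// so prefix-length handling cannot assume a fixed byte length per character.
	gbkPrefix, err := simplifiedchinese.GBK.NewEncoder().Bytes([]byte(utf8Prefix))
	if err != nil {
		panic(err)
	}
	fmt.Println(len(gbkPrefix)) // 7
}
```
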
@@ -123,7 +123,7 @@ Other behaviors that need to be dealt with:
- Upgrade compatibility:
- There may be compatibility issues when performing operations during the rolling upgrade.
- The new version of the cluster is expected to have no compatibility issues when reading old data.
- Downgrade compatibility:
- Downgrade is not compatible. The index key uses the table of gbk_bin/gbk_chinese_ci. The lower version of TiDB will have problems when decoding, and it needs to be transcoded before downgrading.

#### Compatibility with MySQL
@@ -189,7 +189,7 @@ Test the compatibility of some related features, such as SQL binding, SQL hints,
- Version 4.0 and above test upgrade:
- During the rolling upgrade process, gbk operation is not supported.
- After the upgrade, normal gbk related operations are supported.

### Downgrade compatibility

There will be incompatibility issues when downgrading tables that use gbk encoding.
4 changes: 2 additions & 2 deletions pkg/executor/aggregate/agg_hash_executor.go
@@ -127,7 +127,7 @@ type HashAggExec struct {
stats *HashAggRuntimeStats

// listInDisk is the chunks to store row values for spilled data.
// The HashAggExec may be set to `spill mode` multiple times, and all spilled data will be appended to ListInDisk.
// The HashAggExec may be set to `spill mode` multiple times, and all spilled data will be appended to DataInDiskByRows.
listInDisk *chunk.DataInDiskByRows
// numOfSpilledChks indicates the number of all the spilled chunks.
numOfSpilledChks int
@@ -240,7 +240,7 @@ func (e *HashAggExec) initForUnparallelExec() {

e.offsetOfSpilledChks, e.numOfSpilledChks = 0, 0
e.executed, e.isChildDrained = false, false
e.listInDisk = chunk.NewListInDisk(exec.RetTypes(e.Children(0)))
e.listInDisk = chunk.NewDataInDiskByRows(exec.RetTypes(e.Children(0)))

e.tmpChkForSpill = exec.TryNewCacheChunk(e.Children(0))
if vars := e.Ctx().GetSessionVars(); vars.TrackAggregateMemoryUsage && variable.EnableTmpStorageOnOOM.Load() {
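
To illustrate the spill pattern referenced in the comments above, here is a minimal, self-contained sketch (import paths and the field-type constructor are assumptions; the container methods used are the ones appearing in this commit):

```go
package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/types"
	"github.com/pingcap/tidb/pkg/util/chunk"
)

func main() {
	fieldTypes := []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)}

	// Spill side: append an in-memory chunk to the on-disk container.
	inDisk := chunk.NewDataInDiskByRows(fieldTypes)
	chk := chunk.NewChunkWithCapacity(fieldTypes, 4)
	for i := int64(0); i < 4; i++ {
		chk.AppendInt64(0, i)
	}
	if err := inDisk.Add(chk); err != nil {
		panic(err)
	}

	// Restore side: read the spilled chunks back one by one.
	for i := 0; i < inDisk.NumChunks(); i++ {
		restored, err := inDisk.GetChunk(i)
		if err != nil {
			panic(err)
		}
		fmt.Println(restored.NumRows()) // 4
	}
}
```
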
14 changes: 7 additions & 7 deletions pkg/planner/cardinality/row_size.go
@@ -88,8 +88,8 @@ func GetAvgRowSize(ctx sessionctx.Context, coll *statistics.HistColl, cols []*ex
return size + float64(len(cols))
}

// GetAvgRowSizeListInDisk computes average row size for given columns.
func GetAvgRowSizeListInDisk(coll *statistics.HistColl, cols []*expression.Column) (size float64) {
// GetAvgRowSizeDataInDiskByRows computes average row size for given columns.
func GetAvgRowSizeDataInDiskByRows(coll *statistics.HistColl, cols []*expression.Column) (size float64) {
if coll.Pseudo || len(coll.Columns) == 0 || coll.RealtimeCount == 0 {
for _, col := range cols {
size += float64(chunk.EstimateTypeWidth(col.GetType()))
@@ -103,10 +103,10 @@ func GetAvgRowSizeListInDisk(coll *statistics.HistColl, cols []*expression.Colum
size += float64(chunk.EstimateTypeWidth(col.GetType()))
continue
}
size += AvgColSizeListInDisk(colHist, coll.RealtimeCount)
size += AvgColSizeDataInDiskByRows(colHist, coll.RealtimeCount)
}
}
// Add 8 byte for each column's size record. See `ListInDisk` for details.
// Add 8 byte for each column's size record. See `DataInDiskByRows` for details.
return size + float64(8*len(cols))
}
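
To make the comment above about the extra 8 bytes per column concrete, a hypothetical two-column example (the per-column sizes are invented for illustration):

```
GetAvgRowSizeDataInDiskByRows = Σ AvgColSizeDataInDiskByRows(col) + 8 * len(cols)
                              = 8.0 + 5.5 + 2 * 8
                              = 29.5 bytes per row
```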

@@ -160,9 +160,9 @@ func AvgColSizeChunkFormat(c *statistics.Column, count int64) float64 {
return math.Round((avgSize-math.Log2(avgSize))*100)/100 + 8
}

// AvgColSizeListInDisk is the average column size of the histogram. These sizes are derived
// from `chunk.ListInDisk` so we need to update them if those 2 functions are changed.
func AvgColSizeListInDisk(c *statistics.Column, count int64) float64 {
// AvgColSizeDataInDiskByRows is the average column size of the histogram. These sizes are derived
// from `chunk.DataInDiskByRows` so we need to update them if those 2 functions are changed.
func AvgColSizeDataInDiskByRows(c *statistics.Column, count int64) float64 {
if count == 0 {
return 0
}
20 changes: 10 additions & 10 deletions pkg/planner/cardinality/row_size_test.go
@@ -40,36 +40,36 @@ func TestAvgColLen(t *testing.T) {
tableInfo := tbl.Meta()
statsTbl := do.StatsHandle().GetTableStats(tableInfo)
require.Equal(t, 1.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))

// The size of varchar type is LEN + BYTE, here is 1 + 7 = 8
require.Equal(t, 8.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0-3, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0-3, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0-3+8, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
require.Equal(t, 0.0, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
require.Equal(t, 0.0, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
testKit.MustExec("insert into t values(132, '123456789112', 1232.3, '2018-03-07 19:17:29', NULL)")
testKit.MustExec("analyze table t")
statsTbl = do.StatsHandle().GetTableStats(tableInfo)
require.Equal(t, 1.5, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 10.5, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSize(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount, false))
require.Equal(t, 8.0, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))
require.Equal(t, math.Round((10.5-math.Log2(10.5))*100)/100, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))
require.Equal(t, math.Round((10.5-math.Log2(10.5))*100)/100, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[0].ID], statsTbl.RealtimeCount))
require.Equal(t, math.Round((10.5-math.Log2(10.5))*100)/100+8, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[1].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(float32(12.3))), cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[2].ID], statsTbl.RealtimeCount))
require.Equal(t, float64(unsafe.Sizeof(types.ZeroTime)), cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[3].ID], statsTbl.RealtimeCount))
require.Equal(t, 8.0, cardinality.AvgColSizeChunkFormat(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
require.Equal(t, 0.0, cardinality.AvgColSizeListInDisk(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
require.Equal(t, 0.0, cardinality.AvgColSizeDataInDiskByRows(statsTbl.Columns[tableInfo.Columns[4].ID], statsTbl.RealtimeCount))
}
2 changes: 1 addition & 1 deletion pkg/planner/core/task.go
@@ -307,7 +307,7 @@ func (p *PhysicalIndexJoin) attach2Task(tasks ...task) task {
// RowSize for cost model ver2 is simplified, always use this function to calculate row size.
func getAvgRowSize(stats *property.StatsInfo, cols []*expression.Column) (size float64) {
if stats.HistColl != nil {
size = cardinality.GetAvgRowSizeListInDisk(stats.HistColl, cols)
size = cardinality.GetAvgRowSizeDataInDiskByRows(stats.HistColl, cols)
} else {
// Estimate using just the type info.
for _, col := range cols {
4 changes: 2 additions & 2 deletions pkg/util/chunk/BUILD.bazel
@@ -9,14 +9,14 @@ go_library(
"codec.go",
"column.go",
"compare.go",
"disk.go",
"iterator.go",
"list.go",
"mutrow.go",
"pool.go",
"row.go",
"row_container.go",
"row_container_reader.go",
"row_in_disk.go",
],
importpath = "github.com/pingcap/tidb/pkg/util/chunk",
visibility = ["//visibility:public"],
@@ -48,13 +48,13 @@ go_test(
"chunk_util_test.go",
"codec_test.go",
"column_test.go",
"disk_test.go",
"iterator_test.go",
"list_test.go",
"main_test.go",
"mutrow_test.go",
"pool_test.go",
"row_container_test.go",
"row_in_disk_test.go",
],
embed = [":chunk"],
flaky = True,
2 changes: 1 addition & 1 deletion pkg/util/chunk/list.go
@@ -72,7 +72,7 @@ func (l *List) FieldTypes() []*types.FieldType {
return l.fieldTypes
}

// NumRowsOfChunk returns the number of rows of a chunk in the ListInDisk.
// NumRowsOfChunk returns the number of rows of a chunk in the DataInDiskByRows.
func (l *List) NumRowsOfChunk(chkID int) int {
return l.chunks[chkID].NumRows()
}
4 changes: 2 additions & 2 deletions pkg/util/chunk/row_container.go
@@ -152,7 +152,7 @@ func (c *RowContainer) spillToDisk(preSpillError error) {
var err error
memory.QueryForceDisk.Add(1)
n := c.m.records.inMemory.NumChunks()
c.m.records.inDisk = NewListInDisk(c.m.records.inMemory.FieldTypes())
c.m.records.inDisk = NewDataInDiskByRows(c.m.records.inMemory.FieldTypes())
c.m.records.inDisk.diskTracker.AttachTo(c.diskTracker)
defer func() {
if r := recover(); r != nil {
@@ -221,7 +221,7 @@ func (c *RowContainer) NumRow() int {
return c.m.records.inMemory.Len()
}

// NumRowsOfChunk returns the number of rows of a chunk in the ListInDisk.
// NumRowsOfChunk returns the number of rows of a chunk in the DataInDiskByRows.
func (c *RowContainer) NumRowsOfChunk(chkID int) int {
c.m.RLock()
defer c.m.RUnlock()
30 changes: 15 additions & 15 deletions pkg/util/chunk/row_in_disk.go
@@ -98,15 +98,15 @@ func (l *diskFileReaderWriter) getWriter() io.Writer {
return l.w
}

var defaultChunkListInDiskPath = "chunk.ListInDisk"
var defaultChunkListInDiskOffsetPath = "chunk.ListInDiskOffset"
var defaultChunkDataInDiskByRowsPath = "chunk.DataInDiskByRows"
var defaultChunkDataInDiskByRowsOffsetPath = "chunk.DataInDiskByRowsOffset"

// NewListInDisk creates a new ListInDisk with field types.
func NewListInDisk(fieldTypes []*types.FieldType) *DataInDiskByRows {
// NewDataInDiskByRows creates a new DataInDiskByRows with field types.
func NewDataInDiskByRows(fieldTypes []*types.FieldType) *DataInDiskByRows {
l := &DataInDiskByRows{
fieldTypes: fieldTypes,
// TODO(fengliyuan): set the quota of disk usage.
diskTracker: disk.NewTracker(memory.LabelForChunkListInDisk, -1),
diskTracker: disk.NewTracker(memory.LabelForChunkDataInDiskByRows, -1),
}
return l
}
@@ -116,15 +116,15 @@ func (l *DataInDiskByRows) initDiskFile() (err error) {
if err != nil {
return
}
err = l.dataFile.initWithFileName(defaultChunkListInDiskPath + strconv.Itoa(l.diskTracker.Label()))
err = l.dataFile.initWithFileName(defaultChunkDataInDiskByRowsPath + strconv.Itoa(l.diskTracker.Label()))
if err != nil {
return
}
err = l.offsetFile.initWithFileName(defaultChunkListInDiskOffsetPath + strconv.Itoa(l.diskTracker.Label()))
err = l.offsetFile.initWithFileName(defaultChunkDataInDiskByRowsOffsetPath + strconv.Itoa(l.diskTracker.Label()))
return
}

// Len returns the number of rows in ListInDisk
// Len returns the number of rows in DataInDiskByRows
func (l *DataInDiskByRows) Len() int {
return l.totalNumRows
}
@@ -134,7 +134,7 @@ func (l *DataInDiskByRows) GetDiskTracker() *disk.Tracker {
return l.diskTracker
}

// Add adds a chunk to the ListInDisk. Caller must make sure the input chk
// Add adds a chunk to the DataInDiskByRows. Caller must make sure the input chk
// is not empty and not used any more and has the same field types.
// Warning: Do not use Add concurrently.
func (l *DataInDiskByRows) Add(chk *Chunk) (err error) {
@@ -170,7 +170,7 @@ func (l *DataInDiskByRows) Add(chk *Chunk) (err error) {
return
}

// GetChunk gets a Chunk from the ListInDisk by chkIdx.
// GetChunk gets a Chunk from the DataInDiskByRows by chkIdx.
func (l *DataInDiskByRows) GetChunk(chkIdx int) (*Chunk, error) {
chk := NewChunkWithCapacity(l.fieldTypes, l.NumRowsOfChunk(chkIdx))
chkSize := l.numRowsOfEachChunk[chkIdx]
@@ -207,13 +207,13 @@ func (l *DataInDiskByRows) GetChunk(chkIdx int) (*Chunk, error) {
return chk, formatChErr
}

// GetRow gets a Row from the ListInDisk by RowPtr.
// GetRow gets a Row from the DataInDiskByRows by RowPtr.
func (l *DataInDiskByRows) GetRow(ptr RowPtr) (row Row, err error) {
row, _, err = l.GetRowAndAppendToChunk(ptr, nil)
return row, err
}

// GetRowAndAppendToChunk gets a Row from the ListInDisk by RowPtr. Return the Row and the Ref Chunk.
// GetRowAndAppendToChunk gets a Row from the DataInDiskByRows by RowPtr. Return the Row and the Ref Chunk.
func (l *DataInDiskByRows) GetRowAndAppendToChunk(ptr RowPtr, chk *Chunk) (row Row, _ *Chunk, err error) {
off, err := l.getOffset(ptr.ChkIdx, ptr.RowIdx)
if err != nil {
@@ -243,12 +243,12 @@ func (l *DataInDiskByRows) getOffset(chkIdx uint32, rowIdx uint32) (int64, error
return bytesToI64Slice(b)[0], nil
}

// NumRowsOfChunk returns the number of rows of a chunk in the ListInDisk.
// NumRowsOfChunk returns the number of rows of a chunk in the DataInDiskByRows.
func (l *DataInDiskByRows) NumRowsOfChunk(chkID int) int {
return l.numRowsOfEachChunk[chkID]
}

// NumChunks returns the number of chunks in the ListInDisk.
// NumChunks returns the number of chunks in the DataInDiskByRows.
func (l *DataInDiskByRows) NumChunks() int {
return len(l.numRowsOfEachChunk)
}
@@ -434,7 +434,7 @@ func (format *diskFormatRow) toRow(fields []*types.FieldType, chk *Chunk) (Row,
}

// ReaderWithCache helps to read data that has not be flushed to underlying layer.
// By using ReaderWithCache, user can still write data into ListInDisk even after reading.
// By using ReaderWithCache, user can still write data into DataInDiskByRows even after reading.
type ReaderWithCache struct {
r io.ReaderAt
cacheOff int64
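
Finally, a short end-to-end usage sketch of the renamed type (hypothetical values; import paths and the field-type constructor are assumptions), showing the row-pointer access that GetRow and GetRowAndAppendToChunk above provide:

```go
package main

import (
	"fmt"

	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/types"
	"github.com/pingcap/tidb/pkg/util/chunk"
)

func main() {
	fieldTypes := []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)}
	inDisk := chunk.NewDataInDiskByRows(fieldTypes)

	chk := chunk.NewChunkWithCapacity(fieldTypes, 3)
	for i := int64(10); i < 13; i++ {
		chk.AppendInt64(0, i)
	}
	if err := inDisk.Add(chk); err != nil {
		panic(err)
	}

	// Random access by RowPtr: the third row of the first spilled chunk.
	row, err := inDisk.GetRow(chunk.RowPtr{ChkIdx: 0, RowIdx: 2})
	if err != nil {
		panic(err)
	}
	fmt.Println(row.GetInt64(0)) // 12

	// GetRowAndAppendToChunk additionally copies the fetched row into a
	// caller-provided chunk, so repeated lookups can reuse one buffer.
	buf := chunk.NewChunkWithCapacity(fieldTypes, 8)
	row2, _, err := inDisk.GetRowAndAppendToChunk(chunk.RowPtr{ChkIdx: 0, RowIdx: 1}, buf)
	if err != nil {
		panic(err)
	}
	fmt.Println(row2.GetInt64(0)) // 11
}
```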

