Skip to content

Commit

Permalink
parser: add IsValid() to Encoding to speed up string validation for U…
Browse files Browse the repository at this point in the history
…TF-8 (#30937)

close #30936
  • Loading branch information
tangenta authored Dec 22, 2021
1 parent 1f6d2dd commit d543b60
Show file tree
Hide file tree
Showing 11 changed files with 54 additions and 24 deletions.
2 changes: 1 addition & 1 deletion expression/builtin_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -1150,7 +1150,7 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) {
return string(ret), false, err
}
enc := charset.FindEncoding(resultTp.Charset)
if !charset.IsValidString(enc, expr) {
if !enc.IsValid(hack.Slice(expr)) {
replace, _ := enc.Transform(nil, hack.Slice(expr), charset.OpReplace)
return string(replace), false, nil
}
Expand Down
2 changes: 1 addition & 1 deletion expression/builtin_string_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu
continue
}
exprI := expr.GetBytes(i)
if !charset.IsValid(enc, exprI) {
if !enc.IsValid(exprI) {
encBuf, _ = enc.Transform(encBuf, exprI, charset.OpReplace)
result.AppendBytes(encBuf)
} else {
Expand Down
3 changes: 2 additions & 1 deletion expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/hack"
"github.com/pingcap/tidb/util/logutil"
)

Expand Down Expand Up @@ -315,7 +316,7 @@ func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression)
if isNull {
continue
}
if !charset.IsValidString(enc, str) {
if !enc.IsValid(hack.Slice(str)) {
return false
}
} else {
Expand Down
17 changes: 2 additions & 15 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ type Encoding interface {
Tp() EncodingTp
// Peek returns the next char.
Peek(src []byte) []byte
// IsValid checks whether the utf-8 bytes can be converted to a valid string in the current encoding.
IsValid(src []byte) bool
// Foreach iterates the characters in the current encoding.
Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
// Transform maps the bytes in src to dest according to Op.
Expand Down Expand Up @@ -101,21 +103,6 @@ const (
OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
)

// IsValid reports whether src is valid in the encoding e.
//
// It walks src via e.Foreach with opFromUTF8 and remembers the ok flag
// passed to the most recent callback invocation. The callback hands that
// same flag back to Foreach (presumably so iteration stops at the first
// invalid character; confirm against the Foreach implementations).
// When the callback is never invoked the result is true.
func IsValid(e Encoding, src []byte) bool {
	valid := true
	record := func(_, _ []byte, ok bool) bool {
		valid = ok
		return ok
	}
	e.Foreach(src, opFromUTF8, record)
	return valid
}

// IsValidString is the string counterpart of IsValid: it converts str to a
// byte slice with Slice and returns whatever IsValid reports for encoding e.
// NOTE(review): Slice is defined elsewhere; presumably a zero-copy
// string-to-bytes conversion, so the result must not be mutated. Confirm.
func IsValidString(e Encoding, str string) bool {
return IsValid(e, Slice(str))
}

// CountValidBytes counts the first valid bytes in src that
// can be encoded to the current encoding.
func CountValidBytes(e Encoding, src []byte) int {
Expand Down
13 changes: 12 additions & 1 deletion parser/charset/encoding_ascii.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,19 @@ func (e *encodingASCII) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It reports true only when every byte of src is at most
// go_unicode.MaxASCII; an empty or nil src is therefore valid.
func (e *encodingASCII) IsValid(src []byte) bool {
	for _, b := range src {
		if b > go_unicode.MaxASCII {
			return false
		}
	}
	return true
}

func (e *encodingASCII) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand Down
9 changes: 9 additions & 0 deletions parser/charset/encoding_base.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ func (b encodingBase) ToLower(src string) string {
return strings.ToLower(src)
}

// IsValid is the generic validity check shared by encodings that embed
// encodingBase. It drives b.self.Foreach over src with opFromUTF8 and
// returns the ok flag seen by the most recent callback invocation; the
// callback returns that same flag to Foreach. If the callback is never
// invoked the result is true.
func (b encodingBase) IsValid(src []byte) bool {
	valid := true
	record := func(_, _ []byte, ok bool) bool {
		valid = ok
		return ok
	}
	b.self.Foreach(src, opFromUTF8, record)
	return valid
}

func (b encodingBase) Transform(dest, src []byte, op Op) (result []byte, err error) {
if dest == nil {
dest = make([]byte, len(src))
Expand Down
5 changes: 5 additions & 0 deletions parser/charset/encoding_bin.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ func (e *encodingBin) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It unconditionally returns true; src is never inspected.
func (e *encodingBin) IsValid(src []byte) bool {
return true
}

// Foreach implements Encoding interface.
func (e *encodingBin) Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool) {
for i := 0; i < len(src); i++ {
Expand Down
5 changes: 5 additions & 0 deletions parser/charset/encoding_latin1.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ func (e *encodingLatin1) Peek(src []byte) []byte {
return src[:1]
}

// IsValid implements Encoding interface.
// It unconditionally returns true; src is never inspected.
func (e *encodingLatin1) IsValid(src []byte) bool {
return true
}

// Tp implements Encoding interface.
func (e *encodingLatin1) Tp() EncodingTp {
return EncodingTpLatin1
Expand Down
3 changes: 1 addition & 2 deletions parser/charset/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,7 @@ func TestEncodingValidate(t *testing.T) {
enc = charset.EncodingUTF8MB3StrictImpl
}
strBytes := []byte(tc.str)
ok := charset.IsValid(enc, strBytes)
require.Equal(t, tc.ok, ok, msg)
require.Equal(t, tc.ok, enc.IsValid(strBytes), msg)
replace, _ := enc.Transform(nil, strBytes, charset.OpReplace)
require.Equal(t, tc.expected, string(replace), msg)
}
Expand Down
17 changes: 15 additions & 2 deletions parser/charset/encoding_utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,17 @@ func (e *encodingUTF8) Peek(src []byte) []byte {
return src[:nextLen]
}

// IsValid implements Encoding interface.
// utf8.Valid is tried first as a fast path; only when it rejects src does
// the check fall back to the generic encodingBase.IsValid.
func (e *encodingUTF8) IsValid(src []byte) bool {
	return utf8.Valid(src) || e.encodingBase.IsValid(src)
}

// Transform implements Encoding interface.
func (e *encodingUTF8) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand All @@ -93,6 +101,11 @@ type encodingUTF8MB3Strict struct {
encodingUTF8
}

// IsValid implements Encoding interface.
// It calls encodingBase.IsValid directly instead of inheriting
// encodingUTF8.IsValid, so the utf8.Valid shortcut used there is skipped.
// NOTE(review): presumably because utf8.Valid accepts 4-byte sequences
// that this strict 3-byte variant must reject via its own Foreach; confirm.
func (e *encodingUTF8MB3Strict) IsValid(src []byte) bool {
return e.encodingBase.IsValid(src)
}

// Foreach implements Encoding interface.
func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh []byte, ok bool) bool) {
for i, w := 0, 0; i < len(src); i += w {
Expand All @@ -107,7 +120,7 @@ func (e *encodingUTF8MB3Strict) Foreach(src []byte, op Op, fn func(srcCh, dstCh

// Transform implements Encoding interface.
func (e *encodingUTF8MB3Strict) Transform(dest, src []byte, op Op) ([]byte, error) {
if IsValid(e, src) {
if e.IsValid(src) {
return src, nil
}
return e.encodingBase.Transform(dest, src, op)
Expand Down
2 changes: 1 addition & 1 deletion table/column.go
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ func validateStringDatum(ctx sessionctx.Context, origin, casted *types.Datum, co
}
// Check if the string is valid in the given column charset.
str := casted.GetBytes()
if !charset.IsValid(enc, str) {
if !enc.IsValid(str) {
replace, _ := enc.Transform(nil, str, charset.OpReplace)
casted.SetBytesAsString(replace, col.Collate, 0)
nSrc := charset.CountValidBytes(enc, str)
Expand Down

0 comments on commit d543b60

Please sign in to comment.