Skip to content

Commit

Permalink
mydump: support multi bytes csv delimiter and separator (pingcap#406)
Browse files Browse the repository at this point in the history
* more flexible csv

* fix config and add unit test

* remove useless code

* fix unit test

* use empty string for default quote

* update comments in tidb-lightning.toml for separator and delimiter

Co-authored-by: kennytm <kennytm@gmail.com>
  • Loading branch information
glorv and kennytm authored Sep 27, 2020
1 parent e4de23b commit bce9977
Show file tree
Hide file tree
Showing 8 changed files with 393 additions and 159 deletions.
12 changes: 4 additions & 8 deletions lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,16 +384,12 @@ func (cfg *Config) LoadFromTOML(data []byte) error {
func (cfg *Config) Adjust() error {
// Reject problematic CSV configurations.
csv := &cfg.Mydumper.CSV
if len(csv.Separator) != 1 {
return errors.New("invalid config: `mydumper.csv.separator` must be exactly one byte long")
if len(csv.Separator) == 0 {
return errors.New("invalid config: `mydumper.csv.separator` must not be empty")
}

if len(csv.Delimiter) > 1 {
return errors.New("invalid config: `mydumper.csv.delimiter` must be one byte long or empty")
}

if csv.Separator == csv.Delimiter {
return errors.New("invalid config: cannot use the same character for both CSV delimiter and separator")
if len(csv.Delimiter) > 0 && (strings.HasPrefix(csv.Separator, csv.Delimiter) || strings.HasPrefix(csv.Delimiter, csv.Separator)) {
return errors.New("invalid config: `mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other")
}

if csv.BackslashEscape {
Expand Down
24 changes: 17 additions & 7 deletions lightning/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,14 +275,23 @@ func (s *configTestSuite) TestInvalidCSV(c *C) {
[mydumper.csv]
separator = ''
`,
err: "invalid config: `mydumper.csv.separator` must be exactly one byte long",
err: "invalid config: `mydumper.csv.separator` must not be empty",
},
{
input: `
[mydumper.csv]
separator = 'hello'
delimiter = 'hel'
`,
err: "invalid config: `mydumper.csv.separator` must be exactly one byte long",
err: "invalid config: `mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other",
},
{
input: `
[mydumper.csv]
separator = 'hel'
delimiter = 'hello'
`,
err: "invalid config: `mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other",
},
{
input: `
Expand All @@ -297,7 +306,7 @@ func (s *configTestSuite) TestInvalidCSV(c *C) {
[mydumper.csv]
separator = ','
`,
err: "invalid config: `mydumper.csv.separator` must be exactly one byte long",
err: "",
},
{
input: `
Expand All @@ -311,7 +320,7 @@ func (s *configTestSuite) TestInvalidCSV(c *C) {
[mydumper.csv]
delimiter = 'hello'
`,
err: "invalid config: `mydumper.csv.delimiter` must be one byte long or empty",
err: "",
},
{
input: `
Expand All @@ -324,17 +333,18 @@ func (s *configTestSuite) TestInvalidCSV(c *C) {
{
input: `
[mydumper.csv]
delimiter = '“'
separator = '\s'
delimiter = '\d'
`,
err: "invalid config: `mydumper.csv.delimiter` must be one byte long or empty",
err: "",
},
{
input: `
[mydumper.csv]
separator = '|'
delimiter = '|'
`,
err: "invalid config: cannot use the same character for both CSV delimiter and separator",
err: "invalid config: `mydumper.csv.separator` and `mydumper.csv.delimiter` must not be prefix of each other",
},
{
input: `
Expand Down
2 changes: 1 addition & 1 deletion lightning/lightning_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (s *lightningServerSuite) TestRunServer(c *C) {
c.Assert(data["error"], Matches, "cannot parse task.*")
resp.Body.Close()

resp, err = http.Post(url, "application/toml", strings.NewReader("[mydumper.csv]\nseparator = 'fooo'"))
resp, err = http.Post(url, "application/toml", strings.NewReader("[mydumper.csv]\nseparator = 'fooo'\ndelimiter= 'foo'"))
c.Assert(err, IsNil)
c.Assert(resp.StatusCode, Equals, http.StatusBadRequest)
err = json.NewDecoder(resp.Body).Decode(&data)
Expand Down
32 changes: 10 additions & 22 deletions lightning/mydump/bytes.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,43 +9,31 @@

package mydump

import "unicode/utf8"
// byteSet is a 32-byte value, where each bit represents the presence of a
// given byte value in the set.
type byteSet [8]uint32

// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
// makeByteSet creates a set containing every byte value that appears in chars.
func makeByteSet(chars []byte) (as byteSet) {
for i := 0; i < len(chars); i++ {
c := chars[i]
if c >= utf8.RuneSelf {
return as, false
}
as[c>>5] |= 1 << uint(c&31)
}
return as, true
return as
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
func (as *byteSet) contains(c byte) bool {
return (as[c>>5] & (1 << uint(c&31))) != 0
}

// IndexAnyAscii returns the byte index of the first occurrence in s of any of the Unicode
// code points in chars. It returns -1 if there is no code
// point in common.
func IndexAnyAscii(s []byte, as *asciiSet) int {
// IndexAnyByte returns the index of the first byte in s that is a member of
// the set as. It returns -1 if no byte of s is in the set.
func IndexAnyByte(s []byte, as *byteSet) int {
for i, c := range s {
if as.contains(c) {
return i
}
}
return -1

}
Loading

0 comments on commit bce9977

Please sign in to comment.