Skip to content

Commit

Permalink
replace stdlib csv reader with simple csv detector
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
  • Loading branch information
wagoodman committed Jul 10, 2024
1 parent e0c5c59 commit 8f71d34
Show file tree
Hide file tree
Showing 2 changed files with 288 additions and 18 deletions.
107 changes: 89 additions & 18 deletions internal/magic/text_csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ package magic

import (
"bytes"
"encoding/csv"
"errors"
"io"
)

const (
// svLineLimit is the line index at which sv stops scanning: once the
// current line index has reached this value, the next line terminator ends
// the scan.
svLineLimit = 10
// quote is the byte that opens and closes a quoted section; sv does not
// count delimiters or line terminators that appear between quotes.
quote = '"'
// comment is the byte that, outside of quotes, makes sv ignore the
// remainder of the current line.
comment = '#'
)

// Csv matches a comma-separated values file.
func Csv(raw []byte, limit uint32) bool {
return sv(raw, ',', limit)
Expand All @@ -17,26 +21,93 @@ func Tsv(raw []byte, limit uint32) bool {
return sv(raw, '\t', limit)
}

func sv(in []byte, comma rune, limit uint32) bool {
r := csv.NewReader(bytes.NewReader(dropLastLine(in, limit)))
r.Comma = comma
r.ReuseRecord = true
r.LazyQuotes = true
r.Comment = '#'

lines := 0
for {
_, err := r.Read()
if errors.Is(err, io.EOF) {
break
func sv(raw []byte, delimiter byte, limit uint32) bool {
reader := prepSvReader(raw, limit)

isWithinQuote := false
isWithinComment := false
lineIdx := 0
recordFields := make(map[int]int)

buf := make([]byte, 1024)
n, err := reader.Read(buf)

var prev, cur, next byte
loop:
for err == nil {
for i := 0; i < n; i++ {
cur = buf[i]

if i > 0 {
prev = buf[i-1]
} else {
prev = byte(0)
}

if i < n-1 {
next = buf[i+1]
} else {
next = byte(0)
}

isNewline := cur == '\n' && prev != '\r' && next != byte(0) && next != '\n' || cur == '\r'

switch {
case cur == quote:
if (!isWithinQuote || next != quote) && !isWithinComment {
isWithinQuote = !isWithinQuote
} else {
i++
}

case isNewline && !isWithinQuote:
if lineIdx >= svLineLimit {
break loop
}
lineIdx++
isWithinComment = false

case !isWithinQuote && !isWithinComment:
switch cur {
case comment:
isWithinComment = true

case delimiter:
if recordFields[lineIdx] == 0 {
recordFields[lineIdx] = 1
}
recordFields[lineIdx]++
}
}

}
if err != nil {
return false

n, err = reader.Read(buf)
}

var fieldCount int
for _, fields := range recordFields {
if fields > 0 {
fieldCount = fields
break
}
lines++
}

return r.FieldsPerRecord > 1 && lines > 1
sum := fieldCount * len(recordFields)
for _, fields := range recordFields {
sum -= fields
}

return sum == 0 && fieldCount > 1 && lineIdx > 0
}

// prepSvReader returns a reader over in after dropLastLine has been applied
// to it. When limit is non-zero the reader yields at most limit bytes.
func prepSvReader(in []byte, limit uint32) io.Reader {
	trimmed := bytes.NewReader(dropLastLine(in, limit))
	if limit == 0 {
		// No cap requested: expose the trimmed input as is.
		return trimmed
	}

	return io.LimitReader(trimmed, int64(limit))
}

// dropLastLine drops the last incomplete line from b.
Expand Down
199 changes: 199 additions & 0 deletions internal/magic/text_csv_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
package magic

import (
"io"
"reflect"
"testing"
)

// TestCsv runs Csv against a table of inputs and compares the verdict with
// the expected one.
func TestCsv(t *testing.T) {
	type testCase struct {
		name  string
		input string
		limit uint32
		want  bool
	}

	cases := []testCase{
		{
			name:  "csv multiple lines",
			input: "a,b,c\n1,2,3",
			want:  true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Csv([]byte(tc.input), tc.limit)
			if got == tc.want {
				return
			}
			t.Errorf("Csv() = %v, want %v", got, tc.want)
		})
	}
}

// TestTsv runs Tsv against a table of inputs and compares the verdict with
// the expected one.
func TestTsv(t *testing.T) {
	tests := []struct {
		name  string
		input string
		limit uint32
		want  bool
	}{
		{
			name:  "tsv multiple lines",
			input: "a\tb\tc\n1\t2\t3",
			want:  true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := Tsv([]byte(tt.input), tt.limit); got != tt.want {
				// Name the function actually under test so a failure is not
				// attributed to Csv.
				t.Errorf("Tsv() = %v, want %v", got, tt.want)
			}
		})
	}
}

// TestSv runs sv against a table of inputs covering empty input, the byte
// limit, incomplete last lines, quoting and comments.
func TestSv(t *testing.T) {
	tests := []struct {
		name      string
		delimiter byte
		input     string
		limit     uint32
		want      bool
	}{
		{
			name:      "empty",
			delimiter: ',',
			input:     "",
			want:      false,
		},
		{
			name:      "csv single line",
			delimiter: ',',
			input:     "a,b,c",
			want:      false,
		},
		{
			name:      "csv multiple lines",
			delimiter: ',',
			input:     "a,b,c\n1,2,3",
			want:      true,
		},
		{
			name:      "csv with spaces",
			delimiter: ',',
			input:     " a ,\t\tb, c\n1, 2 , 3 ",
			want:      true,
		},
		{
			name:      "csv multiple lines under limit",
			delimiter: ',',
			input:     "a,b,c\n1,2,3\n4,5,6",
			limit:     10,
			want:      true,
		},
		{
			name:      "csv multiple lines over limit",
			delimiter: ',',
			input:     "a,b,c\n1,2,3\n4,5,6",
			limit:     1,
			want:      false,
		},
		{
			name:      "csv 2 line with incomplete last line",
			delimiter: ',',
			input:     "a,b,c\n1,2",
			want:      false,
		},
		{
			name:      "csv 3 line with incomplete last line",
			delimiter: ',',
			input:     "a,b,c\na,b,c\n1,2",
			limit:     10,
			want:      true,
		},
		{
			name:      "within quotes",
			delimiter: ',',
			input:     "\"a,b,c\n1,2,3\n4,5,6\"",
			want:      false,
		},
		{
			name:      "partial quotes",
			delimiter: ',',
			input:     "\"a,b,c\n1,2,3\n4,5,6",
			want:      false,
		},
		{
			name:      "has quotes",
			delimiter: ',',
			input:     "\"a\",\"b\",\"c\"\n1,\",\"2,3\n\"4\",5,6",
			want:      true,
		},
		{
			name:      "comma within quotes",
			delimiter: ',',
			input:     "\"a,b\",\"c\"\n1,2,3\n\"4\",5,6",
			want:      false,
		},
		{
			name:      "ignore comments",
			delimiter: ',',
			input:     "#a,b,c\n#1,2,3",
			want:      false,
		},
		{
			name:      "multiple comments at the end of line",
			delimiter: ',',
			input:     "a,b#,c\n1,2#,3",
			want:      true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := sv([]byte(tt.input), tt.delimiter, tt.limit); got != tt.want {
				// Name the function actually under test so a failure is not
				// attributed to Csv.
				t.Errorf("sv() = %v, want %v", got, tt.want)
			}
		})
	}
}

// Test_prepSvReader drains the reader returned by prepSvReader and compares
// the bytes it yields with the expected text.
func Test_prepSvReader(t *testing.T) {
	type testCase struct {
		name  string
		input string
		limit uint32
		want  string
	}

	cases := []testCase{
		{
			name:  "multiple lines",
			input: "a,b,c\n1,2,3",
			limit: 0,
			want:  "a,b,c\n1,2,3",
		},
		{
			name:  "limit",
			input: "a,b,c\n1,2,3",
			limit: 5,
			want:  "a,b,c",
		},
		{
			name:  "drop last line",
			input: "a,b,c\na,b,c\na,b,c\n1,2",
			limit: 20,
			want:  "a,b,c\na,b,c\na,b,c",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			content, err := io.ReadAll(prepSvReader([]byte(tc.input), tc.limit))
			if err != nil {
				t.Fatalf("prepSvReader() error = %v", err)
			}

			got := string(content)
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("prepSvReader() = '%v', want '%v'", got, tc.want)
		})
	}
}

0 comments on commit 8f71d34

Please sign in to comment.