Skip to content

Commit

Permalink
Move BOM stripping logic to FieldParser
Browse files Browse the repository at this point in the history
  • Loading branch information
tmaxmax committed Jul 11, 2023
1 parent 5266c32 commit 1352b29
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 47 deletions.
30 changes: 27 additions & 3 deletions internal/parser/field_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@ import (

// FieldParser extracts fields from a string.
// The input is supplied through NewFieldParser or replaced through Reset.
type FieldParser struct {
err error
data string
err error
data string

// isUntouched is set by NewFieldParser and Reset, and cleared by Next
// and by a BOM removal attempt on non-empty input.
isUntouched bool

// keepComments is set through KeepComments.
keepComments bool
// removeBOM is set through RemoveBOM; when true, a leading UTF-8 BOM
// is trimmed from untouched input.
removeBOM bool
}

func min(a, b int) int {
Expand Down Expand Up @@ -62,6 +66,8 @@ var ErrUnexpectedEOF = errors.New("go-sse: unexpected end of input")
// Next parses the next available field in the remaining buffer.
// It returns false if there are no more fields to parse.
func (f *FieldParser) Next(r *Field) bool {
f.isUntouched = false

for f.data != "" {
chunk, rem, hasNewline := NextChunk(f.data)
if !hasNewline {
Expand All @@ -85,6 +91,8 @@ func (f *FieldParser) Next(r *Field) bool {
func (f *FieldParser) Reset(data string) {
	// Swap in the new input, drop any previous error and mark the parser as
	// untouched again, so that BOM removal (when enabled through RemoveBOM)
	// is attempted on this input as well.
	f.data, f.err, f.isUntouched = data, nil, true
	f.doRemoveBOM()
}

// Err returns the last error encountered by the parser. It is either nil or ErrUnexpectedEOF.
Expand All @@ -98,7 +106,23 @@ func (f *FieldParser) KeepComments(shouldKeep bool) {
f.keepComments = shouldKeep
}

// RemoveBOM configures whether the FieldParser strips a leading Unicode BOM
// (the UTF-8 byte sequence EF BB BF) from its input.
// If the option is enabled while the current input is still untouched (no field
// has been parsed from it yet), removal of the BOM is attempted immediately.
func (f *FieldParser) RemoveBOM(shouldRemove bool) {
f.removeBOM = shouldRemove
// The flag must be stored first: doRemoveBOM reads it and does nothing
// unless removal is enabled and the input is untouched and non-empty.
f.doRemoveBOM()
}

// doRemoveBOM trims a leading UTF-8 byte order mark from the pending input.
// It acts only when BOM removal is enabled, the input is still untouched and
// the input is non-empty. After acting, the input no longer counts as
// untouched, so the trim is not repeated until the parser is reset.
func (f *FieldParser) doRemoveBOM() {
	if !f.removeBOM || !f.isUntouched || f.data == "" {
		return
	}

	const bom = "\xEF\xBB\xBF"
	f.data = strings.TrimPrefix(f.data, bom)
	f.isUntouched = false
}

// NewFieldParser creates a parser that extracts fields from the given string.
// The parser starts out untouched, so enabling RemoveBOM before the first
// call to Next strips a leading BOM from data.
func NewFieldParser(data string) *FieldParser {
return &FieldParser{data: data}
return &FieldParser{data: data, isUntouched: true}
}
23 changes: 23 additions & 0 deletions internal/parser/field_parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,29 @@ func TestFieldParser(t *testing.T) {
}
})
}

t.Run("BOM", func(t *testing.T) {
p := parser.NewFieldParser("\xEF\xBB\xBFid: 5\n")
p.RemoveBOM(true)

var f parser.Field
if !p.Next(&f) {
t.Fatalf("a field should be available (err=%v)", p.Err())
}

expectedF := parser.Field{Name: parser.FieldNameID, Value: "5"}
if f != expectedF {
t.Fatalf("invalid field: received %v, expected %v", f, expectedF)
}

p.Reset("\xEF\xBB\xBF")
if p.Next(&f) {
t.Fatalf("no fields should be available")
}
if p.Err() != nil {
t.Fatalf("no error is expected after BOM removal")
}
})
}

func BenchmarkFieldParser(b *testing.B) {
Expand Down
75 changes: 34 additions & 41 deletions internal/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,51 @@ package parser

import (
"bufio"
"bytes"
"io"
"unsafe"
)

// newSplitFunc creates a split function for a bufio.Scanner that splits a sequence of
// splitFunc is a split function for a bufio.Scanner that splits a sequence of
// bytes into SSE events. Each event ends with two consecutive newline sequences,
// where a newline sequence is defined as either "\n", "\r", or "\r\n".
//
// This split function also removes the BOM sequence from the first event, if it exists.
func newSplitFunc() bufio.SplitFunc {
isFirstToken := true
func splitFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
if len(data) == 0 {
return 0, nil, nil
}

return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
if len(data) == 0 {
return 0, nil, nil
var start int
for {
index, endlineLen := NewlineIndex((*(*string)(unsafe.Pointer(&data)))[advance:])
advance += index + endlineLen
if index == 0 {
// If it was a blank line, skip it.
start += endlineLen
}

var start int
for {
index, endlineLen := NewlineIndex((*(*string)(unsafe.Pointer(&data)))[advance:])
advance += index + endlineLen
if index == 0 {
// If it was a blank line, skip it.
start += endlineLen
}
// We've reached the end of data or a second newline follows and the line isn't blank.
// The latter means we have an event.
if advance == len(data) || (isNewlineChar(data[advance]) && index > 0) {
break
}
// We've reached the end of data or a second newline follows and the line isn't blank.
// The latter means we have an event.
if advance == len(data) || (isNewlineChar(data[advance]) && index > 0) {
break
}
}

if l := len(data); advance == l && !atEOF {
// We have reached the end of the buffer but have not yet seen two consecutive
// newline sequences, so we request more data.
return 0, nil, nil
} else if advance < l {
// We have found a newline. Consume the end-of-line sequence.
if l := len(data); advance == l && !atEOF {
// We have reached the end of the buffer but have not yet seen two consecutive
// newline sequences, so we request more data.
return 0, nil, nil
} else if advance < l {
// We have found a newline. Consume the end-of-line sequence.
advance++
// Consume one more character if end-of-line is "\r\n".
if advance < l && data[advance-1] == '\r' && data[advance] == '\n' {
advance++
// Consume one more character if end-of-line is "\r\n".
if advance < l && data[advance-1] == '\r' && data[advance] == '\n' {
advance++
}
}
}

token = data[start:advance]
if isFirstToken {
// Remove BOM, if present.
token = bytes.TrimPrefix(token, []byte("\xEF\xBB\xBF"))
isFirstToken = false
}
token = data[start:advance]

return advance, token, nil
}
return advance, token, nil
}

// Parser extracts fields from a reader. Reading is buffered using a bufio.Scanner.
Expand Down Expand Up @@ -92,7 +82,10 @@ func (r *Parser) Err() error {
// New returns a Parser that extracts fields from a reader.
// The reader is wrapped in a bufio.Scanner, and the field parser
// is created with empty input and BOM removal enabled.
func New(r io.Reader) *Parser {
sc := bufio.NewScanner(r)
sc.Split(newSplitFunc())
sc.Split(splitFunc)

// The field parser starts with no input; BOM removal is switched on up front.
fsc := NewFieldParser("")
fsc.RemoveBOM(true)

return &Parser{inputScanner: sc, fieldScanner: NewFieldParser("")}
return &Parser{inputScanner: sc, fieldScanner: fsc}
}
6 changes: 3 additions & 3 deletions internal/parser/split_func_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ func TestSplitFunc(t *testing.T) {
longString := strings.Repeat("abcdef\rghijklmn\nopqrstu\r\nvwxyz", 193)
testCases := []testCase{
{
name: "Short sample with BOM",
input: "\xEF\xBB\xBFmama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n\nmama tata bunica bunicul\nsarmale\r\n\r\r\naualeu\nce taraboi",
name: "Short sample",
input: "mama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n\nmama tata bunica bunicul\nsarmale\r\n\r\r\naualeu\nce taraboi",
expected: []string{
"mama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n",
"mama tata bunica bunicul\nsarmale\r\n\r",
Expand All @@ -47,7 +47,7 @@ func TestSplitFunc(t *testing.T) {

r := strings.NewReader(tc.input)
s := bufio.NewScanner(r)
s.Split(newSplitFunc())
s.Split(splitFunc)

tokens := make([]string, 0, len(tc.expected))

Expand Down

0 comments on commit 1352b29

Please sign in to comment.