Move BOM stripping logic to FieldParser

tmaxmax · Jul 11, 2023 · 1352b29 · 1352b29
1 parent 5266c32
commit 1352b29
Show file tree

Hide file tree

Showing 4 changed files with 87 additions and 47 deletions.
diff --git a/internal/parser/field_parser.go b/internal/parser/field_parser.go
@@ -7,9 +7,13 @@ import (
 
 // FieldParser extracts fields from a byte slice.
 type FieldParser struct {
-	err          error
-	data         string
+	err  error
+	data string
+
+	isUntouched bool // true after construction/Reset; cleared by Next or once the BOM has been stripped
+
 	keepComments bool
+	removeBOM    bool // set by RemoveBOM; enables stripping a leading UTF-8 BOM from the input
 }
 
 func min(a, b int) int {
@@ -62,6 +66,8 @@ var ErrUnexpectedEOF = errors.New("go-sse: unexpected end of input")
 // Next parses the next available field in the remaining buffer.
 // It returns false if there are no more fields to parse.
 func (f *FieldParser) Next(r *Field) bool {
+	f.isUntouched = false
+
 	for f.data != "" {
 		chunk, rem, hasNewline := NextChunk(f.data)
 		if !hasNewline {
@@ -85,6 +91,8 @@ func (f *FieldParser) Next(r *Field) bool {
 func (f *FieldParser) Reset(data string) {
 	f.data = data
 	f.err = nil
+	f.isUntouched = true // the new input has not been parsed yet
+	f.doRemoveBOM()      // strips a leading BOM right away when RemoveBOM(true) is in effect
 }
 
 // Err returns the last error encountered by the parser. It is either nil or ErrUnexpectedEOF.
@@ -98,7 +106,23 @@ func (f *FieldParser) KeepComments(shouldKeep bool) {
 	f.keepComments = shouldKeep
 }
 
+// RemoveBOM configures whether the FieldParser strips a leading UTF-8 byte order
+// mark ("\xEF\xBB\xBF") from its input. When enabled, the mark is stripped on every
+// Reset with non-empty input, and also immediately if the current input is
+// non-empty and untouched (no Next call or BOM removal since it was set).
+func (f *FieldParser) RemoveBOM(shouldRemove bool) {
+	f.removeBOM = shouldRemove
+	f.doRemoveBOM()
+}
+
+func (f *FieldParser) doRemoveBOM() {
+	if f.removeBOM && f.isUntouched && f.data != "" {
+		f.data = strings.TrimPrefix(f.data, "\xEF\xBB\xBF") // no-op when the input does not start with the UTF-8 BOM
+		f.isUntouched = false                               // the attempt is made at most once per input, even if no BOM was found
+	}
+}
+
 // NewFieldParser creates a parser that extracts fields from the given string.
 func NewFieldParser(data string) *FieldParser {
-	return &FieldParser{data: data}
+	return &FieldParser{data: data, isUntouched: true} // starts untouched so a later RemoveBOM(true) can still strip a leading BOM
 }
diff --git a/internal/parser/field_parser_test.go b/internal/parser/field_parser_test.go
@@ -99,6 +99,29 @@ func TestFieldParser(t *testing.T) {
 			}
 		})
 	}
+
+	t.Run("BOM", func(t *testing.T) {
+		p := parser.NewFieldParser("\xEF\xBB\xBFid: 5\n")
+		p.RemoveBOM(true)
+
+		var f parser.Field
+		if !p.Next(&f) {
+			t.Fatalf("a field should be available (err=%v)", p.Err())
+		}
+
+		expectedF := parser.Field{Name: parser.FieldNameID, Value: "5"}
+		if f != expectedF {
+			t.Fatalf("invalid field: received %v, expected %v", f, expectedF)
+		}
+
+		p.Reset("\xEF\xBB\xBF")
+		if p.Next(&f) {
+			t.Fatalf("no fields should be available")
+		}
+		if p.Err() != nil {
+			t.Fatalf("no error is expected after BOM removal")
+		}
+	})
 }
 
 func BenchmarkFieldParser(b *testing.B) {

diff --git a/internal/parser/parser.go b/internal/parser/parser.go
@@ -2,61 +2,51 @@ package parser
 
 import (
 	"bufio"
-	"bytes"
 	"io"
 	"unsafe"
 )
 
-// newSplitFunc creates a split function for a bufio.Scanner that splits a sequence of
+// splitFunc is a split function for a bufio.Scanner that splits a sequence of
 // bytes into SSE events. Each event ends with two consecutive newline sequences,
 // where a newline sequence is defined as either "\n", "\r", or "\r\n".
 //
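-// This split function also removes the BOM sequence from the first event, if it exists.
+// This split function does not remove the BOM sequence; that is done by the FieldParser (see RemoveBOM).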
-func newSplitFunc() bufio.SplitFunc {
-	isFirstToken := true
+func splitFunc(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	if len(data) == 0 {
+		return 0, nil, nil
+	}
 
-	return func(data []byte, atEOF bool) (advance int, token []byte, err error) {
-		if len(data) == 0 {
-			return 0, nil, nil
+	var start int
+	for {
+		index, endlineLen := NewlineIndex((*(*string)(unsafe.Pointer(&data)))[advance:])
+		advance += index + endlineLen
+		if index == 0 {
+			// If it was a blank line, skip it.
+			start += endlineLen
 		}
-
-		var start int
-		for {
-			index, endlineLen := NewlineIndex((*(*string)(unsafe.Pointer(&data)))[advance:])
-			advance += index + endlineLen
-			if index == 0 {
-				// If it was a blank line, skip it.
-				start += endlineLen
-			}
-			// We've reached the end of data or a second newline follows and the line isn't blank.
-			// The latter means we have an event.
-			if advance == len(data) || (isNewlineChar(data[advance]) && index > 0) {
-				break
-			}
+		// We've reached the end of data or a second newline follows and the line isn't blank.
+		// The latter means we have an event.
+		if advance == len(data) || (isNewlineChar(data[advance]) && index > 0) {
+			break
 		}
+	}
 
-		if l := len(data); advance == l && !atEOF {
-			// We have reached the end of the buffer but have not yet seen two consecutive
-			// newline sequences, so we request more data.
-			return 0, nil, nil
-		} else if advance < l {
-			// We have found a newline. Consume the end-of-line sequence.
+	if l := len(data); advance == l && !atEOF {
+		// We have reached the end of the buffer but have not yet seen two consecutive
+		// newline sequences, so we request more data.
+		return 0, nil, nil
+	} else if advance < l {
+		// We have found a newline. Consume the end-of-line sequence.
+		advance++
+		// Consume one more character if end-of-line is "\r\n".
+		if advance < l && data[advance-1] == '\r' && data[advance] == '\n' {
 			advance++
-			// Consume one more character if end-of-line is "\r\n".
-			if advance < l && data[advance-1] == '\r' && data[advance] == '\n' {
-				advance++
-			}
 		}
+	}
 
-		token = data[start:advance]
-		if isFirstToken {
-			// Remove BOM, if present.
-			token = bytes.TrimPrefix(token, []byte("\xEF\xBB\xBF"))
-			isFirstToken = false
-		}
+	token = data[start:advance]
 
-		return advance, token, nil
-	}
+	return advance, token, nil
 }
 
 // Parser extracts fields from a reader. Reading is buffered using a bufio.Scanner.
@@ -92,7 +82,10 @@ func (r *Parser) Err() error {
 // New returns a Parser that extracts fields from a reader.
 func New(r io.Reader) *Parser {
 	sc := bufio.NewScanner(r)
-	sc.Split(newSplitFunc())
+	sc.Split(splitFunc)
+
+	fsc := NewFieldParser("") // starts empty; input is presumably supplied later via Reset — TODO confirm
+	fsc.RemoveBOM(true)       // the split function no longer strips the BOM; NOTE(review): Reset re-arms stripping for every input, not only the first — confirm intended
 
-	return &Parser{inputScanner: sc, fieldScanner: NewFieldParser("")}
+	return &Parser{inputScanner: sc, fieldScanner: fsc}
 }
diff --git a/internal/parser/split_func_test.go b/internal/parser/split_func_test.go
@@ -19,8 +19,8 @@ func TestSplitFunc(t *testing.T) {
 	longString := strings.Repeat("abcdef\rghijklmn\nopqrstu\r\nvwxyz", 193)
 	testCases := []testCase{
 		{
-			name:  "Short sample with BOM",
-			input: "\xEF\xBB\xBFmama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n\nmama tata bunica bunicul\nsarmale\r\n\r\r\naualeu\nce taraboi",
+			name:  "Short sample",
+			input: "mama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n\nmama tata bunica bunicul\nsarmale\r\n\r\r\naualeu\nce taraboi",
 			expected: []string{
 				"mama mea e super\nce genial\nsincer n-am ce sa zic\r\n\r\n",
 				"mama tata bunica bunicul\nsarmale\r\n\r",
@@ -47,7 +47,7 @@ func TestSplitFunc(t *testing.T) {
 
 			r := strings.NewReader(tc.input)
 			s := bufio.NewScanner(r)
-			s.Split(newSplitFunc())
+			s.Split(splitFunc)
 
 			tokens := make([]string, 0, len(tc.expected))