From f963bc320ff0c7f8cdd03234976c1a9c688b7fd8 Mon Sep 17 00:00:00 2001 From: Thomas Pelletier Date: Sun, 24 Jan 2016 20:43:14 +0100 Subject: [PATCH] Generic input Fixes #47 --- lexer.go | 342 +++++++++++++++++++++++++------------------------ lexer_test.go | 8 +- parser_test.go | 4 +- test.sh | 4 +- toml.go | 26 ++-- 5 files changed, 201 insertions(+), 183 deletions(-) diff --git a/lexer.go b/lexer.go index 813f21ad..df0596b2 100644 --- a/lexer.go +++ b/lexer.go @@ -7,10 +7,11 @@ package toml import ( "fmt" + "github.com/pelletier/go-buffruneio" + "io" "regexp" "strconv" "strings" - "unicode/utf8" ) var dateRegexp *regexp.Regexp @@ -20,47 +21,56 @@ type tomlLexStateFn func() tomlLexStateFn // Define lexer type tomlLexer struct { - input string - start int - pos int - width int - tokens chan token - depth int - line int - col int + input *buffruneio.Reader // Textual source + buffer []rune // Runes composing the current token + tokens chan token + depth int + line int + col int + endbufferLine int + endbufferCol int +} + +// Basic read operations on input + +func (l *tomlLexer) read() rune { + r, err := l.input.ReadRune() + if err != nil { + panic(err) + } + if r == '\n' { + l.endbufferLine++ + l.endbufferCol = 1 + } else { + l.endbufferCol++ + } + return r } -func (l *tomlLexer) run() { - for state := l.lexVoid; state != nil; { - state = state() +func (l *tomlLexer) next() rune { + r := l.read() + + if r != eof { + l.buffer = append(l.buffer, r) } - close(l.tokens) + return r } -func (l *tomlLexer) nextStart() { - // iterate by runes (utf8 characters) - // search for newlines and advance line/col counts - for i := l.start; i < l.pos; { - r, width := utf8.DecodeRuneInString(l.input[i:]) - if r == '\n' { - l.line++ - l.col = 1 - } else { - l.col++ - } - i += width - } - // advance start position to next token - l.start = l.pos +func (l *tomlLexer) ignore() { + l.buffer = make([]rune, 0) + l.line = l.endbufferLine + l.col = l.endbufferCol } -func (l *tomlLexer) emit(t tokenType) { - l.tokens <- token{ - Position: Position{l.line, l.col}, - typ: t, - val: l.input[l.start:l.pos], +func (l *tomlLexer) skip() { + l.next() + l.ignore() +} + +func (l *tomlLexer) fastForward(n int) { + for i := 0; i < n; i++ { + l.next() } - l.nextStart() } func (l *tomlLexer) emitWithValue(t tokenType, value string) { @@ -69,27 +79,37 @@ func (l *tomlLexer) emitWithValue(t tokenType, value string) { typ: t, val: value, } - l.nextStart() + l.ignore() } -func (l *tomlLexer) next() rune { - if l.pos >= len(l.input) { - l.width = 0 - return eof +func (l *tomlLexer) emit(t tokenType) { + l.emitWithValue(t, string(l.buffer)) +} + +func (l *tomlLexer) peek() rune { + r, err := l.input.ReadRune() + if err != nil { + panic(err) } - var r rune - r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) - l.pos += l.width + l.input.UnreadRune() return r } -func (l *tomlLexer) ignore() { - l.nextStart() +func (l *tomlLexer) follow(next string) bool { + for _, expectedRune := range next { + r, err := l.input.ReadRune() + defer l.input.UnreadRune() + if err != nil { + panic(err) + } + if expectedRune != r { + return false + } + } + return true } -func (l *tomlLexer) backup() { - l.pos -= l.width -} +// Error management func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { l.tokens <- token{ @@ -100,23 +120,7 @@ func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { return nil } -func (l *tomlLexer) peek() rune { - r := l.next() - l.backup() - return r -} - -func (l *tomlLexer) 
accept(valid string) bool { - if strings.IndexRune(valid, l.next()) >= 0 { - return true - } - l.backup() - return false -} - -func (l *tomlLexer) follow(next string) bool { - return strings.HasPrefix(l.input[l.pos:], next) -} +// State functions func (l *tomlLexer) lexVoid() tomlLexStateFn { for { @@ -128,10 +132,13 @@ func (l *tomlLexer) lexVoid() tomlLexStateFn { return l.lexComment case '=': return l.lexEqual + case '\n': + l.skip() + continue } if isSpace(next) { - l.ignore() + l.skip() } if l.depth > 0 { @@ -142,7 +149,8 @@ func (l *tomlLexer) lexVoid() tomlLexStateFn { return l.lexKey } - if l.next() == eof { + if next == eof { + l.next() break } } @@ -178,8 +186,7 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { case ',': return l.lexComma case '\n': - l.ignore() - l.pos++ + l.skip() if l.depth == 0 { return l.lexVoid } @@ -196,14 +203,20 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { return l.lexFalse } - if isAlphanumeric(next) { - return l.lexKey + if isSpace(next) { + l.skip() + continue + } + + if next == eof { + l.next() + break } - dateMatch := dateRegexp.FindString(l.input[l.pos:]) + possibleDate := string(l.input.Peek(35)) + dateMatch := dateRegexp.FindString(possibleDate) if dateMatch != "" { - l.ignore() - l.pos += len(dateMatch) + l.fastForward(len(dateMatch)) return l.lexDate } @@ -211,13 +224,10 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { return l.lexNumber } - if isSpace(next) { - l.ignore() + if isAlphanumeric(next) { + return l.lexKey } - if l.next() == eof { - break - } } l.emit(tokenEOF) @@ -225,15 +235,13 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { } func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenLeftCurlyBrace) return l.lexRvalue } func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenRightCurlyBrace) return l.lexRvalue } @@ -244,37 +252,32 @@ func (l *tomlLexer) lexDate() tomlLexStateFn { } func (l *tomlLexer) lexTrue() tomlLexStateFn { - l.ignore() - l.pos += 4 + l.fastForward(4) l.emit(tokenTrue) return l.lexRvalue } func (l *tomlLexer) lexFalse() tomlLexStateFn { - l.ignore() - l.pos += 5 + l.fastForward(5) l.emit(tokenFalse) return l.lexRvalue } func (l *tomlLexer) lexEqual() tomlLexStateFn { - l.ignore() - l.accept("=") + l.next() l.emit(tokenEqual) return l.lexRvalue } func (l *tomlLexer) lexComma() tomlLexStateFn { - l.ignore() - l.accept(",") + l.next() l.emit(tokenComma) return l.lexRvalue } func (l *tomlLexer) lexKey() tomlLexStateFn { - l.ignore() inQuotes := false - for r := l.next(); isKeyChar(r) || r == '\n'; r = l.next() { + for r := l.peek(); isKeyChar(r) || r == '\n'; r = l.peek() { if r == '"' { inQuotes = !inQuotes } else if r == '\n' { @@ -284,46 +287,40 @@ func (l *tomlLexer) lexKey() tomlLexStateFn { } else if !isValidBareChar(r) && !inQuotes { return l.errorf("keys cannot contain %c character", r) } + l.next() } - l.backup() l.emit(tokenKey) return l.lexVoid } func (l *tomlLexer) lexComment() tomlLexStateFn { - for { - next := l.next() - if next == '\n' || next == eof { - break - } + for next := l.peek(); next != '\n' && next != eof; next = l.peek() { + l.next() } l.ignore() return l.lexVoid } func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenLeftBracket) return l.lexRvalue } func (l *tomlLexer) lexLiteralString() tomlLexStateFn { - l.pos++ - l.ignore() + l.skip() growingString := "" // handle special case for triple-quote terminator := "'" if l.follow("''") { 
- l.pos += 2 - l.ignore() + l.skip() + l.skip() terminator = "'''" // special case: discard leading newline if l.peek() == '\n' { - l.pos++ - l.ignore() + l.skip() } } @@ -331,50 +328,48 @@ func (l *tomlLexer) lexLiteralString() tomlLexStateFn { for { if l.follow(terminator) { l.emitWithValue(tokenString, growingString) - l.pos += len(terminator) + l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } - growingString += string(l.peek()) - - if l.next() == eof { + next := l.peek() + if next == eof { break } + growingString += string(l.next()) } return l.errorf("unclosed string") } func (l *tomlLexer) lexString() tomlLexStateFn { - l.pos++ - l.ignore() + l.skip() growingString := "" // handle special case for triple-quote terminator := "\"" if l.follow("\"\"") { - l.pos += 2 - l.ignore() + l.skip() + l.skip() terminator = "\"\"\"" // special case: discard leading newline if l.peek() == '\n' { - l.pos++ - l.ignore() + l.skip() } } for { if l.follow(terminator) { l.emitWithValue(tokenString, growingString) - l.pos += len(terminator) + l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } if l.follow("\\") { - l.pos++ + l.next() switch l.peek() { case '\r': fallthrough @@ -384,56 +379,60 @@ func (l *tomlLexer) lexString() tomlLexStateFn { fallthrough case ' ': // skip all whitespace chars following backslash - l.pos++ for strings.ContainsRune("\r\n\t ", l.peek()) { - l.pos++ + l.next() } - l.pos-- case '"': growingString += "\"" + l.next() case 'n': growingString += "\n" + l.next() case 'b': growingString += "\b" + l.next() case 'f': growingString += "\f" + l.next() case '/': growingString += "/" + l.next() case 't': growingString += "\t" + l.next() case 'r': growingString += "\r" + l.next() case '\\': growingString += "\\" + l.next() case 'u': - l.pos++ + l.next() code := "" for i := 0; i < 4; i++ { c := l.peek() - l.pos++ if !isHexDigit(c) { return l.errorf("unfinished unicode escape") } + l.next() code = code + string(c) } - l.pos-- intcode, err := strconv.ParseInt(code, 16, 32) if err != nil { return l.errorf("invalid unicode escape: \\u" + code) } growingString += string(rune(intcode)) case 'U': - l.pos++ + l.next() code := "" for i := 0; i < 8; i++ { c := l.peek() - l.pos++ if !isHexDigit(c) { return l.errorf("unfinished unicode escape") } + l.next() code = code + string(c) } - l.pos-- intcode, err := strconv.ParseInt(code, 16, 64) if err != nil { return l.errorf("invalid unicode escape: \\U" + code) @@ -447,10 +446,11 @@ func (l *tomlLexer) lexString() tomlLexStateFn { if 0x00 <= r && r <= 0x1F { return l.errorf("unescaped control character %U", r) } + l.next() growingString += string(r) } - if l.next() == eof { + if l.peek() == eof { break } } @@ -459,12 +459,11 @@ func (l *tomlLexer) lexString() tomlLexStateFn { } func (l *tomlLexer) lexKeyGroup() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() if l.peek() == '[' { // token '[[' signifies an array of anonymous key groups - l.pos++ + l.next() l.emit(tokenDoubleLeftBracket) return l.lexInsideKeyGroupArray } @@ -474,86 +473,85 @@ func (l *tomlLexer) lexKeyGroup() tomlLexStateFn { } func (l *tomlLexer) lexInsideKeyGroupArray() tomlLexStateFn { - for { - if l.peek() == ']' { - if l.pos > l.start { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if len(l.buffer) > 0 { l.emit(tokenKeyGroupArray) } - l.ignore() - l.pos++ + l.next() if l.peek() != ']' { - break // error + break } - l.pos++ + l.next() l.emit(tokenDoubleRightBracket) return l.lexVoid - } else if l.peek() == '[' { + case '[': return 
l.errorf("group name cannot contain ']'") - } - - if l.next() == eof { - break + default: + l.next() } } return l.errorf("unclosed key group array") } func (l *tomlLexer) lexInsideKeyGroup() tomlLexStateFn { - for { - if l.peek() == ']' { - if l.pos > l.start { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if len(l.buffer) > 0 { l.emit(tokenKeyGroup) } - l.ignore() - l.pos++ + l.next() l.emit(tokenRightBracket) return l.lexVoid - } else if l.peek() == '[' { + case '[': return l.errorf("group name cannot contain ']'") - } - - if l.next() == eof { - break + default: + l.next() } } return l.errorf("unclosed key group") } func (l *tomlLexer) lexRightBracket() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenRightBracket) return l.lexRvalue } func (l *tomlLexer) lexNumber() tomlLexStateFn { - l.ignore() - if !l.accept("+") { - l.accept("-") + r := l.peek() + if r == '+' || r == '-' { + l.next() } pointSeen := false expSeen := false digitSeen := false for { - next := l.next() + next := l.peek() if next == '.' { if pointSeen { return l.errorf("cannot have two dots in one float") } + l.next() if !isDigit(l.peek()) { return l.errorf("float cannot end with a dot") } pointSeen = true } else if next == 'e' || next == 'E' { expSeen = true - if !l.accept("+") { - l.accept("-") + l.next() + r := l.peek() + if r == '+' || r == '-' { + l.next() } } else if isDigit(next) { digitSeen = true + l.next() } else if next == '_' { + l.next() } else { - l.backup() break } if pointSeen && !digitSeen { @@ -572,17 +570,27 @@ func (l *tomlLexer) lexNumber() tomlLexStateFn { return l.lexRvalue } +func (l *tomlLexer) run() { + for state := l.lexVoid; state != nil; { + state = state() + } + close(l.tokens) +} + func init() { dateRegexp = regexp.MustCompile("^\\d{1,4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?(Z|[+-]\\d{2}:\\d{2})") } // Entry point -func lexToml(input string) chan token { +func lexToml(input io.Reader) chan token { + bufferedInput := buffruneio.NewReader(input) l := &tomlLexer{ - input: input, - tokens: make(chan token), - line: 1, - col: 1, + input: bufferedInput, + tokens: make(chan token), + line: 1, + col: 1, + endbufferLine: 1, + endbufferCol: 1, } go l.run() return l.tokens diff --git a/lexer_test.go b/lexer_test.go index 1964a57a..9fa8be8b 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -1,15 +1,19 @@ package toml -import "testing" +import ( + "strings" + "testing" +) func testFlow(t *testing.T, input string, expectedFlow []token) { - ch := lexToml(input) + ch := lexToml(strings.NewReader(input)) for _, expected := range expectedFlow { token := <-ch if token != expected { t.Log("While testing: ", input) t.Log("compared (got)", token, "to (expected)", expected) t.Log("\tvalue:", token.val, "<->", expected.val) + t.Log("\tvalue as bytes:", []byte(token.val), "<->", []byte(expected.val)) t.Log("\ttype:", token.typ.String(), "<->", expected.typ.String()) t.Log("\tline:", token.Line, "<->", expected.Line) t.Log("\tcolumn:", token.Col, "<->", expected.Col) diff --git a/parser_test.go b/parser_test.go index 53cfcde6..f9191b67 100644 --- a/parser_test.go +++ b/parser_test.go @@ -287,7 +287,7 @@ func TestArrayNestedStrings(t *testing.T) { func TestMissingValue(t *testing.T) { _, err := Load("a = ") - if err.Error() != "(1, 4): expecting a value" { + if err.Error() != "(1, 5): expecting a value" { t.Error("Bad error message:", err.Error()) } } @@ -441,7 +441,7 @@ func TestImplicitDeclarationBefore(t *testing.T) { func TestFloatsWithoutLeadingZeros(t 
*testing.T) { _, err := Load("a = .42") - if err.Error() != "(1, 4): cannot start float with a dot" { + if err.Error() != "(1, 5): cannot start float with a dot" { t.Error("Bad error message:", err.Error()) } diff --git a/test.sh b/test.sh index 410838b0..0a426e03 100755 --- a/test.sh +++ b/test.sh @@ -19,6 +19,8 @@ function git_clone() { popd } +go get github.com/pelletier/go-buffruneio + # get code for BurntSushi TOML validation # pinning all to 'HEAD' for version 0.3.x work (TODO: pin to commit hash when tests stabilize) git_clone github.com/BurntSushi/toml master HEAD @@ -66,7 +68,7 @@ else echo "Invalid Test TOML for $test:" echo "====" cat "$invalid_test.toml" - + echo "Go-TOML Output for $test:" echo "====" echo "go-toml Output:" diff --git a/toml.go b/toml.go index bf35cd34..7af032aa 100644 --- a/toml.go +++ b/toml.go @@ -3,7 +3,8 @@ package toml import ( "errors" "fmt" - "io/ioutil" + "io" + "os" "runtime" "strconv" "strings" @@ -360,8 +361,8 @@ func (t *TomlTree) ToString() string { return t.toToml("", "") } -// Load creates a TomlTree from a string. -func Load(content string) (tree *TomlTree, err error) { +// LoadReader creates a TomlTree from any io.Reader. +func LoadReader(reader io.Reader) (tree *TomlTree, err error) { defer func() { if r := recover(); r != nil { if _, ok := r.(runtime.Error); ok { @@ -370,18 +371,21 @@ func Load(content string) (tree *TomlTree, err error) { err = errors.New(r.(string)) } }() - tree = parseToml(lexToml(content)) + tree = parseToml(lexToml(reader)) return } +// Load creates a TomlTree from a string. +func Load(content string) (tree *TomlTree, err error) { + return LoadReader(strings.NewReader(content)) +} + // LoadFile creates a TomlTree from a file. func LoadFile(path string) (tree *TomlTree, err error) { - buff, ferr := ioutil.ReadFile(path) - if ferr != nil { - err = ferr - } else { - s := string(buff) - tree, err = Load(s) + file, err := os.Open(path) + if err != nil { + return nil, err } - return + defer file.Close() + return LoadReader(file) }
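
Editor's note: the user-facing effect of this patch is that parsing no longer requires the whole document as an in-memory string. LoadReader accepts any io.Reader, Load becomes a thin wrapper over a strings.Reader, and LoadFile streams the file instead of slurping it with ioutil.ReadFile. A minimal usage sketch of the entry points as they stand after this patch (the "config.toml" path and the TOML snippets are illustrative):

package main

import (
	"fmt"
	"os"
	"strings"

	"github.com/pelletier/go-toml"
)

func main() {
	// New: parse from any io.Reader (here a strings.Reader, but an
	// os.File, net.Conn or bytes.Buffer works just as well).
	tree, err := toml.LoadReader(strings.NewReader("[server]\nport = 8080\n"))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Print(tree.ToString())

	// Unchanged signature: Load on a string, now delegating to LoadReader.
	if tree, err = toml.Load("answer = 42"); err == nil {
		fmt.Print(tree.ToString())
	}

	// LoadFile now opens the file and hands it to LoadReader directly;
	// "config.toml" is a hypothetical path for illustration.
	if tree, err = toml.LoadFile("config.toml"); err == nil {
		fmt.Print(tree.ToString())
	}
}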
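
Internally, the rewrite trades index arithmetic on an input string for rune-level operations on go-buffruneio's pushback reader, whose point is that more than one rune can be unread (the patch's follow helper relies on this via its deferred UnreadRune calls). The sketch below re-implements that lookahead as a standalone function to show the ReadRune/UnreadRune pattern; it is illustrative, not the patch's method, assumes the two-value ReadRune signature the patch uses, and treats a read error (end of input) as a mismatch instead of panicking:

package main

import (
	"fmt"
	"strings"

	"github.com/pelletier/go-buffruneio"
)

// follow reports whether the next runes in rd spell out want, then
// pushes every rune it consumed back so the reader is left exactly
// where it started.
func follow(rd *buffruneio.Reader, want string) bool {
	read := 0
	ok := true
	for _, expected := range want {
		r, err := rd.ReadRune()
		if err != nil { // end of input: cannot match
			ok = false
			break
		}
		read++
		if r != expected {
			ok = false
			break
		}
	}
	for i := 0; i < read; i++ {
		rd.UnreadRune()
	}
	return ok
}

func main() {
	rd := buffruneio.NewReader(strings.NewReader(`'''multi-line'''`))
	fmt.Println(follow(rd, "'''")) // true, and rd is still at offset 0
	r, _ := rd.ReadRune()
	fmt.Printf("%c\n", r) // ' -- the lookahead consumed nothing
}

Counting the runes actually read (rather than deferring an UnreadRune per iteration, as the patch does) avoids pushing back runes that were never consumed when the input ends mid-match; either way the reader is restored before the function returns.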
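
One observable behavior change worth flagging for reviewers: reported error columns shift by one in a few cases, because token positions are now derived from the endbufferLine/endbufferCol counters captured at the last ignore() rather than from byte offsets into the source string. The updated assertions in parser_test.go reflect this; a quick check against one of them:

package main

import (
	"fmt"

	"github.com/pelletier/go-toml"
)

func main() {
	// parser_test.go now expects column 5 here, where the old
	// string-indexed lexer reported column 4.
	_, err := toml.Load("a = ")
	fmt.Println(err) // (1, 5): expecting a value
}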