From f963bc320ff0c7f8cdd03234976c1a9c688b7fd8 Mon Sep 17 00:00:00 2001 From: Thomas Pelletier Date: Sun, 24 Jan 2016 20:43:14 +0100 Subject: [PATCH] Generic input Fixes #47 --- lexer.go | 342 +++++++++++++++++++++++++------------------------ lexer_test.go | 8 +- parser_test.go | 4 +- test.sh | 4 +- toml.go | 26 ++-- 5 files changed, 201 insertions(+), 183 deletions(-) diff --git a/lexer.go b/lexer.go index 813f21ad..df0596b2 100644 --- a/lexer.go +++ b/lexer.go @@ -7,10 +7,11 @@ package toml import ( "fmt" + "github.com/pelletier/go-buffruneio" + "io" "regexp" "strconv" "strings" - "unicode/utf8" ) var dateRegexp *regexp.Regexp @@ -20,47 +21,56 @@ type tomlLexStateFn func() tomlLexStateFn // Define lexer type tomlLexer struct { - input string - start int - pos int - width int - tokens chan token - depth int - line int - col int + input *buffruneio.Reader // Textual source + buffer []rune // Runes composing the current token + tokens chan token + depth int + line int + col int + endbufferLine int + endbufferCol int +} + +// Basic read operations on input + +func (l *tomlLexer) read() rune { + r, err := l.input.ReadRune() + if err != nil { + panic(err) + } + if r == '\n' { + l.endbufferLine++ + l.endbufferCol = 1 + } else { + l.endbufferCol++ + } + return r } -func (l *tomlLexer) run() { - for state := l.lexVoid; state != nil; { - state = state() +func (l *tomlLexer) next() rune { + r := l.read() + + if r != eof { + l.buffer = append(l.buffer, r) } - close(l.tokens) + return r } -func (l *tomlLexer) nextStart() { - // iterate by runes (utf8 characters) - // search for newlines and advance line/col counts - for i := l.start; i < l.pos; { - r, width := utf8.DecodeRuneInString(l.input[i:]) - if r == '\n' { - l.line++ - l.col = 1 - } else { - l.col++ - } - i += width - } - // advance start position to next token - l.start = l.pos +func (l *tomlLexer) ignore() { + l.buffer = make([]rune, 0) + l.line = l.endbufferLine + l.col = l.endbufferCol } -func (l *tomlLexer) emit(t tokenType) { - l.tokens <- token{ - Position: Position{l.line, l.col}, - typ: t, - val: l.input[l.start:l.pos], +func (l *tomlLexer) skip() { + l.next() + l.ignore() +} + +func (l *tomlLexer) fastForward(n int) { + for i := 0; i < n; i++ { + l.next() } - l.nextStart() } func (l *tomlLexer) emitWithValue(t tokenType, value string) { @@ -69,27 +79,37 @@ func (l *tomlLexer) emitWithValue(t tokenType, value string) { typ: t, val: value, } - l.nextStart() + l.ignore() } -func (l *tomlLexer) next() rune { - if l.pos >= len(l.input) { - l.width = 0 - return eof +func (l *tomlLexer) emit(t tokenType) { + l.emitWithValue(t, string(l.buffer)) +} + +func (l *tomlLexer) peek() rune { + r, err := l.input.ReadRune() + if err != nil { + panic(err) } - var r rune - r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) - l.pos += l.width + l.input.UnreadRune() return r } -func (l *tomlLexer) ignore() { - l.nextStart() +func (l *tomlLexer) follow(next string) bool { + for _, expectedRune := range next { + r, err := l.input.ReadRune() + defer l.input.UnreadRune() + if err != nil { + panic(err) + } + if expectedRune != r { + return false + } + } + return true } -func (l *tomlLexer) backup() { - l.pos -= l.width -} +// Error management func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { l.tokens <- token{ @@ -100,23 +120,7 @@ func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { return nil } -func (l *tomlLexer) peek() rune { - r := l.next() - l.backup() - return r -} - -func (l *tomlLexer) 
accept(valid string) bool { - if strings.IndexRune(valid, l.next()) >= 0 { - return true - } - l.backup() - return false -} - -func (l *tomlLexer) follow(next string) bool { - return strings.HasPrefix(l.input[l.pos:], next) -} +// State functions func (l *tomlLexer) lexVoid() tomlLexStateFn { for { @@ -128,10 +132,13 @@ func (l *tomlLexer) lexVoid() tomlLexStateFn { return l.lexComment case '=': return l.lexEqual + case '\n': + l.skip() + continue } if isSpace(next) { - l.ignore() + l.skip() } if l.depth > 0 { @@ -142,7 +149,8 @@ func (l *tomlLexer) lexVoid() tomlLexStateFn { return l.lexKey } - if l.next() == eof { + if next == eof { + l.next() break } } @@ -178,8 +186,7 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { case ',': return l.lexComma case '\n': - l.ignore() - l.pos++ + l.skip() if l.depth == 0 { return l.lexVoid } @@ -196,14 +203,20 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { return l.lexFalse } - if isAlphanumeric(next) { - return l.lexKey + if isSpace(next) { + l.skip() + continue + } + + if next == eof { + l.next() + break } - dateMatch := dateRegexp.FindString(l.input[l.pos:]) + possibleDate := string(l.input.Peek(35)) + dateMatch := dateRegexp.FindString(possibleDate) if dateMatch != "" { - l.ignore() - l.pos += len(dateMatch) + l.fastForward(len(dateMatch)) return l.lexDate } @@ -211,13 +224,10 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { return l.lexNumber } - if isSpace(next) { - l.ignore() + if isAlphanumeric(next) { + return l.lexKey } - if l.next() == eof { - break - } } l.emit(tokenEOF) @@ -225,15 +235,13 @@ func (l *tomlLexer) lexRvalue() tomlLexStateFn { } func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenLeftCurlyBrace) return l.lexRvalue } func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenRightCurlyBrace) return l.lexRvalue } @@ -244,37 +252,32 @@ func (l *tomlLexer) lexDate() tomlLexStateFn { } func (l *tomlLexer) lexTrue() tomlLexStateFn { - l.ignore() - l.pos += 4 + l.fastForward(4) l.emit(tokenTrue) return l.lexRvalue } func (l *tomlLexer) lexFalse() tomlLexStateFn { - l.ignore() - l.pos += 5 + l.fastForward(5) l.emit(tokenFalse) return l.lexRvalue } func (l *tomlLexer) lexEqual() tomlLexStateFn { - l.ignore() - l.accept("=") + l.next() l.emit(tokenEqual) return l.lexRvalue } func (l *tomlLexer) lexComma() tomlLexStateFn { - l.ignore() - l.accept(",") + l.next() l.emit(tokenComma) return l.lexRvalue } func (l *tomlLexer) lexKey() tomlLexStateFn { - l.ignore() inQuotes := false - for r := l.next(); isKeyChar(r) || r == '\n'; r = l.next() { + for r := l.peek(); isKeyChar(r) || r == '\n'; r = l.peek() { if r == '"' { inQuotes = !inQuotes } else if r == '\n' { @@ -284,46 +287,40 @@ func (l *tomlLexer) lexKey() tomlLexStateFn { } else if !isValidBareChar(r) && !inQuotes { return l.errorf("keys cannot contain %c character", r) } + l.next() } - l.backup() l.emit(tokenKey) return l.lexVoid } func (l *tomlLexer) lexComment() tomlLexStateFn { - for { - next := l.next() - if next == '\n' || next == eof { - break - } + for next := l.peek(); next != '\n' && next != eof; next = l.peek() { + l.next() } l.ignore() return l.lexVoid } func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenLeftBracket) return l.lexRvalue } func (l *tomlLexer) lexLiteralString() tomlLexStateFn { - l.pos++ - l.ignore() + l.skip() growingString := "" // handle special case for triple-quote terminator := "'" if l.follow("''") { 
- l.pos += 2 - l.ignore() + l.skip() + l.skip() terminator = "'''" // special case: discard leading newline if l.peek() == '\n' { - l.pos++ - l.ignore() + l.skip() } } @@ -331,50 +328,48 @@ func (l *tomlLexer) lexLiteralString() tomlLexStateFn { for { if l.follow(terminator) { l.emitWithValue(tokenString, growingString) - l.pos += len(terminator) + l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } - growingString += string(l.peek()) - - if l.next() == eof { + next := l.peek() + if next == eof { break } + growingString += string(l.next()) } return l.errorf("unclosed string") } func (l *tomlLexer) lexString() tomlLexStateFn { - l.pos++ - l.ignore() + l.skip() growingString := "" // handle special case for triple-quote terminator := "\"" if l.follow("\"\"") { - l.pos += 2 - l.ignore() + l.skip() + l.skip() terminator = "\"\"\"" // special case: discard leading newline if l.peek() == '\n' { - l.pos++ - l.ignore() + l.skip() } } for { if l.follow(terminator) { l.emitWithValue(tokenString, growingString) - l.pos += len(terminator) + l.fastForward(len(terminator)) l.ignore() return l.lexRvalue } if l.follow("\\") { - l.pos++ + l.next() switch l.peek() { case '\r': fallthrough @@ -384,56 +379,60 @@ func (l *tomlLexer) lexString() tomlLexStateFn { fallthrough case ' ': // skip all whitespace chars following backslash - l.pos++ for strings.ContainsRune("\r\n\t ", l.peek()) { - l.pos++ + l.next() } - l.pos-- case '"': growingString += "\"" + l.next() case 'n': growingString += "\n" + l.next() case 'b': growingString += "\b" + l.next() case 'f': growingString += "\f" + l.next() case '/': growingString += "/" + l.next() case 't': growingString += "\t" + l.next() case 'r': growingString += "\r" + l.next() case '\\': growingString += "\\" + l.next() case 'u': - l.pos++ + l.next() code := "" for i := 0; i < 4; i++ { c := l.peek() - l.pos++ if !isHexDigit(c) { return l.errorf("unfinished unicode escape") } + l.next() code = code + string(c) } - l.pos-- intcode, err := strconv.ParseInt(code, 16, 32) if err != nil { return l.errorf("invalid unicode escape: \\u" + code) } growingString += string(rune(intcode)) case 'U': - l.pos++ + l.next() code := "" for i := 0; i < 8; i++ { c := l.peek() - l.pos++ if !isHexDigit(c) { return l.errorf("unfinished unicode escape") } + l.next() code = code + string(c) } - l.pos-- intcode, err := strconv.ParseInt(code, 16, 64) if err != nil { return l.errorf("invalid unicode escape: \\U" + code) @@ -447,10 +446,11 @@ func (l *tomlLexer) lexString() tomlLexStateFn { if 0x00 <= r && r <= 0x1F { return l.errorf("unescaped control character %U", r) } + l.next() growingString += string(r) } - if l.next() == eof { + if l.peek() == eof { break } } @@ -459,12 +459,11 @@ func (l *tomlLexer) lexString() tomlLexStateFn { } func (l *tomlLexer) lexKeyGroup() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() if l.peek() == '[' { // token '[[' signifies an array of anonymous key groups - l.pos++ + l.next() l.emit(tokenDoubleLeftBracket) return l.lexInsideKeyGroupArray } @@ -474,86 +473,85 @@ func (l *tomlLexer) lexKeyGroup() tomlLexStateFn { } func (l *tomlLexer) lexInsideKeyGroupArray() tomlLexStateFn { - for { - if l.peek() == ']' { - if l.pos > l.start { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if len(l.buffer) > 0 { l.emit(tokenKeyGroupArray) } - l.ignore() - l.pos++ + l.next() if l.peek() != ']' { - break // error + break } - l.pos++ + l.next() l.emit(tokenDoubleRightBracket) return l.lexVoid - } else if l.peek() == '[' { + case '[': return 
l.errorf("group name cannot contain ']'") - } - - if l.next() == eof { - break + default: + l.next() } } return l.errorf("unclosed key group array") } func (l *tomlLexer) lexInsideKeyGroup() tomlLexStateFn { - for { - if l.peek() == ']' { - if l.pos > l.start { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if len(l.buffer) > 0 { l.emit(tokenKeyGroup) } - l.ignore() - l.pos++ + l.next() l.emit(tokenRightBracket) return l.lexVoid - } else if l.peek() == '[' { + case '[': return l.errorf("group name cannot contain ']'") - } - - if l.next() == eof { - break + default: + l.next() } } return l.errorf("unclosed key group") } func (l *tomlLexer) lexRightBracket() tomlLexStateFn { - l.ignore() - l.pos++ + l.next() l.emit(tokenRightBracket) return l.lexRvalue } func (l *tomlLexer) lexNumber() tomlLexStateFn { - l.ignore() - if !l.accept("+") { - l.accept("-") + r := l.peek() + if r == '+' || r == '-' { + l.next() } pointSeen := false expSeen := false digitSeen := false for { - next := l.next() + next := l.peek() if next == '.' { if pointSeen { return l.errorf("cannot have two dots in one float") } + l.next() if !isDigit(l.peek()) { return l.errorf("float cannot end with a dot") } pointSeen = true } else if next == 'e' || next == 'E' { expSeen = true - if !l.accept("+") { - l.accept("-") + l.next() + r := l.peek() + if r == '+' || r == '-' { + l.next() } } else if isDigit(next) { digitSeen = true + l.next() } else if next == '_' { + l.next() } else { - l.backup() break } if pointSeen && !digitSeen { @@ -572,17 +570,27 @@ func (l *tomlLexer) lexNumber() tomlLexStateFn { return l.lexRvalue } +func (l *tomlLexer) run() { + for state := l.lexVoid; state != nil; { + state = state() + } + close(l.tokens) +} + func init() { dateRegexp = regexp.MustCompile("^\\d{1,4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d{1,9})?(Z|[+-]\\d{2}:\\d{2})") } // Entry point -func lexToml(input string) chan token { +func lexToml(input io.Reader) chan token { + bufferedInput := buffruneio.NewReader(input) l := &tomlLexer{ - input: input, - tokens: make(chan token), - line: 1, - col: 1, + input: bufferedInput, + tokens: make(chan token), + line: 1, + col: 1, + endbufferLine: 1, + endbufferCol: 1, } go l.run() return l.tokens diff --git a/lexer_test.go b/lexer_test.go index 1964a57a..9fa8be8b 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -1,15 +1,19 @@ package toml -import "testing" +import ( + "strings" + "testing" +) func testFlow(t *testing.T, input string, expectedFlow []token) { - ch := lexToml(input) + ch := lexToml(strings.NewReader(input)) for _, expected := range expectedFlow { token := <-ch if token != expected { t.Log("While testing: ", input) t.Log("compared (got)", token, "to (expected)", expected) t.Log("\tvalue:", token.val, "<->", expected.val) + t.Log("\tvalue as bytes:", []byte(token.val), "<->", []byte(expected.val)) t.Log("\ttype:", token.typ.String(), "<->", expected.typ.String()) t.Log("\tline:", token.Line, "<->", expected.Line) t.Log("\tcolumn:", token.Col, "<->", expected.Col) diff --git a/parser_test.go b/parser_test.go index 53cfcde6..f9191b67 100644 --- a/parser_test.go +++ b/parser_test.go @@ -287,7 +287,7 @@ func TestArrayNestedStrings(t *testing.T) { func TestMissingValue(t *testing.T) { _, err := Load("a = ") - if err.Error() != "(1, 4): expecting a value" { + if err.Error() != "(1, 5): expecting a value" { t.Error("Bad error message:", err.Error()) } } @@ -441,7 +441,7 @@ func TestImplicitDeclarationBefore(t *testing.T) { func TestFloatsWithoutLeadingZeros(t 
*testing.T) { _, err := Load("a = .42") - if err.Error() != "(1, 4): cannot start float with a dot" { + if err.Error() != "(1, 5): cannot start float with a dot" { t.Error("Bad error message:", err.Error()) } diff --git a/test.sh b/test.sh index 410838b0..0a426e03 100755 --- a/test.sh +++ b/test.sh @@ -19,6 +19,8 @@ function git_clone() { popd } +go get github.com/pelletier/go-buffruneio + # get code for BurntSushi TOML validation # pinning all to 'HEAD' for version 0.3.x work (TODO: pin to commit hash when tests stabilize) git_clone github.com/BurntSushi/toml master HEAD @@ -66,7 +68,7 @@ else echo "Invalid Test TOML for $test:" echo "====" cat "$invalid_test.toml" - + echo "Go-TOML Output for $test:" echo "====" echo "go-toml Output:" diff --git a/toml.go b/toml.go index bf35cd34..7af032aa 100644 --- a/toml.go +++ b/toml.go @@ -3,7 +3,8 @@ package toml import ( "errors" "fmt" - "io/ioutil" + "io" + "os" "runtime" "strconv" "strings" @@ -360,8 +361,8 @@ func (t *TomlTree) ToString() string { return t.toToml("", "") } -// Load creates a TomlTree from a string. -func Load(content string) (tree *TomlTree, err error) { +// LoadReader creates a TomlTree from any io.Reader. +func LoadReader(reader io.Reader) (tree *TomlTree, err error) { defer func() { if r := recover(); r != nil { if _, ok := r.(runtime.Error); ok { @@ -370,18 +371,21 @@ func Load(content string) (tree *TomlTree, err error) { err = errors.New(r.(string)) } }() - tree = parseToml(lexToml(content)) + tree = parseToml(lexToml(reader)) return } +// Load creates a TomlTree from a string. +func Load(content string) (tree *TomlTree, err error) { + return LoadReader(strings.NewReader(content)) +} + // LoadFile creates a TomlTree from a file. func LoadFile(path string) (tree *TomlTree, err error) { - buff, ferr := ioutil.ReadFile(path) - if ferr != nil { - err = ferr - } else { - s := string(buff) - tree, err = Load(s) + file, err := os.Open(path) + if err != nil { + return nil, err } - return + defer file.Close() + return LoadReader(file) }
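
Editor's note: the user-facing effect of this patch is that parsing no longer requires the whole document as an in-memory string. LoadReader accepts any io.Reader, Load becomes a thin wrapper over a strings.Reader, and LoadFile streams the file instead of slurping it with ioutil.ReadFile. A minimal usage sketch of the entry points as they stand after this patch (the "config.toml" path and the TOML snippets are illustrative):

package main

import (
	"fmt"
	"os"
	"strings"

	"github.com/pelletier/go-toml"
)

func main() {
	// New: parse from any io.Reader (here a strings.Reader, but an
	// os.File, net.Conn or bytes.Buffer works just as well).
	tree, err := toml.LoadReader(strings.NewReader("[server]\nport = 8080\n"))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Print(tree.ToString())

	// Unchanged signature: Load on a string, now delegating to LoadReader.
	if tree, err = toml.Load("answer = 42"); err == nil {
		fmt.Print(tree.ToString())
	}

	// LoadFile now opens the file and hands it to LoadReader directly;
	// "config.toml" is a hypothetical path for illustration.
	if tree, err = toml.LoadFile("config.toml"); err == nil {
		fmt.Print(tree.ToString())
	}
}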
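
Internally, the rewrite trades index arithmetic on an input string for rune-level operations on go-buffruneio's pushback reader, whose point is that more than one rune can be unread (the patch's follow helper relies on this via its deferred UnreadRune calls). The sketch below re-implements that lookahead as a standalone function to show the ReadRune/UnreadRune pattern; it is illustrative, not the patch's method, assumes the two-value ReadRune signature the patch uses, and treats a read error (end of input) as a mismatch instead of panicking:

package main

import (
	"fmt"
	"strings"

	"github.com/pelletier/go-buffruneio"
)

// follow reports whether the next runes in rd spell out want, then
// pushes every rune it consumed back so the reader is left exactly
// where it started.
func follow(rd *buffruneio.Reader, want string) bool {
	read := 0
	ok := true
	for _, expected := range want {
		r, err := rd.ReadRune()
		if err != nil { // end of input: cannot match
			ok = false
			break
		}
		read++
		if r != expected {
			ok = false
			break
		}
	}
	for i := 0; i < read; i++ {
		rd.UnreadRune()
	}
	return ok
}

func main() {
	rd := buffruneio.NewReader(strings.NewReader(`'''multi-line'''`))
	fmt.Println(follow(rd, "'''")) // true, and rd is still at offset 0
	r, _ := rd.ReadRune()
	fmt.Printf("%c\n", r) // ' -- the lookahead consumed nothing
}

Counting the runes actually read (rather than deferring an UnreadRune per iteration, as the patch does) avoids pushing back runes that were never consumed when the input ends mid-match; either way the reader is restored before the function returns.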
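
One observable behavior change worth flagging for reviewers: reported error columns shift by one in a few cases, because token positions are now derived from the endbufferLine/endbufferCol counters captured at the last ignore() rather than from byte offsets into the source string. The updated assertions in parser_test.go reflect this; a quick check against one of them:

package main

import (
	"fmt"

	"github.com/pelletier/go-toml"
)

func main() {
	// parser_test.go now expects column 5 here, where the old
	// string-indexed lexer reported column 4.
	_, err := toml.Load("a = ")
	fmt.Println(err) // (1, 5): expecting a value
}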