diff --git a/frontend/frontend_test.go b/frontend/frontend_test.go index 00d6f67..ee106aa 100644 --- a/frontend/frontend_test.go +++ b/frontend/frontend_test.go @@ -101,7 +101,7 @@ func TestParseConcatAltMaybes(x *testing.T) { t.Error(err) } t.Log(program) - tMatch(program, "", t) + tNoMatch(program, "", t) // will get empty string error tMatch(program, "E", t) tMatch(program, "D", t) tMatch(program, "A", t) @@ -209,7 +209,7 @@ func TestParseConcatAltStar(x *testing.T) { t.Error(err) } t.Log(program) - tMatch(program, "", t) + tNoMatch(program, "", t) // will get empty string error tMatch(program, "X", t) tMatch(program, "Y", t) tMatch(program, "A", t) diff --git a/lexer_test.go b/lexer_test.go index 15c3e6c..1b4b5ab 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -255,42 +255,61 @@ func TestPartialLexer(x *testing.T) { } func TestRegression(t *testing.T) { - skip := func(*Scanner, *machines.Match) (interface{}, error) { - return nil, nil - } - token := func(id int, name string) Action { + token := func(name string) Action { return func(s *Scanner, m *machines.Match) (interface{}, error) { - return string(m.Bytes), nil + return fmt.Sprintf("%v:%q", name, string(m.Bytes)), nil } } - data := "true" // This input fails. - // data := "true " // this with a trailing space does not. - - lexer := NewLexer() - lexer.Add([]byte("true"), token(0, "TRUE")) - lexer.Add([]byte("( |\t|\n|\r)+"), skip) + newLexer := func() *Lexer { + lexer := NewLexer() + lexer.Add([]byte("true"), token("TRUE")) + lexer.Add([]byte("( |\t|\n|\r)+"), token("SPACE")) + return lexer + } - if err := lexer.CompileDFA(); err != nil { - t.Fatal(err) + tests := []struct { + text string + tokens int + }{ + {`true`, 1}, + {`true `, 2}, } - var scanner *Scanner + runTest := func(lexer *Lexer) { + for _, test := range tests { + scanner, err := lexer.Scanner([]byte(test.text)) + if err != nil { + t.Fatal(err) + } - scanner, err := lexer.Scanner([]byte(data)) - if err != nil { - t.Fatal(err) + found := 0 + tok, err, eos := scanner.Next() + for ; !eos; tok, err, eos = scanner.Next() { + if err != nil { + t.Fatal(err) + } + fmt.Printf("Token: %v\n", tok) + found++ + } + if found != test.tokens { + t.Errorf("Expected exactly %v tokens got %v, ===\nErr: %v\nEOS: %v\nTC: %d\n", test.tokens, found, err, eos, scanner.TC) + } + } } - - found := 0 - tok, err, eos := scanner.Next() - for ; !eos; tok, err, eos = scanner.Next() { - fmt.Printf("Token: %v\n", tok) - found++ + { + lexer := newLexer() + if err := lexer.CompileNFA(); err != nil { + t.Fatal(err) + } + runTest(lexer) } - if found != 1 { - t.Errorf("Expected exactly 1 tokens got %v, ===\nErr: %v\nEOS: %v\nTC: %d\n", found, err, eos, scanner.TC) - + { + lexer := newLexer() + if err := lexer.CompileDFA(); err != nil { + t.Fatal(err) + } + runTest(lexer) } } @@ -353,30 +372,146 @@ ddns-update-style none; for _, lit := range literals { lex.Add([]byte(lit), token(lit)) } + return lex + } - err := lex.Compile() + runTest := func(lexer *Lexer) { + scanner, err := lexer.Scanner([]byte(text)) if err != nil { - panic(err) + return } + for tok, err, eof := scanner.Next(); !eof; tok, err, eof = scanner.Next() { + if err != nil { + t.Fatal(err) + break + } + token := tok.(*Token) + fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n", + tokens[token.Type], + strings.TrimSpace(string(token.Lexeme)), + token.StartLine, + token.StartColumn, + token.EndLine, + token.EndColumn) + } + } + { + lexer := newLexer() + if err := lexer.CompileNFA(); err != nil { + t.Fatal(err) + } + runTest(lexer) + } + { + lexer := newLexer() + if err := lexer.CompileDFA(); err != nil { + t.Fatal(err) + } + runTest(lexer) + } +} - return lex +func TestPythonStrings(t *testing.T) { + tokens := []string{ + "UNDEF", + "TRUE", + "SINGLE_STRING", + "TRIPLE_STRING", + "TRIPLE_STRING2", + "TY_STRING", + "SPACE", + } + tokenIds := map[string]int{} + for i, tok := range tokens { + tokenIds[tok] = i + } + skip := func(*Scanner, *machines.Match) (interface{}, error) { + return nil, nil + } + token := func(name string) Action { + return func(s *Scanner, m *machines.Match) (interface{}, error) { + return s.Token(tokenIds[name], string(m.Bytes), m), nil + } } - scanner, err := newLexer().Scanner([]byte(text)) - if err != nil { - return + newLexer := func() *Lexer { + lexer := NewLexer() + lexer.Add([]byte("true"), token("TRUE")) + lexer.Add([]byte(`'''([^\\']|(\\.))*'''`), token("TRIPLE_STRING")) + lexer.Add([]byte(`"""([^\\"]|(\\.))*"""`), token("TRIPLE_STRING")) + lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("SINGLE_STRING")) + lexer.Add([]byte(`'([^\\']|(\\.))*'`), token("SINGLE_STRING")) + lexer.Add([]byte("( |\t|\n|\r)+"), skip) + return lexer } - for tok, err, eof := scanner.Next(); !eof; tok, err, eof = scanner.Next() { - if err != nil { - t.Error(err) + + tests := []struct { + text string + tokens int + }{ + {`'''hi'''`, 1}, + {`"""hi"""`, 1}, + {`"hi"`, 1}, + {`'hi'`, 1}, + {`''`, 1}, + {`""`, 1}, + {`""" . . + hello + """`, 1}, + {`'''' ''''`, 4}, + {`''''''`, 1}, + {`""""""`, 1}, + {`"""""" """ + hi there""" "wizard" true`, 4}, + } + + runTest := func(lexer *Lexer) { + for _, test := range tests { + fmt.Printf("test %q\n", test.text) + scanner, err := lexer.Scanner([]byte(test.text)) + if err != nil { + t.Fatal(err) + } + + found := 0 + tok, err, eos := scanner.Next() + for ; !eos; tok, err, eos = scanner.Next() { + if err != nil { + t.Error(err) + fmt.Printf("err: %v\n", err) + scanner.TC++ + } else { + token := tok.(*Token) + fmt.Printf("%-15v | %-30q | %d-%d | %v:%v-%v:%v\n", + tokens[token.Type], + strings.TrimSpace(string(token.Lexeme)), + token.TC, + token.TC+len(token.Lexeme), + token.StartLine, + token.StartColumn, + token.EndLine, + token.EndColumn) + found++ + } + } + if found != test.tokens { + t.Errorf("expected %v tokens got %v: %q", test.tokens, found, test.text) + } + } + } + { + lexer := newLexer() + if err := lexer.CompileNFA(); err != nil { + t.Fatal(err) } - token := tok.(*Token) - fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n", - tokens[token.Type], - strings.TrimSpace(string(token.Lexeme)), - token.StartLine, - token.StartColumn, - token.EndLine, - token.EndColumn) + runTest(lexer) } + { + lexer := newLexer() + if err := lexer.CompileDFA(); err != nil { + t.Fatal(err) + } + runTest(lexer) + } + } diff --git a/machines/dfa_machine.go b/machines/dfa_machine.go index 5a08d48..4464a31 100644 --- a/machines/dfa_machine.go +++ b/machines/dfa_machine.go @@ -73,12 +73,23 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAcc Bytes: text[startTC:matchTC], } matchID = -1 - return tc, match, nil, scan + if matchTC == startTC { + err := &EmptyMatchError{ + MatchID: matchID, + TC: tc, + Line: startLC.line, + Column: startLC.col, + } + return startTC, nil, err, scan + } + return matchTC, match, nil, scan } } - if match, has := accepting[state]; has && startTC < len(text) { + if match, has := accepting[state]; has { matchID = match matchTC = tc + } + if startTC < len(text) && matchTC <= len(text) && matchID > -1 { startLC := lineCols[startTC] endLC := lineCols[matchTC-1] match := &Match{ @@ -91,7 +102,16 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAcc Bytes: text[startTC:matchTC], } matchID = -1 - return tc, match, nil, scan + if matchTC == startTC { + err := &EmptyMatchError{ + MatchID: matchID, + TC: tc, + Line: startLC.line, + Column: startLC.col, + } + return startTC, nil, err, scan + } + return matchTC, match, nil, scan } if matchTC != len(text) && startTC >= len(text) { // the user has moved us farther than the text. Assume that was diff --git a/machines/machine.go b/machines/machine.go index e132dec..29b1ad7 100644 --- a/machines/machine.go +++ b/machines/machine.go @@ -11,6 +11,21 @@ import ( "github.com/timtadh/lexmachine/queue" ) +// EmptyMatchError is returned when a pattern would have matched the empty +// string +type EmptyMatchError struct { + TC int + Line int + Column int + MatchID int +} + +func (e *EmptyMatchError) Error() string { + return fmt.Sprintf("Lexer error: matched the empty string at %d:%d (tc=%d) for match id %d.", + e.Line, e.Column, e.TC, e.MatchID, + ) +} + // UnconsumedInput error type type UnconsumedInput struct { StartTC int @@ -195,7 +210,16 @@ func LexerEngine(program inst.Slice, text []byte) Scanner { } prevTC = startTC matchPC = -1 - return tc, match, nil, scan + if matchTC == startTC { + err := &EmptyMatchError{ + MatchID: matchPC, + TC: tc, + Line: line, + Column: col, + } + return startTC, nil, err, scan + } + return matchTC, match, nil, scan } } if matchTC != len(text) && startTC >= len(text) {