Fix parsing of invalid yaml (#496)
* fix invalid test cases (use space indent)

* fix parsing of invalid yaml
goccy authored Nov 1, 2024
1 parent 8f6f26e commit 9fabf36
Showing 4 changed files with 100 additions and 66 deletions.
119 changes: 55 additions & 64 deletions lexer/lexer_test.go
@@ -533,17 +533,18 @@ func TestTokenize(t *testing.T) {
},
},
{
-YAML: `v:
-- A
-- B
-`,
+YAML: `
+v:
+- A
+- B
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -557,7 +558,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -571,31 +572,32 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "B",
Origin: " B\n",
Origin: " B",
},
},
},
{
-YAML: `v:
-- A
-- |-
-B
-C
-`,
+YAML: `
+v:
+- A
+- |-
+ B
+ C
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -609,7 +611,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -623,7 +625,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.LiteralType,
@@ -636,33 +638,27 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t B",
Origin: "\t\t B\n",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t C",
Origin: "\t\t C\n",
Value: "B\nC",
Origin: " B\n C\n",
},
},
},
{
-YAML: `v:
-- A
-- 1
-- B:
-- 2
-- 3
-`,
+YAML: `
+v:
+- A
+- 1
+- B:
+ - 2
+ - 3
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -676,7 +672,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -690,7 +686,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.IntegerType,
@@ -704,7 +700,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.StringType,
@@ -725,42 +721,43 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t -",
Origin: "\n -",
},
{
Type: token.IntegerType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "2",
Origin: " 2\n",
Origin: " 2\n ",
},
{
Type: token.SequenceEntryType,
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t -",
Origin: "-",
},
{
Type: token.IntegerType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "3",
Origin: " 3\n",
Origin: " 3",
},
},
},
{
-YAML: `a:
-b: c
-`,
+YAML: `
+a:
+ b: c
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "a",
Origin: "a",
Origin: "\na",
},
{
Type: token.MappingValueType,
@@ -773,8 +770,8 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t b",
Origin: "\n\t\t b",
Value: "b",
Origin: "\n b",
},
{
Type: token.MappingValueType,
@@ -788,7 +785,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "c",
Origin: " c\n",
Origin: " c",
},
},
},
@@ -834,7 +831,7 @@ func TestTokenize(t *testing.T) {
},
{
YAML: `hello: world
-`,
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
@@ -855,7 +852,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "world",
Origin: " world\n",
Origin: " world",
},
},
},
@@ -1271,17 +1268,18 @@ func TestTokenize(t *testing.T) {
},
},
{
-YAML: `a: !!binary |
-kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ
-CQ
-`,
+YAML: `
+a: !!binary |
+ kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ
+ CQ
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "a",
Origin: "a",
Origin: "\na",
},
{
Type: token.MappingValueType,
Expand All @@ -1308,15 +1306,8 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ",
Origin: "\t\t kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\n",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t CQ",
Origin: "\t\t CQ\n",
Value: "kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\nCQ\n",
Origin: " kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\n CQ\n",
},
},
},
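As an illustrative sketch of what the updated literal test above now expects: with space indentation (valid YAML), the two literal lines are scanned as a single string token whose Value is "B\nC". The main wrapper is hypothetical; lexer.Tokenize and the token fields are the ones used in this test file.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
)

func main() {
	// Same document as the updated test case: sequence entries at column 0,
	// literal block content indented by one space.
	src := "v:\n- A\n- |-\n B\n C\n"
	for _, tk := range lexer.Tokenize(src) {
		fmt.Printf("value=%q origin=%q\n", tk.Value, tk.Origin)
	}
	// Expected per the test above: one token with value "B\nC".
}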
4 changes: 4 additions & 0 deletions parser/parser.go
@@ -654,6 +654,10 @@ func (p *parser) parseLiteral(ctx *context) (*ast.LiteralNode, error) {
p.progress(1) // skip literal/folded token

tk := p.currentToken()
+if tk == nil {
+	node.Value = ast.String(token.New("", "", node.Start.Position))
+	return node, nil
+}
var comment *ast.CommentGroupNode
if tk.Type == token.CommentType {
comment = p.parseCommentOnly(ctx)
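A usage sketch for the guard added above: a literal/folded header with no following content used to leave p.currentToken() nil. Using the same entry points as the tests below (lexer.Tokenize and parser.Parse; the main wrapper is illustrative), these inputs should now parse without error.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
	"github.com/goccy/go-yaml/parser"
)

func main() {
	// Folded scalars with empty bodies; before this fix these inputs
	// reached parseLiteral with a nil current token.
	for _, src := range []string{"value: >\n", "value: >\n\n", "value: >\nother:"} {
		if _, err := parser.Parse(lexer.Tokenize(src), 0); err != nil {
			fmt.Printf("unexpected error for %q: %v\n", src, err)
			continue
		}
		fmt.Printf("parsed %q\n", src)
	}
}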
20 changes: 20 additions & 0 deletions parser/parser_test.go
@@ -88,6 +88,10 @@ func TestParser(t *testing.T) {
"a_mk: \n bd: 3\n",
"a: :a",
"{a: , b: c}",
"value: >\n",
"value: >\n\n",
"value: >\nother:",
"value: >\n\nother:",
}
for _, src := range sources {
if _, err := parser.Parse(lexer.Tokenize(src), 0); err != nil {
@@ -811,6 +815,22 @@ b: - 2
[1:4] found invalid token
> 1 | a: "\"key\": \"value:\"
^
`,
},
+{
+`foo: [${should not be allowed}]`,
+`
+[1:8] ',' or ']' must be specified
+> 1 | foo: [${should not be allowed}]
+^
+`,
+},
+{
+`foo: [$[should not be allowed]]`,
+`
+[1:8] ',' or ']' must be specified
+> 1 | foo: [$[should not be allowed]]
+^
+`,
+},
}
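A sketch of the two new error cases, assuming the same entry points the test uses: with the flow-mode change in scanner.go below, '{' and '[' encountered inside a flow collection now start a new flow token even when characters are buffered, so the parser reports a position-accurate error instead of mis-scanning the input. The main wrapper is illustrative.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
	"github.com/goccy/go-yaml/parser"
)

func main() {
	for _, src := range []string{
		"foo: [${should not be allowed}]",
		"foo: [$[should not be allowed]]",
	} {
		_, err := parser.Parse(lexer.Tokenize(src), 0)
		// Expected: [1:8] ',' or ']' must be specified
		fmt.Printf("%q => %v\n", src, err)
	}
}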
23 changes: 21 additions & 2 deletions scanner/scanner.go
@@ -620,11 +620,22 @@ func (s *Scanner) scanNewLine(ctx *Context, c rune) {
s.progressLine(ctx)
}

+func (s *Scanner) isFlowMode() bool {
+	if s.startedFlowSequenceNum > 0 {
+		return true
+	}
+	if s.startedFlowMapNum > 0 {
+		return true
+	}
+	return false
+}
+
func (s *Scanner) scanFlowMapStart(ctx *Context) bool {
-if ctx.existsBuffer() {
+if ctx.existsBuffer() && !s.isFlowMode() {
return false
}

+s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('{')
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
s.startedFlowMapNum++
@@ -648,10 +659,11 @@ func (s *Scanner) scanFlowMapEnd(ctx *Context) bool {
}

func (s *Scanner) scanFlowArrayStart(ctx *Context) bool {
-if ctx.existsBuffer() {
+if ctx.existsBuffer() && !s.isFlowMode() {
return false
}

+s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('[')
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum++
@@ -946,6 +958,13 @@ func (s *Scanner) scan(ctx *Context) error {
}
if ctx.isDocument() {
if s.isChangedToIndentStateDown() {
+if tk := ctx.lastToken(); tk != nil {
+	// If literal/folded content is empty, no string token is added.
+	// Therefore, add an empty string token.
+	if tk.Type != token.StringType {
+		ctx.addToken(token.String("", "", s.pos()))
+	}
+}
s.breakLiteral(ctx)
} else {
s.scanLiteral(ctx, c)
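To observe the empty string token that the scan() change above inserts, tokenize a folded scalar whose body is empty and whose indentation immediately drops back. The input mirrors the new "value: >\nother:" test source; the main wrapper is illustrative.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
)

func main() {
	// The folded scalar after "value:" has no body; the scanner now emits
	// an empty string token for it when the indent level drops.
	for _, tk := range lexer.Tokenize("value: >\nother: x\n") {
		fmt.Printf("value=%q\n", tk.Value)
	}
}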
