Fix parsing of invalid yaml (#496)
* fix invalid test cases (use space indent)

* fix parsing of invalid yaml
goccy authored Nov 1, 2024
1 parent 8f6f26e commit 9fabf36
Showing 4 changed files with 100 additions and 66 deletions.
119 changes: 55 additions & 64 deletions lexer/lexer_test.go
@@ -533,17 +533,18 @@ func TestTokenize(t *testing.T) {
},
},
{
-YAML: `v:
-- A
-- B
-`,
+YAML: `
+v:
+- A
+- B
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -557,7 +558,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -571,31 +572,32 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "B",
Origin: " B\n",
Origin: " B",
},
},
},
{
-YAML: `v:
-- A
-- |-
-B
-C
-`,
+YAML: `
+v:
+- A
+- |-
+ B
+ C
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -609,7 +611,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -623,7 +625,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.LiteralType,
@@ -636,33 +638,27 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t B",
Origin: "\t\t B\n",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t C",
Origin: "\t\t C\n",
Value: "B\nC",
Origin: " B\n C\n",
},
},
},
{
-YAML: `v:
-- A
-- 1
-- B:
-- 2
-- 3
-`,
+YAML: `
+v:
+- A
+- 1
+- B:
+ - 2
+ - 3
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "v",
Origin: "v",
Origin: "\nv",
},
{
Type: token.MappingValueType,
@@ -676,7 +672,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t-",
Origin: "\n-",
},
{
Type: token.StringType,
@@ -690,7 +686,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.IntegerType,
@@ -704,7 +700,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t-",
Origin: "-",
},
{
Type: token.StringType,
@@ -725,42 +721,43 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\n\t\t -",
Origin: "\n -",
},
{
Type: token.IntegerType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "2",
Origin: " 2\n",
Origin: " 2\n ",
},
{
Type: token.SequenceEntryType,
CharacterType: token.CharacterTypeIndicator,
Indicator: token.BlockStructureIndicator,
Value: "-",
Origin: "\t\t -",
Origin: "-",
},
{
Type: token.IntegerType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "3",
Origin: " 3\n",
Origin: " 3",
},
},
},
{
-YAML: `a:
-b: c
-`,
+YAML: `
+a:
+ b: c
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "a",
Origin: "a",
Origin: "\na",
},
{
Type: token.MappingValueType,
@@ -773,8 +770,8 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t b",
Origin: "\n\t\t b",
Value: "b",
Origin: "\n b",
},
{
Type: token.MappingValueType,
@@ -788,7 +785,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "c",
Origin: " c\n",
Origin: " c",
},
},
},
@@ -834,7 +831,7 @@ func TestTokenize(t *testing.T) {
},
{
YAML: `hello: world
-`,
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
@@ -855,7 +852,7 @@ func TestTokenize(t *testing.T) {
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "world",
Origin: " world\n",
Origin: " world",
},
},
},
@@ -1271,17 +1268,18 @@ func TestTokenize(t *testing.T) {
},
},
{
-YAML: `a: !!binary |
-kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ
-CQ
-`,
+YAML: `
+a: !!binary |
+ kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ
+ CQ
+`,
Tokens: token.Tokens{
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "a",
Origin: "a",
Origin: "\na",
},
{
Type: token.MappingValueType,
Expand All @@ -1308,15 +1306,8 @@ func TestTokenize(t *testing.T) {
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ",
Origin: "\t\t kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\n",
},
{
Type: token.StringType,
CharacterType: token.CharacterTypeMiscellaneous,
Indicator: token.NotIndicator,
Value: "\t\t CQ",
Origin: "\t\t CQ\n",
Value: "kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\nCQ\n",
Origin: " kJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJCQkJ\n CQ\n",
},
},
},
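As an illustrative sketch of what the updated literal test above now expects: with space indentation (valid YAML), the two literal lines are scanned as a single string token whose Value is "B\nC". The main wrapper is hypothetical; lexer.Tokenize and the token fields are the ones used in this test file.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
)

func main() {
	// Same document as the updated test case: sequence entries at column 0,
	// literal block content indented by one space.
	src := "v:\n- A\n- |-\n B\n C\n"
	for _, tk := range lexer.Tokenize(src) {
		fmt.Printf("value=%q origin=%q\n", tk.Value, tk.Origin)
	}
	// Expected per the test above: one token with value "B\nC".
}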
4 changes: 4 additions & 0 deletions parser/parser.go
@@ -654,6 +654,10 @@ func (p *parser) parseLiteral(ctx *context) (*ast.LiteralNode, error) {
p.progress(1) // skip literal/folded token

tk := p.currentToken()
+if tk == nil {
+	node.Value = ast.String(token.New("", "", node.Start.Position))
+	return node, nil
+}
var comment *ast.CommentGroupNode
if tk.Type == token.CommentType {
comment = p.parseCommentOnly(ctx)
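A usage sketch for the guard added above: a literal/folded header with no following content used to leave p.currentToken() nil. Using the same entry points as the tests below (lexer.Tokenize and parser.Parse; the main wrapper is illustrative), these inputs should now parse without error.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
	"github.com/goccy/go-yaml/parser"
)

func main() {
	// Folded scalars with empty bodies; before this fix these inputs
	// reached parseLiteral with a nil current token.
	for _, src := range []string{"value: >\n", "value: >\n\n", "value: >\nother:"} {
		if _, err := parser.Parse(lexer.Tokenize(src), 0); err != nil {
			fmt.Printf("unexpected error for %q: %v\n", src, err)
			continue
		}
		fmt.Printf("parsed %q\n", src)
	}
}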
20 changes: 20 additions & 0 deletions parser/parser_test.go
@@ -88,6 +88,10 @@ func TestParser(t *testing.T) {
"a_mk: \n bd: 3\n",
"a: :a",
"{a: , b: c}",
"value: >\n",
"value: >\n\n",
"value: >\nother:",
"value: >\n\nother:",
}
for _, src := range sources {
if _, err := parser.Parse(lexer.Tokenize(src), 0); err != nil {
@@ -811,6 +815,22 @@ b: - 2
[1:4] found invalid token
> 1 | a: "\"key\": \"value:\"
^
`,
},
+{
+`foo: [${should not be allowed}]`,
+`
+[1:8] ',' or ']' must be specified
+> 1 | foo: [${should not be allowed}]
+^
+`,
+},
+{
+`foo: [$[should not be allowed]]`,
+`
+[1:8] ',' or ']' must be specified
+> 1 | foo: [$[should not be allowed]]
+^
+`,
+},
}
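A sketch of the two new error cases, assuming the same entry points the test uses: with the flow-mode change in scanner.go below, '{' and '[' encountered inside a flow collection now start a new flow token even when characters are buffered, so the parser reports a position-accurate error instead of mis-scanning the input. The main wrapper is illustrative.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
	"github.com/goccy/go-yaml/parser"
)

func main() {
	for _, src := range []string{
		"foo: [${should not be allowed}]",
		"foo: [$[should not be allowed]]",
	} {
		_, err := parser.Parse(lexer.Tokenize(src), 0)
		// Expected: [1:8] ',' or ']' must be specified
		fmt.Printf("%q => %v\n", src, err)
	}
}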
23 changes: 21 additions & 2 deletions scanner/scanner.go
@@ -620,11 +620,22 @@ func (s *Scanner) scanNewLine(ctx *Context, c rune) {
s.progressLine(ctx)
}

+func (s *Scanner) isFlowMode() bool {
+	if s.startedFlowSequenceNum > 0 {
+		return true
+	}
+	if s.startedFlowMapNum > 0 {
+		return true
+	}
+	return false
+}
+
func (s *Scanner) scanFlowMapStart(ctx *Context) bool {
-if ctx.existsBuffer() {
+if ctx.existsBuffer() && !s.isFlowMode() {
return false
}

+s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('{')
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
s.startedFlowMapNum++
@@ -648,10 +659,11 @@ func (s *Scanner) scanFlowMapEnd(ctx *Context) bool {
}

func (s *Scanner) scanFlowArrayStart(ctx *Context) bool {
-if ctx.existsBuffer() {
+if ctx.existsBuffer() && !s.isFlowMode() {
return false
}

+s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf('[')
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum++
@@ -946,6 +958,13 @@ func (s *Scanner) scan(ctx *Context) error {
}
if ctx.isDocument() {
if s.isChangedToIndentStateDown() {
+if tk := ctx.lastToken(); tk != nil {
+	// If literal/folded content is empty, no string token is added.
+	// Therefore, add an empty string token.
+	if tk.Type != token.StringType {
+		ctx.addToken(token.String("", "", s.pos()))
+	}
+}
s.breakLiteral(ctx)
} else {
s.scanLiteral(ctx, c)
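To observe the empty string token that the scan() change above inserts, tokenize a folded scalar whose body is empty and whose indentation immediately drops back. The input mirrors the new "value: >\nother:" test source; the main wrapper is illustrative.

package main

import (
	"fmt"

	"github.com/goccy/go-yaml/lexer"
)

func main() {
	// The folded scalar after "value:" has no body; the scanner now emits
	// an empty string token for it when the indent level drops.
	for _, tk := range lexer.Tokenize("value: >\nother: x\n") {
		fmt.Printf("value=%q\n", tk.Value)
	}
}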
