Revert "fix: Include whitespaces in extracted tokens (#13738)"
This reverts commit 7683a79.
MasslessParticle committed Aug 15, 2024
1 parent 9315b3d commit ced3324
Showing 2 changed files with 56 additions and 54 deletions.
pkg/pattern/tokenization/tokenization.go (19 changes: 11 additions & 8 deletions)

@@ -1,6 +1,7 @@
 package tokenization
 
 import (
+	"bytes"
 	"unsafe"
 )
 
@@ -28,15 +29,15 @@ type tokenizer struct {
 	tokens []string
 }
 
-func (t *tokenizer) countOrSaveToken(endTokenPos int) {
+func (t *tokenizer) countOrSaveToken(endTokenPos, skip int) {
 	if t.tokens != nil {
 		// Intentionally written like this and not with append(), so this can
 		// panic if we ever exceed the preallocated slice size, since that means
 		// we have a nasty bug in handleNextToken() below.
 		t.tokens[t.tokenCount] = t.line[t.tpos:endTokenPos]
 	}
 	t.tokenCount++
-	t.tpos = endTokenPos
+	t.tpos = endTokenPos + skip
 }
 
 func (t *tokenizer) handleNextToken() bool {
@@ -54,7 +55,7 @@ func (t *tokenizer) handleNextToken() bool {
 		// outside of a quoted string.
 		case escaped:
 			if curQuotePos < 0 && delimiters[c] {
-				t.countOrSaveToken(p + 1)
+				t.countOrSaveToken(p, 1)
 				return true
 			} else {
 				escaped = false
@@ -88,7 +89,7 @@ func (t *tokenizer) handleNextToken() bool {
 		// If we encounter a delimiter outside of a quote, count or save the
 		// token and skip the delimiter.
 		case delimiters[c]:
-			t.countOrSaveToken(p + 1)
+			t.countOrSaveToken(p, 1)
 			return true
 
 		// Handle likely JSON object keys that have been serialized without
@@ -107,11 +108,11 @@ func (t *tokenizer) handleNextToken() bool {
 		// wasn't a delimiter right before the comma.
 		case t.maybeJSON && p > t.tpos && (c == ':' || c == ',') && p+1 < lineLen:
 			if c == ':' && t.line[p-1] == '"' && !delimiters[t.line[p+1]] {
-				t.countOrSaveToken(p + 1)
+				t.countOrSaveToken(p+1, 0)
 				return true
 			}
 			if c == ',' && t.line[p+1] == '"' {
-				t.countOrSaveToken(p)
+				t.countOrSaveToken(p, 0)
 				return true
 			}
 		}
@@ -125,12 +126,12 @@ func (t *tokenizer) handleNextToken() bool {
 	// unterminated quote and the quote itself as a single token, and continue
 	// fairly normally from there.
 	if curQuotePos > 0 {
-		t.countOrSaveToken(curQuotePos + 1)
+		t.countOrSaveToken(curQuotePos+1, 0)
 		return true
 	}
 
 	if t.tpos < len(t.line) {
-		t.countOrSaveToken(len(t.line))
+		t.countOrSaveToken(len(t.line), 0)
 		return true
 	}
 
@@ -192,6 +193,8 @@ func (t *tokenizer) tokenize() []string {
 }
 
 func PreprocessAndTokenize(content []byte) []string {
+	content = bytes.TrimSpace(content)
+
 	t := tokenizer{rawLine: content, maxTokens: 100} // TODO: parametrize maxTokens
 
 	return t.tokenize()
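
To make the reverted behavior concrete: the countOrSaveToken(p + 1) form folded the delimiter into the extracted token, while the restored countOrSaveToken(p, 1) form ends the token just before the delimiter and advances tpos past it. Below is a minimal, self-contained sketch of that difference, not the Loki tokenizer itself; it assumes a single space delimiter, ignores quoting, escaping, and the JSON special cases, and the helper names keepDelims/skipDelims are made up for illustration.

package main

import "fmt"

// keepDelims mirrors the reverted countOrSaveToken(p + 1) style:
// the token runs through position p, so the delimiter (whitespace)
// stays inside the extracted token.
func keepDelims(line string) []string {
	var tokens []string
	start := 0
	for p := 0; p < len(line); p++ {
		if line[p] == ' ' {
			tokens = append(tokens, line[start:p+1]) // delimiter included
			start = p + 1
		}
	}
	if start < len(line) {
		tokens = append(tokens, line[start:])
	}
	return tokens
}

// skipDelims mirrors the restored countOrSaveToken(p, 1) style:
// the token ends just before position p, and the next token starts
// at p + skip, so the delimiter is dropped entirely.
func skipDelims(line string) []string {
	var tokens []string
	start := 0
	for p := 0; p < len(line); p++ {
		if line[p] == ' ' {
			tokens = append(tokens, line[start:p]) // delimiter excluded
			start = p + 1                          // skip == 1
		}
	}
	if start < len(line) {
		tokens = append(tokens, line[start:])
	}
	return tokens
}

func main() {
	line := "level=info msg=done duration=1ms"
	fmt.Printf("%q\n", keepDelims(line)) // ["level=info " "msg=done " "duration=1ms"]
	fmt.Printf("%q\n", skipDelims(line)) // ["level=info" "msg=done" "duration=1ms"]
}

The zero-skip call sites in the diff (countOrSaveToken(p+1, 0) and countOrSaveToken(p, 0) in the JSON handling) fit the same model: the token boundary moves, but no character is thrown away.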
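
The other half of the revert restores the bytes.TrimSpace call (and the "bytes" import) in PreprocessAndTokenize. A quick standard-library illustration, with a made-up sample log line, of what that pre-trim guarantees:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	content := []byte("\t  level=info msg=done \n")
	// TrimSpace removes leading and trailing whitespace before
	// tokenization, so whitespace can never survive at the edges
	// of the first or last extracted token.
	fmt.Printf("%q\n", bytes.TrimSpace(content)) // "level=info msg=done"
}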