refactor(tokenizer): Use explicit offsets for locations #402
Changes from 1 commit
@@ -220,17 +220,22 @@ export class Tokenizer {
    private consumedAfterSnapshot = -1;

    private ctLoc: Location | null;
    private currentCharacterToken: CharacterToken | null = null;
    private currentToken: Token | null = null;
    private currentAttr: Attribute = { name: '', value: '' };

    // NOTE: Doctype tokens are created with several different offsets. We keep track of the moment a doctype *might* start here.
    private doctypeStartLoc: Location | null = null;

    private addLocationInfo;
    private onParseError;

    constructor(options: { sourceCodeLocationInfo?: boolean; onParseError?: ParserErrorHandler | null }) {
        this.addLocationInfo = !!options.sourceCodeLocationInfo;
        this.onParseError = options.onParseError ?? null;
        this.preprocessor = new Preprocessor(options);
        this.ctLoc = this.getCurrentLocation(-1);
    }

Review comments on this line:

not …

The doubly negated offset makes things a bit difficult here. An offset of … The preprocessor is initiated at position … I have played with flipping the sign of …
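To unpack the remark about the doubly negated offset, here is a small, purely illustrative calculation. It assumes (not shown in this diff) that the preprocessor's `offset` getter reports the position of the last consumed character, so it is -1 before anything has been read:

```ts
// Hypothetical walk-through of `this.ctLoc = this.getCurrentLocation(-1)`:
// getCurrentLocation computes `startOffset = preprocessor.offset - offset`.
const preprocessorOffset = -1; // assumed: nothing consumed yet, pos starts at -1
const offsetArgument = -1;     // the value passed by the constructor

const startOffset = preprocessorOffset - offsetArgument; // -1 - (-1) = 0
console.log(startOffset); // 0 → the location points at the first character of the input
```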
    //Errors
@@ -239,16 +244,16 @@ export class Tokenizer {
    }

    private currentAttrLocation: Location | null = null;
    private ctLoc: Location | null = null;
    private _getCurrentLocation(): Location | null {
    private getCurrentLocation(offset: number): Location | null {
        if (!this.addLocationInfo) {
            return null;
        }

        return {
            startLine: this.preprocessor.line,
            startCol: this.preprocessor.col,
            startOffset: this.preprocessor.offset,
            startCol: this.preprocessor.col - offset,
            startOffset: this.preprocessor.offset - offset,
            endLine: -1,
            endCol: -1,
            endOffset: -1,
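A minimal sketch of what the new `offset` parameter does (my restatement; `line`, `col`, and `offset` are the only preprocessor getters visible in this diff): the tokenizer has usually already consumed a few characters of a token before it knows which kind of token to create, so the start location is obtained by stepping back from the current reading position.

```ts
// Simplified stand-in for getCurrentLocation(offset). It assumes the token
// starts on the same line as the current position, which holds for the small,
// fixed offsets the tokenizer passes in.
interface Position {
    line: number;
    col: number;
    offset: number;
}

function tokenStart(current: Position, offset: number) {
    return {
        startLine: current.line,
        startCol: current.col - offset,
        startOffset: current.offset - offset,
    };
}

// Tokenizing "<div>": the start tag token is created when the tokenizer sees
// the 'd' (col 2, offset 1), and getCurrentLocation(1) rewinds to the '<'.
console.log(tokenStart({ line: 1, col: 2, offset: 1 }, 1));
// → { startLine: 1, startCol: 1, startOffset: 0 }
```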
@@ -334,7 +339,7 @@ export class Tokenizer {
            selfClosing: false,
            ackSelfClosing: false,
            attrs: [],
            location: this.ctLoc,
            location: this.getCurrentLocation(1),
        };
    }

@@ -346,15 +351,15 @@ export class Tokenizer {
            selfClosing: false,
            ackSelfClosing: false,
            attrs: [],
            location: this.ctLoc,
            location: this.getCurrentLocation(2),
        };
    }

    private _createCommentToken(): void {
    private _createCommentToken(offset: number): void {
        this.currentToken = {
            type: TokenType.COMMENT,
            data: '',
            location: this.ctLoc,
            location: this.getCurrentLocation(offset),
        };
    }
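As far as I can tell from the diff, the concrete values passed here follow a simple rule (this is an interpretation, not something the PR states): the offset is the number of characters between the token's opening `<` and the character the tokenizer is currently looking at when the token object is created.

```ts
// Illustration only: `consumedAfterLessThan` stands for everything read after
// the '<' at the moment the token object is created.
function offsetFor(consumedAfterLessThan: string): number {
    return consumedAfterLessThan.length;
}

console.log(offsetFor('d'));  // 1 → start tag token for "<div>", created on the 'd'
console.log(offsetFor('/d')); // 2 → end tag token for "</div>", created on the 'd'
console.log(offsetFor('?'));  // 1 → bogus comment for "<?", created on the '?'
```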
@@ -365,7 +370,7 @@ export class Tokenizer {
            forceQuirks: false,
            publicId: null,
            systemId: null,
            location: this.ctLoc,
            location: this.doctypeStartLoc,
        };
    }
@@ -378,15 +383,15 @@ export class Tokenizer {
    }

    private _createEOFToken(): void {
        const ctLoc = this._getCurrentLocation();
        const location = this.getCurrentLocation(0);

        if (ctLoc) {
            ctLoc.endLine = ctLoc.startLine;
            ctLoc.endCol = ctLoc.startCol;
            ctLoc.endOffset = ctLoc.startOffset;
        if (location) {
            location.endLine = location.startLine;
            location.endCol = location.startCol;
            location.endOffset = location.startOffset;
        }

        this.currentToken = { type: TokenType.EOF, location: ctLoc };
        this.currentToken = { type: TokenType.EOF, location };
    }

    //Tag attributes
@@ -395,7 +400,7 @@ export class Tokenizer {
            name: attrNameFirstCh,
            value: '',
        };
        this.currentAttrLocation = this._getCurrentLocation();
        this.currentAttrLocation = this.getCurrentLocation(0);
    }

    private _leaveAttrName(): void {
@@ -426,10 +431,10 @@ export class Tokenizer {

    //Token emission
    private _emitCurrentToken(): void {
        this._emitCurrentCharacterToken();

        const ct = this.currentToken!;

        this._emitCurrentCharacterToken(ct.location);

        this.currentToken = null;

        //NOTE: store emitted start tag's tagName to determine if the following end tag token is appropriate.

@@ -462,16 +467,17 @@ export class Tokenizer {
        }

        this.tokenQueue.push(ct);
        this.ctLoc = this.getCurrentLocation(-1);
    }

    private _emitCurrentCharacterToken(): void {
    private _emitCurrentCharacterToken(nextLocation: Location | null): void {
        if (this.currentCharacterToken) {
            //NOTE: if we have a pending character token, make its end location equal to the
            //current token's start location.
            if (this.ctLoc && this.currentCharacterToken.location) {
                this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
                this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
                this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
            if (nextLocation && this.currentCharacterToken.location) {
                this.currentCharacterToken.location.endLine = nextLocation.startLine;
                this.currentCharacterToken.location.endCol = nextLocation.startCol;
                this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
            }

            this.tokenQueue.push(this.currentCharacterToken);
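A short sketch of the new signature's intent (types simplified and hypothetical): a pending character token is closed using the start location of whatever token interrupts it, instead of relying on the shared `ctLoc` field.

```ts
interface Loc {
    startLine: number;
    startCol: number;
    startOffset: number;
    endLine: number;
    endCol: number;
    endOffset: number;
}

// Mirrors the idea of _emitCurrentCharacterToken(nextLocation): the character
// run ends exactly where the next token begins.
function closeCharacterToken(characterTokenLoc: Loc, nextLocation: Loc | null): void {
    if (nextLocation) {
        characterTokenLoc.endLine = nextLocation.startLine;
        characterTokenLoc.endCol = nextLocation.startCol;
        characterTokenLoc.endOffset = nextLocation.startOffset;
    }
}

// For "abc<b>", the character token "abc" ends up with endOffset 3 — the index
// of the '<' that starts the <b> start tag token.
```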
@@ -496,7 +502,8 @@ export class Tokenizer {
    //3) TokenType.CHARACTER - any character sequence that doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
    private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
        if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
            this._emitCurrentCharacterToken();
            this.ctLoc = this.getCurrentLocation(0);
            this._emitCurrentCharacterToken(this.ctLoc);
        }

        if (this.currentCharacterToken) {
@@ -925,7 +932,6 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _stateData(cp: number): void {
        this.preprocessor.dropParsedChunk();
        this.ctLoc = this._getCurrentLocation();

        switch (cp) {
            case $.LESS_THAN_SIGN: {

@@ -956,7 +962,6 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _stateRcdata(cp: number): void {
        this.preprocessor.dropParsedChunk();
        this.ctLoc = this._getCurrentLocation();

        switch (cp) {
            case $.AMPERSAND: {

@@ -987,7 +992,6 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _stateRawtext(cp: number): void {
        this.preprocessor.dropParsedChunk();
        this.ctLoc = this._getCurrentLocation();

        switch (cp) {
            case $.LESS_THAN_SIGN: {

@@ -1013,7 +1017,6 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _stateScriptData(cp: number): void {
        this.preprocessor.dropParsedChunk();
        this.ctLoc = this._getCurrentLocation();

        switch (cp) {
            case $.LESS_THAN_SIGN: {

@@ -1039,7 +1042,6 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _statePlaintext(cp: number): void {
        this.preprocessor.dropParsedChunk();
        this.ctLoc = this._getCurrentLocation();

        switch (cp) {
            case $.NULL: {
@@ -1076,7 +1078,7 @@ export class Tokenizer {
            }
            case $.QUESTION_MARK: {
                this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
                this._createCommentToken();
                this._createCommentToken(1);
                this.state = State.BOGUS_COMMENT;
                this._stateBogusComment(cp);
                break;

@@ -1118,7 +1120,7 @@ export class Tokenizer {
            }
            default: {
                this._err(ERR.invalidFirstCharacterOfTagName);
                this._createCommentToken();
                this._createCommentToken(2);
                this.state = State.BOGUS_COMMENT;
                this._stateBogusComment(cp);
            }
@@ -1956,16 +1958,17 @@ export class Tokenizer {
    //------------------------------------------------------------------
    private _stateMarkupDeclarationOpen(cp: number): void {
        if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
            this._createCommentToken();
            this._createCommentToken($$.DASH_DASH.length + 1);
            this.state = State.COMMENT_START;
        } else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
            this.doctypeStartLoc = this.getCurrentLocation($$.DOCTYPE.length + 1);
            this.state = State.DOCTYPE;
        } else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
            if (this.allowCDATA) {
                this.state = State.CDATA_SECTION;
            } else {
                this._err(ERR.cdataInHtmlContent);
                this._createCommentToken();
                this._createCommentToken($$.CDATA_START.length + 1);
                (this.currentToken as CommentToken).data = '[CDATA[';
                this.state = State.BOGUS_COMMENT;
            }

@@ -1975,7 +1978,7 @@ export class Tokenizer {
        //results are no longer valid and we will need to start over.
        else if (!this._ensureHibernation()) {
            this._err(ERR.incorrectlyOpenedComment);
            this._createCommentToken();
            this._createCommentToken(2);
            this.state = State.BOGUS_COMMENT;
            this._stateBogusComment(cp);
        }
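The `+ 1` in these offsets can be checked by hand (my reading: `$$.DASH_DASH`, `$$.DOCTYPE`, and `$$.CDATA_START` are assumed to be the code point sequences for `--`, `DOCTYPE`, and `[CDATA[`). When `_consumeSequenceIfMatch` succeeds, the preprocessor sits on the last character of the sequence; the extra 1 accounts for the `!` of `<!`, which was consumed before this state was entered.

```ts
// Distances from the token's '<' to the last matched character:
const DASH_DASH = '--';
const DOCTYPE = 'DOCTYPE';
const CDATA_START = '[CDATA[';

console.log(DASH_DASH.length + 1);   // 3 → "<!--": the last '-' is 3 characters after '<'
console.log(DOCTYPE.length + 1);     // 8 → "<!DOCTYPE": the 'E' is 8 characters after '<'
console.log(CDATA_START.length + 1); // 8 → "<![CDATA[": the final '[' is 8 characters after '<'
```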
@@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
export class Preprocessor {
    public html = '';
    private pos = -1;
    private lastGapPos = -1;
    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
    private lastGapPos = -2;

Review comments on this change:

That would complicate things quite a bit. The issue here is the check …

    private gapStack: number[] = [];
    private skipNextNewLine = false;
    private lastChunkWritten = false;
@@ -106,7 +107,7 @@ export class Preprocessor {
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -1;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }
Review discussion:

Weird? Should examples of differences perhaps be documented here?

It just occurred to me that the `ctLoc` property can be used here; updated accordingly. The issue with doctypes is that they don't start at fixed offsets. E.g. whitespace can push the token creation further back.

How is that different from elements, or anything else, that is also pushed back by whitespace?

Doctype tokens are a bit weird in the way they are created in the spec. All other tokens are created as soon as you can be certain that a specific type of token is needed. With doctypes, this is pushed back until we get the beginning of something that needs to be saved. E.g. `<!doctype html>` would only have its token created on the `h` of `html`.

Do you have a link to the spec that mentions this behavior? I don't understand, from the tokenizing chapter, what makes `DOCTYPE`, `[CDATA[`, or `--` different?

The Before DOCTYPE name state is probably the best example here. The token is only created if a non-whitespace character is found. This is in contrast to the Markup declaration open state, which creates comment tokens after reading a specific number of characters. This allows us to set specific offsets. The same applies to the Tag open state and all of the * end tag open states. (These are all the states in which non-doctype tokens are created.)
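To make that last point concrete, a small illustration (mine, not from the PR) of why the doctype start location is captured separately in `doctypeStartLoc` rather than derived from a fixed offset at token-creation time:

```ts
// For "<!DOCTYPE   html>", the doctype token object is only created once the
// tokenizer reaches the 'h' (the before-DOCTYPE-name state skips whitespace),
// so the distance back to '<' is not fixed. doctypeStartLoc, captured right
// after "DOCTYPE" was matched, already points at the '<'.
const input = '<!DOCTYPE   html>';

const posAfterMatch = input.indexOf('E');                     // 8 — preprocessor sits on the last matched character
const doctypeStart = posAfterMatch - ('DOCTYPE'.length + 1);  // 8 - 8 = 0 → the '<'
const tokenCreatedAt = input.indexOf('html');                 // 12 — varies with the amount of whitespace

console.log(doctypeStart, tokenCreatedAt); // 0 12
```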