refactor(tokenizer): Use explicit offsets for locations
Supersedes inikulin#397

Previously, `ctLoc` was overwritten on every new character. inikulin#397 attempted to improve this by maintaining the location in a single, continuously updated object and creating copies when needed.

With this PR, we instead pass explicit offsets when creating tokens. There is no longer any need to update locations; instead, we exploit the fact that a *DATA state (DATA, RCDATA, RAWTEXT, SCRIPT_DATA or PLAINTEXT) is only entered after the previous token has been emitted.

This change should greatly reduce the number of allocations when location information is enabled.
fb55 committed Feb 11, 2022
1 parent d938786 commit 65cc12c
Showing 3 changed files with 59 additions and 36 deletions.
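As a quick illustration of the offset arithmetic this commit relies on (the `tokenStartOffset` helper and the sample input below are made up for this note; the offset values themselves come from the diff that follows): each token-creation site knows how many characters have been consumed since the token really began, so subtracting that offset from the preprocessor's current position recovers the token's start.

```ts
// Offsets passed by the creation sites in the diff below:
//   start tag  `<div`      -> current char is 'd'            -> offset 1
//   end tag    `</div`     -> current char is 'd'            -> offset 2
//   comment    `<!--`      -> current char is the second '-' -> offset '--'.length + 1 = 3
//   doctype    `<!DOCTYPE` -> current char is 'E'            -> offset 'DOCTYPE'.length + 1 = 8
// In every case, position - offset lands on the '<' that opened the token.

function tokenStartOffset(posOfCurrentChar: number, offset: number): number {
    return posOfCurrentChar - offset;
}

const html = 'a<!--x-->';
// After `<!--` has been matched, the current character is the second '-', at index 4.
console.log(tokenStartOffset(4, '--'.length + 1) === html.indexOf('<')); // true
```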
71 changes: 37 additions & 34 deletions packages/parse5/lib/tokenizer/index.ts
@@ -220,17 +220,22 @@ export class Tokenizer {

private consumedAfterSnapshot = -1;

private ctLoc: Location | null;
private currentCharacterToken: CharacterToken | null = null;
private currentToken: Token | null = null;
private currentAttr: Attribute = { name: '', value: '' };

// NOTE: Doctype tokens are created with several different offsets. We keep track of the moment a doctype *might* start here.
private doctypeStartLoc: Location | null = null;

private addLocationInfo;
private onParseError;

constructor(options: { sourceCodeLocationInfo?: boolean; onParseError?: ParserErrorHandler | null }) {
this.addLocationInfo = !!options.sourceCodeLocationInfo;
this.onParseError = options.onParseError ?? null;
this.preprocessor = new Preprocessor(options);
this.ctLoc = this.getCurrentLocation(-1);
}

//Errors
@@ -239,16 +244,16 @@
}

private currentAttrLocation: Location | null = null;
private ctLoc: Location | null = null;
private _getCurrentLocation(): Location | null {

private getCurrentLocation(offset: number): Location | null {
if (!this.addLocationInfo) {
return null;
}

return {
startLine: this.preprocessor.line,
startCol: this.preprocessor.col,
startOffset: this.preprocessor.offset,
startCol: this.preprocessor.col - offset,
startOffset: this.preprocessor.offset - offset,
endLine: -1,
endCol: -1,
endOffset: -1,
@@ -334,7 +339,7 @@ export class Tokenizer {
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(1),
};
}

@@ -346,15 +351,15 @@
selfClosing: false,
ackSelfClosing: false,
attrs: [],
location: this.ctLoc,
location: this.getCurrentLocation(2),
};
}

private _createCommentToken(): void {
private _createCommentToken(offset: number): void {
this.currentToken = {
type: TokenType.COMMENT,
data: '',
location: this.ctLoc,
location: this.getCurrentLocation(offset),
};
}

@@ -365,7 +370,7 @@
forceQuirks: false,
publicId: null,
systemId: null,
location: this.ctLoc,
location: this.doctypeStartLoc,
};
}

@@ -378,15 +383,15 @@
}

private _createEOFToken(): void {
const ctLoc = this._getCurrentLocation();
const location = this.getCurrentLocation(0);

if (ctLoc) {
ctLoc.endLine = ctLoc.startLine;
ctLoc.endCol = ctLoc.startCol;
ctLoc.endOffset = ctLoc.startOffset;
if (location) {
location.endLine = location.startLine;
location.endCol = location.startCol;
location.endOffset = location.startOffset;
}

this.currentToken = { type: TokenType.EOF, location: ctLoc };
this.currentToken = { type: TokenType.EOF, location };
}

//Tag attributes
@@ -395,7 +400,7 @@
name: attrNameFirstCh,
value: '',
};
this.currentAttrLocation = this._getCurrentLocation();
this.currentAttrLocation = this.getCurrentLocation(0);
}

private _leaveAttrName(): void {
@@ -426,10 +431,10 @@

//Token emission
private _emitCurrentToken(): void {
this._emitCurrentCharacterToken();

const ct = this.currentToken!;

this._emitCurrentCharacterToken(ct.location);

this.currentToken = null;

//NOTE: store the emitted start tag's tagName to determine if the following end tag token is appropriate.
@@ -462,16 +467,17 @@
Expand Down Expand Up @@ -462,16 +467,17 @@ export class Tokenizer {
}

this.tokenQueue.push(ct);
this.ctLoc = this.getCurrentLocation(-1);
}

private _emitCurrentCharacterToken(): void {
private _emitCurrentCharacterToken(nextLocation: Location | null): void {
if (this.currentCharacterToken) {
//NOTE: if we have a pending character token, make its end location equal to the
//current token's start location.
if (this.ctLoc && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = this.ctLoc.startLine;
this.currentCharacterToken.location.endCol = this.ctLoc.startCol;
this.currentCharacterToken.location.endOffset = this.ctLoc.startOffset;
if (nextLocation && this.currentCharacterToken.location) {
this.currentCharacterToken.location.endLine = nextLocation.startLine;
this.currentCharacterToken.location.endCol = nextLocation.startCol;
this.currentCharacterToken.location.endOffset = nextLocation.startOffset;
}

this.tokenQueue.push(this.currentCharacterToken);
Expand All @@ -496,7 +502,8 @@ export class Tokenizer {
//3)TokenType.CHARACTER - any character sequence which doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void {
if (this.currentCharacterToken && this.currentCharacterToken.type !== type) {
this._emitCurrentCharacterToken();
this.ctLoc = this.getCurrentLocation(0);
this._emitCurrentCharacterToken(this.ctLoc);
}

if (this.currentCharacterToken) {
@@ -925,7 +932,6 @@
Expand Down Expand Up @@ -925,7 +932,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand Down Expand Up @@ -956,7 +962,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRcdata(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.AMPERSAND: {
Expand Down Expand Up @@ -987,7 +992,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateRawtext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1013,7 +1017,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateScriptData(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.LESS_THAN_SIGN: {
Expand All @@ -1039,7 +1042,6 @@ export class Tokenizer {
//------------------------------------------------------------------
private _statePlaintext(cp: number): void {
this.preprocessor.dropParsedChunk();
this.ctLoc = this._getCurrentLocation();

switch (cp) {
case $.NULL: {
Expand Down Expand Up @@ -1076,7 +1078,7 @@ export class Tokenizer {
}
case $.QUESTION_MARK: {
this._err(ERR.unexpectedQuestionMarkInsteadOfTagName);
this._createCommentToken();
this._createCommentToken(1);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
break;
Expand Down Expand Up @@ -1118,7 +1120,7 @@ export class Tokenizer {
}
default: {
this._err(ERR.invalidFirstCharacterOfTagName);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
Expand Down Expand Up @@ -1956,16 +1958,17 @@ export class Tokenizer {
//------------------------------------------------------------------
private _stateMarkupDeclarationOpen(cp: number): void {
if (this._consumeSequenceIfMatch($$.DASH_DASH, true)) {
this._createCommentToken();
this._createCommentToken($$.DASH_DASH.length + 1);
this.state = State.COMMENT_START;
} else if (this._consumeSequenceIfMatch($$.DOCTYPE, false)) {
this.doctypeStartLoc = this.getCurrentLocation($$.DOCTYPE.length + 1);
this.state = State.DOCTYPE;
} else if (this._consumeSequenceIfMatch($$.CDATA_START, true)) {
if (this.allowCDATA) {
this.state = State.CDATA_SECTION;
} else {
this._err(ERR.cdataInHtmlContent);
this._createCommentToken();
this._createCommentToken($$.CDATA_START.length + 1);
(this.currentToken as CommentToken).data = '[CDATA[';
this.state = State.BOGUS_COMMENT;
}
Expand All @@ -1975,7 +1978,7 @@ export class Tokenizer {
//results are no longer valid and we will need to start over.
else if (!this._ensureHibernation()) {
this._err(ERR.incorrectlyOpenedComment);
this._createCommentToken();
this._createCommentToken(2);
this.state = State.BOGUS_COMMENT;
this._stateBogusComment(cp);
}
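A note on `_emitCurrentCharacterToken` above: the pending character token is now closed with the start location of the token that follows it, rather than with a `ctLoc` that was refreshed on every character. Below is a minimal sketch of that hand-off, using simplified stand-in types rather than the tokenizer's real ones.

```ts
interface Loc {
    startLine: number; startCol: number; startOffset: number;
    endLine: number; endCol: number; endOffset: number;
}

interface CharToken { chars: string; location: Loc | null }

// The pending text run ends exactly where the next token begins, so its end
// coordinates are copied from the next token's start coordinates.
function closeCharacterToken(pending: CharToken | null, nextLocation: Loc | null): CharToken | null {
    if (pending && pending.location && nextLocation) {
        pending.location.endLine = nextLocation.startLine;
        pending.location.endCol = nextLocation.startCol;
        pending.location.endOffset = nextLocation.startOffset;
    }
    return pending; // the caller pushes this onto the token queue and resets its own state
}
```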
5 changes: 3 additions & 2 deletions packages/parse5/lib/tokenizer/preprocessor.ts
@@ -17,7 +17,8 @@ const DEFAULT_BUFFER_WATERLINE = 1 << 16;
export class Preprocessor {
public html = '';
private pos = -1;
private lastGapPos = -1;
// NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
private lastGapPos = -2;
private gapStack: number[] = [];
private skipNextNewLine = false;
private lastChunkWritten = false;
@@ -106,7 +107,7 @@ export class Preprocessor {
this.lineStartPos -= this.pos;
this.droppedBufferSize += this.pos;
this.pos = 0;
this.lastGapPos = -1;
this.lastGapPos = -2;
this.gapStack.length = 0;
}
}
19 changes: 19 additions & 0 deletions packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts
@@ -77,6 +77,24 @@ it('Location Info (Tokenizer)', () => {
lastStartTagName: 'plaintext',
htmlChunks: ['Text', ' \n', 'Test</plaintext><div>'],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
htmlChunks: [
'\n',
'<!-- regular comment -->',
'<! bogus comment >',
'<? another bogus comment >',
'</!yet another bogus comment>',
'<![CDATA[ cdata as a bogus comment >',
],
},
{
initialMode: TokenizerMode.DATA,
lastStartTagName: '',
allowCDATA: true,
htmlChunks: ['<a>', '<![CDATA[ ', 'CDATA', ' ]]>', '<test>', ' <![CDATA[ ]]>\n'],
},
];

for (const testCase of testCases) {
Expand All @@ -93,6 +111,7 @@ it('Location Info (Tokenizer)', () => {
tokenizer.preprocessor.bufferWaterline = 8;
tokenizer.state = testCase.initialMode;
tokenizer.lastStartTagName = testCase.lastStartTagName;
tokenizer.allowCDATA = !!testCase.allowCDATA;

for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) {
if (token.type === TokenType.HIBERNATION) {
Expand Down
