From ff5fac8119dfd33373ac58852732c471f3f74a8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20B=C3=B6hm?= <188768+fb55@users.noreply.github.com> Date: Mon, 22 Nov 2021 18:16:29 +0000 Subject: [PATCH] refactor: Remove Tokenizer statics --- packages/parse5/lib/common/foreign-content.ts | 5 +- packages/parse5/lib/common/token.ts | 10 + .../extensions/location-info/parser-mixin.ts | 7 +- .../location-info/tokenizer-mixin.test.ts | 17 +- .../location-info/tokenizer-mixin.ts | 4 +- packages/parse5/lib/parser/index.test.ts | 2 +- packages/parse5/lib/parser/index.ts | 361 ++++++++---------- .../{tokenizer.test.ts => index.test.ts} | 0 packages/parse5/lib/tokenizer/index.ts | 121 +++--- .../parse5/lib/tree-adapters/interface.ts | 2 +- packages/parse5/lib/utils/mixin.ts | 2 +- packages/sax-parser/lib/index.ts | 33 +- .../lib/parser-feedback-simulator.ts | 24 +- .../generate-parser-feedback-test/index.ts | 15 +- test/utils/generate-tokenization-tests.ts | 30 +- 15 files changed, 302 insertions(+), 331 deletions(-) rename packages/parse5/lib/tokenizer/{tokenizer.test.ts => index.test.ts} (100%) diff --git a/packages/parse5/lib/common/foreign-content.ts b/packages/parse5/lib/common/foreign-content.ts index 0cee8f5b9..9981363e5 100644 --- a/packages/parse5/lib/common/foreign-content.ts +++ b/packages/parse5/lib/common/foreign-content.ts @@ -1,4 +1,3 @@ -import { Tokenizer } from '../tokenizer/index.js'; import { TAG_NAMES as $, NAMESPACES as NS, ATTRS } from './html.js'; import type { TagToken, Attribute } from './token.js'; @@ -180,9 +179,7 @@ export function causesExit(startTagToken: TagToken) { const tn = startTagToken.tagName; const isFontWithAttrs = tn === $.FONT && - (Tokenizer.getTokenAttr(startTagToken, ATTRS.COLOR) !== null || - Tokenizer.getTokenAttr(startTagToken, ATTRS.SIZE) !== null || - Tokenizer.getTokenAttr(startTagToken, ATTRS.FACE) !== null); + startTagToken.attrs.some(({ name }) => name === ATTRS.COLOR || name === ATTRS.SIZE || name === ATTRS.FACE); return isFontWithAttrs || EXITS_FOREIGN_CONTENT.has(tn); } diff --git a/packages/parse5/lib/common/token.ts b/packages/parse5/lib/common/token.ts index 0df13ebd3..25dcf7d09 100644 --- a/packages/parse5/lib/common/token.ts +++ b/packages/parse5/lib/common/token.ts @@ -62,6 +62,16 @@ export interface TagToken extends TokenBase { location?: LocationWithAttributes; } +export function getTokenAttr(token: TagToken, attrName: string) { + for (let i = token.attrs.length - 1; i >= 0; i--) { + if (token.attrs[i].name === attrName) { + return token.attrs[i].value; + } + } + + return null; +} + export interface CommentToken extends TokenBase { readonly type: TokenType.COMMENT; data: string; diff --git a/packages/parse5/lib/extensions/location-info/parser-mixin.ts b/packages/parse5/lib/extensions/location-info/parser-mixin.ts index 6f8aaf406..47c452f25 100644 --- a/packages/parse5/lib/extensions/location-info/parser-mixin.ts +++ b/packages/parse5/lib/extensions/location-info/parser-mixin.ts @@ -1,12 +1,11 @@ import { CommentToken, DoctypeToken, CharacterToken } from '../../common/token'; import { Mixin } from '../../utils/mixin.js'; -import { Tokenizer } from '../../tokenizer/index.js'; import { LocationInfoTokenizerMixin } from './tokenizer-mixin.js'; import { TAG_NAMES as $, NAMESPACES as NS } from '../../common/html.js'; import type { TreeAdapter, TreeAdapterTypeMap, ElementLocation } from '../../tree-adapters/interface'; import type { Parser } from '../../parser/index.js'; import type { PositionTrackingPreprocessorMixin } from '../position-tracking/preprocessor-mixin'; -import type { Token, TagToken } from '../../common/token.js'; +import { TokenType, Token, TagToken } from '../../common/token.js'; export class LocationInfoParserMixin extends Mixin> { treeAdapter: TreeAdapter; @@ -44,7 +43,7 @@ export class LocationInfoParserMixin extends Mixin // NOTE: For cases like

- First 'p' closes without a closing // tag and for cases like

- 'p' closes without a closing tag. - const isClosingEndTag = closingToken.type === Tokenizer.END_TAG_TOKEN && tn === closingToken.tagName; + const isClosingEndTag = closingToken.type === TokenType.END_TAG && tn === closingToken.tagName; const endLoc: Partial = {}; if (isClosingEndTag) { endLoc.endTag = { ...ctLoc }; @@ -100,7 +99,7 @@ export class LocationInfoParserMixin extends Mixin //NOTE: and are never popped from the stack, so we need to updated //their end location explicitly. const requireExplicitUpdate = - token.type === Tokenizer.END_TAG_TOKEN && + token.type === TokenType.END_TAG && (token.tagName === $.HTML || (token.tagName === $.BODY && this.openElements.hasInScope($.BODY))); if (requireExplicitUpdate) { diff --git a/packages/parse5/lib/extensions/location-info/tokenizer-mixin.test.ts b/packages/parse5/lib/extensions/location-info/tokenizer-mixin.test.ts index 91ef3aa04..528bc61d1 100644 --- a/packages/parse5/lib/extensions/location-info/tokenizer-mixin.test.ts +++ b/packages/parse5/lib/extensions/location-info/tokenizer-mixin.test.ts @@ -1,13 +1,14 @@ import * as assert from 'assert'; -import { Tokenizer } from '../../tokenizer/index.js'; +import { Tokenizer, TokenizerMode } from '../../tokenizer/index.js'; import { LocationInfoTokenizerMixin } from './tokenizer-mixin.js'; import { Mixin } from '../../utils/mixin.js'; +import { TokenType } from './../../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from '../../../../../test/utils/common.js'; it('Location Info (Tokenizer)', () => { const testCases = [ { - initialMode: Tokenizer.MODE.DATA, + initialMode: TokenizerMode.DATA, lastStartTagName: '', htmlChunks: [ '\r\n', @@ -59,22 +60,22 @@ it('Location Info (Tokenizer)', () => { ], }, { - initialMode: Tokenizer.MODE.RCDATA, + initialMode: TokenizerMode.RCDATA, lastStartTagName: 'title', htmlChunks: ['

Test', ' \n ', 'hey', ' ', 'ya!', '', ''], }, { - initialMode: Tokenizer.MODE.RAWTEXT, + initialMode: TokenizerMode.RAWTEXT, lastStartTagName: 'style', htmlChunks: ['.header{', ' \n ', 'color:red;', '\n', '}', '', 'Some', ' ', 'text'], }, { - initialMode: Tokenizer.MODE.SCRIPT_DATA, + initialMode: TokenizerMode.SCRIPT_DATA, lastStartTagName: 'script', htmlChunks: ['var', ' ', 'a=c', ' ', '-', ' ', 'd;', '\n', 'a<--d;', '', '
'], }, { - initialMode: Tokenizer.MODE.PLAINTEXT, + initialMode: TokenizerMode.PLAINTEXT, lastStartTagName: 'plaintext', htmlChunks: ['Text', ' \n', 'Test
'], }, @@ -97,8 +98,8 @@ it('Location Info (Tokenizer)', () => { tokenizer.state = testCase.initialMode; tokenizer.lastStartTagName = testCase.lastStartTagName; - for (let token = tokenizer.getNextToken(), j = 0; token.type !== Tokenizer.EOF_TOKEN; ) { - if (token.type === Tokenizer.HIBERNATION_TOKEN) { + for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) { + if (token.type === TokenType.HIBERNATION) { continue; } diff --git a/packages/parse5/lib/extensions/location-info/tokenizer-mixin.ts b/packages/parse5/lib/extensions/location-info/tokenizer-mixin.ts index 417a5b941..2bd800212 100644 --- a/packages/parse5/lib/extensions/location-info/tokenizer-mixin.ts +++ b/packages/parse5/lib/extensions/location-info/tokenizer-mixin.ts @@ -1,7 +1,7 @@ import { Mixin } from '../../utils/mixin.js'; import { Tokenizer } from '../../tokenizer/index.js'; import { PositionTrackingPreprocessorMixin } from '../position-tracking/preprocessor-mixin.js'; -import { Location, LocationWithAttributes } from '../../common/token.js'; +import { TokenType, Location, LocationWithAttributes } from '../../common/token.js'; export class LocationInfoTokenizerMixin extends Mixin { posTracker: PositionTrackingPreprocessorMixin; @@ -97,7 +97,7 @@ export class LocationInfoTokenizerMixin extends Mixin { currentCharacterToken.location!.endOffset = ctLoc.startOffset; } - if (this.currentToken!.type === Tokenizer.EOF_TOKEN) { + if (this.currentToken!.type === TokenType.EOF) { ctLoc.endLine = ctLoc.startLine; ctLoc.endCol = ctLoc.startCol; ctLoc.endOffset = ctLoc.startOffset; diff --git a/packages/parse5/lib/parser/index.test.ts b/packages/parse5/lib/parser/index.test.ts index 936bcb816..327053bac 100644 --- a/packages/parse5/lib/parser/index.test.ts +++ b/packages/parse5/lib/parser/index.test.ts @@ -7,7 +7,7 @@ import { NAMESPACES as NS } from '../common/html.js'; const origParseFragment = Parser.prototype.parseFragment; -generateParsingTests('parser', 'Parser', { skipFragments: false }, (test, opts) => ({ +generateParsingTests('parser', 'Parser', {}, (test, opts) => ({ node: test.fragmentContext ? parse5.parseFragment(test.fragmentContext, test.input, opts) : parse5.parse(test.input, opts), diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index 2a8b66eac..7913b67ec 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1,4 +1,4 @@ -import { Tokenizer } from '../tokenizer/index.js'; +import { Tokenizer, TokenizerMode } from '../tokenizer/index.js'; import { OpenElementStack } from './open-element-stack.js'; import { FormattingElementList, ElementEntry } from './formatting-element-list.js'; import { LocationInfoParserMixin } from '../extensions/location-info/parser-mixin.js'; @@ -19,7 +19,7 @@ import { } from '../common/html.js'; import type { TreeAdapter, TreeAdapterTypeMap } from '../tree-adapters/interface'; import type { ParserError } from '../extensions/error-reporting/mixin-base'; -import { Token, CommentToken, CharacterToken, TagToken, DoctypeToken } from '../common/token'; +import { TokenType, getTokenAttr, Token, CommentToken, CharacterToken, TagToken, DoctypeToken } from '../common/token'; //Misc constants const HIDDEN_INPUT_TYPE = 'hidden'; @@ -257,14 +257,14 @@ export class Parser { const token = this.tokenizer.getNextToken(); - if (token.type === Tokenizer.HIBERNATION_TOKEN) { + if (token.type === TokenType.HIBERNATION) { break; } if (this.skipNextNewLine) { this.skipNextNewLine = false; - if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN && token.chars[0] === '\n') { + if (token.type === TokenType.WHITESPACE_CHARACTER && token.chars[0] === '\n') { if (token.chars.length === 1) { continue; } @@ -313,10 +313,7 @@ export class Parser { !this._isIntegrationPoint(current); } - _switchToTextParsing( - currentToken: TagToken, - nextTokenizerState: typeof Tokenizer.MODE[keyof typeof Tokenizer.MODE] - ) { + _switchToTextParsing(currentToken: TagToken, nextTokenizerState: typeof TokenizerMode[keyof typeof TokenizerMode]) { this._insertElement(currentToken, NS.HTML); this.tokenizer.state = nextTokenizerState; this.originalInsertionMode = this.insertionMode; @@ -326,7 +323,7 @@ export class Parser { switchToPlaintextParsing() { this.insertionMode = InsertionMode.TEXT; this.originalInsertionMode = InsertionMode.IN_BODY; - this.tokenizer.state = Tokenizer.MODE.PLAINTEXT; + this.tokenizer.state = TokenizerMode.PLAINTEXT; } //Fragment parsing @@ -354,7 +351,7 @@ export class Parser { const tn = this.treeAdapter.getTagName(this.fragmentContext!); if (tn === $.TITLE || tn === $.TEXTAREA) { - this.tokenizer.state = Tokenizer.MODE.RCDATA; + this.tokenizer.state = TokenizerMode.RCDATA; } else if ( tn === $.STYLE || tn === $.XMP || @@ -363,11 +360,11 @@ export class Parser { tn === $.NOFRAMES || tn === $.NOSCRIPT ) { - this.tokenizer.state = Tokenizer.MODE.RAWTEXT; + this.tokenizer.state = TokenizerMode.RAWTEXT; } else if (tn === $.SCRIPT) { - this.tokenizer.state = Tokenizer.MODE.SCRIPT_DATA; + this.tokenizer.state = TokenizerMode.SCRIPT_DATA; } else if (tn === $.PLAINTEXT) { - this.tokenizer.state = Tokenizer.MODE.PLAINTEXT; + this.tokenizer.state = TokenizerMode.PLAINTEXT; } } } @@ -467,32 +464,29 @@ export class Parser { if ( this.treeAdapter.getTagName(current) === $.ANNOTATION_XML && ns === NS.MATHML && - token.type === Tokenizer.START_TAG_TOKEN && + token.type === TokenType.START_TAG && token.tagName === $.SVG ) { return false; } const isCharacterToken = - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN; + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER; const isMathMLTextStartTag = - token.type === Tokenizer.START_TAG_TOKEN && token.tagName !== $.MGLYPH && token.tagName !== $.MALIGNMARK; + token.type === TokenType.START_TAG && token.tagName !== $.MGLYPH && token.tagName !== $.MALIGNMARK; if ((isMathMLTextStartTag || isCharacterToken) && this._isIntegrationPoint(current, NS.MATHML)) { return false; } - if ( - (token.type === Tokenizer.START_TAG_TOKEN || isCharacterToken) && - this._isIntegrationPoint(current, NS.HTML) - ) { + if ((token.type === TokenType.START_TAG || isCharacterToken) && this._isIntegrationPoint(current, NS.HTML)) { return false; } - return token.type !== Tokenizer.EOF_TOKEN; + return token.type !== TokenType.EOF; } _processToken(token: Token) { @@ -546,17 +540,17 @@ export class Parser { } _processTokenInForeignContent(token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInForeignContent(this, token); - } else if (token.type === Tokenizer.NULL_CHARACTER_TOKEN) { + } else if (token.type === TokenType.NULL_CHARACTER) { nullCharacterInForeignContent(this, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { this._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(this, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInForeignContent(this, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInForeignContent(this, token); } } @@ -568,7 +562,7 @@ export class Parser { this._processToken(token); } - if (token.type === Tokenizer.START_TAG_TOKEN && token.selfClosing && !token.ackSelfClosing) { + if (token.type === TokenType.START_TAG && token.selfClosing && !token.ackSelfClosing) { this._err(ERR.nonVoidHtmlElementStartTagWithTrailingSolidus); } } @@ -932,11 +926,11 @@ function stopParsing(p: Parser) { // The "initial" insertion mode //------------------------------------------------------------------ function modeInitial(p: Parser, token: Token) { - if (token.type === Tokenizer.COMMENT_TOKEN) { + if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.DOCTYPE_TOKEN) { + } else if (token.type === TokenType.DOCTYPE) { doctypeInInitialMode(p, token); - } else if (token.type !== Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type !== TokenType.WHITESPACE_CHARACTER) { tokenInInitialMode(p, token); } } @@ -965,17 +959,13 @@ function tokenInInitialMode(p: Parser, token: T // The "before html" insertion mode //------------------------------------------------------------------ function modeBeforeHtml(p: Parser, token: Token) { - if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.EOF_TOKEN - ) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER || token.type === TokenType.EOF) { tokenBeforeHtml(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagBeforeHtml(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagBeforeHtml(p, token); } } @@ -1006,19 +996,15 @@ function tokenBeforeHtml(p: Parser, token: Toke // The "before head" insertion mode //------------------------------------------------------------------ function modeBeforeHead(p: Parser, token: Token) { - if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.EOF_TOKEN - ) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER || token.type === TokenType.EOF) { tokenBeforeHead(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.DOCTYPE_TOKEN) { + } else if (token.type === TokenType.DOCTYPE) { misplacedDoctype(p); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagBeforeHead(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagBeforeHead(p, token); } } @@ -1057,21 +1043,17 @@ function tokenBeforeHead(p: Parser, token: Toke // The "in head" insertion mode //------------------------------------------------------------------ function modeInHead(p: Parser, token: Token) { - if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.EOF_TOKEN - ) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER || token.type === TokenType.EOF) { tokenInHead(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.DOCTYPE_TOKEN) { + } else if (token.type === TokenType.DOCTYPE) { misplacedDoctype(p); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInHead(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInHead(p, token); } } @@ -1085,18 +1067,18 @@ function startTagInHead(p: Parser, token: TagTo p._appendElement(token, NS.HTML); token.ackSelfClosing = true; } else if (tn === $.TITLE) { - p._switchToTextParsing(token, Tokenizer.MODE.RCDATA); + p._switchToTextParsing(token, TokenizerMode.RCDATA); } else if (tn === $.NOSCRIPT) { if (p.options.scriptingEnabled) { - p._switchToTextParsing(token, Tokenizer.MODE.RAWTEXT); + p._switchToTextParsing(token, TokenizerMode.RAWTEXT); } else { p._insertElement(token, NS.HTML); p.insertionMode = InsertionMode.IN_HEAD_NO_SCRIPT; } } else if (tn === $.NOFRAMES || tn === $.STYLE) { - p._switchToTextParsing(token, Tokenizer.MODE.RAWTEXT); + p._switchToTextParsing(token, TokenizerMode.RAWTEXT); } else if (tn === $.SCRIPT) { - p._switchToTextParsing(token, Tokenizer.MODE.SCRIPT_DATA); + p._switchToTextParsing(token, TokenizerMode.SCRIPT_DATA); } else if (tn === $.TEMPLATE) { p._insertTemplate(token); p.activeFormattingElements.insertMarker(); @@ -1147,21 +1129,17 @@ function tokenInHead(p: Parser, token: Token) { // The "in head no script" insertion mode //------------------------------------------------------------------ function modeInHeadNoScript(p: Parser, token: Token) { - if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.EOF_TOKEN - ) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER || token.type === TokenType.EOF) { tokenInHeadNoScript(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.DOCTYPE_TOKEN) { + } else if (token.type === TokenType.DOCTYPE) { misplacedDoctype(p); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInHeadNoScript(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInHeadNoScript(p, token); } } @@ -1202,8 +1180,7 @@ function endTagInHeadNoScript(p: Parser, token: } function tokenInHeadNoScript(p: Parser, token: Token) { - const errCode = - token.type === Tokenizer.EOF_TOKEN ? ERR.openElementsLeftAfterEof : ERR.disallowedContentInNoscriptInHead; + const errCode = token.type === TokenType.EOF ? ERR.openElementsLeftAfterEof : ERR.disallowedContentInNoscriptInHead; p._err(errCode); p.openElements.pop(); @@ -1214,21 +1191,17 @@ function tokenInHeadNoScript(p: Parser, token: // The "after head" insertion mode //------------------------------------------------------------------ function modeAfterHead(p: Parser, token: Token) { - if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.EOF_TOKEN - ) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER || token.type === TokenType.EOF) { tokenAfterHead(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.DOCTYPE_TOKEN) { + } else if (token.type === TokenType.DOCTYPE) { misplacedDoctype(p); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagAfterHead(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagAfterHead(p, token); } } @@ -1291,17 +1264,17 @@ function tokenAfterHead(p: Parser, token: Token // The "in body" insertion mode //------------------------------------------------------------------ function modeInBody(p: Parser, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInBody(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInBody(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -1433,7 +1406,7 @@ function plaintextStartTagInBody(p: Parser, tok } p._insertElement(token, NS.HTML); - p.tokenizer.state = Tokenizer.MODE.PLAINTEXT; + p.tokenizer.state = TokenizerMode.PLAINTEXT; } function buttonStartTagInBody(p: Parser, token: TagToken) { @@ -1503,13 +1476,17 @@ function areaStartTagInBody(p: Parser, token: T token.ackSelfClosing = true; } +function isHiddenInput(token: TagToken) { + const inputType = getTokenAttr(token, ATTRS.TYPE); + + return inputType != null && inputType.toLowerCase() === HIDDEN_INPUT_TYPE; +} + function inputStartTagInBody(p: Parser, token: TagToken) { p._reconstructActiveFormattingElements(); p._appendElement(token, NS.HTML); - const inputType = Tokenizer.getTokenAttr(token, ATTRS.TYPE); - - if (!inputType || inputType.toLowerCase() !== HIDDEN_INPUT_TYPE) { + if (!isHiddenInput(token)) { p.framesetOk = false; } @@ -1541,7 +1518,7 @@ function textareaStartTagInBody(p: Parser, toke //NOTE: If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move //on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.) p.skipNextNewLine = true; - p.tokenizer.state = Tokenizer.MODE.RCDATA; + p.tokenizer.state = TokenizerMode.RCDATA; p.originalInsertionMode = p.insertionMode; p.framesetOk = false; p.insertionMode = InsertionMode.TEXT; @@ -1554,18 +1531,18 @@ function xmpStartTagInBody(p: Parser, token: Ta p._reconstructActiveFormattingElements(); p.framesetOk = false; - p._switchToTextParsing(token, Tokenizer.MODE.RAWTEXT); + p._switchToTextParsing(token, TokenizerMode.RAWTEXT); } function iframeStartTagInBody(p: Parser, token: TagToken) { p.framesetOk = false; - p._switchToTextParsing(token, Tokenizer.MODE.RAWTEXT); + p._switchToTextParsing(token, TokenizerMode.RAWTEXT); } //NOTE: here we assume that we always act as an user agent with enabled plugins, so we parse // as a rawtext. function noembedStartTagInBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken) { - p._switchToTextParsing(token, Tokenizer.MODE.RAWTEXT); + p._switchToTextParsing(token, TokenizerMode.RAWTEXT); } function selectStartTagInBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken) { @@ -2119,14 +2096,14 @@ function eofInBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { //------------------------------------------------------------------ function modeText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER ) { p._insertCharacters(token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInText(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInText(p, token); } } @@ -2151,18 +2128,18 @@ function eofInText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { //------------------------------------------------------------------ function modeInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER ) { characterInTable(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInTable(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInTable(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2223,9 +2200,7 @@ function tableStartTagInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token: } function inputStartTagInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken) { - const inputType = Tokenizer.getTokenAttr(token, ATTRS.TYPE); - - if (inputType && inputType.toLowerCase() === HIDDEN_INPUT_TYPE) { + if (isHiddenInput(token)) { p._appendElement(token, NS.HTML); } else { tokenInTable(p, token); @@ -2361,11 +2336,11 @@ function tokenInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) // The "in table text" insertion mode //------------------------------------------------------------------ function modeInTableText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInTableText(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInTableText(p, token); - } else if (token.type !== Tokenizer.NULL_CHARACTER_TOKEN) { + } else if (token.type !== TokenType.NULL_CHARACTER) { tokenInTableText(p, token); } } @@ -2399,17 +2374,17 @@ function tokenInTableText<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Tok // The "in caption" insertion mode //------------------------------------------------------------------ function modeInCaption<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInCaption(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInCaption(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2475,17 +2450,17 @@ function endTagInCaption<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagT // The "in column group" insertion mode //------------------------------------------------------------------ function modeInColumnGroup<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN || token.type === Tokenizer.NULL_CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER) { tokenInColumnGroup(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInColumnGroup(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInColumnGroup(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2532,18 +2507,18 @@ function tokenInColumnGroup<T extends TreeAdapterTypeMap>(p: Parser<T>, token: T //------------------------------------------------------------------ function modeInTableBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER ) { characterInTable(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInTableBody(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInTableBody(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2613,18 +2588,18 @@ function endTagInTableBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Ta //------------------------------------------------------------------ function modeInRow<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER ) { characterInTable(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInRow(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInRow(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2696,17 +2671,17 @@ function endTagInRow<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToken // The "in cell" insertion mode //------------------------------------------------------------------ function modeInCell<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInCell(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInCell(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2747,15 +2722,15 @@ function endTagInCell<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagToke // The "in select" insertion mode //------------------------------------------------------------------ function modeInSelect<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN || token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInSelect(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInSelect(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2824,15 +2799,15 @@ function endTagInSelect<T extends TreeAdapterTypeMap>(p: Parser<T>, token: TagTo // The "in select in table" insertion mode //------------------------------------------------------------------ function modeInSelectInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN || token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInSelectInTable(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInSelectInTable(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInBody(p, token); } } @@ -2884,17 +2859,17 @@ function endTagInSelectInTable<T extends TreeAdapterTypeMap>(p: Parser<T>, token // The "in template" insertion mode //------------------------------------------------------------------ function modeInTemplate<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { characterInBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInTemplate(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInTemplate(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { eofInTemplate(p, token); } } @@ -2947,17 +2922,17 @@ function eofInTemplate<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) // The "after body" insertion mode //------------------------------------------------------------------ function modeAfterBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.CHARACTER_TOKEN || token.type === Tokenizer.NULL_CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER || token.type === TokenType.NULL_CHARACTER) { tokenAfterBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendCommentToRootHtmlElement(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagAfterBody(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagAfterBody(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { stopParsing(p); } } @@ -2988,15 +2963,15 @@ function tokenAfterBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token // The "in frameset" insertion mode //------------------------------------------------------------------ function modeInFrameset<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagInFrameset(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagInFrameset(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { stopParsing(p); } } @@ -3029,15 +3004,15 @@ function endTagInFrameset<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Tag // The "after frameset" insertion mode //------------------------------------------------------------------ function modeAfterFrameset<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + if (token.type === TokenType.WHITESPACE_CHARACTER) { p._insertCharacters(token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendComment(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagAfterFrameset(p, token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { endTagAfterFrameset(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { stopParsing(p); } } @@ -3062,18 +3037,18 @@ function endTagAfterFrameset<T extends TreeAdapterTypeMap>(p: Parser<T>, token: //------------------------------------------------------------------ function modeAfterAfterBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN || - token.type === Tokenizer.END_TAG_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.NULL_CHARACTER || + token.type === TokenType.END_TAG ) { tokenAfterAfterBody(p, token); - } else if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + } else if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendCommentToDocument(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagAfterAfterBody(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { stopParsing(p); } } @@ -3094,13 +3069,13 @@ function tokenAfterAfterBody<T extends TreeAdapterTypeMap>(p: Parser<T>, token: // The "after after frameset" insertion mode //------------------------------------------------------------------ function modeAfterAfterFrameset<T extends TreeAdapterTypeMap>(p: Parser<T>, token: Token) { - if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { + if (token.type === TokenType.WHITESPACE_CHARACTER) { whitespaceCharacterInBody(p, token); - } else if (token.type === Tokenizer.COMMENT_TOKEN) { + } else if (token.type === TokenType.COMMENT) { appendCommentToDocument(p, token); - } else if (token.type === Tokenizer.START_TAG_TOKEN) { + } else if (token.type === TokenType.START_TAG) { startTagAfterAfterFrameset(p, token); - } else if (token.type === Tokenizer.EOF_TOKEN) { + } else if (token.type === TokenType.EOF) { stopParsing(p); } } diff --git a/packages/parse5/lib/tokenizer/tokenizer.test.ts b/packages/parse5/lib/tokenizer/index.test.ts similarity index 100% rename from packages/parse5/lib/tokenizer/tokenizer.test.ts rename to packages/parse5/lib/tokenizer/index.test.ts diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index 1ba869bce..89f572423 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -1,13 +1,25 @@ import { Preprocessor } from './preprocessor.js'; -import * as unicode from '../common/unicode.js'; -import { TokenType, Token, CharacterToken, DoctypeToken, TagToken, CommentToken, Attribute } from '../common/token.js'; +import { + CODE_POINTS as $, + CODE_POINT_SEQUENCES as $$, + REPLACEMENT_CHARACTER, + isSurrogate, + isUndefinedCodePoint, + isControlCodePoint, +} from '../common/unicode.js'; +import { + TokenType, + Token, + CharacterToken, + DoctypeToken, + TagToken, + getTokenAttr, + CommentToken, + Attribute, +} from '../common/token.js'; import { namedEntityData as neTree } from './named-entity-data.js'; import { ERR } from '../common/error-codes.js'; -//Aliases -const $ = unicode.CODE_POINTS; -const $$ = unicode.CODE_POINT_SEQUENCES; - //C1 Unicode control character reference replacements const C1_CONTROLS_REFERENCE_REPLACEMENTS = new Map([ [0x80, 0x20ac], @@ -129,6 +141,16 @@ enum State { NUMERIC_CHARACTER_REFERENCE_END, } +//Tokenizer initial states for different modes +export const TokenizerMode = { + DATA: State.DATA, + RCDATA: State.RCDATA, + RAWTEXT: State.RAWTEXT, + SCRIPT_DATA: State.SCRIPT_DATA, + PLAINTEXT: State.PLAINTEXT, + CDATA_SECTION: State.CDATA_SECTION, +} as const; + //Utils //OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline @@ -379,7 +401,7 @@ export class Tokenizer { _leaveAttrName(toState: State) { const token = this.currentToken as TagToken; - if (Tokenizer.getTokenAttr(token, this.currentAttr.name) === null) { + if (getTokenAttr(token, this.currentAttr.name) === null) { token.attrs.push(this.currentAttr); } else { this._err(ERR.duplicateAttribute); @@ -748,7 +770,7 @@ export class Tokenizer { this.state = State.RCDATA_LESS_THAN_SIGN; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._emitEOFToken(); } else { @@ -765,7 +787,7 @@ export class Tokenizer { this.state = State.RAWTEXT_LESS_THAN_SIGN; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._emitEOFToken(); } else { @@ -782,7 +804,7 @@ export class Tokenizer { this.state = State.SCRIPT_DATA_LESS_THAN_SIGN; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._emitEOFToken(); } else { @@ -797,7 +819,7 @@ export class Tokenizer { if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._emitEOFToken(); } else { @@ -866,7 +888,7 @@ export class Tokenizer { token.tagName += toAsciiLowerChar(cp); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.tagName += unicode.REPLACEMENT_CHARACTER; + token.tagName += REPLACEMENT_CHARACTER; } else if (cp === $.EOF) { this._err(ERR.eofInTag); this._emitEOFToken(); @@ -1086,7 +1108,7 @@ export class Tokenizer { this.state = State.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1106,7 +1128,7 @@ export class Tokenizer { } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); this.state = State.SCRIPT_DATA_ESCAPED; - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1129,7 +1151,7 @@ export class Tokenizer { } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); this.state = State.SCRIPT_DATA_ESCAPED; - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1233,7 +1255,7 @@ export class Tokenizer { this._emitChars('<'); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1254,7 +1276,7 @@ export class Tokenizer { } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1278,7 +1300,7 @@ export class Tokenizer { } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); this.state = State.SCRIPT_DATA_DOUBLE_ESCAPED; - this._emitChars(unicode.REPLACEMENT_CHARACTER); + this._emitChars(REPLACEMENT_CHARACTER); } else if (cp === $.EOF) { this._err(ERR.eofInScriptHtmlCommentLikeText); this._emitEOFToken(); @@ -1354,7 +1376,7 @@ export class Tokenizer { this.currentAttr.name += String.fromCodePoint(cp); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this.currentAttr.name += unicode.REPLACEMENT_CHARACTER; + this.currentAttr.name += REPLACEMENT_CHARACTER; } else { this.currentAttr.name += String.fromCodePoint(cp); } @@ -1413,7 +1435,7 @@ export class Tokenizer { this.state = State.CHARACTER_REFERENCE; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + this.currentAttr.value += REPLACEMENT_CHARACTER; } else if (cp === $.EOF) { this._err(ERR.eofInTag); this._emitEOFToken(); @@ -1432,7 +1454,7 @@ export class Tokenizer { this.state = State.CHARACTER_REFERENCE; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + this.currentAttr.value += REPLACEMENT_CHARACTER; } else if (cp === $.EOF) { this._err(ERR.eofInTag); this._emitEOFToken(); @@ -1454,7 +1476,7 @@ export class Tokenizer { this._emitCurrentToken(); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this.currentAttr.value += unicode.REPLACEMENT_CHARACTER; + this.currentAttr.value += REPLACEMENT_CHARACTER; } else if ( cp === $.QUOTATION_MARK || cp === $.APOSTROPHE || @@ -1520,7 +1542,7 @@ export class Tokenizer { this._emitEOFToken(); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.data += unicode.REPLACEMENT_CHARACTER; + token.data += REPLACEMENT_CHARACTER; } else { token.data += String.fromCodePoint(cp); } @@ -1599,7 +1621,7 @@ export class Tokenizer { this.state = State.COMMENT_LESS_THAN_SIGN; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.data += unicode.REPLACEMENT_CHARACTER; + token.data += REPLACEMENT_CHARACTER; } else if (cp === $.EOF) { this._err(ERR.eofInComment); this._emitCurrentToken(); @@ -1744,7 +1766,7 @@ export class Tokenizer { this.state = State.DOCTYPE_NAME; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - this._createDoctypeToken(unicode.REPLACEMENT_CHARACTER); + this._createDoctypeToken(REPLACEMENT_CHARACTER); this.state = State.DOCTYPE_NAME; } else if (cp === $.GREATER_THAN_SIGN) { this._err(ERR.missingDoctypeName); @@ -1778,7 +1800,7 @@ export class Tokenizer { token.name += toAsciiLowerChar(cp); } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.name += unicode.REPLACEMENT_CHARACTER; + token.name += REPLACEMENT_CHARACTER; } else if (cp === $.EOF) { this._err(ERR.eofInDoctype); token.forceQuirks = true; @@ -1893,7 +1915,7 @@ export class Tokenizer { this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.publicId += unicode.REPLACEMENT_CHARACTER; + token.publicId += REPLACEMENT_CHARACTER; } else if (cp === $.GREATER_THAN_SIGN) { this._err(ERR.abruptDoctypePublicIdentifier); token.forceQuirks = true; @@ -1918,7 +1940,7 @@ export class Tokenizer { this.state = State.AFTER_DOCTYPE_PUBLIC_IDENTIFIER; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.publicId += unicode.REPLACEMENT_CHARACTER; + token.publicId += REPLACEMENT_CHARACTER; } else if (cp === $.GREATER_THAN_SIGN) { this._err(ERR.abruptDoctypePublicIdentifier); token.forceQuirks = true; @@ -2067,7 +2089,7 @@ export class Tokenizer { this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.systemId += unicode.REPLACEMENT_CHARACTER; + token.systemId += REPLACEMENT_CHARACTER; } else if (cp === $.GREATER_THAN_SIGN) { this._err(ERR.abruptDoctypeSystemIdentifier); token.forceQuirks = true; @@ -2092,7 +2114,7 @@ export class Tokenizer { this.state = State.AFTER_DOCTYPE_SYSTEM_IDENTIFIER; } else if (cp === $.NULL) { this._err(ERR.unexpectedNullCharacter); - token.systemId += unicode.REPLACEMENT_CHARACTER; + token.systemId += REPLACEMENT_CHARACTER; } else if (cp === $.GREATER_THAN_SIGN) { this._err(ERR.abruptDoctypeSystemIdentifier); token.forceQuirks = true; @@ -2320,12 +2342,12 @@ export class Tokenizer { } else if (this.charRefCode > 0x10ffff) { this._err(ERR.characterReferenceOutsideUnicodeRange); this.charRefCode = $.REPLACEMENT_CHARACTER; - } else if (unicode.isSurrogate(this.charRefCode)) { + } else if (isSurrogate(this.charRefCode)) { this._err(ERR.surrogateCharacterReference); this.charRefCode = $.REPLACEMENT_CHARACTER; - } else if (unicode.isUndefinedCodePoint(this.charRefCode)) { + } else if (isUndefinedCodePoint(this.charRefCode)) { this._err(ERR.noncharacterCharacterReference); - } else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { + } else if (isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { this._err(ERR.controlCharacterReference); const replacement = C1_CONTROLS_REFERENCE_REPLACEMENTS.get(this.charRefCode); @@ -2340,37 +2362,4 @@ export class Tokenizer { this._flushCodePointsConsumedAsCharacterReference(); this._reconsumeInState(this.returnState); } - - //Token types - // TODO Remove in favour of enum - static CHARACTER_TOKEN = TokenType.CHARACTER as const; - static NULL_CHARACTER_TOKEN = TokenType.NULL_CHARACTER as const; - static WHITESPACE_CHARACTER_TOKEN = TokenType.WHITESPACE_CHARACTER as const; - static START_TAG_TOKEN = TokenType.START_TAG as const; - static END_TAG_TOKEN = TokenType.END_TAG as const; - static COMMENT_TOKEN = TokenType.COMMENT as const; - static DOCTYPE_TOKEN = TokenType.DOCTYPE as const; - static EOF_TOKEN = TokenType.EOF as const; - static HIBERNATION_TOKEN = TokenType.HIBERNATION as const; - - //Tokenizer initial states for different modes - static MODE = { - DATA: State.DATA, - RCDATA: State.RCDATA, - RAWTEXT: State.RAWTEXT, - SCRIPT_DATA: State.SCRIPT_DATA, - PLAINTEXT: State.PLAINTEXT, - CDATA_SECTION: State.CDATA_SECTION, - } as const; - - //Static - static getTokenAttr = function (token: TagToken, attrName: string) { - for (let i = token.attrs.length - 1; i >= 0; i--) { - if (token.attrs[i].name === attrName) { - return token.attrs[i].value; - } - } - - return null; - }; } diff --git a/packages/parse5/lib/tree-adapters/interface.ts b/packages/parse5/lib/tree-adapters/interface.ts index 3d255682c..1b5a554e7 100644 --- a/packages/parse5/lib/tree-adapters/interface.ts +++ b/packages/parse5/lib/tree-adapters/interface.ts @@ -163,7 +163,7 @@ export interface TreeAdapter<T extends TreeAdapterTypeMap = TreeAdapterTypeMap> * * @param node - Node. */ - getNodeSourceCodeLocation(node: T['node']): ElementLocation | undefined; + getNodeSourceCodeLocation(node: T['node']): ElementLocation | undefined | null; /** * Returns the given node's parent. diff --git a/packages/parse5/lib/utils/mixin.ts b/packages/parse5/lib/utils/mixin.ts index f272b1ab2..30c86a276 100644 --- a/packages/parse5/lib/utils/mixin.ts +++ b/packages/parse5/lib/utils/mixin.ts @@ -11,7 +11,7 @@ export abstract class Mixin<Host> { } } - abstract _getOverriddenMethods(mixin: Mixin<Host>, originalMethods: Host): Partial<Host>; + protected abstract _getOverriddenMethods(mixin: Mixin<Host>, originalMethods: Host): Partial<Host>; static install<T, Args extends any[] = [], Mix extends Mixin<T> = Mixin<T>>( host: T, diff --git a/packages/sax-parser/lib/index.ts b/packages/sax-parser/lib/index.ts index 9b983289a..b47f2a8a7 100644 --- a/packages/sax-parser/lib/index.ts +++ b/packages/sax-parser/lib/index.ts @@ -2,7 +2,8 @@ import { Transform } from 'stream'; import { Tokenizer } from '@parse5/parse5/lib/tokenizer/index.js'; import { LocationInfoTokenizerMixin } from '@parse5/parse5/lib/extensions/location-info/tokenizer-mixin.js'; import { Mixin } from '@parse5/parse5/lib/utils/mixin.js'; -import type { +import { + TokenType, Token, CharacterToken, TagToken, @@ -110,17 +111,17 @@ export class SAXParser extends Transform { do { token = this.parserFeedbackSimulator.getNextToken(); - if (token.type === Tokenizer.HIBERNATION_TOKEN) { + if (token.type === TokenType.HIBERNATION) { break; } if ( - token.type === Tokenizer.CHARACTER_TOKEN || - token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN || - token.type === Tokenizer.NULL_CHARACTER_TOKEN + token.type === TokenType.CHARACTER || + token.type === TokenType.WHITESPACE_CHARACTER || + token.type === TokenType.NULL_CHARACTER ) { if (this.pendingText === null) { - token.type = Tokenizer.CHARACTER_TOKEN; + token.type = TokenType.CHARACTER; this.pendingText = token; } else { this.pendingText.chars += token.chars; @@ -139,11 +140,11 @@ export class SAXParser extends Transform { this._emitPendingText(); this._handleToken(token); } - } while (!this.stopped && token.type !== Tokenizer.EOF_TOKEN); + } while (!this.stopped && token.type !== TokenType.EOF); } _handleToken(token: Token) { - if (token.type === Tokenizer.EOF_TOKEN) { + if (token.type === TokenType.EOF) { return true; } @@ -209,7 +210,7 @@ const TEXT_EMISSION_HELPER = { }; const TOKEN_EMISSION_HELPERS = { - [Tokenizer.START_TAG_TOKEN]: { + [TokenType.START_TAG]: { eventName: 'startTag', reshapeToken: (origToken: TagToken): StartTag => ({ tagName: origToken.tagName, @@ -218,18 +219,18 @@ const TOKEN_EMISSION_HELPERS = { sourceCodeLocation: origToken.location, }), }, - [Tokenizer.END_TAG_TOKEN]: { + [TokenType.END_TAG]: { eventName: 'endTag', reshapeToken: (origToken: TagToken): EndTag => ({ tagName: origToken.tagName, sourceCodeLocation: origToken.location, }), }, - [Tokenizer.COMMENT_TOKEN]: { + [TokenType.COMMENT]: { eventName: 'comment', reshapeToken: (origToken: CommentToken) => ({ text: origToken.data, sourceCodeLocation: origToken.location }), }, - [Tokenizer.DOCTYPE_TOKEN]: { + [TokenType.DOCTYPE]: { eventName: 'doctype', reshapeToken: (origToken: DoctypeToken): Doctype => ({ name: origToken.name, @@ -238,10 +239,10 @@ const TOKEN_EMISSION_HELPERS = { sourceCodeLocation: origToken.location, }), }, - [Tokenizer.CHARACTER_TOKEN]: TEXT_EMISSION_HELPER, - [Tokenizer.NULL_CHARACTER_TOKEN]: TEXT_EMISSION_HELPER, - [Tokenizer.WHITESPACE_CHARACTER_TOKEN]: TEXT_EMISSION_HELPER, - [Tokenizer.HIBERNATION_TOKEN]: { + [TokenType.CHARACTER]: TEXT_EMISSION_HELPER, + [TokenType.NULL_CHARACTER]: TEXT_EMISSION_HELPER, + [TokenType.WHITESPACE_CHARACTER]: TEXT_EMISSION_HELPER, + [TokenType.HIBERNATION]: { eventName: 'hibernation', reshapeToken: () => ({}), }, diff --git a/packages/sax-parser/lib/parser-feedback-simulator.ts b/packages/sax-parser/lib/parser-feedback-simulator.ts index 33a0fc478..ae4bcd467 100644 --- a/packages/sax-parser/lib/parser-feedback-simulator.ts +++ b/packages/sax-parser/lib/parser-feedback-simulator.ts @@ -1,5 +1,5 @@ -import { Tokenizer } from '@parse5/parse5/lib/tokenizer/index.js'; -import type { Token, TagToken } from '@parse5/parse5/lib/common/token.js'; +import { Tokenizer, TokenizerMode } from '@parse5/parse5/lib/tokenizer/index.js'; +import { TokenType, Token, TagToken } from '@parse5/parse5/lib/common/token.js'; import * as foreignContent from '@parse5/parse5/lib/common/foreign-content.js'; import * as unicode from '@parse5/parse5/lib/common/unicode.js'; import { TAG_NAMES as $, NAMESPACES as NS } from '@parse5/parse5/lib/common/html.js'; @@ -20,19 +20,19 @@ export class ParserFeedbackSimulator { getNextToken(): Token { const token = this.tokenizer.getNextToken(); - if (token.type === Tokenizer.START_TAG_TOKEN) { + if (token.type === TokenType.START_TAG) { this._handleStartTagToken(token); - } else if (token.type === Tokenizer.END_TAG_TOKEN) { + } else if (token.type === TokenType.END_TAG) { this._handleEndTagToken(token); - } else if (token.type === Tokenizer.NULL_CHARACTER_TOKEN && this.inForeignContent) { - token.type = Tokenizer.CHARACTER_TOKEN; + } else if (token.type === TokenType.NULL_CHARACTER && this.inForeignContent) { + token.type = TokenType.CHARACTER; token.chars = unicode.REPLACEMENT_CHARACTER; } else if (this.skipNextNewLine) { - if (token.type !== Tokenizer.HIBERNATION_TOKEN) { + if (token.type !== TokenType.HIBERNATION) { this.skipNextNewLine = false; } - if (token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN && token.chars[0] === '\n') { + if (token.type === TokenType.WHITESPACE_CHARACTER && token.chars[0] === '\n') { if (token.chars.length === 1) { return this.getNextToken(); } @@ -66,11 +66,11 @@ export class ParserFeedbackSimulator { //Token handlers _ensureTokenizerMode(tn: string) { if (tn === $.TEXTAREA || tn === $.TITLE) { - this.tokenizer.state = Tokenizer.MODE.RCDATA; + this.tokenizer.state = TokenizerMode.RCDATA; } else if (tn === $.PLAINTEXT) { - this.tokenizer.state = Tokenizer.MODE.PLAINTEXT; + this.tokenizer.state = TokenizerMode.PLAINTEXT; } else if (tn === $.SCRIPT) { - this.tokenizer.state = Tokenizer.MODE.SCRIPT_DATA; + this.tokenizer.state = TokenizerMode.SCRIPT_DATA; } else if ( tn === $.STYLE || tn === $.IFRAME || @@ -79,7 +79,7 @@ export class ParserFeedbackSimulator { tn === $.NOFRAMES || tn === $.NOSCRIPT ) { - this.tokenizer.state = Tokenizer.MODE.RAWTEXT; + this.tokenizer.state = TokenizerMode.RAWTEXT; } } diff --git a/scripts/generate-parser-feedback-test/index.ts b/scripts/generate-parser-feedback-test/index.ts index ccb70d912..2006370ea 100644 --- a/scripts/generate-parser-feedback-test/index.ts +++ b/scripts/generate-parser-feedback-test/index.ts @@ -2,12 +2,11 @@ import { readFile, writeFile } from 'fs'; import { promisify } from 'util'; import { basename } from 'path'; import { Parser } from '../../packages/parse5/lib/parser/index.js'; -import { Tokenizer } from '../../packages/parse5/lib/tokenizer/index.js'; import * as defaultTreeAdapter from '../../packages/parse5/lib/tree-adapters/default.js'; import { convertTokenToHtml5Lib } from '../../test/utils/generate-tokenization-tests.js'; import { parseDatFile } from '../../test/utils/parse-dat-file.js'; import { addSlashes } from '../../test/utils/common.js'; -import type { Token } from '../../packages/parse5/lib/common/token.js'; +import { TokenType, Token } from '../../packages/parse5/lib/common/token.js'; const readFileAsync = promisify(readFile); const writeFileAsync = promisify(writeFile); @@ -27,15 +26,15 @@ async function main() { } function appendToken(dest: Token[], token: Token) { - if (token.type === Tokenizer.EOF_TOKEN) return; + if (token.type === TokenType.EOF) return; - if (token.type === Tokenizer.NULL_CHARACTER_TOKEN || token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN) { - token.type = Tokenizer.CHARACTER_TOKEN; + if (token.type === TokenType.NULL_CHARACTER || token.type === TokenType.WHITESPACE_CHARACTER) { + token.type = TokenType.CHARACTER; } - if (token.type === Tokenizer.CHARACTER_TOKEN) { + if (token.type === TokenType.CHARACTER) { const lastToken = dest[dest.length - 1]; - if (lastToken?.type === Tokenizer.CHARACTER_TOKEN) { + if (lastToken?.type === TokenType.CHARACTER) { lastToken.chars += token.chars; return; } @@ -53,7 +52,7 @@ function collectParserTokens(html: string) { // NOTE: Needed to split attributes of duplicate <html> and <body> // which are otherwise merged as per tree constructor spec - if (token.type === Tokenizer.START_TAG_TOKEN) { + if (token.type === TokenType.START_TAG) { token.attrs = token.attrs.slice(); } diff --git a/test/utils/generate-tokenization-tests.ts b/test/utils/generate-tokenization-tests.ts index 409bf126f..9a24f17f8 100644 --- a/test/utils/generate-tokenization-tests.ts +++ b/test/utils/generate-tokenization-tests.ts @@ -1,20 +1,20 @@ import * as assert from 'assert'; import * as fs from 'fs'; import * as path from 'path'; -import { Tokenizer } from '../../packages/parse5/lib/tokenizer/index.js'; +import { Tokenizer, TokenizerMode } from '../../packages/parse5/lib/tokenizer/index.js'; import { makeChunks } from './common.js'; -import type { Attribute, Token } from './../../packages/parse5/lib/common/token'; +import { TokenType, Attribute, Token } from './../../packages/parse5/lib/common/token'; type HtmlLibToken = [string, string | null, ...unknown[]]; export function convertTokenToHtml5Lib(token: Token): HtmlLibToken { switch (token.type) { - case Tokenizer.CHARACTER_TOKEN: - case Tokenizer.NULL_CHARACTER_TOKEN: - case Tokenizer.WHITESPACE_CHARACTER_TOKEN: + case TokenType.CHARACTER: + case TokenType.NULL_CHARACTER: + case TokenType.WHITESPACE_CHARACTER: return ['Character', token.chars]; - case Tokenizer.START_TAG_TOKEN: { + case TokenType.START_TAG: { const reformatedAttrs: Record<string, string> = {}; token.attrs.forEach((attr: Attribute) => { @@ -30,15 +30,15 @@ export function convertTokenToHtml5Lib(token: Token): HtmlLibToken { return startTagEntry; } - case Tokenizer.END_TAG_TOKEN: + case TokenType.END_TAG: // NOTE: parser feedback simulator can produce adjusted SVG // tag names for end tag tokens so we need to lower case it return ['EndTag', token.tagName.toLowerCase()]; - case Tokenizer.COMMENT_TOKEN: + case TokenType.COMMENT: return ['Comment', token.data]; - case Tokenizer.DOCTYPE_TOKEN: + case TokenType.DOCTYPE: return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]; default: @@ -63,7 +63,7 @@ function tokenize( ) { const result = { tokens: [], errors: [] }; const { tokenizer, getNextToken } = createTokenSource(result); - let token: Token = { type: Tokenizer.HIBERNATION_TOKEN }; + let token: Token = { type: TokenType.HIBERNATION }; let chunkIdx = 0; // NOTE: set small waterline for testing purposes @@ -81,14 +81,14 @@ function tokenize( } do { - if (token.type === Tokenizer.HIBERNATION_TOKEN) { + if (token.type === TokenType.HIBERNATION) { writeChunk(); } else { appendTokenEntry(result.tokens, convertTokenToHtml5Lib(token)); } token = getNextToken(); - } while (token.type !== Tokenizer.EOF_TOKEN); + } while (token.type !== TokenType.EOF); sortErrors(result); @@ -136,7 +136,7 @@ function concatCharacterTokens(tokenEntries: HtmlLibToken[]) { function getTokenizerSuitableStateName(testDataStateName: string) { const state = - Tokenizer.MODE[testDataStateName.slice(0, -6).replace(' ', '_').toUpperCase() as keyof typeof Tokenizer.MODE]; + TokenizerMode[testDataStateName.slice(0, -6).replace(' ', '_').toUpperCase() as keyof typeof TokenizerMode]; return state; } @@ -157,7 +157,7 @@ interface LoadedTest { name: string; input: string; expected: HtmlLibToken[]; - initialState: typeof Tokenizer.MODE[keyof typeof Tokenizer.MODE]; + initialState: Tokenizer['state']; lastStartTag: string; expectedErrors: string[]; } @@ -226,7 +226,7 @@ export function generateTokenizationTests( const result = tokenize( createTokenSource, chunks, - testData.initialState as typeof Tokenizer.MODE[keyof typeof Tokenizer.MODE], + testData.initialState as Tokenizer['state'], testData.lastStartTag );