From e1d1791c2488359ced3c8a22ade3c6431dc71128 Mon Sep 17 00:00:00 2001 From: Chris Andrejewski Date: Wed, 4 Apr 2018 22:03:01 -0400 Subject: [PATCH] Add line, column, and index numbers to nodes --- README.md | 31 ++++ docs/dist/himalaya.js | 241 ++++++++++++++++++++---------- docs/dist/himalaya.js.map | 2 +- docs/index.html | 83 ++++++++++- src/format.js | 20 ++- src/index.js | 3 +- src/lexer.js | 175 ++++++++++++++-------- src/parser.js | 36 +++-- test/format.js | 45 +++++- test/lexer.js | 206 +++++++++++++++---------- test/parser.js | 306 ++++++++++++++++++++++++++++++++++---- text/ast-spec-v1.md | 20 +++ 12 files changed, 888 insertions(+), 280 deletions(-) diff --git a/README.md b/README.md index b6e590f..a3cf082 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,37 @@ Himalaya handles a lot of HTML's fringe cases, like: ### Preserves Whitespace Himalaya does not cut corners and returns an accurate representation of the HTML supplied. To remove whitespace, post-process the JSON; check out [an example script](https://gist.github.com/andrejewski/773487d4f4a46b16865405d7b74eabf9). +### Line, column, and index positions +Himalaya can include the start and end positions of nodes in the parse output. +To enable this, you can pass `parse` the `parseDefaults` extended with `includePositions: true`: + +```js +import { parse, parseDefaults } from 'himalaya' +parse('<img>', { ...parseDefaults, includePositions: true }) +/* => +[ + { + "type": "element", + "tagName": "img", + "attributes": [], + "children": [], + "position": { + "start": { + "index": 0, + "line": 0, + "column": 0 + }, + "end": { + "index": 5, + "line": 0, + "column": 5 + } + } + } +] +*/ +``` + ## Going back to HTML Himalaya provides a `stringify` method. The following example parses the HTML to JSON then parses the JSON back into HTML. 
diff --git a/docs/dist/himalaya.js b/docs/dist/himalaya.js index 00ad1b0..4910793 100644 --- a/docs/dist/himalaya.js +++ b/docs/dist/himalaya.js @@ -77,17 +77,19 @@ function unquote(str) { return str; } -function format(nodes) { +function format(nodes, options) { return nodes.map(function (node) { var type = node.type; - if (type === 'element') { - var tagName = node.tagName.toLowerCase(); - var attributes = formatAttributes(node.attributes); - var children = format(node.children); - return { type: type, tagName: tagName, attributes: attributes, children: children }; + var outputNode = type === 'element' ? { + type: type, + tagName: node.tagName.toLowerCase(), + attributes: formatAttributes(node.attributes), + children: format(node.children, options) + } : { type: type, content: node.content }; + if (options.includePositions) { + outputNode.position = node.position; } - - return { type: type, content: node.content }; + return outputNode; }); } @@ -130,7 +132,8 @@ var parseDefaults = exports.parseDefaults = { voidTags: _tags.voidTags, closingTags: _tags.closingTags, childlessTags: _tags.childlessTags, - closingTagAncestorBreakers: _tags.closingTagAncestorBreakers + closingTagAncestorBreakers: _tags.closingTagAncestorBreakers, + includePositions: false }; function parse(str) { @@ -153,6 +156,10 @@ function stringify(ast) { Object.defineProperty(exports, "__esModule", { value: true }); +exports.feedPosition = feedPosition; +exports.jumpPosition = jumpPosition; +exports.makeInitialPosition = makeInitialPosition; +exports.copyPosition = copyPosition; exports.default = lexer; exports.lex = lex; exports.findTextEnd = findTextEnd; @@ -166,30 +173,67 @@ exports.lexSkipTag = lexSkipTag; var _compat = require('./compat'); -function _toConsumableArray(arr) { if (Array.isArray(arr)) { for (var i = 0, arr2 = Array(arr.length); i < arr.length; i++) { arr2[i] = arr[i]; } return arr2; } else { return Array.from(arr); } } +function feedPosition(position, str, len) { + var start = 
position.index; + var end = position.index = start + len; + for (var i = start; i < end; i++) { + var char = str.charAt(i); + if (char === '\n') { + position.line++; + position.column = 0; + } else { + position.column++; + } + } +} + +function jumpPosition(position, str, end) { + var len = end - position.index; + return feedPosition(position, str, len); +} + +function makeInitialPosition() { + return { + index: 0, + column: 0, + line: 0 + }; +} + +function copyPosition(position) { + return { + index: position.index, + line: position.line, + column: position.column + }; +} function lexer(str, options) { - var state = { str: str, options: options, cursor: 0, tokens: [] }; + var state = { + str: str, + options: options, + position: makeInitialPosition(), + tokens: [] + }; lex(state); return state.tokens; } function lex(state) { - var str = state.str; + var str = state.str, + childlessTags = state.options.childlessTags; var len = str.length; - while (state.cursor < len) { - var start = state.cursor; + while (state.position.index < len) { + var start = state.position.index; lexText(state); - if (state.cursor === start) { - var isComment = (0, _compat.startsWith)(str, '!--', state.cursor + 1); + if (state.position.index === start) { + var isComment = (0, _compat.startsWith)(str, '!--', start + 1); if (isComment) { lexComment(state); } else { var tagName = lexTag(state); var safeTag = tagName.toLowerCase(); - var childlessTags = state.options.childlessTags; - if ((0, _compat.arrayIncludes)(childlessTags, safeTag)) { lexSkipTag(tagName, state); } @@ -216,60 +260,64 @@ function findTextEnd(str, index) { function lexText(state) { var type = 'text'; var str = state.str, - cursor = state.cursor; + position = state.position; - var textEnd = findTextEnd(str, cursor); + var textEnd = findTextEnd(str, position.index); + if (textEnd === position.index) return; if (textEnd === -1) { - // there is only text left - var _content = str.slice(cursor); - state.cursor = str.length; - 
state.tokens.push({ type: type, content: _content }); - return; + textEnd = str.length; } - if (textEnd === cursor) return; - - var content = str.slice(cursor, textEnd); - state.cursor = textEnd; - state.tokens.push({ type: type, content: content }); + var start = copyPosition(position); + var content = str.slice(position.index, textEnd); + jumpPosition(position, str, textEnd); + var end = copyPosition(position); + state.tokens.push({ type: type, content: content, position: { start: start, end: end } }); } function lexComment(state) { - state.cursor += 4; // "', cursor); - var type = 'comment'; - if (commentEnd === -1) { - // there is only the comment left - var _content2 = str.slice(cursor); - state.cursor = str.length; - state.tokens.push({ type: type, content: _content2 }); - return; + position = state.position; + + var start = copyPosition(position); + feedPosition(position, str, 4); // "', position.index); + var commentEnd = contentEnd + 3; // "-->".length + if (contentEnd === -1) { + contentEnd = commentEnd = str.length; } - var content = str.slice(cursor, commentEnd); - state.cursor = commentEnd + 3; // "-->".length - state.tokens.push({ type: type, content: content }); + var content = str.slice(position.index, contentEnd); + jumpPosition(position, str, commentEnd); + state.tokens.push({ + type: 'comment', + content: content, + position: { + start: start, + end: copyPosition(position) + } + }); } function lexTag(state) { - var str = state.str; + var str = state.str, + position = state.position; { - var secondChar = str.charAt(state.cursor + 1); + var secondChar = str.charAt(position.index + 1); var close = secondChar === '/'; - state.tokens.push({ type: 'tag-start', close: close }); - state.cursor += close ? 2 : 1; + var start = copyPosition(position); + feedPosition(position, str, close ? 
2 : 1); + state.tokens.push({ type: 'tag-start', close: close, position: { start: start } }); } var tagName = lexTagName(state); lexTagAttributes(state); { - var firstChar = str.charAt(state.cursor); + var firstChar = str.charAt(position.index); var _close = firstChar === '/'; - state.tokens.push({ type: 'tag-end', close: _close }); - state.cursor += _close ? 2 : 1; + feedPosition(position, str, _close ? 2 : 1); + var end = copyPosition(position); + state.tokens.push({ type: 'tag-end', close: _close, position: { end: end } }); } return tagName; } @@ -282,10 +330,10 @@ function isWhitespaceChar(char) { function lexTagName(state) { var str = state.str, - cursor = state.cursor; + position = state.position; var len = str.length; - var start = cursor; + var start = position.index; while (start < len) { var char = str.charAt(start); var isTagChar = !(isWhitespaceChar(char) || char === '/' || char === '>'); @@ -301,17 +349,21 @@ function lexTagName(state) { end++; } - state.cursor = end; + jumpPosition(position, str, end); var tagName = str.slice(start, end); - state.tokens.push({ type: 'tag', content: tagName }); + state.tokens.push({ + type: 'tag', + content: tagName + }); return tagName; } function lexTagAttributes(state) { var str = state.str, + position = state.position, tokens = state.tokens; - var cursor = state.cursor; + var cursor = position.index; var quote = null; // null, single-, or double-quote var wordBegin = cursor; // index of word start var words = []; // "key", "key=value", "key='value'", etc @@ -354,7 +406,7 @@ function lexTagAttributes(state) { cursor++; } - state.cursor = cursor; + jumpPosition(position, str, cursor); var wLen = words.length; var type = 'attribute'; @@ -398,13 +450,16 @@ function lexTagAttributes(state) { } } +var push = [].push; + function lexSkipTag(tagName, state) { var str = state.str, - cursor = state.cursor, + position = state.position, tokens = state.tokens; + var safeTagName = tagName.toLowerCase(); var len = str.length; - 
var index = cursor; + var index = position.index; while (index < len) { var nextTag = str.indexOf(' -1) { if (stack[index].tagName === tagName) { - stack.splice(index); - didRewind = true; + shouldRewind = true; break; } } @@ -503,7 +575,8 @@ function parse(state) { if (endToken.type !== 'tag-end') break; cursor++; } - if (didRewind) { + if (shouldRewind) { + rewindStack(stack, index, token.position.start, tokens[cursor - 1].position.end); break; } else { continue; @@ -524,7 +597,7 @@ function parse(state) { var currentIndex = stack.length - 1; while (currentIndex > 0) { if (tagName === stack[currentIndex].tagName) { - stack = stack.slice(0, currentIndex); + rewindStack(stack, currentIndex, token.position.start, token.position.start); var previousIndex = currentIndex - 1; nodes = stack[previousIndex].children; break; @@ -544,19 +617,29 @@ function parse(state) { cursor++; var children = []; - nodes.push({ + var position = { + start: token.position.start, + end: attrToken.position.end + }; + var elementNode = { type: 'element', tagName: tagToken.content, attributes: attributes, - children: children - }); + children: children, + position: position + }; + nodes.push(elementNode); var hasChildren = !(attrToken.close || (0, _compat.arrayIncludes)(options.voidTags, tagName)); if (hasChildren) { - stack.push({ tagName: tagName, children: children }); + var size = stack.push({ tagName: tagName, children: children, position: position }); var innerState = { tokens: tokens, options: options, cursor: cursor, stack: stack }; parse(innerState); cursor = innerState.cursor; + var rewoundInElement = stack.length === size; + if (rewoundInElement) { + elementNode.position.end = tokens[cursor - 1].position.end; + } } } state.cursor = cursor; diff --git a/docs/dist/himalaya.js.map b/docs/dist/himalaya.js.map index 48c12fc..53ce6de 100644 --- a/docs/dist/himalaya.js.map +++ b/docs/dist/himalaya.js.map @@ -1 +1 @@ 
-{"version":3,"names":[],"mappings":"","sources":["himalaya.js"],"sourcesContent":["(function(f){if(typeof exports===\"object\"&&typeof module!==\"undefined\"){module.exports=f()}else if(typeof define===\"function\"&&define.amd){define([],f)}else{var g;if(typeof window!==\"undefined\"){g=window}else if(typeof global!==\"undefined\"){g=global}else if(typeof self!==\"undefined\"){g=self}else{g=this}g.himalaya = f()}})(function(){var define,module,exports;return (function(){function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require==\"function\"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error(\"Cannot find module '\"+o+\"'\");throw f.code=\"MODULE_NOT_FOUND\",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require==\"function\"&&require;for(var o=0;o= 0 ? lookupIndex : len + lookupIndex;\n while (searchIndex < len) {\n var element = array[searchIndex++];\n if (element === searchElement) return true;\n if (isNaNElement && isRealNaN(element)) return true;\n }\n\n return false;\n}\n\n},{}],2:[function(require,module,exports){\n'use strict';\n\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\nexports.splitHead = splitHead;\nexports.unquote = unquote;\nexports.format = format;\nexports.formatAttributes = formatAttributes;\nfunction splitHead(str, sep) {\n var idx = str.indexOf(sep);\n if (idx === -1) return [str];\n return [str.slice(0, idx), str.slice(idx + sep.length)];\n}\n\nfunction unquote(str) {\n var car = str.charAt(0);\n var end = str.length - 1;\n var isQuoteStart = car === '\"' || car === \"'\";\n if (isQuoteStart && car === str.charAt(end)) {\n return str.slice(1, end);\n }\n return str;\n}\n\nfunction format(nodes) {\n return nodes.map(function (node) {\n var type = node.type;\n if (type === 'element') {\n var tagName = node.tagName.toLowerCase();\n var attributes = formatAttributes(node.attributes);\n 
var children = format(node.children);\n return { type: type, tagName: tagName, attributes: attributes, children: children };\n }\n\n return { type: type, content: node.content };\n });\n}\n\nfunction formatAttributes(attributes) {\n return attributes.map(function (attribute) {\n var parts = splitHead(attribute.trim(), '=');\n var key = parts[0];\n var value = typeof parts[1] === 'string' ? unquote(parts[1]) : null;\n return { key: key, value: value };\n });\n}\n\n},{}],3:[function(require,module,exports){\n'use strict';\n\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\nexports.parseDefaults = undefined;\nexports.parse = parse;\nexports.stringify = stringify;\n\nvar _lexer = require('./lexer');\n\nvar _lexer2 = _interopRequireDefault(_lexer);\n\nvar _parser = require('./parser');\n\nvar _parser2 = _interopRequireDefault(_parser);\n\nvar _format = require('./format');\n\nvar _stringify = require('./stringify');\n\nvar _tags = require('./tags');\n\nfunction _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }\n\nvar parseDefaults = exports.parseDefaults = {\n voidTags: _tags.voidTags,\n closingTags: _tags.closingTags,\n childlessTags: _tags.childlessTags,\n closingTagAncestorBreakers: _tags.closingTagAncestorBreakers\n};\n\nfunction parse(str) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : parseDefaults;\n\n var tokens = (0, _lexer2.default)(str, options);\n var nodes = (0, _parser2.default)(tokens, options);\n return (0, _format.format)(nodes, options);\n}\n\nfunction stringify(ast) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : parseDefaults;\n\n return (0, _stringify.toHTML)(ast, options);\n}\n\n},{\"./format\":2,\"./lexer\":4,\"./parser\":5,\"./stringify\":6,\"./tags\":7}],4:[function(require,module,exports){\n'use strict';\n\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\nexports.default = lexer;\nexports.lex = lex;\nexports.findTextEnd = findTextEnd;\nexports.lexText = lexText;\nexports.lexComment = lexComment;\nexports.lexTag = lexTag;\nexports.isWhitespaceChar = isWhitespaceChar;\nexports.lexTagName = lexTagName;\nexports.lexTagAttributes = lexTagAttributes;\nexports.lexSkipTag = lexSkipTag;\n\nvar _compat = require('./compat');\n\nfunction _toConsumableArray(arr) { if (Array.isArray(arr)) { for (var i = 0, arr2 = Array(arr.length); i < arr.length; i++) { arr2[i] = arr[i]; } return arr2; } else { return Array.from(arr); } }\n\nfunction lexer(str, options) {\n var state = { str: str, options: options, cursor: 0, tokens: [] };\n lex(state);\n return state.tokens;\n}\n\nfunction lex(state) {\n var str = state.str;\n\n var len = str.length;\n while (state.cursor < len) {\n var start = state.cursor;\n lexText(state);\n if (state.cursor === start) {\n var isComment = (0, _compat.startsWith)(str, '!--', state.cursor + 1);\n if (isComment) {\n lexComment(state);\n } else {\n var tagName = lexTag(state);\n var safeTag = tagName.toLowerCase();\n var childlessTags = state.options.childlessTags;\n\n if ((0, _compat.arrayIncludes)(childlessTags, safeTag)) {\n lexSkipTag(tagName, state);\n }\n }\n }\n }\n}\n\nvar alphanumeric = /[A-Za-z0-9]/;\nfunction findTextEnd(str, index) {\n while (true) {\n var textEnd = str.indexOf('<', index);\n if (textEnd === -1) {\n return textEnd;\n }\n var char = str.charAt(textEnd + 1);\n if (char === '/' || char === '!' 
|| alphanumeric.test(char)) {\n return textEnd;\n }\n index = textEnd + 1;\n }\n}\n\nfunction lexText(state) {\n var type = 'text';\n var str = state.str,\n cursor = state.cursor;\n\n var textEnd = findTextEnd(str, cursor);\n if (textEnd === -1) {\n // there is only text left\n var _content = str.slice(cursor);\n state.cursor = str.length;\n state.tokens.push({ type: type, content: _content });\n return;\n }\n\n if (textEnd === cursor) return;\n\n var content = str.slice(cursor, textEnd);\n state.cursor = textEnd;\n state.tokens.push({ type: type, content: content });\n}\n\nfunction lexComment(state) {\n state.cursor += 4; // \"', cursor);\n var type = 'comment';\n if (commentEnd === -1) {\n // there is only the comment left\n var _content2 = str.slice(cursor);\n state.cursor = str.length;\n state.tokens.push({ type: type, content: _content2 });\n return;\n }\n\n var content = str.slice(cursor, commentEnd);\n state.cursor = commentEnd + 3; // \"-->\".length\n state.tokens.push({ type: type, content: content });\n}\n\nfunction lexTag(state) {\n var str = state.str;\n\n {\n var secondChar = str.charAt(state.cursor + 1);\n var close = secondChar === '/';\n state.tokens.push({ type: 'tag-start', close: close });\n state.cursor += close ? 2 : 1;\n }\n var tagName = lexTagName(state);\n lexTagAttributes(state);\n {\n var firstChar = str.charAt(state.cursor);\n var _close = firstChar === '/';\n state.tokens.push({ type: 'tag-end', close: _close });\n state.cursor += _close ? 
2 : 1;\n }\n return tagName;\n}\n\n// See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#special-white-space\nvar whitespace = /\\s/;\nfunction isWhitespaceChar(char) {\n return whitespace.test(char);\n}\n\nfunction lexTagName(state) {\n var str = state.str,\n cursor = state.cursor;\n\n var len = str.length;\n var start = cursor;\n while (start < len) {\n var char = str.charAt(start);\n var isTagChar = !(isWhitespaceChar(char) || char === '/' || char === '>');\n if (isTagChar) break;\n start++;\n }\n\n var end = start + 1;\n while (end < len) {\n var _char = str.charAt(end);\n var _isTagChar = !(isWhitespaceChar(_char) || _char === '/' || _char === '>');\n if (!_isTagChar) break;\n end++;\n }\n\n state.cursor = end;\n var tagName = str.slice(start, end);\n state.tokens.push({ type: 'tag', content: tagName });\n return tagName;\n}\n\nfunction lexTagAttributes(state) {\n var str = state.str,\n tokens = state.tokens;\n\n var cursor = state.cursor;\n var quote = null; // null, single-, or double-quote\n var wordBegin = cursor; // index of word start\n var words = []; // \"key\", \"key=value\", \"key='value'\", etc\n var len = str.length;\n while (cursor < len) {\n var char = str.charAt(cursor);\n if (quote) {\n var isQuoteEnd = char === quote;\n if (isQuoteEnd) {\n quote = null;\n }\n cursor++;\n continue;\n }\n\n var isTagEnd = char === '/' || char === '>';\n if (isTagEnd) {\n if (cursor !== wordBegin) {\n words.push(str.slice(wordBegin, cursor));\n }\n break;\n }\n\n var isWordEnd = isWhitespaceChar(char);\n if (isWordEnd) {\n if (cursor !== wordBegin) {\n words.push(str.slice(wordBegin, cursor));\n }\n wordBegin = cursor + 1;\n cursor++;\n continue;\n }\n\n var isQuoteStart = char === '\\'' || char === '\"';\n if (isQuoteStart) {\n quote = char;\n cursor++;\n continue;\n }\n\n cursor++;\n }\n state.cursor = cursor;\n\n var wLen = words.length;\n var type = 'attribute';\n for (var i = 0; i < wLen; i++) {\n var word = words[i];\n var 
isNotPair = word.indexOf('=') === -1;\n if (isNotPair) {\n var secondWord = words[i + 1];\n if (secondWord && (0, _compat.startsWith)(secondWord, '=')) {\n if (secondWord.length > 1) {\n var newWord = word + secondWord;\n tokens.push({ type: type, content: newWord });\n i += 1;\n continue;\n }\n var thirdWord = words[i + 2];\n i += 1;\n if (thirdWord) {\n var _newWord = word + '=' + thirdWord;\n tokens.push({ type: type, content: _newWord });\n i += 1;\n continue;\n }\n }\n }\n if ((0, _compat.endsWith)(word, '=')) {\n var _secondWord = words[i + 1];\n if (_secondWord && !(0, _compat.stringIncludes)(_secondWord, '=')) {\n var _newWord3 = word + _secondWord;\n tokens.push({ type: type, content: _newWord3 });\n i += 1;\n continue;\n }\n\n var _newWord2 = word.slice(0, -1);\n tokens.push({ type: type, content: _newWord2 });\n continue;\n }\n\n tokens.push({ type: type, content: word });\n }\n}\n\nfunction lexSkipTag(tagName, state) {\n var str = state.str,\n cursor = state.cursor,\n tokens = state.tokens;\n\n var len = str.length;\n var index = cursor;\n while (index < len) {\n var nextTag = str.indexOf('= 0) {\n var parentTagName = stack[currentIndex].tagName;\n if (parentTagName === tagName) {\n break;\n }\n if ((0, _compat.arrayIncludes)(tagParents, parentTagName)) {\n return true;\n }\n currentIndex--;\n }\n }\n return false;\n}\n\nfunction parse(state) {\n var tokens = state.tokens,\n options = state.options;\n var stack = state.stack;\n\n var nodes = stack[stack.length - 1].children;\n var len = tokens.length;\n var cursor = state.cursor;\n\n while (cursor < len) {\n var token = tokens[cursor];\n if (token.type !== 'tag-start') {\n nodes.push(token);\n cursor++;\n continue;\n }\n\n var tagToken = tokens[++cursor];\n cursor++;\n var tagName = tagToken.content.toLowerCase();\n if (token.close) {\n var index = stack.length;\n var didRewind = false;\n while (--index > -1) {\n if (stack[index].tagName === tagName) {\n stack.splice(index);\n didRewind = true;\n 
break;\n }\n }\n while (cursor < len) {\n var endToken = tokens[cursor];\n if (endToken.type !== 'tag-end') break;\n cursor++;\n }\n if (didRewind) {\n break;\n } else {\n continue;\n }\n }\n\n var isClosingTag = (0, _compat.arrayIncludes)(options.closingTags, tagName);\n var shouldRewindToAutoClose = isClosingTag;\n if (shouldRewindToAutoClose) {\n var terminals = options.closingTagAncestorBreakers;\n\n shouldRewindToAutoClose = !hasTerminalParent(tagName, stack, terminals);\n }\n\n if (shouldRewindToAutoClose) {\n // rewind the stack to just above the previous\n // closing tag of the same name\n var currentIndex = stack.length - 1;\n while (currentIndex > 0) {\n if (tagName === stack[currentIndex].tagName) {\n stack = stack.slice(0, currentIndex);\n var previousIndex = currentIndex - 1;\n nodes = stack[previousIndex].children;\n break;\n }\n currentIndex = currentIndex - 1;\n }\n }\n\n var attributes = [];\n var attrToken = void 0;\n while (cursor < len) {\n attrToken = tokens[cursor];\n if (attrToken.type === 'tag-end') break;\n attributes.push(attrToken.content);\n cursor++;\n }\n\n cursor++;\n var children = [];\n nodes.push({\n type: 'element',\n tagName: tagToken.content,\n attributes: attributes,\n children: children\n });\n\n var hasChildren = !(attrToken.close || (0, _compat.arrayIncludes)(options.voidTags, tagName));\n if (hasChildren) {\n stack.push({ tagName: tagName, children: children });\n var innerState = { tokens: tokens, options: options, cursor: cursor, stack: stack };\n parse(innerState);\n cursor = innerState.cursor;\n }\n }\n state.cursor = cursor;\n}\n\n},{\"./compat\":1}],6:[function(require,module,exports){\n'use strict';\n\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\nexports.formatAttributes = formatAttributes;\nexports.toHTML = toHTML;\n\nvar _compat = require('./compat');\n\nfunction formatAttributes(attributes) {\n return attributes.reduce(function (attrs, attribute) {\n var key = attribute.key,\n value = 
attribute.value;\n\n if (value === null) {\n return attrs + ' ' + key;\n }\n var quoteEscape = value.indexOf('\\'') !== -1;\n var quote = quoteEscape ? '\"' : '\\'';\n return attrs + ' ' + key + '=' + quote + value + quote;\n }, '');\n}\n\nfunction toHTML(tree, options) {\n return tree.map(function (node) {\n if (node.type === 'text') {\n return node.content;\n }\n if (node.type === 'comment') {\n return '';\n }\n var tagName = node.tagName,\n attributes = node.attributes,\n children = node.children;\n\n var isSelfClosing = (0, _compat.arrayIncludes)(options.voidTags, tagName.toLowerCase());\n return isSelfClosing ? '<' + tagName + formatAttributes(attributes) + '>' : '<' + tagName + formatAttributes(attributes) + '>' + toHTML(children, options) + '';\n }).join('');\n}\n\nexports.default = { toHTML: toHTML };\n\n},{\"./compat\":1}],7:[function(require,module,exports){\n'use strict';\n\nObject.defineProperty(exports, \"__esModule\", {\n value: true\n});\n/*\n Tags which contain arbitary non-parsed content\n For example: diff --git a/src/format.js b/src/format.js index b43aa09..ffbb47b 100644 --- a/src/format.js +++ b/src/format.js @@ -14,17 +14,21 @@ export function unquote (str) { return str } -export function format (nodes) { +export function format (nodes, options) { return nodes.map(node => { const type = node.type - if (type === 'element') { - const tagName = node.tagName.toLowerCase() - const attributes = formatAttributes(node.attributes) - const children = format(node.children) - return {type, tagName, attributes, children} + const outputNode = type === 'element' + ? 
{ + type, + tagName: node.tagName.toLowerCase(), + attributes: formatAttributes(node.attributes), + children: format(node.children, options) + } + : { type, content: node.content } + if (options.includePositions) { + outputNode.position = node.position } - - return {type, content: node.content} + return outputNode }) } diff --git a/src/index.js b/src/index.js index 07d80d3..0e4cc84 100644 --- a/src/index.js +++ b/src/index.js @@ -13,7 +13,8 @@ export const parseDefaults = { voidTags, closingTags, childlessTags, - closingTagAncestorBreakers + closingTagAncestorBreakers, + includePositions: false } export function parse (str, options = parseDefaults) { diff --git a/src/lexer.js b/src/lexer.js index 0cc2a21..b0b39ac 100644 --- a/src/lexer.js +++ b/src/lexer.js @@ -5,26 +5,65 @@ import { arrayIncludes } from './compat' +export function feedPosition (position, str, len) { + const start = position.index + const end = position.index = start + len + for (let i = start; i < end; i++) { + const char = str.charAt(i) + if (char === '\n') { + position.line++ + position.column = 0 + } else { + position.column++ + } + } +} + +export function jumpPosition (position, str, end) { + const len = end - position.index + return feedPosition(position, str, len) +} + +export function makeInitialPosition () { + return { + index: 0, + column: 0, + line: 0 + } +} + +export function copyPosition (position) { + return { + index: position.index, + line: position.line, + column: position.column + } +} + export default function lexer (str, options) { - const state = {str, options, cursor: 0, tokens: []} + const state = { + str, + options, + position: makeInitialPosition(), + tokens: [] + } lex(state) return state.tokens } export function lex (state) { - const {str} = state + const {str, options: {childlessTags}} = state const len = str.length - while (state.cursor < len) { - const start = state.cursor + while (state.position.index < len) { + const start = state.position.index lexText(state) - if 
(state.cursor === start) { - const isComment = startsWith(str, '!--', state.cursor + 1) + if (state.position.index === start) { + const isComment = startsWith(str, '!--', start + 1) if (isComment) { lexComment(state) } else { const tagName = lexTag(state) const safeTag = tagName.toLowerCase() - const {childlessTags} = state.options if (arrayIncludes(childlessTags, safeTag)) { lexSkipTag(tagName, state) } @@ -50,56 +89,59 @@ export function findTextEnd (str, index) { export function lexText (state) { const type = 'text' - const {str, cursor} = state - const textEnd = findTextEnd(str, cursor) + const {str, position} = state + let textEnd = findTextEnd(str, position.index) + if (textEnd === position.index) return if (textEnd === -1) { - // there is only text left - const content = str.slice(cursor) - state.cursor = str.length - state.tokens.push({type, content}) - return + textEnd = str.length } - if (textEnd === cursor) return - - const content = str.slice(cursor, textEnd) - state.cursor = textEnd - state.tokens.push({type, content}) + const start = copyPosition(position) + const content = str.slice(position.index, textEnd) + jumpPosition(position, str, textEnd) + const end = copyPosition(position) + state.tokens.push({type, content, position: {start, end}}) } export function lexComment (state) { - state.cursor += 4 // "', cursor) - const type = 'comment' - if (commentEnd === -1) { - // there is only the comment left - const content = str.slice(cursor) - state.cursor = str.length - state.tokens.push({type, content}) - return + const {str, position} = state + const start = copyPosition(position) + feedPosition(position, str, 4) // "', position.index) + let commentEnd = contentEnd + 3 // "-->".length + if (contentEnd === -1) { + contentEnd = commentEnd = str.length } - const content = str.slice(cursor, commentEnd) - state.cursor = commentEnd + 3 // "-->".length - state.tokens.push({type, content}) + const content = str.slice(position.index, contentEnd) + 
jumpPosition(position, str, commentEnd) + state.tokens.push({ + type: 'comment', + content, + position: { + start, + end: copyPosition(position) + } + }) } export function lexTag (state) { - const {str} = state + const {str, position} = state { - const secondChar = str.charAt(state.cursor + 1) + const secondChar = str.charAt(position.index + 1) const close = secondChar === '/' - state.tokens.push({type: 'tag-start', close}) - state.cursor += close ? 2 : 1 + const start = copyPosition(position) + feedPosition(position, str, close ? 2 : 1) + state.tokens.push({type: 'tag-start', close, position: {start}}) } const tagName = lexTagName(state) lexTagAttributes(state) { - const firstChar = str.charAt(state.cursor) + const firstChar = str.charAt(position.index) const close = firstChar === '/' - state.tokens.push({type: 'tag-end', close}) - state.cursor += close ? 2 : 1 + feedPosition(position, str, close ? 2 : 1) + const end = copyPosition(position) + state.tokens.push({type: 'tag-end', close, position: {end}}) } return tagName } @@ -111,9 +153,9 @@ export function isWhitespaceChar (char) { } export function lexTagName (state) { - const {str, cursor} = state + const {str, position} = state const len = str.length - let start = cursor + let start = position.index while (start < len) { const char = str.charAt(start) const isTagChar = !(isWhitespaceChar(char) || char === '/' || char === '>') @@ -129,15 +171,18 @@ export function lexTagName (state) { end++ } - state.cursor = end + jumpPosition(position, str, end) const tagName = str.slice(start, end) - state.tokens.push({type: 'tag', content: tagName}) + state.tokens.push({ + type: 'tag', + content: tagName + }) return tagName } export function lexTagAttributes (state) { - const {str, tokens} = state - let cursor = state.cursor + const {str, position, tokens} = state + let cursor = position.index let quote = null // null, single-, or double-quote let wordBegin = cursor // index of word start const words = [] // "key", 
"key=value", "key='value'", etc @@ -180,7 +225,7 @@ export function lexTagAttributes (state) { cursor++ } - state.cursor = cursor + jumpPosition(position, str, cursor) const wLen = words.length const type = 'attribute' @@ -224,10 +269,13 @@ export function lexTagAttributes (state) { } } +const push = [].push + export function lexSkipTag (tagName, state) { - const {str, cursor, tokens} = state + const {str, position, tokens} = state + const safeTagName = tagName.toLowerCase() const len = str.length - let index = cursor + let index = position.index while (index < len) { const nextTag = str.indexOf(' -1) { if (stack[index].tagName === tagName) { - stack.splice(index) - didRewind = true + shouldRewind = true break } } @@ -57,7 +64,8 @@ export function parse (state) { if (endToken.type !== 'tag-end') break cursor++ } - if (didRewind) { + if (shouldRewind) { + rewindStack(stack, index, token.position.start, tokens[cursor - 1].position.end) break } else { continue @@ -77,7 +85,7 @@ export function parse (state) { let currentIndex = stack.length - 1 while (currentIndex > 0) { if (tagName === stack[currentIndex].tagName) { - stack = stack.slice(0, currentIndex) + rewindStack(stack, currentIndex, token.position.start, token.position.start) const previousIndex = currentIndex - 1 nodes = stack[previousIndex].children break @@ -97,19 +105,29 @@ export function parse (state) { cursor++ const children = [] - nodes.push({ + const position = { + start: token.position.start, + end: attrToken.position.end + } + const elementNode = { type: 'element', tagName: tagToken.content, attributes, - children - }) + children, + position + } + nodes.push(elementNode) const hasChildren = !(attrToken.close || arrayIncludes(options.voidTags, tagName)) if (hasChildren) { - stack.push({tagName, children}) + const size = stack.push({tagName, children, position}) const innerState = {tokens, options, cursor, stack} parse(innerState) cursor = innerState.cursor + const rewoundInElement = stack.length === 
size + if (rewoundInElement) { + elementNode.position.end = tokens[cursor - 1].position.end + } } } state.cursor = cursor diff --git a/test/format.js b/test/format.js index 153085c..384f1d2 100644 --- a/test/format.js +++ b/test/format.js @@ -1,5 +1,5 @@ import test from 'ava' -import {parse, parseDefaults} from '../' +import {parse, parseDefaults} from '../lib' import {formatAttributes} from '../lib/format' test('formatAttributes() should return a key-value array', t => { @@ -15,6 +15,49 @@ test('formatAttributes() should return a key-value array', t => { ]) }) +test('parse() should emit positions if includePositions is true', t => { + t.deepEqual( + parse('

Hello world

', { ...parseDefaults, includePositions: true }), + [ + { + type: 'element', + tagName: 'h1', + attributes: [], + children: [ + { + type: 'text', + content: 'Hello world', + position: { + start: { + index: 4, + line: 0, + column: 4 + }, + end: { + index: 15, + line: 0, + column: 15 + } + } + } + ], + position: { + start: { + index: 0, + line: 0, + column: 0 + }, + end: { + index: 20, + line: 0, + column: 20 + } + } + } + ] + ) +}) + /* These tests ensure the parser and v1 formatting align. diff --git a/test/lexer.js b/test/lexer.js index 00c8811..c990a26 100644 --- a/test/lexer.js +++ b/test/lexer.js @@ -10,18 +10,22 @@ import lexer, { isWhitespaceChar } from '../lib/lexer' +function ps (index) { + return { index, line: 0, column: index } +} + test('lexer should return tokens', t => { const str = '

Test case

' const options = {childlessTags: []} const tokens = lexer(str, options) t.deepEqual(tokens, [ - {type: 'tag-start', close: false}, + {type: 'tag-start', close: false, position: {start: ps(0)}}, {type: 'tag', content: 'h1'}, - {type: 'tag-end', close: false}, - {type: 'text', content: 'Test case'}, - {type: 'tag-start', close: true}, + {type: 'tag-end', close: false, position: {end: ps(4)}}, + {type: 'text', content: 'Test case', position: {start: ps(4), end: ps(13)}}, + {type: 'tag-start', close: true, position: {start: ps(13)}}, {type: 'tag', content: 'h1'}, - {type: 'tag-end', close: false} + {type: 'tag-end', close: false, position: {end: ps(str.length)}} ]) }) @@ -31,7 +35,7 @@ test('lexer should parse tags beginning with alphanumeric names', t => { const options = {childlessTags: []} const tokens = lexer(str, options) t.deepEqual(tokens, [ - {type: 'text', content: '2 <= 4 >'} + {type: 'text', content: '2 <= 4 >', position: {start: ps(0), end: ps(str.length)}} ]) } @@ -40,11 +44,11 @@ test('lexer should parse tags beginning with alphanumeric names', t => { const options = {childlessTags: []} const tokens = lexer(str, options) t.deepEqual(tokens, [ - {type: 'text', content: '2 '}, - {type: 'tag-start', close: false}, + {type: 'text', content: '2 ', position: {start: ps(0), end: ps(2)}}, + {type: 'tag-start', close: false, position: {start: ps(2)}}, {type: 'tag', content: 'a'}, {type: 'attribute', content: '4'}, - {type: 'tag-end', close: false} + {type: 'tag-end', close: false, position: {end: ps(str.length)}} ]) } }) @@ -54,13 +58,13 @@ test('lexer should skip lexing the content of childless tags', t => { const options = {childlessTags: ['template']} const tokens = lexer(str, options) t.deepEqual(tokens, [ - {type: 'tag-start', close: false}, + {type: 'tag-start', close: false, position: {start: ps(0)}}, {type: 'tag', content: 'template'}, - {type: 'tag-end', close: false}, - {type: 'text', content: 'Hello '}, - {type: 'tag-start', close: true}, + {type: 
'tag-end', close: false, position: {end: ps(10)}}, + {type: 'text', content: 'Hello ', position: {start: ps(10), end: ps(22)}}, + {type: 'tag-start', close: true, position: {start: ps(22)}}, {type: 'tag', content: 'template'}, - {type: 'tag-end', close: false} + {type: 'tag-end', close: false, position: {end: ps(str.length)}} ]) }) @@ -75,29 +79,37 @@ test('lexText should tokenize the next text segment', t => { const str = 'text that ends' const finish = str.indexOf('<') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexText(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) const token = state.tokens[0] t.deepEqual(token, { type: 'text', - content: 'text that ends' + content: 'text that ends', + position: { + start: ps(0), + end: ps(14) + } }) }) -test('lexText should tokenize from the cursor', t => { +test('lexText should tokenize from the current position', t => { const str = 'abcdtext that ends' const finish = str.indexOf('<') - const state = {str, cursor: 4, tokens: []} + const state = {str, position: ps(4), tokens: []} lexText(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) const token = state.tokens[0] t.deepEqual(token, { type: 'text', - content: 'text that ends' + content: 'text that ends', + position: { + start: ps(4), + end: ps(18) + } }) }) @@ -105,14 +117,18 @@ test('lexText should tokenize safely to string end', t => { const str = 'text that does not end' const finish = str.length - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexText(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) const token = state.tokens[0] t.deepEqual(token, { type: 'text', - content: 'text that does not end' + content: 'text that does not end', + position: { + start: ps(0), + end: ps(str.length) + } }) }) @@ -121,83 +137,99 @@ test('lexText should not add a token for an empty text', t => { const start = 2 const 
finish = 2 - const state = {str, cursor: start, tokens: []} + const state = {str, position: ps(start), tokens: []} lexText(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.is(state.tokens.length, 0) }) test('lexComment should tokenize the next comment', t => { const str = 'abcd' const finish = str.indexOf('abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexComment(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens[0], { type: 'comment', - content: ' this is a comment ' + content: ' this is a comment ', + position: { + start: ps(0), + end: ps(finish) + } }) }) test('lexComment should tokenize safely to string end', t => { const str = '' const finish = str.indexOf('') - const state = {str, cursor: 4, tokens: []} + const state = {str, position: ps(4), tokens: []} lexComment(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens[0], { type: 'comment', - content: ' comment text ' + content: ' comment text ', + position: { + start: ps(4), + end: ps(finish) + } }) }) test('lexComment should add a token for an empty comment', t => { const str = '' const finish = str.length - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexComment(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens[0], { type: 'comment', - content: '' + content: '', + position: { + start: ps(0), + end: ps(finish) + } }) }) test('lexTag should tokenize the next tag', t => { const str = 'abcd' const finish = str.indexOf('abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTag(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ - {type: 'tag-start', close: false}, + {type: 'tag-start', close: false, position: {start: ps(0)}}, {type: 'tag', content: 
'img'}, // not a part of this test - {type: 'tag-end', close: true} + {type: 'tag-end', close: true, position: {end: ps(finish)}} ]) }) test('lexTagName should tokenize the next tag name', t => { const str = 'h1 id="title"> test' const finish = 2 - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagName(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens[0], { type: 'tag', content: 'h1' @@ -206,9 +238,9 @@ test('lexTagName should tokenize the next tag name', t => { test('lexTagName should ignore leading not-tagname characters', t => { const str = '>/ div' - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagName(state) - t.is(state.cursor, str.length) + t.is(state.position.index, str.length) t.deepEqual(state.tokens[0], { type: 'tag', content: 'div' @@ -218,9 +250,9 @@ test('lexTagName should ignore leading not-tagname characters', t => { test('lexTagAttributes should tokenize attributes until tag end', t => { const str = 'yes="no" maybe data-type="array">abcd' const finish = str.indexOf('>abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagAttributes(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'yes="no"'}, {type: 'attribute', content: 'maybe'}, @@ -231,9 +263,9 @@ test('lexTagAttributes should tokenize attributes until tag end', t => { test('lexTagAttributes should tokenize independent of whitespace', t => { const str = 'yes = "no" maybe data-type= "array" key ="value" >abcd' const finish = str.indexOf('>abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagAttributes(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'yes="no"'}, {type: 'attribute', 
content: 'maybe'}, @@ -244,9 +276,9 @@ test('lexTagAttributes should tokenize independent of whitespace', t => { test('lexTagAttributes should handle an unset attribute name', t => { const str = '
' - const state = {str, cursor: 4, tokens: []} + const state = {str, position: ps(4), tokens: []} lexTagAttributes(state) - t.is(state.cursor, str.indexOf('>')) + t.is(state.position.index, str.indexOf('>')) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'foo'}, {type: 'attribute', content: 'bar="baz"'} @@ -255,9 +287,9 @@ test('lexTagAttributes should handle an unset attribute name', t => { test('lexTagAttributes should handle newline separated attributes', t => { const str = '
' - const state = {str, cursor: 4, tokens: []} + const state = {str, position: ps(4), tokens: []} lexTagAttributes(state) - t.is(state.cursor, str.indexOf('>')) + t.is(state.position.index, str.indexOf('>')) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'foo="bar"'}, {type: 'attribute', content: 'baz="bat"'} @@ -266,9 +298,9 @@ test('lexTagAttributes should handle newline separated attributes', t => { test('lexTagAttributes should handle tab separated attributes', t => { const str = '
' - const state = {str, cursor: 4, tokens: []} + const state = {str, position: ps(4), tokens: []} lexTagAttributes(state) - t.is(state.cursor, str.indexOf('>')) + t.is(state.position.index, str.indexOf('>')) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'foo="bar"'}, {type: 'attribute', content: 'baz="bat"'} @@ -278,9 +310,9 @@ test('lexTagAttributes should handle tab separated attributes', t => { test('lexTagAttributes should handle prefixed spacing', t => { const str = ' \n\tyes="no">abcd' const finish = str.indexOf('>abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagAttributes(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'yes="no"'} ]) @@ -289,9 +321,9 @@ test('lexTagAttributes should handle prefixed spacing', t => { test('lexTagAttributes should handle unquoted one-word values', t => { const str = 'num=8 ham = steak>abcd' const finish = str.indexOf('>abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagAttributes(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'num=8'}, {type: 'attribute', content: 'ham=steak'} @@ -301,9 +333,9 @@ test('lexTagAttributes should handle unquoted one-word values', t => { test('lexTagAttributes should handle incomplete attributes', t => { const str = 'x = >abcd' const finish = str.indexOf('>abcd') - const state = {str, cursor: 0, tokens: []} + const state = {str, position: ps(0), tokens: []} lexTagAttributes(state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ {type: 'attribute', content: 'x'} ]) @@ -312,51 +344,63 @@ test('lexTagAttributes should handle incomplete attributes', t => { test('lexSkipTag should tokenize as text until the matching tag name', t => { const str = 'abcd

Test case

' const finish = str.indexOf('') - const state = {str, cursor: 10, tokens: []} + const state = {str, position: ps(10), tokens: []} lexSkipTag('test', state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ - {type: 'text', content: '

Test case

'}, - {type: 'tag-start', close: true}, + {type: 'text', content: '

Test case

', position: {start: ps(10), end: ps(28)}}, + {type: 'tag-start', close: true, position: {start: ps(28)}}, {type: 'tag', content: 'test'}, - {type: 'tag-end', close: false} + {type: 'tag-end', close: false, position: {end: ps(finish)}} ]) }) test('lexSkipTag should stop at the case-insensitive matching tag name', t => { const str = 'proving the point' const finish = str.indexOf('') - const state = {str, cursor: 6, tokens: []} + const state = {str, position: ps(6), tokens: []} lexSkipTag('tEsT', state) - t.is(state.cursor, finish) + t.is(state.position.index, finish) t.deepEqual(state.tokens, [ - {type: 'text', content: 'proving the point'}, - {type: 'tag-start', close: true}, + {type: 'text', content: 'proving the point', position: {start: ps(6), end: ps(29)}}, + {type: 'tag-start', close: true, position: {start: ps(29)}}, {type: 'tag', content: 'TeSt'}, - {type: 'tag-end', close: false} + {type: 'tag-end', close: false, position: {end: ps(finish)}} ]) }) test('lexSkipTag should auto-close if the end tag is not found', t => { const str = '' - const state = {str, cursor: 8, tokens: []} + const state = {str, position: ps(8), tokens: []} + lexSkipTag('script', state) + t.is(state.position.index, str.length) + t.deepEqual(state.tokens, [ + {type: 'text', content: 'proving ', position: {start: ps(8), end: ps(26)}}, + {type: 'tag-start', close: true, position: {start: ps(26)}}, + {type: 'tag', content: 'script'}, + {type: 'tag-end', close: false, position: {end: ps(str.length)}} + ]) +}) + +test('lexSkipTag should not add an empty inner text node', t => { + const str = '' + const state = {str, position: ps(8), tokens: []} lexSkipTag('script', state) - t.is(state.cursor, str.length) + t.is(state.position.index, str.length) t.deepEqual(state.tokens, [ - {type: 'text', content: 'proving '}, - {type: 'tag-start', close: true}, + {type: 'tag-start', close: true, position: {start: ps(8)}}, {type: 'tag', content: 'script'}, - {type: 'tag-end', close: false} + {type: 'tag-end', 
close: false, position: {end: ps(str.length)}} ]) }) diff --git a/test/parser.js b/test/parser.js index 02ec273..6ceebde 100644 --- a/test/parser.js +++ b/test/parser.js @@ -1,6 +1,10 @@ import test from 'ava' -import parser from '../src/parser' -import lexer from '../src/lexer' +import parser from '../lib/parser' +import lexer from '../lib/lexer' + +function ps (index) { + return { index, line: 0, column: index } +} const lexerOptions = { childlessTags: [] } const parserOptions = { @@ -21,9 +25,17 @@ test('parser() should return nodes', t => { children: [ { type: 'text', - content: 'Hello world' + content: 'Hello world', + position: { + start: ps(4), + end: ps(15) + } } - ] + ], + position: { + start: ps(0), + end: ps(str.length) + } } ]) }) @@ -40,19 +52,35 @@ test('parser() should not nest within void tags', t => { children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(5), + end: ps(8) + } }, { type: 'element', tagName: 'img', attributes: [], - children: [] + children: [], + position: { + start: ps(8), + end: ps(14) + } }, { type: 'text', - content: 'def' + content: 'def', + position: { + start: ps(14), + end: ps(17) + } } - ] + ], + position: { + start: ps(0), + end: ps(str.length) + } } ]) }) @@ -75,9 +103,17 @@ test('parser() should handle optional-close tags', t => { children: [ { type: 'text', - content: 'This is one' + content: 'This is one', + position: { + start: ps(3), + end: ps(14) + } } - ] + ], + position: { + start: ps(0), + end: ps(14) + } }, { type: 'element', @@ -86,9 +122,17 @@ test('parser() should handle optional-close tags', t => { children: [ { type: 'text', - content: 'This is two' + content: 'This is two', + position: { + start: ps(17), + end: ps(28) + } } - ] + ], + position: { + start: ps(14), + end: ps(str.length) + } } ]) } @@ -110,7 +154,11 @@ test('parser() should handle optional-close tags', t => { children: [ { type: 'text', - content: 'This is one ' + content: 'This is one ', + position: { + 
start: ps(3), + end: ps(15) + } }, { type: 'element', @@ -119,11 +167,23 @@ test('parser() should handle optional-close tags', t => { children: [ { type: 'text', - content: 'okay' + content: 'okay', + position: { + start: ps(21), + end: ps(25) + } } - ] + ], + position: { + start: ps(15), + end: ps(25) + } } - ] + ], + position: { + start: ps(0), + end: ps(25) + } }, { type: 'element', @@ -132,9 +192,17 @@ test('parser() should handle optional-close tags', t => { children: [ { type: 'text', - content: 'This is two' + content: 'This is two', + position: { + start: ps(28), + end: ps(39) + } } - ] + ], + position: { + start: ps(25), + end: ps(43) + } } ]) } @@ -154,28 +222,52 @@ test('parser() should auto-close unmatched child tags', t => { type: 'element', tagName: 'div', attributes: [], + position: { + start: ps(0), + end: ps(36) + }, children: [ { type: 'text', - content: 'This is ' + content: 'This is ', + position: { + start: ps(5), + end: ps(13) + } }, { type: 'element', tagName: 'b', attributes: [], + position: { + start: ps(13), + end: ps(30) + }, children: [ { type: 'text', - content: 'one ' + content: 'one ', + position: { + start: ps(16), + end: ps(20) + } }, { type: 'element', tagName: 'span', attributes: [], + position: { + start: ps(20), + end: ps(30) + }, children: [ { type: 'text', - content: 'okay' + content: 'okay', + position: { + start: ps(26), + end: ps(30) + } } ] } @@ -201,6 +293,10 @@ test('parser() should report the element attributes', t => { type: 'element', tagName: 'div', attributes: ['class="cake"', 'data-key="abc"', 'disabled'], + position: { + start: ps(0), + end: ps(48) + }, children: [] } ]) @@ -215,10 +311,18 @@ test('parser() should handle unclosed elements', t => { type: 'element', tagName: 'div', attributes: [], + position: { + start: ps(0), + end: ps(str.length) + }, children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(5), + end: ps(str.length) + } } ] } @@ -234,6 +338,10 @@ test('parser() 
should preserve case-sensitive tag names', t => { type: 'element', tagName: 'You-Know-8', attributes: [], + position: { + start: ps(0), + end: ps(str.length) + }, children: [] } ]) @@ -248,16 +356,28 @@ test('parser() should match by case-insensitive tags', t => { type: 'element', tagName: 'div', attributes: [], + position: { + start: ps(0), + end: ps(14) + }, children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(5), + end: ps(8) + } } ] }, { type: 'text', - content: 'def' + content: 'def', + position: { + start: ps(14), + end: ps(17) + } } ]) }) @@ -289,29 +409,53 @@ test('parser() should handle ancestor breaker special case (#39)', t => { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(0), + end: ps(42) + }, children: [ { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(4), + end: ps(37) + }, children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(8), + end: ps(11) + } }, { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(11), + end: ps(32) + }, children: [ { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(15), + end: ps(27) + }, children: [ { type: 'text', - content: 'def' + content: 'def', + position: { + start: ps(19), + end: ps(22) + } } ] } @@ -340,34 +484,62 @@ test('parser() should handle ancestor breaker special case (#39)', t => { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(0), + end: ps(55) + }, children: [ { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(4), + end: ps(50) + }, children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(8), + end: ps(11) + } }, { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(11), + end: ps(45) + }, children: [ { type: 'element', tagName: 'span', attributes: [], + position: { + start: ps(15), + end: ps(40) + }, children: [ { type: 'element', 
tagName: 'li', attributes: [], + position: { + start: ps(21), + end: ps(33) + }, children: [ { type: 'text', - content: 'def' + content: 'def', + position: { + start: ps(25), + end: ps(28) + } } ] } @@ -398,29 +570,53 @@ test('parser() should handle ancestor breaker special case (#39)', t => { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(0), + end: ps(49) + }, children: [ { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(4), + end: ps(44) + }, children: [ { type: 'text', - content: 'abc' + content: 'abc', + position: { + start: ps(8), + end: ps(11) + } }, { type: 'element', tagName: 'ul', attributes: [], + position: { + start: ps(11), + end: ps(39) + }, children: [ { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(15), + end: ps(22) + }, children: [ { type: 'text', - content: 'def' + content: 'def', + position: { + start: ps(19), + end: ps(22) + } } ] }, @@ -428,10 +624,18 @@ test('parser() should handle ancestor breaker special case (#39)', t => { type: 'element', tagName: 'li', attributes: [], + position: { + start: ps(22), + end: ps(34) + }, children: [ { type: 'text', - content: 'ghi' + content: 'ghi', + position: { + start: ps(26), + end: ps(29) + } } ] } @@ -464,41 +668,73 @@ test('parser() should handle nested tables', t => { type: 'element', tagName: 'table', attributes: [], + position: { + start: ps(0), + end: ps(96) + }, children: [ { type: 'element', tagName: 'tbody', attributes: [], + position: { + start: ps(7), + end: ps(88) + }, children: [ { type: 'element', tagName: 'tr', attributes: [], + position: { + start: ps(14), + end: ps(80) + }, children: [ { type: 'element', tagName: 'td', attributes: [], + position: { + start: ps(18), + end: ps(75) + }, children: [ { type: 'element', tagName: 'table', attributes: [], + position: { + start: ps(22), + end: ps(70) + }, children: [ { type: 'element', tagName: 'tbody', attributes: [], + position: { + start: ps(29), + end: ps(62) + 
}, children: [ { type: 'element', tagName: 'tr', attributes: [], + position: { + start: ps(36), + end: ps(54) + }, children: [ { type: 'element', tagName: 'td', attributes: [], + position: { + start: ps(40), + end: ps(49) + }, children: [] } ] @@ -528,8 +764,12 @@ test('parser() should ignore unnecessary closing tags', t => { const nodes = parser(tokens, parserOptions) t.deepEqual(nodes, [ { - 'type': 'text', - 'content': 'x' + type: 'text', + content: 'x', + position: { + start: ps(4), + end: ps(str.length) + } } ]) }) diff --git a/text/ast-spec-v1.md b/text/ast-spec-v1.md index 6c815ca..5d5be80 100644 --- a/text/ast-spec-v1.md +++ b/text/ast-spec-v1.md @@ -69,3 +69,23 @@ interface Text extends Node { ``` A `text` node. + +## Positions +The parser can be configured to emit line, column, and index numbers for nodes. +The `includePositions: true` parse option adds the `position` field: + +```ts +interface Position { + index: number; + line: number; + column: number; +} + +interface Node { + type: string; + position: { + start: Position; + end: Position; + } +} +```