From d03284be3efc2066e6f7e4d5d646f073facc5a11 Mon Sep 17 00:00:00 2001 From: Titus Wormer Date: Mon, 19 Oct 2020 10:49:36 +0200 Subject: [PATCH] Add `light` version This refactors the project to split into several files, and adds a new `stringify-entities/light` module that has a small bundle size but does not have any of the formatting options. Closes GH-9. --- build.js | 2 +- index.js | 158 +----------------- lib/constant/assign.js | 1 + lib/constant/characters.js | 10 ++ dangerous.json => lib/constant/dangerous.json | 0 lib/constant/from-char-code.js | 1 + lib/constant/has-own-property.js | 1 + lib/core.js | 49 ++++++ lib/encode-hexadecimal.js | 14 ++ lib/encode.js | 13 ++ lib/escape.js | 15 ++ lib/index.js | 7 + lib/util/format-basic.js | 5 + lib/util/format-smart.js | 48 ++++++ lib/util/to-decimal.js | 9 + lib/util/to-hexadecimal.js | 11 ++ lib/util/to-named.js | 33 ++++ light.js | 3 + package.json | 7 +- readme.md | 52 ++++-- test.js | 4 +- types/index.d.ts | 14 +- 22 files changed, 268 insertions(+), 189 deletions(-) create mode 100644 lib/constant/assign.js create mode 100644 lib/constant/characters.js rename dangerous.json => lib/constant/dangerous.json (100%) create mode 100644 lib/constant/from-char-code.js create mode 100644 lib/constant/has-own-property.js create mode 100644 lib/core.js create mode 100644 lib/encode-hexadecimal.js create mode 100644 lib/encode.js create mode 100644 lib/escape.js create mode 100644 lib/index.js create mode 100644 lib/util/format-basic.js create mode 100644 lib/util/format-smart.js create mode 100644 lib/util/to-decimal.js create mode 100644 lib/util/to-hexadecimal.js create mode 100644 lib/util/to-named.js create mode 100644 light.js diff --git a/build.js b/build.js index b96e3d8..64f5e4d 100644 --- a/build.js +++ b/build.js @@ -28,6 +28,6 @@ while (++index < length) { } fs.writeFileSync( - path.join('dangerous.json'), + path.join('lib', 'constant', 'dangerous.json'), JSON.stringify(conflict, null, 2) + '\n' ) diff 
--git a/index.js b/index.js index ca465c9..3541ac5 100644 --- a/index.js +++ b/index.js @@ -1,159 +1,3 @@ 'use strict' -var entities = require('character-entities-html4') -var legacy = require('character-entities-legacy') -var hexadecimal = require('is-hexadecimal') -var decimal = require('is-decimal') -var alphanumerical = require('is-alphanumerical') -var dangerous = require('./dangerous.json') - -module.exports = encode -encode.escape = escape - -var own = {}.hasOwnProperty - -// Characters -var equalsTo = 61 - -// List of enforced escapes. -var escapes = ['"', "'", '<', '>', '&', '`'] - -// Map of characters to names. -var characters = construct() - -// Default escapes. -var defaultEscapes = toExpression(escapes) - -// Surrogate pairs. -var surrogatePair = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g - -// Non-ASCII characters. -// eslint-disable-next-line no-control-regex, unicorn/no-hex-escape -var bmp = /[\x01-\t\x0B\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g - -// Encode special characters in `value`. -function encode(value, options) { - var settings = options || {} - var subset = settings.subset - var set = subset ? toExpression(subset) : defaultEscapes - var escapeOnly = settings.escapeOnly - var omit = settings.omitOptionalSemicolons - - value = value.replace(set, replace) - - if (subset || escapeOnly) { - return value - } - - return value - .replace(surrogatePair, replaceSurrogatePair) - .replace(bmp, replace) - - function replaceSurrogatePair(pair, pos, slice) { - return toHexReference( - (pair.charCodeAt(0) - 0xd800) * 0x400 + - pair.charCodeAt(1) - - 0xdc00 + - 0x10000, - slice.charCodeAt(pos + 2), - omit - ) - } - - function replace(char, pos, slice) { - return one(char, slice.charCodeAt(pos + 1), settings) - } -} - -// Shortcut to escape special characters in HTML. -function escape(value) { - return encode(value, {escapeOnly: true, useNamedReferences: true}) -} - -// Encode `char` according to `options`. 
-function one(char, next, options) { - var shortest = options.useShortestReferences - var omit = options.omitOptionalSemicolons - var named - var code - var numeric - var decimal - - if ((shortest || options.useNamedReferences) && own.call(characters, char)) { - named = toNamed(characters[char], next, omit, options.attribute) - } - - if (shortest || !named) { - code = char.charCodeAt(0) - numeric = toHexReference(code, next, omit) - - // Use the shortest numeric reference when requested. - // A simple algorithm would use decimal for all code points under 100, as - // those are shorter than hexadecimal: - // - // * `&#99;` vs `&#x63;` (decimal shorter) - // * `&#100;` vs `&#x64;` (equal) - // - // However, because we take `next` into consideration when `omit` is used, - // And it would be possible that decimals are shorter on bigger values as - // well if `next` is hexadecimal but not decimal, we instead compare both. - if (shortest) { - decimal = toDecimalReference(code, next, omit) - - if (decimal.length < numeric.length) { - numeric = decimal - } - } - } - - if (named && (!shortest || named.length < numeric.length)) { - return named - } - - return numeric -} - -// Transform `code` into an entity. -function toNamed(name, next, omit, attribute) { - var value = '&' + name - - if ( - omit && - own.call(legacy, name) && - dangerous.indexOf(name) === -1 && - (!attribute || (next && next !== equalsTo && !alphanumerical(next))) - ) { - return value - } - - return value + ';' -} - -// Transform `code` into a hexadecimal character reference. -function toHexReference(code, next, omit) { - var value = '&#x' + code.toString(16).toUpperCase() - return omit && next && !hexadecimal(next) ? value : value + ';' -} - -// Transform `code` into a decimal character reference. -function toDecimalReference(code, next, omit) { - var value = '&#' + String(code) - return omit && next && !decimal(next) ? value : value + ';' -} - -// Create an expression for `characters`. 
-function toExpression(characters) { - return new RegExp('[' + characters.join('') + ']', 'g') -} - -// Construct the map. -function construct() { - var chars = {} - var name - - for (name in entities) { - chars[entities[name]] = name - } - - return chars -} +module.exports = require('./lib') diff --git a/lib/constant/assign.js b/lib/constant/assign.js new file mode 100644 index 0000000..97fcde1 --- /dev/null +++ b/lib/constant/assign.js @@ -0,0 +1 @@ +module.exports = Object.assign diff --git a/lib/constant/characters.js b/lib/constant/characters.js new file mode 100644 index 0000000..9e4fc2b --- /dev/null +++ b/lib/constant/characters.js @@ -0,0 +1,10 @@ +var entities = require('character-entities-html4') + +var characters = {} +var name + +module.exports = characters + +for (name in entities) { + characters[entities[name]] = name +} diff --git a/dangerous.json b/lib/constant/dangerous.json similarity index 100% rename from dangerous.json rename to lib/constant/dangerous.json diff --git a/lib/constant/from-char-code.js b/lib/constant/from-char-code.js new file mode 100644 index 0000000..455fd93 --- /dev/null +++ b/lib/constant/from-char-code.js @@ -0,0 +1 @@ +module.exports = String.fromCharCode diff --git a/lib/constant/has-own-property.js b/lib/constant/has-own-property.js new file mode 100644 index 0000000..e8ca849 --- /dev/null +++ b/lib/constant/has-own-property.js @@ -0,0 +1 @@ +module.exports = {}.hasOwnProperty diff --git a/lib/core.js b/lib/core.js new file mode 100644 index 0000000..081f7b9 --- /dev/null +++ b/lib/core.js @@ -0,0 +1,49 @@ +'use strict' + +module.exports = encode + +// Encode special characters in `value`. +function encode(value, options) { + value = value.replace( + options.subset + ? new RegExp('[' + options.subset.join('') + ']', 'g') + : /["&'<>`]/g, + basic + ) + + if (options.subset || options.escapeOnly) { + return value + } + + return ( + value + // Surrogate pairs. 
+ .replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, surrogate) + // BMP control characters (C0 except for LF, CR, SP; DEL; and some more + // non-ASCII ones). + .replace( + // eslint-disable-next-line no-control-regex, unicorn/no-hex-escape + /[\x01-\t\v\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g, + basic + ) + ) + + function surrogate(pair, index, all) { + return options.format( + (pair.charCodeAt(0) - 0xd800) * 0x400 + + pair.charCodeAt(1) - + 0xdc00 + + 0x10000, + all.charCodeAt(index + 2), + options + ) + } + + function basic(character, index, all) { + return options.format( + character.charCodeAt(0), + all.charCodeAt(index + 1), + options + ) + } +} diff --git a/lib/encode-hexadecimal.js b/lib/encode-hexadecimal.js new file mode 100644 index 0000000..a2eaf4d --- /dev/null +++ b/lib/encode-hexadecimal.js @@ -0,0 +1,14 @@ +'use strict' + +var core = require('./core') +var assign = require('./constant/assign') +var basic = require('./util/format-basic') + +module.exports = encodeHexadecimal + +// Encode special characters in `value` as hexadecimals. +function encodeHexadecimal(value, options) { + // Note: this file was added in a minor release, so here we can use + // `Object.assign`. + return core(value, assign({format: basic}, options)) +} diff --git a/lib/encode.js b/lib/encode.js new file mode 100644 index 0000000..0ab0f75 --- /dev/null +++ b/lib/encode.js @@ -0,0 +1,13 @@ +'use strict' + +var xtend = require('xtend') +var core = require('./core') +var smart = require('./util/format-smart') + +module.exports = encode + +// Encode special characters in `value`. +function encode(value, options) { + // Note: Switch to `Object.assign` next major. 
+ return core(value, xtend(options, {format: smart})) +} diff --git a/lib/escape.js b/lib/escape.js new file mode 100644 index 0000000..5dbaaf6 --- /dev/null +++ b/lib/escape.js @@ -0,0 +1,15 @@ +'use strict' + +var core = require('./core') +var smart = require('./util/format-smart') + +module.exports = escape + +// Shortcut to escape special characters in HTML. +function escape(value) { + return core(value, { + escapeOnly: true, + useNamedReferences: true, + format: smart + }) +} diff --git a/lib/index.js b/lib/index.js new file mode 100644 index 0000000..37bc888 --- /dev/null +++ b/lib/index.js @@ -0,0 +1,7 @@ +'use strict' + +var encode = require('./encode') +var escape = require('./escape') + +module.exports = encode +encode.escape = escape diff --git a/lib/util/format-basic.js b/lib/util/format-basic.js new file mode 100644 index 0000000..7d6aee7 --- /dev/null +++ b/lib/util/format-basic.js @@ -0,0 +1,5 @@ +module.exports = formatBasic + +function formatBasic(code) { + return '&#x' + code.toString(16).toUpperCase() + ';' +} diff --git a/lib/util/format-smart.js b/lib/util/format-smart.js new file mode 100644 index 0000000..f9feb7c --- /dev/null +++ b/lib/util/format-smart.js @@ -0,0 +1,48 @@ +module.exports = formatPretty + +var toHexadecimal = require('./to-hexadecimal') +var toDecimal = require('./to-decimal') +var toNamed = require('./to-named') + +// Encode `character` according to `options`. +function formatPretty(code, next, options) { + var named + var numeric + var decimal + + if (options.useNamedReferences || options.useShortestReferences) { + named = toNamed( + code, + next, + options.omitOptionalSemicolons, + options.attribute + ) + } + + if (options.useShortestReferences || !named) { + numeric = toHexadecimal(code, next, options.omitOptionalSemicolons) + + // Use the shortest numeric reference when requested. 
+ // A simple algorithm would use decimal for all code points under 100, as + // those are shorter than hexadecimal: + // + // * `&#99;` vs `&#x63;` (decimal shorter) + // * `&#100;` vs `&#x64;` (equal) + // + // However, because we take `next` into consideration when `omit` is used, + // And it would be possible that decimals are shorter on bigger values as + // well if `next` is hexadecimal but not decimal, we instead compare both. + if (options.useShortestReferences) { + decimal = toDecimal(code, next, options.omitOptionalSemicolons) + + if (decimal.length < numeric.length) { + numeric = decimal + } + } + } + + return named && + (!options.useShortestReferences || named.length < numeric.length) + ? named + : numeric +} diff --git a/lib/util/to-decimal.js b/lib/util/to-decimal.js new file mode 100644 index 0000000..fd83a17 --- /dev/null +++ b/lib/util/to-decimal.js @@ -0,0 +1,9 @@ +module.exports = toDecimalReference + +var fromCharCode = require('../constant/from-char-code') + +// Transform `code` into a decimal character reference. +function toDecimalReference(code, next, omit) { + var value = '&#' + String(code) + return omit && next && !/\d/.test(fromCharCode(next)) ? value : value + ';' +} diff --git a/lib/util/to-hexadecimal.js b/lib/util/to-hexadecimal.js new file mode 100644 index 0000000..3a3f644 --- /dev/null +++ b/lib/util/to-hexadecimal.js @@ -0,0 +1,11 @@ +module.exports = toHexReference + +var fromCharCode = require('../constant/from-char-code') + +// Transform `code` into a hexadecimal character reference. +function toHexReference(code, next, omit) { + var value = '&#x' + code.toString(16).toUpperCase() + return omit && next && !/[\dA-Fa-f]/.test(fromCharCode(next)) + ? 
value + : value + ';' +} diff --git a/lib/util/to-named.js b/lib/util/to-named.js new file mode 100644 index 0000000..b5f4ab8 --- /dev/null +++ b/lib/util/to-named.js @@ -0,0 +1,33 @@ +module.exports = toNamed + +var legacy = require('character-entities-legacy') +var characters = require('../constant/characters') +var fromCharCode = require('../constant/from-char-code') +var own = require('../constant/has-own-property') +var dangerous = require('../constant/dangerous.json') + +// Transform `code` into a named character reference. +function toNamed(code, next, omit, attribute) { + var character = fromCharCode(code) + var name + var value + + if (own.call(characters, character)) { + name = characters[character] + value = '&' + name + + if ( + omit && + own.call(legacy, name) && + dangerous.indexOf(name) === -1 && + (!attribute || + (next && next !== 61 /* `=` */ && /[^\da-z]/i.test(fromCharCode(next)))) + ) { + return value + } + + return value + ';' + } + + return '' +} diff --git a/light.js b/light.js new file mode 100644 index 0000000..9425fa2 --- /dev/null +++ b/light.js @@ -0,0 +1,3 @@ +'use strict' + +module.exports = require('./lib/encode-hexadecimal') diff --git a/package.json b/package.json index ddc1690..b1db948 100644 --- a/package.json +++ b/package.json @@ -24,17 +24,16 @@ "Titus Wormer (https://wooorm.com)" ], "files": [ - "dangerous.json", + "lib/", "index.js", + "light.js", "types/index.d.ts" ], "types": "types/index.d.ts", "dependencies": { "character-entities-html4": "^1.0.0", "character-entities-legacy": "^1.0.0", - "is-alphanumerical": "^1.0.0", - "is-decimal": "^1.0.2", - "is-hexadecimal": "^1.0.0" + "xtend": "^4.0.0" }, "devDependencies": { "browserify": "^17.0.0", diff --git a/readme.md b/readme.md index 1d90b9c..bd9e360 100644 --- a/readme.md +++ b/readme.md @@ -5,13 +5,15 @@ [![Downloads][downloads-badge]][downloads] [![Size][size-badge]][size] -Encode HTML character references and character entities. +Encode HTML character references. 
* [x] Very fast * [x] Just the encoding part +* [x] Has either all the options you need for a minifier/prettifier, or it has + a tiny size w/ `stringify-entities/light` * [x] Reliable: ``'`'`` characters are escaped to ensure no scripts run in Internet Explorer 6 to 8. - Additionally, only named entities recognized by HTML4 are encoded, meaning + Additionally, only named references recognized by HTML4 are encoded, meaning the infamous `&apos;` (which people think is a [virus][]) won’t show up ## Algorithm @@ -21,9 +23,13 @@ encoded. A [subset][] of characters can be given to encode just those characters. Alternatively, pass [`escapeOnly`][escapeonly] to escape just the dangerous characters (`"`, `'`, `<`, `>`, `&`, `` ` ``). -By default, numeric entities are used. -Pass [`useNamedReferences`][named] to use named entities when possible, or -[`useShortestReferences`][short] to use them if that results in less bytes. +By default, hexadecimal character references are used. +Pass [`useNamedReferences`][named] to use named character references when +possible, or [`useShortestReferences`][short] to use them if that results in +less bytes. +There is also a `stringify-entities/light` file exported, which works just like +`stringifyEntities` but without the formatting options: it’s much smaller but +always outputs hexadecimal character references. ## Install @@ -53,39 +59,49 @@ Encode special characters in `value`. ##### `options` +##### Core options + ###### `options.escapeOnly` -Whether to only escape possibly dangerous characters (`boolean`, -default: `false`). -Those characters are `"`, `'`, `<`, `>` `&`, and `` ` ``. +Whether to only escape possibly dangerous characters (`boolean`, default: +`false`). +Those characters are `"`, `&`, `'`, `<`, `>`, and `` ` ``. ###### `options.subset` Whether to only escape the given subset of characters (`Array.<string>`). 
+##### Formatting options + +If you do not care about these, use `stringify-entities/light`, which always +outputs hexadecimal character references. + ###### `options.useNamedReferences` -Whether to use named entities where possible (`boolean?`, default: `false`). +Prefer named character references (`&amp;`) where possible (`boolean?`, default: +`false`). ###### `options.useShortestReferences` -Whether to use named entities, where possible, if that results in less bytes +Prefer the shortest possible reference, if that results in less bytes (`boolean?`, default: `false`). -**Note**: `useNamedReferences` can be omitted when using `useShortestReferences`. +**Note**: `useNamedReferences` can be omitted when using +`useShortestReferences`. ###### `options.omitOptionalSemicolons` Whether to omit semicolons when possible (`boolean?`, default: `false`). -**Note**: This creates parse errors, don’t use this except when building a -minifier. +**Note**: This creates what HTML calls “parse errors” but is otherwise still +valid HTML — don’t use this except when building a minifier. -Omitting semicolons is possible for [certain][dangerous] [legacy][] named -references, and numeric entities, in some cases. +Omitting semicolons is possible for [legacy][] named references in +[certain][dangerous] cases, and numeric references in some cases. ###### `options.attribute` Only needed when operating dangerously with `omitOptionalSemicolons: true`. -Create entities which don’t fail in attributes (`boolean?`, default: `false`). +Create character references which don’t fail in attributes (`boolean?`, default: +`false`). ## Related @@ -94,7 +110,7 @@ Create entities which don’t fail in attributes (`boolean?`, default: `false`). 
* [`character-entities`](https://github.com/wooorm/character-entities) — Info on character entities * [`character-entities-html4`](https://github.com/wooorm/character-entities-html4) - — Info on HTML4 character entities + — Info on HTML 4 character entities * [`character-entities-legacy`](https://github.com/wooorm/character-entities-legacy) — Info on legacy character entities * [`character-reference-invalid`](https://github.com/wooorm/character-reference-invalid) @@ -130,7 +146,7 @@ Create entities which don’t fail in attributes (`boolean?`, default: `false`). [virus]: https://www.telegraph.co.uk/technology/advice/10516839/Why-do-some-apostrophes-get-replaced-with-andapos.html -[dangerous]: dangerous.json +[dangerous]: lib/constant/dangerous.json [legacy]: https://github.com/wooorm/character-entities-legacy diff --git a/test.js b/test.js index 7e64094..0dbfe32 100644 --- a/test.js +++ b/test.js @@ -33,7 +33,7 @@ test('stringifyEntities(value[, options])', function (t) { useNamedReferences: true }), 'foo©bar𝌆baz☃qux', - 'Should use named entities if `useNamedReferences` and possible' + 'Should use named character references if `useNamedReferences` and possible' ) t.equal( @@ -41,7 +41,7 @@ test('stringifyEntities(value[, options])', function (t) { useShortestReferences: true }), 'alpha © bravo ≠ charlie 𝌆 delta " echo', - 'Should use shortest entities if `useShortestReferences`' + 'Should use shortest character references if `useShortestReferences`' ) t.equal( diff --git a/types/index.d.ts b/types/index.d.ts index 847cb8d..80a96ea 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -4,7 +4,7 @@ declare namespace stringifyEntities { interface StringifyEntitiesOptions { /** * Whether to only escape possibly dangerous characters (`boolean`, default: `false`). - * Those characters are `"`, `'`, `<`, `>` `&`, and `` ` ``. + * Those characters are `"`, `&`, `'`, `<`, `>`, and `` ` ``. 
*/ escapeOnly?: boolean @@ -14,27 +14,27 @@ declare namespace stringifyEntities { subset?: string[] /** - * Whether to use named entities where possible (`boolean?`, default: `false`). + * Prefer named character references (`&amp;`) where possible (`boolean?`, default: `false`). */ useNamedReferences?: boolean /** - * Whether to use named entities, where possible, if that results in less bytes (`boolean?`, default: `false`). + * Prefer the shortest possible reference, if that results in less bytes (`boolean?`, default: `false`). * **Note**: `useNamedReferences` can be omitted when using `useShortestReferences`. */ useShortestReferences?: boolean /** - * Whether to omit semi-colons when possible (`boolean?`, default: `false`). - * **Note**: This creates parse errors, don’t use this except when building a minifier. + * Whether to omit semicolons when possible (`boolean?`, default: `false`). + * **Note**: This creates what HTML calls “parse errors” but is otherwise still valid HTML — don’t use this except when building a minifier. * - * Omitting semi-colons is possible for certain legacy named references, and numeric entities, in some cases. + * Omitting semicolons is possible for legacy named references in certain cases, and numeric references in some cases. */ omitOptionalSemicolons?: boolean /** * Only needed when operating dangerously with `omitOptionalSemicolons: true`. - * Create entities which don’t fail in attributes (`boolean?`, default: `false`). + * Create character references which don’t fail in attributes (`boolean?`, default: `false`). */ attribute?: boolean }