Skip to content

Commit

Permalink
Add support for generating semi-colonless references
Browse files Browse the repository at this point in the history
Which is quite useful when creating minifiers; but creates “invalid”
HTML.
  • Loading branch information
wooorm committed Jun 28, 2016
1 parent 936db06 commit c5e1c4f
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 30 deletions.
121 changes: 96 additions & 25 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

/* Dependencies. */
var entities = require('character-entities-html4');
var legacy = require('character-entities-legacy');
var dangerous = require('./lib/dangerous.json');
var EXPRESSION_NAMED = require('./lib/expression.js');

/* Methods. */
Expand All @@ -37,44 +39,97 @@ var EXPRESSION_SURROGATE_PAIR = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
var EXPRESSION_BMP = /[\x01-\t\x0B\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g;

/**
* Transform `code` into a hexadecimal character reference.
* Get the character code of the first character in `char`.
*
* @param {number} code - Number to encode.
* @return {string} - `code` encoded as hexadecimal.
* @param {string} char - Value.
* @return {number} - Character code of the first character.
*/
function charCode(char) {
  /* Numeric UTF-16 code unit at index 0 (`NaN` for an empty string). */
  var first = char.charCodeAt(0);

  return first;
}

/**
* Check whether `char` is an alphanumeric.
*
* @param {string} char - Value.
* @return {boolean} - Whether `char` is an
* alphanumeric.
*/
function characterCodeToHexadecimalReference(code) {
return '&#x' + code.toString(16).toUpperCase() + ';';
function isAlphanumeric(char) {
  var point = charCode(char);

  /* ASCII ranges only: `0`–`9`, `A`–`Z`, and `a`–`z`. */
  var digit = point >= 48 /* 0 */ && point <= 57 /* 9 */;
  var upper = point >= 65 /* A */ && point <= 90 /* Z */;
  var lower = point >= 97 /* a */ && point <= 122 /* z */;

  return digit || upper || lower;
}

/**
* Transform `character` into a hexadecimal character
* reference.
* Check whether `char` is a hexadecimal.
*
* @param {string} character - Character to encode.
* @return {string} - `character` encoded as hexadecimal.
* @param {string} char - Value.
* @return {boolean} - Whether `char` is a
* hexadecimal.
*/
function characterToHexadecimalReference(character) {
return characterCodeToHexadecimalReference(character.charCodeAt(0));
function isHexadecimal(char) {
  var point = charCode(char);

  /* ASCII hexadecimal digits: `0`–`9`, `A`–`F`, and `a`–`f`. */
  var digit = point >= 48 /* 0 */ && point <= 57 /* 9 */;
  var upper = point >= 65 /* A */ && point <= 70 /* F */;
  var lower = point >= 97 /* a */ && point <= 102 /* f */;

  return digit || upper || lower;
}

/**
* Transform `code` into a hexadecimal character reference.
*
* @param {number} code - Number to encode.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @return {string} - `code` encoded as hexadecimal.
*/
function toHexadecimalReference(code, omit, next) {
  var value = '&#x' + code.toString(16).toUpperCase();

  /* The terminating semicolon may only be dropped when:
   *
   * - the caller asked for it (`omit`);
   * - there is a following character (`next` is non-empty), so the
   *   reference is not left dangling at the end of the value;
   * - that character is not a literal `;`, which an HTML parser would
   *   consume as the terminator of this reference, silently dropping
   *   it from the decoded text;
   * - that character is not a hexadecimal digit, which would be read
   *   as part of the number. */
  if (omit && next && next !== ';' && !isHexadecimal(next)) {
    return value;
  }

  return value + ';';
}

/**
* Transform `name` into a named entity.
*
* @param {string} name - Name to wrap.
* @param {boolean?} [attribute] - Stringify as attribute.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @return {string} - `name` encoded as a named entity.
*/
function toNamedEntity(name) {
return '&' + name + ';';
function toNamedEntity(name, attribute, omit, next) {
  var value = '&' + name;

  /* The semicolon is only optional for legacy named references that
   * are not a prefix of some longer entity name (the `dangerous`
   * list). */
  var optional = has.call(legacy, name) && dangerous.indexOf(name) === -1;

  /* A literal `;` directly after the reference would be consumed by
   * the HTML parser as this reference's terminator, dropping it from
   * the decoded text, so the semicolon must stay in that case. */
  var swallowsNext = next === ';';

  /* In attribute mode a following character is required, and it must
   * not be `=` or an ASCII alphanumeric. */
  var safeInContext = !attribute ||
    (next && next !== '=' && !isAlphanumeric(next));

  if (omit && optional && !swallowsNext && safeInContext) {
    return value;
  }

  return value + ';';
}

/**
* Transform a character into a named entity.
*
* @param {string} character - Character to encode.
* @param {string} char - Character to encode.
* @param {boolean?} [attribute] - Stringify as attribute.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @return {string} - Character encoded as a named entity.
*/
function characterToNamedEntity(character) {
return toNamedEntity(characters[character]);
function characterToNamedEntity(char, attribute, omit, next) {
  /* Look up the entity name for `char` and forward every flag in the
   * order `toNamedEntity` expects. The caller passes
   * `(char, attribute, omit, next)`; accepting only `(char, omit)`
   * made `attribute` arrive in the `omit` slot and discarded the
   * real `omit` and `next`. */
  return toNamedEntity(characters[char], attribute, omit, next);
}

/**
Expand All @@ -98,38 +153,54 @@ function toExpression(characters) {
* - Subset of characters to encode.
* @param {boolean?} [options.useNamedReferences=false]
* - Whether to use entities where possible.
* @param {boolean?} [options.omitOptionalSemicolons=false]
* - Whether to omit optional semi-colons.
* @param {boolean?} [options.attribute=false]
* - Whether to stringify as an attribute.
* @return {string} - Encoded `value`.
*/
function encode(value, options) {
  var settings = options || {};
  var escapeOnly = settings.escapeOnly;
  var named = settings.useNamedReferences;
  var omit = settings.omitOptionalSemicolons;
  var attribute = settings.attribute;
  var subset = settings.subset;
  var map = named ? characters : null;
  var set = subset ? toExpression(subset) : EXPRESSION_ESCAPE;

  /* First pass: encode the requested subset (or the default escape
   * set).  `val` is the string being searched, so `pos` offsets are
   * always valid for it; the character following the match starts at
   * `pos + char.length`. */
  value = value.replace(set, function (char, pos, val) {
    var next = val.charAt(pos + char.length);

    return map && has.call(map, char) ?
      toNamedEntity(map[char], attribute, omit, next) :
      toHexadecimalReference(charCode(char), omit, next);
  });

  /* When only escaping, or when a subset was given, nothing else is
   * encoded. */
  if (subset || escapeOnly) {
    return value;
  }

  /* Second pass: characters which have a named reference. */
  if (named) {
    value = value.replace(EXPRESSION_NAMED, function (char, pos, val) {
      var next = val.charAt(pos + char.length);

      return characterToNamedEntity(char, attribute, omit, next);
    });
  }

  return value
    /* Third pass: astral characters, matched as a surrogate pair. */
    .replace(EXPRESSION_SURROGATE_PAIR, function (pair, pos, val) {
      return toHexadecimalReference(
        ((pair.charCodeAt(0) - 0xD800) * 0x400) +
        pair.charCodeAt(1) - 0xDC00 + 0x10000,
        omit,
        /* A pair spans two UTF-16 code units, so the following
         * character is at `pos + 2`; `pos + 1` is the pair's own low
         * surrogate, which is never a hexadecimal digit and would
         * make the semicolon look omittable every time. */
        val.charAt(pos + 2)
      );
    })
    /* Final pass: remaining non-ASCII and control characters. */
    .replace(EXPRESSION_BMP, function (char, pos, val) {
      var next = val.charAt(pos + 1);

      return toHexadecimalReference(charCode(char), omit, next);
    });
}

/**
Expand Down
10 changes: 10 additions & 0 deletions lib/dangerous.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[
"cent",
"copy",
"divide",
"gt",
"lt",
"not",
"para",
"times"
]
11 changes: 7 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,16 @@
"Titus Wormer <tituswormer@gmail.com> (http://wooorm.com)"
],
"files": [
"index.js",
"lib/expression.js"
"lib",
"index.js"
],
"dependencies": {
"character-entities-html4": "^1.0.0"
"character-entities-html4": "^1.0.0",
"character-entities-legacy": "^1.0.0"
},
"devDependencies": {
"browserify": "^13.0.0",
"character-entities": "^1.0.0",
"esmangle": "^1.0.0",
"istanbul": "^0.4.0",
"remark-cli": "^1.0.0",
Expand All @@ -42,9 +44,10 @@
"scripts": {
"build-md": "remark . --quiet --frail",
"build-expression": "node script/generate-expression.js",
"build-dangerous": "node script/generate-dangerous.js",
"build-bundle": "browserify index.js --bare -s stringifyEntities > stringify-entities.js",
"build-mangle": "esmangle stringify-entities.js > stringify-entities.min.js",
"build": "npm run build-md && npm run build-expression && npm run build-bundle && npm run build-mangle",
"build": "npm run build-expression && npm run build-dangerous && npm run build-md && npm run build-bundle && npm run build-mangle",
"lint": "xo",
"test-api": "node test.js",
"test-coverage": "istanbul cover test.js",
Expand Down
21 changes: 20 additions & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,22 @@ Whether to only escape the given subset of characters (`Array.<string>`).

###### `options.useNamedReferences`

Whether to use entities where possible. (`boolean?`, default: `false`).
Whether to use entities where possible (`boolean?`, default: `false`).

###### `options.omitOptionalSemicolons`

Whether to omit semi-colons when possible. **This creates parse
errors: don’t use this except when building a minifier** (`boolean?`,
default: `false`).

Omitting semi-colons is possible for [certain][dangerous] [legacy][]
named references, and numeric entities, in some cases.

###### `options.attribute`

Only needed when operating dangerously with `omitOptionalSemicolons: true`.
Create entities which don’t fail in attributes (`boolean?`, default:
`false`).

## License

Expand All @@ -92,3 +107,7 @@ Whether to use entities where possible. (`boolean?`, default: `false`).
[author]: http://wooorm.com

[npm]: https://docs.npmjs.com/cli/install

[dangerous]: lib/dangerous.json

[legacy]: https://github.com/wooorm/character-entities-legacy
52 changes: 52 additions & 0 deletions script/generate-dangerous.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
* @author Titus Wormer
* @copyright 2015 Titus Wormer
* @license MIT
* @module stringify-entities:script
* @fileoverview Generate a list of entities which might
* conflict when used without semi-colon.
* For example, we can’t minify `&not;in;` to `&notin;`,
* as that would render another entity.
*/

'use strict';

/* eslint-env node */

/* Dependencies. */
var fs = require('fs');
var path = require('path');
var legacy = Object.keys(require('character-entities-legacy'));
var entities = Object.keys(require('character-entities'));

/* Escape-codes. */
/* Collect, in their original order, the legacy names that are a
 * proper prefix of some other entity name: dropping the semicolon
 * after such a name could make it read as the longer entity. */
var conflict = legacy.filter(function (name) {
  return entities.some(function (other) {
    return other !== name && other.slice(0, name.length) === name;
  });
});

/* Write the list as pretty-printed JSON with a trailing newline. */
fs.writeFileSync(
  path.join('lib', 'dangerous.json'),
  JSON.stringify(conflict, null, 2) + '\n'
);
Loading

0 comments on commit c5e1c4f

Please sign in to comment.