Skip to content

Commit

Permalink
Add useShortestReferences option
Browse files Browse the repository at this point in the history
...which encodes using named entities only if that results in fewer
bytes.

Additionally, refactor the algorithm to use fewer regexes; fix code
examples; and update docs.
  • Loading branch information
wooorm committed Jul 1, 2016
1 parent 78db02c commit a2ea7aa
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 120 deletions.
98 changes: 51 additions & 47 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
var entities = require('character-entities-html4');
var legacy = require('character-entities-legacy');
var dangerous = require('./lib/dangerous.json');
var EXPRESSION_NAMED = require('./lib/expression.js');

/* Methods. */
var has = {}.hasOwnProperty;
Expand All @@ -33,9 +32,13 @@ var characters = {};
}
})();

/* Regular expressions. */
/* Default escapes. */
var EXPRESSION_ESCAPE = toExpression(escapes);

/* Surrogate pairs. */
var EXPRESSION_SURROGATE_PAIR = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;

/* Non-ASCII characters. */
var EXPRESSION_BMP = /[\x01-\t\x0B\f\x0E-\x1F\x7F\x81\x8D\x8F\x90\x9D\xA0-\uFFFF]/g;

/**
Expand Down Expand Up @@ -82,11 +85,11 @@ function isHexadecimal(char) {
* Transform `code` into a hexadecimal character reference.
*
* @param {number} code - Number to encode.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @return {string} - `code` encoded as hexadecimal.
*/
function toHexadecimalReference(code, omit, next) {
function toHexReference(code, next, omit) {
var value = '&#x' + code.toString(16).toUpperCase();

return omit && next && !isHexadecimal(next) ? value : value + ';';
Expand All @@ -96,42 +99,26 @@ function toHexadecimalReference(code, omit, next) {
* Transform `name` into a named entity.
*
* @param {string} name - Name to wrap.
* @param {boolean?} [attribute] - Stringify as attribute.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {boolean?} [attribute] - Stringify as attribute.
* @return {string} - `name` encoded as a named entity.
*/
function toNamedEntity(name, attribute, omit, next) {
function toNamed(name, next, omit, attribute) {
var value = '&' + name;

if (
omit &&
has.call(legacy, name) &&
dangerous.indexOf(name) === -1 &&
(
!attribute ||
(next && next !== '=' && !isAlphanumeric(next))
)
(!attribute || (next && next !== '=' && !isAlphanumeric(next)))
) {
return value;
}

return value + ';';
}

/**
* Transform `code` into an entity.
*
* @param {string} char - Character to encode.
* @param {boolean?} [attribute] - Stringify as attribute.
* @param {boolean?} [omit] - Omit optional semi-colons.
* @param {string?} [next] - Next character.
* @return {string} - `name` encoded as hexadecimal.
*/
function characterToNamedEntity(char, omit) {
return toNamedEntity(characters[char], omit);
}

/**
* Create an expression for `characters`.
*
Expand All @@ -142,6 +129,38 @@ function toExpression(characters) {
return new RegExp('[' + characters.join('') + ']', 'g');
}

/**
 * Pick the character reference used for a single character.
 *
 * A named reference is built when `options.useNamedReferences` or
 * `options.useShortestReferences` is set and `char` is an own key of
 * `characters`.  Without a named reference the hexadecimal reference
 * is returned.  With `useShortestReferences`, the named reference is
 * returned only when it is strictly shorter than the hexadecimal one.
 *
 * @param {string} char - Character to encode.
 * @param {string} next - Character following `char`.
 * @param {Object} options - Configuration.
 * @return {string} - Character reference for `char`.
 */
function one(char, next, options) {
  var preferShortest = options.useShortestReferences;
  var dropSemicolon = options.omitOptionalSemicolons;
  var wantsNamed = preferShortest || options.useNamedReferences;
  var namedReference;
  var hexReference;

  if (wantsNamed && has.call(characters, char)) {
    namedReference = toNamed(
      characters[char],
      next,
      dropSemicolon,
      options.attribute
    );
  }

  /* No named candidate: the hexadecimal form is the only choice. */
  if (!namedReference) {
    return toHexReference(charCode(char), next, dropSemicolon);
  }

  /* Named references were requested unconditionally. */
  if (!preferShortest) {
    return namedReference;
  }

  /* Both candidates exist: ties go to the hexadecimal form. */
  hexReference = toHexReference(charCode(char), next, dropSemicolon);

  return namedReference.length < hexReference.length ?
    namedReference :
    hexReference;
}

/**
* Encode special characters in `value`.
*
Expand All @@ -161,45 +180,30 @@ function toExpression(characters) {
*/
function encode(value, options) {
var settings = options || {};
var escapeOnly = settings.escapeOnly;
var named = settings.useNamedReferences;
var omit = settings.omitOptionalSemicolons;
var attribute = settings.attribute;
var subset = settings.subset;
var map = named ? characters : null;
var set = subset ? toExpression(subset) : EXPRESSION_ESCAPE;
var escapeOnly = settings.escapeOnly;
var omit = settings.omitOptionalSemicolons;

value = value.replace(set, function (char, pos) {
var next = value.charAt(pos + 1);

return map && has.call(map, char) ?
toNamedEntity(map[char], attribute, omit, next) :
toHexadecimalReference(charCode(char), omit, next);
value = value.replace(set, function (char, pos, val) {
return one(char, val.charAt(pos + 1), settings);
});

if (subset || escapeOnly) {
return value;
}

if (named) {
value = value.replace(EXPRESSION_NAMED, function (char, pos) {
var next = value.charAt(pos + 1);
return characterToNamedEntity(char, attribute, omit, next);
});
}

return value
.replace(EXPRESSION_SURROGATE_PAIR, function (pair, pos, val) {
return toHexadecimalReference(
return toHexReference(
((pair.charCodeAt(0) - 0xD800) * 0x400) +
pair.charCodeAt(1) - 0xDC00 + 0x10000,
omit,
val.charAt(pos + 1)
val.charAt(pos + 2),
omit
);
})
.replace(EXPRESSION_BMP, function (char, pos, val) {
var next = val.charAt(pos + 1);
return toHexadecimalReference(charCode(char), omit, next);
return one(char, val.charAt(pos + 1), settings);
});
}

Expand Down
8 changes: 0 additions & 8 deletions lib/expression.js

This file was deleted.

3 changes: 1 addition & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,10 @@
},
"scripts": {
"build-md": "remark . --quiet --frail",
"build-expression": "node script/generate-expression.js",
"build-dangerous": "node script/generate-dangerous.js",
"build-bundle": "browserify index.js --bare -s stringifyEntities > stringify-entities.js",
"build-mangle": "esmangle stringify-entities.js > stringify-entities.min.js",
"build": "npm run build-expression && npm run build-dangerous && npm run build-md && npm run build-bundle && npm run build-mangle",
"build": "npm run build-dangerous && npm run build-md && npm run build-bundle && npm run build-mangle",
"lint": "xo",
"test-api": "node test.js",
"test-coverage": "istanbul cover test.js",
Expand Down
46 changes: 36 additions & 10 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,20 @@ Encode HTML character references and character entities.

* [x] Very fast;
* [x] Just the encoding part;
* [x] Reliable: ``"`"`` characters are escaped to ensure no scripts
run in IE6-8. Additionally, only named entities recognized by HTML4
* [x] Reliable: ``'`'`` characters are escaped to ensure no scripts
run in IE6-8. Additionally, only named entities recognised by HTML4
are encoded, meaning the infamous `&apos;` (which people think is a
[virus](http://www.telegraph.co.uk/technology/advice/10516839/Why-do-some-apostrophes-get-replaced-with-andapos.html))
won’t show up.
[virus][]) won’t show up.

## Algorithm

By default, all dangerous, non-ASCII, or non-printable ASCII characters
are encoded. A [subset][] of characters can be given to encode just
those characters. Alternatively, pass [`escapeOnly`][escapeonly] to
escape just the dangerous characters (`"`, `'`, `<`, `>`, `&`, `` ` ``).
By default, numeric entities are used. Pass [`useNamedReferences`][named]
to use named entities when possible, or [`useShortestReferences`][short]
to use them if that results in fewer bytes.

## Installation

Expand All @@ -28,7 +37,7 @@ module, [uncompressed and compressed][releases].
```js
var stringify = require('stringify-entities');

stringify.encode('alpha © bravo ≠ charlie 𝌆 delta');
stringify('alpha © bravo ≠ charlie 𝌆 delta');
```

Yields:
Expand All @@ -40,7 +49,7 @@ alpha &#xA9; bravo &#x2260; charlie &#x1D306; delta
…and with `useNamedReferences: true`.

```js
stringify.encode('alpha © bravo ≠ charlie 𝌆 delta', { useNamedReferences: true });
stringify('alpha © bravo ≠ charlie 𝌆 delta', { useNamedReferences: true });
```

Yields:
Expand Down Expand Up @@ -69,13 +78,20 @@ Whether to only escape the given subset of characters (`Array.<string>`).

###### `options.useNamedReferences`

Whether to use entities where possible (`boolean?`, default: `false`).
Whether to use named entities where possible (`boolean?`, default:
`false`).

###### `options.useShortestReferences`

Whether to use named entities, where possible, if that results in fewer
bytes (`boolean?`, default: `false`). **Note**: `useNamedReferences`
can be omitted when using `useShortestReferences`.

###### `options.omitOptionalSemicolons`

Whether to use omit semi-colons when possible. **This creates parse
errors: don’t do this unless when building a minifier** (`boolean?`,
default: `false`).
Whether to omit semi-colons when possible (`boolean?`, default: `false`).
**Note**: This creates parse errors: don’t use this except when building
a minifier.

Omitting semi-colons is possible for [certain][dangerous] [legacy][]
named references, and numeric entities, in some cases.
Expand Down Expand Up @@ -108,6 +124,16 @@ Create entities which don’t fail in attributes (`boolean?`, default:

[npm]: https://docs.npmjs.com/cli/install

[virus]: http://www.telegraph.co.uk/technology/advice/10516839/Why-do-some-apostrophes-get-replaced-with-andapos.html

[dangerous]: lib/dangerous.json

[legacy]: https://github.com/wooorm/character-entities-legacy

[subset]: #optionssubset

[escapeonly]: #optionsescapeonly

[named]: #optionsusenamedreferences

[short]: #optionsuseshortestreferences
2 changes: 1 addition & 1 deletion script/generate-dangerous.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ var entities = Object.keys(require('character-entities'));
/* Escape-codes. */
var conflict = [];

/* Generate the expression. */
/* Generate the list. */
var length = legacy.length;
var count = entities.length;
var index = -1;
Expand Down
46 changes: 0 additions & 46 deletions script/generate-expression.js

This file was deleted.

18 changes: 12 additions & 6 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@
var test = require('tape');
var stringify = require('./');

var named = {
useNamedReferences: true
};

/* Tests. */
test('stringifyEntities.escape(value)', function (t) {
t.equal(
Expand Down Expand Up @@ -46,9 +42,19 @@ test('stringifyEntities(value[, options])', function (t) {
);

t.equal(
stringify('foo\xA9bar\uD834\uDF06baz\u2603qux', named),
stringify('foo\xA9bar\uD834\uDF06baz\u2603qux', {
useNamedReferences: true
}),
'foo&copy;bar&#x1D306;baz&#x2603;qux',
'Other non-ASCII symbols are represented through hexadecimal escapes'
'Should use named entities if `useNamedReferences` and possible'
);

t.equal(
stringify('alpha © bravo ≠ charlie 𝌆 delta', {
useShortestReferences: true
}),
'alpha &#xA9; bravo &ne; charlie &#x1D306; delta',
'Should use shortest entities if `useShortestReferences`'
);

t.equal(
Expand Down

0 comments on commit a2ea7aa

Please sign in to comment.