Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] proposed new unicode module #8075

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions doc/api/buffer.md
Original file line number Diff line number Diff line change
Expand Up @@ -2304,6 +2304,27 @@ added: v3.0.0
On 32-bit architectures, this value is `(2^30)-1` (~1GB).
On 64-bit architectures, this value is `(2^31)-1` (~2GB).

## buffer.normalize(buf, form[, encoding])

* `buf` {Buffer} A `Buffer` instance
* `form` {String} A Unicode normalization form (one of: `'NFC'`, `'NFD'`,
`'NFKC'`, or `'NFKD'`)
* `encoding` {String} The source character encoding of the `buf`. Defaults to
`'utf8'`

Performs Unicode normalization on `buf` and returns a new `Buffer` instance
containing the UTF-8 encoded result. Throws if `form` does not specify a
valid normalization form or if the normalization cannot be successfully applied.

## buffer.transcode(buf, from_enc, to_enc)

* `buf` {Buffer} A `Buffer` instance
* `from_enc` {string} The current encoding
* `to_enc` {string} The target encoding

Re-encodes the given `Buffer` from one character encoding to another. Returns
a new `Buffer` instance.

## Class: SlowBuffer
<!-- YAML
deprecated: v6.0.0
Expand Down
18 changes: 18 additions & 0 deletions doc/api/util.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ module developers as well. It can be accessed using:
const util = require('util');
```

## util.constants

Constants for use with `util.getCharacterProperty()`.

## util.debuglog(section)
<!-- YAML
added: v0.11.3
Expand Down Expand Up @@ -133,6 +137,20 @@ Each argument is converted to a string using `util.inspect()`.
util.format(1, 2, 3); // '1 2 3'
```

## util.getCharacterProperty(codepoint, property)

* `codepoint` {number} A Unicode codepoint value
* `property` {number} A Unicode character property constant (from `util.constants.*`)

Returns a specific Unicode codepoint property for the given codepoint value.

## util.getColumnWidth(cp)

* `cp` {number | String} A Unicode codepoint value or a String

Returns the number of terminal columns to be used to display the given Unicode
codepoint or string.

## util.inherits(constructor, superConstructor)
<!-- YAML
added: v0.3.0
Expand Down
3 changes: 3 additions & 0 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const binding = process.binding('buffer');
const { isArrayBuffer, isSharedArrayBuffer } = process.binding('util');
const bindingObj = {};
const internalUtil = require('internal/util');
const internalBuffer = require('internal/buffer');

class FastBuffer extends Uint8Array {
constructor(arg1, arg2, arg3) {
Expand All @@ -19,6 +20,8 @@ exports.Buffer = Buffer;
exports.SlowBuffer = SlowBuffer;
exports.INSPECT_MAX_BYTES = 50;
exports.kMaxLength = binding.kMaxLength;
exports.transcode = internalBuffer.transcode;
exports.normalize = internalBuffer.normalize;

const kFromErrorMsg = 'First argument must be a string, Buffer, ' +
'ArrayBuffer, Array, or array-like object.';
Expand Down
96 changes: 96 additions & 0 deletions lib/internal/buffer.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
'use strict';

const Buffer = require('buffer').Buffer;
const normalizeEncoding = require('internal/util').normalizeEncoding;

if (process.binding('config').hasIntl) {

const icu = process.binding('icu');

// NOTE(review): this module reads `require('buffer').Buffer` at load time,
// while lib/buffer.js requires this module before it assigns `exports.Buffer`.
// Confirm the circular load does not leave `Buffer` undefined in here.

// Rejects buffers that cannot hold a whole number of 2-byte UCS2 code units.
// Throws a TypeError when the byte length is odd.
const assertUcs2Length = (source) => {
  if (source.length % 2 !== 0)
    throw new TypeError('Invalid UCS2 Buffer');
};

// Maps the supported transcoding conversions. The top-level key is the
// from_enc, the child key is the to_enc. The value is the transcoding
// function to use: it takes the source Buffer and returns the converted result.
// Converter names are spelled consistently ('us-ascii', 'iso-8859-1', 'utf-8').
const conversions = {
  'ascii': {
    // No ICU call is made for these two targets; the source bytes are copied.
    'latin1': (source) => {
      return Buffer.from(source);
    },
    'utf8': (source) => {
      return Buffer.from(source);
    },
    'utf16le': (source) => {
      return icu.convertToUcs2('us-ascii', source);
    }
  },
  'latin1': {
    'ascii': (source) => {
      return icu.convert('us-ascii', 'iso-8859-1', source);
    },
    'utf8': (source) => {
      return icu.convert('utf-8', 'iso-8859-1', source);
    },
    'utf16le': (source) => {
      return icu.convertToUcs2('iso-8859-1', source);
    }
  },
  'utf8': {
    'ascii': (source) => {
      return icu.convert('us-ascii', 'utf-8', source);
    },
    'latin1': (source) => {
      return icu.convert('iso-8859-1', 'utf-8', source);
    },
    // The binding function is used directly; it receives the source Buffer.
    'utf16le': icu.convertToUcs2FromUtf8,
  },
  'utf16le': {
    // Every UCS2 source is length-checked before being handed to ICU.
    'ascii': (source) => {
      assertUcs2Length(source);
      return icu.convertFromUcs2('us-ascii', source);
    },
    'latin1': (source) => {
      assertUcs2Length(source);
      return icu.convertFromUcs2('iso-8859-1', source);
    },
    'utf8': (source) => {
      assertUcs2Length(source);
      return icu.convertToUtf8FromUcs2(source);
    }
  }
};

// Transcodes the Buffer from one encoding to another, returning a new
// Buffer instance.
exports.transcode = function transcode(source, from_enc, to_enc) {
if (!source || !(source.buffer instanceof ArrayBuffer))
throw new TypeError('"source" argument must be a Buffer');
if (source.length === 0) return Buffer.alloc(0);

from_enc = normalizeEncoding(from_enc) || from_enc;
to_enc = normalizeEncoding(to_enc) || to_enc;

if (from_enc === to_enc)
return Buffer.from(source);

const cnv_from = conversions[from_enc];

if (cnv_from) {
const cnv_to = cnv_from[to_enc];
if (cnv_to)
return cnv_to(source);
}
throw new TypeError(`Unsupported conversion: ${from_enc} to ${to_enc}`);
};

// Perform Unicode Normalization on the Buffer.
exports.normalize = function normalize(buf, form, encoding) {
if (!buf || !(buf.buffer instanceof ArrayBuffer))
throw new TypeError('First argument must be a Buffer');
encoding = normalizeEncoding(encoding);
if (encoding === 'ascii')
encoding == 'us-ascii';
return icu.normalize(buf, encoding, String(form).toUpperCase());
};

}
172 changes: 99 additions & 73 deletions lib/internal/readline.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,102 +2,128 @@

// Regexes used for ansi escape code splitting
// eslint-disable-next-line no-control-regex
const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
'(\\d+)(?:;(\\d+))?([~^$])',
'(?:M([@ #!a`])(.)(.))', // mouse
'(?:1;)?(\\d+)?([a-zA-Z])'
].join('|') + ')');
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
const ansi =
/[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;


module.exports = {
exports = module.exports = {
emitKeys,
getStringWidth,
isFullWidthCodePoint,
stripVTControlCharacters
};


/**
* Returns the number of columns required to display the given string.
*/
function getStringWidth(str) {
let width = 0;

str = stripVTControlCharacters(str);

for (let i = 0; i < str.length; i++) {
const code = str.codePointAt(i);
if (process.binding('config').hasIntl) {
const util = require('util');
exports.getStringWidth = function getStringWidth(str) {
return util.getColumnWidth(stripVTControlCharacters(str));
};

exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
// Defined here largely for legacy support reasons. Updated to
// use character properties rather than fixed ranges.
const eaw =
util.getCharacterProperty(code,
util.constants.UCHAR_EAST_ASIAN_WIDTH);
const emoji =
util.getCharacterProperty(code,
util.constants.UCHAR_EMOJI_PRESENTATION) &&
!util.getCharacterProperty(code,
util.constants.UCHAR_EMOJI_MODIFIER);
return eaw === util.constants.U_EA_FULLWIDTH ||
eaw === util.constants.U_EA_WIDE ||
emoji;
};

} else {
// These old implementations are used as fallbacks only when Node.js
// is compiled without ICU. The getStringWidth implementation here is
// about 30% slower than the ICU based implementation and does not
// work properly for emoji and newer unicode characters. The new impl
// uses ICU's built in character properties data to provide more accurate
// results.
/**
* Returns the number of columns required to display the given string.
*/
function getStringWidth(str) {
let width = 0;

str = stripVTControlCharacters(str);

for (let i = 0; i < str.length; i++) {
const code = str.codePointAt(i);

if (code >= 0x10000) { // surrogates
i++;
}

if (code >= 0x10000) { // surrogates
i++;
if (isFullWidthCodePoint(code)) {
width += 2;
} else {
width++;
}
}

if (isFullWidthCodePoint(code)) {
width += 2;
} else {
width++;
}
return width;
}

return width;
}

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
function isFullWidthCodePoint(code) {
if (isNaN(code)) {
return false;
}

/**
* Returns true if the character represented by a given
* Unicode code point is full-width. Otherwise returns false.
*/
function isFullWidthCodePoint(code) {
if (isNaN(code)) {
return false;
}
// Code points are derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
0x3250 <= code && code <= 0x4dbf ||
// CJK Unified Ideographs .. Yi Radicals
0x4e00 <= code && code <= 0xa4c6 ||
// Hangul Jamo Extended-A
0xa960 <= code && code <= 0xa97c ||
// Hangul Syllables
0xac00 <= code && code <= 0xd7a3 ||
// CJK Compatibility Ideographs
0xf900 <= code && code <= 0xfaff ||
// Vertical Forms
0xfe10 <= code && code <= 0xfe19 ||
// CJK Compatibility Forms .. Small Form Variants
0xfe30 <= code && code <= 0xfe6b ||
// Halfwidth and Fullwidth Forms
0xff01 <= code && code <= 0xff60 ||
0xffe0 <= code && code <= 0xffe6 ||
// Kana Supplement
0x1b000 <= code && code <= 0x1b001 ||
// Enclosed Ideographic Supplement
0x1f200 <= code && code <= 0x1f251 ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
0x20000 <= code && code <= 0x3fffd)) {
return true;
}

// Code points are derived from:
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
if (code >= 0x1100 && (
code <= 0x115f || // Hangul Jamo
0x2329 === code || // LEFT-POINTING ANGLE BRACKET
0x232a === code || // RIGHT-POINTING ANGLE BRACKET
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
(0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
0x3250 <= code && code <= 0x4dbf ||
// CJK Unified Ideographs .. Yi Radicals
0x4e00 <= code && code <= 0xa4c6 ||
// Hangul Jamo Extended-A
0xa960 <= code && code <= 0xa97c ||
// Hangul Syllables
0xac00 <= code && code <= 0xd7a3 ||
// CJK Compatibility Ideographs
0xf900 <= code && code <= 0xfaff ||
// Vertical Forms
0xfe10 <= code && code <= 0xfe19 ||
// CJK Compatibility Forms .. Small Form Variants
0xfe30 <= code && code <= 0xfe6b ||
// Halfwidth and Fullwidth Forms
0xff01 <= code && code <= 0xff60 ||
0xffe0 <= code && code <= 0xffe6 ||
// Kana Supplement
0x1b000 <= code && code <= 0x1b001 ||
// Enclosed Ideographic Supplement
0x1f200 <= code && code <= 0x1f251 ||
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
0x20000 <= code && code <= 0x3fffd)) {
return true;
return false;
}

return false;
exports.isFullWidthCodePoint = isFullWidthCodePoint;
exports.getStringWidth = getStringWidth;
}


/**
* Tries to remove all VT control characters. Use to estimate displayed
* string width. May be buggy due to not running a real state machine
*/
function stripVTControlCharacters(str) {
str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
return str.replace(ansi, '');
}


Expand Down
12 changes: 12 additions & 0 deletions lib/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -1053,3 +1053,15 @@ exports._exceptionWithHostPort = function(err,
}
return ex;
};

// Expose the ICU-backed Unicode helpers only when the `config` binding reports
// Intl support.
if (process.binding('config').hasIntl) {
  const { getCharacterProperty, getColumnWidth } = process.binding('icu');
  const icuConstants = process.binding('constants').icu;

  // `constants` is defined as an enumerable, non-configurable data property;
  // `writable` is left at its default (false).
  Object.defineProperty(exports, 'constants', {
    value: icuConstants,
    enumerable: true,
    configurable: false
  });

  exports.getCharacterProperty = getCharacterProperty;
  exports.getColumnWidth = getColumnWidth;
}
1 change: 1 addition & 0 deletions node.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
'lib/v8.js',
'lib/vm.js',
'lib/zlib.js',
'lib/internal/buffer.js',
'lib/internal/child_process.js',
'lib/internal/cluster.js',
'lib/internal/freelist.js',
Expand Down
Loading