nodejs · jasnell · Apr 14, 2016
diff --git a/doc/api/buffer.md b/doc/api/buffer.md
@@ -2304,6 +2304,27 @@ added: v3.0.0
 On 32-bit architectures, this value is `(2^30)-1` (~1GB).
 On 64-bit architectures, this value is `(2^31)-1` (~2GB).
 
+## buffer.normalize(buf, form[, encoding])
+
+* `buf` {Buffer} A `Buffer` instance
+* `form` {String} A Unicode normalization form (one of: `'NFC'`, `'NFD'`,
+  `'NFKC'`, or `'NFKD'`)
+* `encoding` {String} The source character encoding of the `buf`. Defaults to
+  `'utf8'`
+
+Applies Unicode normalization to `buf` and returns a new `Buffer` instance
+containing the UTF-8 encoded result. Throws if `form` does not specify a
+valid normalization form or if the normalization cannot be applied.
+
+## buffer.transcode(buf, from_enc, to_enc)
+
+* `buf` {Buffer} A `Buffer` instance
+* `from_enc` {string} The current encoding
+* `to_enc` {string} The target encoding
+
+Re-encodes the given `Buffer` from one character encoding to another. Returns
+a new `Buffer` instance.
+
 ## Class: SlowBuffer
 <!-- YAML
 deprecated: v6.0.0

diff --git a/doc/api/util.md b/doc/api/util.md
@@ -10,6 +10,10 @@ module developers as well. It can be accessed using:
 const util = require('util');
 ```
 
+## util.constants
+
+Constants for use with `util.getCharacterProperty()`.
+
 ## util.debuglog(section)
 <!-- YAML
 added: v0.11.3
@@ -133,6 +137,20 @@ Each argument is converted to a string using `util.inspect()`.
 util.format(1, 2, 3); // '1 2 3'
 ```
 
+## util.getCharacterProperty(codepoint, property)
+
+* `codepoint` {number} A Unicode codepoint value
+* `property` {number} A Unicode property constant (from `util.constants.*`)
+
+Returns a specific Unicode codepoint property for the given codepoint value.
+
+## util.getColumnWidth(cp)
+
+* `cp` {number|string} A Unicode codepoint value or a string
+
+Returns the number of terminal columns to be used to display the given Unicode
+codepoint or string.
+
 ## util.inherits(constructor, superConstructor)
 <!-- YAML
 added: v0.3.0

diff --git a/lib/buffer.js b/lib/buffer.js
@@ -5,6 +5,7 @@ const binding = process.binding('buffer');
 const { isArrayBuffer, isSharedArrayBuffer } = process.binding('util');
 const bindingObj = {};
 const internalUtil = require('internal/util');
+const internalBuffer = require('internal/buffer');
 
 class FastBuffer extends Uint8Array {
   constructor(arg1, arg2, arg3) {
@@ -19,6 +20,8 @@ exports.Buffer = Buffer;
 exports.SlowBuffer = SlowBuffer;
 exports.INSPECT_MAX_BYTES = 50;
 exports.kMaxLength = binding.kMaxLength;
+exports.transcode = internalBuffer.transcode;
+exports.normalize = internalBuffer.normalize;
 
 const kFromErrorMsg = 'First argument must be a string, Buffer, ' +
                       'ArrayBuffer, Array, or array-like object.';

diff --git a/lib/internal/buffer.js b/lib/internal/buffer.js
@@ -0,0 +1,96 @@
+'use strict';
+
+const Buffer = require('buffer').Buffer;
+const normalizeEncoding = require('internal/util').normalizeEncoding;
+
+if (process.binding('config').hasIntl) {
+
+  const icu = process.binding('icu');
+
+  // Supported transcodings, keyed by from_enc and then by to_enc. Each value
+  // is the conversion function; icu.convert() takes the target charset first.
+  const conversions = {
+    'ascii': {
+      'latin1': (source) => {
+        return Buffer.from(source);
+      },
+      'utf8': (source) => {
+        return Buffer.from(source);
+      },
+      'utf16le': (source) => {
+        return icu.convertToUcs2('us-ascii', source);
+      }
+    },
+    'latin1': {
+      'ascii': (source) => {
+        return icu.convert('us-ascii', 'iso-8859-1', source);
+      },
+      'utf8': (source) => {
+        return icu.convert('utf-8', 'iso-8859-1', source);
+      },
+      'utf16le': (source) => {
+        return icu.convertToUcs2('iso-8859-1', source);
+      }
+    },
+    'utf8': {
+      'ascii': (source) => {
+        return icu.convert('us-ascii', 'utf-8', source);
+      },
+      'latin1': (source) => {
+        return icu.convert('iso-8859-1', 'utf-8', source);
+      },
+      'utf16le': icu.convertToUcs2FromUtf8,
+    },
+    'utf16le': {
+      'ascii': (source) => {
+        if (source.length % 2 !== 0)
+          throw new TypeError('Invalid UCS2 Buffer');
+        return icu.convertFromUcs2('us-ascii', source);
+      },
+      'latin1': (source) => {
+        if (source.length % 2 !== 0)
+          throw new TypeError('Invalid UCS2 Buffer');
+        return icu.convertFromUcs2('iso-8859-1', source);
+      },
+      'utf8': (source) => {
+        if (source.length % 2 !== 0)
+          throw new TypeError('Invalid UCS2 Buffer');
+        return icu.convertToUtf8FromUcs2(source);
+      }
+    }
+  };
+
+  // Re-encodes `source` from `from_enc` to `to_enc`, returning a new Buffer
+  // (empty input short-circuits). Throws TypeError on bad source/conversion.
+  exports.transcode = function transcode(source, from_enc, to_enc) {
+    if (!source || !(source.buffer instanceof ArrayBuffer))
+      throw new TypeError('"source" argument must be a Buffer');
+    if (source.length === 0) return Buffer.alloc(0);
+
+    from_enc = normalizeEncoding(from_enc) || from_enc;
+    to_enc = normalizeEncoding(to_enc) || to_enc;
+
+    if (from_enc === to_enc)
+      return Buffer.from(source);
+
+    const cnv_from = conversions[from_enc];
+
+    if (cnv_from) {
+      const cnv_to = cnv_from[to_enc];
+      if (cnv_to)
+        return cnv_to(source);
+    }
+    throw new TypeError(`Unsupported conversion: ${from_enc} to ${to_enc}`);
+  };
+
+  // Unicode-normalizes `buf` (decoded as `encoding`); returns a UTF-8 Buffer.
+  exports.normalize = function normalize(buf, form, encoding) {
+    if (!buf || !(buf.buffer instanceof ArrayBuffer))
+      throw new TypeError('First argument must be a Buffer');
+    encoding = normalizeEncoding(encoding);
+    if (encoding === 'ascii')
+      encoding = 'us-ascii';  // ICU converter name used for ASCII
+    return icu.normalize(buf, encoding, String(form).toUpperCase());
+  };
+
+}
diff --git a/lib/internal/readline.js b/lib/internal/readline.js
@@ -2,102 +2,128 @@
 
 // Regexes used for ansi escape code splitting
 // eslint-disable-next-line no-control-regex
-const metaKeyCodeReAnywhere = /(?:\x1b)([a-zA-Z0-9])/;
-const functionKeyCodeReAnywhere = new RegExp('(?:\x1b+)(O|N|\\[|\\[\\[)(?:' + [
-  '(\\d+)(?:;(\\d+))?([~^$])',
-  '(?:M([@ #!a`])(.)(.))', // mouse
-  '(?:1;)?(\\d+)?([a-zA-Z])'
-].join('|') + ')');
+// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
+// License: MIT, authors: @sindresorhus, Qix-, and arjunmehta
+const ansi =
+  /[\u001b\u009b][[()#;?]*(?:[0-9]{1,4}(?:;[0-9]{0,4})*)?[0-9A-ORZcf-nqry=><]/g;
 
-
-module.exports = {
+exports = module.exports = {
   emitKeys,
-  getStringWidth,
-  isFullWidthCodePoint,
   stripVTControlCharacters
 };
 
 
-/**
- * Returns the number of columns required to display the given string.
- */
-function getStringWidth(str) {
-  let width = 0;
-
-  str = stripVTControlCharacters(str);
-
-  for (let i = 0; i < str.length; i++) {
-    const code = str.codePointAt(i);
+if (process.binding('config').hasIntl) {
+  const util = require('util');
+  exports.getStringWidth = function getStringWidth(str) {
+    return util.getColumnWidth(stripVTControlCharacters(str));
+  };
+
+  exports.isFullWidthCodePoint = function isFullWidthCodePoint(code) {
+    // Kept for legacy callers. Wide means East Asian Width FULLWIDTH/WIDE, or
+    // an emoji-presentation code point that is not an emoji modifier.
+    const eaw =
+      util.getCharacterProperty(code,
+                                util.constants.UCHAR_EAST_ASIAN_WIDTH);
+    const emoji =
+      util.getCharacterProperty(code,
+                                util.constants.UCHAR_EMOJI_PRESENTATION) &&
+      !util.getCharacterProperty(code,
+                                 util.constants.UCHAR_EMOJI_MODIFIER);
+    return eaw === util.constants.U_EA_FULLWIDTH ||
+           eaw === util.constants.U_EA_WIDE ||
+           emoji;
+  };
+
+} else {
+  // These old implementations are used as fallbacks only when Node.js
+  // is compiled without ICU. The getStringWidth implementation here is
+  // about 30% slower than the ICU based implementation and does not
+  // work properly for emoji and newer unicode characters. The new impl
+  // uses ICU's built in character properties data to provide more accurate
+  // results.
+  /**
+   * Returns the number of columns required to display the given string.
+   */
+  function getStringWidth(str) {
+    let width = 0;
+
+    str = stripVTControlCharacters(str);
+
+    for (let i = 0; i < str.length; i++) {
+      const code = str.codePointAt(i);
+
+      if (code >= 0x10000) { // surrogates
+        i++;
+      }
 
-    if (code >= 0x10000) { // surrogates
-      i++;
+      if (isFullWidthCodePoint(code)) {
+        width += 2;
+      } else {
+        width++;
+      }
     }
 
-    if (isFullWidthCodePoint(code)) {
-      width += 2;
-    } else {
-      width++;
-    }
+    return width;
   }
 
-  return width;
-}
 
+  /**
+   * Returns true if the character represented by a given
+   * Unicode code point is full-width. Otherwise returns false.
+   */
+  function isFullWidthCodePoint(code) {
+    if (isNaN(code)) {
+      return false;
+    }
 
-/**
- * Returns true if the character represented by a given
- * Unicode code point is full-width. Otherwise returns false.
- */
-function isFullWidthCodePoint(code) {
-  if (isNaN(code)) {
-    return false;
-  }
+    // Code points are derived from:
+    // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
+    if (code >= 0x1100 && (
+        code <= 0x115f ||  // Hangul Jamo
+        0x2329 === code || // LEFT-POINTING ANGLE BRACKET
+        0x232a === code || // RIGHT-POINTING ANGLE BRACKET
+        // CJK Radicals Supplement .. Enclosed CJK Letters and Months
+        (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
+        // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
+        0x3250 <= code && code <= 0x4dbf ||
+        // CJK Unified Ideographs .. Yi Radicals
+        0x4e00 <= code && code <= 0xa4c6 ||
+        // Hangul Jamo Extended-A
+        0xa960 <= code && code <= 0xa97c ||
+        // Hangul Syllables
+        0xac00 <= code && code <= 0xd7a3 ||
+        // CJK Compatibility Ideographs
+        0xf900 <= code && code <= 0xfaff ||
+        // Vertical Forms
+        0xfe10 <= code && code <= 0xfe19 ||
+        // CJK Compatibility Forms .. Small Form Variants
+        0xfe30 <= code && code <= 0xfe6b ||
+        // Halfwidth and Fullwidth Forms
+        0xff01 <= code && code <= 0xff60 ||
+        0xffe0 <= code && code <= 0xffe6 ||
+        // Kana Supplement
+        0x1b000 <= code && code <= 0x1b001 ||
+        // Enclosed Ideographic Supplement
+        0x1f200 <= code && code <= 0x1f251 ||
+        // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
+        0x20000 <= code && code <= 0x3fffd)) {
+      return true;
+    }
 
-  // Code points are derived from:
-  // http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
-  if (code >= 0x1100 && (
-      code <= 0x115f ||  // Hangul Jamo
-      0x2329 === code || // LEFT-POINTING ANGLE BRACKET
-      0x232a === code || // RIGHT-POINTING ANGLE BRACKET
-      // CJK Radicals Supplement .. Enclosed CJK Letters and Months
-      (0x2e80 <= code && code <= 0x3247 && code !== 0x303f) ||
-      // Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
-      0x3250 <= code && code <= 0x4dbf ||
-      // CJK Unified Ideographs .. Yi Radicals
-      0x4e00 <= code && code <= 0xa4c6 ||
-      // Hangul Jamo Extended-A
-      0xa960 <= code && code <= 0xa97c ||
-      // Hangul Syllables
-      0xac00 <= code && code <= 0xd7a3 ||
-      // CJK Compatibility Ideographs
-      0xf900 <= code && code <= 0xfaff ||
-      // Vertical Forms
-      0xfe10 <= code && code <= 0xfe19 ||
-      // CJK Compatibility Forms .. Small Form Variants
-      0xfe30 <= code && code <= 0xfe6b ||
-      // Halfwidth and Fullwidth Forms
-      0xff01 <= code && code <= 0xff60 ||
-      0xffe0 <= code && code <= 0xffe6 ||
-      // Kana Supplement
-      0x1b000 <= code && code <= 0x1b001 ||
-      // Enclosed Ideographic Supplement
-      0x1f200 <= code && code <= 0x1f251 ||
-      // CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
-      0x20000 <= code && code <= 0x3fffd)) {
-    return true;
+    return false;
   }
 
-  return false;
+  exports.isFullWidthCodePoint = isFullWidthCodePoint;
+  exports.getStringWidth = getStringWidth;
 }
 
-
 /**
  * Tries to remove all VT control characters. Use to estimate displayed
  * string width. May be buggy due to not running a real state machine
  */
 function stripVTControlCharacters(str) {
-  str = str.replace(new RegExp(functionKeyCodeReAnywhere.source, 'g'), '');
-  return str.replace(new RegExp(metaKeyCodeReAnywhere.source, 'g'), '');
+  return str.replace(ansi, '');
 }
 
 

diff --git a/lib/util.js b/lib/util.js
@@ -1053,3 +1053,15 @@ exports._exceptionWithHostPort = function(err,
   }
   return ex;
 };
+
+if (process.binding('config').hasIntl) {
+  // ICU-backed helpers; only exposed when the build has Intl support.
+  const icu = process.binding('icu');
+  Object.defineProperty(exports, 'constants', {
+    configurable: false,
+    enumerable: true,
+    value: process.binding('constants').icu
+  });
+  exports.getCharacterProperty = icu.getCharacterProperty;
+  exports.getColumnWidth = icu.getColumnWidth;
+}
diff --git a/node.gyp b/node.gyp
@@ -74,6 +74,7 @@
       'lib/v8.js',
       'lib/vm.js',
       'lib/zlib.js',
+      'lib/internal/buffer.js',
       'lib/internal/child_process.js',
       'lib/internal/cluster.js',
       'lib/internal/freelist.js',