Further optimize string decoding

mtth · Sep 13, 2024 · 06b8038 · 06b8038
1 parent 141eb72
commit 06b8038
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 48 deletions.
diff --git a/lib/utils.js b/lib/utils.js
@@ -413,56 +413,13 @@ class OrderedQueue {
 }
 
 let decodeSlice;
-if (typeof Buffer === 'function' && Buffer.prototype.utf8Slice) {
-  decodeSlice = function(arr, start, end) {
-    return Buffer.prototype.utf8Slice.call(arr, start, end);
-  };
+if (typeof Buffer === 'function' && typeof Buffer.prototype.utf8Slice === 'function') {
+  decodeSlice = Function.prototype.call.bind(Buffer.prototype.utf8Slice);
 } else {
   const DECODER = new TextDecoder();
 
-  // Calling `subarray` is expensive enough that for small strings, it's faster
-  // to decode manually.
   decodeSlice = function(arr, start, end) {
-    if (end - start > 32) {
-      return DECODER.decode(arr.subarray(start, end));
-    }
-
-    let output = '';
-    let i = start;
-    // Consume the string in 4-byte chunks. The performance benefit comes not
-    // from *reading* in chunks, but calling fromCharCode with 4 characters per
-    // call.
-    while (i + 3 < end) {
-      const n = (arr[i] << 24) |
-          (arr[i + 1] << 16) |
-          (arr[i + 2] << 8) |
-          arr[i + 3];
-      // If the high bit of any character is set, it's a non-ASCII character.
-      // Fall back to TextDecoder for the remaining characters.
-      if (n & 0x80808080) {
-        output += DECODER.decode(arr.subarray(start + i, end));
-        return output;
-      }
-      output += String.fromCharCode(
-        n >>> 24,
-        (n >> 16) & 0xff,
-        (n >> 8) & 0xff,
-        n & 0xff
-      );
-      i += 4;
-    }
-
-    // Handle the remainder of the string.
-    while (i < end) {
-      if (arr[i] & 0x80) {
-        output += DECODER.decode(arr.subarray(start + i, end));
-        return output;
-      }
-      output += String.fromCharCode(arr[i]);
-      i++;
-    }
-
-    return output;
+    return DECODER.decode(arr.subarray(start, end));
   };
 }
 
@@ -839,7 +796,41 @@ class Tap {
     if (this.pos > this.length) {
       return;
     }
-    return decodeSlice(this.arr, pos, pos + len);
+
+    let arr = this.arr;
+    let end = pos + len;
+    if (len > 24) {
+      return decodeSlice(arr, pos, end);
+    }
+
+    let output = '';
+    // Consume the string in 4-byte chunks. The performance benefit comes not
+    // from *reading* in chunks, but calling fromCharCode with 4 characters per
+    // call.
+    while (pos + 3 < end) {
+      let a = arr[pos], b = arr[pos + 1], c = arr[pos + 2], d = arr[pos + 3];
+      // If the high bit of any byte is set, the chunk holds non-ASCII data.
+      // Fall back to decodeSlice for the remaining bytes.
+      if ((a | b | c | d) & 0x80) {
+        output += decodeSlice(arr, pos, end);
+        return output;
+      }
+      output += String.fromCharCode(a, b, c, d);
+      pos += 4;
+    }
+
+    // Handle the remainder of the string.
+    while (pos < end) {
+      let char = arr[pos];
+      if (char & 0x80) {
+        output += decodeSlice(arr, pos, end);
+        return output;
+      }
+      output += String.fromCharCode(char);
+      pos++;
+    }
+
+    return output;
   }
 
   writeString (s) {

diff --git a/test/test_utils.js b/test/test_utils.js
@@ -211,7 +211,13 @@ suite('utils', () => {
     suite('string', () => {
 
       testWriterReader({
-        elems: ['ahierw', '', 'alh hewlii! rew'],
+        elems: [
+          'ahierw',
+          '',
+          'alh hewlii! rew',
+          'sérialisation',
+          'this string should be long enough that a different code path is exercised'
+        ],
         reader: function () { return this.readString(); },
         skipper: function () { this.skipString(); },
         writer: function (s) { this.writeString(s); }