Code cleanup (flutter#96)

rhencke · Apr 11, 2019 · 2b88ac7 · 2b88ac7
1 parent 38756a0
commit 2b88ac7
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 44 deletions.
diff --git a/lib/src/char_encodings.dart b/lib/src/char_encodings.dart
diff --git a/lib/src/html_input_stream.dart b/lib/src/html_input_stream.dart
@@ -1,8 +1,8 @@
 import 'dart:collection';
+import 'dart:convert' show ascii, utf8;
 
 import 'package:source_span/source_span.dart';
 
-import 'char_encodings.dart';
 import 'constants.dart';
 import 'encoding_parser.dart';
 import 'utils.dart';
@@ -66,7 +66,7 @@ class HtmlInputStream {
       this.sourceUrl])
       : charEncodingName = codecName(encoding) {
     if (source is String) {
-      _rawChars = toCodepoints(source);
+      _rawChars = source.runes.toList();
       charEncodingName = 'utf-8';
       charEncodingCertain = true;
     } else if (source is List<int>) {
@@ -92,7 +92,7 @@ class HtmlInputStream {
     _chars = <int>[];
 
     if (_rawChars == null) {
-      _rawChars = decodeBytes(charEncodingName, _rawBytes);
+      _rawChars = _decodeBytes(charEncodingName, _rawBytes);
     }
 
     bool skipNewline = false;
@@ -177,7 +177,7 @@ class HtmlInputStream {
   /// encoding otherwise return null.
   String detectBOM() {
     // Try detecting the BOM using bytes from the string
-    if (hasUtf8Bom(_rawBytes)) {
+    if (_hasUtf8Bom(_rawBytes)) {
       return 'utf-8';
     }
     return null;
@@ -292,3 +292,32 @@ String codecName(String encoding) {
   var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();
   return encodings[canonicalName];
 }
+
+/// Returns true if [bytes] starts with a UTF-8 byte order mark (EF BB BF),
+/// checking from [offset] within [length] bytes (default: to the end). UTF-8
+/// has no byte order, but HTML uses this mark to detect the UTF-8 encoding.
+bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
+  int end = length != null ? offset + length : bytes.length;
+  return (offset + 3) <= end &&
+      bytes[offset] == 0xEF &&
+      bytes[offset + 1] == 0xBB &&
+      bytes[offset + 2] == 0xBF;
+}
+
+/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// the codepoints. Only the 'ascii' and 'utf-8' encodings are supported; any
+/// other [encoding] throws an [ArgumentError].
+Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
+  switch (encoding) {
+    case 'ascii':
+      return ascii.decode(bytes).runes;
+
+    case 'utf-8':
+      // NOTE: To match the behavior of the other decode functions, we eat the
+      // UTF-8 BOM here. This is the default behavior of `utf8.decode`.
+      return utf8.decode(bytes).runes;
+
+    default:
+      throw ArgumentError('Encoding $encoding not supported');
+  }
+}