fix: SAX::Parser#parse_memory allows overriding the input encoding

Previously, overriding the input encoding worked for `#parse_io` but not for `#parse_memory`. This change introduces a new optional encoding parameter to `SAX::Parser#parse_memory` and `SAX::ParserContext.memory`, and makes sure the parser uses that encoding or, when none is given, the one passed to the Parser's initializer. It also makes the `encoding_id` parameter to `SAX::ParserContext.io` optional; it was previously required. Finally, this commit backfills similar test coverage for the HTML4 SAX parser's encoding handling, which should help with an upcoming big refactor. Closes #918
sparklemotion · Jul 7, 2024 · a287e6b
1 parent a887af9
commit a287e6b
Show file tree

Hide file tree

Showing 8 changed files with 486 additions and 209 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,7 +17,9 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
 
 ### Added
 
-* Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones
+* Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `XML::SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones
+* `XML::SAX::Parser#parse_memory` now accepts an optional `encoding` argument. When not provided, the parser will fall back to the encoding passed to the initializer, and then fall back to autodetection. [#918] @flavorjones
+* `XML::SAX::ParserContext.memory` now accepts an optional `encoding_id` argument. When not provided, the encoding will be autodetected. [#918] @flavorjones
 * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. [#3119] @flavorjones
 * [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones
 * [CRuby] Introduce a compile-time option, `--disable-xml2-legacy`, to remove from libxml2 its dependencies on `zlib` and `liblzma` and disable implicit `HTTP` network requests. These all remain enabled by default, and are present in the precompiled native gems. This option is a precursor for removing these libraries in a future major release, but may be interesting for the security-minded who do not need features like automatic decompression and would like to remove these dependencies. You can read more and give feedback on these plans in #3168. [#3247] @flavorjones
@@ -28,6 +30,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
 * Documentation has been improved for `CSS.xpath_for`. [#3224] @flavorjones
 * Documentation for the SAX parsing classes has been greatly improved, including the complex entity-handling behavior. [#3265] @flavorjones
 * `XML::Schema#read_memory` and `XML::RelaxNG#read_memory` are now Ruby methods that call `#from_document`. Previously these were native functions, but they were buggy on both CRuby and JRuby (but worse on JRuby) and so this is now useful, comparable in performance, and simpler code that is easier to maintain. [#2113, #2115] @flavorjones
+* `XML::SAX::ParserContext.io`'s `encoding_id` argument is now optional, and when not provided will default to autodetecting the encoding. [#918] @flavorjones
 * [CRuby] When compiling packaged libraries from source, allow users' `AR` and `LD` environment variables to set the archiver and linker commands, respectively. This augments the existing `CC` environment variable to set the compiler command. [#3165] @ziggythehamster
 * [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway
 * [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones

diff --git a/ext/java/nokogiri/Html4SaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java
@@ -2,11 +2,6 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.nio.charset.UnsupportedCharsetException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.xerces.parsers.AbstractSAXParser;
 import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
@@ -71,156 +66,26 @@ public class Html4SaxParserContext extends XmlSaxParserContext
     }
   }
 
-  @JRubyMethod(name = "memory", meta = true)
+  @JRubyMethod(name = "memory", meta = true, required = 1, optional = 1)
   public static IRubyObject
   parse_memory(ThreadContext context,
                IRubyObject klazz,
-               IRubyObject data,
-               IRubyObject encoding)
+               IRubyObject[] args)
   {
-    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
-    String javaEncoding = findEncodingName(context, encoding);
-    if (javaEncoding != null) {
-      CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
-      ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
-      ctx.setInputSource(istream);
-      ctx.getInputSource().setEncoding(javaEncoding);
-    }
-    return ctx;
-  }
-
-  public enum EncodingType {
-    NONE(0, "NONE"),
-    UTF_8(1, "UTF-8"),
-    UTF16LE(2, "UTF16LE"),
-    UTF16BE(3, "UTF16BE"),
-    UCS4LE(4, "UCS4LE"),
-    UCS4BE(5, "UCS4BE"),
-    EBCDIC(6, "EBCDIC"),
-    UCS4_2143(7, "ICS4-2143"),
-    UCS4_3412(8, "UCS4-3412"),
-    UCS2(9, "UCS2"),
-    ISO_8859_1(10, "ISO-8859-1"),
-    ISO_8859_2(11, "ISO-8859-2"),
-    ISO_8859_3(12, "ISO-8859-3"),
-    ISO_8859_4(13, "ISO-8859-4"),
-    ISO_8859_5(14, "ISO-8859-5"),
-    ISO_8859_6(15, "ISO-8859-6"),
-    ISO_8859_7(16, "ISO-8859-7"),
-    ISO_8859_8(17, "ISO-8859-8"),
-    ISO_8859_9(18, "ISO-8859-9"),
-    ISO_2022_JP(19, "ISO-2022-JP"),
-    SHIFT_JIS(20, "SHIFT-JIS"),
-    EUC_JP(21, "EUC-JP"),
-    ASCII(22, "ASCII");
-
-    private final int value;
-    private final String name;
-
-    EncodingType(int value, String name)
-    {
-      this.value = value;
-      this.name = name;
-    }
-
-    public int getValue()
-    {
-      return value;
-    }
-
-    public String toString()
-    {
-      return name;
-    }
-
-    private static transient EncodingType[] values;
-
-    // NOTE: assuming ordinal == value
-    static EncodingType get(final int ordinal)
-    {
-      EncodingType[] values = EncodingType.values;
-      if (values == null) {
-        values = EncodingType.values();
-        EncodingType.values = values;
-      }
-      if (ordinal >= 0 && ordinal < values.length) {
-        return values[ordinal];
-      }
-      return null;
+    IRubyObject data = args[0];
+    IRubyObject encoding = null;
+    if (args.length > 1) {
+      encoding = args[1];
     }
 
-  }
-
-  private static String
-  findEncodingName(final int value)
-  {
-    EncodingType type = EncodingType.get(value);
-    if (type == null) { return null; }
-    assert type.value == value;
-    return type.name;
-  }
-
-  private static String
-  findEncodingName(ThreadContext context, IRubyObject encoding)
-  {
-    String rubyEncoding = null;
-    if (encoding instanceof RubyString) {
-      rubyEncoding = rubyStringToString((RubyString) encoding);
-    } else if (encoding instanceof RubyFixnum) {
-      rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
-    }
-    if (rubyEncoding == null) { return null; }
-    try {
-      return Charset.forName(rubyEncoding).displayName();
-    } catch (UnsupportedCharsetException e) {
-      throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
-    } catch (IllegalCharsetNameException e) {
-      throw context.getRuntime().newEncodingError(e.getMessage());
-    }
-  }
-
-  private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
-      Pattern.CASE_INSENSITIVE);
-
-  private static CharSequence
-  applyEncoding(final String input, final String enc)
-  {
-    int start_pos = 0;
-    int end_pos = 0;
-    if (containsIgnoreCase(input, "charset")) {
-      Matcher m = CHARSET_PATTERN.matcher(input);
-      while (m.find()) {
-        start_pos = m.start();
-        end_pos = m.end();
-      }
-    }
-    if (start_pos != end_pos) {
-      return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
-    }
-    return input;
-  }
-
-  private static boolean
-  containsIgnoreCase(final String str, final String sub)
-  {
-    final int len = sub.length();
-    final int max = str.length() - len;
-
-    if (len == 0) { return true; }
-    final char c0Lower = Character.toLowerCase(sub.charAt(0));
-    final char c0Upper = Character.toUpperCase(sub.charAt(0));
+    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
+    ctx.setStringInputSource(context, data, context.runtime.getNil());
 
-    for (int i = 0; i <= max; i++) {
-      final char ch = str.charAt(i);
-      if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
-        continue; // first char doesn't match
-      }
+    /* this overrides the encoding guess made by setStringInputSource */
+    String java_encoding = encoding != null ? findEncodingName(context, encoding) : null;
+    ctx.getInputSource().setEncoding(java_encoding);
 
-      if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
-        return true;
-      }
-    }
-    return false;
+    return ctx;
   }
 
   @JRubyMethod(name = "file", meta = true)
@@ -239,30 +104,38 @@ static EncodingType get(final int ordinal)
 
     Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
     ctx.setInputSourceFile(context, data);
+
     String javaEncoding = findEncodingName(context, encoding);
     if (javaEncoding != null) {
       ctx.getInputSource().setEncoding(javaEncoding);
     }
+
     return ctx;
   }
 
-  @JRubyMethod(name = "io", meta = true)
+  @JRubyMethod(name = "io", meta = true, required = 1, optional = 1)
   public static IRubyObject
   parse_io(ThreadContext context,
-           IRubyObject klass,
-           IRubyObject data,
-           IRubyObject encoding)
+           IRubyObject klazz,
+           IRubyObject[] args)
   {
-    if (!(encoding instanceof RubyFixnum)) {
+    IRubyObject data = args[0];
+    IRubyObject encoding = null;
+    if (args.length > 1) {
+      encoding = args[1];
+    }
+
+    if (encoding != null && !(encoding instanceof RubyFixnum)) {
       throw context.getRuntime().newTypeError("encoding must be kind_of String");
     }
 
-    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
+    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
     ctx.setIOInputSource(context, data, context.nil);
-    String javaEncoding = findEncodingName(context, encoding);
-    if (javaEncoding != null) {
-      ctx.getInputSource().setEncoding(javaEncoding);
-    }
+
+    /* this overrides the encoding guess made by setIOInputSource */
+    String java_encoding = encoding != null ? findEncodingName(context, encoding) : null;
+    ctx.getInputSource().setEncoding(java_encoding);
+
     return ctx;
   }