diff --git a/CHANGELOG.md b/CHANGELOG.md index da16b06d69a..aacdd425f14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,9 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Added -* Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones +* Introduce support for a new SAX callback `XML::SAX::Document#reference`, which is called to report some parsed XML entities when `XML::SAX::ParserContext#replace_entities` is set to the default value `false`. This is necessary functionality for some applications that were previously relying on incorrect entity error reporting which has been fixed (see below). For more information, read the docs for `Nokogiri::XML::SAX::Document`. [#1926] @flavorjones +* `XML::SAX::Parser#parse_memory` now accepts an optional `encoding` argument. When not provided, the parser will fall back to the encoding passed to the initializer, and then fall back to autodetection. [#918] @flavorjones +* `XML::SAX::ParserContext.memory` now accepts an optional `encoding_id` argument. When not provided, the encoding will be autodetected. [#918] @flavorjones * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. [#3119] @flavorjones * [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones * [CRuby] Introduce a compile-time option, `--disable-xml2-legacy`, to remove from libxml2 its dependencies on `zlib` and `liblzma` and disable implicit `HTTP` network requests. These all remain enabled by default, and are present in the precompiled native gems. This option is a precursor for removing these libraries in a future major release, but may be interesting for the security-minded who do not need features like automatic decompression and would like to remove these dependencies. You can read more and give feedback on these plans in #3168. [#3247] @flavorjones @@ -28,6 +30,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * Documentation has been improved for `CSS.xpath_for`. [#3224] @flavorjones * Documentation for the SAX parsing classes has been greatly improved, including the complex entity-handling behavior. [#3265] @flavorjones * `XML::Schema#read_memory` and `XML::RelaxNG#read_memory` are now Ruby methods that call `#from_document`. Previously these were native functions, but they were buggy on both CRuby and JRuby (but worse on JRuby) and so this is now useful, comparable in performance, and simpler code that is easier to maintain. [#2113, #2115] @flavorjones +* `XML::SAX::ParserContext.io`'s `encoding_id` argument is now optional, and when not provided will default to autodetecting the encoding. [#918] @flavorjones * [CRuby] When compiling packaged libraries from source, allow users' `AR` and `LD` environment variables to set the archiver and linker commands, respectively. This augments the existing `CC` environment variable to set the compiler command. [#3165] @ziggythehamster * [CRuby] The HTML5 parse methods accept a `:parse_noscript_content_as_text` keyword argument which will emulate the parsing behavior of a browser which has scripting enabled. [#3178, #3231] @stevecheckoway * [CRuby] `HTML5::DocumentFragment.parse` and `.new` accept a `:context` keyword argument that is the parse context node or element name. Previously this could only be passed in as a positional argument to `.new` and not at all to `.parse`. @flavorjones diff --git a/ext/java/nokogiri/Html4SaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java index b6944b8ffbf..103091cb47f 100644 --- a/ext/java/nokogiri/Html4SaxParserContext.java +++ b/ext/java/nokogiri/Html4SaxParserContext.java @@ -2,11 +2,6 @@ import java.io.ByteArrayInputStream; import java.io.InputStream; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.xerces.parsers.AbstractSAXParser; import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser; @@ -71,156 +66,26 @@ public class Html4SaxParserContext extends XmlSaxParserContext } } - @JRubyMethod(name = "memory", meta = true) + @JRubyMethod(name = "memory", meta = true, required = 1, optional = 1) public static IRubyObject parse_memory(ThreadContext context, IRubyObject klazz, - IRubyObject data, - IRubyObject encoding) + IRubyObject[] args) { - Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); - String javaEncoding = findEncodingName(context, encoding); - if (javaEncoding != null) { - CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding); - ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes()); - ctx.setInputSource(istream); - ctx.getInputSource().setEncoding(javaEncoding); - } - return ctx; - } - - public enum EncodingType { - NONE(0, "NONE"), - UTF_8(1, "UTF-8"), - UTF16LE(2, "UTF16LE"), - UTF16BE(3, "UTF16BE"), - UCS4LE(4, "UCS4LE"), - UCS4BE(5, "UCS4BE"), - EBCDIC(6, "EBCDIC"), - UCS4_2143(7, "ICS4-2143"), - UCS4_3412(8, "UCS4-3412"), - UCS2(9, "UCS2"), - ISO_8859_1(10, "ISO-8859-1"), - ISO_8859_2(11, "ISO-8859-2"), - ISO_8859_3(12, "ISO-8859-3"), - ISO_8859_4(13, "ISO-8859-4"), - ISO_8859_5(14, "ISO-8859-5"), - ISO_8859_6(15, "ISO-8859-6"), - ISO_8859_7(16, "ISO-8859-7"), - ISO_8859_8(17, "ISO-8859-8"), - ISO_8859_9(18, "ISO-8859-9"), - ISO_2022_JP(19, "ISO-2022-JP"), - SHIFT_JIS(20, "SHIFT-JIS"), - EUC_JP(21, "EUC-JP"), - ASCII(22, "ASCII"); - - private final int value; - private final String name; - - EncodingType(int value, String name) - { - this.value = value; - this.name = name; - } - - public int getValue() - { - return value; - } - - public String toString() - { - return name; - } - - private static transient EncodingType[] values; - - // NOTE: assuming ordinal == value - static EncodingType get(final int ordinal) - { - EncodingType[] values = EncodingType.values; - if (values == null) { - values = EncodingType.values(); - EncodingType.values = values; - } - if (ordinal >= 0 && ordinal < values.length) { - return values[ordinal]; - } - return null; + IRubyObject data = args[0]; + IRubyObject encoding = null; + if (args.length > 1) { + encoding = args[1]; } - } - - private static String - findEncodingName(final int value) - { - EncodingType type = EncodingType.get(value); - if (type == null) { return null; } - assert type.value == value; - return type.name; - } - - private static String - findEncodingName(ThreadContext context, IRubyObject encoding) - { - String rubyEncoding = null; - if (encoding instanceof RubyString) { - rubyEncoding = rubyStringToString((RubyString) encoding); - } else if (encoding instanceof RubyFixnum) { - rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding)); - } - if (rubyEncoding == null) { return null; } - try { - return Charset.forName(rubyEncoding).displayName(); - } catch (UnsupportedCharsetException e) { - throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported"); - } catch (IllegalCharsetNameException e) { - throw context.getRuntime().newEncodingError(e.getMessage()); - } - } - - private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+", - Pattern.CASE_INSENSITIVE); - - private static CharSequence - applyEncoding(final String input, final String enc) - { - int start_pos = 0; - int end_pos = 0; - if (containsIgnoreCase(input, "charset")) { - Matcher m = CHARSET_PATTERN.matcher(input); - while (m.find()) { - start_pos = m.start(); - end_pos = m.end(); - } - } - if (start_pos != end_pos) { - return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc); - } - return input; - } - - private static boolean - containsIgnoreCase(final String str, final String sub) - { - final int len = sub.length(); - final int max = str.length() - len; - - if (len == 0) { return true; } - final char c0Lower = Character.toLowerCase(sub.charAt(0)); - final char c0Upper = Character.toUpperCase(sub.charAt(0)); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); + ctx.setStringInputSource(context, data, context.runtime.getNil()); - for (int i = 0; i <= max; i++) { - final char ch = str.charAt(i); - if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) { - continue; // first char doesn't match - } + /* this overrides the encoding guess made by setStringInputSource */ + String java_encoding = encoding != null ? findEncodingName(context, encoding) : null; + ctx.getInputSource().setEncoding(java_encoding); - if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) { - return true; - } - } - return false; + return ctx; } @JRubyMethod(name = "file", meta = true) @@ -239,30 +104,38 @@ static EncodingType get(final int ordinal) Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass); ctx.setInputSourceFile(context, data); + String javaEncoding = findEncodingName(context, encoding); if (javaEncoding != null) { ctx.getInputSource().setEncoding(javaEncoding); } + return ctx; } - @JRubyMethod(name = "io", meta = true) + @JRubyMethod(name = "io", meta = true, required = 1, optional = 1) public static IRubyObject parse_io(ThreadContext context, - IRubyObject klass, - IRubyObject data, - IRubyObject encoding) + IRubyObject klazz, + IRubyObject[] args) { - if (!(encoding instanceof RubyFixnum)) { + IRubyObject data = args[0]; + IRubyObject encoding = null; + if (args.length > 1) { + encoding = args[1]; + } + + if (encoding != null && !(encoding instanceof RubyFixnum)) { throw context.getRuntime().newTypeError("encoding must be kind_of String"); } - Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); ctx.setIOInputSource(context, data, context.nil); - String javaEncoding = findEncodingName(context, encoding); - if (javaEncoding != null) { - ctx.getInputSource().setEncoding(javaEncoding); - } + + /* this overrides the encoding guess made by setIOInputSource */ + String java_encoding = encoding != null ? findEncodingName(context, encoding) : null; + ctx.getInputSource().setEncoding(java_encoding); + return ctx; } diff --git a/ext/java/nokogiri/XmlSaxParserContext.java b/ext/java/nokogiri/XmlSaxParserContext.java index 332eb399185..c884f517dbf 100644 --- a/ext/java/nokogiri/XmlSaxParserContext.java +++ b/ext/java/nokogiri/XmlSaxParserContext.java @@ -1,10 +1,13 @@ package nokogiri; import nokogiri.internals.*; +import static nokogiri.internals.NokogiriHelpers.rubyStringToString; + import org.apache.xerces.parsers.AbstractSAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; import org.jruby.RubyFixnum; +import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.exceptions.RaiseException; @@ -14,8 +17,14 @@ import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static org.jruby.runtime.Helpers.invoke; @@ -90,16 +99,26 @@ public class XmlSaxParserContext extends ParserContext * Create a new parser context that will parse the string * data. */ - @JRubyMethod(name = "memory", meta = true) + @JRubyMethod(name = "memory", meta = true, required = 1, optional = 1) public static IRubyObject parse_memory(ThreadContext context, IRubyObject klazz, - IRubyObject data) + IRubyObject[] args) { - final Ruby runtime = context.runtime; - XmlSaxParserContext ctx = newInstance(runtime, (RubyClass) klazz); - ctx.initialize(runtime); - ctx.setStringInputSource(context, data, runtime.getNil()); + IRubyObject data = args[0]; + IRubyObject encoding = null; + if (args.length > 1) { + encoding = args[1]; + } + + XmlSaxParserContext ctx = newInstance(context.runtime, (RubyClass) klazz); + ctx.initialize(context.runtime); + ctx.setStringInputSource(context, data, context.runtime.getNil()); + + /* this overrides the encoding guess made by setStringInputSource */ + String java_encoding = encoding != null ? findEncodingName(context, encoding) : null; + ctx.getInputSource().setEncoding(java_encoding); + return ctx; } @@ -126,21 +145,31 @@ public class XmlSaxParserContext extends ParserContext * * TODO: Currently ignores encoding enc. */ - @JRubyMethod(name = "io", meta = true) + @JRubyMethod(name = "io", meta = true, required = 1, optional = 1) public static IRubyObject parse_io(ThreadContext context, IRubyObject klazz, - IRubyObject data, - IRubyObject encoding) + IRubyObject[] args) { - // check the type of the unused encoding to match behavior of CRuby - if (!(encoding instanceof RubyFixnum)) { + IRubyObject data = args[0]; + IRubyObject encoding = null; + if (args.length > 1) { + encoding = args[1]; + } + + if (encoding != null && !(encoding instanceof RubyFixnum)) { throw context.getRuntime().newTypeError("encoding must be kind_of String"); } + final Ruby runtime = context.runtime; XmlSaxParserContext ctx = newInstance(runtime, (RubyClass) klazz); ctx.initialize(runtime); ctx.setIOInputSource(context, data, runtime.getNil()); + + /* this overrides the encoding guess made by setIOInputSource */ + String java_encoding = encoding != null ? findEncodingName(context, encoding) : null; + ctx.getInputSource().setEncoding(java_encoding); + return ctx; } @@ -329,4 +358,139 @@ public class XmlSaxParserContext extends ParserContext if (number == null) { return context.getRuntime().getNil(); } return RubyFixnum.newFixnum(context.getRuntime(), number.longValue()); } + + public enum EncodingType { + NONE(0, "NONE"), + UTF_8(1, "UTF-8"), + UTF16LE(2, "UTF16LE"), + UTF16BE(3, "UTF16BE"), + UCS4LE(4, "UCS4LE"), + UCS4BE(5, "UCS4BE"), + EBCDIC(6, "EBCDIC"), + UCS4_2143(7, "ICS4-2143"), + UCS4_3412(8, "UCS4-3412"), + UCS2(9, "UCS2"), + ISO_8859_1(10, "ISO-8859-1"), + ISO_8859_2(11, "ISO-8859-2"), + ISO_8859_3(12, "ISO-8859-3"), + ISO_8859_4(13, "ISO-8859-4"), + ISO_8859_5(14, "ISO-8859-5"), + ISO_8859_6(15, "ISO-8859-6"), + ISO_8859_7(16, "ISO-8859-7"), + ISO_8859_8(17, "ISO-8859-8"), + ISO_8859_9(18, "ISO-8859-9"), + ISO_2022_JP(19, "ISO-2022-JP"), + SHIFT_JIS(20, "SHIFT-JIS"), + EUC_JP(21, "EUC-JP"), + ASCII(22, "ASCII"); + + private final int value; + private final String name; + + EncodingType(int value, String name) + { + this.value = value; + this.name = name; + } + + public int getValue() + { + return value; + } + + public String toString() + { + return name; + } + + private static transient EncodingType[] values; + + // NOTE: assuming ordinal == value + static EncodingType get(final int ordinal) + { + EncodingType[] values = EncodingType.values; + if (values == null) { + values = EncodingType.values(); + EncodingType.values = values; + } + if (ordinal >= 0 && ordinal < values.length) { + return values[ordinal]; + } + return null; + } + + } + + protected static String + findEncodingName(final int value) + { + EncodingType type = EncodingType.get(value); + if (type == null) { return null; } + assert type.value == value; + return type.name; + } + + protected static String + findEncodingName(ThreadContext context, IRubyObject encoding) + { + String rubyEncoding = null; + if (encoding instanceof RubyString) { + rubyEncoding = rubyStringToString((RubyString) encoding); + } else if (encoding instanceof RubyFixnum) { + rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding)); + } + if (rubyEncoding == null) { return null; } + if (rubyEncoding.equals("NONE")) { return null; } + try { + return Charset.forName(rubyEncoding).displayName(); + } catch (UnsupportedCharsetException e) { + throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + " is not supported"); + } catch (IllegalCharsetNameException e) { + throw context.getRuntime().newEncodingError(e.getMessage()); + } + } + + protected static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+", + Pattern.CASE_INSENSITIVE); + + protected static CharSequence + applyEncoding(final String input, final String enc) + { + int start_pos = 0; + int end_pos = 0; + if (containsIgnoreCase(input, "charset")) { + Matcher m = CHARSET_PATTERN.matcher(input); + while (m.find()) { + start_pos = m.start(); + end_pos = m.end(); + } + } + if (start_pos != end_pos) { + return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc); + } + return input; + } + + protected static boolean + containsIgnoreCase(final String str, final String sub) + { + final int len = sub.length(); + final int max = str.length() - len; + + if (len == 0) { return true; } + final char c0Lower = Character.toLowerCase(sub.charAt(0)); + final char c0Upper = Character.toUpperCase(sub.charAt(0)); + + for (int i = 0; i <= max; i++) { + final char ch = str.charAt(i); + if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) { + continue; // first char doesn't match + } + + if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) { + return true; + } + } + return false; + } } diff --git a/ext/nokogiri/xml_sax_parser_context.c b/ext/nokogiri/xml_sax_parser_context.c index 5a966d66a94..841d2a1b928 100644 --- a/ext/nokogiri/xml_sax_parser_context.c +++ b/ext/nokogiri/xml_sax_parser_context.c @@ -76,32 +76,43 @@ noko_xml_sax_parser_context_set_encoding(xmlParserCtxtPtr c_context, VALUE rb_en /* * call-seq: + * io(input) * io(input, encoding_id) * * Create a parser context for an +input+ IO which will assume +encoding+ * * [Parameters] * - +io+ (IO) The readable IO object from which to read input - * - +encoding_id+ (Integer) The libxml2 encoding ID to use, see SAX::Parser::ENCODINGS + * - +encoding_id+ (optional Integer) The libxml2 encoding ID to use, see SAX::Parser::ENCODINGS + * (default: 0, corresponding to "NONE", which means to autodetect the encoding) * * [Returns] Nokogiri::XML::SAX::ParserContext * * 💡 Calling Nokogiri::XML::SAX::Parser.parse is more convenient for most use cases. */ static VALUE -noko_xml_sax_parser_context_s_io(VALUE rb_class, VALUE rb_io, VALUE rb_encoding_id) +noko_xml_sax_parser_context_s_io(int argc, VALUE *argv, VALUE rb_class) { - xmlParserCtxtPtr c_context; - xmlCharEncoding c_encoding = (xmlCharEncoding)NUM2INT(rb_encoding_id); + VALUE rb_io, rb_encoding_id; + + rb_scan_args(argc, argv, "11", &rb_io, &rb_encoding_id); + + xmlCharEncoding c_encoding; + if (RB_TEST(rb_encoding_id)) { + c_encoding = (xmlCharEncoding)NUM2INT(rb_encoding_id); + } else { + c_encoding = XML_CHAR_ENCODING_NONE; + } if (!rb_respond_to(rb_io, id_read)) { rb_raise(rb_eTypeError, "argument expected to respond to :read"); } - c_context = xmlCreateIOParserCtxt(NULL, NULL, - (xmlInputReadCallback)noko_io_read, - (xmlInputCloseCallback)noko_io_close, - (void *)rb_io, c_encoding); + xmlParserCtxtPtr c_context = + xmlCreateIOParserCtxt(NULL, NULL, + (xmlInputReadCallback)noko_io_read, + (xmlInputCloseCallback)noko_io_close, + (void *)rb_io, c_encoding); if (!c_context) { rb_raise(rb_eRuntimeError, "failed to create xml sax parser context"); } @@ -143,34 +154,43 @@ noko_xml_sax_parser_context_s_file(VALUE rb_class, VALUE rb_path) /* * call-seq: * memory(input) + * memory(input, encoding_id) * * Create a parser context for the +input+ String. * * [Parameters] * - +input+ (String) The input string to be parsed. + * - +encoding_id+ (optional Integer) The libxml2 encoding ID to use, see SAX::Parser::ENCODINGS + * (default: 0, corresponding to "NONE", which means to autodetect the encoding) * * [Returns] Nokogiri::XML::SAX::ParserContext * * 💡 Calling Nokogiri::XML::SAX::Parser.parse is more convenient for most use cases. */ static VALUE -noko_xml_sax_parser_context_s_memory(VALUE rb_class, VALUE rb_input) +noko_xml_sax_parser_context_s_memory(int argc, VALUE *argv, VALUE rb_class) { - xmlParserCtxtPtr c_context; + VALUE rb_input, rb_encoding_id; + rb_scan_args(argc, argv, "11", &rb_input, &rb_encoding_id); Check_Type(rb_input, T_STRING); if (!(int)RSTRING_LEN(rb_input)) { rb_raise(rb_eRuntimeError, "input string cannot be empty"); } - c_context = xmlCreateMemoryParserCtxt(StringValuePtr(rb_input), - (int)RSTRING_LEN(rb_input)); + xmlParserCtxtPtr c_context = + xmlCreateMemoryParserCtxt(StringValuePtr(rb_input), (int)RSTRING_LEN(rb_input)); if (c_context->sax) { xmlFree(c_context->sax); c_context->sax = NULL; } + if (RB_TEST(rb_encoding_id)) { + xmlCharEncoding c_encoding = (xmlCharEncoding)NUM2INT(rb_encoding_id); + xmlSwitchEncoding(c_context, c_encoding); + } + return noko_xml_sax_parser_context_wrap(rb_class, c_context); } @@ -383,8 +403,8 @@ noko_init_xml_sax_parser_context(void) rb_undef_alloc_func(cNokogiriXmlSaxParserContext); - rb_define_singleton_method(cNokogiriXmlSaxParserContext, "io", noko_xml_sax_parser_context_s_io, 2); - rb_define_singleton_method(cNokogiriXmlSaxParserContext, "memory", noko_xml_sax_parser_context_s_memory, 1); + rb_define_singleton_method(cNokogiriXmlSaxParserContext, "io", noko_xml_sax_parser_context_s_io, -1); + rb_define_singleton_method(cNokogiriXmlSaxParserContext, "memory", noko_xml_sax_parser_context_s_memory, -1); rb_define_singleton_method(cNokogiriXmlSaxParserContext, "file", noko_xml_sax_parser_context_s_file, 1); rb_define_method(cNokogiriXmlSaxParserContext, "parse_with", noko_xml_sax_parser_context__parse_with, 1); diff --git a/lib/nokogiri/html4/sax/parser.rb b/lib/nokogiri/html4/sax/parser.rb index 4d96928a1c1..817accfa4c3 100644 --- a/lib/nokogiri/html4/sax/parser.rb +++ b/lib/nokogiri/html4/sax/parser.rb @@ -31,10 +31,12 @@ module SAX class Parser < Nokogiri::XML::SAX::Parser ### # Parse html stored in +data+ using +encoding+ - def parse_memory(data, encoding = "UTF-8") + def parse_memory(data, encoding = @encoding) raise TypeError unless String === data return if data.empty? + check_encoding(encoding) + ctx = ParserContext.memory(data, encoding) yield ctx if block_given? ctx.parse_with(self) @@ -42,10 +44,10 @@ def parse_memory(data, encoding = "UTF-8") ### # Parse given +io+ - def parse_io(io, encoding = "UTF-8") - check_encoding(encoding) - @encoding = encoding - ctx = ParserContext.io(io, ENCODINGS[encoding]) + def parse_io(io, encoding = @encoding) + encoding_id = encoding ? ENCODINGS[check_encoding(encoding)] : ENCODINGS["NONE"] + + ctx = ParserContext.io(io, encoding_id) yield ctx if block_given? ctx.parse_with(self) end diff --git a/lib/nokogiri/xml/sax/parser.rb b/lib/nokogiri/xml/sax/parser.rb index 9b68d3cd855..772c1fb3705 100644 --- a/lib/nokogiri/xml/sax/parser.rb +++ b/lib/nokogiri/xml/sax/parser.rb @@ -71,7 +71,7 @@ class Attribute < Struct.new(:localname, :prefix, :uri, :value) attr_accessor :encoding # Create a new Parser with +doc+ and +encoding+ - def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = "UTF-8") + def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil) @encoding = check_encoding(encoding) @document = doc @warned = false @@ -91,9 +91,27 @@ def parse(thing, &block) end ### - # Parse given +io+ + # :call-seq: + # parse_io(io) + # parse_io(io) { |parser_context| ... } + # parse_io(io, encoding) + # parse_io(io, encoding) { |parser_context| ... } + # + # Parse an input stream. + # + # [Parameters] + # - +io+ (IO) The readable IO object from which to read input + # - +encoding+ (optional String) An encoding name to use when parsing the input. (default + # `nil` for autodetection) + # + # [Yields] + # If a block is given, the underlying ParserContext object will be yielded. This can be used to set + # options on the parser context before parsing begins. + # def parse_io(io, encoding = @encoding) - ctx = ParserContext.io(io, ENCODINGS[check_encoding(encoding)]) + encoding_id = encoding ? ENCODINGS[check_encoding(encoding)] : ENCODINGS["NONE"] + + ctx = ParserContext.io(io, encoding_id) yield ctx if block_given? ctx.parse_with(self) end @@ -110,8 +128,27 @@ def parse_file(filename) ctx.parse_with(self) end - def parse_memory(data) - ctx = ParserContext.memory(data) + # :call-seq: + # parse_memory(input) + # parse_memory(input) { |parser_context| ... } + # parse_memory(input, encoding) + # parse_memory(input, encoding) { |parser_context| ... } + # + # Parse an input string. + # + # [Parameters] + # - +input+ (String) The input string to be parsed. + # - +encoding+ (optional String) An encoding name to use when parsing the input. (default + # `nil` for autodetection) + # + # [Yields] + # If a block is given, the underlying ParserContext object will be yielded. This can be used to set + # options on the parser context before parsing begins. + # + def parse_memory(input, encoding = @encoding) + encoding_id = encoding ? ENCODINGS[check_encoding(encoding)] : ENCODINGS["NONE"] + + ctx = ParserContext.memory(input, encoding_id) yield ctx if block_given? ctx.parse_with(self) end @@ -119,6 +156,7 @@ def parse_memory(data) private def check_encoding(encoding) + return nil unless encoding encoding.upcase.tap do |enc| raise ArgumentError, "'#{enc}' is not a valid encoding" unless ENCODINGS[enc] end diff --git a/test/html4/sax/test_parser.rb b/test/html4/sax/test_parser.rb index 8ec9bbf8614..662ef19b075 100644 --- a/test/html4/sax/test_parser.rb +++ b/test/html4/sax/test_parser.rb @@ -59,15 +59,102 @@ class TestCase end end - it "parse_force_encoding" do - parser.parse_memory(<<-HTML, "UTF-8") - - Информация + describe "encoding" do + let(:html_encoding_iso8859) { <<~HTML } + + B\xF6hnhardt HTML - assert_equal( - "Информация", - parser.document.data.join.strip, - ) + + # this input string is really UTF-8 but is marked as ISO-8859-1 + let(:html_encoding_broken) { <<~HTML } + + Böhnhardt + HTML + + # this input string is really ISO-8859-1 but is marked as UTF-8 + let(:html_encoding_broken2) { <<~HTML } + + B\xF6hnhardt + HTML + + it "is nil by default to indicate encoding should be autodetected" do + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + assert_nil(parser.encoding) + end + + it "can be set in the initializer" do + assert_equal("UTF-8", Nokogiri::HTML4::SAX::Parser.new(Doc.new, "UTF-8").encoding) + assert_equal("ISO-2022-JP", Nokogiri::HTML4::SAX::Parser.new(Doc.new, "ISO-2022-JP").encoding) + end + + it "raises when given an invalid encoding name" do + assert_raises(ArgumentError) { Nokogiri::HTML4::SAX::Parser.new(Doc.new, "not an encoding") } + assert_raises(ArgumentError) { parser.parse_io(StringIO.new(""), "not an encoding") } + assert_raises(ArgumentError) { parser.parse_memory("", "not an encoding") } + end + + it "autodetects the encoding if not overridden" do + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse(html_encoding_iso8859) + + # correctly converted the input ISO-8859-1 to UTF-8 for the callback + assert_equal("Böhnhardt", parser.document.data.join.strip) + end + + it "overrides the ISO-8859-1 document's encoding when set via initializer" do + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_memory(html_encoding_broken) + + assert_equal("Böhnhardt", parser.document.data.join.strip) + + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new, "UTF-8") + parser.parse_memory(html_encoding_broken) + + assert_equal("Böhnhardt", parser.document.data.join.strip) + end + + it "overrides the UTF-8 document's encoding when set via initializer" do + if Nokogiri.uses_libxml?(">= 2.13.0") # nekohtml is a better guesser than libxml2 + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_memory(html_encoding_broken2) + + assert(parser.document.errors.any? { |e| e.match(/Invalid byte/) }) + end + + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_memory(html_encoding_broken2, "ISO-8859-1") + + assert_equal("Böhnhardt", parser.document.data.join.strip) + refute(parser.document.errors.any? { |e| e.match(/Invalid byte/) }) + end + + it "can be set via parse_io" do + if Nokogiri.uses_libxml?("< 2.13.0") + skip("older libxml2 encoding detection is sus") + end + + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_io(StringIO.new(html_encoding_broken), "UTF-8") + + assert_equal("Böhnhardt", parser.document.data.join.strip) + + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_io(StringIO.new(html_encoding_broken2), "ISO-8859-1") + + assert_equal("Böhnhardt", parser.document.data.join.strip) + end + + it "can be set via parse_memory" do + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_memory(html_encoding_broken, "UTF-8") + + assert_equal("Böhnhardt", parser.document.data.join.strip) + + parser = Nokogiri::HTML4::SAX::Parser.new(Doc.new) + parser.parse_memory(html_encoding_broken2, "ISO-8859-1") + + assert_equal("Böhnhardt", parser.document.data.join.strip) + end end it "parse_document" do diff --git a/test/xml/sax/test_parser.rb b/test/xml/sax/test_parser.rb index 98640dea2ef..0882886777e 100644 --- a/test/xml/sax/test_parser.rb +++ b/test/xml/sax/test_parser.rb @@ -215,9 +215,104 @@ class TestCase end end - it "has correct encoding" do - parser = Nokogiri::XML::SAX::Parser.new(Doc.new, "UTF-8") - assert_equal("UTF-8", parser.encoding) + describe "encoding" do + # proper ISO-8859-1 encoding + let(:xml_encoding_iso8859) { "\nB\xF6hnhardt" } + # this input string is really UTF-8 but is marked as ISO-8859-1 + let(:xml_encoding_broken) { "\nBöhnhardt" } + # this input string is really ISO-8859-1 but is marked as UTF-8 + let(:xml_encoding_broken2) { "\nB\xF6hnhardt" } + + it "is nil by default to indicate encoding should be autodetected" do + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + assert_nil(parser.encoding) + end + + it "can be set in the initializer" do + assert_equal("UTF-8", Nokogiri::XML::SAX::Parser.new(Doc.new, "UTF-8").encoding) + assert_equal("ISO-2022-JP", Nokogiri::XML::SAX::Parser.new(Doc.new, "ISO-2022-JP").encoding) + end + + it "raises when given an invalid encoding name" do + assert_raises(ArgumentError) { Nokogiri::XML::SAX::Parser.new(Doc.new, "not an encoding") } + assert_raises(ArgumentError) { parser.parse_io(StringIO.new(""), "not an encoding") } + assert_raises(ArgumentError) { parser.parse_memory("", "not an encoding") } + end + + it "autodetects the encoding if not overridden" do + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse(xml_encoding_iso8859) + + # correctly converted the input ISO-8859-1 to UTF-8 for the callback + assert_equal("Böhnhardt", parser.document.data.join) + end + + it "overrides the ISO-8859-1 document's encoding when set via initializer" do + if Nokogiri.uses_libxml?("< 2.12.0") # gnome/libxml2@ec7be506 + skip("older libxml2 encoding detection is sus") + end + + # broken encoding! + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse(xml_encoding_broken) + + assert_equal("Böhnhardt", parser.document.data.join) + + # override the encoding + parser = Nokogiri::XML::SAX::Parser.new(Doc.new, "UTF-8") + parser.parse(xml_encoding_broken) + + assert_equal("Böhnhardt", parser.document.data.join) + end + + it "overrides the UTF-8 document's encoding when set via initializer" do + if Nokogiri.uses_libxml?(">= 2.13.0") + # broken encoding! + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse(xml_encoding_broken2) + + assert(parser.document.errors.any? { |e| e.match(/Invalid byte/) }) + end + + # override the encoding + parser = Nokogiri::XML::SAX::Parser.new(Doc.new, "ISO-8859-1") + parser.parse(xml_encoding_broken2) + + assert_equal("Böhnhardt", parser.document.data.join) + refute(parser.document.errors.any? { |e| e.match(/Invalid byte/) }) + end + + it "can be set via parse_io" do + if Nokogiri.uses_libxml?("< 2.13.0") + skip("older libxml2 encoding detection is sus") + end + + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse_io(StringIO.new(xml_encoding_broken), "UTF-8") + + assert_equal("Böhnhardt", parser.document.data.join) + + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse_io(StringIO.new(xml_encoding_broken2), "ISO-8859-1") + + assert_equal("Böhnhardt", parser.document.data.join) + end + + it "can be set via parse_memory" do + if Nokogiri.uses_libxml?("< 2.12.0") # gnome/libxml2@ec7be506 + skip("older libxml2 encoding detection is sus") + end + + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse_memory(xml_encoding_broken, "UTF-8") + + assert_equal("Böhnhardt", parser.document.data.join) # here + + parser = Nokogiri::XML::SAX::Parser.new(Doc.new) + parser.parse_memory(xml_encoding_broken2, "ISO-8859-1") + + assert_equal("Böhnhardt", parser.document.data.join) + end end it "error strings are UTF-8" do @@ -294,11 +389,6 @@ class TestCase end end - it "raises when given an invalid encoding name" do - assert_raises(ArgumentError) { Nokogiri::XML::SAX::Parser.new(Doc.new, "not an encoding") } - assert_raises(ArgumentError) { parser.parse_io(StringIO.new(""), "not an encoding") } - end - it "cdata_block is called when CDATA is parsed" do parser.parse_memory(<<~XML)