diff --git a/CHANGELOG.md b/CHANGELOG.md index 99a8fc6091e..0657ecc44ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,6 +73,7 @@ We've resolved many long-standing bugs in the various schema classes, validation * CSS queries for pseudo-selectors that cannot be translated into XPath expressions now raise a more descriptive `Nokogiri::CSS::SyntaxError` when they are parsed. Previously, an invalid XPath expression was evaluated and a hard-to-understand XPath error was raised by the query engine. [#3193] @flavorjones * `Schema#validate` returns errors on empty and malformed files. Previously, it would return errors on empty/malformed Documents, but not when reading from files. [#642] @flavorjones * `XML::Builder` is now consistent with how it sets block scope. Previously, missing methods with blocks on dynamically-created nodes were always handled by invoking `instance_eval(&block)` on the Builder, even when the Builder was yielding self for all other missing methods with blocks. [#1041] @flavorjones +* `HTML4::DocumentFragment.parse` accepts `IO` input. Previously, it required a string and would raise a `TypeError` when passed an `IO`. [#2069] @sharvy * [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway * [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones * [CRuby] libgumbo correctly prints nonstandard element names in error messages. 
[#3219] @stevecheckoway diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb index af97dcb704b..1681822acbb 100644 --- a/lib/nokogiri/html4/document_fragment.rb +++ b/lib/nokogiri/html4/document_fragment.rb @@ -3,11 +3,72 @@ module Nokogiri module HTML4 class DocumentFragment < Nokogiri::XML::DocumentFragment - #### - # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+ + # + # :call-seq: + # parse(tags) => DocumentFragment + # parse(tags, encoding) => DocumentFragment + # parse(tags, encoding, options) => DocumentFragment + # parse(tags, encoding) { |options| ... } => DocumentFragment + # + # Parse an HTML4 fragment. + # + # [Parameters] + # - +tags+ (optional String, or any object that responds to +#read+ such as an IO, or + # StringIO) + # - +encoding+ (optional String) the name of the encoding that should be used when processing + # the document. (default +nil+ for auto-detection) + # - +options+ (optional) configuration object that sets options during parsing, such as + # Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more + # information. + # + # [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify + # before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information. + # + # [Returns] DocumentFragment + # + # *Example:* Parsing a string + # + # fragment = DocumentFragment.parse("
<div>Hello World</div>
") + # + # *Example:* Parsing an IO + # + # fragment = File.open("fragment.html") do |file| + # DocumentFragment.parse(file) + # end + # + # *Example:* Specifying encoding + # + # fragment = DocumentFragment.parse(input, "EUC-JP") + # + # *Example:* Setting parse options dynamically + # + # DocumentFragment.parse("
<div>Hello World") do |options| + # options.huge.pedantic + # end + # def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) doc = HTML4::Document.new + if tags.respond_to?(:read) + # Handle IO-like objects (IO, File, StringIO, etc.) + # The _read_ method of these objects doesn't accept an +encoding+ parameter. + # Encoding is usually set when the IO object is created or opened, + # or by using the _set_encoding_ method. + # + # 1. If +encoding+ is provided and the object supports _set_encoding_, + # set the encoding before reading. + # 2. Read the content from the IO-like object. + # + # Note: After reading, the content's encoding will be: + # - The encoding set by _set_encoding_ if it was called + # - The default encoding of the IO object otherwise + # + # For StringIO specifically, _set_encoding_ affects only the internal string, + # not how the data is read out. + tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding) + tags = tags.read + end + encoding ||= if tags.respond_to?(:encoding) encoding = tags.encoding if encoding == ::Encoding::ASCII_8BIT @@ -24,6 +85,8 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, new(doc, tags, nil, options, &block) end + # It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this + # method directly. def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper return self unless tags diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb index 5f771c91ef4..09d0173700f 100644 --- a/test/html4/test_document_fragment.rb +++ b/test/html4/test_document_fragment.rb @@ -270,6 +270,15 @@ def test_dup_should_create_an_html_document_fragment assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate) end + def test_parse_with_io + fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("
<div>hello</div>
"), "UTF-8") + assert_instance_of(HTML4::DocumentFragment, fragment) + assert_equal("
<div>hello</div>
", fragment.to_s) + + fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("
<div>hello</div>
")) + assert_equal("
<div>hello</div>
", fragment.to_s) + end + describe "encoding" do describe "#fragment" do it "parses an encoded string" do