From fac6f6d416d5edb806acfd00c88b944252232746 Mon Sep 17 00:00:00 2001
From: Sharvy Ahmed <sharvy2008@gmail.com>
Date: Thu, 1 Aug 2024 15:07:58 +0600
Subject: [PATCH 1/2] fix: IO handling in HTML4::DocumentFragment.parse
 Previously an exception (TypeError: no implicit conversion of File into
 String) would be raised.

Fixes: #2069
---
 CHANGELOG.md                            |  1 +
 lib/nokogiri/html4/document_fragment.rb | 45 ++++++++++++++++++++++++-
 test/html4/test_document_fragment.rb    |  9 +++++
 3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99a8fc6091e..afd4a717919 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -85,6 +85,7 @@ We've resolved many long-standing bugs in the various schema classes, validation
 * [JRuby] SAX parsing now respects the `#replace_entities` attribute, which defaults to `false`. Previously this flag defaulted to `true` and was completely ignored. [#614] @flavorjones
 * [JRuby] The SAX callback `Document#start_element_namespace` received a blank string for the URI when a namespace was not present. It now receives `nil` (as does the CRuby impl). [#3265] @flavorjones
 * [JRuby] `Reader#outer_xml` and `#inner_xml` encode entities properly. [#1523] @flavorjones
+* `HTML4::DocumentFragment.parse` can now handle IO objects. Previously, it would raise a `TypeError`. [#2069] @sharvy
 
 
 ### Changed
diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb
index af97dcb704b..bee73114bab 100644
--- a/lib/nokogiri/html4/document_fragment.rb
+++ b/lib/nokogiri/html4/document_fragment.rb
@@ -4,10 +4,53 @@ module Nokogiri
   module HTML4
     class DocumentFragment < Nokogiri::XML::DocumentFragment
       ####
-      # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
+      # Parse HTML fragment. +tags+ may be a String, or any object that
+      # responds to _read_ and _close_ such as an IO, or StringIO.
+      #
+      # +encoding+ is the encoding that should be used when processing the document.
+      # If not specified, it will be automatically detected.
+      #
+      # +options+ is a number that sets options in the parser, such as
+      # Nokogiri::XML::ParseOptions::DEFAULT_HTML. See the constants in
+      # Nokogiri::XML::ParseOptions.
+      #
+      # This method returns a new DocumentFragment. If a block is given, it will be
+      # passed to the new DocumentFragment as an argument.
+      #
+      # Examples:
+      #   fragment = DocumentFragment.parse("<div>Hello World</div>")
+      #
+      #   file = File.open("fragment.html")
+      #   fragment = DocumentFragment.parse(file)
+      #
+      #   fragment = DocumentFragment.parse("<div>こんにちは世界</div>", "UTF-8")
+      #
+      #   DocumentFragment.parse("<div>Hello World") do |fragment|
+      #     puts fragment.at_css("div").content
+      #   end
       def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
         doc = HTML4::Document.new
 
+        if tags.respond_to?(:read)
+          # Handle IO-like objects (IO, File, StringIO, etc.)
+          # The _read_ method of these objects doesn't accept an +encoding+ parameter.
+          # Encoding is usually set when the IO object is created or opened,
+          # or by using the _set_encoding_ method.
+          #
+          # 1. If +encoding+ is provided and the object supports _set_encoding_,
+          #    set the encoding before reading.
+          # 2. Read the content from the IO-like object.
+          #
+          # Note: After reading, the content's encoding will be:
+          # - The encoding set by _set_encoding_ if it was called
+          # - The default encoding of the IO object otherwise
+          #
+          # For StringIO specifically, _set_encoding_ affects only the internal string,
+          # not how the data is read out.
+          tags.set_encoding(encoding) if encoding && tags.respond_to?(:set_encoding)
+          tags = tags.read
+        end
+
         encoding ||= if tags.respond_to?(:encoding)
           encoding = tags.encoding
           if encoding == ::Encoding::ASCII_8BIT
diff --git a/test/html4/test_document_fragment.rb b/test/html4/test_document_fragment.rb
index 5f771c91ef4..09d0173700f 100644
--- a/test/html4/test_document_fragment.rb
+++ b/test/html4/test_document_fragment.rb
@@ -270,6 +270,15 @@ def test_dup_should_create_an_html_document_fragment
           assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate)
         end
 
+        def test_parse_with_io
+          fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("<div>hello</div>"), "UTF-8")
+          assert_instance_of(HTML4::DocumentFragment, fragment)
+          assert_equal("<div>hello</div>", fragment.to_s)
+
+          fragment = Nokogiri::HTML4::DocumentFragment.parse(StringIO.new("<div>hello</div>"))
+          assert_equal("<div>hello</div>", fragment.to_s)
+        end
+
         describe "encoding" do
           describe "#fragment" do
             it "parses an encoded string" do

From bdac6c9476e4f2e1e7d31c626b6f3abe4176b1fd Mon Sep 17 00:00:00 2001
From: Mike Dalessio <mike.dalessio@gmail.com>
Date: Mon, 5 Aug 2024 10:18:00 -0400
Subject: [PATCH 2/2] doc: updated doc string for HTML4::DocumentFragment.parse

---
 CHANGELOG.md                            |  2 +-
 lib/nokogiri/html4/document_fragment.rb | 52 +++++++++++++++++--------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index afd4a717919..0657ecc44ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -73,6 +73,7 @@ We've resolved many long-standing bugs in the various schema classes, validation
 * CSS queries for pseudo-selectors that cannot be translated into XPath expressions now raise a more descriptive `Nokogiri::CSS::SyntaxError` when they are parsed. Previously, an invalid XPath expression was evaluated and a hard-to-understand XPath error was raised by the query engine. [#3193] @flavorjones
 * `Schema#validate` returns errors on empty and malformed files. Previously, it would return errors on empty/malformed Documents, but not when reading from files. [#642] @flavorjones
 * `XML::Builder` is now consistent with how it sets block scope. Previously, missing methods with blocks on dynamically-created nodes were always handled by invoking `instance_eval(&block)` on the Builder, even when the Builder was yielding self for all other missing methods with blocks. [#1041] @flavorjones
+* `HTML4::DocumentFragment.parse` accepts `IO` input. Previously, it required a string and would raise a `TypeError` when passed an `IO`. [#2069] @sharvy
 * [CRuby] libgumbo (the HTML5 parser) treats reaching max-depth as EOF. This addresses a class of issues when the parser is interrupted in this way. [#3121] @stevecheckoway
 * [CRuby] Update node GC lifecycle to avoid a potential memory leak with fragments in libxml 2.13.0 caused by changes in `xmlAddChild`. [#3156] @flavorjones
 * [CRuby] libgumbo correctly prints nonstandard element names in error messages. [#3219] @stevecheckoway
@@ -85,7 +86,6 @@ We've resolved many long-standing bugs in the various schema classes, validation
 * [JRuby] SAX parsing now respects the `#replace_entities` attribute, which defaults to `false`. Previously this flag defaulted to `true` and was completely ignored. [#614] @flavorjones
 * [JRuby] The SAX callback `Document#start_element_namespace` received a blank string for the URI when a namespace was not present. It now receives `nil` (as does the CRuby impl). [#3265] @flavorjones
 * [JRuby] `Reader#outer_xml` and `#inner_xml` encode entities properly. [#1523] @flavorjones
-* `HTML4::DocumentFragment.parse` can now handle IO objects. Previously, it would raise a `TypeError`. [#2069] @sharvy
 
 
 ### Changed
diff --git a/lib/nokogiri/html4/document_fragment.rb b/lib/nokogiri/html4/document_fragment.rb
index bee73114bab..1681822acbb 100644
--- a/lib/nokogiri/html4/document_fragment.rb
+++ b/lib/nokogiri/html4/document_fragment.rb
@@ -3,31 +3,49 @@
 module Nokogiri
   module HTML4
     class DocumentFragment < Nokogiri::XML::DocumentFragment
-      ####
-      # Parse HTML fragment. +tags+ may be a String, or any object that
-      # responds to _read_ and _close_ such as an IO, or StringIO.
       #
-      # +encoding+ is the encoding that should be used when processing the document.
-      # If not specified, it will be automatically detected.
+      # :call-seq:
+      #   parse(tags) => DocumentFragment
+      #   parse(tags, encoding) => DocumentFragment
+      #   parse(tags, encoding, options) => DocumentFragment
+      #   parse(tags, encoding) { |options| ... } => DocumentFragment
       #
-      # +options+ is a number that sets options in the parser, such as
-      # Nokogiri::XML::ParseOptions::DEFAULT_HTML. See the constants in
-      # Nokogiri::XML::ParseOptions.
+      # Parse an HTML4 fragment.
       #
-      # This method returns a new DocumentFragment. If a block is given, it will be
-      # passed to the new DocumentFragment as an argument.
+      # [Parameters]
+      # - +tags+ (String, or any object that responds to +#read+, such as an IO or StringIO) the
+      #   HTML fragment to be parsed.
+      # - +encoding+ (optional String) the name of the encoding that should be used when processing
+      #   the document.  (default +nil+ for auto-detection)
+      # - +options+ (optional) configuration object that sets options during parsing, such as
+      #   Nokogiri::XML::ParseOptions::RECOVER. See Nokogiri::XML::ParseOptions for more
+      #   information.
+      #
+      # [Yields] If present, the block will be passed a Nokogiri::XML::ParseOptions object to modify
+      #   before the fragment is parsed. See Nokogiri::XML::ParseOptions for more information.
+      #
+      # [Returns] DocumentFragment
+      #
+      # *Example:* Parsing a string
       #
-      # Examples:
       #   fragment = DocumentFragment.parse("<div>Hello World</div>")
       #
-      #   file = File.open("fragment.html")
-      #   fragment = DocumentFragment.parse(file)
+      # *Example:* Parsing an IO
+      #
+      #   fragment = File.open("fragment.html") do |file|
+      #     DocumentFragment.parse(file)
+      #   end
+      #
+      # *Example:* Specifying encoding
       #
-      #   fragment = DocumentFragment.parse("<div>こんにちは世界</div>", "UTF-8")
+      #   fragment = DocumentFragment.parse(input, "EUC-JP")
       #
-      #   DocumentFragment.parse("<div>Hello World") do |fragment|
-      #     puts fragment.at_css("div").content
+      # *Example:* Setting parse options dynamically
+      #
+      #   DocumentFragment.parse("<div>Hello World") do |options|
+      #     options.huge.pedantic
       #   end
+      #
       def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
         doc = HTML4::Document.new
 
@@ -67,6 +85,8 @@ def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML,
         new(doc, tags, nil, options, &block)
       end
 
+      # It's recommended to use either DocumentFragment.parse or XML::Node#parse rather than call this
+      # method directly.
       def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
         return self unless tags