Skip to content

Commit

Permalink
fix: empty fragment encoding
Browse files Browse the repository at this point in the history
and improve test coverage around fragment encoding

Closes #2649
  • Loading branch information
flavorjones committed Sep 19, 2022
1 parent 75af7e8 commit 4238959
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 36 deletions.
7 changes: 6 additions & 1 deletion lib/nokogiri/xml/node_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,12 @@ def to_html(*args)
options[:save_with] ||= Node::SaveOptions::DEFAULT_HTML
args.insert(0, options)
end
map { |x| x.to_html(*args) }.join
if empty?
encoding = (args.first.is_a?(Hash) ? args.first[:encoding] : nil) || document.encoding
"".encode(encoding)
else
map { |x| x.to_html(*args) }.join
end
end

###
Expand Down
91 changes: 56 additions & 35 deletions test/html4/test_document_fragment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,6 @@ def test_ascii_8bit_encoding
assert_equal("hello", Nokogiri::HTML4::DocumentFragment.parse(s).to_html)
end

# A fragment parsed from an EUC-JP encoded string must expose the expected text content.
def test_inspect_encoding
  euc_jp_markup = "<div>こんにちは!</div>".encode("EUC-JP")
  parsed = Nokogiri::HTML4::DocumentFragment.parse(euc_jp_markup)

  assert_equal("こんにちは!", parsed.content)
end

# Nokogiri::HTML4.fragment given an EUC-JP encoded string: the document reports EUC-JP as its
# encoding and the text content matches the expected literal.
def test_html_parse_encoding
  euc_jp_markup = "<div>こんにちは!</div>".encode("EUC-JP")
  parsed = Nokogiri::HTML4.fragment(euc_jp_markup)

  assert_equal("EUC-JP", parsed.document.encoding)
  assert_equal("こんにちは!", parsed.content)
end

def test_unlink_empty_document
frag = Nokogiri::HTML4::DocumentFragment.parse("").unlink # must_not_raise
assert_nil(frag.parent)
Expand All @@ -38,20 +25,6 @@ def test_colons_are_not_removed
assert_match(/3:30/, doc.to_s)
end

# DocumentFragment.parse called with an explicit encoding argument: the document reports that
# encoding and the text content is preserved.
def test_parse_encoding
  parsed = Nokogiri::HTML4::DocumentFragment.parse("<div>hello world</div>", "ISO-8859-1")

  assert_equal("ISO-8859-1", parsed.document.encoding)
  assert_equal("hello world", parsed.content)
end

# Nokogiri::HTML4.fragment called with an explicit encoding argument: the document reports that
# encoding and the text content is preserved.
def test_html_parse_with_encoding
  parsed = Nokogiri::HTML4.fragment("<div>hello world</div>", "ISO-8859-1")

  assert_equal("ISO-8859-1", parsed.document.encoding)
  assert_equal("hello world", parsed.content)
end

# Parsing "<br />" in the context of the root node serializes back as "<br>".
# NOTE(review): `html` is a helper defined outside this excerpt -- presumably a parsed HTML4
# document; confirm against the test class setup.
def test_parse_in_context
  parsed_nodes = html.root.parse("<br />")

  assert_equal("<br>", parsed_nodes.to_s)
end
Expand All @@ -76,14 +49,6 @@ def test_ancestors_search
assert(li.matches?("li"))
end

def test_fun_encoding
string = %(<body>こんにちは</body>)
html = Nokogiri::HTML4::DocumentFragment.parse(
string
).to_html(encoding: "UTF-8")
assert_equal(string, html)
end

# Constructing a fragment from a document yields a truthy object.
# NOTE(review): `html` is a helper defined outside this excerpt -- confirm what it returns.
def test_new
  fragment = Nokogiri::HTML4::DocumentFragment.new(html)

  assert(fragment)
end
Expand Down Expand Up @@ -306,6 +271,62 @@ def test_dup_should_create_an_html_document_fragment
assert_instance_of(Nokogiri::HTML4::DocumentFragment, duplicate)
end

# Encoding behavior of HTML4 fragments: which encoding the document reports after parsing, and
# which encoding the serialized output carries -- including the empty-fragment case.
#
# All examples use the Nokogiri::HTML4 namespace, matching the rest of this file.
describe "encoding" do
  describe "#fragment" do
    it "parses an encoded string" do
      input = "<div>こんにちは!</div>".encode("EUC-JP")

      fragment = Nokogiri::HTML4.fragment(input)

      assert_equal("EUC-JP", fragment.document.encoding)
      assert_equal("こんにちは!", fragment.content)
    end

    it "returns a string matching the passed encoding" do
      input = "<div>hello world</div>"

      fragment = Nokogiri::HTML4.fragment(input, "ISO-8859-1")

      assert_equal("ISO-8859-1", fragment.document.encoding)
      assert_equal("hello world", fragment.content)
    end
  end

  describe "#parse" do
    it "parses an encoded string" do
      input = "<div>こんにちは!</div>".encode("EUC-JP")

      fragment = Nokogiri::HTML4::DocumentFragment.parse(input)

      assert_equal("EUC-JP", fragment.document.encoding)
      assert_equal("こんにちは!", fragment.content)
    end

    it "returns a string matching the passed encoding" do
      input = "<div>hello world</div>"

      fragment = Nokogiri::HTML4::DocumentFragment.parse(input, "ISO-8859-1")

      assert_equal("ISO-8859-1", fragment.document.encoding)
      assert_equal("hello world", fragment.content)
    end

    it "respects encoding for empty strings" do
      # An empty fragment has no nodes to serialize, so the output encoding has to come
      # from the encoding given at parse time.
      ["UTF-8", "US-ASCII", "ISO-8859-1"].each do |encoding|
        fragment = Nokogiri::HTML4::DocumentFragment.parse("", encoding)

        assert_equal(encoding, fragment.to_html.encoding.to_s)
      end
    end
  end

  describe "#to_html" do
    it "serializes empty strings with the passed encoding" do
      # The fragment is parsed as UTF-8; an explicit :encoding option must win over it.
      fragment = Nokogiri::HTML4::DocumentFragment.parse("", "UTF-8")

      ["ISO-8859-1", "US-ASCII"].each do |encoding|
        assert_equal(encoding, fragment.to_html(encoding: encoding).encoding.to_s)
      end
    end
  end
end

describe "parse options" do
let(:html4_default) do
Nokogiri::XML::ParseOptions.new(Nokogiri::XML::ParseOptions::DEFAULT_HTML)
Expand Down

0 comments on commit 4238959

Please sign in to comment.