diff --git a/CHANGELOG.md b/CHANGELOG.md index e505e01f70..476cba12d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ### Added * [CRuby] `Nokogiri::HTML5::Builder` is similar to `HTML4::Builder` but returns an `HTML5::Document`. (@flavorjones) +* [CRuby] Attributes in an HTML5 document can be serialized individually, something that has always been supported by the HTML4 serializer. [#3125, #3127] @flavorjones ### Fixed diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 0d8e345d3f..b881d274b1 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -1875,17 +1875,7 @@ output_node( // Add attributes. for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) { output_char(out, ' '); - output_attr_name(out, attr); - if (attr->children) { - output_string(out, "=\""); - xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1); - output_escaped_string(out, value, true); - xmlFree(value); - output_char(out, '"'); - } else { - // Output name="" - output_string(out, "=\"\""); - } + output_node(out, (xmlNodePtr)attr, preserve_newline); } output_char(out, '>'); @@ -1903,6 +1893,23 @@ output_node( } break; + case XML_ATTRIBUTE_NODE: + { + xmlAttrPtr attr = (xmlAttrPtr)node; + output_attr_name(out, attr); + if (attr->children) { + output_string(out, "=\""); + xmlChar *value = xmlNodeListGetString(attr->doc, attr->children, 1); + output_escaped_string(out, value, true); + xmlFree(value); + output_char(out, '"'); + } else { + // Output name="" + output_string(out, "=\"\""); + } + } + break; + case XML_TEXT_NODE: if (node->parent && is_one_of(node->parent, UNESCAPED_TEXT_ELEMENTS, diff --git a/test/html4/test_attributes.rb b/test/html4/test_attributes.rb index e6570e37c4..6eb6d385aa 100644 --- a/test/html4/test_attributes.rb +++ b/test/html4/test_attributes.rb @@ -2,90 +2,107 @@ require "helper" -module Nokogiri - module HTML - class TestAttr < Nokogiri::TestCase - # - # libxml2 >= 2.9.2 fails to escape comments within some attributes. It - # wants to ensure these comments can be treated as "server-side includes", - # but as a result fails to ensure that serialization is well-formed, - # resulting in an opportunity for XSS injection of code into a final - # re-parsed document (presumably in a browser). - # - # the offending commit is: - # - # https://github.com/GNOME/libxml2/commit/960f0e2 - # - # we'll test this by parsing the HTML, serializing it, then - # re-parsing it to ensure there isn't any ambiguity in the output - # that might allow code injection into a browser consuming - # "sanitized" output. - # - # complaints have been made upstream about this behavior, notably at - # - # https://bugzilla.gnome.org/show_bug.cgi?id=769760 - # - # and multiple CVEs have been declared and fixed in downstream - # libraries as a result, a list is being kept up to date here: - # - # https://github.com/flavorjones/loofah/issues/144 - # - [ - # - # these tags and attributes are determined by the code at: - # - # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714 - # - { tag: "a", attr: "href" }, - { tag: "div", attr: "href" }, - { tag: "a", attr: "action" }, - { tag: "div", attr: "action" }, - { tag: "a", attr: "src" }, - { tag: "div", attr: "src" }, - { tag: "a", attr: "name" }, - # - # note that div+name is _not_ affected by the libxml2 issue. - # but we test it anyway to ensure our logic isn't modifying - # attributes that don't need modifying. - # - { tag: "div", attr: "name", unescaped: true }, - ].each do |config| - define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do - skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system? +class TestHTML4Attributes < Nokogiri::TestCase + # + # libxml2 >= 2.9.2 fails to escape comments within some attributes. It + # wants to ensure these comments can be treated as "server-side includes", + # but as a result fails to ensure that serialization is well-formed, + # resulting in an opportunity for XSS injection of code into a final + # re-parsed document (presumably in a browser). + # + # the offending commit is: + # + # https://github.com/GNOME/libxml2/commit/960f0e2 + # + # we'll test this by parsing the HTML, serializing it, then + # re-parsing it to ensure there isn't any ambiguity in the output + # that might allow code injection into a browser consuming + # "sanitized" output. + # + # complaints have been made upstream about this behavior, notably at + # + # https://bugzilla.gnome.org/show_bug.cgi?id=769760 + # + # and multiple CVEs have been declared and fixed in downstream + # libraries as a result, a list is being kept up to date here: + # + # https://github.com/flavorjones/loofah/issues/144 + # + [ + # + # these tags and attributes are determined by the code at: + # + # https://git.gnome.org/browse/libxml2/tree/HTMLtree.c?h=v2.9.2#n714 + # + { tag: "a", attr: "href" }, + { tag: "div", attr: "href" }, + { tag: "a", attr: "action" }, + { tag: "div", attr: "action" }, + { tag: "a", attr: "src" }, + { tag: "div", attr: "src" }, + { tag: "a", attr: "name" }, + # + # note that div+name is _not_ affected by the libxml2 issue. + # but we test it anyway to ensure our logic isn't modifying + # attributes that don't need modifying. + # + { tag: "div", attr: "name", unescaped: true }, + ].each do |config| + define_method "test_uri_escaping_of_#{config[:attr]}_attr_in_#{config[:tag]}_tag" do + skip if Nokogiri::VersionInfo.instance.libxml2? && Nokogiri::VersionInfo.instance.libxml2_using_system? - html = %{<#{config[:tag]} #{config[:attr]}='example.com'>test} + html = %{<#{config[:tag]} #{config[:attr]}='example.com'>test} - reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html) - attributes = reparsed.at_css(config[:tag]).attribute_nodes + reparsed = Nokogiri::HTML4.fragment(Nokogiri::HTML4.fragment(html).to_html) + attributes = reparsed.at_css(config[:tag]).attribute_nodes - assert_equal [config[:attr]], attributes.collect(&:name) - if Nokogiri::VersionInfo.instance.libxml2? - if config[:unescaped] - # - # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK. - # assert that this attribute's serialization is unaffected. - # - assert_equal %{example.com}, attributes.first.value - elsif Nokogiri.uses_libxml?(">= 2.11.0") - # - # as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped. - # - assert_equal %{example.com}, attributes.first.value - else - # - # let's match the behavior in libxml < 2.9.2. - # test that this attribute's serialization is well-formed and sanitized. - # - assert_equal %{example.com}, attributes.first.value - end - else - # - # yay for consistency in javaland. move along, nothing to see here. - # - assert_equal %{example.com}, attributes.first.value - end + assert_equal [config[:attr]], attributes.collect(&:name) + if Nokogiri::VersionInfo.instance.libxml2? + if config[:unescaped] + # + # this attribute was emitted wrapped in single-quotes, so a double quote is A-OK. + # assert that this attribute's serialization is unaffected. + # + assert_equal %{example.com}, attributes.first.value + elsif Nokogiri.uses_libxml?(">= 2.11.0") + # + # as of upstream 76d6b0d768d4e60a2d2844d in v2.11.0 (dev), quotes are no longer escaped. + # + assert_equal %{example.com}, attributes.first.value + else + # + # let's match the behavior in libxml < 2.9.2. + # test that this attribute's serialization is well-formed and sanitized. + # + assert_equal %{example.com}, attributes.first.value end + else + # + # yay for consistency in javaland. move along, nothing to see here. + # + assert_equal %{example.com}, attributes.first.value end end end + + def test_serialize_attribute + html = <<~HTML +
+ HTML + div = Nokogiri::HTML4::DocumentFragment.parse(html).at_css("div") + attrs = div.attribute_nodes + id_attr = attrs.find { |a| a.name == "id" } + class_attr = attrs.find { |a| a.name == "class" } + + if Nokogiri.uses_libxml? + id_expected = ' id="foo"' + class_expected = ' class="bar baz"' + else + id_expected = 'id="foo"' + class_expected = 'class="bar baz"' + end + + assert_equal(id_expected, id_attr.to_html) + assert_equal(class_expected, class_attr.to_html) + end end diff --git a/test/html5/test_attributes.rb b/test/html5/test_attributes.rb new file mode 100644 index 0000000000..0dbfb6f13e --- /dev/null +++ b/test/html5/test_attributes.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +require "helper" + +class TestHtml5Attributes < Nokogiri::TestCase + def test_serialize_attribute + html = <<~HTML +
+ HTML + div = Nokogiri::HTML5::DocumentFragment.parse(html).at_css("div") + attrs = div.attribute_nodes + id_attr = attrs.find { |a| a.name == "id" } + class_attr = attrs.find { |a| a.name == "class" } + + assert_equal('id="foo"', id_attr.to_html) + assert_equal('class="bar baz"', class_attr.to_html) + end +end if Nokogiri.uses_gumbo?