From 184c0ca8f00df8e2109f96eff9cc8d08e81ae29b Mon Sep 17 00:00:00 2001 From: Watson Date: Wed, 10 Jul 2024 11:38:44 +0900 Subject: [PATCH] Fix performance issue caused by using repeated `>` characters after `" COMMENT_TERM = "-->" CDATA_TERM = "]]>" + DOCTYPE_TERM = "]>" TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um @@ -384,7 +385,7 @@ def pull_event end return [ :comment, md[1] ] if md end - elsif match = @source.match(/(%.*?;)\s*/um, true) + elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM) return [ :externalentity, match[1] ] elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 3ca0b536..61c3f04d 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -276,6 +280,16 @@ def test_notation_attlist doctype.children.collect(&:class)) end + def test_gt_linear_performance_malformed_entity + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + begin + REXML::Document.new('" * n + ']>') + rescue + end + end + end + private def parse(internal_subset) super(<<-DOCTYPE)