From 0656925aa7793b0bd68c25ab80b5c67df80baee1 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 24 Feb 2024 11:53:22 +0900 Subject: [PATCH 1/2] Changed to `frozen_string_literal: true`. ## Why? Because `s.check("a")` is slower than `s.check("a".freeze)`. - benchmark/stringscan_2.yaml ``` loop_count: 100000 contexts: - name: No YJIT prelude: | $LOAD_PATH.unshift(File.expand_path("lib")) require 'rexml' prelude: | require 'strscan' s = StringScanner.new('abcdefg hijklmn opqrstu vwxyz') ptn = "a" benchmark: 'check("a")' : s.check("a") 'check("a".freeze)' : s.check("a".freeze) 'ptn="a";s.check(ptn)' : | ptn="a" s.check(ptn) 'check(ptn)' : s.check(ptn) ``` ``` $benchmark-driver benchmark/stringscan_2.yaml Comparison: check(ptn): 13524479.4 i/s check("a".freeze): 13433638.1 i/s - 1.01x slower check("a"): 10231225.8 i/s - 1.32x slower ptn="a";s.check(ptn): 10013017.0 i/s - 1.35x slower ``` --- lib/rexml/parsers/baseparser.rb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 595669c9..ceed0af4 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' require_relative '../source' @@ -365,7 +365,7 @@ def pull_event end if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" - message << " (got '#{md[1]}')" if md + message += " (got '#{md[1]}')" if md raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] @@ -462,8 +462,7 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) + rv = string.gsub( /\r\n?/, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { From 54b0298c563b2460e4f9e035a61154712324a56a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 17 Feb 2024 11:49:38 +0900 Subject: [PATCH 2/2] Changed processing in REXML::Parsers::BaseParser#pull_event from regular expression to processing using StringScanner. ## Why Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan. # Changed - Added Source#string= method for error message output. - Added TestParseDocumentTypeDeclaration#test_no_name test case. - Of the `intSubset` of DOCTYPE, " [28] doctypedecl ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset > [28b] intSubset ::= (markupdecl | DeclSep)* https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl > [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl > [45] elementdecl ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl > [52] AttlistDecl ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDecl > [70] EntityDecl ::= GEDecl | PEDecl > [71] GEDecl ::= '' > [72] PEDecl ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NotationDecl > [82] NotationDecl ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI > [16] PI ::= '' Char*)))? '?>' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment > [15] Comment ::= '' https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep > [28a] DeclSep ::= PEReference | S https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference > [69] PEReference ::= '%' Name ';' [Benchmark] ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 11.240 10.569 17.173 18.219 i/s - 100.000 times in 8.896882s 9.461267s 5.823007s 5.488884s sax 31.812 30.716 48.383 52.532 i/s - 100.000 times in 3.143500s 3.255655s 2.066861s 1.903600s pull 36.855 36.354 56.718 61.443 i/s - 100.000 times in 2.713300s 2.750693s 1.763099s 1.627523s stream 34.176 34.758 49.801 54.622 i/s - 100.000 times in 2.925991s 2.877065s 2.008003s 1.830779s Comparison: dom after(YJIT): 18.2 i/s before(YJIT): 17.2 i/s - 1.06x slower before: 11.2 i/s - 1.62x slower after: 10.6 i/s - 1.72x slower sax after(YJIT): 52.5 i/s before(YJIT): 48.4 i/s - 1.09x slower before: 31.8 i/s - 1.65x slower after: 30.7 i/s - 1.71x slower pull after(YJIT): 61.4 i/s before(YJIT): 56.7 i/s - 1.08x slower before: 36.9 i/s - 1.67x slower after: 36.4 i/s - 1.69x slower stream after(YJIT): 54.6 i/s before(YJIT): 49.8 i/s - 1.10x slower after: 34.8 i/s - 1.57x slower before: 34.2 i/s - 1.60x slower ``` - YJIT=ON : 1.06x - 1.10x faster - YJIT=OFF : 0.94x - 1.01x faster Co-authored-by: Sutou Kouhei --- lib/rexml/parsers/baseparser.rb | 318 ++++++++++--------- lib/rexml/source.rb | 31 +- test/parse/test_document_type_declaration.rb | 15 + 3 files changed, 200 insertions(+), 164 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ceed0af4..bc59bcdc 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -112,6 +112,19 @@ class BaseParser "apos" => [/'/, "'", "'", /'/] } + module Private + INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um + TAG_PATTERN = /((?>#{QNAME_STR}))/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /\s*#{NAME}/um + GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + end + private_constant :Private + include Private + def initialize( source ) self.stream = source @listeners = [] @@ -198,167 +211,155 @@ def pull_event #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START + if @source.match("/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + elsif @source.match("/um, true)[1] ] + elsif @source.match("DOCTYPE", true) + base_error_message = "Malformed DOCTYPE" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.string = "/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end - end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ - else - @document_status = :after_doctype - if @source.encoding == "UTF-8" - @source.buffer_encoding = ::Encoding::UTF_8 + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START - match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact] - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - match.delete_at(5) if match.size > 5 # Chop out NDATA decl - # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] - - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 - end + @source.match(/\s*/um, true) # skip spaces + if @source.match("/um, true) + raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? + return [ :elementdecl, "/um) - message = "#{base_error_message}: name is missing" + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else - message = "#{base_error_message}: invalid declaration name" + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] end - raise REXML::ParseException.new(message, @source) - end - name = parse_name(base_error_message) - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: true) - unless @source.match(/\A\s*>/um, true) - message = "#{base_error_message}: garbage before end >" - raise REXML::ParseException.new(message, @source) + match << '%' if ref + return match + elsif @source.match("ATTLIST", true) + md = @source.match(ATTLISTDECL_END, true) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + elsif @source.match("NOTATION", true) + base_error_message = "Malformed notation declaration" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.string = " /um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + elsif md = @source.match(/--(.*?)-->/um, true) + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md end - return [:notationdecl, name, *id] - when DOCTYPE_END + elsif match = @source.match(/(%.*?;)\s*/um, true) + return [ :externalentity, match[1] ] + elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin - next_data = @source.buffer - if next_data.size < 2 - @source.read - next_data = @source.buffer - end - if next_data[0] == ?< - if next_data[1] == ?/ + if @source.match("<", true) + if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - md = @source.match( CLOSE_MATCH, true ) + md = @source.match(CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) @@ -366,15 +367,16 @@ def pull_event if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" message += " (got '#{md[1]}')" if md + @source.string = "]*>)/um) + elsif @source.match("!", true) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match(/--(.*?)-->/um, true) case md[1] when /--/, /-\z/ @@ -383,17 +385,18 @@ def pull_event return [ :comment, md[1] ] if md else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif next_data[1] == ?? + elsif @source.match("?", true) return process_instruction else # Get the next tag - md = @source.match(TAG_MATCH, true) + md = @source.match(TAG_PATTERN, true) unless md + @source.string = "<" + @source.buffer raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @@ -418,7 +421,7 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) + md = @source.match(/([^<]*)/um, true) text = md[1] return [ :text, text ] end @@ -497,9 +500,9 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(/\A\s*#{NAME}/um, true) + md = @source.match(NAME_PATTERN, true) unless md - if @source.match(/\A\s*\S/um) + if @source.match(/\s*\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" @@ -576,11 +579,28 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction - match_data = @source.match(INSTRUCTION_PATTERN, true) + match_data = @source.match(INSTRUCTION_END, true) unless match_data message = "Invalid processing instruction node" + @source.string = " DETAIL end + + def test_no_name + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: name is missing +Line: 3 +Position: 17 +Last 80 unconsumed characters: + + DETAIL + end end class TestExternalID < self