From a7c9491b33ec91596158403e985df636871cd82a Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 17 Feb 2024 11:49:38 +0900 Subject: [PATCH] Changed processing in REXML::Parsers::BaseParser#pull_event from regular expression to processing using StringScanner. ## Why Improve maintainability by optimizing the process so that the parsing process proceeds using StringScanner#scan. # Changed - Added read_source option to IOSource#match to suppress read from @source. - Added Source#string= method for error message output. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 11.278 11.371 17.346 18.248 i/s - 100.000 times in 8.866784s 8.794635s 5.764884s 5.479904s sax 31.845 32.282 48.621 53.235 i/s - 100.000 times in 3.140243s 3.097687s 2.056704s 1.878479s pull 37.033 37.840 59.401 64.111 i/s - 100.000 times in 2.700312s 2.642690s 1.683465s 1.559796s stream 34.799 36.514 49.432 56.885 i/s - 100.000 times in 2.873632s 2.738665s 2.022967s 1.757948s Comparison: dom after(YJIT): 18.2 i/s before(YJIT): 17.3 i/s - 1.05x slower after: 11.4 i/s - 1.60x slower before: 11.3 i/s - 1.62x slower sax after(YJIT): 53.2 i/s before(YJIT): 48.6 i/s - 1.09x slower after: 32.3 i/s - 1.65x slower before: 31.8 i/s - 1.67x slower pull after(YJIT): 64.1 i/s before(YJIT): 59.4 i/s - 1.08x slower after: 37.8 i/s - 1.69x slower before: 37.0 i/s - 1.73x slower stream after(YJIT): 56.9 i/s before(YJIT): 49.4 i/s - 1.15x slower after: 36.5 i/s - 1.56x slower before: 34.8 i/s - 1.63x slower ``` - YJIT=ON : 1.05x - 1.15x faster - YJIT=OFF : 1.00x - 1.05x faster --- lib/rexml/parsers/baseparser.rb | 195 +++++++++++++++----------------- lib/rexml/source.rb | 10 +- 2 files changed, 96 insertions(+), 109 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 2d671be4..c31dd3c4 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -48,29 +48,15 @@ class BaseParser REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" REFERENCE_RE = /#{REFERENCE}/ - DOCTYPE_START = /\A\s*/um ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um - COMMENT_START = /\A/um - CDATA_START = /\A/um - CDATA_PATTERN = //um - XMLDECL_START = /\A<\?xml\s/u; - XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um - INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um - TAG_MATCH = /\A<((?>#{QNAME_STR}))/um - CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um + INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um + TAG_MATCH = /((?>#{QNAME_STR}))/um + CLOSE_MATCH = /(#{QNAME_STR})\s*>/um VERSION = /\bversion\s*=\s*["'](.*?)['"]/um ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um - ENTITY_START = /\A\s*/um - SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" @@ -79,10 +65,7 @@ class BaseParser DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" ATTDEF_RE = /#{ATTDEF}/ - ATTLISTDECL_START = /\A\s*/um - - TEXT_PATTERN = /\A([^<]*)/um + ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um # Entity constants PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" @@ -94,11 +77,10 @@ class BaseParser ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" - PEDECL = "" - GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um + PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um - NOTATIONDECL_START = /\A\s*]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START - return process_instruction - when DOCTYPE_START - base_error_message = "Malformed DOCTYPE" - @source.match(DOCTYPE_START, true) - @nsstack.unshift(curr_ns=Set.new) - name = parse_name(base_error_message) - if @source.match(/\A\s*\[/um, true) - id = [nil, nil, nil] - @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + @source.read + if @source.match("/um, true, false) + results = results[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding end - if @source.match(/\A\s*\[/um, true) + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] + else # instruction + return process_instruction + end + elsif @source.match("/um, true )[1] ] + elsif @source.match(/DOCTYPE\s/um, true, false) + base_error_message = "Malformed DOCTYPE" + @nsstack.unshift(curr_ns=Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ + elsif @source.match( /\s+/, false, false ) else @document_status = :after_doctype if @source.encoding == "UTF-8" @@ -265,16 +249,13 @@ def pull_event end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START + @source.read + @source.match(/\s*/um, true, false) # skip spaces + if match = @source.match( /(%.*?;)\s*$/um, true, false) + return [ :externalentity, match[1] ] + elsif match = @source.match(/(/um, true, false) + return [ :elementdecl, match[1] ] + elsif @source.match( "/um) + unless @source.match(/\s+/um, true) + if @source.match(/\s*>/um) message = "#{base_error_message}: name is missing" else message = "#{base_error_message}: invalid declaration name" end + @source.string = " /um, true) + unless @source.match(/\s*>/um, true) message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end return [:notationdecl, name, *id] - when DOCTYPE_END + elsif @source.match( /\]\s*>/um, true, false) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin next_data = @source.buffer if next_data.size < 2 @source.read - next_data = @source.buffer end - if next_data[0] == ?< - if next_data[1] == ?/ + if @source.match("<", true, false) + if @source.match("/", true, false) @nsstack.shift last_tag = @tags.pop md = @source.match( CLOSE_MATCH, true ) @@ -366,15 +346,16 @@ def pull_event if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" message << " (got '#{md[1]}')" if md + @source.string = "]*>)/um) + elsif @source.match("!", true, false) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match( /--(.*?)-->/um, true ) case md[1] when /--/, /-\z/ @@ -383,17 +364,18 @@ def pull_event return [ :comment, md[1] ] if md else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true ) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif next_data[1] == ?? + elsif @source.match("?", true, false) return process_instruction else # Get the next tag md = @source.match(TAG_MATCH, true) unless md + @source.string = "<" + @source.buffer raise REXML::ParseException.new("malformed XML: missing tag start", @source) end tag = md[1] @@ -418,7 +400,7 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) + md = @source.match( /([^<]*)/um, true ) text = md[1] return [ :text, text ] end @@ -579,6 +561,7 @@ def process_instruction match_data = @source.match(INSTRUCTION_PATTERN, true) unless match_data message = "Invalid processing instruction node" + @source.string = "