diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 595669c9..bc59bcdc 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -1,4 +1,4 @@
-# frozen_string_literal: false
+# frozen_string_literal: true
require_relative '../parseexception'
require_relative '../undefinednamespaceexception'
require_relative '../source'
@@ -112,6 +112,19 @@ class BaseParser
"apos" => [/&apos;/, "&apos;", "'", /'/]
}
+ module Private
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
+ TAG_PATTERN = /((?>#{QNAME_STR}))/um
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
+ NAME_PATTERN = /\s*#{NAME}/um
+ GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
+ PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
+ ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
+ end
+ private_constant :Private
+ include Private
+
def initialize( source )
self.stream = source
@listeners = []
@@ -198,183 +211,172 @@ def pull_event
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
- word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
- word = word[1] unless word.nil?
- #STDERR.puts "WORD = #{word.inspect}"
- case word
- when COMMENT_START
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
- when XMLDECL_START
- #STDERR.puts "XMLDECL"
- results = @source.match( XMLDECL_PATTERN, true )[1]
- version = VERSION.match( results )
- version = version[1] unless version.nil?
- encoding = ENCODING.match(results)
- encoding = encoding[1] unless encoding.nil?
- if need_source_encoding_update?(encoding)
- @source.encoding = encoding
- end
- if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
- encoding = "UTF-16"
- end
- standalone = STANDALONE.match(results)
- standalone = standalone[1] unless standalone.nil?
- return [ :xmldecl, version, encoding, standalone ]
- when INSTRUCTION_START
+ if @source.match("<?", true)
return process_instruction
- when DOCTYPE_START
- base_error_message = "Malformed DOCTYPE"
- @source.match(DOCTYPE_START, true)
- @nsstack.unshift(curr_ns=Set.new)
- name = parse_name(base_error_message)
- if @source.match(/\A\s*\[/um, true)
- id = [nil, nil, nil]
- @document_status = :in_doctype
- elsif @source.match(/\A\s*>/um, true)
- id = [nil, nil, nil]
- @document_status = :after_doctype
- else
- id = parse_id(base_error_message,
- accept_external_id: true,
- accept_public_id: false)
- if id[0] == "SYSTEM"
- # For backward compatibility
- id[1], id[2] = id[2], nil
+ elsif @source.match("/um, true)[1] ]
+ elsif @source.match("DOCTYPE", true)
+ base_error_message = "Malformed DOCTYPE"
+ unless @source.match(/\s+/um, true)
+ if @source.match(">")
+ message = "#{base_error_message}: name is missing"
+ else
+ message = "#{base_error_message}: invalid name"
+ end
+ @source.string = "/um, true)
+ elsif @source.match(/\s*>/um, true)
+ id = [nil, nil, nil]
@document_status = :after_doctype
else
- message = "#{base_error_message}: garbage after external ID"
- raise REXML::ParseException.new(message, @source)
+ id = parse_id(base_error_message,
+ accept_external_id: true,
+ accept_public_id: false)
+ if id[0] == "SYSTEM"
+ # For backward compatibility
+ id[1], id[2] = id[2], nil
+ end
+ if @source.match(/\s*\[/um, true)
+ @document_status = :in_doctype
+ elsif @source.match(/\s*>/um, true)
+ @document_status = :after_doctype
+ else
+ message = "#{base_error_message}: garbage after external ID"
+ raise REXML::ParseException.new(message, @source)
+ end
end
- end
- args = [:start_doctype, name, *id]
- if @document_status == :after_doctype
- @source.match(/\A\s*/um, true)
- @stack << [ :end_doctype ]
- end
- return args
- when /\A\s+/
- else
- @document_status = :after_doctype
- if @source.encoding == "UTF-8"
- @source.buffer_encoding = ::Encoding::UTF_8
+ args = [:start_doctype, name, *id]
+ if @document_status == :after_doctype
+ @source.match(/\s*/um, true)
+ @stack << [ :end_doctype ]
+ end
+ return args
+ else
+ message = "Invalid XML"
+ raise REXML::ParseException.new(message, @source)
end
end
end
if @document_status == :in_doctype
- md = @source.match(/\A\s*(.*?>)/um)
- case md[1]
- when SYSTEMENTITY
- match = @source.match( SYSTEMENTITY, true )[1]
- return [ :externalentity, match ]
-
- when ELEMENTDECL_START
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
-
- when ENTITY_START
- match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
- ref = false
- if match[1] == '%'
- ref = true
- match.delete_at 1
- end
- # Now we have to sort out what kind of entity reference this is
- if match[2] == 'SYSTEM'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
- elsif match[2] == 'PUBLIC'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match[4] = match[4][1..-2] # HREF
- match.delete_at(5) if match.size > 5 # Chop out NDATA decl
- # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
- else
- match[2] = match[2][1..-2]
- match.pop if match.size == 4
- # match is [ :entity, name, value ]
- end
- match << '%' if ref
- return match
- when ATTLISTDECL_START
- md = @source.match( ATTLISTDECL_PATTERN, true )
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
- element = md[1]
- contents = md[0]
-
- pairs = {}
- values = md[0].scan( ATTDEF_RE )
- values.each do |attdef|
- unless attdef[3] == "#IMPLIED"
- attdef.compact!
- val = attdef[3]
- val = attdef[4] if val == "#FIXED "
- pairs[attdef[0]] = val
- if attdef[0] =~ /^xmlns:(.*)/
- @nsstack[0] << $1
- end
+ @source.match(/\s*/um, true) # skip spaces
+ if @source.match("/um, true)
+ raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
+ return [ :elementdecl, "/um)
- message = "#{base_error_message}: name is missing"
+ # Now we have to sort out what kind of entity reference this is
+ if match[2] == 'SYSTEM'
+ # External reference
+ match[3] = match[3][1..-2] # PUBID
+ match.delete_at(4) if match.size > 4 # Chop out NDATA decl
+ # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
+ elsif match[2] == 'PUBLIC'
+ # External reference
+ match[3] = match[3][1..-2] # PUBID
+ match[4] = match[4][1..-2] # HREF
+ match.delete_at(5) if match.size > 5 # Chop out NDATA decl
+ # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
else
- message = "#{base_error_message}: invalid declaration name"
+ match[2] = match[2][1..-2]
+ match.pop if match.size == 4
+ # match is [ :entity, name, value ]
end
- raise REXML::ParseException.new(message, @source)
- end
- name = parse_name(base_error_message)
- id = parse_id(base_error_message,
- accept_external_id: true,
- accept_public_id: true)
- unless @source.match(/\A\s*>/um, true)
- message = "#{base_error_message}: garbage before end >"
- raise REXML::ParseException.new(message, @source)
+ match << '%' if ref
+ return match
+ elsif @source.match("ATTLIST", true)
+ md = @source.match(ATTLISTDECL_END, true)
+ raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
+ element = md[1]
+ contents = md[0]
+
+ pairs = {}
+ values = md[0].scan( ATTDEF_RE )
+ values.each do |attdef|
+ unless attdef[3] == "#IMPLIED"
+ attdef.compact!
+ val = attdef[3]
+ val = attdef[4] if val == "#FIXED "
+ pairs[attdef[0]] = val
+ if attdef[0] =~ /^xmlns:(.*)/
+ @nsstack[0] << $1
+ end
+ end
+ end
+ return [ :attlistdecl, element, pairs, contents ]
+ elsif @source.match("NOTATION", true)
+ base_error_message = "Malformed notation declaration"
+ unless @source.match(/\s+/um, true)
+ if @source.match(">")
+ message = "#{base_error_message}: name is missing"
+ else
+ message = "#{base_error_message}: invalid name"
+ end
+ @source.string = " /um, true)
+ message = "#{base_error_message}: garbage before end >"
+ raise REXML::ParseException.new(message, @source)
+ end
+ return [:notationdecl, name, *id]
+ elsif md = @source.match(/--(.*?)-->/um, true)
+ case md[1]
+ when /--/, /-\z/
+ raise REXML::ParseException.new("Malformed comment", @source)
+ end
+ return [ :comment, md[1] ] if md
end
- return [:notationdecl, name, *id]
- when DOCTYPE_END
+ elsif match = @source.match(/(%.*?;)\s*/um, true)
+ return [ :externalentity, match[1] ]
+ elsif @source.match(/\]\s*>/um, true)
@document_status = :after_doctype
- @source.match( DOCTYPE_END, true )
return [ :end_doctype ]
end
end
if @document_status == :after_doctype
- @source.match(/\A\s*/um, true)
+ @source.match(/\s*/um, true)
end
begin
- next_data = @source.buffer
- if next_data.size < 2
- @source.read
- next_data = @source.buffer
- end
- if next_data[0] == ?<
- if next_data[1] == ?/
+ if @source.match("<", true)
+ if @source.match("/", true)
@nsstack.shift
last_tag = @tags.pop
- md = @source.match( CLOSE_MATCH, true )
+ md = @source.match(CLOSE_PATTERN, true)
if md and !last_tag
message = "Unexpected top-level end tag (got '#{md[1]}')"
raise REXML::ParseException.new(message, @source)
end
if md.nil? or last_tag != md[1]
message = "Missing end tag for '#{last_tag}'"
- message << " (got '#{md[1]}')" if md
+ message += " (got '#{md[1]}')" if md
+ @source.string = "</" + @source.buffer if md.nil?
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
- elsif next_data[1] == ?!
- md = @source.match(/\A(\s*[^>]*>)/um)
+ elsif @source.match("!", true)
+ md = @source.match(/([^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
- if md[0][2] == ?-
- md = @source.match( COMMENT_PATTERN, true )
+ if md[0][0] == ?-
+ md = @source.match(/--(.*?)-->/um, true)
case md[1]
when /--/, /-\z/
@@ -383,17 +385,18 @@ def pull_event
return [ :comment, md[1] ] if md
else
- md = @source.match( CDATA_PATTERN, true )
+ md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
return [ :cdata, md[1] ] if md
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
- elsif next_data[1] == ??
+ elsif @source.match("?", true)
return process_instruction
else
# Get the next tag
- md = @source.match(TAG_MATCH, true)
+ md = @source.match(TAG_PATTERN, true)
unless md
+ @source.string = "<" + @source.buffer
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
@@ -418,7 +421,7 @@ def pull_event
return [ :start_element, tag, attributes ]
end
else
- md = @source.match( TEXT_PATTERN, true )
+ md = @source.match(/([^<]*)/um, true)
text = md[1]
return [ :text, text ]
end
@@ -462,8 +465,7 @@ def normalize( input, entities=nil, entity_filter=nil )
# Unescapes all possible entities
def unnormalize( string, entities=nil, filter=nil )
- rv = string.clone
- rv.gsub!( /\r\n?/, "\n" )
+ rv = string.gsub( /\r\n?/, "\n" )
matches = rv.scan( REFERENCE_RE )
return rv if matches.size == 0
rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
@@ -498,9 +500,9 @@ def need_source_encoding_update?(xml_declaration_encoding)
end
def parse_name(base_error_message)
- md = @source.match(/\A\s*#{NAME}/um, true)
+ md = @source.match(NAME_PATTERN, true)
unless md
- if @source.match(/\A\s*\S/um)
+ if @source.match(/\s*\S/um)
message = "#{base_error_message}: invalid name"
else
message = "#{base_error_message}: name is missing"
@@ -577,11 +579,28 @@ def parse_id_invalid_details(accept_external_id:,
end
def process_instruction
- match_data = @source.match(INSTRUCTION_PATTERN, true)
+ match_data = @source.match(INSTRUCTION_END, true)
unless match_data
message = "Invalid processing instruction node"
+ @source.string = "<?" + @source.buffer
raise REXML::ParseException.new(message, @source)
end
+ if @document_status.nil? and match_data[1] == "xml"
+ content = match_data[2]
+ version = VERSION.match(content)
+ version = version[1] unless version.nil?
+ encoding = ENCODING.match(content)
+ encoding = encoding[1] unless encoding.nil?
+ if need_source_encoding_update?(encoding)
+ @source.encoding = encoding
+ end
+ if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
+ encoding = "UTF-16"
+ end
+ standalone = STANDALONE.match(content)
+ standalone = standalone[1] unless standalone.nil?
+ return [ :xmldecl, version, encoding, standalone ]
+ end
[:processing_instruction, match_data[1], match_data[2]]
end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index db78a124..4111d1d3 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -76,6 +76,10 @@ def match(pattern, cons=false)
end
end
+ def string=(string)
+ @scanner.string = string
+ end
+
# @return true if the Source is exhausted
def empty?
@scanner.eos?
@@ -150,28 +154,25 @@ def initialize(arg, block_size=500, encoding=nil)
def read
begin
@scanner << readline
+ true
rescue Exception, NameError
@source = nil
+ false
end
end
def match( pattern, cons=false )
- if cons
- md = @scanner.scan(pattern)
- else
- md = @scanner.check(pattern)
- end
- while md.nil? and @source
- begin
- @scanner << readline
- if cons
- md = @scanner.scan(pattern)
- else
- md = @scanner.check(pattern)
- end
- rescue
- @source = nil
+ read if @scanner.eos? && @source
+ while true
+ if cons
+ md = @scanner.scan(pattern)
+ else
+ md = @scanner.check(pattern)
end
+ break if md
+ return nil if pattern.is_a?(String) && pattern.bytesize <= @scanner.rest_size
+ return nil if @source.nil?
+ return nil unless read
end
md.nil? ? nil : @scanner
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 55713909..8faa0b78 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -36,6 +36,21 @@ def test_garbage_plus_before_name_at_line_start
+ r SYSTEM "urn:x-rexml:test" [ ]>
DETAIL
end
+
+ def test_no_name
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+<!DOCTYPE>
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: name is missing
+Line: 3
+Position: 17
+Last 80 unconsumed characters:
+
+ DETAIL
+ end
end
class TestExternalID < self