diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index b8fccc2ac7..7d61fbbaf1 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -46,7 +46,6 @@ def initialize(source, start_line = 1, offsets = []) @source = source @start_line = start_line # set after parsing is done @offsets = offsets # set after parsing is done - @code_units_caches = {} end # Returns the encoding of the source code, which is set by parameters to the @@ -105,81 +104,26 @@ def character_column(byte_offset) # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the # concept of code units that differs from the number of characters in other # encodings, it is not captured here. - def code_units_offset(byte_offset, encoding) - (@code_units_caches[encoding] ||= CodeUnitsCache.new(source, encoding))[byte_offset] - end - - # A cache that can be used to quickly compute code unit offsets from byte - # offsets. It purposefully provides only a single #[] method to access the - # cache in order to minimize surface area. - # - # Note that there are some known issues here that may or may not be - # addressed in the future: # - # * The first is that there are issues when the cache computes values that - # are not on character boundaries. This can result in subsequent - # computations being off by one or more code units. - # * The second is that this cache is currently unbounded. In theory we could - # introduce some kind of LRU cache to limit the number of entries, but - # this has not yet been implemented. 
- # - class CodeUnitsCache - class UTF16Counter # :nodoc: - def initialize(source, encoding) - @source = source - @encoding = encoding - end - - def count(byte_offset, byte_length) - @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2 - end - end - - class LengthCounter # :nodoc: - def initialize(source, encoding) - @source = source - @encoding = encoding - end - - def count(byte_offset, byte_length) - @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length - end - end - - private_constant :UTF16Counter, :LengthCounter - - # Initialize a new cache with the given source and encoding. - def initialize(source, encoding) - @source = source - @counter = - if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE - UTF16Counter.new(source, encoding) - else - LengthCounter.new(source, encoding) - end - - @cache = {} - @offsets = [] - end + # We purposefully replace invalid and undefined characters with replacement + # characters in this conversion. This happens for two reasons. First, it's + # possible that the given byte offset will not occur on a character + # boundary. Second, it's possible that the source code will contain a + # character that has no equivalent in the given encoding. + def code_units_offset(byte_offset, encoding) + byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace) - # Retrieve the code units offset from the given byte offset. - def [](byte_offset) - @cache[byte_offset] ||= - if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil? 
- @offsets << byte_offset - @counter.count(0, byte_offset) - elsif index == 0 - @offsets.unshift(byte_offset) - @counter.count(0, byte_offset) - else - @offsets.insert(index, byte_offset) - offset = @offsets[index - 1] - @cache[offset] + @counter.count(offset, byte_offset - offset) - end + if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE + byteslice.bytesize / 2 + else + byteslice.length end end - private_constant :CodeUnitsCache + # Build a new CodeUnitsCache over this source for the given encoding. The result answers #[] with a byte offset and returns the matching code units offset. A new, empty cache is created on every call, so callers should hold on to the returned object. + def code_units_cache(encoding) + CodeUnitsCache.new(source, encoding) + end # Returns the column number in code units for the given encoding for the # given byte offset. @@ -239,6 +183,11 @@ def code_units_offset(byte_offset, encoding) byte_offset end + # Specialized version of `code_units_cache` that returns a lambda mapping every byte offset to itself. Lambdas respond to #[], so the result can be used in the same way as a CodeUnitsCache. + def code_units_cache(encoding) + ->(byte_offset) { byte_offset } + end + # Specialized version of `code_units_column` that does not depend on # `code_units_offset`, which is a more expensive operation. This is # essentially the same as `Prism::Source#column`. @@ -247,6 +196,76 @@ def code_units_column(byte_offset, encoding) end end + # A cache that can be used to quickly compute code unit offsets from byte + # offsets. It purposefully provides only a single #[] method to access the + # cache in order to minimize surface area. + # + # Note that there are some known issues here that may or may not be addressed + # in the future: + # + # * The first is that there are issues when the cache computes values that are + # not on character boundaries. This can result in subsequent computations + # being off by one or more code units. + # * The second is that this cache is currently unbounded. In theory we could + # introduce some kind of LRU cache to limit the number of entries, but this + # has not yet been implemented. 
+ # + class CodeUnitsCache + class UTF16Counter # :nodoc: + def initialize(source, encoding) + @source = source + @encoding = encoding + end + + def count(byte_offset, byte_length) + @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2 + end + end + + class LengthCounter # :nodoc: + def initialize(source, encoding) + @source = source + @encoding = encoding + end + + def count(byte_offset, byte_length) + @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length + end + end + + private_constant :UTF16Counter, :LengthCounter + + # Initialize a new cache with the given source and encoding. + def initialize(source, encoding) + @source = source + @counter = + if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE + UTF16Counter.new(source, encoding) + else + LengthCounter.new(source, encoding) + end + + @cache = {} + @offsets = [] + end + + # Retrieve the code units offset from the given byte offset. Results are memoized per byte offset. NOTE(review): when no cached offset is greater than byte_offset, the count is taken from byte 0 rather than from the largest cached offset; confirm this is intended. + def [](byte_offset) + @cache[byte_offset] ||= + if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil? + @offsets << byte_offset + @counter.count(0, byte_offset) + elsif index == 0 + @offsets.unshift(byte_offset) + @counter.count(0, byte_offset) + else + @offsets.insert(index, byte_offset) + offset = @offsets[index - 1] + @cache[offset] + @counter.count(offset, byte_offset - offset) + end + end + end + # This represents a location in the source. class Location # A Source object that is used to determine more information from the given @@ -348,6 +367,11 @@ def start_code_units_offset(encoding = Encoding::UTF_16LE) source.code_units_offset(start_offset, encoding) end + # The offset from the start of the file in code units where this location starts, looked up through the given cache via #[]. + def cache_start_code_units_offset(code_units_cache) + code_units_cache[start_offset] + end + # The byte offset from the beginning of the source where this location ends. 
def end_offset start_offset + length @@ -364,6 +388,11 @@ def end_code_units_offset(encoding = Encoding::UTF_16LE) source.code_units_offset(end_offset, encoding) end + # The offset from the start of the file in code units where this location ends, looked up through the given cache via #[]. + def cache_end_code_units_offset(code_units_cache) + code_units_cache[end_offset] + end + # The line number where this location starts. def start_line source.line(start_offset) @@ -398,6 +427,11 @@ def start_code_units_column(encoding = Encoding::UTF_16LE) source.code_units_column(start_offset, encoding) end + # The column number in code units where this location starts, computed from the given cache as the value for start_offset minus the value for source.line_start(start_offset). + def cache_start_code_units_column(code_units_cache) + code_units_cache[start_offset] - code_units_cache[source.line_start(start_offset)] + end + # The column number in bytes where this location ends from the start of the # line. def end_column @@ -416,6 +450,11 @@ def end_code_units_column(encoding = Encoding::UTF_16LE) source.code_units_column(end_offset, encoding) end + # The column number in code units where this location ends, computed from the given cache as the value for end_offset minus the value for source.line_start(end_offset). + def cache_end_code_units_column(code_units_cache) + code_units_cache[end_offset] - code_units_cache[source.line_start(end_offset)] + end + # Implement the hash pattern matching interface for Location. 
def deconstruct_keys(keys) { start_offset: start_offset, end_offset: end_offset } diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb index 3d3e7dd562..34a39d009b 100644 --- a/test/prism/ruby/location_test.rb +++ b/test/prism/ruby/location_test.rb @@ -61,45 +61,46 @@ def test_character_offsets end def test_code_units - program = Prism.parse("šŸ˜€ + šŸ˜€\nšŸ˜ ||= šŸ˜").value + result = Prism.parse("šŸ˜€ + šŸ˜€\nšŸ˜ ||= šŸ˜") + program = result.value # first šŸ˜€ location = program.statements.body.first.receiver.location - assert_equal 0, location.start_code_units_offset(Encoding::UTF_8) - assert_equal 0, location.start_code_units_offset(Encoding::UTF_16LE) - assert_equal 0, location.start_code_units_offset(Encoding::UTF_32LE) + assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 0, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 1, location.end_code_units_offset(Encoding::UTF_8) - assert_equal 2, location.end_code_units_offset(Encoding::UTF_16LE) - assert_equal 1, location.end_code_units_offset(Encoding::UTF_32LE) + assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 2, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 1, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 0, location.start_code_units_column(Encoding::UTF_8) - assert_equal 0, location.start_code_units_column(Encoding::UTF_16LE) - assert_equal 0, location.start_code_units_column(Encoding::UTF_32LE) + assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 0, 
location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 0, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 1, location.end_code_units_column(Encoding::UTF_8) - assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE) - assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE) + assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 2, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 1, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE)) # second šŸ˜€ location = program.statements.body.first.arguments.arguments.first.location - assert_equal 4, location.start_code_units_offset(Encoding::UTF_8) - assert_equal 5, location.start_code_units_offset(Encoding::UTF_16LE) - assert_equal 4, location.start_code_units_offset(Encoding::UTF_32LE) + assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 5, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 4, location.cache_start_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 5, location.end_code_units_offset(Encoding::UTF_8) - assert_equal 7, location.end_code_units_offset(Encoding::UTF_16LE) - assert_equal 5, location.end_code_units_offset(Encoding::UTF_32LE) + assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 7, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 5, location.cache_end_code_units_offset(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 4, location.start_code_units_column(Encoding::UTF_8) - assert_equal 5, 
location.start_code_units_column(Encoding::UTF_16LE) - assert_equal 4, location.start_code_units_column(Encoding::UTF_32LE) + assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 5, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 4, location.cache_start_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE)) - assert_equal 5, location.end_code_units_column(Encoding::UTF_8) - assert_equal 7, location.end_code_units_column(Encoding::UTF_16LE) - assert_equal 5, location.end_code_units_column(Encoding::UTF_32LE) + assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_8)) + assert_equal 7, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_16LE)) + assert_equal 5, location.cache_end_code_units_column(result.source.code_units_cache(Encoding::UTF_32LE)) # first šŸ˜ location = program.statements.body.last.name_loc