Merge pull request #2219 from ruby/add-parser

Integrate parser-prism
ruby · Jan 27, 2024 · d6d73d7 · d6d73d7
2 parents d69b9da + 8cdec80
commit d6d73d7
Show file tree

Hide file tree

Showing 14 changed files with 2,564 additions and 11 deletions.
diff --git a/Gemfile b/Gemfile
@@ -12,3 +12,4 @@ group :memcheck do
   gem "ruby_memcheck", platform: %i[mri mswin mingw x64_mingw]
 end
 gem "rbs", platform: %i[mri mswin mingw x64_mingw]
+gem "parser"
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -7,19 +7,24 @@ GEM
   remote: https://rubygems.org/
   specs:
     abbrev (0.1.2)
-    ffi (1.15.5)
+    ast (2.4.2)
+    ffi (1.16.3)
     mini_portile2 (2.8.5)
-    nokogiri (1.15.4)
+    nokogiri (1.16.0)
       mini_portile2 (~> 2.8.2)
       racc (~> 1.4)
+    parser (3.3.0.5)
+      ast (~> 2.4.1)
+      racc
     power_assert (2.0.3)
     racc (1.7.3)
-    rake (13.0.6)
-    rake-compiler (1.2.5)
+    racc (1.7.3-java)
+    rake (13.1.0)
+    rake-compiler (1.2.6)
       rake
-    rbs (3.4.2)
+    rbs (3.4.3)
       abbrev
-    ruby_memcheck (2.2.1)
+    ruby_memcheck (2.3.0)
       nokogiri
     test-unit (3.6.1)
       power_assert
@@ -31,6 +36,7 @@ PLATFORMS
 
 DEPENDENCIES
   ffi
+  parser
   prism!
   rake
   rake-compiler

diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ See the [CONTRIBUTING.md](CONTRIBUTING.md) file for more information. We additio
 * [JavaScript](docs/javascript.md)
 * [Local variable depth](docs/local_variable_depth.md)
 * [Mapping](docs/mapping.md)
+* [Parser translation](docs/parser_translation.md)
 * [Parsing rules](docs/parsing_rules.md)
 * [Releasing](docs/releasing.md)
 * [Ripper](docs/ripper.md)

diff --git a/docs/parser_translation.md b/docs/parser_translation.md
@@ -0,0 +1,34 @@
+# parser translation
+
+Prism ships with the ability to translate its syntax tree into the syntax tree used by the [whitequark/parser](https://github.com/whitequark/parser) gem. This allows you to use tools built on top of the `parser` gem with the `prism` parser.
+
+## Usage
+
+The `parser` gem provides multiple parsers to support different versions of the Ruby grammar. This includes all of the Ruby versions going back to 1.8, as well as third-party parsers like MacRuby and RubyMotion. The `prism` gem provides one more parser, `Prism::Translation::Parser`, which parses the source with `prism` and then translates the result into the syntax tree used by the `parser` gem.
+
+You can use the `prism` parser like you would any other. After requiring the parser, you should be able to call any of the regular `Parser::Base` APIs that you would normally use.
+
+```ruby
+require "prism/translation/parser"
+
+Prism::Translation::Parser.parse_file("path/to/file.rb")
+```
+
+### RuboCop
+
+To run RuboCop using the `prism` gem as the parser, you will need to require the `prism/translation/parser/rubocop` file. This file injects `prism` into the known options for both `rubocop` and `rubocop-ast`, such that you can specify it in your `.rubocop.yml` file. Unfortunately `rubocop` doesn't support any direct way to do this, so we have to get a bit hacky.
+
+First, set the `TargetRubyVersion` in your RuboCop configuration file to `80_82_73_83_77.33`. This is the version of Ruby that `prism` reports itself as. (The leading numbers are the ASCII values for `PRISM`.)
+
+```yaml
+AllCops:
+  TargetRubyVersion: 80_82_73_83_77.33
+```
+
+Now when you run `rubocop` you will need to require the `prism/translation/parser/rubocop` file before executing so that it can inject the `prism` parser into the known options.
+
+```
+bundle exec ruby -rprism/translation/parser/rubocop $(bundle exec which rubocop)
+```
+
+This should run RuboCop using the `prism` parser.
diff --git a/lib/prism.rb b/lib/prism.rb
@@ -26,6 +26,7 @@ module Prism
   autoload :Pack, "prism/pack"
   autoload :Pattern, "prism/pattern"
   autoload :Serialize, "prism/serialize"
+  autoload :Translation, "prism/translation"
   autoload :Visitor, "prism/visitor"
 
   # Some of these constants are not meant to be exposed, so marking them as

diff --git a/lib/prism/node_ext.rb b/lib/prism/node_ext.rb
@@ -81,7 +81,7 @@ def value
   class RationalNode < Node
     # Returns the value of the node as a Ruby Rational.
     def value
-      Rational(numeric.is_a?(IntegerNode) && !numeric.decimal? ? numeric.value : slice.chomp("r"))
+      Rational(numeric.is_a?(IntegerNode) ? numeric.value : slice.chomp("r"))
     end
   end
 

diff --git a/lib/prism/translation.rb b/lib/prism/translation.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module Prism
+  # This module is responsible for converting the prism syntax tree into other
+  # syntax trees. At the moment it only supports converting to the
+  # whitequark/parser gem's syntax tree, but support is planned for the
+  # seattlerb/ruby_parser gem's syntax tree as well.
+  #
+  # The Parser constant is registered with autoload, so the
+  # "prism/translation/parser" file is only loaded when
+  # Prism::Translation::Parser is first referenced.
+  module Translation
+    autoload :Parser, "prism/translation/parser"
+  end
+end
diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb
@@ -0,0 +1,136 @@
+# frozen_string_literal: true
+
+require "parser"
+
+module Prism
+  module Translation
+    # This class is the entry-point for converting a prism syntax tree into the
+    # whitequark/parser gem's syntax tree. It inherits from the base parser for
+    # the parser gem, and overrides the parse* methods to parse with prism and
+    # then translate.
+    class Parser < ::Parser::Base
+      Racc_debug_parser = false # :nodoc:
+
+      # Returns the version number of this parser, which is always 33.
+      # NOTE(review): presumably this stands for the Ruby 3.3 grammar, in line
+      # with the `.33` suffix of the TargetRubyVersion described in
+      # docs/parser_translation.md -- confirm.
+      def version # :nodoc:
+        33
+      end
+
+      # Returns the encoding assumed for source files. The default encoding for
+      # Ruby files is UTF-8, so this is always Encoding::UTF_8.
+      def default_encoding
+        Encoding::UTF_8
+      end
+
+      # Intentionally empty: takes no arguments, does nothing, and returns nil.
+      # NOTE(review): presumably a stub for the error hook that the racc-based
+      # ::Parser::Base machinery expects; it goes unused here because prism
+      # does the parsing -- confirm.
+      def yyerror # :nodoc:
+      end
+
+      # Parses a source buffer and returns the AST.
+      def parse(source_buffer)
+        @source_buffer = source_buffer
+        source = source_buffer.source
+
+        build_ast(
+          Prism.parse(source, filepath: source_buffer.name).value,
+          build_offset_cache(source)
+        )
+      ensure
+        @source_buffer = nil
+      end
+
+      # Parses a source buffer and returns the AST and the source code comments.
+      def parse_with_comments(source_buffer)
+        @source_buffer = source_buffer
+        source = source_buffer.source
+
+        offset_cache = build_offset_cache(source)
+        result = Prism.parse(source, filepath: source_buffer.name)
+
+        [
+          build_ast(result.value, offset_cache),
+          build_comments(result.comments, offset_cache)
+        ]
+      ensure
+        @source_buffer = nil
+      end
+
+      # Parses a source buffer and returns the AST, the source code comments,
+      # and the tokens emitted by the lexer.
+      def tokenize(source_buffer, _recover = false)
+        @source_buffer = source_buffer
+        source = source_buffer.source
+
+        offset_cache = build_offset_cache(source)
+        result = Prism.parse_lex(source, filepath: source_buffer.name)
+        program, tokens = result.value
+
+        [
+          build_ast(program, offset_cache),
+          build_comments(result.comments, offset_cache),
+          build_tokens(tokens, offset_cache)
+        ]
+      ensure
+        @source_buffer = nil
+      end
+
+      # Since prism resolves num params for us, we don't need to support this
+      # kind of logic here.
+      def try_declare_numparam(node)
+        node.children[0].match?(/\A_[1-9]\z/)
+      end
+
+      private
+
+      # Prism deals with offsets in bytes, while the parser gem deals with
+      # offsets in characters. We need to handle this conversion in order to
+      # build the parser gem AST.
+      #
+      # If the bytesize of the source is the same as the length, then we can
+      # just use the offset directly. Otherwise, we build a hash that functions
+      # as a cache for the conversion.
+      #
+      # This is a good opportunity for some optimizations. If the source file
+      # has any multi-byte characters, this can tank the performance of the
+      # translator. We could make this significantly faster by using a
+      # different data structure for the cache.
+      def build_offset_cache(source)
+        if source.bytesize == source.length
+          -> (offset) { offset }
+        else
+          Hash.new do |hash, offset|
+            hash[offset] = source.byteslice(0, offset).length
+          end
+        end
+      end
+
+      # Build the parser gem AST from the prism AST.
+      def build_ast(program, offset_cache)
+        program.accept(Compiler.new(self, offset_cache))
+      end
+
+      # Build the parser gem comments from the prism comments.
+      def build_comments(comments, offset_cache)
+        comments.map do |comment|
+          location = comment.location
+
+          ::Parser::Source::Comment.new(
+            ::Parser::Source::Range.new(
+              source_buffer,
+              offset_cache[location.start_offset],
+              offset_cache[location.end_offset]
+            )
+          )
+        end
+      end
+
+      # Build the parser gem tokens from the prism tokens.
+      def build_tokens(tokens, offset_cache)
+        Lexer.new(source_buffer, tokens.map(&:first), offset_cache).to_a
+      end
+
+      require_relative "parser/compiler"
+      require_relative "parser/lexer"
+
+      private_constant :Compiler
+      private_constant :Lexer
+    end
+  end
+end