/*
 * Patch summary: add a token-level lexing API to RBS::Parser.
 *
 * ext/rbs_extension/parser.c
 *   - Adds rbsparser_lex(self, buffer, end_pos), registered as the singleton
 *     method RBS::Parser._lex with 2 arguments. It allocates a lexstate over
 *     the buffer from position 0 to end_pos, repeatedly calls
 *     rbsparser_next_token, and returns an Array of [type, location] pairs.
 *     The token type is converted to a Symbol from token_type_str; the pEOF
 *     token is pushed before the loop stops, so it is the last entry.
 * ext/rbs_extension/parserstate.c, parserstate.h
 *   - The function that used to be alloc_parser is renamed to alloc_lexer
 *     (without the `variables` parameter) and returns the lexstate;
 *     alloc_parser is re-added and now obtains its lexstate from alloc_lexer.
 * lib/rbs/parser/lex_result.rb, lib/rbs/parser/token.rb
 *   - New value classes: LexResult (buffer, value) and Token (type, location,
 *     value = location.source, comment? for :tCOMMENT / :tLINECOMMENT).
 * lib/rbs/parser_aux.rb
 *   - Adds RBS::Parser.lex(source): builds a buffer, calls _lex with
 *     buf.last_position, wraps each pair in Token and returns a LexResult.
 * sig/parser.rbs, test/rbs/parser_test.rb
 *   - Signatures for the new classes/methods and a test for _lex.
 *
 * NOTE(review): in rbsparser_lex, free(lexer) is only reached on the normal
 *   return path. The neighbouring rbsparser_parse_signature (context line of
 *   the first hunk) releases its state through rb_ensure; if any Ruby API call
 *   in the loop can raise, the lexstate would presumably not be freed --
 *   confirm whether the same rb_ensure pattern is wanted here.
 * NOTE(review): the RBS doc comment added for self.lex says "Parse whole RBS
 *   file", while the method tokenizes and returns a LexResult -- consider
 *   rewording upstream.
 * NOTE(review): line breaks and indentation below were restored from a
 *   whitespace-collapsed copy of this patch; blank context lines are placed to
 *   satisfy the hunk line counts. Verify against the target files before
 *   applying.
 */
diff --git a/ext/rbs_extension/parser.c b/ext/rbs_extension/parser.c
index 8434b4fb1..cafb4a0d5 100644
--- a/ext/rbs_extension/parser.c
+++ b/ext/rbs_extension/parser.c
@@ -2885,9 +2885,29 @@ rbsparser_parse_signature(VALUE self, VALUE buffer, VALUE end_pos)
   return rb_ensure(parse_signature_try, (VALUE)parser, ensure_free_parser, (VALUE)parser);
 }
 
+static VALUE
+rbsparser_lex(VALUE self, VALUE buffer, VALUE end_pos) {
+  lexstate *lexer = alloc_lexer(buffer, 0, FIX2INT(end_pos));
+  VALUE results = rb_ary_new();
+
+  token token = NullToken;
+  while (token.type != pEOF) {
+    token = rbsparser_next_token(lexer);
+    VALUE type = ID2SYM(rb_intern(token_type_str(token.type)));
+    VALUE location = rbs_new_location(buffer, token.range);
+    VALUE pair = rb_ary_new3(2, type, location);
+    rb_ary_push(results, pair);
+  }
+
+  free(lexer);
+
+  return results;
+}
+
 void rbs__init_parser(void) {
   RBS_Parser = rb_define_class_under(RBS, "Parser", rb_cObject);
   rb_define_singleton_method(RBS_Parser, "_parse_type", rbsparser_parse_type, 5);
   rb_define_singleton_method(RBS_Parser, "_parse_method_type", rbsparser_parse_method_type, 5);
   rb_define_singleton_method(RBS_Parser, "_parse_signature", rbsparser_parse_signature, 2);
+  rb_define_singleton_method(RBS_Parser, "_lex", rbsparser_lex, 2);
 }
diff --git a/ext/rbs_extension/parserstate.c b/ext/rbs_extension/parserstate.c
index b10bd7295..5e300a373 100644
--- a/ext/rbs_extension/parserstate.c
+++ b/ext/rbs_extension/parserstate.c
@@ -272,7 +272,7 @@ VALUE comment_to_ruby(comment *com, VALUE buffer) {
   );
 }
 
-parserstate *alloc_parser(VALUE buffer, int start_pos, int end_pos, VALUE variables) {
+lexstate *alloc_lexer(VALUE buffer, int start_pos, int end_pos) {
   VALUE string = rb_funcall(buffer, rb_intern("content"), 0);
 
   StringValue(string);
@@ -290,6 +290,11 @@ parserstate *alloc_parser(VALUE buffer, int start_pos, int end_pos, VALUE variab
   lexer->start = lexer->current;
   lexer->first_token_of_line = lexer->current.column == 0;
 
+  return lexer;
+}
+
+parserstate *alloc_parser(VALUE buffer, int start_pos, int end_pos, VALUE variables) {
+  lexstate *lexer = alloc_lexer(buffer, start_pos, end_pos);
   parserstate *parser = calloc(1, sizeof(parserstate));
   parser->lexstate = lexer;
   parser->buffer = buffer;
diff --git a/ext/rbs_extension/parserstate.h b/ext/rbs_extension/parserstate.h
index e42bf7cd8..4f495dbbe 100644
--- a/ext/rbs_extension/parserstate.h
+++ b/ext/rbs_extension/parserstate.h
@@ -93,6 +93,15 @@ void parser_insert_typevar(parserstate *state, ID id);
  * */
 bool parser_typevar_member(parserstate *state, ID id);
 
+/**
+ * Allocate new lexstate object.
+ *
+ * ```
+ * alloc_lexer(buffer, 0, 31) // New lexstate with buffer
+ * ```
+ * */
+lexstate *alloc_lexer(VALUE buffer, int start_pos, int end_pos);
+
 /**
  * Allocate new parserstate object.
  *
diff --git a/lib/rbs/parser/lex_result.rb b/lib/rbs/parser/lex_result.rb
new file mode 100644
index 000000000..4cbf37461
--- /dev/null
+++ b/lib/rbs/parser/lex_result.rb
@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+
+module RBS
+  class Parser
+    class LexResult
+      attr_reader :buffer
+      attr_reader :value
+
+      def initialize(buffer:, value:)
+        @buffer = buffer
+        @value = value
+      end
+    end
+  end
+end
diff --git a/lib/rbs/parser/token.rb b/lib/rbs/parser/token.rb
new file mode 100644
index 000000000..913b419c8
--- /dev/null
+++ b/lib/rbs/parser/token.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: true
+
+module RBS
+  class Parser
+    class Token
+      attr_reader :type
+      attr_reader :location
+
+      def initialize(type:, location:)
+        @type = type
+        @location = location
+      end
+
+      def value
+        @location.source
+      end
+
+      def comment?
+        @type == :tCOMMENT || @type == :tLINECOMMENT
+      end
+    end
+  end
+end
diff --git a/lib/rbs/parser_aux.rb b/lib/rbs/parser_aux.rb
index f0bc1fea4..37999ce90 100644
--- a/lib/rbs/parser_aux.rb
+++ b/lib/rbs/parser_aux.rb
@@ -1,5 +1,8 @@
 # frozen_string_literal: true
 
+require_relative "parser/lex_result"
+require_relative "parser/token"
+
 module RBS
   class Parser
     def self.parse_type(source, range: 0..., variables: [], require_eof: false)
@@ -19,6 +22,15 @@ def self.parse_signature(source)
       [buf, dirs, decls]
     end
 
+    def self.lex(source)
+      buf = buffer(source)
+      list = _lex(buf, buf.last_position)
+      value = list.map do |type, location|
+        Token.new(type: type, location: location)
+      end
+      LexResult.new(buffer: buf, value: value)
+    end
+
     def self.buffer(source)
       case source
       when String
diff --git a/sig/parser.rbs b/sig/parser.rbs
index 05d5e5a7d..0c30a5a7d 100644
--- a/sig/parser.rbs
+++ b/sig/parser.rbs
@@ -1,5 +1,23 @@
 module RBS
   class Parser
+    # Result of `Parser.lex`
+    class LexResult
+      attr_reader buffer: Buffer
+      attr_reader value: Array[Token]
+
+      def initialize: (buffer: Buffer, value: Array[Token]) -> void
+    end
+
+    # Represents a token per result of `Parser.lex`.
+    class Token
+      attr_reader type: Symbol
+      attr_reader location: Location[untyped, untyped]
+
+      def initialize: (type: Symbol, location: Location[untyped, untyped]) -> void
+      def value: () -> String
+      def comment?: () -> bool
+    end
+
     # Parse a method type and return it
     #
     # When `range` keyword is specified, it starts parsing from the `begin` to the `end` of the range.
@@ -50,6 +68,14 @@ module RBS
     #
     def self.parse_signature: (Buffer | String) -> [Buffer, Array[AST::Directives::t], Array[AST::Declarations::t]]
 
+    # Parse whole RBS file and return result.
+    #
+    # ```ruby
+    # RBS::Parser.lex("# Comment\nmodule A\nend\n").value.map(&:type)
+    # # => [:tLINECOMMENT, :kMODULE, :tUIDENT, :kEND, :pEOF]
+    # ```
+    def self.lex: (Buffer | String) -> LexResult
+
     KEYWORDS: Hash[String, bot]
 
     private
@@ -62,6 +88,8 @@ module RBS
 
     def self._parse_signature: (Buffer, Integer end_pos) -> [Array[AST::Directives::t], Array[AST::Declarations::t]]
 
+    def self._lex: (Buffer, Integer end_pos) -> Array[[Symbol, Location[untyped, untyped]]]
+
     class LocatedValue
     end
   end
diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb
index a29d992ac..b1b9b3e04 100644
--- a/test/rbs/parser_test.rb
+++ b/test/rbs/parser_test.rb
@@ -767,4 +767,26 @@ def test_proc__untyped_function_parse_error
       RBS::Parser.parse_type("^(?) { (?) -> void } -> Integer")
     end
   end
+
+  def test__lex
+    content = <<~RBS
+      # LineComment
+      class Foo[T < Integer] < Bar # Comment
+      end
+    RBS
+    tokens = RBS::Parser._lex(buffer(content), content.length)
+    assert_equal [:tLINECOMMENT, '# LineComment', 0...13], tokens[0].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:kCLASS, 'class', 14...19], tokens[1].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:tUIDENT, 'Foo', 20...23], tokens[2].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:pLBRACKET, '[', 23...24], tokens[3].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:tUIDENT, 'T', 24...25], tokens[4].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:pLT, '<', 26...27], tokens[5].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:tUIDENT, 'Integer', 28...35], tokens[6].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:pRBRACKET, ']', 35...36], tokens[7].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:pLT, '<', 37...38], tokens[8].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:tUIDENT, 'Bar', 39...42], tokens[9].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:tCOMMENT, '# Comment', 43...52], tokens[10].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:kEND, 'end', 53...56], tokens[11].then { |t| [t[0], t[1].source, t[1].range] }
+    assert_equal [:pEOF, '', 57...58], tokens[12].then { |t| [t[0], t[1].source, t[1].range] }
+  end
 end