Skip to content

Commit

Permalink
feat: Extend line continuation support, implement booleans.
Browse files Browse the repository at this point in the history
Make line continuations work in literals and identifiers.

Unify true and false into bool.

Improve Makefile.
  • Loading branch information
reiniscirpons committed May 26, 2024
1 parent a535485 commit 592483d
Show file tree
Hide file tree
Showing 9 changed files with 11,056 additions and 10,652 deletions.
22 changes: 11 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ define get_files
find ${$@_INPUT_DIRECTORY} -type f -regextype sed -regex ${$@_REGEX} -exec sh -c 'new=${$@_OUTPUT_DIRECTORY}/temp-$$(echo "{}" | tr "/" "-" | tr " " "_"); cp "{}" "$$new"' \;
endef

.PHONY: clean create_gap_tests create_pkg_tests test_g test_tst test_all
.PHONY: clean create_gap_tests create_pkg_tests test_gap test_pkg test_all

$(GAP_DIR):
git clone --depth=1 https://github.com/gap-system/gap $(GAP_DIR)
Expand All @@ -21,22 +21,22 @@ $(GAP_DIR)/pkg: $(GAP_DIR)

create_gap_tests: $(GAP_DIR)
mkdir -p $(EXAMPLES_DIR)/temp_gap
@$(call get_files, ".*\.\(gd\|gi\|g\|tst\)", $(GAP_DIR)/grp, $(EXAMPLES_DIR)/temp_gap)
@$(call get_files, ".*\.\(gd\|gi\|g\|tst\)", $(GAP_DIR)/lib, $(EXAMPLES_DIR)/temp_gap)
@$(call get_files, ".*\.\(gd\|gi\|g\|tst\)", $(GAP_DIR)/tst, $(EXAMPLES_DIR)/temp_gap)
@$(call get_files, ".*\.\(gd\|gi\|g\)", $(GAP_DIR)/grp, $(EXAMPLES_DIR)/temp_gap)
@$(call get_files, ".*\.\(gd\|gi\|g\)", $(GAP_DIR)/lib, $(EXAMPLES_DIR)/temp_gap)
@$(call get_files, ".*\.\(gd\|gi\|g\)", $(GAP_DIR)/tst, $(EXAMPLES_DIR)/temp_gap)

create_pkg_tests: $(GAP_DIR)/pkg
mkdir -p $(EXAMPLES_DIR)/temp_pkg
@$(call get_files, ".*\.\(gd\|gi\|g\|tst\)", $(GAP_DIR)/pkg, $(EXAMPLES_DIR)/temp_pkg)
@$(call get_files, ".*\.\(gd\|gi\|g\)", $(GAP_DIR)/pkg, $(EXAMPLES_DIR)/temp_pkg)

test_g: create_gap_tests
tree-sitter parse '$(EXAMPLES_DIR)/**/*.g*' --quiet --stat
test_gap: create_gap_tests
tree-sitter parse '$(EXAMPLES_DIR)/temp_gap/*.g*' --quiet --stat

test_tst: create_gap_tests
tree-sitter parse '$(EXAMPLES_DIR)/**/*.tst' --quiet --stat
test_pkg: create_pkg_tests
tree-sitter parse '$(EXAMPLES_DIR)/temp_pkg/*.g*' --quiet --stat

test_all: create_gap_tests
tree-sitter parse '$(EXAMPLES_DIR)/**/.*' --quiet --stat
test_all: create_gap_tests create_pkg_tests
tree-sitter parse '$(EXAMPLES_DIR)/**/*.g*' --quiet --stat

clean:
rm -rf $(EXAMPLES_DIR)/temp_gap
Expand Down
205 changes: 144 additions & 61 deletions grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,23 @@ const PREC = {
CALL: 13,
}

// Base regular expressions for the lexical tokens of the grammar, kept in
// one place so they can be passed to lineContinuation() or used directly.
const LITERAL_REGEXP = {
// Zero or more of [a-zA-Z_@0-9] or backslash-escaped characters, then one
// mandatory letter / '_' / '@' / backslash-escaped character, then zero or
// more of [a-zA-Z_@0-9]. The mandatory middle part means a run consisting
// only of digits does not match.
IDENTIFIER: /([a-zA-Z_@0-9]|\\.)*([a-zA-Z_@]|\\.)[a-zA-Z_@0-9]*/,
// One or more decimal digits.
INTEGER: /[0-9]+/,
// A backslash followed by one of: a single character that is not an octal
// digit, CR or LF; `0x` plus exactly two hex digits; exactly three octal
// digits.
ESCAPE_SEQUENCE: /\\([^0-7\r\n]|0x[0-9a-fA-F]{2,2}|[0-7]{3,3})/,
// A backslash immediately followed by a line break (LF or CRLF).
LINE_CONTINUATION: /\\\r?\n/,
}

module.exports = grammar({
name: 'GAP',

externals: $ => [
$.line_continuation
],

extras: $ => [
$.comment,
/\s/,
$.line_continuation,
$._line_continuation,
],

inline: $ => [
Expand All @@ -44,7 +50,9 @@ module.exports = grammar({
word: $ => $.identifier,

rules: {
// TODO: add support for test files
// TODO: add support for GAP tst file syntax. This probably needs to be
// a separate tree-sitter project which imports the base GAP syntax, similar
// to how the cpp grammar is implemented (it imports the c grammar).
source_file: $ => repeat(
choice(
$._expression,
Expand Down Expand Up @@ -161,8 +169,7 @@ module.exports = grammar({

$.integer,
$.float,
$.true,
$.false,
$.bool,
$.tilde,
$.char,
$.string,
Expand Down Expand Up @@ -279,31 +286,14 @@ module.exports = grammar({
$._expression
)),

// GAP source file location: src/scanner.c GetNumber
integer: $ => lineContinuation(
LITERAL_REGEXP.INTEGER,
LITERAL_REGEXP.LINE_CONTINUATION,
),

// GAP source file location: src/scanner.c GetNumber
float: _ => {
const digits = /[0-9]+/;
const exponent = /[edqEDQ][\+-]?[0-9]+/;

const middle_period = token(seq(
digits,
'.',
digits,
optional(exponent),
));

const leading_period_with_exponent = token(seq(
'.',
digits,
exponent,
));

const trailing_period_with_exponent = token(seq(
digits,
'.',
exponent,
));

// TODO: trailing period floats currently cause issues with ranges e.g.
// [1..10] fails producing the parse (list_expression (float) (Error))
// since it (correctly) tries to parse the prefix [1. as the start of a list
Expand All @@ -312,37 +302,41 @@ module.exports = grammar({
// In particular we need two characters of lookahead when our parser has processed
// the prefix [1, with these two characters we check if we have 1. or 1.. .
// Looks like we need to add an external scanner for this.
const trailing_period = token(seq(
digits,
'.',
));
const trailing_period = lineContinuation(
/[0-9]+\./,
LITERAL_REGEXP.LINE_CONTINUATION,
);

const leading_period = token(prec(-1,seq(
'.',
digits,
)));
const middle_period = lineContinuation(
/[0-9]+\.[0-9]+/,
LITERAL_REGEXP.LINE_CONTINUATION,
);

// TODO: Leading periods currently conflict with record selectors
const leading_period = lineContinuation(
/\.[0-9]+/,
LITERAL_REGEXP.LINE_CONTINUATION,
);

const float_with_exponent = lineContinuation(
/([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)[edqEDQ][\+-]?[0-9]+/,
LITERAL_REGEXP.LINE_CONTINUATION,
);

return choice(
//leading_period,
middle_period,
leading_period_with_exponent,
trailing_period_with_exponent,
//trailing_period
//trailing_period,
float_with_exponent,
);
},

// GAP source file location: src/scanner.c GetNumber
integer: $ => /[0-9]+/,

true: $ => 'true',

false: $ => 'false',
bool: _ => choice('true', 'false'),

char: $ => seq(
'\'',
choice(
token.immediate(prec(1, /[^\n]/)),
token.immediate(prec(1, /[^\\\r\n]/)),
$.escape_sequence
),
'\''
Expand Down Expand Up @@ -373,14 +367,15 @@ module.exports = grammar({
$.escape_sequence
)),

escape_sequence: _ => token(prec(1, seq(
'\\',
choice(
/[^0-7]/, // single character
/0x[0-9a-fA-F]{2,2}/, // hex code
/[0-7]{3,3}/, // octal
)
))),
// GAP source file location: src/scanner.c GetEscapedChar
escape_sequence: _ => lineContinuation(
LITERAL_REGEXP.ESCAPE_SEQUENCE,
LITERAL_REGEXP.LINE_CONTINUATION,
),

// TODO: restrict where tilde can be used, i.e., only "inside" a list or
// record expression (but at arbitrary depth)
tilde: $ => '~',


function: $ => seq(
Expand Down Expand Up @@ -447,10 +442,6 @@ module.exports = grammar({
"local", commaSep1($.identifier), ";"
),

// TODO: restrict where tilde can be used, i.e., only "inside" a list or
// record expression (but at arbitrary depth)
tilde: $ => '~',

call: $ => prec(PREC.CALL, seq(
field('function', choice(
$._variable,
Expand Down Expand Up @@ -544,19 +535,23 @@ module.exports = grammar({
')',
),

identifier: _ => /([a-zA-Z_@0-9]|(\\.))*([a-zA-Z_@]|(\\.))[a-zA-Z_@0-9]*/,
identifier: $ => lineContinuation(
LITERAL_REGEXP.IDENTIFIER,
LITERAL_REGEXP.LINE_CONTINUATION,
),

qualified_identifier: $ => seq(
$.qualifier,
$.identifier
),

qualifier: _ => choice('readonly', 'readwrite'),

comment: _ => token(seq('#', /.*/)),

// TODO: implement external scanner for line continuations
// line_continuation: _ => token(seq('\\', choice(seq(optional('\r'), '\n'), '\0'))),
// GAP source file location: src/io.c GetNextChar
_line_continuation: _ => LITERAL_REGEXP.LINE_CONTINUATION,

qualifier: _ => choice('readonly', 'readwrite')

}
});
Expand All @@ -568,3 +563,91 @@ function commaSep(rule) {
// Builds a rule consisting of `rule` followed by zero or more repetitions
// of a ',' and another `rule`, i.e. a non-empty comma-separated list.
function commaSep1(rule) {
  const commaThenRule = seq(',', rule);
  const remaining = repeat(commaThenRule);
  return seq(rule, remaining);
}

// Rewrites a regular expression so that an arbitrary number of line
// continuations may appear after every atom of the base expression.
// Roughly speaking, if L is the regex matching the line continuation,
// and T is this function, then
// T(x) = (xL*) if x is a character class
// T((A)) = (T(A))
// T(AB) = T(A)T(B)
// T(A | B) = T(A) | T(B)
// T(A*) = T(A)*
// The transformation is done in a single linear pass over the pattern source
// by detecting occurrences of character classes (bracket expressions,
// backslash escapes and plain characters) and wrapping each one.
//
// Parameters:
//   base_regex              - RegExp whose source is transformed; its flags
//                             are carried over to the result.
//   line_continuation_regex - RegExp matching one line continuation; only
//                             its source is used.
// Returns: a new RegExp built from the transformed source.
//
// Supported input (informal grammar):
// <RegExp> ::= <CharacterClass>
//            | '(', <RegExp>, ')'
//            | <RegExp>, <RegExp>
//            | <RegExp>, '|', <RegExp>
//            | <RegExp>, <Quantifier>
// <Quantifier> ::= '*' | '+' | '?' | '{...}', each optionally followed by '?'
// <CharacterClass> ::= '[', <anything, possibly with escaped ']'>, ']'
//                    | '\\', <any single character>
//                    | <any other single character>
//
// Limitations: an escape is assumed to be exactly one character long, so
// multi-character escapes (e.g. \x41) and group prefixes such as (?: are not
// handled; every character that is not one of * + ? | ( ) [ { \ is wrapped
// as an atom.
function lineContinuation(base_regex, line_continuation_regex) {
  const continuation = '(' + line_continuation_regex.source + ')*';
  // Operators and grouping characters that are copied through untouched.
  const passthrough = new Set(['*', '+', '?', '|', '(', ')']);

  // Scanner states:
  //   'default'      - between atoms
  //   'escape'       - just saw a backslash outside a bracket expression
  //   'class'        - inside [...]
  //   'class_escape' - just saw a backslash inside [...]
  //   'quantifier'   - inside {...}
  let state = 'default';
  let out = '';

  for (const c of base_regex.source) {
    switch (state) {
      case 'quantifier':
        // Copy the quantifier body verbatim up to and including '}'.
        out += c;
        if (c === '}') {
          state = 'default';
        }
        break;

      case 'class_escape':
        // The escaped character never terminates the bracket expression.
        out += c;
        state = 'class';
        break;

      case 'class':
        if (c === ']') {
          // Close the bracket expression and the group opened at '['.
          out += c + continuation + ')';
          state = 'default';
        } else {
          out += c;
          if (c === '\\') {
            state = 'class_escape';
          }
        }
        break;

      case 'escape':
        // Close the group opened at the backslash.
        out += c + continuation + ')';
        state = 'default';
        break;

      default:
        if (c === '\\') {
          out += '(' + c;
          state = 'escape';
        } else if (c === '[') {
          out += '(' + c;
          state = 'class';
        } else if (c === '{') {
          out += c;
          state = 'quantifier';
        } else if (passthrough.has(c)) {
          out += c;
        } else {
          // Plain single-character atom.
          out += '(' + c + continuation + ')';
        }
        break;
    }
  }

  // Keep the flags of the base expression; the previous implementation
  // silently discarded them.
  return new RegExp(out, base_regex.flags);
}
Loading

0 comments on commit 592483d

Please sign in to comment.