Skip to content

Commit

Permalink
Expand monarch functionality to allow state access within rules (#183463
Browse files Browse the repository at this point in the history
)

* Expand monarch functionality to allow state access within rule regular expressions

* Forgot to stage

* Fix substitution regex, add caching and a unit test

---------

Co-authored-by: Alexandru Dima <alexdima@microsoft.com>
  • Loading branch information
jeremy-rifkin and alexdima authored Mar 19, 2024
1 parent a8c5e10 commit 20deb62
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 9 deletions.
22 changes: 21 additions & 1 deletion src/vs/editor/standalone/common/monarch/monarchCommon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ export function isIAction(what: FuzzyAction): what is IAction {
}

/**
 * A single compiled tokenizer rule.
 */
export interface IRule {
// NOTE(review): this listing shows both `regex` and `resolveRegex`; confirm whether
// `regex` is still meant to be part of the interface or is superseded by `resolveRegex`.
regex: RegExp;
// Action taken when the rule matches.
action: FuzzyAction;
// When true, the rule is only tried at the start of a line.
matchOnlyAtLineStart: boolean;
// Name of the rule.
name: string;
// Returns the regular expression to match with for the given tokenizer state.
resolveRegex(state: string): RegExp;
}

export interface IAction {
Expand Down Expand Up @@ -175,6 +175,26 @@ export function substituteMatches(lexer: ILexerMin, str: string, id: string, mat
});
}

/**
 * substituteMatchesRe is used on lexer regex rules and substitutes predefined patterns:
 * 		$Sn => n'th part of state
 *
 * The state is split on '.', and `$S0` refers to the full (unsplit) state.
 * The substituted text is escaped so that it is matched literally by the
 * resulting regular expression, even when it contains regex special characters.
 * A reference to a part that does not exist is replaced with the empty string.
 *
 * @param lexer The lexer, passed to `fixCase` for the substituted text.
 * @param str The regular expression source containing `$Sn` references.
 * @param state The current state name.
 * @returns The regular expression source with all `$Sn` references substituted.
 */
export function substituteMatchesRe(lexer: ILexerMin, str: string, state: string): string {
	const re = /\$[sS](\d\d?)/g;
	let stateMatches: string[] | null = null;
	return str.replace(re, function (_full: string, s: string) {
		if (stateMatches === null) { // split state on demand
			stateMatches = state.split('.');
			stateMatches.unshift(state);
		}
		// Compare and index numerically instead of relying on string/number coercion.
		const index = parseInt(s, 10);
		if (!empty(s) && index < stateMatches.length) {
			const part = fixCase(lexer, stateMatches[index]); //$Sn
			// The result is compiled with `new RegExp`, so regex special characters in the
			// state part must be escaped; otherwise they change the pattern or make it invalid.
			return part.replace(/[\\\{\}\*\+\?\|\^\$\.\[\]\(\)]/g, '\\$&');
		}
		return '';
	});
}

/**
* Find the tokenizer rules for a specific state (i.e. next action)
*/
Expand Down
39 changes: 34 additions & 5 deletions src/vs/editor/standalone/common/monarch/monarchCompile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ function createKeywordMatcher(arr: string[], caseInsensitive: boolean = false):
* @example /@attr/ will be replaced with the value of lexer[attr]
* @example /@@text/ will not be replaced and will become /@text/.
*/
function compileRegExp(lexer: monarchCommon.ILexerMin, str: string): RegExp {
function compileRegExp<S extends true | false>(lexer: monarchCommon.ILexerMin, str: string, handleSn: S): S extends true ? RegExp | DynamicRegExp : RegExp;
function compileRegExp(lexer: monarchCommon.ILexerMin, str: string, handleSn: true | false): RegExp | DynamicRegExp {
// @@ must be interpreted as a literal @, so we replace all occurrences of @@ with a placeholder character
str = str.replace(/@@/g, `\x01`);

Expand Down Expand Up @@ -116,6 +117,24 @@ function compileRegExp(lexer: monarchCommon.ILexerMin, str: string): RegExp {
str = str.replace(/\x01/g, '@');

const flags = (lexer.ignoreCase ? 'i' : '') + (lexer.unicode ? 'u' : '');

// handle $Sn
if (handleSn) {
const match = str.match(/\$[sS](\d\d?)/g);
if (match) {
let lastState: string | null = null;
let lastRegEx: RegExp | null = null;
return (state: string) => {
if (lastRegEx && lastState === state) {
return lastRegEx;
}
lastState = state;
lastRegEx = new RegExp(monarchCommon.substituteMatchesRe(lexer, str, state), flags);
return lastRegEx;
};
}
}

return new RegExp(str, flags);
}

Expand Down Expand Up @@ -196,12 +215,12 @@ function createGuard(lexer: monarchCommon.ILexerMin, ruleName: string, tkey: str
else if (op === '~' || op === '!~') {
if (pat.indexOf('$') < 0) {
// precompile regular expression
const re = compileRegExp(lexer, '^' + pat + '$');
const re = compileRegExp(lexer, '^' + pat + '$', false);
tester = function (s) { return (op === '~' ? re.test(s) : !re.test(s)); };
}
else {
tester = function (s, id, matches, state) {
const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$');
const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$', false);
return re.test(s);
};
}
Expand Down Expand Up @@ -355,11 +374,13 @@ function compileAction(lexer: monarchCommon.ILexerMin, ruleName: string, action:
}
}

/**
 * A regular expression that depends on the tokenizer state:
 * given the state name, returns the RegExp to use.
 */
type DynamicRegExp = (state: string) => RegExp;

/**
* Helper class for creating matching rules
*/
class Rule implements monarchCommon.IRule {
public regex: RegExp = new RegExp('');
private regex: RegExp | DynamicRegExp = new RegExp('');
public action: monarchCommon.FuzzyAction = { token: '' };
public matchOnlyAtLineStart: boolean = false;
public name: string = '';
Expand All @@ -382,12 +403,20 @@ class Rule implements monarchCommon.IRule {

this.matchOnlyAtLineStart = (sregex.length > 0 && sregex[0] === '^');
this.name = this.name + ': ' + sregex;
this.regex = compileRegExp(lexer, '^(?:' + (this.matchOnlyAtLineStart ? sregex.substr(1) : sregex) + ')');
this.regex = compileRegExp(lexer, '^(?:' + (this.matchOnlyAtLineStart ? sregex.substr(1) : sregex) + ')', true);
}

/**
 * Compiles `act` under this rule's name and stores the result as the rule's action.
 */
public setAction(lexer: monarchCommon.ILexerMin, act: monarchCommon.IAction) {
	const compiledAction = compileAction(lexer, this.name, act);
	this.action = compiledAction;
}

/**
 * Returns the regular expression of this rule for the given state.
 * A plain RegExp is returned as is; otherwise the stored function is called with `state`.
 */
public resolveRegex(state: string): RegExp {
	const compiled = this.regex;
	return compiled instanceof RegExp ? compiled : compiled(state);
}
}

/**
Expand Down
6 changes: 3 additions & 3 deletions src/vs/editor/standalone/common/monarch/monarchLexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -519,8 +519,8 @@ export class MonarchTokenizer extends Disposable implements languages.ITokenizat
}
hasEmbeddedPopRule = true;

let regex = rule.regex;
const regexSource = rule.regex.source;
let regex = rule.resolveRegex(state.stack.state);
const regexSource = regex.source;
if (regexSource.substr(0, 4) === '^(?:' && regexSource.substr(regexSource.length - 1, 1) === ')') {
const flags = (regex.ignoreCase ? 'i' : '') + (regex.unicode ? 'u' : '');
regex = new RegExp(regexSource.substr(4, regexSource.length - 5), flags);
Expand Down Expand Up @@ -643,7 +643,7 @@ export class MonarchTokenizer extends Disposable implements languages.ITokenizat
const restOfLine = line.substr(pos);
for (const rule of rules) {
if (pos === 0 || !rule.matchOnlyAtLineStart) {
matches = restOfLine.match(rule.regex);
matches = restOfLine.match(rule.resolveRegex(state));
if (matches) {
matched = matches[0];
action = rule.action;
Expand Down
49 changes: 49 additions & 0 deletions src/vs/editor/standalone/test/browser/monarch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -346,4 +346,53 @@ suite('Monarch', () => {
disposables.dispose();
});

// Verifies that a rule's regular expression can reference parts of the current state via `$Sn`.
test('microsoft/monaco-editor#3128: allow state access within rules', () => {
	const disposables = new DisposableStore();
	const configurationService = new StandaloneConfigurationService();
	const languageService = disposables.add(new LanguageService());

	const tokenizer = disposables.add(createMonarchTokenizer(languageService, 'test', {
		ignoreCase: false,
		encoding: /u|u8|U|L/,
		tokenizer: {
			root: [
				// C++ 11 Raw String
				// The delimiter between `R"` and `(` is captured as $1 and becomes
				// the second part of the next state name (`raw.<delimiter>`).
				[/@encoding?R\"(?:([^ ()\\\t]*))\(/, { token: 'string.raw.begin', next: '@raw.$1' }],
			],

			raw: [
				// `$S2` refers to the second part of the state, i.e. the delimiter captured
				// above, so the raw string only ends at `)<delimiter>"`.
				[/.*\)$S2\"/, 'string.raw', '@pop'],
				[/.*/, 'string.raw']
			],
		},
	}, configurationService));

	// Indentation is written as an explicit `\t`: the expected token offsets below
	// (10 for `R""""(` and 6 for `;`) assume a one-character indent.
	const lines = [
		`int main(){`,
		``,
		`\tauto s = R""""(`,
		`\tHello World`,
		`\t)"""";`,
		``,
		`\tstd::cout << "hello";`,
		``,
		`}`,
	];

	const actualTokens = getTokens(tokenizer, lines);
	assert.deepStrictEqual(actualTokens, [
		[new Token(0, 'source.test', 'test')],
		[],
		[new Token(0, 'source.test', 'test'), new Token(10, 'string.raw.begin.test', 'test')],
		[new Token(0, 'string.raw.test', 'test')],
		[new Token(0, 'string.raw.test', 'test'), new Token(6, 'source.test', 'test')],
		[],
		[new Token(0, 'source.test', 'test')],
		[],
		[new Token(0, 'source.test', 'test')],
	]);

	disposables.dispose();
});

});

0 comments on commit 20deb62

Please sign in to comment.