-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
leac.ts
300 lines (281 loc) · 7.83 KB
/
leac.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
import { createPositionQuery } from './positionQuery';
/** Lexer options (not many so far). Passed to `createLexer`. */
export type Options = {
/**
* Enable line and column numbers computation.
*
* When not enabled, every token reports zero for both `line` and `column`.
*/
lineNumbers?: boolean
};
/** Result returned by a lexer function. */
export type LexerResult = {
/** Array of tokens, in the order they were matched. */
tokens: Token[],
/** Final offset - the index in the input string where lexing stopped. */
offset: number,
/**
* True if the whole input string was processed.
*
* Check this to see whether some input was left untokenized.
*/
complete: boolean
};
/**
* Lexer function.
*
* @param str - A string to tokenize.
* @param offset - Initial offset. Used when composing lexers.
* Lexers produced by `createLexer` start from zero when it is omitted.
* @returns Produced tokens, the final offset and the completeness flag.
*/
export type Lexer = (
str: string,
offset?: number
) => LexerResult;
/** Token object, the result of matching an individual lexing rule. */
export type Token = {
/** Name of the lexer containing the rule that produced this token. */
state: string;
/** Name of the rule that produced this token. */
name: string;
/** Text matched by the rule. _(Unless a replace value was used by a RegexRule.)_ */
text: string;
/** Start index of the match in the input string. */
offset: number;
/**
* The length of the matched substring.
*
* _(Might be different from the text length in case a replace value
* was used in a RegexRule.)_
*/
len: number;
/**
* Line number in the source string (1-based).
*
* _(Always zero if not enabled in the lexer options.)_
*/
line: number;
/**
* Column number within the line in the source string (1-based).
*
* _(Always zero if line numbers are not enabled in the lexer options.)_
*/
column: number;
}
/**
* Lexing rule.
*
* Base rule looks for an exact match by its name.
*
* If the name and the lookup string have to be different
* then specify the `str` property as defined in {@link StringRule}.
*/
export interface Rule {
/**
* The name of the rule, also the name of tokens produced by this rule.
*
* Must not be empty - `createLexer` throws otherwise.
*/
name: string;
/**
* Matched token won't be added to the output array if this is set to `true`.
*
* The match still advances the position, and `push` / `pop` still take effect.
*
* (_Think twice before using this._)
*/
discard?: boolean;
/**
* Switch to another lexer function after this match,
* concatenate its results and continue from where it stopped.
*/
push?: Lexer;
/**
* Stop after this match and return.
*
* If there is a parent lexer - it will continue from this point.
*
* When combined with `push`, the pushed lexer runs first.
*/
pop?: boolean;
}
/**
* String rule - looks for an exact string match that
* can be different from the name of the rule.
*/
export interface StringRule extends Rule {
/**
* Specify the exact string to match
* if it is different from the name of the rule.
*
* Must not be empty - `createLexer` throws otherwise.
*/
str: string;
}
/**
* Regex rule - looks for a regular expression match.
*/
export interface RegexRule extends Rule {
/**
* Regular expression to match.
*
* - Can't have the global flag (`createLexer` throws otherwise).
*
* - All regular expressions are used as sticky,
* you don't have to specify the sticky flag.
*
* - Empty matches are considered as non-matches -
* no token will be emitted in that case.
*/
regex: RegExp;
/**
* Replacement string can include patterns,
* the same as [String.prototype.replace()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/replace#specifying_a_string_as_a_parameter).
*
* This will only affect the text property of an output token, not its offset or length.
*
* Note: the regex has to be able to match the matched substring when taken out of context
* in order for replace to work - boundary/neighborhood conditions may prevent this.
*/
replace?: string;
}
/**
 * Type guard for {@link RegexRule}.
 *
 * Only an own `regex` property counts; inherited ones are ignored.
 */
function isRegexRule (r: Rule): r is RegexRule {
  const hasOwn = Object.prototype.hasOwnProperty;
  return hasOwn.call(r, 'regex');
}
/**
 * Type guard for {@link StringRule}.
 *
 * Only an own `str` property counts; inherited ones are ignored.
 */
function isStringRule (r: Rule): r is StringRule {
  const hasOwn = Object.prototype.hasOwnProperty;
  return hasOwn.call(r, 'str');
}
/**
* Non-empty array of rules.
*
* Rules are processed in the provided order, the first match is taken.
*
* Rules can have the same name. For example, you can have
* separate rules for various keywords and use the same name "keyword".
*
* The tuple shape (one required element followed by a rest element)
* is what makes an empty array a compile-time error.
*/
export type Rules = [
(Rule|StringRule|RegexRule),
...(Rule|StringRule|RegexRule)[]
];
/**
 * Create a lexer function.
 *
 * @param rules - Non-empty array of lexing rules.
 *
 * Rules are processed in provided order, first match is taken.
 *
 * Rules can have the same name - you can have separate rules
 * for keywords and use the same name "keyword" for example.
 *
 * @param state - The name of this lexer. Use when composing lexers.
 * Empty string by default.
 *
 * @param options - Lexer options object.
 *
 * @returns A lexer function bound to the given rules.
 *
 * @throws Error when a rule has an empty name, an empty `str`,
 * or a regular expression with the global flag.
 */
export function createLexer (
  rules: Rules,
  state?: string,
  options?: Options
): Lexer;
/**
 * Create a lexer function.
 *
 * @param rules - Non-empty array of lexing rules.
 *
 * Rules are processed in provided order, first match is taken.
 *
 * Rules can have the same name - you can have separate rules
 * for keywords and use the same name "keyword" for example.
 *
 * @param options - Lexer options object.
 *
 * @returns A lexer function bound to the given rules.
 *
 * @throws Error when a rule has an empty name, an empty `str`,
 * or a regular expression with the global flag.
 */
export function createLexer (
  rules: Rules,
  options?: Options
): Lexer;
export function createLexer (
  rules: Rules,
  state: string|Options = '',
  options: Options = {}
): Lexer {
  // The second argument is either the lexer name or the options object.
  const options1 = (typeof state !== 'string') ? state : options;
  const state1 = (typeof state === 'string') ? state : '';

  // Normalize every rule to a sticky regex rule once, up front.
  // For rules with a `replace` value also pre-compile the detached regex
  // used to expand the replacement, instead of compiling it per token.
  const compiledRules = rules.map((r, i) => {
    const rule = toRegexRule(r, i);
    const replacer = (typeof rule.replace === 'string')
      ? {
        regex: new RegExp(rule.regex.source, rule.regex.flags),
        value: rule.replace
      }
      : undefined;
    return { rule, replacer };
  });
  const isLineNumbers = !!options1.lineNumbers;

  return function (str: string, offset = 0): LexerResult {
    // Without line numbers every token gets zero line and column.
    const positionQuery = (isLineNumbers)
      ? createPositionQuery(str)
      : () => ({ line: 0, column: 0 });
    let currentIndex = offset;
    const tokens: Token[] = [];

    loopStr:
    while (currentIndex < str.length) {
      let anyMatch = false;
      for (const { rule, replacer } of compiledRules) {
        // Sticky regex: the match is attempted exactly at `currentIndex`.
        rule.regex.lastIndex = currentIndex;
        const match = rule.regex.exec(str);
        // Empty matches are treated as non-matches.
        if (match && match[0].length > 0) {
          if (!rule.discard) {
            const position = positionQuery(currentIndex);
            let text = match[0];
            if (replacer) {
              // The replacement regex is sticky and reused between tokens,
              // so it has to be rewound to match from the start of the text.
              replacer.regex.lastIndex = 0;
              text = match[0].replace(replacer.regex, replacer.value);
            }
            tokens.push({
              state: state1,
              name: rule.name,
              text: text,
              offset: currentIndex,
              len: match[0].length,
              line: position.line,
              column: position.column
            });
          }
          currentIndex = rule.regex.lastIndex;
          anyMatch = true;
          if (rule.push) {
            const r = rule.push(str, currentIndex);
            // Append one by one: spreading into `push()` passes every token
            // as a call argument and throws RangeError on large results.
            for (const token of r.tokens) {
              tokens.push(token);
            }
            currentIndex = r.offset;
          }
          if (rule.pop) {
            // Return control to the caller (a parent lexer, if any).
            break loopStr;
          }
          // First matching rule wins; restart from the first rule.
          break;
        }
      }
      if (!anyMatch) {
        // No rule matches here - stop and report an incomplete result.
        break;
      }
    }

    return {
      tokens: tokens,
      offset: currentIndex,
      complete: str.length <= currentIndex
    };
  };
}
/**
 * Make a copy of the rule with its matcher normalized to a regular expression.
 *
 * @param r - Source rule of any supported kind.
 * @param i - Index of the rule in the rules array (used in error messages).
 */
function toRegexRule (r: Rule, i: number): RegexRule {
  const regex = toRegExp(r, i);
  return { ...r, regex };
}
/**
 * Build the sticky regular expression that implements a rule.
 *
 * @param r - Source rule of any supported kind.
 * @param i - Index of the rule in the rules array (used in error messages).
 * @throws Error when the rule name or the `str` property is empty.
 */
function toRegExp (r: Rule, i: number): RegExp {
  if (r.name.length === 0) {
    throw new Error(
      `Rule #${i} has empty name, which is not allowed.`
    );
  }

  if (isRegexRule(r)) {
    return toSticky(r.regex);
  }

  // Literal rules: match `str` when given, otherwise the rule name itself.
  let literal = r.name;
  if (isStringRule(r)) {
    if (r.str.length === 0) {
      throw new Error(
        `Rule #${i} ("${r.name}") has empty "str" property, which is not allowed.`
      );
    }
    literal = r.str;
  }
  return new RegExp(escapeRegExp(literal), 'y');
}
/**
 * Escape a string so it can be used as a literal pattern in a regular expression.
 *
 * Every special character (and every whitespace character) gets a backslash prefix.
 */
function escapeRegExp (str: string) {
  const special = /[-[\]{}()*+!<=:?./\\^$|#\s,]/g;
  return str.replace(special, (ch) => `\\${ch}`);
}
/**
 * Get a sticky version of a regular expression.
 *
 * An already sticky expression is returned as is,
 * otherwise a copy with the sticky flag added is created.
 *
 * @throws Error when the expression has the global flag.
 */
function toSticky (re: RegExp) {
  if (re.global) {
    throw new Error(
      `Regular expression /${re.source}/${re.flags} contains the global flag, which is not allowed.`
    );
  }
  if (re.sticky) {
    return re;
  }
  return new RegExp(re.source, `${re.flags}y`);
}