Skip to content

Commit b558a43

Browse files
committed
fix(parser): allow empty groups like /(?:)/
1 parent a2a7a53 commit b558a43

File tree

5 files changed

+18
-2
lines changed

5 files changed

+18
-2
lines changed

benchmark/regex_handwritten.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import fs from 'fs'
22

33
function* readHandWrittenDataset() {
4+
// dataset obtained from regex101.com using this script:
5+
// https://github.com/dataunitylab/semantic-regex/blob/ece59e827cc05b907883aace30d72e02e31e2a9b/download_patterns.sh
46
const jsonStr = fs.readFileSync('./benchmark/regex-dataset.json', 'utf-8')
57

68
for (const item of JSON.parse(jsonStr)) {

src/regex-parser.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -241,7 +241,7 @@ function lookAheadOp(): P.Expr.BinaryOperator<AST.RegExpAST | undefined, AST.Reg
241241
}
242242

243243
function regex(): P.Parser<AST.RegExpAST> {
244-
return P.lazy(() => P.Expr.makeExprParser<AST.RegExpAST>(
244+
const nonEmptyRegex = P.lazy(() => P.Expr.makeExprParser<AST.RegExpAST>(
245245
regexTerm(),
246246
[
247247
{ type: 'postfix', op: P.string('*').map(_ => AST.star) },
@@ -255,6 +255,13 @@ function regex(): P.Parser<AST.RegExpAST> {
255255
{ type: 'infixRightOptional', op: P.string('|').map(_ => AST.union) },
256256
]
257257
))
258+
259+
return P.optional(nonEmptyRegex).map(ast => {
260+
if (ast === undefined)
261+
return AST.epsilon
262+
else
263+
return ast
264+
})
258265
}
259266

260267
export function parseRegExpString(

test/arbitrary-regex.ts

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,4 @@
11
import fc from 'fast-check'
2-
import * as AST from '../src/ast'
32
import * as CharSet from '../src/char-set'
43
import * as RE from '../src/regex'
54

test/regex-parser.spec.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,8 +52,11 @@ describe('parseRegExp', () => {
5252
[/[a-]/, AST.literal(CharSet.fromArray(['a', '-']))],
5353
// negative char class:
5454
[/[^abc]/, AST.literal(CharSet.complement(CharSet.fromArray(['a', 'b', 'c'])))],
55+
// regular capturing groups:
56+
[/()/, group(AST.epsilon)],
5557
// non-capturing groups
5658
[/(?:ab)/, str('ab')],
59+
[/(?:)/, AST.epsilon],
5760
// named capturing groups
5861
[/(?<abc_012_ABC>abc)/, group(str('abc'), 'abc_012_ABC')],
5962
[/(?<ABC>abc)/, group(str('abc'), 'ABC')],
@@ -63,16 +66,19 @@ describe('parseRegExp', () => {
6366
[/a^b/, AST.startMarker(char('a'), str('b'))],
6467
[/^a|^b/, AST.union(AST.startMarker(undefined, str('a')), AST.startMarker(undefined, char('b')))],
6568
[/^abc$/, AST.startMarker(undefined, AST.endMarker(str('abc'), undefined))],
69+
[/$a^/, AST.startMarker(AST.endMarker(undefined, char('a')), undefined)],
6670
// positive lookahead - now parsed as lookahead AST nodes, not intersections
6771
[/(?=a)b/, AST.positiveLookahead(char('a'), char('b'))],
6872
[/(?=a)(?:b)/, AST.positiveLookahead(char('a'), char('b'))],
6973
[/(?=a)(?=b)c/, AST.positiveLookahead(char('a'), AST.positiveLookahead(char('b'), char('c')))],
7074
[/a(?=b)c/, AST.concat(char('a'), AST.positiveLookahead(char('b'), char('c')))],
7175
[/a(?=b)/, AST.concat(char('a'), AST.positiveLookahead(char('b'), AST.epsilon))],
7276
[/a(?=b)c(?=d)e/, AST.concat(char('a'), AST.positiveLookahead(char('b'), AST.concat(char('c'), AST.positiveLookahead(char('d'), char('e')))))],
77+
[/(?=)/, AST.positiveLookahead(AST.epsilon, AST.epsilon)],
7378
// negative lookahead
7479
[/(?!a)b/, AST.negativeLookahead(char('a'), char('b'))],
7580
[/(?!a)b|c/, AST.union(AST.negativeLookahead(char('a'), char('b')), char('c'))],
81+
[/(?!)/, AST.negativeLookahead(AST.epsilon, AST.epsilon)],
7682
// TODO: positive lookbehind
7783
// [/(?<=a)/, AST.positiveLookbehind(char('a'))],
7884
// TODO: negative lookbehind

test/regex.spec.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -275,6 +275,8 @@ describe('fromRegExpAST', () => {
275275
// the whole expression to empty set:
276276
[/(a^b|c)/, RE.seq([dotStar, RE.singleChar('c'), dotStar])],
277277
[/^(a^b|c)/, RE.seq([RE.singleChar('c'), dotStar])],
278+
// Contradictory regex describes empty set:
279+
[/$.^/, RE.empty],
278280

279281
[/(^a|)^b/, RE.seq([RE.singleChar('b'), dotStar])],
280282
[/^a(b^|c)/, RE.seq([RE.string('ac'), dotStar]) ],

0 commit comments

Comments
 (0)