forked from rust-lang/rust
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a regex crate to the Rust distribution.
Also adds a regex_macros crate, which provides natively compiled regular expressions with a syntax extension. Closes rust-lang#3591. RFC: 0007-regexps
- Loading branch information
1 parent
6648651
commit b8b7484
Showing
23 changed files
with
11,102 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
#!/usr/bin/env python2 | ||
|
||
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT | ||
# file at the top-level directory of this distribution and at | ||
# http://rust-lang.org/COPYRIGHT. | ||
# | ||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
# option. This file may not be copied, modified, or distributed | ||
# except according to those terms. | ||
|
||
from __future__ import absolute_import, division, print_function | ||
import argparse | ||
import datetime | ||
import os.path as path | ||
|
||
|
||
def print_tests(tests):
    """Print every test in ``tests`` as generated Rust, one per line.

    Each element is rendered with ``test_tostr``; the rendered lines are
    joined with newlines and written to stdout by a single ``print`` call.
    """
    rendered = []
    for case in tests:
        rendered.append(test_tostr(case))
    print('\n'.join(rendered))
|
||
|
||
def _unescape(s):
    """Return ``s`` with its backslash escape sequences interpreted."""
    try:
        # Python 2 byte strings: the call this script has always used.
        return s.decode('string_escape')
    except AttributeError:
        # Python 3 text strings have no ``decode``.  Characters outside
        # Latin-1 are first rewritten as backslash escapes so that they
        # survive the round trip through bytes.
        return s.encode('latin-1', 'backslashreplace').decode('unicode_escape')


def _parse_groups(sgroups):
    """Parse the expected-result field of a test line.

    Returns a list with one entry per capture group, either a
    ``(start, end)`` tuple of ints or ``None`` for a group that did not
    participate (written ``(?,?)``).  ``NOMATCH`` yields ``[None]``.
    Returns ``None`` when the field holds neither ``NOMATCH`` nor
    offsets, which is how tests that expect a compile error look.
    """
    if sgroups == 'NOMATCH':
        return [None]
    if ',' not in sgroups:
        return None

    groups = []
    for chunk in sgroups.split(')('):
        s, e = [part.strip() for part in chunk.strip('()').split(',')]
        if s == '?' and e == '?':
            groups.append(None)
        else:
            groups.append((int(s), int(e)))
    return groups


def read_tests(f):
    """Read match tests from the tab-separated test file at path ``f``.

    A line is considered only if it has 4 or 5 non-empty tab-separated
    fields, its first field (the options) contains ``E`` and it does not
    start with ``#``.  Lines whose expected result is neither ``NOMATCH``
    nor a list of offsets are skipped.

    Returns a list of ``(name, pattern, text, groups)`` tuples where
    ``name`` is ``<file basename>_<line number>`` and ``groups`` is as
    described in ``_parse_groups``.

    Raises ``ValueError`` if a retained test uses the pattern ``SAME``
    before any pattern has been seen.
    """
    basename, _ = path.splitext(path.basename(f))
    tests = []
    # Raw (unprocessed) pattern of the most recent considered line.  SAME
    # is resolved against this rather than against the previously emitted
    # test so that a '(?i)' prefix or '$' unescaping applied to one test
    # never leaks into, or is applied twice to, the next one.
    prev_pat = None
    with open(f) as lines:
        for lineno, line in enumerate(lines, 1):
            stripped = (field.strip() for field in line.split('\t'))
            fields = [field for field in stripped if field]
            if not (4 <= len(fields) <= 5) \
                    or 'E' not in fields[0] or fields[0][0] == '#':
                continue

            opts, pat, text, sgroups = fields[0:4]
            if pat == 'SAME':
                pat = prev_pat
            else:
                prev_pat = pat

            # This skips tests that should result in an error.
            # There aren't many, so I think we can just capture those
            # manually. Possibly fix this in future.
            groups = _parse_groups(sgroups)
            if groups is None:
                continue

            if pat is None:
                raise ValueError(
                    '%s:%d: SAME used before any pattern' % (f, lineno))
            if '$' in opts:
                pat = _unescape(pat)
                text = _unescape(text)
            if 'i' in opts:
                pat = '(?i)%s' % pat

            name = '%s_%d' % (basename, lineno)
            tests.append((name, pat, text, groups))
    return tests
|
||
|
||
def test_tostr(t):
    """Render one test tuple as a ``mat!`` macro invocation.

    ``t`` is a 4-tuple ``(name, pattern, text, groups)``.  The pattern and
    text are emitted inside ``r"..."`` literals; a text of ``NULL`` is
    emitted as the empty string.  Each group is rendered with
    ``group_tostr`` and the results are joined with ``', '``.
    """
    name, pattern, haystack, spans = t
    if haystack == "NULL":
        haystack = ''
    captures = ', '.join(map(group_tostr, spans))
    return 'mat!(match_%s, r"%s", r"%s", %s)' % (
        name, pattern, haystack, captures)
|
||
|
||
def group_tostr(g):
    """Render one capture group as a Rust ``Option`` expression.

    ``None`` becomes ``'None'``; otherwise the first two items of ``g``
    are formatted as integers into ``'Some((start, end))'``.
    """
    return 'None' if g is None else 'Some((%d, %d))' % (g[0], g[1])
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: parse each given test file and print the
    # generated Rust test source to stdout.
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    aa('files', nargs='+',
       help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
    args = parser.parse_args()

    # Parse every file exactly once, and before anything is printed, so
    # that an unreadable or malformed file fails the run before any
    # output has been produced.
    parsed = [(f, read_tests(f)) for f in args.files]

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// ignore-tidy-linelength
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
// on {date}.
'''
    print(tpl.format(date=str(datetime.datetime.now())))

    for f, tests in parsed:
        print('// Tests from %s' % path.basename(f))
        print_tests(tests)
        print('')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,183 @@ | ||
#!/usr/bin/env python2 | ||
|
||
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT | ||
# file at the top-level directory of this distribution and at | ||
# http://rust-lang.org/COPYRIGHT. | ||
# | ||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
# option. This file may not be copied, modified, or distributed | ||
# except according to those terms. | ||
|
||
from __future__ import absolute_import, division, print_function | ||
import argparse | ||
from collections import defaultdict | ||
import csv | ||
import datetime | ||
import urllib2 | ||
|
||
# Default location of the Unicode Character Database files, and the names
# of the two files this script reads.
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'


# Maps each two-letter general category to the grouped categories that
# contain it.  Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # 'No' (Other_Number) belongs to the grouped category 'N', like 'Nd'
    # and 'Nl'; mapping it to itself left those code points out of 'N'.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
|
||
|
||
def as_4byte_uni(n):
    """Format the integer ``n`` as a backslash-U escape.

    The result is a backslash, ``U`` and the lowercase hexadecimal digits
    of ``n`` left-padded with zeros to at least eight characters.
    """
    digits = hex(n)[2:]  # drop the '0x' prefix
    return '\\U' + digits.rjust(8, '0')
|
||
|
||
def expand_cat(c):
    """Return the categories that category ``c`` contributes to.

    The result is a new list: the entries registered for ``c`` in
    ``expanded_categories`` (none if ``c`` is not a key), followed by
    ``c`` itself.
    """
    parents = expanded_categories.get(c, [])
    return parents + [c]
|
||
|
||
def is_valid_unicode(n):
    """Return True if ``n`` lies in 0..0xD7FF or 0xE000..0x10FFFF.

    The excluded gap 0xD800..0xDFFF is the surrogate range.
    """
    if n < 0 or n > 0x10FFFF:
        return False
    return not (0xD7FF < n < 0xE000)
|
||
|
||
def read_cats(f):
    """Read general categories from ``f``, an iterable of UnicodeData.txt lines.

    Each row is ``;``-separated: field 0 is the code point in hex, field 1
    the character name and field 2 the two-letter general category.

    Returns a ``defaultdict(list)`` mapping every category name (the
    two-letter category plus the grouped categories from ``expand_cat``)
    to the list of code points assigned to it.  Code points rejected by
    ``is_valid_unicode`` are left out.
    """
    assigned = defaultdict(list)
    # UnicodeData.txt does not list every code point of large blocks (CJK
    # ideographs, Hangul syllables, private use, ...).  It gives a pair of
    # rows named '<..., First>' and '<..., Last>' instead, and every code
    # point in between shares the category.  Recording only the two
    # endpoints would drop the whole interior of the range.
    range_start = None
    for row in csv.reader(f, delimiter=';'):
        if not row:
            continue
        code, name, cats = int(row[0], 16), row[1], expand_cat(row[2])
        if name.endswith(', First>'):
            range_start = code
            continue
        if name.endswith(', Last>') and range_start is not None:
            first, range_start = range_start, None
        else:
            first = code
        for point in range(first, code + 1):
            if not is_valid_unicode(point):
                continue
            for cat in cats:
                assigned[cat].append(point)
    return assigned
|
||
|
||
def read_scripts(f):
    """Read script assignments from ``f``, an iterable of Scripts.txt lines.

    Data lines look like ``<code point or first..last> ; <script> # ...``.
    Blank lines and lines holding only a ``#`` comment are ignored.

    Returns a ``defaultdict(list)`` mapping each script name to the list
    of code points assigned to it.  Code points rejected by
    ``is_valid_unicode`` are left out.

    Raises ``ValueError`` for a data line with fewer than two
    ``;``-separated fields or with a code point that is not hexadecimal.
    """
    assigned = defaultdict(list)
    for line in f:
        # Cut the trailing comment first; a data line that carries no
        # comment at all is still accepted.
        data = line.split('#', 1)[0].strip()
        if not data:
            continue
        hexes, name = [field.strip() for field in data.split(';')][:2]
        if '..' in hexes:
            first, last = [int(s, 16) for s in hexes.split('..')]
        else:
            first = last = int(hexes, 16)
        for point in range(first, last + 1):
            if is_valid_unicode(point):
                assigned[name].append(point)
    return assigned
|
||
|
||
def group(letters):
    """Collapse code points into sorted, inclusive ``(start, end)`` ranges.

    ``letters`` is any iterable of integers; duplicates and ordering do
    not matter.  Consecutive integers are merged into a single range.
    Returns a list of ``(start, end)`` tuples in ascending order, or an
    empty list when ``letters`` is empty.
    """
    ordered = iter(sorted(set(letters)))
    try:
        cur_start = cur_end = next(ordered)
    except StopIteration:
        # Nothing to group: an empty class has no ranges.
        return []

    grouped = []
    for letter in ordered:
        # Values are unique and ascending here, so ``letter`` either
        # extends the current run by one or starts a new run.
        if letter == cur_end + 1:
            cur_end = letter
        else:
            grouped.append((cur_start, cur_end))
            cur_start = cur_end = letter
    grouped.append((cur_start, cur_end))
    return grouped
|
||
|
||
def ranges_to_rust(rs):
    """Render ``(start, end)`` pairs as the body of a Rust range table.

    Each pair becomes a tuple of two character literals whose contents
    are produced by ``as_4byte_uni``; the tuples are joined with a comma
    followed by a newline and a space.
    """
    rendered = []
    for start, end in rs:
        rendered.append(
            "('%s', '%s')" % (as_4byte_uni(start), as_4byte_uni(end)))
    return ',\n '.join(rendered)
|
||
|
||
def groups_to_rust(groups):
    """Render a mapping of class name to ranges as Rust table entries.

    ``groups`` maps a name to a sequence of ``(start, end)`` pairs.  One
    entry is emitted per name, in sorted name order, with the ranges
    rendered by ``ranges_to_rust``; entries are joined with newlines.
    """
    entry = '("%s", &[\n %s\n ]),'
    return '\n'.join(
        entry % (name, ranges_to_rust(groups[name]))
        for name in sorted(groups))
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: read UnicodeData.txt and Scripts.txt
    # (from the CWD with --local, otherwise from --base-url) and print
    # the generated Rust tables to stdout.
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    def _load(name, reader):
        # Open data file ``name``, parse it with ``reader`` and always
        # close the handle afterwards, even if parsing fails.
        if args.local:
            src = open(name)
        else:
            # The base URL may or may not end in '/'; normalise it so the
            # joined URL never contains a doubled slash.
            src = urllib2.urlopen(args.base_url.rstrip('/') + '/' + name)
        try:
            return reader(src)
        finally:
            src.close()

    cats = _load(DATA, read_cats)
    scripts = _load(SCRIPTS, read_scripts)

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    # The ASCII ranges are materialised as lists so they can be
    # concatenated with the category lists on any Python version.
    perld = list(range(ord('0'), ord('9') + 1))
    dgroups = ranges_to_rust(group(perld + cats['Nd']))

    perls = [ord(c) for c in ['\t', '\n', '\x0C', '\r', ' ']]
    sgroups = ranges_to_rust(group(perls + cats['Z']))

    low = list(range(ord('a'), ord('z') + 1))
    up = list(range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L']))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.
use parse::{{Class, NamedClasses}};
pub static UNICODE_CLASSES: NamedClasses = &[
{groups}
];
pub static PERLD: Class = &[
{dgroups}
];
pub static PERLS: Class = &[
{sgroups}
];
pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
Oops, something went wrong.