Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit eea4909

Browse files
committedApr 25, 2014
auto merge of #13700 : BurntSushi/rust/regexp, r=alexcrichton
Implements [RFC 7](https://github.com/rust-lang/rfcs/blob/master/active/0007-regexps.md) and will hopefully resolve #3591. The crate is marked as experimental. It includes a syntax extension for compiling regexps to native Rust code. Embeds and passes the `basic`, `nullsubexpr` and `repetition` tests from [Glenn Fowler's (slightly modified by Russ Cox for leftmost-first semantics) testregex test suite](http://www2.research.att.com/~astopen/testregex/testregex.html). I've also hand written a plethora of other tests that exercise Unicode support, the parser, public API, etc. Also includes a `regex-dna` benchmark for the shootout. I know the addition looks huge at first, but consider these things: 1. More than half the number of lines is dedicated to Unicode character classes. 2. Of the ~4,500 lines remaining, 1,225 of them are comments. 3. Another ~800 are tests. 4. That leaves 2500 lines for the meat. The parser is ~850 of them. The public API, compiler, dynamic VM and code generator (for `regexp!`) make up the rest.
2 parents 2bb2341 + 7269bc7 commit eea4909

24 files changed

+11108
-6
lines changed
 

‎mk/crates.mk

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@
5151

5252
TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
5353
uuid serialize sync getopts collections num test time rand \
54-
workcache url log
55-
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
54+
workcache url log regex
55+
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
5656
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
5757
TOOLS := compiletest rustdoc rustc
5858

@@ -84,6 +84,8 @@ DEPS_rand := std
8484
DEPS_url := std collections
8585
DEPS_workcache := std serialize collections log
8686
DEPS_log := std sync
87+
DEPS_regex := std collections
88+
DEPS_regex_macros = syntax std regex
8789

8890
TOOL_DEPS_compiletest := test green rustuv getopts
8991
TOOL_DEPS_rustdoc := rustdoc native

‎mk/main.mk

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,6 @@ HSREQ$(1)_H_$(3) = $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3))
311311
else
312312
HSREQ$(1)_H_$(3) = \
313313
$$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) \
314-
$$(HLIB$(1)_H_$(3))/stamp.rustc \
315-
$$(foreach dep,$$(RUST_DEPS_rustc),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) \
316314
$$(MKFILE_DEPS)
317315
endif
318316

@@ -334,8 +332,7 @@ SREQ$(1)_T_$(2)_H_$(3) = \
334332
CSREQ$(1)_T_$(2)_H_$(3) = \
335333
$$(TSREQ$(1)_T_$(2)_H_$(3)) \
336334
$$(HBIN$(1)_H_$(3))/rustdoc$$(X_$(3)) \
337-
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) \
338-
$$(foreach dep,$$(HOST_CRATES),$$(HLIB$(1)_H_$(3))/stamp.$$(dep))
335+
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep))
339336

340337
ifeq ($(1),0)
341338
# Don't run the stage0 compiler under valgrind - that ship has sailed

‎src/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Source layout:
1919
| `libfourcc/` | Data format identifier library |
2020
| `libgetopts/` | Get command-line-options library |
2121
| `libglob/` | Unix glob patterns library |
22+
| `libregex/` | Regular expressions |
2223
| `libsemver/` | Rust's semantic versioning library |
2324
| `libserialize/` | Encode-Decode types library |
2425
| `libsync/` | Concurrency mechanisms and primitives |

‎src/doc/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ li {list-style-type: none; }
4141
* [The `native` 1:1 threading runtime](native/index.html)
4242
* [The `num` arbitrary precision numerics library](num/index.html)
4343
* [The `rand` library for random numbers and distributions](rand/index.html)
44+
* [The `regex` library for regular expressions](regex/index.html)
4445
* [The `rustc` compiler](rustc/index.html)
4546
* [The `rustuv` M:N I/O library](rustuv/index.html)
4647
* [The `semver` version collation library](semver/index.html)

‎src/etc/regex-match-tests.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#!/usr/bin/env python2
2+
3+
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
from __future__ import absolute_import, division, print_function
14+
import argparse
15+
import datetime
16+
import os.path as path
17+
18+
19+
def print_tests(tests):
20+
print('\n'.join([test_tostr(t) for t in tests]))
21+
22+
23+
def read_tests(f):
    """Parse an AT&T POSIX regex test file into a list of test tuples.

    `f` is the path of a tab-separated `.dat` file.  Each accepted line
    yields a tuple `(name, pattern, text, groups)` where `name` is
    `<file basename>_<line number>` and `groups` is a list whose items are
    either `None` (group did not participate / no match) or an
    `(start, end)` pair of integers.

    Lines are skipped when they do not have 4 or 5 non-empty fields, when
    their option field lacks 'E', when they are comments, or when the
    expected result is neither `NOMATCH` nor a list of ranges (those are
    tests that expect a compile error).
    """
    basename, _ = path.splitext(path.basename(f))
    tests = []
    # Use a context manager so the file handle is closed deterministically.
    with open(f) as handle:
        for lineno, line in enumerate(handle, 1):
            # Drop empty fields: columns may be separated by several tabs.
            fields = [field.strip() for field in line.split('\t')]
            fields = [field for field in fields if field]
            if not (4 <= len(fields) <= 5) \
                    or 'E' not in fields[0] or fields[0][0] == '#':
                continue

            opts, pat, text, sgroups = fields[0:4]
            groups = []  # groups as integer ranges
            if sgroups == 'NOMATCH':
                groups = [None]
            elif ',' in sgroups:
                # "(0,3)(1,2)" -> ["0,3", "1,2"]
                noparen = [s.strip('()') for s in sgroups.split(')(')]
                for g in noparen:
                    s, e = [part.strip() for part in g.split(',')]
                    if s == '?' and e == '?':
                        groups.append(None)
                    else:
                        groups.append((int(s), int(e)))
            else:
                # This skips tests that should result in an error.
                # There aren't many, so I think we can just capture those
                # manually. Possibly fix this in future.
                continue

            if pat == 'SAME':
                # Reuse the pattern of the most recently accepted test.
                pat = tests[-1][1]
            if '$' in opts:
                # '$' marks fields that contain C-style escapes.
                pat = pat.decode('string_escape')
                text = text.decode('string_escape')
            if 'i' in opts:
                pat = '(?i)%s' % pat

            name = '%s_%d' % (basename, lineno)
            tests.append((name, pat, text, groups))
    return tests
61+
62+
63+
def test_tostr(t):
64+
lineno, pat, text, groups = t
65+
options = map(group_tostr, groups)
66+
return 'mat!(match_%s, r"%s", r"%s", %s)' \
67+
% (lineno, pat, '' if text == "NULL" else text, ', '.join(options))
68+
69+
70+
def group_tostr(g):
71+
if g is None:
72+
return 'None'
73+
else:
74+
return 'Some((%d, %d))' % (g[0], g[1])
75+
76+
77+
if __name__ == '__main__':
78+
parser = argparse.ArgumentParser(
79+
description='Generate match tests from an AT&T POSIX test file.')
80+
aa = parser.add_argument
81+
aa('files', nargs='+',
82+
help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
83+
args = parser.parse_args()
84+
85+
tests = []
86+
for f in args.files:
87+
tests += read_tests(f)
88+
89+
tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
90+
// file at the top-level directory of this distribution and at
91+
// http://rust-lang.org/COPYRIGHT.
92+
//
93+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
94+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
95+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
96+
// option. This file may not be copied, modified, or distributed
97+
// except according to those terms.
98+
99+
// ignore-tidy-linelength
100+
101+
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
102+
// on {date}.
103+
'''
104+
print(tpl.format(date=str(datetime.datetime.now())))
105+
106+
for f in args.files:
107+
print('// Tests from %s' % path.basename(f))
108+
print_tests(read_tests(f))
109+
print('')

‎src/etc/regex-unicode-tables.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#!/usr/bin/env python2
2+
3+
# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
from __future__ import absolute_import, division, print_function
14+
import argparse
15+
from collections import defaultdict
16+
import csv
17+
import datetime
18+
import urllib2
19+
20+
BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
21+
DATA = 'UnicodeData.txt'
22+
SCRIPTS = 'Scripts.txt'
23+
24+
# Mapping taken from Table 12 from:
25+
# http://www.unicode.org/reports/tr44/#General_Category_Values
26+
expanded_categories = {
    # Each two-letter general category maps to the grouped (one-letter or
    # 'LC') categories that contain it.
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # 'No' (Other_Number) is part of the grouped 'N' (Number) category.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
37+
38+
39+
def as_4byte_uni(n):
    """Format codepoint `n` as a backslash, 'U' and eight lowercase hex digits.

    Uses '%08x' formatting rather than slicing `hex(n)`, so the result never
    picks up the trailing 'L' that Python 2 appends to the hex form of longs.
    """
    return '\\U%08x' % n
42+
43+
44+
def expand_cat(c):
45+
return expanded_categories.get(c, []) + [c]
46+
47+
48+
def is_valid_unicode(n):
49+
return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
50+
51+
52+
def read_cats(f):
53+
assigned = defaultdict(list)
54+
for row in csv.reader(f, delimiter=';'):
55+
(hex, cats) = (int(row[0], 16), expand_cat(row[2]))
56+
if not is_valid_unicode(hex):
57+
continue
58+
for cat in cats:
59+
assigned[cat].append(hex)
60+
return assigned
61+
62+
63+
def read_scripts(f):
    """Read a Scripts.txt-style file and map script names to codepoints.

    `f` is an iterable of lines.  Blank lines and lines starting with '#'
    are ignored.  Every other line is expected to look like
    `XXXX ; Name # comment` or `XXXX..YYYY ; Name # comment`; the trailing
    comment is optional.  Returns a `defaultdict(list)` from script name to
    the valid codepoints (see `is_valid_unicode`) assigned to that script.
    """
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        fields = [field.strip() for field in line.split(';')]
        hexes, name = fields[:2]
        # Strip the trailing comment if there is one; a line without '#'
        # keeps its name as-is instead of raising.
        name = name.split('#', 1)[0].strip()
        if '..' in hexes:
            first, last = [int(h, 16) for h in hexes.split('..')]
        else:
            # A single codepoint is the range [cp, cp].
            first = last = int(hexes, 16)
        for codepoint in xrange(first, last + 1):
            if is_valid_unicode(codepoint):
                assigned[name].append(codepoint)
    return assigned
81+
82+
83+
def group(letters):
    """Collapse codepoints into sorted, inclusive `(start, end)` ranges.

    Duplicates in `letters` are ignored and consecutive codepoints are merged
    into a single range.  An empty input yields an empty list.
    """
    ordered = sorted(set(letters))
    if not ordered:
        return []

    grouped = []
    cur_start = cur_end = ordered[0]
    for letter in ordered[1:]:
        # Holds by construction (sorted + de-duplicated); kept as a guard.
        assert letter > cur_end, \
            'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter))

        if letter == cur_end + 1:
            # Extends the current run.
            cur_end = letter
        else:
            # Gap found: close the current run and start a new one.
            grouped.append((cur_start, cur_end))
            cur_start = cur_end = letter
    grouped.append((cur_start, cur_end))
    return grouped
99+
100+
101+
def ranges_to_rust(rs):
102+
rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
103+
return ',\n '.join(rs)
104+
105+
106+
def groups_to_rust(groups):
107+
rust_groups = []
108+
for group_name in sorted(groups):
109+
rust_groups.append('("%s", &[\n %s\n ]),'
110+
% (group_name, ranges_to_rust(groups[group_name])))
111+
return '\n'.join(rust_groups)
112+
113+
114+
if __name__ == '__main__':
115+
parser = argparse.ArgumentParser(
116+
description='Generate Unicode character class tables.')
117+
aa = parser.add_argument
118+
aa('--local', action='store_true',
119+
help='When set, Scripts.txt and UnicodeData.txt will be read from '
120+
'the CWD.')
121+
aa('--base-url', type=str, default=BASE_URL,
122+
help='The base URL to use for downloading Unicode data files.')
123+
args = parser.parse_args()
124+
125+
if args.local:
126+
cats = read_cats(open(DATA))
127+
scripts = read_scripts(open(SCRIPTS))
128+
else:
129+
cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
130+
scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))
131+
132+
# Get Rust code for all Unicode general categories and scripts.
133+
combined = dict(cats, **scripts)
134+
unigroups = groups_to_rust({k: group(letters)
135+
for k, letters in combined.items()})
136+
137+
# Now get Perl character classes that are Unicode friendly.
138+
perld = range(ord('0'), ord('9') + 1)
139+
dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))
140+
141+
perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
142+
sgroups = ranges_to_rust(group(perls + cats['Z'][:]))
143+
144+
low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
145+
perlw = [ord('_')] + perld + low + up
146+
wgroups = ranges_to_rust(group(perlw + cats['L'][:]))
147+
148+
tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
149+
// file at the top-level directory of this distribution and at
150+
// http://rust-lang.org/COPYRIGHT.
151+
//
152+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
153+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
154+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
155+
// option. This file may not be copied, modified, or distributed
156+
// except according to those terms.
157+
158+
// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
159+
// on {date}.
160+
161+
use parse::{{Class, NamedClasses}};
162+
163+
pub static UNICODE_CLASSES: NamedClasses = &[
164+
165+
{groups}
166+
167+
];
168+
169+
pub static PERLD: Class = &[
170+
{dgroups}
171+
];
172+
173+
pub static PERLS: Class = &[
174+
{sgroups}
175+
];
176+
177+
pub static PERLW: Class = &[
178+
{wgroups}
179+
];
180+
'''
181+
now = datetime.datetime.now()
182+
print(tpl.format(date=str(now), groups=unigroups,
183+
dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))

‎src/libregex/compile.rs

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// Enable this to squash warnings due to exporting pieces of the representation
12+
// for use with the regex! macro. See lib.rs for explanation.
13+
#![allow(visible_private_types)]
14+
15+
use std::cmp;
16+
use std::iter;
17+
use parse;
18+
use parse::{
19+
Flags, FLAG_EMPTY,
20+
Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt,
21+
Rep,
22+
ZeroOne, ZeroMore, OneMore,
23+
};
24+
25+
type InstIdx = uint;
26+
27+
#[deriving(Show, Clone)]
28+
pub enum Inst {
29+
// When a Match instruction is executed, the current thread is successful.
30+
Match,
31+
32+
// The OneChar instruction matches a literal character.
33+
// The flags indicate whether to do a case insensitive match.
34+
OneChar(char, Flags),
35+
36+
// The CharClass instruction tries to match one input character against
37+
// the range of characters given.
38+
// The flags indicate whether to do a case insensitive match and whether
39+
// the character class is negated or not.
40+
CharClass(Vec<(char, char)>, Flags),
41+
42+
// Matches any character except new lines.
43+
// The flags indicate whether to include the '\n' character.
44+
Any(Flags),
45+
46+
// Matches the beginning of the string, consumes no characters.
47+
// The flags indicate whether it matches if the preceding character
48+
// is a new line.
49+
EmptyBegin(Flags),
50+
51+
// Matches the end of the string, consumes no characters.
52+
// The flags indicate whether it matches if the following character
53+
// is a new line.
54+
EmptyEnd(Flags),
55+
56+
// Matches a word boundary (\w on one side and \W \A or \z on the other),
57+
// and consumes no character.
58+
// The flags indicate whether this matches a word boundary or something
59+
// that isn't a word boundary.
60+
EmptyWordBoundary(Flags),
61+
62+
// Saves the current position in the input string to the Nth save slot.
63+
Save(uint),
64+
65+
// Jumps to the instruction at the index given.
66+
Jump(InstIdx),
67+
68+
// Jumps to the instruction at the first index given. If that leads to
69+
// a failing state, then the instruction at the second index given is
70+
// tried.
71+
Split(InstIdx, InstIdx),
72+
}
73+
74+
/// Program represents a compiled regular expression. Once an expression is
75+
/// compiled, its representation is immutable and will never change.
76+
///
77+
/// All of the data in a compiled expression is wrapped in "MaybeStatic" or
78+
/// "MaybeOwned" types so that a `Program` can be represented as static data.
79+
/// (This makes it convenient and efficient for use with the `regex!` macro.)
80+
#[deriving(Clone)]
81+
pub struct Program {
82+
/// A sequence of instructions.
83+
pub insts: Vec<Inst>,
84+
/// If the regular expression requires a literal prefix in order to have a
85+
/// match, that prefix is stored here. (It's used in the VM to implement
86+
/// an optimization.)
87+
pub prefix: ~str,
88+
}
89+
90+
impl Program {
91+
/// Compiles a Regex given its AST.
92+
pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) {
93+
let mut c = Compiler {
94+
insts: Vec::with_capacity(100),
95+
names: Vec::with_capacity(10),
96+
};
97+
98+
c.insts.push(Save(0));
99+
c.compile(ast);
100+
c.insts.push(Save(1));
101+
c.insts.push(Match);
102+
103+
// Try to discover a literal string prefix.
104+
// This is a bit hacky since we have to skip over the initial
105+
// 'Save' instruction.
106+
let mut pre = StrBuf::with_capacity(5);
107+
for i in iter::range(1, c.insts.len()) {
108+
match *c.insts.get(i) {
109+
OneChar(c, FLAG_EMPTY) => pre.push_char(c),
110+
_ => break
111+
}
112+
}
113+
114+
let names = c.names.as_slice().into_owned();
115+
let prog = Program {
116+
insts: c.insts,
117+
prefix: pre.into_owned(),
118+
};
119+
(prog, names)
120+
}
121+
122+
/// Returns the total number of capture groups in the regular expression.
123+
/// This includes the zeroth capture.
124+
pub fn num_captures(&self) -> uint {
125+
let mut n = 0;
126+
for inst in self.insts.iter() {
127+
match *inst {
128+
Save(c) => n = cmp::max(n, c+1),
129+
_ => {}
130+
}
131+
}
132+
// There's exactly 2 Save slots for every capture.
133+
n / 2
134+
}
135+
}
136+
137+
struct Compiler<'r> {
138+
insts: Vec<Inst>,
139+
names: Vec<Option<~str>>,
140+
}
141+
142+
// The compiler implemented here is extremely simple. Most of the complexity
143+
// in this crate is in the parser or the VM.
144+
// The only tricky thing here is patching jump/split instructions to point to
145+
// the right instruction.
146+
impl<'r> Compiler<'r> {
147+
fn compile(&mut self, ast: ~parse::Ast) {
148+
match ast {
149+
~Nothing => {},
150+
~Literal(c, flags) => self.push(OneChar(c, flags)),
151+
~Dot(nl) => self.push(Any(nl)),
152+
~Class(ranges, flags) =>
153+
self.push(CharClass(ranges, flags)),
154+
~Begin(flags) => self.push(EmptyBegin(flags)),
155+
~End(flags) => self.push(EmptyEnd(flags)),
156+
~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)),
157+
~Capture(cap, name, x) => {
158+
let len = self.names.len();
159+
if cap >= len {
160+
self.names.grow(10 + cap - len, &None)
161+
}
162+
*self.names.get_mut(cap) = name;
163+
164+
self.push(Save(2 * cap));
165+
self.compile(x);
166+
self.push(Save(2 * cap + 1));
167+
}
168+
~Cat(xs) => {
169+
for x in xs.move_iter() {
170+
self.compile(x)
171+
}
172+
}
173+
~Alt(x, y) => {
174+
let split = self.empty_split(); // push: split 0, 0
175+
let j1 = self.insts.len();
176+
self.compile(x); // push: insts for x
177+
let jmp = self.empty_jump(); // push: jmp 0
178+
let j2 = self.insts.len();
179+
self.compile(y); // push: insts for y
180+
let j3 = self.insts.len();
181+
182+
self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2
183+
self.set_jump(jmp, j3); // jmp 0 -> jmp j3
184+
}
185+
~Rep(x, ZeroOne, g) => {
186+
let split = self.empty_split();
187+
let j1 = self.insts.len();
188+
self.compile(x);
189+
let j2 = self.insts.len();
190+
191+
if g.is_greedy() {
192+
self.set_split(split, j1, j2);
193+
} else {
194+
self.set_split(split, j2, j1);
195+
}
196+
}
197+
~Rep(x, ZeroMore, g) => {
198+
let j1 = self.insts.len();
199+
let split = self.empty_split();
200+
let j2 = self.insts.len();
201+
self.compile(x);
202+
let jmp = self.empty_jump();
203+
let j3 = self.insts.len();
204+
205+
self.set_jump(jmp, j1);
206+
if g.is_greedy() {
207+
self.set_split(split, j2, j3);
208+
} else {
209+
self.set_split(split, j3, j2);
210+
}
211+
}
212+
~Rep(x, OneMore, g) => {
213+
let j1 = self.insts.len();
214+
self.compile(x);
215+
let split = self.empty_split();
216+
let j2 = self.insts.len();
217+
218+
if g.is_greedy() {
219+
self.set_split(split, j1, j2);
220+
} else {
221+
self.set_split(split, j2, j1);
222+
}
223+
}
224+
}
225+
}
226+
227+
/// Appends the given instruction to the program.
228+
#[inline]
229+
fn push(&mut self, x: Inst) {
230+
self.insts.push(x)
231+
}
232+
233+
/// Appends an *empty* `Split` instruction to the program and returns
234+
/// the index of that instruction. (The index can then be used to "patch"
235+
/// the actual locations of the split in later.)
236+
#[inline]
237+
fn empty_split(&mut self) -> InstIdx {
238+
self.insts.push(Split(0, 0));
239+
self.insts.len() - 1
240+
}
241+
242+
/// Sets the left and right locations of a `Split` instruction at index
243+
/// `i` to `pc1` and `pc2`, respectively.
244+
/// If the instruction at index `i` isn't a `Split` instruction, then
245+
/// `fail!` is called.
246+
#[inline]
247+
fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) {
248+
let split = self.insts.get_mut(i);
249+
match *split {
250+
Split(_, _) => *split = Split(pc1, pc2),
251+
_ => fail!("BUG: Invalid split index."),
252+
}
253+
}
254+
255+
/// Appends an *empty* `Jump` instruction to the program and returns the
256+
/// index of that instruction.
257+
#[inline]
258+
fn empty_jump(&mut self) -> InstIdx {
259+
self.insts.push(Jump(0));
260+
self.insts.len() - 1
261+
}
262+
263+
/// Sets the location of a `Jump` instruction at index `i` to `pc`.
264+
/// If the instruction at index `i` isn't a `Jump` instruction, then
265+
/// `fail!` is called.
266+
#[inline]
267+
fn set_jump(&mut self, i: InstIdx, pc: InstIdx) {
268+
let jmp = self.insts.get_mut(i);
269+
match *jmp {
270+
Jump(_) => *jmp = Jump(pc),
271+
_ => fail!("BUG: Invalid jump index."),
272+
}
273+
}
274+
}

‎src/libregex/lib.rs

Lines changed: 426 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex/parse.rs

Lines changed: 1028 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex/re.rs

Lines changed: 870 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex/test/bench.rs

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
use rand::{Rng, task_rng};
12+
use stdtest::Bencher;
13+
use std::str;
14+
use regex::{Regex, NoExpand};
15+
16+
fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) {
17+
b.iter(|| if !re.is_match(text) { fail!("no match") });
18+
}
19+
20+
#[bench]
21+
fn no_exponential(b: &mut Bencher) {
22+
let n = 100;
23+
let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap();
24+
let text = "a".repeat(n);
25+
bench_assert_match(b, re, text);
26+
}
27+
28+
#[bench]
29+
fn literal(b: &mut Bencher) {
30+
let re = regex!("y");
31+
let text = "x".repeat(50) + "y";
32+
bench_assert_match(b, re, text);
33+
}
34+
35+
#[bench]
36+
fn not_literal(b: &mut Bencher) {
37+
let re = regex!(".y");
38+
let text = "x".repeat(50) + "y";
39+
bench_assert_match(b, re, text);
40+
}
41+
42+
#[bench]
43+
fn match_class(b: &mut Bencher) {
44+
let re = regex!("[abcdw]");
45+
let text = "xxxx".repeat(20) + "w";
46+
bench_assert_match(b, re, text);
47+
}
48+
49+
#[bench]
50+
fn match_class_in_range(b: &mut Bencher) {
51+
// 'b' is between 'a' and 'c', so the class range checking doesn't help.
52+
let re = regex!("[ac]");
53+
let text = "bbbb".repeat(20) + "c";
54+
bench_assert_match(b, re, text);
55+
}
56+
57+
#[bench]
58+
fn replace_all(b: &mut Bencher) {
59+
let re = regex!("[cjrw]");
60+
let text = "abcdefghijklmnopqrstuvwxyz";
61+
// FIXME: This isn't using the $name expand stuff.
62+
// It's possible RE2/Go is using it, but currently, the expand in this
63+
// crate is actually compiling a regex, so it's incredibly slow.
64+
b.iter(|| re.replace_all(text, NoExpand("")));
65+
}
66+
67+
#[bench]
68+
fn anchored_literal_short_non_match(b: &mut Bencher) {
69+
let re = regex!("^zbc(d|e)");
70+
let text = "abcdefghijklmnopqrstuvwxyz";
71+
b.iter(|| re.is_match(text));
72+
}
73+
74+
#[bench]
75+
fn anchored_literal_long_non_match(b: &mut Bencher) {
76+
let re = regex!("^zbc(d|e)");
77+
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
78+
b.iter(|| re.is_match(text));
79+
}
80+
81+
#[bench]
82+
fn anchored_literal_short_match(b: &mut Bencher) {
83+
let re = regex!("^.bc(d|e)");
84+
let text = "abcdefghijklmnopqrstuvwxyz";
85+
b.iter(|| re.is_match(text));
86+
}
87+
88+
#[bench]
89+
fn anchored_literal_long_match(b: &mut Bencher) {
90+
let re = regex!("^.bc(d|e)");
91+
let text = "abcdefghijklmnopqrstuvwxyz".repeat(15);
92+
b.iter(|| re.is_match(text));
93+
}
94+
95+
#[bench]
96+
fn one_pass_short_a(b: &mut Bencher) {
97+
let re = regex!("^.bc(d|e)*$");
98+
let text = "abcddddddeeeededd";
99+
b.iter(|| re.is_match(text));
100+
}
101+
102+
#[bench]
103+
fn one_pass_short_a_not(b: &mut Bencher) {
104+
let re = regex!(".bc(d|e)*$");
105+
let text = "abcddddddeeeededd";
106+
b.iter(|| re.is_match(text));
107+
}
108+
109+
#[bench]
110+
fn one_pass_short_b(b: &mut Bencher) {
111+
let re = regex!("^.bc(?:d|e)*$");
112+
let text = "abcddddddeeeededd";
113+
b.iter(|| re.is_match(text));
114+
}
115+
116+
#[bench]
117+
fn one_pass_short_b_not(b: &mut Bencher) {
118+
let re = regex!(".bc(?:d|e)*$");
119+
let text = "abcddddddeeeededd";
120+
b.iter(|| re.is_match(text));
121+
}
122+
123+
#[bench]
124+
fn one_pass_long_prefix(b: &mut Bencher) {
125+
let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$");
126+
let text = "abcdefghijklmnopqrstuvwxyz";
127+
b.iter(|| re.is_match(text));
128+
}
129+
130+
#[bench]
131+
fn one_pass_long_prefix_not(b: &mut Bencher) {
132+
let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$");
133+
let text = "abcdefghijklmnopqrstuvwxyz";
134+
b.iter(|| re.is_match(text));
135+
}
136+
137+
macro_rules! throughput(
138+
($name:ident, $regex:expr, $size:expr) => (
139+
#[bench]
140+
fn $name(b: &mut Bencher) {
141+
let text = gen_text($size);
142+
b.bytes = $size;
143+
b.iter(|| if $regex.is_match(text) { fail!("match") });
144+
}
145+
);
146+
)
147+
148+
fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
149+
fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") }
150+
fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
151+
fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") }
152+
153+
fn gen_text(n: uint) -> ~str {
154+
let mut rng = task_rng();
155+
let mut bytes = rng.gen_ascii_str(n).into_bytes();
156+
for (i, b) in bytes.mut_iter().enumerate() {
157+
if i % 20 == 0 {
158+
*b = '\n' as u8
159+
}
160+
}
161+
str::from_utf8(bytes).unwrap().to_owned()
162+
}
163+
164+
throughput!(easy0_32, easy0(), 32)
165+
throughput!(easy0_1K, easy0(), 1<<10)
166+
throughput!(easy0_32K, easy0(), 32<<10)
167+
168+
throughput!(easy1_32, easy1(), 32)
169+
throughput!(easy1_1K, easy1(), 1<<10)
170+
throughput!(easy1_32K, easy1(), 32<<10)
171+
172+
throughput!(medium_32, medium(), 32)
173+
throughput!(medium_1K, medium(), 1<<10)
174+
throughput!(medium_32K,medium(), 32<<10)
175+
176+
throughput!(hard_32, hard(), 32)
177+
throughput!(hard_1K, hard(), 1<<10)
178+
throughput!(hard_32K,hard(), 32<<10)
179+

‎src/libregex/test/matches.rs

Lines changed: 373 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex/test/mod.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
#[cfg(not(stage1))]
12+
#[phase(syntax)]
13+
extern crate regex_macros;
14+
15+
// Dirty hack: During stage1, test dynamic regexs. For stage2, we test
16+
// native regexs.
17+
#[cfg(stage1)]
18+
macro_rules! regex(
19+
($re:expr) => (
20+
match ::regex::Regex::new($re) {
21+
Ok(re) => re,
22+
Err(err) => fail!("{}", err),
23+
}
24+
);
25+
)
26+
27+
mod bench;
28+
mod tests;
29+

‎src/libregex/test/tests.rs

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// ignore-tidy-linelength
12+
13+
use regex::{Regex, NoExpand};
14+
15+
#[test]
16+
fn splitn() {
17+
let re = regex!(r"\d+");
18+
let text = "cauchy123plato456tyler789binx";
19+
let subs: Vec<&str> = re.splitn(text, 2).collect();
20+
assert_eq!(subs, vec!("cauchy", "plato456tyler789binx"));
21+
}
22+
23+
#[test]
24+
fn split() {
25+
let re = regex!(r"\d+");
26+
let text = "cauchy123plato456tyler789binx";
27+
let subs: Vec<&str> = re.split(text).collect();
28+
assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx"));
29+
}
30+
31+
macro_rules! replace(
32+
($name:ident, $which:ident, $re:expr,
33+
$search:expr, $replace:expr, $result:expr) => (
34+
#[test]
35+
fn $name() {
36+
let re = regex!($re);
37+
assert_eq!(re.$which($search, $replace), StrBuf::from_str($result));
38+
}
39+
);
40+
)
41+
42+
replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6")
43+
replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z")
44+
replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ")
45+
replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1")
46+
replace!(rep_double_dollar, replace,
47+
r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1")
48+
replace!(rep_no_expand, replace,
49+
r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1")
50+
replace!(rep_named, replace_all,
51+
r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)",
52+
"w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3")
53+
replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t",
54+
"", "trim me")
55+
56+
// Generates a `#[test]` function named `$name` asserting that `$re` is
// rejected by the parser: `Regex::new` must return `Err`. If the pattern
// compiles, the test fails with a message that includes the pattern.
macro_rules! noparse(
57+
($name:ident, $re:expr) => (
58+
#[test]
59+
fn $name() {
60+
let re = $re;
61+
match Regex::new(re) {
62+
Err(_) => {},
63+
Ok(_) => fail!("Regex '{}' should cause a parse error.", re),
64+
}
65+
}
66+
);
67+
)
68+
69+
// Patterns that must be rejected by `Regex::new`. Each line is a test name
// followed by the invalid pattern.
// NOTE(review): `fail_incomplete_escape` and `fail_unfinished_escape` use the
// same pattern ("\\"), so one of them is redundant.
// NOTE(review): `fail_bad_capture_name` also has an empty group body, which
// `fail_empty_capture_exp` suggests is an error on its own; the test may
// therefore pass without exercising the name check - confirm against the
// parser.
noparse!(fail_double_repeat, "a**")
70+
noparse!(fail_no_repeat_arg, "*")
71+
noparse!(fail_no_repeat_arg_begin, "^*")
72+
noparse!(fail_incomplete_escape, "\\")
73+
noparse!(fail_class_incomplete, "[A-")
74+
noparse!(fail_class_not_closed, "[A")
75+
noparse!(fail_class_no_begin, r"[\A]")
76+
noparse!(fail_class_no_end, r"[\z]")
77+
noparse!(fail_class_no_boundary, r"[\b]")
78+
noparse!(fail_open_paren, "(")
79+
noparse!(fail_close_paren, ")")
80+
noparse!(fail_invalid_range, "[a-Z]")
81+
noparse!(fail_empty_capture_name, "(?P<>a)")
82+
noparse!(fail_empty_capture_exp, "(?P<name>)")
83+
noparse!(fail_bad_capture_name, "(?P<na-me>)")
84+
noparse!(fail_bad_flag, "(?a)a")
85+
noparse!(fail_empty_alt_before, "|a")
86+
noparse!(fail_empty_alt_after, "a|")
87+
noparse!(fail_counted_big_exact, "a{1001}")
88+
noparse!(fail_counted_big_min, "a{1001,}")
89+
noparse!(fail_counted_no_close, "a{1001")
90+
noparse!(fail_unfinished_cap, "(?")
91+
noparse!(fail_unfinished_escape, "\\")
92+
noparse!(fail_octal_digit, r"\8")
93+
noparse!(fail_hex_digit, r"\xG0")
94+
noparse!(fail_hex_short, r"\xF")
95+
noparse!(fail_hex_long_digits, r"\x{fffg}")
96+
noparse!(fail_flag_bad, "(?a)")
97+
noparse!(fail_flag_empty, "(?)")
98+
noparse!(fail_double_neg, "(?-i-i)")
99+
noparse!(fail_neg_empty, "(?i-)")
100+
noparse!(fail_empty_group, "()")
101+
noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)")
102+
103+
// Generates a `#[test]` function named `$name` that matches `$re` against
// `$text` and compares the capture positions with the expected ones.
//
//   $name - name of the generated test function
//   $re   - pattern handed to `regex!`
//   $text - text to search
//   $loc  - one or more tokens forming the contents of a
//           `Vec<Option<(uint, uint)>>`: one `Some((start, end))` or `None`
//           per capture group, in the order yielded by `iter_pos()`
//
// A pattern that does not match at all is represented as the single-element
// list `[None]`, so "no match" is written as a lone `None` in `$loc`.
macro_rules! mat(
104+
($name:ident, $re:expr, $text:expr, $($loc:tt)+) => (
105+
#[test]
106+
fn $name() {
107+
let text = $text;
108+
let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+);
109+
let r = regex!($re);
110+
let got = match r.captures(text) {
111+
Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(),
112+
None => vec!(None),
113+
};
114+
// The test set sometimes leaves out capture groups, so truncate
115+
// actual capture groups to match test set.
116+
let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice());
117+
if sgot.len() > sexpect.len() {
118+
sgot = sgot.slice(0, sexpect.len())
119+
}
120+
if sexpect != sgot {
121+
fail!("For RE '{}' against '{}', expected '{}' but got '{}'",
122+
$re, text, sexpect, sgot);
123+
}
124+
}
125+
);
126+
)
127+
128+
// Each `mat!` line gives: test name, pattern, text to search, and then the
// expected capture positions as `Some((start, end))` entries, or a single
// `None` when the pattern must not match.

// Some crazy expressions from regular-expressions.info.
129+
mat!(match_ranges,
130+
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
131+
"num: 255", Some((5, 8)))
132+
mat!(match_ranges_not,
133+
r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b",
134+
"num: 256", None)
135+
mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3)))
136+
mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3)))
137+
mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4)))
138+
mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None)
139+
mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
140+
"mine is jam.slam@gmail.com ", Some((8, 26)))
141+
mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b",
142+
"mine is jam.slam@gmail ", None)
143+
mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?",
144+
"mine is jam.slam@gmail.com ", Some((8, 26)))
145+
mat!(match_date1,
146+
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
147+
"1900-01-01", Some((0, 10)))
148+
mat!(match_date2,
149+
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
150+
"1900-00-01", None)
151+
mat!(match_date3,
152+
r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$",
153+
"1900-13-01", None)
154+
155+
// Exercise the flags.
// Per the expected spans below, `(?U)` swaps greediness: under it `a+` takes
// the shortest match and `a+?` the longest, and `(?-U)` switches it back off.
156+
mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3)))
157+
mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3)))
158+
mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None)
159+
mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2)))
160+
mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4)))
161+
mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None)
162+
mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2)))
163+
mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11)))
164+
mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1)))
165+
mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2)))
166+
mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2)))
167+
168+
// Some Unicode tests.
169+
mat!(uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3)))
170+
mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3)))
171+
mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8)))
172+
mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2)))
173+
mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2)))
174+
mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5)))
175+
mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2)))
176+
mat!(uni_case_not, r"Δ", "δ", None)
177+
mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8)))
178+
mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10)))
179+
mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10)))
180+
mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10)))
181+
182+
// Test the Unicode friendliness of Perl character classes.
183+
mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4)))
184+
mat!(uni_perl_w_not, r"\w+", "Ⅱ", None)
185+
mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3)))
186+
mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8)))
187+
mat!(uni_perl_d_not, r"\d+", "Ⅱ", None)
188+
mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3)))
189+
mat!(uni_perl_s, r"\s+", " ", Some((0, 3)))
190+
mat!(uni_perl_s_not, r"\s+", "☃", None)
191+
mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3)))
192+
193+
// And do the same for word boundaries.
194+
mat!(uni_boundary_none, r"\d\b", "6δ", None)
195+
mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1)))
196+
197+
// A whole mess of tests from Glenn Fowler's regex test suite.
198+
// Generated by the 'src/etc/regex-match-tests' program.
199+
mod matches;

‎src/libregex/testdata/LICENSE

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
The following license covers testregex.c and all associated test data.
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a
4+
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
5+
without restriction, including without limitation the rights to use,
6+
copy, modify, merge, publish, distribute, and/or sell copies of the
7+
Software, and to permit persons to whom the Software is furnished to do
8+
so, subject to the following disclaimer:
9+
10+
THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
11+
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
12+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
13+
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
14+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
15+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
16+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
17+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
18+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
19+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

‎src/libregex/testdata/README

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
Test data was taken from the Go distribution, which was in turn taken from the
2+
testregex test suite:
3+
4+
http://www2.research.att.com/~astopen/testregex/testregex.html
5+
6+
The LICENSE in this directory corresponds to the LICENSE that the data was
7+
released under.
8+
9+
The tests themselves were modified for RE2/Go. A couple were modified further
10+
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
11+
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
12+
have been a bad idea, but I think being consistent with an established Regex
13+
library is worth something.
14+
15+
Note that these files are read by 'src/etc/regex-match-tests' and turned into
16+
Rust tests found in 'src/libregex/tests/matches.rs'.
17+

‎src/libregex/testdata/basic.dat

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
NOTE all standard compliant implementations should pass these : 2002-05-31
2+
3+
BE abracadabra$ abracadabracadabra (7,18)
4+
BE a...b abababbb (2,7)
5+
BE XXXXXX ..XXXXXX (2,8)
6+
E \) () (1,2)
7+
BE a] a]a (0,2)
8+
B } } (0,1)
9+
E \} } (0,1)
10+
BE \] ] (0,1)
11+
B ] ] (0,1)
12+
E ] ] (0,1)
13+
B { { (0,1)
14+
B } } (0,1)
15+
BE ^a ax (0,1)
16+
BE \^a a^a (1,3)
17+
BE a\^ a^ (0,2)
18+
BE a$ aa (1,2)
19+
BE a\$ a$ (0,2)
20+
BE ^$ NULL (0,0)
21+
E $^ NULL (0,0)
22+
E a($) aa (1,2)(2,2)
23+
E a*(^a) aa (0,1)(0,1)
24+
E (..)*(...)* a (0,0)
25+
E (..)*(...)* abcd (0,4)(2,4)
26+
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
27+
E (ab)c|abc abc (0,3)(0,2)
28+
E a{0}b ab (1,2)
29+
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
30+
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
31+
E a{9876543210} NULL BADBR
32+
E ((a|a)|a) a (0,1)(0,1)(0,1)
33+
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
34+
E a*(a.|aa) aaaa (0,4)(2,4)
35+
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
36+
E (a|b)?.* b (0,1)(0,1)
37+
E (a|b)c|a(b|c) ac (0,2)(0,1)
38+
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
39+
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
40+
E (a|b)*c|(a|ab)*c xc (1,2)
41+
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
42+
E a?(ab|ba)ab abab (0,4)(0,2)
43+
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
44+
E ab|abab abbabab (0,2)
45+
E aba|bab|bba baaabbbaba (5,8)
46+
E aba|bab baaabbbaba (6,9)
47+
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
48+
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
49+
E ab|a xabc (1,3)
50+
E ab|a xxabc (2,4)
51+
Ei (Ab|cD)* aBcD (0,4)(2,4)
52+
BE [^-] --a (2,3)
53+
BE [a-]* --a (0,3)
54+
BE [a-m-]* --amoma-- (0,4)
55+
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
56+
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
57+
{E [[:upper:]] A (0,1) [[<element>]] not supported
58+
E [[:lower:]]+ `az{ (1,3)
59+
E [[:upper:]]+ @AZ[ (1,3)
60+
# No collation in Go
61+
#BE [[-]] [[-]] (2,4)
62+
#BE [[.NIL.]] NULL ECOLLATE
63+
#BE [[=aleph=]] NULL ECOLLATE
64+
}
65+
BE$ \n \n (0,1)
66+
BEn$ \n \n (0,1)
67+
BE$ [^a] \n (0,1)
68+
BE$ \na \na (0,2)
69+
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
70+
BE xxx xxx (0,3)
71+
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
72+
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
73+
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
74+
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
75+
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
76+
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
77+
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
78+
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
79+
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
80+
BE$ .* \x01\x7f (0,2)
81+
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
82+
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
83+
E a*a*a*a*a*b aaaaaaaaab (0,10)
84+
BE ^ NULL (0,0)
85+
BE $ NULL (0,0)
86+
BE ^$ NULL (0,0)
87+
BE ^a$ a (0,1)
88+
BE abc abc (0,3)
89+
BE abc xabcy (1,4)
90+
BE abc ababc (2,5)
91+
BE ab*c abc (0,3)
92+
BE ab*bc abc (0,3)
93+
BE ab*bc abbc (0,4)
94+
BE ab*bc abbbbc (0,6)
95+
E ab+bc abbc (0,4)
96+
E ab+bc abbbbc (0,6)
97+
E ab?bc abbc (0,4)
98+
E ab?bc abc (0,3)
99+
E ab?c abc (0,3)
100+
BE ^abc$ abc (0,3)
101+
BE ^abc abcc (0,3)
102+
BE abc$ aabc (1,4)
103+
BE ^ abc (0,0)
104+
BE $ abc (3,3)
105+
BE a.c abc (0,3)
106+
BE a.c axc (0,3)
107+
BE a.*c axyzc (0,5)
108+
BE a[bc]d abd (0,3)
109+
BE a[b-d]e ace (0,3)
110+
BE a[b-d] aac (1,3)
111+
BE a[-b] a- (0,2)
112+
BE a[b-] a- (0,2)
113+
BE a] a] (0,2)
114+
BE a[]]b a]b (0,3)
115+
BE a[^bc]d aed (0,3)
116+
BE a[^-b]c adc (0,3)
117+
BE a[^]b]c adc (0,3)
118+
E ab|cd abc (0,2)
119+
E ab|cd abcd (0,2)
120+
E a\(b a(b (0,3)
121+
E a\(*b ab (0,2)
122+
E a\(*b a((b (0,4)
123+
E ((a)) abc (0,1)(0,1)(0,1)
124+
E (a)b(c) abc (0,3)(0,1)(2,3)
125+
E a+b+c aabbabc (4,7)
126+
E a* aaa (0,3)
127+
#E (a*)* - (0,0)(0,0)
128+
E (a*)* - (0,0)(?,?) RE2/Go
129+
E (a*)+ - (0,0)(0,0)
130+
#E (a*|b)* - (0,0)(0,0)
131+
E (a*|b)* - (0,0)(?,?) RE2/Go
132+
E (a+|b)* ab (0,2)(1,2)
133+
E (a+|b)+ ab (0,2)(1,2)
134+
E (a+|b)? ab (0,1)(0,1)
135+
BE [^ab]* cde (0,3)
136+
#E (^)* - (0,0)(0,0)
137+
E (^)* - (0,0)(?,?) RE2/Go
138+
BE a* NULL (0,0)
139+
E ([abc])*d abbbcd (0,6)(4,5)
140+
E ([abc])*bcd abcd (0,4)(0,1)
141+
E a|b|c|d|e e (0,1)
142+
E (a|b|c|d|e)f ef (0,2)(0,1)
143+
#E ((a*|b))* - (0,0)(0,0)(0,0)
144+
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
145+
BE abcd*efg abcdefg (0,7)
146+
BE ab* xabyabbbz (1,3)
147+
BE ab* xayabbbz (1,2)
148+
E (ab|cd)e abcde (2,5)(2,4)
149+
BE [abhgefdc]ij hij (0,3)
150+
E (a|b)c*d abcd (1,4)(1,2)
151+
E (ab|ab*)bc abc (0,3)(0,1)
152+
E a([bc]*)c* abc (0,3)(1,3)
153+
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
154+
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
155+
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
156+
E a[bcd]*dcdcde adcdcde (0,7)
157+
E (ab|a)b*c abc (0,3)(0,2)
158+
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
159+
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
160+
E ^a(bc+|b[eh])g|.h$ abh (1,3)
161+
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
162+
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
163+
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
164+
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
165+
BE multiple words multiple words yeah (0,14)
166+
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
167+
BE abcd abcd (0,4)
168+
E a(bc)d abcd (0,4)(1,3)
169+
E a[-]?c ac (0,3)
170+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
171+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
172+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
173+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
174+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
175+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
176+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
177+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
178+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
179+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
180+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
181+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
182+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
183+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
184+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
185+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
186+
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
187+
E a+(b|c)*d+ aabcdd (0,6)(3,4)
188+
E ^.+$ vivi (0,4)
189+
E ^(.+)$ vivi (0,4)(0,4)
190+
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
191+
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
192+
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
193+
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
194+
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
195+
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
196+
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
197+
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
198+
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
199+
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
200+
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
201+
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
202+
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
203+
E (foo|(bar))!bas foo!bas (0,7)(0,3)
204+
E (foo|bar)!bas bar!bas (0,7)(0,3)
205+
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
206+
E (foo|bar)!bas foo!bas (0,7)(0,3)
207+
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
208+
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
209+
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
210+
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
211+
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
212+
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
213+
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
214+
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
215+
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
216+
E .*(/XXX).* /XXX (0,4)(0,4)
217+
E .*(\\XXX).* \XXX (0,4)(0,4)
218+
E \\XXX \XXX (0,4)
219+
E .*(/000).* /000 (0,4)(0,4)
220+
E .*(\\000).* \000 (0,4)(0,4)
221+
E \\000 \000 (0,4)

‎src/libregex/testdata/nullsubexpr.dat

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
NOTE null subexpression matches : 2002-06-06
2+
3+
E (a*)* a (0,1)(0,1)
4+
#E SAME x (0,0)(0,0)
5+
E SAME x (0,0)(?,?) RE2/Go
6+
E SAME aaaaaa (0,6)(0,6)
7+
E SAME aaaaaax (0,6)(0,6)
8+
E (a*)+ a (0,1)(0,1)
9+
E SAME x (0,0)(0,0)
10+
E SAME aaaaaa (0,6)(0,6)
11+
E SAME aaaaaax (0,6)(0,6)
12+
E (a+)* a (0,1)(0,1)
13+
E SAME x (0,0)
14+
E SAME aaaaaa (0,6)(0,6)
15+
E SAME aaaaaax (0,6)(0,6)
16+
E (a+)+ a (0,1)(0,1)
17+
E SAME x NOMATCH
18+
E SAME aaaaaa (0,6)(0,6)
19+
E SAME aaaaaax (0,6)(0,6)
20+
21+
E ([a]*)* a (0,1)(0,1)
22+
#E SAME x (0,0)(0,0)
23+
E SAME x (0,0)(?,?) RE2/Go
24+
E SAME aaaaaa (0,6)(0,6)
25+
E SAME aaaaaax (0,6)(0,6)
26+
E ([a]*)+ a (0,1)(0,1)
27+
E SAME x (0,0)(0,0)
28+
E SAME aaaaaa (0,6)(0,6)
29+
E SAME aaaaaax (0,6)(0,6)
30+
E ([^b]*)* a (0,1)(0,1)
31+
#E SAME b (0,0)(0,0)
32+
E SAME b (0,0)(?,?) RE2/Go
33+
E SAME aaaaaa (0,6)(0,6)
34+
E SAME aaaaaab (0,6)(0,6)
35+
E ([ab]*)* a (0,1)(0,1)
36+
E SAME aaaaaa (0,6)(0,6)
37+
E SAME ababab (0,6)(0,6)
38+
E SAME bababa (0,6)(0,6)
39+
E SAME b (0,1)(0,1)
40+
E SAME bbbbbb (0,6)(0,6)
41+
E SAME aaaabcde (0,5)(0,5)
42+
E ([^a]*)* b (0,1)(0,1)
43+
E SAME bbbbbb (0,6)(0,6)
44+
#E SAME aaaaaa (0,0)(0,0)
45+
E SAME aaaaaa (0,0)(?,?) RE2/Go
46+
E ([^ab]*)* ccccxx (0,6)(0,6)
47+
#E SAME ababab (0,0)(0,0)
48+
E SAME ababab (0,0)(?,?) RE2/Go
49+
50+
E ((z)+|a)* zabcde (0,2)(1,2)
51+
52+
#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
53+
#E (a) aaa (0,1)(0,1)
54+
#E (a*?) aaa (0,0)(0,0)
55+
#E (a)*? aaa (0,0)
56+
#E (a*?)*? aaa (0,0)
57+
#}
58+
59+
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
60+
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
61+
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
62+
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
63+
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
64+
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
65+
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
66+
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
67+
68+
#E (a*)*(x) x (0,1)(0,0)(0,1)
69+
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
70+
E (a*)*(x) ax (0,2)(0,1)(1,2)
71+
E (a*)*(x) axa (0,2)(0,1)(1,2)
72+
73+
E (a*)+(x) x (0,1)(0,0)(0,1)
74+
E (a*)+(x) ax (0,2)(0,1)(1,2)
75+
E (a*)+(x) axa (0,2)(0,1)(1,2)
76+
77+
E (a*){2}(x) x (0,1)(0,0)(0,1)
78+
E (a*){2}(x) ax (0,2)(1,1)(1,2)
79+
E (a*){2}(x) axa (0,2)(1,1)(1,2)

‎src/libregex/testdata/repetition.dat

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
NOTE implicit vs. explicit repetitions : 2009-02-02
2+
3+
# Glenn Fowler <gsf@research.att.com>
4+
# conforming matches (column 4) must match one of the following BREs
5+
# NOMATCH
6+
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
7+
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
8+
# i.e., each 3-tuple has two identical elements and one (?,?)
9+
10+
E ((..)|(.)) NULL NOMATCH
11+
E ((..)|(.))((..)|(.)) NULL NOMATCH
12+
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
13+
14+
E ((..)|(.)){1} NULL NOMATCH
15+
E ((..)|(.)){2} NULL NOMATCH
16+
E ((..)|(.)){3} NULL NOMATCH
17+
18+
E ((..)|(.))* NULL (0,0)
19+
20+
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
21+
E ((..)|(.))((..)|(.)) a NOMATCH
22+
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
23+
24+
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
25+
E ((..)|(.)){2} a NOMATCH
26+
E ((..)|(.)){3} a NOMATCH
27+
28+
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
29+
30+
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
31+
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
32+
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
33+
34+
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
35+
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
36+
E ((..)|(.)){3} aa NOMATCH
37+
38+
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
39+
40+
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
41+
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
42+
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
43+
44+
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
45+
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
46+
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
47+
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
48+
49+
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
50+
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
51+
52+
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
53+
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
54+
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
55+
56+
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
57+
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
58+
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
59+
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
60+
61+
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
62+
63+
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
64+
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
65+
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
66+
67+
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
68+
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
69+
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
70+
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
71+
72+
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
73+
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
74+
75+
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
76+
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
77+
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
78+
79+
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
80+
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
81+
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
82+
83+
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
84+
85+
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
86+
87+
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
88+
# Linux/GLIBC gets the {8,} and {8,8} wrong.
89+
90+
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
91+
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
92+
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
93+
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
94+
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
95+
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
96+
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
97+
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
98+
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
99+
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
100+
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
101+
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
102+
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
103+
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
104+
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
105+
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
106+
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
107+
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
108+
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
109+
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
110+
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
111+
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
112+
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
113+
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
114+
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
115+
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
116+
117+
# These test a fixed bug in my regex-tdfa that did not keep the expanded
118+
# form properly grouped, so right association did the wrong thing with
119+
# these ambiguous patterns (crafted just to test my code when I became
120+
# suspicious of my implementation). The first subexpression should use
121+
# "ab" then "a" then "bcd".
122+
123+
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
124+
# results like (0,6)(4,5)(6,6).
125+
126+
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
127+
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
128+
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
129+
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
130+
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
131+
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
132+
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
133+
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
134+
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
135+
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
136+
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
137+
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
138+
139+
# The above worked on Linux/GLIBC but the following often fail.
140+
# They also trip up OS X / FreeBSD / NetBSD:
141+
142+
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
143+
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
144+
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
145+
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
146+
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
147+
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
148+
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
149+
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
150+
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
151+
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
152+
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
153+
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
154+
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
155+
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
156+
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
157+
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
158+
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
159+
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
160+
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
161+
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
162+
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
163+
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

‎src/libregex/unicode.rs

Lines changed: 5537 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex/vm.rs

Lines changed: 587 additions & 0 deletions
Large diffs are not rendered by default.

‎src/libregex_macros/lib.rs

Lines changed: 684 additions & 0 deletions
Large diffs are not rendered by default.

‎src/test/bench/shootout-regex-dna.rs

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// FIXME(#13725) windows needs fixing.
12+
// ignore-win32
13+
// ignore-stage1
14+
// ignore-cross-compile #12102
15+
16+
#![feature(macro_rules, phase)]
17+
18+
extern crate regex;
19+
#[phase(syntax)]extern crate regex_macros;
20+
extern crate sync;
21+
22+
use std::io;
23+
use regex::{NoExpand, Regex};
24+
use sync::Arc;
25+
26+
// Returns how many matches `variant.find_iter` yields over `seq`.
fn count_matches(seq: &str, variant: &Regex) -> int {
27+
let mut n = 0;
28+
for _ in variant.find_iter(seq) {
29+
n += 1;
30+
}
31+
n
32+
}
33+
34+
// Benchmark driver. Reads the whole input, strips `>`-prefixed lines and
// newlines, then concurrently (a) applies a series of single-letter
// substitutions and (b) counts matches for nine pattern variants. Prints one
// "<variant> <count>" line per variant, a blank line, and then three lengths:
// the original input, the stripped input, and the substituted sequence.
fn main() {
35+
// Input comes from a fixed data file when the RUST_BENCH environment variable
// is set, and from stdin otherwise.
let mut rdr = if std::os::getenv("RUST_BENCH").is_some() {
36+
let fd = io::File::open(&Path::new("shootout-k-nucleotide.data"));
37+
~io::BufferedReader::new(fd) as ~io::Reader
38+
} else {
39+
~io::stdin() as ~io::Reader
40+
};
41+
let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap());
42+
// Length of the raw input, before anything is removed.
let ilen = seq.len();
43+
44+
// Remove every line that starts with `>` (including its newline) and every
// remaining newline.
seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand(""));
45+
let seq_arc = Arc::new(seq.clone()); // copy before it moves
46+
// Length after stripping.
let clen = seq.len();
47+
48+
// Run the substitutions in a future; `seq` is moved into the closure, and the
// future's result is the length of the fully substituted sequence.
let mut seqlen = sync::Future::spawn(proc() {
49+
let substs = ~[
50+
(regex!("B"), "(c|g|t)"),
51+
(regex!("D"), "(a|g|t)"),
52+
(regex!("H"), "(a|c|t)"),
53+
(regex!("K"), "(g|t)"),
54+
(regex!("M"), "(a|c)"),
55+
(regex!("N"), "(a|c|g|t)"),
56+
(regex!("R"), "(a|g)"),
57+
(regex!("S"), "(c|g)"),
58+
(regex!("V"), "(a|c|g)"),
59+
(regex!("W"), "(a|t)"),
60+
(regex!("Y"), "(c|t)"),
61+
];
62+
let mut seq = seq;
63+
// Substitutions are applied one after another, each to the previous result;
// `NoExpand` inserts the replacement text verbatim.
for (re, replacement) in substs.move_iter() {
64+
seq = re.replace_all(seq.as_slice(), NoExpand(replacement));
65+
}
66+
seq.len()
67+
});
68+
69+
let variants = ~[
70+
regex!("agggtaaa|tttaccct"),
71+
regex!("[cgt]gggtaaa|tttaccc[acg]"),
72+
regex!("a[act]ggtaaa|tttacc[agt]t"),
73+
regex!("ag[act]gtaaa|tttac[agt]ct"),
74+
regex!("agg[act]taaa|ttta[agt]cct"),
75+
regex!("aggg[acg]aaa|ttt[cgt]ccct"),
76+
regex!("agggt[cgt]aa|tt[acg]accct"),
77+
regex!("agggta[cgt]a|t[acg]taccct"),
78+
regex!("agggtaa[cgt]|[acg]ttaccct"),
79+
];
80+
// `variant_strs[i]` and `counts[i]` describe the same variant: its textual
// form and the future computing its match count.
let (mut variant_strs, mut counts) = (vec!(), vec!());
81+
for variant in variants.move_iter() {
82+
let seq_arc_copy = seq_arc.clone();
83+
// Record the pattern text first: `variant` is moved into the closure below.
variant_strs.push(variant.to_str().to_owned());
84+
counts.push(sync::Future::spawn(proc() {
85+
count_matches(seq_arc_copy.as_slice(), &variant)
86+
}));
87+
}
88+
89+
// Report the counts in the order the variants were listed.
for (i, variant) in variant_strs.iter().enumerate() {
90+
println!("{} {}", variant, counts.get_mut(i).get());
91+
}
92+
println!("");
93+
println!("{}", ilen);
94+
println!("{}", clen);
95+
println!("{}", seqlen.get());
96+
}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// FIXME(#13725) windows needs fixing.
12+
// ignore-win32
13+
// ignore-stage1
14+
15+
#![feature(phase)]
16+
17+
extern crate regex;
18+
#[phase(syntax)] extern crate regex_macros;
19+
20+
// Tests to make sure that `regex!` will produce a compile error when given
21+
// an invalid regular expression.
22+
// More exhaustive failure tests for the parser are done with the traditional
23+
// unit testing infrastructure, since both dynamic and native regexes use the
24+
// same parser.
25+
26+
// `(` is an unclosed group, so `regex!` must reject it at compile time.
// NOTE(review): the trailing `//~ ERROR ...` marker is presumably read by the
// compile-fail test harness to check the diagnostic; keep it on the same line
// as the `regex!` call.
fn main() {
27+
let _ = regex!("("); //~ ERROR Regex syntax error
28+
}

0 commit comments

Comments
 (0)
Please sign in to comment.