1
1
package edu .stanford .nlp .ling .tokensregex ;
2
2
3
- import edu .stanford .nlp .io .IOUtils ;
4
3
import edu .stanford .nlp .ling .CoreAnnotations ;
4
+ import edu .stanford .nlp .ling .CoreLabel ;
5
5
import edu .stanford .nlp .pipeline .*;
6
+ import edu .stanford .nlp .process .CoreLabelTokenFactory ;
7
+ import edu .stanford .nlp .process .PTBTokenizer ;
8
+ import edu .stanford .nlp .process .TokenizerFactory ;
6
9
import edu .stanford .nlp .util .CoreMap ;
7
10
import edu .stanford .nlp .util .Pair ;
8
- import edu .stanford .nlp .util .StringUtils ;
9
11
import edu .stanford .nlp .util .Timing ;
10
12
import junit .framework .TestCase ;
11
13
12
- import java .io .File ;
13
14
import java .io .IOException ;
15
+ import java .io .StringReader ;
14
16
import java .util .ArrayList ;
15
- import java .util .Arrays ;
16
17
import java .util .Iterator ;
17
18
import java .util .List ;
18
- import java .util .Map ;
19
19
import java .util .regex .Pattern ;
20
20
21
21
public class TokenSequenceMatcherITest extends TestCase {
@@ -94,6 +94,50 @@ public void testTokenSequenceMatcherValue() throws IOException {
94
94
assertFalse (match );
95
95
}
96
96
97
+ public void testTokenSequenceMatcherBeginEnd () throws IOException {
98
+ CoreMap doc = createDocument (testText );
99
+
100
+ // Test simple sequence with begin sequence matching
101
+ TokenSequencePattern p = TokenSequencePattern .compile ("^ [] []" );
102
+ TokenSequenceMatcher m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
103
+
104
+ boolean match = m .find ();
105
+ assertTrue (match );
106
+ assertEquals ("the number" , m .group ());
107
+
108
+ match = m .find ();
109
+ assertFalse (match );
110
+
111
+ // Test simple sequence with end sequence matching
112
+ p = TokenSequencePattern .compile ("[] [] $" );
113
+ m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
114
+
115
+ match = m .find ();
116
+ assertTrue (match );
117
+ assertEquals ("fifty." , m .group ());
118
+
119
+ match = m .find ();
120
+ assertFalse (match );
121
+
122
+ // Test simple sequence with begin and end sequence matching
123
+ p = TokenSequencePattern .compile ("^ [] [] $" );
124
+ m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
125
+
126
+ match = m .find ();
127
+ assertFalse (match );
128
+
129
+ // Test simple sequence with ^$ in a string regular expression
130
+ p = TokenSequencePattern .compile ("/^number$/" );
131
+ m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
132
+
133
+ match = m .find ();
134
+ assertTrue (match );
135
+ assertEquals ("number" , m .group ());
136
+
137
+ match = m .find ();
138
+ assertFalse (match );
139
+ }
140
+
97
141
private static final String testText1 = "Mellitus was the first Bishop of London, the third Archbishop of Canterbury, and a member of the Gregorian mission sent to England to convert the Anglo-Saxons. He arrived in 601 AD, and was consecrated as Bishop of London in 604." ;
98
142
public void testTokenSequenceMatcher1 () throws IOException {
99
143
CoreMap doc = createDocument (testText1 );
@@ -179,7 +223,7 @@ public void testTokenSequenceMatcher1() throws IOException {
179
223
match = m .find ();
180
224
assertTrue (match );
181
225
assertEquals (0 , m .groupCount ());
182
- assertEquals ("London in 604 ." , m .group ());
226
+ assertEquals ("London in 604." , m .group ());
183
227
match = m .find ();
184
228
assertFalse (match );
185
229
}
@@ -435,6 +479,31 @@ public void testTokenSequenceMatcherConj() throws IOException {
435
479
assertFalse (match );
436
480
}
437
481
482
+ public void testTokenSequenceMatcherConj2 () throws IOException {
483
+ String content = "The cat is sleeping on the floor." ;
484
+ String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))" ;
485
+
486
+ TokenizerFactory tf = PTBTokenizer .factory (new CoreLabelTokenFactory (), "" );
487
+ List <CoreLabel > tokens = tf .getTokenizer (new StringReader (content )).tokenize ();
488
+ TokenSequencePattern seqPattern = TokenSequencePattern .compile (greedyPattern );
489
+ TokenSequenceMatcher matcher = seqPattern .getMatcher (tokens );
490
+
491
+ boolean entireMatch = matcher .matches ();
492
+ assertTrue (entireMatch );
493
+
494
+ boolean match = matcher .find ();
495
+ assertTrue (match );
496
+ assertEquals ("The cat is sleeping on the floor." , matcher .group ());
497
+
498
+ String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? sleeping []*?))" ;
499
+ TokenSequencePattern seqPattern2 = TokenSequencePattern .compile (reluctantPattern );
500
+ TokenSequenceMatcher matcher2 = seqPattern2 .getMatcher (tokens );
501
+
502
+ match = matcher2 .find ();
503
+ assertTrue (match );
504
+ assertEquals ("The cat is sleeping" , matcher2 .group ());
505
+ }
506
+
438
507
public void testTokenSequenceMatcherConjAll () throws IOException {
439
508
CoreMap doc = createDocument (testText1 );
440
509
TokenSequencePattern p = TokenSequencePattern .compile (
@@ -979,7 +1048,7 @@ public void testTokenSequenceOptimizeOrString() throws IOException {
979
1048
TokenSequenceMatcher m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
980
1049
boolean match = m .find ();
981
1050
assertTrue (match );
982
- assertEquals ("atropine we need to have many many words here but we do n 't sweating" , m .group (0 ));
1051
+ assertEquals ("atropine we need to have many many words here but we don 't sweating" , m .group (0 ));
983
1052
984
1053
match = m .find ();
985
1054
assertFalse (match );
@@ -1005,7 +1074,7 @@ public void testMultiplePatterns() throws IOException {
1005
1074
CoreMap doc = createDocument ("atropine we need to have many many words here but we don't sweating" );
1006
1075
MultiPatternMatcher <CoreMap > multiPatternMatcher = TokenSequencePattern .getMultiPatternMatcher (p1 , p2 );
1007
1076
List <String > expected = new ArrayList <String >();
1008
- expected .add ("atropine we need to have many many words here but we do n 't sweating" );
1077
+ expected .add ("atropine we need to have many many words here but we don 't sweating" );
1009
1078
Iterator <String > expectedIter = expected .iterator ();
1010
1079
1011
1080
Iterable <SequenceMatchResult <CoreMap >> matches =
@@ -1187,7 +1256,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
1187
1256
match = m .find ();
1188
1257
assertTrue (match );
1189
1258
assertEquals (0 , m .groupCount ());
1190
- assertEquals ("January 3 , 2002" , m .group ());
1259
+ assertEquals ("January 3, 2002" , m .group ());
1191
1260
match = m .find ();
1192
1261
assertFalse (match );
1193
1262
@@ -1196,7 +1265,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
1196
1265
match = m .find ();
1197
1266
assertTrue (match );
1198
1267
assertEquals (0 , m .groupCount ());
1199
- assertEquals ("January 3 , 2002" , m .group ());
1268
+ assertEquals ("January 3, 2002" , m .group ());
1200
1269
match = m .find ();
1201
1270
assertFalse (match );
1202
1271
@@ -1404,6 +1473,32 @@ public void testTokenSequenceMatcherMultiNodePattern() throws IOException {
1404
1473
assertFalse (match );
1405
1474
}
1406
1475
1476
+ public void testTokenSequenceMatcherMultiNodePattern2 () throws IOException {
1477
+ CoreMap doc = createDocument ("Replace the lamp with model wss.32dc55c3e945384dbc5e533ab711fd24" );
1478
+
1479
+ // Greedy
1480
+ TokenSequencePattern p = TokenSequencePattern .compile ("/model/ ((?m){1,4}/\\ w+\\ .\\ w+/)" );
1481
+ TokenSequenceMatcher m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
1482
+ boolean match = m .find ();
1483
+ assertTrue (match );
1484
+ assertEquals (1 , m .groupCount ());
1485
+ assertEquals ("model wss.32dc55c3e945384dbc5e533ab711fd24" , m .group ());
1486
+ assertEquals ("wss.32dc55c3e945384dbc5e533ab711fd24" , m .group (1 ));
1487
+ match = m .find ();
1488
+ assertFalse (match );
1489
+
1490
+ // Reluctant
1491
+ p = TokenSequencePattern .compile ("/model/ ((?m){1,4}?/\\ w+\\ .\\ w+/)" );
1492
+ m = p .getMatcher (doc .get (CoreAnnotations .TokensAnnotation .class ));
1493
+ match = m .find ();
1494
+ assertTrue (match );
1495
+ assertEquals (1 , m .groupCount ());
1496
+ assertEquals ("model wss.32" , m .group ());
1497
+ assertEquals ("wss.32" , m .group (1 ));
1498
+ match = m .find ();
1499
+ assertFalse (match );
1500
+ }
1501
+
1407
1502
public void testTokenSequenceMatcherBackRef () throws IOException {
1408
1503
CoreMap doc = createDocument ("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A" );
1409
1504
@@ -1488,17 +1583,18 @@ public void testCompile() {
1488
1583
//assertEquals(m.group(), "matching this");
1489
1584
}
1490
1585
1491
- //This DOES NOT work right now!!
1492
- // public void testCompile2(){
1586
+ public void testBindingCompile (){
1587
+ Env env = TokenSequencePattern .getNewEnv ();
1588
+ env .bind ("wordname" ,CoreAnnotations .TextAnnotation .class );
1589
+ String s = "[wordname:\" name\" ]{1,2}" ;
1590
+ TokenSequencePattern p = TokenSequencePattern .compile (env , s );
1591
+ }
1592
+
1593
+ // // This does not work!!!
1594
+ // public void testNoBindingCompile(){
1493
1595
// Env env = TokenSequencePattern.getNewEnv();
1494
- // env.bind("wordname",CoreAnnotations.TextAnnotation.class);
1495
1596
// String s = "[" + CoreAnnotations.TextAnnotation.class.getName()+":\"name\"]{1,2}";
1496
1597
// TokenSequencePattern p = TokenSequencePattern.compile(env, s);
1497
- // for(Map.Entry<String, Object> vars: env.getVariables().entrySet()){
1498
- // if(vars.getValue().equals(CoreAnnotations.TextAnnotation.class)){
1499
- // System.out.println("Found " + vars.getKey() + " binding for " + vars.getValue());
1500
- // }
1501
- // }
1502
1598
// }
1503
1599
1504
1600
public void testCaseInsensitive1 (){
0 commit comments