
Commit af70b51

manning authored and Stanford NLP committed
Merge remote branch 'origin/master'
1 parent 90ad647 commit af70b51

82 files changed (+5713, -3461 lines)


doc/loglinear/QUICKSTART.txt (+1, -1)

@@ -2,7 +2,7 @@ loglinear package quickstart:
 
 First, read the ConcatVector section in ARCH.txt.
 
-To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.learning.CoNLLBenchmark.
+To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.CoNLLBenchmark.
 
 #####################################################
 
doc/loglinear/README.txt (+1, -1)

@@ -1,6 +1,6 @@
 For an explanation of how everything fits together, see ARCH.txt
 
-For a quick runnable object, go run edu.stanford.nlp.loglinear.learning.CoNLLBenchmark in core's test package.
+For a quick runnable object, go run edu.stanford.nlp.loglinear.CoNLLBenchmark in core's test package.
 
 For a tutorial, see QUICKSTART.txt
 
itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java (new file, +135)

@@ -0,0 +1,135 @@
+package edu.stanford.nlp.ie.qe;
+
+import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
+import edu.stanford.nlp.pipeline.*;
+import junit.framework.TestCase;
+
+import java.util.List;
+
+/**
+ * Test for quantifiable entity extractor
+ * @author Angel Chang
+ */
+public class QuantifiableEntityExtractorITest extends TestCase {
+  static AnnotationPipeline pipeline = null;
+  static QuantifiableEntityExtractor extractor = null;
+
+  public void test() throws Exception {
+    // TODO: Enable tests after rules files are added to models
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    synchronized(QuantifiableEntityExtractorITest.class) {
+      if (pipeline == null) {
+        pipeline = new AnnotationPipeline();
+        pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
+        pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
+        pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false));
+        //pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
+      }
+      extractor = new QuantifiableEntityExtractor();
+      //extractor.init(new Options());
+    }
+  }
+
+  protected static Annotation createDocument(String text) {
+    Annotation annotation = new Annotation(text);
+    pipeline.annotate(annotation);
+    return annotation;
+  }
+
+  public static class ExpectedQuantity {
+    String text;
+    String normalizedValue;
+    String type;
+
+    public ExpectedQuantity(String text, String normalizedValue, String type) {
+      this.text = text;
+      this.normalizedValue = normalizedValue;
+      this.type = type;
+    }
+  }
+
+  public void runAndCheck(String prefix, String[] sentences, ExpectedQuantity[][] expected) throws Exception {
+    for (int si = 0; si < sentences.length; si++) {
+      String sentence = sentences[si];
+      Annotation annotation = createDocument(sentence);
+      List<MatchedExpression> matchedExpressions = extractor.extract(annotation);
+
+      // Print out matched text and value
+      if (expected == null) {
+        for (int i = 0; i < matchedExpressions.size(); i++) {
+          String text = matchedExpressions.get(i).getText();
+          Object value = matchedExpressions.get(i).getValue();
+          System.out.println(prefix + ": Got expression " + text + " with value " + value);
+        }
+        assertTrue(prefix + ": No expected provided", false);
+      } else {
+        int minMatchable = Math.min(expected[si].length, matchedExpressions.size());
+        for (int i = 0; i < minMatchable; i++) {
+          ExpectedQuantity expectedQuantity = expected[si][i];
+          MatchedExpression matched = matchedExpressions.get(i);
+          SimpleQuantifiableEntity actualQuantity = (SimpleQuantifiableEntity) matched.getValue().get();
+          assertEquals(prefix + ".matched." + si + "." + i + ".text", expectedQuantity.text, matched.getText());
+          assertEquals(prefix + ".matched." + si + "." + i + ".normalizedValue", expectedQuantity.normalizedValue, actualQuantity.toString());
+          assertEquals(prefix + ".matched." + si + "." + i + ".type", expectedQuantity.type, actualQuantity.getUnit().type);
+        }
+        assertEquals(prefix + ".length." + si, expected[si].length, matchedExpressions.size());
+      }
+    }
+  }
+
+  public void _testMoney() throws Exception {
+    String[] sentences = {
+        "I have 1 dollar and 2 cents.",
+        "It cost 10 thousand million dollars."
+    };
+    // TODO: merge the 1 dollar and 2 cents
+    ExpectedQuantity[][] expected = {
+        {new ExpectedQuantity("1 dollar", "$1.00", "MONEY"), new ExpectedQuantity("2 cents", "$0.02", "MONEY")},
+        {new ExpectedQuantity("10 thousand million dollars", "$10000000000.00", "MONEY")}
+    };
+
+    runAndCheck("testMoney", sentences, expected);
+  }
+
+  public void _testLength() throws Exception {
+    String[] sentences = {
+        "We are 2 kilometer away.",
+        "We are 2 kilometers away.",
+        "We turn after 5 miles.",
+        "The box is 100 centimeters tall.",
+        "The box is 10cm wide.",
+        "The box is over 1000 mm long.",
+        "The box is 2ft long."
+    };
+    ExpectedQuantity[][] expected = {
+        {new ExpectedQuantity("2 kilometer", "2000.0m", "LENGTH")},
+        {new ExpectedQuantity("2 kilometers", "2000.0m", "LENGTH")},
+        {new ExpectedQuantity("5 miles", "5.0mi", "LENGTH")},
+        {new ExpectedQuantity("100 centimeters", "1.0m", "LENGTH")},
+        {new ExpectedQuantity("10cm", "0.1m", "LENGTH")},
+        {new ExpectedQuantity("1000 mm", "1.0m", "LENGTH")},
+        {new ExpectedQuantity("2ft", "2.0'", "LENGTH")}
+    };
+    runAndCheck("testLength", sentences, expected);
+  }
+
+  // We do weight instead of mass since in typical natural language
+  // kilograms are used to refer to weight vs mass (in scientific usage)
+  public void _testWeight() throws Exception {
+    String[] sentences = {
+        "The ball is 2 kilograms in weight.",
+        "There are five grams.",
+        "How much is seven pounds?"
+    };
+    ExpectedQuantity[][] expected = {
+        {new ExpectedQuantity("2 kilograms", "2.0kg", "WEIGHT")},
+        {new ExpectedQuantity("five grams", "0.005kg", "WEIGHT")},
+        {new ExpectedQuantity("seven pounds", "7.0lb", "WEIGHT")}
+    };
+    runAndCheck("testWeight", sentences, expected);
+  }
+
+}
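
For context, the sketch below (a hypothetical QuantifiableEntityExtractorExample, not part of this commit) strips the new integration test down to its basic call pattern, using only calls that appear in the test above. Since extractor.init(new Options()) is still commented out and the rules files are not yet in the models (see the TODO), treat it as an illustration of the intended API rather than something runnable today.

package edu.stanford.nlp.ie.qe;

import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.pipeline.*;

import java.util.List;

public class QuantifiableEntityExtractorExample {
  public static void main(String[] args) throws Exception {
    // Same pipeline the test builds: tokenize, split sentences, POS tag
    AnnotationPipeline pipeline = new AnnotationPipeline();
    pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
    pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false));

    Annotation annotation = new Annotation("The box is 100 centimeters tall.");
    pipeline.annotate(annotation);

    // Extraction step, mirroring runAndCheck() above
    QuantifiableEntityExtractor extractor = new QuantifiableEntityExtractor();
    //extractor.init(new Options());  // disabled in the test until rules files ship with the models

    List<MatchedExpression> matched = extractor.extract(annotation);
    for (MatchedExpression me : matched) {
      // Per _testLength(), "100 centimeters" should normalize to "1.0m" with type LENGTH
      System.out.println(me.getText() + " -> " + me.getValue());
    }
  }
}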

itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java (+114, -18)
@@ -1,21 +1,21 @@
 package edu.stanford.nlp.ling.tokensregex;
 
-import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.pipeline.*;
+import edu.stanford.nlp.process.CoreLabelTokenFactory;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.process.TokenizerFactory;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Pair;
-import edu.stanford.nlp.util.StringUtils;
 import edu.stanford.nlp.util.Timing;
 import junit.framework.TestCase;
 
-import java.io.File;
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 import java.util.regex.Pattern;
 
 public class TokenSequenceMatcherITest extends TestCase {

@@ -94,6 +94,50 @@ public void testTokenSequenceMatcherValue() throws IOException {
     assertFalse(match);
   }
 
+  public void testTokenSequenceMatcherBeginEnd() throws IOException {
+    CoreMap doc = createDocument(testText);
+
+    // Test simple sequence with begin sequence matching
+    TokenSequencePattern p = TokenSequencePattern.compile("^ [] []");
+    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+
+    boolean match = m.find();
+    assertTrue(match);
+    assertEquals("the number", m.group());
+
+    match = m.find();
+    assertFalse(match);
+
+    // Test simple sequence with end sequence matching
+    p = TokenSequencePattern.compile("[] [] $");
+    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+
+    match = m.find();
+    assertTrue(match);
+    assertEquals("fifty.", m.group());
+
+    match = m.find();
+    assertFalse(match);
+
+    // Test simple sequence with begin and end sequence matching
+    p = TokenSequencePattern.compile("^ [] [] $");
+    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+
+    match = m.find();
+    assertFalse(match);
+
+    // Test simple sequence with ^$ in a string regular expression
+    p = TokenSequencePattern.compile("/^number$/");
+    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+
+    match = m.find();
+    assertTrue(match);
+    assertEquals("number", m.group());
+
+    match = m.find();
+    assertFalse(match);
+  }
+
   private static final String testText1 = "Mellitus was the first Bishop of London, the third Archbishop of Canterbury, and a member of the Gregorian mission sent to England to convert the Anglo-Saxons. He arrived in 601 AD, and was consecrated as Bishop of London in 604.";
   public void testTokenSequenceMatcher1() throws IOException {
     CoreMap doc = createDocument(testText1);

@@ -179,7 +223,7 @@ public void testTokenSequenceMatcher1() throws IOException {
     match = m.find();
     assertTrue(match);
     assertEquals(0, m.groupCount());
-    assertEquals("London in 604 .", m.group());
+    assertEquals("London in 604.", m.group());
     match = m.find();
     assertFalse(match);
   }

@@ -435,6 +479,31 @@ public void testTokenSequenceMatcherConj() throws IOException {
     assertFalse(match);
   }
 
+  public void testTokenSequenceMatcherConj2() throws IOException {
+    String content = "The cat is sleeping on the floor.";
+    String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))";
+
+    TokenizerFactory tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
+    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(content)).tokenize();
+    TokenSequencePattern seqPattern = TokenSequencePattern.compile(greedyPattern);
+    TokenSequenceMatcher matcher = seqPattern.getMatcher(tokens);
+
+    boolean entireMatch = matcher.matches();
+    assertTrue(entireMatch);
+
+    boolean match = matcher.find();
+    assertTrue(match);
+    assertEquals("The cat is sleeping on the floor.", matcher.group());
+
+    String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? sleeping []*?))";
+    TokenSequencePattern seqPattern2 = TokenSequencePattern.compile(reluctantPattern);
+    TokenSequenceMatcher matcher2 = seqPattern2.getMatcher(tokens);
+
+    match = matcher2.find();
+    assertTrue(match);
+    assertEquals("The cat is sleeping", matcher2.group());
+  }
+
   public void testTokenSequenceMatcherConjAll() throws IOException {
     CoreMap doc = createDocument(testText1);
     TokenSequencePattern p = TokenSequencePattern.compile(

@@ -979,7 +1048,7 @@ public void testTokenSequenceOptimizeOrString() throws IOException {
     TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
     boolean match = m.find();
     assertTrue(match);
-    assertEquals("atropine we need to have many many words here but we do n't sweating", m.group(0));
+    assertEquals("atropine we need to have many many words here but we don't sweating", m.group(0));
 
     match = m.find();
     assertFalse(match);

@@ -1005,7 +1074,7 @@ public void testMultiplePatterns() throws IOException {
     CoreMap doc = createDocument("atropine we need to have many many words here but we don't sweating");
     MultiPatternMatcher<CoreMap> multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(p1, p2);
     List<String> expected = new ArrayList<String>();
-    expected.add("atropine we need to have many many words here but we do n't sweating");
+    expected.add("atropine we need to have many many words here but we don't sweating");
     Iterator<String> expectedIter = expected.iterator();
 
     Iterable<SequenceMatchResult<CoreMap>> matches =

@@ -1187,7 +1256,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
     match = m.find();
     assertTrue(match);
     assertEquals(0, m.groupCount());
-    assertEquals("January 3 , 2002", m.group());
+    assertEquals("January 3, 2002", m.group());
     match = m.find();
     assertFalse(match);
 

@@ -1196,7 +1265,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
     match = m.find();
     assertTrue(match);
     assertEquals(0, m.groupCount());
-    assertEquals("January 3 , 2002", m.group());
+    assertEquals("January 3, 2002", m.group());
     match = m.find();
     assertFalse(match);
 

@@ -1404,6 +1473,32 @@ public void testTokenSequenceMatcherMultiNodePattern() throws IOException {
     assertFalse(match);
   }
 
+  public void testTokenSequenceMatcherMultiNodePattern2() throws IOException {
+    CoreMap doc = createDocument("Replace the lamp with model wss.32dc55c3e945384dbc5e533ab711fd24");
+
+    // Greedy
+    TokenSequencePattern p = TokenSequencePattern.compile("/model/ ((?m){1,4}/\\w+\\.\\w+/)");
+    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+    boolean match = m.find();
+    assertTrue(match);
+    assertEquals(1, m.groupCount());
+    assertEquals("model wss.32dc55c3e945384dbc5e533ab711fd24", m.group());
+    assertEquals("wss.32dc55c3e945384dbc5e533ab711fd24", m.group(1));
+    match = m.find();
+    assertFalse(match);
+
+    // Reluctant
+    p = TokenSequencePattern.compile("/model/ ((?m){1,4}?/\\w+\\.\\w+/)");
+    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
+    match = m.find();
+    assertTrue(match);
+    assertEquals(1, m.groupCount());
+    assertEquals("model wss.32", m.group());
+    assertEquals("wss.32", m.group(1));
+    match = m.find();
+    assertFalse(match);
+  }
+
   public void testTokenSequenceMatcherBackRef() throws IOException {
     CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");
 

@@ -1488,17 +1583,18 @@ public void testCompile() {
     //assertEquals(m.group(), "matching this");
   }
 
-  //This DOES NOT work right now!!
-  // public void testCompile2(){
+  public void testBindingCompile(){
+    Env env = TokenSequencePattern.getNewEnv();
+    env.bind("wordname",CoreAnnotations.TextAnnotation.class);
+    String s = "[wordname:\"name\"]{1,2}";
+    TokenSequencePattern p = TokenSequencePattern.compile(env, s);
+  }
+
+  // // This does not work!!!
+  // public void testNoBindingCompile(){
   // Env env = TokenSequencePattern.getNewEnv();
-  // env.bind("wordname",CoreAnnotations.TextAnnotation.class);
   // String s = "[" + CoreAnnotations.TextAnnotation.class.getName()+":\"name\"]{1,2}";
   // TokenSequencePattern p = TokenSequencePattern.compile(env, s);
-  // for(Map.Entry<String, Object> vars: env.getVariables().entrySet()){
-  // if(vars.getValue().equals(CoreAnnotations.TextAnnotation.class)){
-  // System.out.println("Found " + vars.getKey() + " binding for " + vars.getValue());
-  // }
-  // }
   // }
 
   public void testCaseInsensitive1(){
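
For context on the new ^ and $ sequence anchors exercised by testTokenSequenceMatcherBeginEnd above: ^ pins a match to the start of the token sequence, $ to the end, and each [] matches any single token. The sketch below (a hypothetical BeginEndAnchorExample, not part of this commit) shows the behavior using the same tokenization route as testTokenSequenceMatcherConj2; the sentence and printed results are stand-ins, not test data.

package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

import java.io.StringReader;
import java.util.List;

public class BeginEndAnchorExample {
  public static void main(String[] args) {
    TokenizerFactory tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens =
        tf.getTokenizer(new StringReader("The cat is sleeping on the floor.")).tokenize();

    // "^ [] []" anchors at the beginning of the token sequence: the first two tokens
    TokenSequencePattern begin = TokenSequencePattern.compile("^ [] []");
    TokenSequenceMatcher m = begin.getMatcher(tokens);
    if (m.find()) {
      System.out.println(m.group());  // "The cat"
    }

    // "[] [] $" anchors at the end of the token sequence: the last two tokens
    TokenSequencePattern end = TokenSequencePattern.compile("[] [] $");
    m = end.getMatcher(tokens);
    if (m.find()) {
      System.out.println(m.group());  // "floor."
    }
  }
}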

test/src/edu/stanford/nlp/loglinear/learning/CoNLLBenchmark.java → itest/src/edu/stanford/nlp/loglinear/CoNLLBenchmark.java (renamed, +4, -1)
@@ -1,6 +1,9 @@
-package edu.stanford.nlp.loglinear.learning;
+package edu.stanford.nlp.loglinear;
 
 import edu.stanford.nlp.loglinear.inference.CliqueTree;
+import edu.stanford.nlp.loglinear.learning.AbstractBatchOptimizer;
+import edu.stanford.nlp.loglinear.learning.BacktrackingAdaGradOptimizer;
+import edu.stanford.nlp.loglinear.learning.LogLikelihoodFunction;
 import edu.stanford.nlp.loglinear.model.ConcatVector;
 import edu.stanford.nlp.loglinear.model.GraphicalModel;
 import edu.stanford.nlp.util.HashIndex;

itest/src/edu/stanford/nlp/parser/nndep/DependencyParserITest.java (+2, -2)
@@ -46,15 +46,15 @@ public void testDependencyParserEnglishSD() {
   }
 
   // Lower because we're evaluating on PTB + extraDevTest, not just PTB
-  private static final double EnglishUdLas = 84.9873;
+  private static final double EnglishUdLas = 88.72648417258083;
 
   /**
    * Test that the NN dependency parser performance doesn't change.
    */
   public void testDependencyParserEnglishUD() {
     DependencyParser parser = new DependencyParser();
     parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2015-04-16/english_UD.gz");
-    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/USD/dev.conll", null);
+    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/UD-converted/dev.conll", null);
     assertEquals(String.format("English UD LAS should be %.2f but was %.2f",
         EnglishUdLas, las), EnglishUdLas, las, 1e-4);
   }
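
For context, the constant change above retunes the regression threshold to the UD-converted dev set. Reduced to the two calls the test makes, the evaluation looks like the sketch below (a hypothetical DepParserEvalExample, not part of this commit); the model and treebank paths are Stanford-internal, so substitute local files to reproduce a LAS figure.

package edu.stanford.nlp.parser.nndep;

public class DepParserEvalExample {
  public static void main(String[] args) {
    DependencyParser parser = new DependencyParser();
    // Stanford-internal paths, copied from the test; replace with local files to run
    parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2015-04-16/english_UD.gz");
    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/UD-converted/dev.conll", null);
    System.out.printf("English UD LAS: %.4f%n", las);  // the test now pins this at 88.72648417258083
  }
}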
