
Commit 945b276

Merge remote-tracking branch 'refs/remotes/origin/dev' into dev
2 parents: 974383a + 9889f4e

26 files changed (+558 / -151 lines)

README.md

Lines changed: 10 additions & 10 deletions
@@ -1,6 +1,6 @@
 # Stanford CoreNLP
 
-![Build Status](https://github.com/stanfordnlp/CoreNLP/actions/workflows/run-tests.yaml/badge.svg)
+[![Run Tests](https://github.com/stanfordnlp/CoreNLP/actions/workflows/run-tests.yaml/badge.svg)](https://github.com/stanfordnlp/CoreNLP/actions/workflows/run-tests.yaml)
 [![Maven Central](https://img.shields.io/maven-central/v/edu.stanford.nlp/stanford-corenlp.svg)](https://mvnrepository.com/artifact/edu.stanford.nlp/stanford-corenlp)
 [![Twitter](https://img.shields.io/twitter/follow/stanfordnlp.svg?style=social&label=Follow)](https://twitter.com/stanfordnlp/)

@@ -66,15 +66,15 @@ The jars can be directly downloaded from the links below or the Hugging Face Hub
 
 | Language | Model Jar | Last Updated |
 | --- | --- | --- |
-| Arabic | [download](https://nlp.stanford.edu/software/stanford-arabic-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-arabic/tree/main) | 4.4.0 |
-| Chinese | [download](https://nlp.stanford.edu/software/stanford-chinese-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-chinese/tree/main)| 4.4.0 |
-| English (extra) | [download](https://nlp.stanford.edu/software/stanford-english-extra-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-english-extra/tree/main) | 4.4.0 |
-| English (KBP) | [download](https://nlp.stanford.edu/software/stanford-english-kbp-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-english-kbp/tree/main) | 4.4.0 |
-| French | [download](https://nlp.stanford.edu/software/stanford-french-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-french/tree/main) | 4.4.0 |
-| German | [download](https://nlp.stanford.edu/software/stanford-german-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-german/tree/main) | 4.4.0 |
-| Hungarian | [download](https://nlp.stanford.edu/software/stanford-hungarian-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-hungarian/tree/main) | 4.4.0 |
-| Italian | [download](https://nlp.stanford.edu/software/stanford-italian-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-italian/tree/main)| 4.4.0 |
-| Spanish | [download](https://nlp.stanford.edu/software/stanford-spanish-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-spanish/tree/main)| 4.4.0 |
+| Arabic | [download](https://nlp.stanford.edu/software/stanford-arabic-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-arabic/tree/main) | 4.5.0 |
+| Chinese | [download](https://nlp.stanford.edu/software/stanford-chinese-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-chinese/tree/main)| 4.5.0 |
+| English (extra) | [download](https://nlp.stanford.edu/software/stanford-english-extra-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-english-extra/tree/main) | 4.5.0 |
+| English (KBP) | [download](https://nlp.stanford.edu/software/stanford-english-kbp-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-english-kbp/tree/main) | 4.5.0 |
+| French | [download](https://nlp.stanford.edu/software/stanford-french-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-french/tree/main) | 4.5.0 |
+| German | [download](https://nlp.stanford.edu/software/stanford-german-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-german/tree/main) | 4.5.0 |
+| Hungarian | [download](https://nlp.stanford.edu/software/stanford-hungarian-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-hungarian/tree/main) | 4.5.0 |
+| Italian | [download](https://nlp.stanford.edu/software/stanford-italian-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-italian/tree/main)| 4.5.0 |
+| Spanish | [download](https://nlp.stanford.edu/software/stanford-spanish-corenlp-models-current.jar) [(HF Hub)](https://huggingface.co/stanfordnlp/corenlp-spanish/tree/main)| 4.5.0 |
 
 Thank you to [Hugging Face](https://huggingface.co/) for helping with our hosting!

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ sourceCompatibility = 1.8
 targetCompatibility = 1.8
 compileJava.options.encoding = 'UTF-8'
 
-version = '4.4.0'
+version = '4.5.0'
 
 // Gradle application plugin
 mainClassName = "edu.stanford.nlp.pipeline.StanfordCoreNLP"

doc/corenlp/README.txt

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ LICENSE
 CHANGES
 ---------------------------------
 
+2022-07-21    4.5.0    Tokenizer and lemmatizer upgrades, along with
+                       a new tsurgeon operation and some bugfixes
+
 2022-01-20    4.4.0    Fix issue with Italian depparse, tsurgeon CLI,
                        fix security issues, bug fixes
 
doc/corenlp/pom-full.xml

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 <modelVersion>4.0.0</modelVersion>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <packaging>jar</packaging>
 <name>Stanford CoreNLP</name>
 <description>Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.</description>
@@ -14,8 +14,8 @@
 </license>
 </licenses>
 <scm>
-<url>https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip</url>
-<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip</connection>
+<url>https://nlp.stanford.edu/software/stanford-corenlp-4.5.0.zip</url>
+<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.5.0.zip</connection>
 </scm>
 <developers>
 <developer>
@@ -202,7 +202,7 @@
 <configuration>
 <artifacts>
 <artifact>
-<file>${project.basedir}/stanford-corenlp-4.4.0-models.jar</file>
+<file>${project.basedir}/stanford-corenlp-4.5.0-models.jar</file>
 <type>jar</type>
 <classifier>models</classifier>
 </artifact>

doc/corenlp/pom-light.xml

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 <modelVersion>4.0.0</modelVersion>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <packaging>jar</packaging>
 <name>Stanford CoreNLP</name>
 <description>Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.</description>
@@ -14,8 +14,8 @@
 </license>
 </licenses>
 <scm>
-<url>https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip</url>
-<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip</connection>
+<url>https://nlp.stanford.edu/software/stanford-corenlp-4.5.0.zip</url>
+<connection>https://nlp.stanford.edu/software/stanford-corenlp-4.5.0.zip</connection>
 </scm>
 <developers>
 <developer>
@@ -56,7 +56,7 @@
 <configuration>
 <artifacts>
 <artifact>
-<file>${project.basedir}/stanford-corenlp-4.4.0-models.jar</file>
+<file>${project.basedir}/stanford-corenlp-4.5.0-models.jar</file>
 <type>jar</type>
 <classifier>models</classifier>
 </artifact>

examples/sample-maven-project/pom.xml

Lines changed: 11 additions & 11 deletions
@@ -17,66 +17,66 @@
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>javadoc</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>sources</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-arabic</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-chinese</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-english</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-english-kbp</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-french</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-german</classifier>
 </dependency>
 <dependency>
 <groupId>edu.stanford.nlp</groupId>
 <artifactId>stanford-corenlp</artifactId>
-<version>4.4.0</version>
+<version>4.5.0</version>
 <classifier>models-spanish</classifier>
 </dependency>
 </dependencies>

itest/src/edu/stanford/nlp/pipeline/RequirementsCorrectSlowITest.java

Lines changed: 7 additions & 7 deletions
@@ -119,27 +119,27 @@ private void testAnnotatorSequence(List<String> annotators) {
 
 @Test
 public void testDefaultPipeline() {
-  testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "lemma", "ner", "gender", "parse", "coref"));
+  testAnnotatorSequence(Arrays.asList("tokenize", "pos", "lemma", "ner", "gender", "parse", "coref"));
 }
 
 @Test
 public void testDepparsePipeline() {
-  testAnnotatorSequence(Arrays.asList("tokenize", "ssplit", "pos", "depparse"));
+  testAnnotatorSequence(Arrays.asList("tokenize", "pos", "depparse"));
 }
 
 @Test
 public void testQuotePipeline() {
-  testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","ner","depparse","coref","quote"));
+  testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","ner","depparse","coref","quote"));
 }
 
-@Test
-public void testTrueCasePipeline() {
-  testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","truecase"));
+@Test
+public void testTrueCasePipeline() {
+  testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","truecase"));
 }
 
 @Test
 public void testOpenIEPipeline() {
-  testAnnotatorSequence(Arrays.asList("tokenize","ssplit","pos","lemma","depparse","natlog","openie"));
+  testAnnotatorSequence(Arrays.asList("tokenize","pos","lemma","depparse","natlog","openie"));
 }
 
 @Test
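Note on the test change above: every annotator list drops the explicit "ssplit" step, which suggests that in 4.5.0 sentence splitting is handled as part of the tokenize annotator. Below is a minimal sketch of a pipeline configured the same way; the class name and sample text are illustrative, not taken from this commit, and configurations that still list ssplit explicitly are presumably unaffected.

import java.util.Properties;
import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.CoreSentence;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class NoSsplitExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // same style of annotator list as the updated tests: no explicit "ssplit"
    props.setProperty("annotators", "tokenize,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    CoreDocument doc = new CoreDocument("Stanford is in California. It has an NLP group.");
    pipeline.annotate(doc);

    // sentences should still come back even though ssplit was not listed
    for (CoreSentence sentence : doc.sentences()) {
      System.out.println(sentence.text() + " -> " + sentence.posTags());
    }
  }
}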

itest/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServerITest.java

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ public void testSemgrexJson() throws IOException {
 
 @Test
 public void testSemgrexAnnotation() throws IOException {
-  String expected = "result { result { match { matchIndex: 3 node { name: \"verb\" matchIndex: 3 } node { name: \"obj\" matchIndex: 5 } } }}".replaceAll(" ", "");
+  String expected = "result { result { match { matchIndex: 3 node { name: \"verb\" matchIndex: 3 } node { name: \"obj\" matchIndex: 5 } graphIndex:0 semgrexIndex:0 } }}".replaceAll(" ", "");
   String query = "The dog ate a fish";
   byte[] message = query.getBytes("utf-8");
   Properties props = new Properties();

src/edu/stanford/nlp/parser/lexparser/BaseLexicon.java

Lines changed: 5 additions & 0 deletions
@@ -63,6 +63,11 @@ public class BaseLexicon implements Lexicon {
 protected static final IntTaggedWord NULL_ITW = new IntTaggedWord(nullWord, nullTag);
 
 protected final TrainOptions trainOptions;
+// TODO: remove this link
+// the only reason it is needed is because testOptions has an item,
+// unseenSmooth, which belongs in trainOptions
+// the problem is moving that and/or removing this link will invalidate
+// all existing serialized models
 protected final TestOptions testOptions;
 
 protected final Options op;

src/edu/stanford/nlp/pipeline/CoreNLP.proto

Lines changed: 14 additions & 3 deletions
@@ -627,6 +627,10 @@ message SemgrexRequest {
 // If you pass in M semgrex expressions and N dependency graphs,
 // this returns MxN nested results. Each SemgrexResult can match
 // multiple times in one graph
+//
+// You may want to send multiple semgrexes per query because
+// translating large numbers of dependency graphs to protobufs
+// will be expensive, so doing several queries at once will save time
 message SemgrexResponse {
 message NamedNode {
 required string name = 1;
@@ -639,9 +643,16 @@ message SemgrexResponse {
 }
 
 message Match {
-  required int32 matchIndex = 1;
-  repeated NamedNode node = 2;
-  repeated NamedRelation reln = 3;
+  required int32 matchIndex = 1;
+  repeated NamedNode node = 2;
+  repeated NamedRelation reln = 3;
+  // when processing multiple dependency graphs at once,
+  // which dependency graph this applies to
+  // indexed from 0
+  optional int32 graphIndex = 4;
+  // index of the semgrex expression this match applies to
+  // indexed from 0
+  optional int32 semgrexIndex = 5;
 }
 
 message SemgrexResult {
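The new graphIndex and semgrexIndex fields let a client line up each Match with the dependency graph and the semgrex pattern it came from when several of each are sent in one request. Below is a minimal client-side sketch; it assumes the generated protobuf classes in edu.stanford.nlp.pipeline.CoreNLPProtos follow the nesting shown in the server test above (response -> GraphResult -> SemgrexResult -> Match), and the reader class and printed format are illustrative only.

import edu.stanford.nlp.pipeline.CoreNLPProtos;

public class SemgrexResponseReader {
  // walk an already-parsed SemgrexResponse and report which graph and
  // which semgrex pattern each match belongs to
  public static void printMatches(CoreNLPProtos.SemgrexResponse response) {
    for (CoreNLPProtos.SemgrexResponse.GraphResult graphResult : response.getResultList()) {
      for (CoreNLPProtos.SemgrexResponse.SemgrexResult semgrexResult : graphResult.getResultList()) {
        for (CoreNLPProtos.SemgrexResponse.Match match : semgrexResult.getMatchList()) {
          // graphIndex / semgrexIndex are the optional fields added in this commit,
          // both indexed from 0
          System.out.printf("graph %d, pattern %d, match root at token %d%n",
              match.getGraphIndex(), match.getSemgrexIndex(), match.getMatchIndex());
        }
      }
    }
  }
}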
