From edb19870bce1489ce7b8315b9827253041101006 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 09:19:08 +0900 Subject: [PATCH 1/4] Fix #1244 --- .../grobid/core/document/TEIFormatter.java | 18 ++++++---- .../core/document/TEIFormatterTest.java | 34 ++++++++++++++++++- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index bd791603b8..8f0a04e285 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -1731,7 +1731,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer, } else { throw new IllegalStateException("Unsupported marker type: " + clusterLabel); } - + if (refNodes != null) { boolean footNoteCallout = false; @@ -2293,9 +2293,9 @@ public List markReferencesFigureTEI(String refText, // second pass with relaxed figure marker matching for(int i=figures.size()-1; i>=0; i--) { Figure figure = figures.get(i); - if ((figure.getLabel() != null) && (figure.getLabel().length() > 0)) { + if (StringUtils.isNotBlank(figure.getLabel())) { String label = TextUtilities.cleanField(figure.getLabel(), false); - if (label != null && (label.length() > 0) && + if (StringUtils.isNotBlank(label) && (textLow.contains(label.toLowerCase()))) { bestFigure = figure.getId(); break; @@ -2313,13 +2313,17 @@ public List markReferencesFigureTEI(String refText, String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { - // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk - if (text.endsWith("and")) { + if (text.equals("and") || text.equals("&")) { + nodes.add(new Text(text)); + if (spaceEnd) + nodes.add(new Text(" ")); + continue; + } else if (text.endsWith("and")) { + // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk text = text.substring(0, text.length()-3); andWordString = "and"; refTokens = refTokens.subList(0,refTokens.size()-1); - } - else if (text.endsWith("&")) { + } else if (text.endsWith("&")) { text = text.substring(0, text.length()-1); andWordString = "&"; refTokens = refTokens.subList(0,refTokens.size()-1); diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 6fb7f3f153..033dbfe990 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -1,7 +1,9 @@ package org.grobid.core.document; import nu.xom.Element; +import nu.xom.Node; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.grobid.core.data.Figure; import org.grobid.core.data.Note; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.GrobidProperties; @@ -10,9 +12,11 @@ import org.junit.Test; import java.util.List; +import java.util.stream.Collectors; import static org.hamcrest.CoreMatchers.is; -import static org.hamcrest.Matchers.*; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.hasSize; import static org.junit.Assert.assertThat; public class TEIFormatterTest { @@ -73,5 +77,33 @@ public void testGenerateURLRef() throws Exception { assertThat(node.toXML(), is("http:// github.com/ lfoppiano/ grobid-bla")); } + @Test + public void testMarkReferencesFigureTEI() throws Exception { + String input = "and 3D"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Figure f1 = new Figure(); + f1.setLabel(new StringBuilder("1")); + Figure f2 = new Figure(); + f1.setLabel(new StringBuilder("2")); + Figure f3 = new Figure(); + f1.setLabel(new StringBuilder("")); + + List
figures = List.of(f1, f2, f3); + + + List nodes = new TEIFormatter(null, null) + .markReferencesFigureTEI(input, tokensWithOffset, figures, false); + + assertThat(nodes, hasSize(2)); + assertThat(nodes.get(0).toXML(), is("and")); + assertThat(((Element)nodes.get(1)).toXML(), is("3D")); + } + } \ No newline at end of file From 403d035818d59537c20877d917ca30f2d50e4559 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 09:46:45 +0900 Subject: [PATCH 2/4] Fix #1233 --- .../grobid/core/document/TEIFormatter.java | 7 +++++ .../core/document/TEIFormatterTest.java | 30 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 8f0a04e285..226dc5d1be 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2311,6 +2311,13 @@ public List markReferencesFigureTEI(String refText, spaceEnd = true; text = text.trim(); + if (StringUtils.isBlank(text)) { + nodes.add(new Text(text)); + if (spaceEnd) + nodes.add(new Text(" ")); + continue; + } + String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { if (text.equals("and") || text.equals("&")) { diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 033dbfe990..93aae2ab18 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -105,5 +105,35 @@ public void testMarkReferencesFigureTEI() throws Exception { assertThat(((Element)nodes.get(1)).toXML(), is("3D")); } + @Test + public void testMarkReferencesFigureTEI_() throws Exception { + String input = "5, & "; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Figure f1 = new Figure(); + f1.setLabel(new StringBuilder("1")); + Figure f2 = new Figure(); + f1.setLabel(new StringBuilder("2")); + Figure f3 = new Figure(); + f1.setLabel(new StringBuilder("")); + + List
figures = List.of(f1, f2, f3); + + + List nodes = new TEIFormatter(null, null) + .markReferencesFigureTEI(input, tokensWithOffset, figures, false); + + assertThat(nodes, hasSize(4)); + assertThat(((Element)nodes.get(0)).toXML(), is("5,")); + assertThat(nodes.get(1).toXML(), is(" &")); + assertThat(nodes.get(2).toXML(), is("")); + assertThat(nodes.get(3).toXML(), is(" ")); + } + } \ No newline at end of file From bbebff92c270b93ab2b8487b204731261c64b597 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 10:39:55 +0900 Subject: [PATCH 3/4] Add more tests and fix spaces being added --- .../grobid/core/document/TEIFormatter.java | 25 +++++++++++-- .../core/document/TEIFormatterTest.java | 37 +++++++++++++++++-- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 226dc5d1be..e425f76ae3 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2306,24 +2306,37 @@ public List markReferencesFigureTEI(String refText, } boolean spaceEnd = false; + boolean spaceStart = false; text = text.replace("\n", " "); - if (text.endsWith(" ")) + if (text.endsWith(" ")) { spaceEnd = true; + } + if (!text.equals(" ") & text.startsWith(" ")) { + spaceStart = true; + } text = text.trim(); if (StringUtils.isBlank(text)) { + if (spaceStart) { + nodes.add(new Text(" ")); + } nodes.add(new Text(text)); - if (spaceEnd) + if (spaceEnd) { nodes.add(new Text(" ")); + } continue; } String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { if (text.equals("and") || text.equals("&")) { + if (spaceStart) { + nodes.add(new Text(" ")); + } nodes.add(new Text(text)); - if (spaceEnd) + if (spaceEnd) { nodes.add(new Text(" ")); + } continue; } else if (text.endsWith("and")) { // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk @@ -2358,14 +2371,18 @@ public List markReferencesFigureTEI(String refText, if (bestFigure != null) { ref.addAttribute(new Attribute("target", "#fig_" + bestFigure)); } + if (spaceStart) { + nodes.add(new Text(" ")); + } nodes.add(ref); if (andWordString != null) { nodes.add(new Text(andWordString)); } - if (spaceEnd) + if (spaceEnd) { nodes.add(new Text(" ")); + } } return nodes; } diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index 93aae2ab18..bb95cb3345 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -79,6 +79,36 @@ public void testGenerateURLRef() throws Exception { @Test public void testMarkReferencesFigureTEI() throws Exception { + String input = "3C and 3D"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Figure f1 = new Figure(); + f1.setLabel(new StringBuilder("1")); + Figure f2 = new Figure(); + f1.setLabel(new StringBuilder("2")); + Figure f3 = new Figure(); + f1.setLabel(new StringBuilder("")); + + List
figures = List.of(f1, f2, f3); + + + List nodes = new TEIFormatter(null, null) + .markReferencesFigureTEI(input, tokensWithOffset, figures, false); + + assertThat(nodes, hasSize(4)); + assertThat(((Element)nodes.get(0)).toXML(), is("3C")); + assertThat(nodes.get(1).toXML(), is(" and")); + assertThat(nodes.get(2).toXML(), is(" ")); + assertThat(((Element)nodes.get(3)).toXML(), is("3D")); + } + + @Test + public void testMarkReferencesFigureTEI_truncatedRef_referenceAtTheEnd() throws Exception { String input = "and 3D"; List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); @@ -100,13 +130,14 @@ public void testMarkReferencesFigureTEI() throws Exception { List nodes = new TEIFormatter(null, null) .markReferencesFigureTEI(input, tokensWithOffset, figures, false); - assertThat(nodes, hasSize(2)); + assertThat(nodes, hasSize(3)); assertThat(nodes.get(0).toXML(), is("and")); - assertThat(((Element)nodes.get(1)).toXML(), is("3D")); + assertThat(nodes.get(1).toXML(), is(" ")); + assertThat(((Element)nodes.get(2)).toXML(), is("3D")); } @Test - public void testMarkReferencesFigureTEI_() throws Exception { + public void testMarkReferencesFigureTEI_truncatedRef_referenceAtBeginning() throws Exception { String input = "5, & "; List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); From df74c762ecc5e120d242e3048eb8c8c21648a5c5 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 10:52:04 +0900 Subject: [PATCH 4/4] update table references and add more tests --- .../grobid/core/document/TEIFormatter.java | 48 ++++++-- .../core/document/TEIFormatterTest.java | 103 ++++++++++++++++-- 2 files changed, 131 insertions(+), 20 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index e425f76ae3..30cc5f4d67 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2445,10 +2445,9 @@ public List markReferencesTableTEI(String refText, List allRe // second pass with relaxed table marker matching for(int i=tables.size()-1; i>=0; i--) { Table table = tables.get(i); - if ((table.getLabel() != null) && (table.getLabel().length() > 0)) { + if (StringUtils.isNotBlank(table.getLabel())) { String label = TextUtilities.cleanField(table.getLabel(), false); - if (label != null && (label.length() > 0) && - (textLow.contains(label.toLowerCase()))) { + if (StringUtils.isNotBlank(label) && (textLow.contains(label.toLowerCase()))) { bestTable = table.getId(); break; } @@ -2458,20 +2457,44 @@ public List markReferencesTableTEI(String refText, List allRe } boolean spaceEnd = false; + boolean spaceStart = false; text = text.replace("\n", " "); - if (text.endsWith(" ")) + if (text.endsWith(" ")) { spaceEnd = true; + } + if (!text.equals(" ") & text.startsWith(" ")) { + spaceStart = true; + } text = text.trim(); + if (StringUtils.isBlank(text)) { + if (spaceStart) { + nodes.add(new Text(" ")); + } + nodes.add(new Text(text)); + if (spaceEnd) { + nodes.add(new Text(" ")); + } + continue; + } + String andWordString = null; if (text.endsWith("and") || text.endsWith("&")) { - // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk - if (text.endsWith("and")) { + if (text.equals("and") || text.equals("&")) { + if (spaceStart) { + nodes.add(new Text(" ")); + } + nodes.add(new Text(text)); + if (spaceEnd) { + nodes.add(new Text(" ")); + } + continue; + } else if (text.endsWith("and")) { + // the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk text = text.substring(0, text.length()-3); andWordString = "and"; refTokens = refTokens.subList(0,refTokens.size()-1); - } - else if (text.endsWith("&")) { + } else if (text.endsWith("&")) { text = text.substring(0, text.length()-1); andWordString = "&"; refTokens = refTokens.subList(0,refTokens.size()-1); @@ -2495,17 +2518,22 @@ else if (text.endsWith("&")) { ref.addAttribute(new Attribute("coords", coords)); } ref.appendChild(text); + if (bestTable != null) { ref.addAttribute(new Attribute("target", "#tab_" + bestTable)); } + if (spaceStart) { + nodes.add(new Text(" ")); + } nodes.add(ref); if (andWordString != null) { nodes.add(new Text(andWordString)); } - - if (spaceEnd) + + if (spaceEnd) { nodes.add(new Text(" ")); + } } return nodes; } diff --git a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java index bb95cb3345..efac6a9c6e 100644 --- a/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java +++ b/grobid-core/src/test/java/org/grobid/core/document/TEIFormatterTest.java @@ -5,6 +5,7 @@ import org.grobid.core.analyzers.GrobidAnalyzer; import org.grobid.core.data.Figure; import org.grobid.core.data.Note; +import org.grobid.core.data.Table; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.LayoutTokensUtil; @@ -90,9 +91,9 @@ public void testMarkReferencesFigureTEI() throws Exception { Figure f1 = new Figure(); f1.setLabel(new StringBuilder("1")); Figure f2 = new Figure(); - f1.setLabel(new StringBuilder("2")); + f2.setLabel(new StringBuilder("2")); Figure f3 = new Figure(); - f1.setLabel(new StringBuilder("")); + f3.setLabel(new StringBuilder("")); List
figures = List.of(f1, f2, f3); @@ -101,10 +102,10 @@ public void testMarkReferencesFigureTEI() throws Exception { .markReferencesFigureTEI(input, tokensWithOffset, figures, false); assertThat(nodes, hasSize(4)); - assertThat(((Element)nodes.get(0)).toXML(), is("3C")); + assertThat(((Element) nodes.get(0)).toXML(), is("3C")); assertThat(nodes.get(1).toXML(), is(" and")); assertThat(nodes.get(2).toXML(), is(" ")); - assertThat(((Element)nodes.get(3)).toXML(), is("3D")); + assertThat(((Element) nodes.get(3)).toXML(), is("3D")); } @Test @@ -120,9 +121,9 @@ public void testMarkReferencesFigureTEI_truncatedRef_referenceAtTheEnd() throws Figure f1 = new Figure(); f1.setLabel(new StringBuilder("1")); Figure f2 = new Figure(); - f1.setLabel(new StringBuilder("2")); + f2.setLabel(new StringBuilder("2")); Figure f3 = new Figure(); - f1.setLabel(new StringBuilder("")); + f3.setLabel(new StringBuilder("")); List
figures = List.of(f1, f2, f3); @@ -133,7 +134,7 @@ public void testMarkReferencesFigureTEI_truncatedRef_referenceAtTheEnd() throws assertThat(nodes, hasSize(3)); assertThat(nodes.get(0).toXML(), is("and")); assertThat(nodes.get(1).toXML(), is(" ")); - assertThat(((Element)nodes.get(2)).toXML(), is("3D")); + assertThat(((Element) nodes.get(2)).toXML(), is("3D")); } @Test @@ -149,9 +150,9 @@ public void testMarkReferencesFigureTEI_truncatedRef_referenceAtBeginning() thro Figure f1 = new Figure(); f1.setLabel(new StringBuilder("1")); Figure f2 = new Figure(); - f1.setLabel(new StringBuilder("2")); + f2.setLabel(new StringBuilder("2")); Figure f3 = new Figure(); - f1.setLabel(new StringBuilder("")); + f3.setLabel(new StringBuilder("")); List
figures = List.of(f1, f2, f3); @@ -160,7 +161,89 @@ public void testMarkReferencesFigureTEI_truncatedRef_referenceAtBeginning() thro .markReferencesFigureTEI(input, tokensWithOffset, figures, false); assertThat(nodes, hasSize(4)); - assertThat(((Element)nodes.get(0)).toXML(), is("5,")); + assertThat(((Element) nodes.get(0)).toXML(), is("5,")); + assertThat(nodes.get(1).toXML(), is(" &")); + assertThat(nodes.get(2).toXML(), is("")); + assertThat(nodes.get(3).toXML(), is(" ")); + } + + @Test + public void testMarkReferencesTableTEI() throws Exception { + String input = "3C and 3D"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Table t1 = new Table(); + t1.setLabel(new StringBuilder("1")); + Table t2 = new Table(); + t2.setLabel(new StringBuilder("2")); + Table t3 = new Table(); + t3.setLabel(new StringBuilder("")); + + List tables = List.of(t1, t2, t3); + + + List nodes = new TEIFormatter(null, null) + .markReferencesTableTEI(input, tokensWithOffset, tables, false); + assertThat(nodes, hasSize(4)); + assertThat(((Element) nodes.get(0)).toXML(), is("3C")); + assertThat(nodes.get(1).toXML(), is(" and")); + assertThat(nodes.get(2).toXML(), is(" ")); + assertThat(((Element) nodes.get(3)).toXML(), is("3D")); + } + + @Test + public void testMarkReferencesTableTEI_truncatedRef_referenceAtTheEnd() throws Exception { + String input = "and 3D"; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Table t1 = new Table(); + t1.setLabel(new StringBuilder("1")); + Table t2 = new Table(); + t2.setLabel(new StringBuilder("2")); + Table t3 = new Table(); + t3.setLabel(new StringBuilder("")); + + List
tables = List.of(t1, t2, t3); + + List nodes = new TEIFormatter(null, null) + .markReferencesTableTEI(input, tokensWithOffset, tables, false); + assertThat(nodes, hasSize(3)); + assertThat(nodes.get(0).toXML(), is("and")); + assertThat(nodes.get(1).toXML(), is(" ")); + assertThat(((Element) nodes.get(2)).toXML(), is("3D")); + } + + @Test + public void testMarkReferencesTableTEI_truncatedRef_referenceAtBeginning() throws Exception { + String input = "5, & "; + List tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + List tokensWithOffset = tokens.stream() + .peek(t -> t.setOffset(t.getOffset() + 51393)) + .collect(Collectors.toList()); + + Table t1 = new Table(); + t1.setLabel(new StringBuilder("1")); + Table t2 = new Table(); + t2.setLabel(new StringBuilder("2")); + Table t3 = new Table(); + t3.setLabel(new StringBuilder("")); + + List
tables = List.of(t1, t2, t3); + + List nodes = new TEIFormatter(null, null) + .markReferencesTableTEI(input, tokensWithOffset, tables, false); + + assertThat(nodes, hasSize(4)); + assertThat(((Element) nodes.get(0)).toXML(), is("5,")); assertThat(nodes.get(1).toXML(), is(" &")); assertThat(nodes.get(2).toXML(), is("")); assertThat(nodes.get(3).toXML(), is(" "));