Skip to content

Commit

Permalink
Merge pull request kermitt2#1246 from kermitt2/bugfix/avoid-empty-fig-ref-markers
Browse files Browse the repository at this point in the history

Avoid empty figures/tables reference markers
  • Loading branch information
lfoppiano authored Feb 16, 2025
2 parents 311bcde + df74c76 commit 0719ed6
Show file tree
Hide file tree
Showing 2 changed files with 252 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1817,7 +1817,7 @@ public StringBuilder toTEITextPiece(
} else {
throw new IllegalStateException("Unsupported marker type: " + clusterLabel);
}

if (refNodes != null) {
boolean footNoteCallout = false;

Expand Down Expand Up @@ -2379,9 +2379,9 @@ public List<Node> markReferencesFigureTEI(String refText,
// second pass with relaxed figure marker matching
for(int i=figures.size()-1; i>=0; i--) {
Figure figure = figures.get(i);
if ((figure.getLabel() != null) && (figure.getLabel().length() > 0)) {
if (StringUtils.isNotBlank(figure.getLabel())) {
String label = TextUtilities.cleanField(figure.getLabel(), false);
if (label != null && (label.length() > 0) &&
if (StringUtils.isNotBlank(label) &&
(textLow.contains(label.toLowerCase()))) {
bestFigure = figure.getId();
break;
Expand All @@ -2392,20 +2392,44 @@ public List<Node> markReferencesFigureTEI(String refText,
}

boolean spaceEnd = false;
boolean spaceStart = false;
text = text.replace("\n", " ");
if (text.endsWith(" "))
if (text.endsWith(" ")) {
spaceEnd = true;
}
if (!text.equals(" ") & text.startsWith(" ")) {
spaceStart = true;
}
text = text.trim();

if (StringUtils.isBlank(text)) {
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(new Text(text));
if (spaceEnd) {
nodes.add(new Text(" "));
}
continue;
}

String andWordString = null;
if (text.endsWith("and") || text.endsWith("&")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
if (text.endsWith("and")) {
if (text.equals("and") || text.equals("&")) {
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(new Text(text));
if (spaceEnd) {
nodes.add(new Text(" "));
}
continue;
} else if (text.endsWith("and")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
text = text.substring(0, text.length()-3);
andWordString = "and";
refTokens = refTokens.subList(0,refTokens.size()-1);
}
else if (text.endsWith("&")) {
} else if (text.endsWith("&")) {
text = text.substring(0, text.length()-1);
andWordString = "&";
refTokens = refTokens.subList(0,refTokens.size()-1);
Expand Down Expand Up @@ -2433,14 +2457,18 @@ else if (text.endsWith("&")) {
if (bestFigure != null) {
ref.addAttribute(new Attribute("target", "#fig_" + bestFigure));
}
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(ref);

if (andWordString != null) {
nodes.add(new Text(andWordString));
}

if (spaceEnd)
if (spaceEnd) {
nodes.add(new Text(" "));
}
}
return nodes;
}
Expand Down Expand Up @@ -2503,10 +2531,9 @@ public List<Node> markReferencesTableTEI(String refText, List<LayoutToken> allRe
// second pass with relaxed table marker matching
for(int i=tables.size()-1; i>=0; i--) {
Table table = tables.get(i);
if ((table.getLabel() != null) && (table.getLabel().length() > 0)) {
if (StringUtils.isNotBlank(table.getLabel())) {
String label = TextUtilities.cleanField(table.getLabel(), false);
if (label != null && (label.length() > 0) &&
(textLow.contains(label.toLowerCase()))) {
if (StringUtils.isNotBlank(label) && (textLow.contains(label.toLowerCase()))) {
bestTable = table.getId();
break;
}
Expand All @@ -2516,20 +2543,44 @@ public List<Node> markReferencesTableTEI(String refText, List<LayoutToken> allRe
}

boolean spaceEnd = false;
boolean spaceStart = false;
text = text.replace("\n", " ");
if (text.endsWith(" "))
if (text.endsWith(" ")) {
spaceEnd = true;
}
if (!text.equals(" ") & text.startsWith(" ")) {
spaceStart = true;
}
text = text.trim();

if (StringUtils.isBlank(text)) {
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(new Text(text));
if (spaceEnd) {
nodes.add(new Text(" "));
}
continue;
}

String andWordString = null;
if (text.endsWith("and") || text.endsWith("&")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
if (text.endsWith("and")) {
if (text.equals("and") || text.equals("&")) {
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(new Text(text));
if (spaceEnd) {
nodes.add(new Text(" "));
}
continue;
} else if (text.endsWith("and")) {
// the AND_WORD_PATTERN case, we want to exclude the AND word from the tagged chunk
text = text.substring(0, text.length()-3);
andWordString = "and";
refTokens = refTokens.subList(0,refTokens.size()-1);
}
else if (text.endsWith("&")) {
} else if (text.endsWith("&")) {
text = text.substring(0, text.length()-1);
andWordString = "&";
refTokens = refTokens.subList(0,refTokens.size()-1);
Expand All @@ -2553,17 +2604,22 @@ else if (text.endsWith("&")) {
ref.addAttribute(new Attribute("coords", coords));
}
ref.appendChild(text);

if (bestTable != null) {
ref.addAttribute(new Attribute("target", "#tab_" + bestTable));
}
if (spaceStart) {
nodes.add(new Text(" "));
}
nodes.add(ref);

if (andWordString != null) {
nodes.add(new Text(andWordString));
}
if (spaceEnd)

if (spaceEnd) {
nodes.add(new Text(" "));
}
}
return nodes;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
package org.grobid.core.document;

import nu.xom.Element;
import nu.xom.Node;
import org.grobid.core.analyzers.GrobidAnalyzer;
import org.grobid.core.data.Figure;
import org.grobid.core.data.Note;
import org.grobid.core.data.Table;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.junit.BeforeClass;
import org.junit.Test;

import java.util.List;
import java.util.stream.Collectors;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.Matchers.*;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.hasSize;
import static org.junit.Assert.assertThat;

public class TEIFormatterTest {
Expand Down Expand Up @@ -73,5 +78,176 @@ public void testGenerateURLRef() throws Exception {
assertThat(node.toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"url\" target=\"http://github.com/lfoppiano/grobid-bla\">http:// github.com/ lfoppiano/ grobid-bla</ref>"));
}

/**
 * Verifies that {@code markReferencesFigureTEI} turns the callout "3C and 3D" into two
 * figure {@code <ref>} elements, with the conjunction and the following space kept as
 * plain text nodes between them. The asserted {@code <ref>} elements carry no
 * {@code target} attribute.
 */
@Test
public void testMarkReferencesFigureTEI() throws Exception {
    String input = "3C and 3D";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Figure f1 = new Figure();
    f1.setLabel(new StringBuilder("1"));
    Figure f2 = new Figure();
    f2.setLabel(new StringBuilder("2"));
    // Figure with an empty label, alongside the two labelled ones.
    Figure f3 = new Figure();
    f3.setLabel(new StringBuilder(""));

    List<Figure> figures = List.of(f1, f2, f3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesFigureTEI(input, tokensWithOffset, figures, false);

    assertThat(nodes, hasSize(4));
    assertThat(((Element) nodes.get(0)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"figure\">3C</ref>"));
    assertThat(nodes.get(1).toXML(), is(" and"));
    assertThat(nodes.get(2).toXML(), is(" "));
    assertThat(((Element) nodes.get(3)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"figure\">3D</ref>"));
}

/**
 * Verifies that for the truncated callout "and 3D" the leading conjunction is emitted as
 * a plain text node (not wrapped in a {@code <ref>}), followed by a space text node and a
 * single figure {@code <ref>} for "3D".
 */
@Test
public void testMarkReferencesFigureTEI_truncatedRef_referenceAtTheEnd() throws Exception {
    String input = "and 3D";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Figure f1 = new Figure();
    f1.setLabel(new StringBuilder("1"));
    Figure f2 = new Figure();
    f2.setLabel(new StringBuilder("2"));
    // Figure with an empty label, alongside the two labelled ones.
    Figure f3 = new Figure();
    f3.setLabel(new StringBuilder(""));

    List<Figure> figures = List.of(f1, f2, f3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesFigureTEI(input, tokensWithOffset, figures, false);

    assertThat(nodes, hasSize(3));
    assertThat(nodes.get(0).toXML(), is("and"));
    assertThat(nodes.get(1).toXML(), is(" "));
    assertThat(((Element) nodes.get(2)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"figure\">3D</ref>"));
}

/**
 * Verifies the output for the truncated callout "5, & ": a figure {@code <ref>} for "5,",
 * then the ampersand as escaped plain text, then an empty text node and a trailing space
 * text node. The empty text node at index 2 is part of the currently expected output.
 */
@Test
public void testMarkReferencesFigureTEI_truncatedRef_referenceAtBeginning() throws Exception {
    String input = "5, & ";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Figure f1 = new Figure();
    f1.setLabel(new StringBuilder("1"));
    Figure f2 = new Figure();
    f2.setLabel(new StringBuilder("2"));
    // Figure with an empty label, alongside the two labelled ones.
    Figure f3 = new Figure();
    f3.setLabel(new StringBuilder(""));

    List<Figure> figures = List.of(f1, f2, f3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesFigureTEI(input, tokensWithOffset, figures, false);

    assertThat(nodes, hasSize(4));
    assertThat(((Element) nodes.get(0)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"figure\">5,</ref>"));
    // toXML() escapes the ampersand of the text node.
    assertThat(nodes.get(1).toXML(), is(" &amp;"));
    assertThat(nodes.get(2).toXML(), is(""));
    assertThat(nodes.get(3).toXML(), is(" "));
}

/**
 * Verifies that {@code markReferencesTableTEI} turns the callout "3C and 3D" into two
 * table {@code <ref>} elements, with the conjunction and the following space kept as
 * plain text nodes between them. The asserted {@code <ref>} elements carry no
 * {@code target} attribute.
 */
@Test
public void testMarkReferencesTableTEI() throws Exception {
    String input = "3C and 3D";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Table t1 = new Table();
    t1.setLabel(new StringBuilder("1"));
    Table t2 = new Table();
    t2.setLabel(new StringBuilder("2"));
    // Table with an empty label, alongside the two labelled ones.
    Table t3 = new Table();
    t3.setLabel(new StringBuilder(""));

    List<Table> tables = List.of(t1, t2, t3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesTableTEI(input, tokensWithOffset, tables, false);

    assertThat(nodes, hasSize(4));
    assertThat(((Element) nodes.get(0)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"table\">3C</ref>"));
    assertThat(nodes.get(1).toXML(), is(" and"));
    assertThat(nodes.get(2).toXML(), is(" "));
    assertThat(((Element) nodes.get(3)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"table\">3D</ref>"));
}

/**
 * Verifies that for the truncated callout "and 3D" the leading conjunction is emitted as
 * a plain text node (not wrapped in a {@code <ref>}), followed by a space text node and a
 * single table {@code <ref>} for "3D".
 */
@Test
public void testMarkReferencesTableTEI_truncatedRef_referenceAtTheEnd() throws Exception {
    String input = "and 3D";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Table t1 = new Table();
    t1.setLabel(new StringBuilder("1"));
    Table t2 = new Table();
    t2.setLabel(new StringBuilder("2"));
    // Table with an empty label, alongside the two labelled ones.
    Table t3 = new Table();
    t3.setLabel(new StringBuilder(""));

    List<Table> tables = List.of(t1, t2, t3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesTableTEI(input, tokensWithOffset, tables, false);

    assertThat(nodes, hasSize(3));
    assertThat(nodes.get(0).toXML(), is("and"));
    assertThat(nodes.get(1).toXML(), is(" "));
    assertThat(((Element) nodes.get(2)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"table\">3D</ref>"));
}

/**
 * Verifies the output for the truncated callout "5, & ": a table {@code <ref>} for "5,",
 * then the ampersand as escaped plain text, then an empty text node and a trailing space
 * text node. The empty text node at index 2 is part of the currently expected output.
 */
@Test
public void testMarkReferencesTableTEI_truncatedRef_referenceAtBeginning() throws Exception {
    String input = "5, & ";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    // Non-zero shift applied to every token offset (presumably mimics a callout located
    // far into a document -- TODO confirm the value itself is arbitrary).
    final int offsetShift = 51393;
    // Explicit loop: Stream.peek is a debugging hook and must not carry required side effects.
    for (LayoutToken token : tokens) {
        token.setOffset(token.getOffset() + offsetShift);
    }
    // Pass a fresh list so the call does not depend on the mutability of the tokenizer's list.
    List<LayoutToken> tokensWithOffset = tokens.stream().collect(Collectors.toList());

    Table t1 = new Table();
    t1.setLabel(new StringBuilder("1"));
    Table t2 = new Table();
    t2.setLabel(new StringBuilder("2"));
    // Table with an empty label, alongside the two labelled ones.
    Table t3 = new Table();
    t3.setLabel(new StringBuilder(""));

    List<Table> tables = List.of(t1, t2, t3);

    List<Node> nodes = new TEIFormatter(null, null)
        .markReferencesTableTEI(input, tokensWithOffset, tables, false);

    assertThat(nodes, hasSize(4));
    assertThat(((Element) nodes.get(0)).toXML(), is("<ref xmlns=\"http://www.tei-c.org/ns/1.0\" type=\"table\">5,</ref>"));
    // toXML() escapes the ampersand of the text node.
    assertThat(nodes.get(1).toXML(), is(" &amp;"));
    assertThat(nodes.get(2).toXML(), is(""));
    assertThat(nodes.get(3).toXML(), is(" "));
}


}

0 comments on commit 0719ed6

Please sign in to comment.