From 61a1829e2af5ee17c7c9af8ffb7be36f9178a9b3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 14:37:52 +0900 Subject: [PATCH 1/3] adjust notes to avoid stack overflow --- .../java/org/grobid/core/data/BiblioItem.java | 10 ++-- .../grobid/core/engines/CitationParser.java | 46 +++++++++---------- .../org/grobid/core/engines/HeaderParser.java | 5 +- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index e602323f4e..934f3e5d16 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -1286,14 +1286,18 @@ public void setInstitution(String inst) { institution = StringUtils.normalizeSpace(inst); } - public void setNote(String not) { + public void setNoteOrConcatenateIfNotEmpty(String note) { if (StringUtils.isBlank(this.note)) { - note = StringUtils.normalizeSpace(not); + this.note = StringUtils.normalizeSpace(note); } else { - note += " " + StringUtils.normalizeSpace(not); + this.note += " " + StringUtils.normalizeSpace(note); } } + public void setNote(String not) { + note = StringUtils.normalizeSpace(not); + } + public void setAffiliation(String a) { affiliation = a; } diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java index 8ebf46c2b3..e6dda2f053 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java @@ -502,9 +502,9 @@ public BiblioItem resultExtractionLayoutTokens(String result, if (biblio.getTitle() == null) biblio.setTitle(clusterContent); else if (biblio.getTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); else { - biblio.setNote(biblio.getTitle()); + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getTitle()); biblio.setTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_AUTHOR)) { @@ -528,18 +528,18 @@ else if (biblio.getTitle().length() >= clusterContent.length()) if (biblio.getBookTitle() == null) biblio.setBookTitle(clusterContent); else if (biblio.getBookTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); else { - biblio.setNote(biblio.getBookTitle()); + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getBookTitle()); biblio.setBookTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_SERIES)) { if (biblio.getSerieTitle() == null) biblio.setSerieTitle(clusterContent); - else if (biblio.getSerieTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); - else { - biblio.setNote(biblio.getSerieTitle()); + else if (biblio.getSerieTitle().length() >= clusterContent.length()) { + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); + } else { + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getSerieTitle()); biblio.setSerieTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_PAGES)) { @@ -553,32 +553,32 @@ else if (biblio.getSerieTitle().length() >= clusterContent.length()) else biblio.setCollaboration(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_JOURNAL)) { - if (biblio.getJournal() == null) + if (biblio.getJournal() == null) { biblio.setJournal(clusterContent); - else if (biblio.getJournal().length() >= clusterContent.length()) - biblio.setNote(clusterContent); - else { - biblio.setNote(biblio.getJournal()); + }else if (biblio.getJournal().length() >= clusterContent.length()) { + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); + } else { + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getJournal()); biblio.setJournal(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_VOLUME)) { - if (biblio.getVolumeBlock() == null) - biblio.setVolumeBlock(clusterContent, volumePostProcess); + if (biblio.getVolumeBlock() == null) { + biblio.setVolumeBlock(clusterContent, volumePostProcess); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_ISSUE)) { - if (biblio.getIssue() == null) + if (biblio.getIssue() == null) { biblio.setIssue(clusterContent); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_EDITOR)) { biblio.setEditors(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_INSTITUTION)) { - if (biblio.getInstitution() != null) + if (biblio.getInstitution() != null) { biblio.setInstitution(biblio.getInstitution() + " ; " + clusterContent); - else - biblio.setInstitution(clusterContent); + } else { + biblio.setInstitution(clusterContent); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_NOTE)) { - if (biblio.getNote() != null) - biblio.setNote(biblio.getNote()+ ". " + clusterContent); - else - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_PUBNUM)) { String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens()); biblio.setPubnum(clusterNonDehypenizedContent); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 8070696561..51329ae86e 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -947,10 +947,7 @@ else if (biblio.getPublicationDate() == null) } else biblio.setInstitution(clusterContent); }*/ else if (clusterLabel.equals(TaggingLabels.HEADER_NOTE)) { - if (biblio.getNote() != null) { - biblio.setNote(biblio.getNote() + " " + clusterContent); - } else - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); } else if (clusterLabel.equals(TaggingLabels.HEADER_ABSTRACT)) { if (biblio.getAbstract() != null) { // this will need to be reviewed with more training data, for the moment From 05baeb90fb428a91894f682661159b47fec333da Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 18:28:30 +0900 Subject: [PATCH 2/3] fix table layout token missing when collecting "other" text --- .../src/main/java/org/grobid/core/engines/TableParser.java | 1 + 1 file changed, 1 insertion(+) diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java index c58b96efa4..7c655399e7 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java @@ -104,6 +104,7 @@ private List getExtractionResult(List tokenizations, String table.addLayoutTokens(tokens); } else if (clusterLabel.equals(TBL_OTHER)) { table.addDiscardedPieceTokens(cluster.concatTokens()); + table.addLayoutTokens(tokens); } else if (clusterLabel.equals(TBL_CONTENT)) { table.appendContent(clusterContent); table.getContentTokens().addAll(tokens); From acd4bd59094acd3206e3152f91320185636ed7b3 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Sun, 16 Feb 2025 20:42:31 +0900 Subject: [PATCH 3/3] Default language to english when applying sentence segmentation --- .../java/org/grobid/core/document/TEIFormatter.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 123c717c90..a585c4e6f5 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2005,8 +2005,15 @@ public void segmentIntoSentences(Element curParagraph, List curPara List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text); forbiddenPositions.addAll(offsetPositionsUrls); - List theSentences = - SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); + Language language = new Language("en"); + if (lang != null) { + language = new Language(lang); + } else { + LOGGER.warn("There wasn't enough usable text to detect the language. Defaulting to English (en) for applying sentence segmentation. "); + } + + List theSentences = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, language); /*if (theSentences.size() == 0) { // this should normally not happen, but it happens (depending on sentence splitter, usually the text