Skip to content

Commit

Permalink
Merge pull request kermitt2#1247 from kermitt2/bugfix/adjust-notes-avoid-stackoverflow
Browse files Browse the repository at this point in the history

Bugfix/adjust notes avoid stackoverflow
  • Loading branch information
lfoppiano authored Feb 16, 2025
2 parents 0719ed6 + acd4bd5 commit 32cb065
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 32 deletions.
10 changes: 7 additions & 3 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -1286,14 +1286,18 @@ public void setInstitution(String inst) {
institution = StringUtils.normalizeSpace(inst);
}

public void setNote(String not) {
public void setNoteOrConcatenateIfNotEmpty(String note) {
if (StringUtils.isBlank(this.note)) {
note = StringUtils.normalizeSpace(not);
this.note = StringUtils.normalizeSpace(note);
} else {
note += " " + StringUtils.normalizeSpace(not);
this.note += " " + StringUtils.normalizeSpace(note);
}
}

/**
 * Overwrites the note with the given text after passing it through
 * {@code StringUtils.normalizeSpace}.
 *
 * Any previously stored note is discarded; nothing is appended here.
 *
 * @param not the new note text
 */
public void setNote(String not) {
note = StringUtils.normalizeSpace(not);
}

/**
 * Sets the affiliation to the given value.
 *
 * The value is stored exactly as received; no whitespace normalization is
 * applied in this setter.
 *
 * @param a the affiliation text to store
 */
public void setAffiliation(String a) {
affiliation = a;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2005,8 +2005,15 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
Language language = new Language("en");
if (lang != null) {
language = new Language(lang);
} else {
LOGGER.warn("There wasn't enough usable text to detect the language. Defaulting to English (en) for applying sentence segmentation. ");
}

List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, language);

/*if (theSentences.size() == 0) {
// this should normally not happen, but it happens (depending on sentence splitter, usually the text
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -502,9 +502,9 @@ public BiblioItem resultExtractionLayoutTokens(String result,
if (biblio.getTitle() == null)
biblio.setTitle(clusterContent);
else if (biblio.getTitle().length() >= clusterContent.length())
biblio.setNote(clusterContent);
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
else {
biblio.setNote(biblio.getTitle());
biblio.setNoteOrConcatenateIfNotEmpty(biblio.getTitle());
biblio.setTitle(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_AUTHOR)) {
Expand All @@ -528,18 +528,18 @@ else if (biblio.getTitle().length() >= clusterContent.length())
if (biblio.getBookTitle() == null)
biblio.setBookTitle(clusterContent);
else if (biblio.getBookTitle().length() >= clusterContent.length())
biblio.setNote(clusterContent);
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
else {
biblio.setNote(biblio.getBookTitle());
biblio.setNoteOrConcatenateIfNotEmpty(biblio.getBookTitle());
biblio.setBookTitle(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_SERIES)) {
if (biblio.getSerieTitle() == null)
biblio.setSerieTitle(clusterContent);
else if (biblio.getSerieTitle().length() >= clusterContent.length())
biblio.setNote(clusterContent);
else {
biblio.setNote(biblio.getSerieTitle());
else if (biblio.getSerieTitle().length() >= clusterContent.length()) {
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
} else {
biblio.setNoteOrConcatenateIfNotEmpty(biblio.getSerieTitle());
biblio.setSerieTitle(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_PAGES)) {
Expand All @@ -553,32 +553,32 @@ else if (biblio.getSerieTitle().length() >= clusterContent.length())
else
biblio.setCollaboration(clusterContent);
} else if (clusterLabel.equals(TaggingLabels.CITATION_JOURNAL)) {
if (biblio.getJournal() == null)
if (biblio.getJournal() == null) {
biblio.setJournal(clusterContent);
else if (biblio.getJournal().length() >= clusterContent.length())
biblio.setNote(clusterContent);
else {
biblio.setNote(biblio.getJournal());
}else if (biblio.getJournal().length() >= clusterContent.length()) {
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
} else {
biblio.setNoteOrConcatenateIfNotEmpty(biblio.getJournal());
biblio.setJournal(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_VOLUME)) {
if (biblio.getVolumeBlock() == null)
biblio.setVolumeBlock(clusterContent, volumePostProcess);
if (biblio.getVolumeBlock() == null) {
biblio.setVolumeBlock(clusterContent, volumePostProcess);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_ISSUE)) {
if (biblio.getIssue() == null)
if (biblio.getIssue() == null) {
biblio.setIssue(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_EDITOR)) {
biblio.setEditors(clusterContent);
} else if (clusterLabel.equals(TaggingLabels.CITATION_INSTITUTION)) {
if (biblio.getInstitution() != null)
if (biblio.getInstitution() != null) {
biblio.setInstitution(biblio.getInstitution() + " ; " + clusterContent);
else
biblio.setInstitution(clusterContent);
} else {
biblio.setInstitution(clusterContent);
}
} else if (clusterLabel.equals(TaggingLabels.CITATION_NOTE)) {
if (biblio.getNote() != null)
biblio.setNote(biblio.getNote()+ ". " + clusterContent);
else
biblio.setNote(clusterContent);
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
} else if (clusterLabel.equals(TaggingLabels.CITATION_PUBNUM)) {
String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens());
biblio.setPubnum(clusterNonDehypenizedContent);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -947,10 +947,7 @@ else if (biblio.getPublicationDate() == null)
} else
biblio.setInstitution(clusterContent);
}*/ else if (clusterLabel.equals(TaggingLabels.HEADER_NOTE)) {
if (biblio.getNote() != null) {
biblio.setNote(biblio.getNote() + " " + clusterContent);
} else
biblio.setNote(clusterContent);
biblio.setNoteOrConcatenateIfNotEmpty(clusterContent);
} else if (clusterLabel.equals(TaggingLabels.HEADER_ABSTRACT)) {
if (biblio.getAbstract() != null) {
// this will need to be reviewed with more training data, for the moment
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ private List<Table> getExtractionResult(List<LayoutToken> tokenizations, String
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_OTHER)) {
table.addDiscardedPieceTokens(cluster.concatTokens());
table.addLayoutTokens(tokens);
} else if (clusterLabel.equals(TBL_CONTENT)) {
table.appendContent(clusterContent);
table.getContentTokens().addAll(tokens);
Expand Down

0 comments on commit 32cb065

Please sign in to comment.