diff --git a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java index e602323f4e..934f3e5d16 100755 --- a/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java +++ b/grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java @@ -1286,14 +1286,18 @@ public void setInstitution(String inst) { institution = StringUtils.normalizeSpace(inst); } - public void setNote(String not) { + public void setNoteOrConcatenateIfNotEmpty(String note) { if (StringUtils.isBlank(this.note)) { - note = StringUtils.normalizeSpace(not); + this.note = StringUtils.normalizeSpace(note); } else { - note += " " + StringUtils.normalizeSpace(not); + this.note += " " + StringUtils.normalizeSpace(note); } } + public void setNote(String not) { + note = StringUtils.normalizeSpace(not); + } + public void setAffiliation(String a) { affiliation = a; } diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 123c717c90..a585c4e6f5 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2005,8 +2005,15 @@ public void segmentIntoSentences(Element curParagraph, List curPara List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text); forbiddenPositions.addAll(offsetPositionsUrls); - List theSentences = - SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); + Language language = new Language("en"); + if (lang != null) { + language = new Language(lang); + } else { + LOGGER.warn("There wasn't enough usable text to detect the language. Defaulting to English (en) for applying sentence segmentation. "); + } + + List theSentences = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, language); /*if (theSentences.size() == 0) { // this should normally not happen, but it happens (depending on sentence splitter, usually the text diff --git a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java index 8ebf46c2b3..e6dda2f053 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/CitationParser.java @@ -502,9 +502,9 @@ public BiblioItem resultExtractionLayoutTokens(String result, if (biblio.getTitle() == null) biblio.setTitle(clusterContent); else if (biblio.getTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); else { - biblio.setNote(biblio.getTitle()); + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getTitle()); biblio.setTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_AUTHOR)) { @@ -528,18 +528,18 @@ else if (biblio.getTitle().length() >= clusterContent.length()) if (biblio.getBookTitle() == null) biblio.setBookTitle(clusterContent); else if (biblio.getBookTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); else { - biblio.setNote(biblio.getBookTitle()); + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getBookTitle()); biblio.setBookTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_SERIES)) { if (biblio.getSerieTitle() == null) biblio.setSerieTitle(clusterContent); - else if (biblio.getSerieTitle().length() >= clusterContent.length()) - biblio.setNote(clusterContent); - else { - biblio.setNote(biblio.getSerieTitle()); + else if (biblio.getSerieTitle().length() >= clusterContent.length()) { + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); + } else { + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getSerieTitle()); biblio.setSerieTitle(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_PAGES)) { @@ -553,32 +553,32 @@ else if (biblio.getSerieTitle().length() >= clusterContent.length()) else biblio.setCollaboration(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_JOURNAL)) { - if (biblio.getJournal() == null) + if (biblio.getJournal() == null) { biblio.setJournal(clusterContent); - else if (biblio.getJournal().length() >= clusterContent.length()) - biblio.setNote(clusterContent); - else { - biblio.setNote(biblio.getJournal()); + }else if (biblio.getJournal().length() >= clusterContent.length()) { + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); + } else { + biblio.setNoteOrConcatenateIfNotEmpty(biblio.getJournal()); biblio.setJournal(clusterContent); } } else if (clusterLabel.equals(TaggingLabels.CITATION_VOLUME)) { - if (biblio.getVolumeBlock() == null) - biblio.setVolumeBlock(clusterContent, volumePostProcess); + if (biblio.getVolumeBlock() == null) { + biblio.setVolumeBlock(clusterContent, volumePostProcess); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_ISSUE)) { - if (biblio.getIssue() == null) + if (biblio.getIssue() == null) { biblio.setIssue(clusterContent); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_EDITOR)) { biblio.setEditors(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_INSTITUTION)) { - if (biblio.getInstitution() != null) + if (biblio.getInstitution() != null) { biblio.setInstitution(biblio.getInstitution() + " ; " + clusterContent); - else - biblio.setInstitution(clusterContent); + } else { + biblio.setInstitution(clusterContent); + } } else if (clusterLabel.equals(TaggingLabels.CITATION_NOTE)) { - if (biblio.getNote() != null) - biblio.setNote(biblio.getNote()+ ". " + clusterContent); - else - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); } else if (clusterLabel.equals(TaggingLabels.CITATION_PUBNUM)) { String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens()); biblio.setPubnum(clusterNonDehypenizedContent); diff --git a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java index 8070696561..51329ae86e 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/HeaderParser.java @@ -947,10 +947,7 @@ else if (biblio.getPublicationDate() == null) } else biblio.setInstitution(clusterContent); }*/ else if (clusterLabel.equals(TaggingLabels.HEADER_NOTE)) { - if (biblio.getNote() != null) { - biblio.setNote(biblio.getNote() + " " + clusterContent); - } else - biblio.setNote(clusterContent); + biblio.setNoteOrConcatenateIfNotEmpty(clusterContent); } else if (clusterLabel.equals(TaggingLabels.HEADER_ABSTRACT)) { if (biblio.getAbstract() != null) { // this will need to be reviewed with more training data, for the moment diff --git a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java index c58b96efa4..7c655399e7 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/TableParser.java @@ -104,6 +104,7 @@ private List getExtractionResult(List tokenizations, String table.addLayoutTokens(tokens); } else if (clusterLabel.equals(TBL_OTHER)) { table.addDiscardedPieceTokens(cluster.concatTokens()); + table.addLayoutTokens(tokens); } else if (clusterLabel.equals(TBL_CONTENT)) { table.appendContent(clusterContent); table.getContentTokens().addAll(tokens);