diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index 123c717c90..a585c4e6f5 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -2005,8 +2005,15 @@ public void segmentIntoSentences(Element curParagraph, List curPara List offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text); forbiddenPositions.addAll(offsetPositionsUrls); - List theSentences = - SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang)); + Language language = new Language("en"); + if (lang != null) { + language = new Language(lang); + } else { + LOGGER.warn("There wasn't enough usable text to detect the language. Defaulting to English (en) for applying sentence segmentation. "); + } + + List theSentences = + SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, language); /*if (theSentences.size() == 0) { // this should normally not happen, but it happens (depending on sentence splitter, usually the text