Skip to content

Commit

Permalink
Default language to English when applying sentence segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Feb 16, 2025
1 parent 05baeb9 commit acd4bd5
Showing 1 changed file with 9 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2005,8 +2005,15 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
List<OffsetPosition> offsetPositionsUrls = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(curParagraphTokens, annotations, text);
forbiddenPositions.addAll(offsetPositionsUrls);

List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, new Language(lang));
Language language = new Language("en");
if (lang != null) {
language = new Language(lang);
} else {
LOGGER.warn("There wasn't enough usable text to detect the language. Defaulting to English (en) for applying sentence segmentation. ");
}

List<OffsetPosition> theSentences =
SentenceUtilities.getInstance().runSentenceDetection(text, forbiddenPositions, curParagraphTokens, language);

/*if (theSentences.size() == 0) {
// this should normally not happen, but it happens (depending on sentence splitter, usually the text
Expand Down

0 comments on commit acd4bd5

Please sign in to comment.