Skip to content

Commit

Permalink
Merge pull request kermitt2#1248 from kermitt2/bugfix/flavor-discarded-text
Browse files Browse the repository at this point in the history

Avoid collecting text twice
  • Loading branch information
lfoppiano authored Feb 19, 2025
2 parents 32cb065 + 5163fb2 commit 66e641e
Showing 1 changed file with 0 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -286,35 +286,6 @@ else if (config.getConsolidateCitations() == 2)
//Correct subsequent I-<figure> or I-<table>
bodyResults = LabelUtils.postProcessFulltextFixInvalidTableOrFigure(bodyResults);

if (flavor != null) {
// To avoid losing potential data, we also add to the body the part of the header
// that was discarded.

String resultHeader = resHeader.getDiscardedPiecesTokens()
.stream()
.flatMap(ll -> ll.stream()
.filter(l -> StringUtils.isNotBlank(l.getText()))
.map(l -> l.getText() + "\t" + PARAGRAPH_LABEL)
)
.collect(Collectors.joining("\n"));

List<LayoutToken> tokensHeader = resHeader.getDiscardedPiecesTokens()
.stream()
.flatMap(Collection::stream)
.collect(Collectors.toList());

// Add I- prefix on the first label of the discarded pieces from the header
String[] resultHeaderAsArray = resultHeader.split("\n");
resultHeaderAsArray[0] = resultHeaderAsArray[0].replace(PARAGRAPH_LABEL, "I-" + PARAGRAPH_LABEL);
resultHeader = String.join("\n", resultHeaderAsArray);

bodyResults = StringUtils.strip(resultHeader + "\n" + bodyResults);
List<LayoutToken> concatenatedTokenization = Stream
.concat(tokensHeader.stream(), bodyLayoutTokens.getTokenization().stream())
.collect(Collectors.toList());
bodyLayoutTokens.setTokenization(concatenatedTokenization);
}

// we apply now the figure and table models based on the fulltext labeled output
figures = processFigures(bodyResults, bodyLayoutTokens.getTokenization(), doc);
// further parse the caption
Expand Down

0 comments on commit 66e641e

Please sign in to comment.