diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 494665a05b..c94aca004c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -277,12 +277,17 @@ else if (config.getConsolidateCitations() == 2) } } + long numberFiguresFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.FIGURE_LABEL)) + .count(); + List
badFigures = figures.stream() .filter(f -> !f.isCompleteForTEI()) .collect(Collectors.toList()); - LOGGER.info("Identified bad figures: " + badFigures.size()); - bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL); + LOGGER.info("Number of figures badly formatted or incomplete we identified: " + badFigures.size()); + bodyResults = revertResultsForBadItems(badFigures, bodyResults, TaggingLabels.FIGURE_LABEL, + !(figures.size() > numberFiguresFulltextModel)); figures = figures.stream() .filter(f -> !badFigures.contains(f)) @@ -290,6 +295,10 @@ else if (config.getConsolidateCitations() == 2) tables = processTables(bodyResults, bodyLayoutTokens.getTokenization(), doc); + long numberTablesFulltextModel = Arrays.stream(bodyResults.split("\n")) + .filter(r -> r.endsWith("I-" + TaggingLabels.TABLE_LABEL)) + .count(); + //We deal with tables considered bad by reverting them as , to reduce the risk them to be // dropped later on. @@ -299,8 +308,9 @@ else if (config.getConsolidateCitations() == 2) .filter(t -> !(t.isCompleteForTEI() && t.validateTable())) .collect(Collectors.toList()); - LOGGER.info("Identified bad tables: " + badTables.size()); - bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL); + LOGGER.info("Number of tables badly formatted or incomplete we identified: " + badTables.size()); + bodyResults = revertResultsForBadItems(badTables, bodyResults, TaggingLabels.TABLE_LABEL, + !(tables.size() > numberTablesFulltextModel)); tables = tables.stream() .filter(t-> !badTables.contains(t)) @@ -367,21 +377,21 @@ else if (config.getConsolidateCitations() == 2) } static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel) { + return revertResultsForBadItems(badFiguresOrTables, resultBody, itemLabel, true); + } + + static String revertResultsForBadItems(List badFiguresOrTables, String resultBody, String itemLabel, boolean strict) { //LF: we update the resultBody sequence by reverting these tables as elements if (CollectionUtils.isNotEmpty(badFiguresOrTables)) { List> labelledResultsAsList = Arrays.stream(resultBody.split("\n")) .map(l -> Arrays.stream(l.split("\t")).collect(Collectors.toList())) .collect(Collectors.toList()); - long numberItems = labelledResultsAsList.stream() - .filter(r -> Iterables.getLast(r).startsWith("I-" + itemLabel)) - .count(); - for (Figure badItem : badFiguresOrTables) { // Find the index of the first layoutToken of the table in the tokenization List layoutTokenItem = badItem.getLayoutTokens(); List candidateIndexes = findCandidateIndex(layoutTokenItem, labelledResultsAsList, - itemLabel, !(badFiguresOrTables.size() > numberItems)); + itemLabel, strict); if (candidateIndexes.isEmpty()) { LOGGER.info("Cannot find the candidate index for fixing the tables."); continue; @@ -455,7 +465,8 @@ static int consolidateResultCandidateThroughSequence(List candidateInde * Find a set of candidates representing the indexes from the labelledResults which could correspond * to the first token of the figure/table * - * strict = True check the I- or I-
first and then the
or
only if there are not candidates + * strict = True then it will check the items related to I-
or I-
first + * and then the
or
only if there are not candidates * strict = False is usually necessary if there are more tables than I- token, this because a figure/table could be * identified within the sequence initially provided by the fulltext model *