Skip to content

Commit b4910c7

Browse files
committed
Positive numbers prefilter for lookup
1 parent b4b7291 commit b4910c7

File tree

2 files changed

+23
-22
lines changed

2 files changed

+23
-22
lines changed

vocabulary-lookup/src/main/java/org/gbif/vocabulary/lookup/InMemoryVocabularyLookup.java

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,17 @@
1313
*/
1414
package org.gbif.vocabulary.lookup;
1515

16-
import org.gbif.vocabulary.model.LanguageRegion;
17-
import org.gbif.vocabulary.model.export.ConceptExportView;
18-
import org.gbif.vocabulary.model.export.Export;
19-
import org.gbif.vocabulary.tools.VocabularyDownloader;
16+
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.normalizeLabel;
17+
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.normalizeName;
18+
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.replaceNonAsciiCharactersWithEquivalents;
2019

20+
import com.fasterxml.jackson.core.JsonParser;
21+
import com.fasterxml.jackson.databind.DeserializationContext;
22+
import com.fasterxml.jackson.databind.DeserializationFeature;
23+
import com.fasterxml.jackson.databind.JsonDeserializer;
24+
import com.fasterxml.jackson.databind.ObjectMapper;
25+
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
26+
import com.fasterxml.jackson.datatype.jsr310.deser.LocalDateTimeDeserializer;
2127
import java.io.IOException;
2228
import java.io.InputStream;
2329
import java.time.ZoneId;
@@ -34,24 +40,14 @@
3440
import java.util.Set;
3541
import java.util.function.Function;
3642
import java.util.function.UnaryOperator;
37-
38-
import org.cache2k.Cache;
39-
import org.cache2k.Cache2kBuilder;
40-
41-
import com.fasterxml.jackson.core.JsonParser;
42-
import com.fasterxml.jackson.databind.DeserializationContext;
43-
import com.fasterxml.jackson.databind.DeserializationFeature;
44-
import com.fasterxml.jackson.databind.JsonDeserializer;
45-
import com.fasterxml.jackson.databind.ObjectMapper;
46-
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
47-
import com.fasterxml.jackson.datatype.jsr310.deser.LocalDateTimeDeserializer;
48-
4943
import lombok.SneakyThrows;
5044
import lombok.extern.slf4j.Slf4j;
51-
52-
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.normalizeLabel;
53-
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.normalizeName;
54-
import static org.gbif.vocabulary.model.normalizers.StringNormalizer.replaceNonAsciiCharactersWithEquivalents;
45+
import org.cache2k.Cache;
46+
import org.cache2k.Cache2kBuilder;
47+
import org.gbif.vocabulary.model.LanguageRegion;
48+
import org.gbif.vocabulary.model.export.ConceptExportView;
49+
import org.gbif.vocabulary.model.export.Export;
50+
import org.gbif.vocabulary.tools.VocabularyDownloader;
5551

5652
/**
5753
* Class that loads a vocabulary export into memory to enable fast lookups by concept labels.
@@ -358,8 +354,8 @@ private void addHiddenLabelToCache(String hiddenLabel, ConceptExportView concept
358354
"Incorrect vocabulary: different concepts cannot have the same hidden label. "
359355
+ "The concept hidden label: {} in the concept: {} is also present in: {}",
360356
hiddenLabel,
361-
concept.toString(),
362-
existing.toString());
357+
concept,
358+
existing);
363359
}
364360
}
365361

vocabulary-lookup/src/main/java/org/gbif/vocabulary/lookup/PreFilters.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ public final class PreFilters {
2727
private static final Pattern NON_ALPHANUMERIC_PATTERN = Pattern.compile("[^A-Za-z0-9]");
2828
private static final Pattern NON_LETTERS_PATTERN = Pattern.compile("[^A-Za-z]");
2929
private static final Pattern NUMBERS_PATTERN = Pattern.compile("[0-9]");
30+
private static final Pattern POSITIVE_NUMBERS_PATTERN = Pattern.compile("[1-9]");
3031
private static final Pattern NUMERIC_PREFIX_PATTERN = Pattern.compile("^[0-9]+");
3132
private static final Pattern SIGNED_DECIMAL_NUMBERS_PREFIX_PATTERN =
3233
Pattern.compile("^[+-]?[0-9]+([.,][0-9]+)*|^[+-]?[0-9]*[.,][0-9]+");
@@ -40,6 +41,10 @@ public final class PreFilters {
4041
public static final UnaryOperator<String> REMOVE_NUMERIC =
4142
s -> NUMBERS_PATTERN.matcher(s).replaceAll(EMPTY);
4243

44+
/** Removes all non-zero digits (1-9); the digit 0 is left in place. */
45+
public static final UnaryOperator<String> REMOVE_POSITIVE_NUMERIC =
46+
s -> POSITIVE_NUMBERS_PATTERN.matcher(s).replaceAll(EMPTY);
47+
4348
/** Removes all the characters that are not letters. */
4449
public static final UnaryOperator<String> REMOVE_NON_LETTER =
4550
s -> NON_LETTERS_PATTERN.matcher(s).replaceAll(EMPTY);

0 commit comments

Comments
 (0)