Skip to content

Commit

Permalink
Glob field mapping for indexer.md.mapping (#1130)
Browse files Browse the repository at this point in the history
* started on globs

Signed-off-by: Julien Nioche <[email protected]>

* added test for indexer.md.mapping and improve doc in conf files

Signed-off-by: Julien Nioche <[email protected]>

* Removed code commented out

Signed-off-by: Julien Nioche <[email protected]>

---------

Signed-off-by: Julien Nioche <[email protected]>
  • Loading branch information
jnioche authored Dec 4, 2023
1 parent 87145c3 commit 642cf5f
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 26 deletions.
11 changes: 11 additions & 0 deletions archetype/src/main/resources/archetype-resources/crawler-conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as the
# field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would store only
# the first value for the metadata _parse.title_. If no index is specified,
# all the values found for the key are stored.
# Finally, you can use a trailing glob (*) to match several keys, e.g. _parse.*_
# would index all the keys with _parse._ as a prefix. Note that in that case, you
# can't specify an alias with =, nor can you specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,12 @@
import crawlercommons.domains.PaidLevelDomain;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
Expand Down Expand Up @@ -71,7 +74,7 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt {

private String[] filterKeyValue = null;

private final Map<Key, String> metadata2field = new HashMap<>();
private final List<Key> metadata2field = new ArrayList<>();

private String fieldNameForText = null;

Expand All @@ -83,22 +86,46 @@ public abstract class AbstractIndexerBolt extends BaseRichBolt {

private boolean ignoreEmptyFields = false;

// Describes one entry of indexer.md.mapping: the metadata key to read, an
// optional alias, an optional value index and whether the key is a glob.
static class Key {
private static class Key {
// metadata key; for a glob this is the key without its trailing '*'
private final String key;
// alias given for the key, or null when none was given
private final String alias;
// position of the single value to keep; -1 means no index was specified
private final int index;
// true when the configured key ended with '*'
private final boolean glob;

// Builds a mapping entry. A key ending with '*' is treated as a glob: the
// '*' is stripped from the stored key, and a RuntimeException is thrown if
// an index (anything other than -1) or an alias was also supplied.
public Key(String key, int index) {
this.key = key;
public Key(String key, int index, String alias) {
this.index = index;
this.alias = alias;
if (key.endsWith("*")) {
// keep only the part before the '*'
this.key = key.substring(0, key.length() - 1);
this.glob = true;
// can't have an alias
// or an index
if (index != -1 || alias != null) {
throw new RuntimeException(
"Can't have a mapping for indexer.md.mapping with a glob and index or alias");
}
} else {
this.key = key;
this.glob = false;
}
}

// Returns the metadata key (without the trailing '*' for a glob).
public String getKey() {
return key;
}

// Returns the alias, or null when none was set.
public String getAlias() {
// return the alias if set
return alias;
}

// Returns the value index, -1 when none was specified.
public int getIndex() {
return index;
}

// Returns true when this entry was declared with a trailing '*'.
public boolean isGlob() {
return glob;
}
}

@Override
Expand Down Expand Up @@ -137,15 +164,15 @@ public void prepare(
} else {
mapping = mapping.trim();
key = mapping;
value = mapping;
value = null;
}
int index = -1;
Matcher match = indexValuePattern.matcher(key);
if (match.find()) {
index = Integer.parseInt(match.group(1));
key = key.substring(0, match.start());
}
metadata2field.put(new Key(key, index), value);
metadata2field.add(new Key(key, index, value));
LOG.info("Mapping key {} to field {}", key, value);
}

Expand Down Expand Up @@ -175,26 +202,42 @@ protected Map<String, String[]> filterMetadata(Metadata meta) {

Map<String, String[]> fieldVals = new HashMap<>();

for (Entry<Key, String> entry : metadata2field.entrySet()) {
Key key = entry.getKey();
String[] values = meta.getValues(key.key);
// not found
if (values == null || values.length == 0) {
continue;
}
// check whether we want a specific value or all of them?
int index = key.index;
// want a value index that is outside the range given
if (index >= values.length) {
continue;
}
// store all values available
if (index == -1) {
fieldVals.put(entry.getValue(), values);
for (Key key : metadata2field) {
Set<String> matchingKeys = new HashSet<>();
// if it is a glob - look for all matching entries in the metadata
if (key.isGlob()) {
matchingKeys = meta.keySet(key.getKey());
} else {
matchingKeys.add(key.getKey());
}
// or only the one we want
else {
fieldVals.put(entry.getValue(), new String[] {values[index]});

for (String matchingKey : matchingKeys) {
String[] values = meta.getValues(matchingKey);
String label = matchingKey;

// won't be the case for globs
if (key.getAlias() != null) {
label = key.getAlias();
}

// not found
if (values == null || values.length == 0) {
continue;
}
// check whether we want a specific value or all of them?
int index = key.index;
// want a value index that is outside the range given
if (index >= values.length) {
continue;
}
// store all values available
if (index == -1) {
fieldVals.put(label, values);
}
// or only the one we want
else {
fieldVals.put(label, new String[] {values[index]});
}
}
}

Expand Down
11 changes: 11 additions & 0 deletions core/src/main/resources/crawler-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,17 @@ config:
indexer.text.fieldname: "content"
indexer.text.maxlength: -1
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as the
# field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would store only
# the first value for the metadata _parse.title_. If no index is specified,
# all the values found for the key are stored.
# Finally, you can use a trailing glob (*) to match several keys, e.g. _parse.*_
# would index all the keys with _parse._ as a prefix. Note that in that case, you
# can't specify an alias with =, nor can you specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,25 @@ public void testEmptyFilterMetadata() throws Exception {
new String[] {"url"},
fields.keySet().toArray());
}

/**
 * Checks that a glob mapping ({@code parse.*}) indexes every metadata key sharing the
 * prefix and that each matched key is used as the field name.
 */
@Test
public void testGlobFilterMetadata() throws Exception {
    Map<String, Object> config = new HashMap<>();
    config.put(AbstractIndexerBolt.urlFieldParamName, "url");
    List<String> listKV = new ArrayList<>();
    listKV.add("parse.*");
    config.put(AbstractIndexerBolt.metadata2fieldParamName, listKV);

    prepareIndexerBolt(config);

    Metadata metadata = new Metadata();
    metadata.setValue("parse.title", "This is the title");
    metadata.setValue("parse.keywords", "keyword1, keyword2, keyword3");
    metadata.setValue("parse.description", "This is the description");

    index(URL, metadata);
    Map<String, String> fields = ((DummyIndexer) bolt).returnFields();

    // the three parse.* entries plus one more field (presumably the url field)
    Assert.assertEquals("Incorrect number of fields", 4, fields.keySet().size());

    // a glob cannot carry an alias, so each field must be named after its metadata key;
    // the count alone would not catch fields stored under the wrong name
    for (String expected : new String[] {"parse.title", "parse.keywords", "parse.description"}) {
        Assert.assertTrue("Missing field " + expected, fields.containsKey(expected));
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as the
# field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would store only
# the first value for the metadata _parse.title_. If no index is specified,
# all the values found for the key are stored.
# Finally, you can use a trailing glob (*) to match several keys, e.g. _parse.*_
# would index all the keys with _parse._ as a prefix. Note that in that case, you
# can't specify an alias with =, nor can you specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ config:
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# If no alias is specified with =alias, the metadata key itself is used as the
# field name. For instance, below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would store only
# the first value for the metadata _parse.title_. If no index is specified,
# all the values found for the key are stored.
# Finally, you can use a trailing glob (*) to match several keys, e.g. _parse.*_
# would index all the keys with _parse._ as a prefix. Note that in that case, you
# can't specify an alias with =, nor can you specify an index.
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
Expand Down

0 comments on commit 642cf5f

Please sign in to comment.