Skip to content

Commit

Permalink
apache#1248 - Use pre-compiled patterns for mime type matching in TikaParser (apache#1249)
Browse files Browse the repository at this point in the history

Co-authored-by: Julien Nioche <[email protected]>
  • Loading branch information
rzo1 and jnioche authored Jul 5, 2024
1 parent dc84c56 commit ef0899e
Showing 1 changed file with 14 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.storm.metric.api.MultiCountMetric;
Expand Down Expand Up @@ -92,7 +93,7 @@ public class ParserBolt extends BaseRichBolt {
private boolean emitOutlinks = true;

/** Regular expressions to apply to the mime-type. */
private List<String> mimeTypeWhiteList = new LinkedList<>();
private List<Pattern> mimeTypeWhiteList = new LinkedList<>();

private String protocolMDprefix;

Expand Down Expand Up @@ -125,7 +126,15 @@ public void prepare(
throw e;
}

mimeTypeWhiteList = ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf);
final List<String> mimeTypeWhiteListStrings =
ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf);
for (String mt : mimeTypeWhiteListStrings) {
try {
this.mimeTypeWhiteList.add(Pattern.compile(mt));
} catch (RuntimeException e) {
LOG.warn("Failed to compile whitelist regex: {}", mt);
}
}

protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");

Expand All @@ -149,7 +158,7 @@ public void execute(Tuple tuple) {
Metadata metadata = (Metadata) tuple.getValueByField("metadata");

// check that the mimetype is in the whitelist
if (mimeTypeWhiteList.size() > 0) {
if (!mimeTypeWhiteList.isEmpty()) {
boolean mt_match = false;
// see if a mimetype was guessed in JSOUPBolt
String mimeType = metadata.getFirstValue("parse.Content-Type");
Expand All @@ -158,8 +167,8 @@ public void execute(Tuple tuple) {
mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE, this.protocolMDprefix);
}
if (mimeType != null) {
for (String mt : mimeTypeWhiteList) {
if (mimeType.matches(mt)) {
for (Pattern mt : mimeTypeWhiteList) {
if (mt.matcher(mimeType).matches()) {
mt_match = true;
break;
}
Expand Down

0 comments on commit ef0899e

Please sign in to comment.