Skip to content

Commit 973eb47

Browse files
Issue apache#1042: Forbid all rules by default
Signed-off-by: Michael Dinzinger <[email protected]>
1 parent 5573ed6 commit 973eb47

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

core/src/main/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParser.java

+13-6
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ public class HttpRobotRulesParser extends RobotRulesParser {
3737

3838
protected Metadata fetchRobotsMd;
3939

40+
protected boolean allow5xx = false;
41+
4042
HttpRobotRulesParser() {}
4143

4244
public HttpRobotRulesParser(Config conf) {
@@ -51,6 +53,7 @@ public void setConf(Config conf) {
5153
/* http.content.limit for fetching the robots.txt */
5254
int robotsTxtContentLimit = ConfUtils.getInt(conf, "http.robots.content.limit", -1);
5355
fetchRobotsMd.addValue("http.content.limit", Integer.toString(robotsTxtContentLimit));
56+
allow5xx = ConfUtils.getBoolean(conf, "http.robots.5xx.allow", false);
5457
}
5558

5659
/** Compose unique key to store and access robot rules in cache for given URL */
@@ -165,16 +168,20 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
165168
response.getContent() != null ? response.getContent().length : 0);
166169
}
167170
}
168-
if (code == 200) // found rules: parse them
169-
{
171+
172+
// Parse the rules if found; by default, all robots are forbidden (RFC 9309)
173+
robotRules = FORBID_ALL_RULES;
174+
if (code == 200) {
170175
String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
171176
robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
172-
} else if ((code == 403) && (!allowForbidden)) {
173-
robotRules = FORBID_ALL_RULES; // use forbid all
177+
} else if (code == 403 && allowForbidden) {
178+
robotRules = EMPTY_RULES; // allow all
174179
} else if (code >= 500) {
175180
cacheRule = false;
176-
robotRules = EMPTY_RULES;
177-
} else robotRules = EMPTY_RULES; // use default rules
181+
if (allow5xx) {
182+
robotRules = EMPTY_RULES; // allow all
183+
}
184+
}
178185
} catch (Throwable t) {
179186
LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
180187
cacheRule = false;

0 commit comments

Comments
 (0)