Skip to content

Commit f768ec9

Browse files
Rebase - Issue apache#1042: Forbid all rules by default
Signed-off-by: Michael Dinzinger <[email protected]>
1 parent 4ddf884 commit f768ec9

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

core/src/main/java/com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParser.java

+13-6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
 public class HttpRobotRulesParser extends RobotRulesParser {
 
     protected boolean allowForbidden = false;
+
+    protected boolean allow5xx = false;
 
     protected Metadata fetchRobotsMd;
 

@@ -53,6 +55,7 @@ public void setConf(Config conf) {
         /* http.content.limit for fetching the robots.txt */
         int robotsTxtContentLimit = ConfUtils.getInt(conf, "http.robots.content.limit", -1);
         fetchRobotsMd.addValue("http.content.limit", Integer.toString(robotsTxtContentLimit));
+        allow5xx = ConfUtils.getBoolean(conf, "http.robots.5xx.allow", false);
     }
 
     /** Compose unique key to store and access robot rules in cache for given URL */
@@ -174,16 +177,20 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
                     break;
                 }
             }
-            if (code == 200) // found rules: parse them
-            {
+
+            // Parsing found rules; by default, all robots are forbidden (RFC 9309)
+            robotRules = FORBID_ALL_RULES;
+            if (code == 200) {
                 String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                 robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
-            } else if ((code == 403) && (!allowForbidden)) {
-                robotRules = FORBID_ALL_RULES; // use forbid all
+            } else if (code == 403 && allowForbidden) {
+                robotRules = EMPTY_RULES; // allow all
             } else if (code >= 500) {
                 cacheRule = false;
-                robotRules = EMPTY_RULES;
-            } else robotRules = EMPTY_RULES; // use default rules
+                if (allow5xx) {
+                    robotRules = EMPTY_RULES; // allow all
+                }
+            }
         } catch (Throwable t) {
             LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
             cacheRule = false;

0 commit comments

Comments
 (0)