@@ -37,6 +37,8 @@ public class HttpRobotRulesParser extends RobotRulesParser {
 
     protected Metadata fetchRobotsMd;
 
+    protected boolean allow5xx = false;
+
     HttpRobotRulesParser() {}
 
     public HttpRobotRulesParser(Config conf) {
@@ -51,6 +53,7 @@ public void setConf(Config conf) {
         /* http.content.limit for fetching the robots.txt */
         int robotsTxtContentLimit = ConfUtils.getInt(conf, "http.robots.content.limit", -1);
         fetchRobotsMd.addValue("http.content.limit", Integer.toString(robotsTxtContentLimit));
+        allow5xx = ConfUtils.getBoolean(conf, "http.robots.5xx.allow", false);
     }
 
     /** Compose unique key to store and access robot rules in cache for given URL */
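For orientation, a minimal sketch of how the new switch introduced above might be turned on, assuming a StormCrawler-style setup where a map-like Config is handed to the parser's constructor and on to setConf(). Only the "http.robots.5xx.allow" key, the "http.robots.content.limit" key, and the HttpRobotRulesParser(Config) constructor are taken from the diff; the Config instance, imports, and surrounding topology wiring are assumptions for illustration.

// Sketch only; exact package imports depend on the crawler version in use.
Config conf = new Config();

// New property from this change: when the robots.txt fetch returns a 5xx,
// treat it as "allow all" instead of the default "forbid all".
conf.put("http.robots.5xx.allow", true);

// Existing robots.txt fetch limit read in setConf() above (-1 = no limit).
conf.put("http.robots.content.limit", -1);

HttpRobotRulesParser robotsParser = new HttpRobotRulesParser(conf);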
@@ -165,16 +168,20 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
                             response.getContent() != null ? response.getContent().length : 0);
                 }
             }
-            if (code == 200) // found rules: parse them
-            {
+
+            // Parsing found rules; by default, all robots are forbidden (RFC 9309)
+            robotRules = FORBID_ALL_RULES;
+            if (code == 200) {
                 String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                 robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
-            } else if ((code == 403) && (!allowForbidden)) {
-                robotRules = FORBID_ALL_RULES; // use forbid all
+            } else if (code == 403 && allowForbidden) {
+                robotRules = EMPTY_RULES; // allow all
             } else if (code >= 500) {
                 cacheRule = false;
-                robotRules = EMPTY_RULES;
-            } else robotRules = EMPTY_RULES; // use default rules
+                if (allow5xx) {
+                    robotRules = EMPTY_RULES; // allow all
+                }
+            }
         } catch (Throwable t) {
             LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
             cacheRule = false;
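To make the resulting behaviour easier to follow, here is a self-contained paraphrase of the rewritten branch; the class, enum, and method names are illustrative only and are not part of the patched file. It mirrors the post-change logic: forbid-all is the starting point, 200 parses the fetched rules, 403 yields allow-all only when allowForbidden is set, and 5xx yields allow-all only when the new allow5xx flag is set (caching is disabled for 5xx either way).

// Illustrative paraphrase of the branch above; not part of HttpRobotRulesParser.
final class RobotsDecisionSketch {

    enum Decision { PARSE_CONTENT, ALLOW_ALL, FORBID_ALL }

    static Decision decide(int code, boolean allowForbidden, boolean allow5xx) {
        if (code == 200) {
            return Decision.PARSE_CONTENT; // parse the fetched robots.txt
        }
        if (code == 403 && allowForbidden) {
            return Decision.ALLOW_ALL;     // 403 treated as "no restrictions" when opted in
        }
        if (code >= 500 && allow5xx) {
            return Decision.ALLOW_ALL;     // 5xx treated as "no restrictions" when opted in
        }
        return Decision.FORBID_ALL;        // default in the patched code (RFC 9309 comment)
    }

    public static void main(String[] args) {
        System.out.println(decide(503, false, false)); // FORBID_ALL
        System.out.println(decide(503, false, true));  // ALLOW_ALL
        System.out.println(decide(404, false, false)); // FORBID_ALL under the new default
    }
}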