@@ -34,6 +34,8 @@
 public class HttpRobotRulesParser extends RobotRulesParser {
 
     protected boolean allowForbidden = false;
+
+    protected boolean allow5xx = false;
 
     protected Metadata fetchRobotsMd;
 
@@ -53,6 +55,7 @@ public void setConf(Config conf) {
         /* http.content.limit for fetching the robots.txt */
         int robotsTxtContentLimit = ConfUtils.getInt(conf, "http.robots.content.limit", -1);
         fetchRobotsMd.addValue("http.content.limit", Integer.toString(robotsTxtContentLimit));
+        allow5xx = ConfUtils.getBoolean(conf, "http.robots.5xx.allow", false);
     }
 
     /** Compose unique key to store and access robot rules in cache for given URL */
@@ -174,16 +177,20 @@ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
                     break;
                 }
             }
-            if (code == 200) // found rules: parse them
-            {
+
+            // Parsing found rules; by default, all robots are forbidden (RFC 9309)
+            robotRules = FORBID_ALL_RULES;
+            if (code == 200) {
                 String ct = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                 robotRules = parseRules(url.toString(), response.getContent(), ct, agentNames);
-            } else if ((code == 403) && (!allowForbidden)) {
-                robotRules = FORBID_ALL_RULES; // use forbid all
+            } else if (code == 403 && allowForbidden) {
+                robotRules = EMPTY_RULES; // allow all
             } else if (code >= 500) {
                 cacheRule = false;
-                robotRules = EMPTY_RULES;
-            } else robotRules = EMPTY_RULES; // use default rules
+                if (allow5xx) {
+                    robotRules = EMPTY_RULES; // allow all
+                }
+            }
         } catch (Throwable t) {
             LOG.info("Couldn't get robots.txt for {} : {}", url, t.toString());
             cacheRule = false;
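For context, a minimal usage sketch of how the new property could be supplied and read; it is not part of the change above. Only the key name "http.robots.5xx.allow" and its default come from the diff; the package names for Config and ConfUtils are assumptions and may differ depending on the project version.

// Usage sketch (assumption, not part of this change): supplying and reading
// the new property through the map-backed configuration consumed by setConf().
import org.apache.storm.Config;
import com.digitalpebble.stormcrawler.util.ConfUtils; // assumed package name

public class Allow5xxConfigSketch {
    public static void main(String[] args) {
        Config conf = new Config();
        conf.put("http.robots.5xx.allow", true); // property introduced by this change

        // Mirrors the lookup added to setConf(); defaults to false when the key is absent.
        boolean allow5xx = ConfUtils.getBoolean(conf, "http.robots.5xx.allow", false);

        // With allow5xx == true, a 5xx response for robots.txt is treated as "allow all"
        // (EMPTY_RULES); the result is still not cached because cacheRule is set to false.
        System.out.println("allow5xx = " + allow5xx);
    }
}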