Skip to content

Commit

Permalink
feat(robots-txt): Configurable line-skipping rules
Browse files Browse the repository at this point in the history
  • Loading branch information
alturkovic committed May 22, 2023
1 parent d0cdd54 commit a1608b8
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 3 deletions.
30 changes: 30 additions & 0 deletions src/main/kotlin/com/github/alturkovic/robots/txt/LineFilter.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package com.github.alturkovic.robots.txt

/**
 * Decides, one line at a time, whether a line of input should be kept or skipped.
 *
 * Declared as a functional (SAM) interface so that simple filters can be written as a lambda,
 * e.g. `LineFilter { it.isNotBlank() }`. Class-based implementations keep working unchanged.
 * Implementations may be stateful, in which case the order of [accept] calls matters.
 */
fun interface LineFilter {
    /**
     * @param line a single line of input
     * @return `true` to keep the line, `false` to skip it
     */
    fun accept(line: String): Boolean
}

/**
 * [LineFilter] that keeps a line only when its length does not exceed [maxSize].
 *
 * Length is taken from [String.length] (UTF-16 code units) and the limit is inclusive:
 * a line of exactly [maxSize] characters is accepted. The filter holds no mutable state.
 *
 * @param maxSize the largest line length that is still accepted
 */
class LineSizeFilter(
    private val maxSize: Int
) : LineFilter {
    override fun accept(line: String): Boolean {
        val tooLong = line.length > maxSize
        return !tooLong
    }
}

/**
 * [LineFilter] that accepts the first [maxLines] lines it is offered and rejects every later one.
 *
 * Each call to [accept] counts as one line, regardless of the line's content. The filter is
 * stateful, so a fresh instance is needed for every input; the counter is not synchronized.
 *
 * @param maxLines how many lines are accepted before the filter starts rejecting; a value of
 * zero or less rejects every line
 */
class RobotsLinesFilter(
    private val maxLines: Int
) : LineFilter {
    // Number of lines accepted so far; never grows beyond maxLines.
    private var acceptedLines = 0

    override fun accept(line: String): Boolean {
        // Stop counting once the limit is reached. An unconditional increment would eventually
        // wrap past Int.MAX_VALUE on a very long input and the filter would accept lines again.
        if (acceptedLines >= maxLines) return false
        acceptedLines++
        return true
    }
}

/**
 * [LineFilter] that accepts lines while the running size of everything offered so far stays
 * strictly below [maxByteSize], and rejects every line from the one that reaches the limit onward.
 *
 * Size is measured as the UTF-8 encoded length of each line, so multi-byte characters count with
 * their real weight; for pure ASCII input this equals the character count. Only the text passed
 * to [accept] is counted; anything removed before the call (for example line terminators) is not.
 * The filter is stateful, so a fresh instance is needed for every input; the running total is
 * not synchronized.
 *
 * @param maxByteSize exclusive upper bound for the accumulated size in bytes; a value of zero or
 * less rejects every line
 */
class RobotsSizeFilter(
    private val maxByteSize: Int
) : LineFilter {
    // Kept as Long so that adding one more line to an Int-sized budget can never wrap around.
    private var seenBytes = 0L

    override fun accept(line: String): Boolean {
        // Once the budget is spent nothing can be accepted any more: skip the encoding work and
        // keep the total from growing without bound on very large inputs.
        if (seenBytes >= maxByteSize) return false

        seenBytes += line.toByteArray(Charsets.UTF_8).size
        return seenBytes < maxByteSize
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ object RobotsTxtReader {
input: InputStream,
ruleMatchingStrategy: RuleMatchingStrategy = WildcardRuleMatchingStrategy,
ruleSelectionStrategy: RuleSelectionStrategy = LongestRuleSelectionStrategy,
ignoreLinesLongerThan: Long = Long.MAX_VALUE
lineFilter: LineFilter? = null
): RobotsTxt {
val builder = RobotsBuilder()

BufferedReader(InputStreamReader(input)).lineSequence()
.map { it.trim() }
.filter { it.length < ignoreLinesLongerThan }
.filter { lineFilter?.accept(it) ?: true }
.forEach {
when (val entry = RobotsLineParser.parseLine(it)) {
is UserAgentEntry -> builder.acceptUserAgent(entry.userAgent)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,57 @@ private class RobotsTxtReaderTest {
Disallow: /a/
Disallow: /bbbbbbbbbb
""".trimIndent().byteInputStream(),
ignoreLinesLongerThan = 20
lineFilter = LineSizeFilter(20)
)

assertThat(robotsTxt).isEqualTo(
RobotsTxt(
ruleGroups = listOf(
RuleGroup(
userAgents = setOf("FooBot"),
rules = listOf(
Rule(allowed = false, "/a/")
)
)
)
)
)
}

/**
 * Reads a three-line robots.txt through a [RobotsSizeFilter] with a limit of 32.
 *
 * The first two lines are 18 and 13 ASCII characters long, so the running total of 31 is still
 * below the limit; the third line (21 characters) pushes the total past it and is skipped.
 * The result is therefore expected to contain only the `/a/` rule for `FooBot`.
 */
@Test
fun shouldSkipTooLongRobotsUsingByteLimit() {
val robotsTxt = RobotsTxtReader.read(
"""
User-agent: FooBot
Disallow: /a/
Disallow: /bbbbbbbbbb
""".trimIndent().byteInputStream(),
// 32 leaves room for the first two lines (31 in total) but not for the third.
lineFilter = RobotsSizeFilter(32)
)

// The `/bbbbbbbbbb` rule must be missing because its line was filtered out.
assertThat(robotsTxt).isEqualTo(
RobotsTxt(
ruleGroups = listOf(
RuleGroup(
userAgents = setOf("FooBot"),
rules = listOf(
Rule(allowed = false, "/a/")
)
)
)
)
)
}

@Test
fun shouldSkipTooLongRobotsUsingLineLimit() {
val robotsTxt = RobotsTxtReader.read(
"""
User-agent: FooBot
Disallow: /a/
Disallow: /bbbbbbbbbb
""".trimIndent().byteInputStream(),
lineFilter = RobotsLinesFilter(2)
)

assertThat(robotsTxt).isEqualTo(
Expand Down

0 comments on commit a1608b8

Please sign in to comment.