crawler: add parameter class, typing, and improve code (#675)
* crawler: add params class

* fix tests

* remove failing test

* review code

* improve code and tests

* simplify code and improve class

* fix test

* fix types and simplify
adbar authored Aug 16, 2024
1 parent 0fb6e43 commit b538002
Showing 4 changed files with 241 additions and 236 deletions.
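
Note: the assertions in the new tests below (start, base, ref, lang, rules, i, known_num, is_on) suggest the added parameter object looks roughly like the following sketch. It is a reconstruction for orientation only, not the class actually committed to trafilatura/spider.py, and the base/ref derivation in particular is an assumption.

from dataclasses import dataclass, field
from typing import Optional
from urllib.robotparser import RobotFileParser

from courlan import get_base_url


@dataclass
class CrawlParameters:
    "Hypothetical reconstruction of the new parameter class, inferred from the tests."
    start: str                                # seed URL handed to the crawler
    lang: Optional[str] = None                # optional target language, e.g. "en"
    rules: Optional[RobotFileParser] = None   # robots.txt rules, filled in later
    i: int = 0                                # pages visited so far
    known_num: int = 0                        # known URLs for this host
    is_on: bool = True                        # whether the crawl should continue
    base: str = field(init=False)             # host-level base URL, derived below
    ref: str = field(init=False)              # path prefix used for scope checks

    def __post_init__(self) -> None:
        # The tests expect start == base == ref for a bare homepage URL,
        # so both are presumably derived from the seed (assumed logic).
        self.base = get_base_url(self.start)  # e.g. "https://httpbun.com"
        self.ref = self.start.rstrip("/")
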
206 changes: 96 additions & 110 deletions tests/spider_tests.py
@@ -12,7 +12,7 @@

from courlan import UrlStore

import trafilatura.spider # for global variables
from trafilatura import spider # for global variables

# from trafilatura.utils import LANGID_FLAG

@@ -22,13 +22,13 @@

def test_redirections():
"Test redirection detection."
_, _, baseurl = trafilatura.spider.probe_alternative_homepage("xyz")
_, _, baseurl = spider.probe_alternative_homepage("xyz")
assert baseurl is None
_, _, baseurl = trafilatura.spider.probe_alternative_homepage(
_, _, baseurl = spider.probe_alternative_homepage(
"https://httpbun.com/redirect-to?url=https://example.org"
)
assert baseurl == "https://example.org"
# _, _, baseurl = trafilatura.spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')
# _, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
@@ -63,9 +63,7 @@ def test_meta_redirections():
]

for htmlstring, homepage, expected_homepage in tests:
htmlstring2, homepage2 = trafilatura.spider.refresh_detection(
htmlstring, homepage
)
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert homepage2 == expected_homepage
if expected_homepage:
if expected_homepage == homepage:
@@ -77,211 +75,199 @@ def test_process_links():
def test_process_links():
"Test link extraction procedures."
base_url = "https://example.org"
params = spider.CrawlParameters(base_url)
htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'

# 1 internal link in total
trafilatura.spider.process_links(htmlstring, base_url)
assert len(trafilatura.spider.URL_STORE.find_known_urls(base_url)) == 1
assert len(trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)) == 1
spider.process_links(htmlstring, params)
assert len(spider.URL_STORE.find_known_urls(base_url)) == 1
assert len(spider.URL_STORE.find_unvisited_urls(base_url)) == 1

# same with content already seen
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert (
len(trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)) == 1
and len(trafilatura.spider.URL_STORE.find_known_urls(base_url)) == 1
len(spider.URL_STORE.find_unvisited_urls(base_url)) == 1
and len(spider.URL_STORE.find_known_urls(base_url)) == 1
)

# test navigation links
url1 = "https://example.org/tag/number1"
url2 = "https://example.org/page2"
htmlstring = f'<html><body><a href="{url1}"/><a href="{url2}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(known_links) == 3
assert len(todo) == 3 and todo[0] == url1

# test cleaning and language
url = "https://example.org/en/page1/?"
target = "https://example.org/en/page1/"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url, language="en")
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters(base_url, lang="en")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(known_links) == 4
assert len(todo) == 4 and target in todo # TODO: remove slash?

# test rejection of URLs out of scope
url = "https://example.org/section2/page2"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(
htmlstring, base_url, ref="https://example.org/section1"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters("https://example.org/section1/")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert url not in todo and len(known_links) == 4

# wrong language
url = "https://example.org/en/page2"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url, language="de")
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters(base_url, lang="de")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert url not in todo and len(known_links) == 4

# invalid links
params = spider.CrawlParameters(base_url)
htmlstring = '<html><body><a href="#anchor"/><a href="mailto:[email protected]"/><a href="tel:1234567890"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert len(known_links) == 4 and len(todo) == 4

# not crawlable
htmlstring = '<html><body><a href="https://example.org/login"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert len(known_links) == 4 and len(todo) == 4

# test queue evaluation
todo = deque()
assert trafilatura.spider.is_still_navigation(todo) is False
assert spider.is_still_navigation(todo) is False
todo.append("https://example.org/en/page1")
assert trafilatura.spider.is_still_navigation(todo) is False
assert spider.is_still_navigation(todo) is False
todo.append("https://example.org/tag/1")
assert trafilatura.spider.is_still_navigation(todo) is True
assert spider.is_still_navigation(todo) is True


def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
url = "https://httpbun.com/html"
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# erroneous webpage
with pytest.raises(ValueError):
_, _, _, _, _ = trafilatura.spider.init_crawl("xyz", None, None)
assert len(trafilatura.spider.URL_STORE.urldict) == 0
params = spider.CrawlParameters("xyz")
assert len(spider.URL_STORE.urldict) == 0

# empty request
trafilatura.spider.process_response(None, "https://example.org", None)
assert len(trafilatura.spider.URL_STORE.urldict) == 0
params = spider.CrawlParameters("https://example.org")
spider.process_response(None, params)
assert len(spider.URL_STORE.urldict) == 0
assert params.start == params.base == params.ref == "https://example.org"
assert params.i == 0 and params.known_num == 0 and params.is_on
assert params.lang is None and params.rules is None

# already visited
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url,
None,
[
url,
],
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
assert base_url == "https://httpbun.com"
assert i == 0 and known_num == 1
assert not is_on
params = spider.init_crawl(url, known=[url])
assert params.base == "https://httpbun.com"
assert params.i == 0 and params.known_num == 1
assert not params.is_on
assert not spider.URL_STORE.find_unvisited_urls(params.base)
assert spider.URL_STORE.find_known_urls(params.base) == ["https://httpbun.com/html"]

# normal webpage
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url, None, None
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
spider.URL_STORE = UrlStore(compressed=False, strict=False)
params = spider.init_crawl(url)
assert (
todo == []
and known_links
== [
url,
]
and base_url == "https://httpbun.com"
and i == 1
and not is_on
not spider.URL_STORE.find_unvisited_urls(params.base)
and [url] == spider.URL_STORE.find_known_urls(params.base)
and params.base == "https://httpbun.com"
and params.i == 1
and not params.is_on
)

# delay between requests
assert trafilatura.spider.URL_STORE.get_crawl_delay("https://httpbun.com") == 5
assert (
trafilatura.spider.URL_STORE.get_crawl_delay("https://httpbun.com", default=2.0)
== 2.0
)
assert spider.URL_STORE.get_crawl_delay("https://httpbun.com") == 5
assert spider.URL_STORE.get_crawl_delay("https://httpbun.com", default=2.0) == 2.0

# existing todo
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url,
[
url,
],
None,
)
assert base_url == "https://httpbun.com" and i == 0 and not is_on
params = spider.init_crawl(url, todo=[url, "http://irrelevant.com"])
assert not spider.URL_STORE.find_unvisited_urls(params.base)
assert params.base == "https://httpbun.com" and params.i == 0 and not params.is_on

# new todo
params = spider.init_crawl(url, todo=["https://httpbun.com/links/1/1"])
assert params.base == "https://httpbun.com"
assert spider.URL_STORE.find_unvisited_urls(params.base) == [
"https://httpbun.com/links/1/1"
]
assert params.i == 0 and params.is_on and params.known_num == 2


def test_crawl_page():
"Test page-by-page processing."
base_url = "https://httpbun.com"
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
trafilatura.spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
is_on, known_num, visited_num = trafilatura.spider.crawl_page(
0, "https://httpbun.com"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)

spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
params = spider.CrawlParameters(base_url)
params = spider.crawl_page(params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert sorted(todo) == [
"https://httpbun.com/links/2/0",
"https://httpbun.com/links/2/1",
]
assert len(known_links) == 3 and visited_num == 1
assert is_on and known_num == 3
assert params.i == 1 and params.is_on and params.known_num == 3

# initial page
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
trafilatura.spider.URL_STORE.add_urls(["https://httpbun.com/html"])
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/html"])
params = spider.CrawlParameters(base_url, lang="de")
# if LANGID_FLAG is True:
is_on, known_num, visited_num = trafilatura.spider.crawl_page(
0, "https://httpbun.com", initial=True, lang="de"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
params = spider.crawl_page(params, initial=True)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and params.i == 1
## TODO: find a better page for language tests


def test_focused_crawler():
"Test the whole focused crawler mechanism."
trafilatura.spider.URL_STORE = UrlStore()
todo, known_links = trafilatura.spider.focused_crawler(
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler(
"https://httpbun.com/links/2/2", max_seen_urls=2
)
assert len(known_links) > 0
## fails on Github Actions
# assert sorted(known_links) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1', 'https://httpbun.com/links/2/2']
# assert todo and sorted(todo)[0] == ['https://httpbun.com/links/2/0']
# assert len(todo) == 1 and todo[0].startswith('https://httpbun.com/links/2')


def test_robots():
"Test robots.txt parsing"
assert trafilatura.spider.get_rules("1234") is None
assert spider.get_rules("1234") is None

robots_url = "https://example.org/robots.txt"

assert trafilatura.spider.parse_robots(robots_url, None) is None
assert trafilatura.spider.parse_robots(robots_url, 123) is None
assert trafilatura.spider.parse_robots(robots_url, b"123") is None
assert spider.parse_robots(robots_url, None) is None
assert spider.parse_robots(robots_url, 123) is None
assert spider.parse_robots(robots_url, b"123") is None

rules = trafilatura.spider.parse_robots(robots_url, "Allow: *")
rules = spider.parse_robots(robots_url, "Allow: *")
assert rules and rules.can_fetch("*", "https://example.org/1")

rules = trafilatura.spider.parse_robots(robots_url, "User-agent: *\nDisallow: /")
rules = spider.parse_robots(robots_url, "User-agent: *\nDisallow: /")
assert rules and not rules.can_fetch("*", "https://example.org/1")

rules = trafilatura.spider.parse_robots(
robots_url, "User-agent: *\nDisallow: /private"
)
rules = spider.parse_robots(robots_url, "User-agent: *\nDisallow: /private")
assert rules and not rules.can_fetch("*", "https://example.org/private")
assert rules.can_fetch("*", "https://example.org/public")

rules = trafilatura.spider.parse_robots(
robots_url, "Allow: *\nUser-agent: *\nCrawl-delay: 10"
)
rules = spider.parse_robots(robots_url, "Allow: *\nUser-agent: *\nCrawl-delay: 10")
assert rules and rules.crawl_delay("*") == 10

# rules = trafilatura.spider.parse_robots(robots_url, "User-agent: *\nAllow: /public")
# rules = spider.parse_robots(robots_url, "User-agent: *\nAllow: /public")
# assert rules is not None and rules.can_fetch("*", "https://example.org/public")
# assert not rules.can_fetch("*", "https://example.org/private")
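
Taken together, the tests above imply the following call pattern for the reworked API. A minimal sketch only: the loop bound and the httpbun.com URLs (borrowed from the tests) are placeholders, and the real CLI crawler shown in the next file drives the crawl through buffered downloads instead.

from trafilatura import spider

# per-host crawl state now travels in a single CrawlParameters object
params = spider.init_crawl("https://httpbun.com/html", lang="en")

# naive page-by-page loop; crawl_page() returns the updated parameters
while params.is_on and params.i < 10:
    params = spider.crawl_page(params)

# the high-level helper keeps its two-value return
todo, known_links = spider.focused_crawler(
    "https://httpbun.com/links/2/2", max_seen_urls=2
)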

19 changes: 8 additions & 11 deletions trafilatura/cli_utils.py
@@ -301,39 +301,36 @@ def cli_crawler(args, n=30, url_store=None, options=None):
if not options:
options = args_to_extractor(args)
sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
# counter = None
param_dict = {}

# load input URLs
if url_store is None:
spider.URL_STORE.add_urls(load_input_urls(args))
else:
spider.URL_STORE = url_store

# load crawl data
for hostname in spider.URL_STORE.get_known_domains():
if spider.URL_STORE.urldict[hostname].tuples:
startpage = spider.URL_STORE.get_url(hostname, as_visited=False)
# base_url, i, known_num, rules, is_on
_ = spider.init_crawl(startpage, None, set(), language=args.target_language)
param_dict[hostname] = spider.init_crawl(startpage, lang=args.target_language)
# update info
# TODO: register changes?
# if base_url != hostname:
# ...

# iterate until the threshold is reached
while spider.URL_STORE.done is False:
bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time)
# start several threads
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False, options=options):
base_url = get_base_url(url)
# handle result
if result is not None:
spider.process_response(result, base_url, args.target_language, rules=spider.URL_STORE.get_rules(base_url))
# just in case a crawl delay is specified in robots.txt
# sleep(spider.get_crawl_delay(spider.URL_STORE.get_rules(base_url)))
base_url = get_base_url(url)
spider.process_response(result, param_dict[base_url])
# early exit if maximum count is reached
if any(c >= n for c in spider.URL_STORE.get_all_counts()):
break
# print results

print('\n'.join(u for u in spider.URL_STORE.dump_urls()))
#return todo, known_links


def probe_homepage(args):

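For reference, the rewritten cli_crawler loop boils down to roughly the following once threading and configuration handling are stripped out. A sketch under assumptions: the seed URL, sleep time, thread count, and page limit are placeholder values; the function and attribute names are taken from the diff above.

from courlan import get_base_url
from trafilatura import spider
from trafilatura.downloads import buffered_downloads, load_download_buffer

spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])  # example seed

# one CrawlParameters object per known host, created by init_crawl
param_dict = {
    host: spider.init_crawl(spider.URL_STORE.get_url(host, as_visited=False))
    for host in spider.URL_STORE.get_known_domains()
}

while not spider.URL_STORE.done:
    bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time=5.0)
    for url, result in buffered_downloads(bufferlist, 4, decode=False):
        if result is not None:
            spider.process_response(result, param_dict[get_base_url(url)])
    # early exit once any host reaches the page limit
    if any(c >= 30 for c in spider.URL_STORE.get_all_counts()):
        break

print("\n".join(spider.URL_STORE.dump_urls()))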