crawler: add parameter class, typing, and improve code (#675)
* crawler: add params class

* fix tests

* remove failing test

* review code

* improve code and tests

* simplify code and improve class

* fix test

* fix types and simplify
adbar authored Aug 16, 2024
1 parent 0fb6e43 commit b538002
Showing 4 changed files with 241 additions and 236 deletions.
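
Note: the assertions in the new tests below (start, base, ref, lang, rules, i, known_num, is_on) suggest the added parameter object looks roughly like the following sketch. It is a reconstruction for orientation only, not the class actually committed to trafilatura/spider.py, and the base/ref derivation in particular is an assumption.

from dataclasses import dataclass, field
from typing import Optional
from urllib.robotparser import RobotFileParser

from courlan import get_base_url


@dataclass
class CrawlParameters:
    "Hypothetical reconstruction of the new parameter class, inferred from the tests."
    start: str                                # seed URL handed to the crawler
    lang: Optional[str] = None                # optional target language, e.g. "en"
    rules: Optional[RobotFileParser] = None   # robots.txt rules, filled in later
    i: int = 0                                # pages visited so far
    known_num: int = 0                        # known URLs for this host
    is_on: bool = True                        # whether the crawl should continue
    base: str = field(init=False)             # host-level base URL, derived below
    ref: str = field(init=False)              # path prefix used for scope checks

    def __post_init__(self) -> None:
        # The tests expect start == base == ref for a bare homepage URL,
        # so both are presumably derived from the seed (assumed logic).
        self.base = get_base_url(self.start)  # e.g. "https://httpbun.com"
        self.ref = self.start.rstrip("/")
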
206 changes: 96 additions & 110 deletions tests/spider_tests.py
@@ -12,7 +12,7 @@

from courlan import UrlStore

import trafilatura.spider # for global variables
from trafilatura import spider # for global variables

# from trafilatura.utils import LANGID_FLAG

@@ -22,13 +22,13 @@

def test_redirections():
"Test redirection detection."
_, _, baseurl = trafilatura.spider.probe_alternative_homepage("xyz")
_, _, baseurl = spider.probe_alternative_homepage("xyz")
assert baseurl is None
_, _, baseurl = trafilatura.spider.probe_alternative_homepage(
_, _, baseurl = spider.probe_alternative_homepage(
"https://httpbun.com/redirect-to?url=https://example.org"
)
assert baseurl == "https://example.org"
# _, _, baseurl = trafilatura.spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')
# _, _, baseurl = spider.probe_alternative_homepage('https://httpbin.org/redirect-to?url=https%3A%2F%2Fhttpbin.org%2Fhtml&status_code=302')


def test_meta_redirections():
@@ -63,9 +63,7 @@ def test_meta_redirections():
]

for htmlstring, homepage, expected_homepage in tests:
htmlstring2, homepage2 = trafilatura.spider.refresh_detection(
htmlstring, homepage
)
htmlstring2, homepage2 = spider.refresh_detection(htmlstring, homepage)
assert homepage2 == expected_homepage
if expected_homepage:
if expected_homepage == homepage:
@@ -77,211 +75,199 @@ def test_process_links():
def test_process_links():
"Test link extraction procedures."
base_url = "https://example.org"
params = spider.CrawlParameters(base_url)
htmlstring = '<html><body><a href="https://example.org/page1"/><a href="https://example.org/page1/"/><a href="https://test.org/page1"/></body></html>'

# 1 internal link in total
trafilatura.spider.process_links(htmlstring, base_url)
assert len(trafilatura.spider.URL_STORE.find_known_urls(base_url)) == 1
assert len(trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)) == 1
spider.process_links(htmlstring, params)
assert len(spider.URL_STORE.find_known_urls(base_url)) == 1
assert len(spider.URL_STORE.find_unvisited_urls(base_url)) == 1

# same with content already seen
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert (
len(trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)) == 1
and len(trafilatura.spider.URL_STORE.find_known_urls(base_url)) == 1
len(spider.URL_STORE.find_unvisited_urls(base_url)) == 1
and len(spider.URL_STORE.find_known_urls(base_url)) == 1
)

# test navigation links
url1 = "https://example.org/tag/number1"
url2 = "https://example.org/page2"
htmlstring = f'<html><body><a href="{url1}"/><a href="{url2}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(known_links) == 3
assert len(todo) == 3 and todo[0] == url1

# test cleaning and language
url = "https://example.org/en/page1/?"
target = "https://example.org/en/page1/"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url, language="en")
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters(base_url, lang="en")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(known_links) == 4
assert len(todo) == 4 and target in todo # TODO: remove slash?

# test rejection of URLs out of scope
url = "https://example.org/section2/page2"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(
htmlstring, base_url, ref="https://example.org/section1"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters("https://example.org/section1/")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert url not in todo and len(known_links) == 4

# wrong language
url = "https://example.org/en/page2"
htmlstring = f'<html><body><a href="{url}"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url, language="de")
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
params = spider.CrawlParameters(base_url, lang="de")
spider.process_links(htmlstring, params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert url not in todo and len(known_links) == 4

# invalid links
params = spider.CrawlParameters(base_url)
htmlstring = '<html><body><a href="#anchor"/><a href="mailto:[email protected]"/><a href="tel:1234567890"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert len(known_links) == 4 and len(todo) == 4

# not crawlable
htmlstring = '<html><body><a href="https://example.org/login"/></body></html>'
trafilatura.spider.process_links(htmlstring, base_url)
spider.process_links(htmlstring, params)
assert len(known_links) == 4 and len(todo) == 4

# test queue evaluation
todo = deque()
assert trafilatura.spider.is_still_navigation(todo) is False
assert spider.is_still_navigation(todo) is False
todo.append("https://example.org/en/page1")
assert trafilatura.spider.is_still_navigation(todo) is False
assert spider.is_still_navigation(todo) is False
todo.append("https://example.org/tag/1")
assert trafilatura.spider.is_still_navigation(todo) is True
assert spider.is_still_navigation(todo) is True


def test_crawl_logic():
"Test functions related to crawling sequence and consistency."
url = "https://httpbun.com/html"
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# erroneous webpage
with pytest.raises(ValueError):
_, _, _, _, _ = trafilatura.spider.init_crawl("xyz", None, None)
assert len(trafilatura.spider.URL_STORE.urldict) == 0
params = spider.CrawlParameters("xyz")
assert len(spider.URL_STORE.urldict) == 0

# empty request
trafilatura.spider.process_response(None, "https://example.org", None)
assert len(trafilatura.spider.URL_STORE.urldict) == 0
params = spider.CrawlParameters("https://example.org")
spider.process_response(None, params)
assert len(spider.URL_STORE.urldict) == 0
assert params.start == params.base == params.ref == "https://example.org"
assert params.i == 0 and params.known_num == 0 and params.is_on
assert params.lang is None and params.rules is None

# already visited
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url,
None,
[
url,
],
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
assert base_url == "https://httpbun.com"
assert i == 0 and known_num == 1
assert not is_on
params = spider.init_crawl(url, known=[url])
assert params.base == "https://httpbun.com"
assert params.i == 0 and params.known_num == 1
assert not params.is_on
assert not spider.URL_STORE.find_unvisited_urls(params.base)
assert spider.URL_STORE.find_known_urls(params.base) == ["https://httpbun.com/html"]

# normal webpage
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url, None, None
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
spider.URL_STORE = UrlStore(compressed=False, strict=False)
params = spider.init_crawl(url)
assert (
todo == []
and known_links
== [
url,
]
and base_url == "https://httpbun.com"
and i == 1
and not is_on
not spider.URL_STORE.find_unvisited_urls(params.base)
and [url] == spider.URL_STORE.find_known_urls(params.base)
and params.base == "https://httpbun.com"
and params.i == 1
and not params.is_on
)

# delay between requests
assert trafilatura.spider.URL_STORE.get_crawl_delay("https://httpbun.com") == 5
assert (
trafilatura.spider.URL_STORE.get_crawl_delay("https://httpbun.com", default=2.0)
== 2.0
)
assert spider.URL_STORE.get_crawl_delay("https://httpbun.com") == 5
assert spider.URL_STORE.get_crawl_delay("https://httpbun.com", default=2.0) == 2.0

# existing todo
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
base_url, i, known_num, rules, is_on = trafilatura.spider.init_crawl(
url,
[
url,
],
None,
)
assert base_url == "https://httpbun.com" and i == 0 and not is_on
params = spider.init_crawl(url, todo=[url, "http://irrelevant.com"])
assert not spider.URL_STORE.find_unvisited_urls(params.base)
assert params.base == "https://httpbun.com" and params.i == 0 and not params.is_on

# new todo
params = spider.init_crawl(url, todo=["https://httpbun.com/links/1/1"])
assert params.base == "https://httpbun.com"
assert spider.URL_STORE.find_unvisited_urls(params.base) == [
"https://httpbun.com/links/1/1"
]
assert params.i == 0 and params.is_on and params.known_num == 2


def test_crawl_page():
"Test page-by-page processing."
base_url = "https://httpbun.com"
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
trafilatura.spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
is_on, known_num, visited_num = trafilatura.spider.crawl_page(
0, "https://httpbun.com"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)

spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])
params = spider.CrawlParameters(base_url)
params = spider.crawl_page(params)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert sorted(todo) == [
"https://httpbun.com/links/2/0",
"https://httpbun.com/links/2/1",
]
assert len(known_links) == 3 and visited_num == 1
assert is_on and known_num == 3
assert params.i == 1 and params.is_on and params.known_num == 3

# initial page
trafilatura.spider.URL_STORE = UrlStore(compressed=False, strict=False)
trafilatura.spider.URL_STORE.add_urls(["https://httpbun.com/html"])
spider.URL_STORE = UrlStore(compressed=False, strict=False)
spider.URL_STORE.add_urls(["https://httpbun.com/html"])
params = spider.CrawlParameters(base_url, lang="de")
# if LANGID_FLAG is True:
is_on, known_num, visited_num = trafilatura.spider.crawl_page(
0, "https://httpbun.com", initial=True, lang="de"
)
todo = trafilatura.spider.URL_STORE.find_unvisited_urls(base_url)
known_links = trafilatura.spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and visited_num == 1
params = spider.crawl_page(params, initial=True)
todo = spider.URL_STORE.find_unvisited_urls(base_url)
known_links = spider.URL_STORE.find_known_urls(base_url)
assert len(todo) == 0 and len(known_links) == 1 and params.i == 1
## TODO: find a better page for language tests


def test_focused_crawler():
"Test the whole focused crawler mechanism."
trafilatura.spider.URL_STORE = UrlStore()
todo, known_links = trafilatura.spider.focused_crawler(
spider.URL_STORE = UrlStore()
todo, known_links = spider.focused_crawler(
"https://httpbun.com/links/2/2", max_seen_urls=2
)
assert len(known_links) > 0
## fails on Github Actions
# assert sorted(known_links) == ['https://httpbun.com/links/2/0', 'https://httpbun.com/links/2/1', 'https://httpbun.com/links/2/2']
# assert todo and sorted(todo)[0] == ['https://httpbun.com/links/2/0']
# assert len(todo) == 1 and todo[0].startswith('https://httpbun.com/links/2')


def test_robots():
"Test robots.txt parsing"
assert trafilatura.spider.get_rules("1234") is None
assert spider.get_rules("1234") is None

robots_url = "https://example.org/robots.txt"

assert trafilatura.spider.parse_robots(robots_url, None) is None
assert trafilatura.spider.parse_robots(robots_url, 123) is None
assert trafilatura.spider.parse_robots(robots_url, b"123") is None
assert spider.parse_robots(robots_url, None) is None
assert spider.parse_robots(robots_url, 123) is None
assert spider.parse_robots(robots_url, b"123") is None

rules = trafilatura.spider.parse_robots(robots_url, "Allow: *")
rules = spider.parse_robots(robots_url, "Allow: *")
assert rules and rules.can_fetch("*", "https://example.org/1")

rules = trafilatura.spider.parse_robots(robots_url, "User-agent: *\nDisallow: /")
rules = spider.parse_robots(robots_url, "User-agent: *\nDisallow: /")
assert rules and not rules.can_fetch("*", "https://example.org/1")

rules = trafilatura.spider.parse_robots(
robots_url, "User-agent: *\nDisallow: /private"
)
rules = spider.parse_robots(robots_url, "User-agent: *\nDisallow: /private")
assert rules and not rules.can_fetch("*", "https://example.org/private")
assert rules.can_fetch("*", "https://example.org/public")

rules = trafilatura.spider.parse_robots(
robots_url, "Allow: *\nUser-agent: *\nCrawl-delay: 10"
)
rules = spider.parse_robots(robots_url, "Allow: *\nUser-agent: *\nCrawl-delay: 10")
assert rules and rules.crawl_delay("*") == 10

# rules = trafilatura.spider.parse_robots(robots_url, "User-agent: *\nAllow: /public")
# rules = spider.parse_robots(robots_url, "User-agent: *\nAllow: /public")
# assert rules is not None and rules.can_fetch("*", "https://example.org/public")
# assert not rules.can_fetch("*", "https://example.org/private")
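
Taken together, the tests above imply the following call pattern for the reworked API. A minimal sketch only: the loop bound and the httpbun.com URLs (borrowed from the tests) are placeholders, and the real CLI crawler shown in the next file drives the crawl through buffered downloads instead.

from trafilatura import spider

# per-host crawl state now travels in a single CrawlParameters object
params = spider.init_crawl("https://httpbun.com/html", lang="en")

# naive page-by-page loop; crawl_page() returns the updated parameters
while params.is_on and params.i < 10:
    params = spider.crawl_page(params)

# the high-level helper keeps its two-value return
todo, known_links = spider.focused_crawler(
    "https://httpbun.com/links/2/2", max_seen_urls=2
)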

19 changes: 8 additions & 11 deletions trafilatura/cli_utils.py
@@ -301,39 +301,36 @@ def cli_crawler(args, n=30, url_store=None, options=None):
if not options:
options = args_to_extractor(args)
sleep_time = options.config.getfloat('DEFAULT', 'SLEEP_TIME')
# counter = None
param_dict = {}

# load input URLs
if url_store is None:
spider.URL_STORE.add_urls(load_input_urls(args))
else:
spider.URL_STORE = url_store

# load crawl data
for hostname in spider.URL_STORE.get_known_domains():
if spider.URL_STORE.urldict[hostname].tuples:
startpage = spider.URL_STORE.get_url(hostname, as_visited=False)
# base_url, i, known_num, rules, is_on
_ = spider.init_crawl(startpage, None, set(), language=args.target_language)
param_dict[hostname] = spider.init_crawl(startpage, lang=args.target_language)
# update info
# TODO: register changes?
# if base_url != hostname:
# ...

# iterate until the threshold is reached
while spider.URL_STORE.done is False:
bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time)
# start several threads
for url, result in buffered_downloads(bufferlist, args.parallel, decode=False, options=options):
base_url = get_base_url(url)
# handle result
if result is not None:
spider.process_response(result, base_url, args.target_language, rules=spider.URL_STORE.get_rules(base_url))
# just in case a crawl delay is specified in robots.txt
# sleep(spider.get_crawl_delay(spider.URL_STORE.get_rules(base_url)))
base_url = get_base_url(url)
spider.process_response(result, param_dict[base_url])
# early exit if maximum count is reached
if any(c >= n for c in spider.URL_STORE.get_all_counts()):
break
# print results

print('\n'.join(u for u in spider.URL_STORE.dump_urls()))
#return todo, known_links


def probe_homepage(args):

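For reference, the rewritten cli_crawler loop boils down to roughly the following once threading and configuration handling are stripped out. A sketch under assumptions: the seed URL, sleep time, thread count, and page limit are placeholder values; the function and attribute names are taken from the diff above.

from courlan import get_base_url
from trafilatura import spider
from trafilatura.downloads import buffered_downloads, load_download_buffer

spider.URL_STORE.add_urls(["https://httpbun.com/links/2/2"])  # example seed

# one CrawlParameters object per known host, created by init_crawl
param_dict = {
    host: spider.init_crawl(spider.URL_STORE.get_url(host, as_visited=False))
    for host in spider.URL_STORE.get_known_domains()
}

while not spider.URL_STORE.done:
    bufferlist, spider.URL_STORE = load_download_buffer(spider.URL_STORE, sleep_time=5.0)
    for url, result in buffered_downloads(bufferlist, 4, decode=False):
        if result is not None:
            spider.process_response(result, param_dict[get_base_url(url)])
    # early exit once any host reaches the page limit
    if any(c >= 30 for c in spider.URL_STORE.get_all_counts()):
        break

print("\n".join(spider.URL_STORE.dump_urls()))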