Skip to content

Commit

Permalink
fix: language detection module used by spider
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 24, 2022
1 parent 040d443 commit 188f06c
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion tests/spider_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

# language detection
try:
-import cld3
+import py3langid
LANGID_FLAG = True
except ImportError:
LANGID_FLAG = False
Expand Down
7 changes: 4 additions & 3 deletions trafilatura/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

# language detection
try:
-import cld3
+import py3langid
LANGID_FLAG = True
except ImportError:
LANGID_FLAG = False
Expand Down Expand Up @@ -98,8 +98,9 @@ def find_new_links(htmlstring, base_url, known_links, language=None, rules=None)
# optional language check: run baseline extraction + language identifier
if language is not None and LANGID_FLAG is True:
_, text, _ = baseline(htmlstring)
-result = cld3.get_language(text)
-if result is not None and result.language != language:
+result, _ = py3langid.classify(text)
+
+if result != language:
return new_links, known_links
# iterate through the links and filter them
for link in extract_links(htmlstring, base_url, False, language=language, with_nav=True):
Expand Down

0 comments on commit 188f06c

Please sign in to comment.