Skip to content

Commit

Permalink
tests: improve coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 24, 2022
1 parent 93efc32 commit 040d443
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 14 deletions.
1 change: 1 addition & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ exclude_lines =
pragma: no cover
if __name__ == .__main__.:
except .*UnicodeDecodeError.*:
except .*urllib3.exceptions.*:
12 changes: 10 additions & 2 deletions tests/downloads_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from collections import deque
from datetime import datetime
from unittest.mock import patch
from unittest.mock import Mock, patch

from trafilatura.cli import parse_args
from trafilatura.cli_utils import download_queue_processing, url_processing_pipeline
Expand Down Expand Up @@ -66,6 +66,10 @@ def test_fetch():
response = _send_request(url, False, DEFAULT_CONFIG)
myobject = _handle_response(url, response, False, DEFAULT_CONFIG)
assert myobject.data.startswith(b'<h1>Unicode Demo</h1>')
# too large response object
mock = Mock()
mock.data = (b' '*10000000)
assert _handle_response(url, mock, False, DEFAULT_CONFIG) is None
# straight handling of response object
assert load_html(response) is not None
# nothing to see here
Expand Down Expand Up @@ -95,12 +99,16 @@ def test_config():
def test_decode():
'''Test how responses are being decoded.'''
# Raw bytes input: the payload opens with the gzip magic number (0x1f 0x8b),
# but 'abcdef' is not a valid gzip stream. The assertion only requires a
# non-None result for such input.
assert decode_response(b'\x1f\x8babcdef') is not None
# NOTE(review): identical assertion repeated -- this looks like the removed
# and re-added versions of one line in the diff rendering; confirm against
# the actual file before relying on it.
assert decode_response(b'\x1f\x8babcdef') is not None
# Object input: a Mock on which only the `data` attribute (bytes) is set,
# passed in place of raw bytes.
mock = Mock()
mock.data = (b' ')
assert decode_response(mock) is not None


def test_queue():
'Test creation, modification and download of URL queues.'
# test conversion and storage
inputdict = add_to_compressed_dict(['ftps://www.example.org/'])
inputdict = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
assert inputdict == dict()
inputdict = add_to_compressed_dict(['https://www.example.org/'])
# CLI args
Expand Down
6 changes: 6 additions & 0 deletions tests/metadata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from lxml import html

from trafilatura.json_metadata import normalize_json
from trafilatura.metadata import extract_metadata, METADATA_LIST, extract_meta_json
from trafilatura.utils import normalize_authors

Expand Down Expand Up @@ -125,6 +126,7 @@ def test_authors():
assert normalize_authors(None, 'abc') == 'Abc'
assert normalize_authors(None, 'Steve Steve 123') == 'Steve Steve'
assert normalize_authors(None, 'By Steve Steve') == 'Steve Steve'
assert normalize_json('Test \\nthis') == 'Test this'
# extraction
metadata = extract_metadata('<html><head><meta itemprop="author" content="Jenny Smith"/></head><body></body></html>')
assert metadata['author'] == 'Jenny Smith'
Expand Down Expand Up @@ -264,6 +266,10 @@ def test_meta():
assert extract_metadata('') is None
metadata = extract_metadata('<html><title></title></html>')
assert metadata['sitename'] is None
metadata = extract_metadata('<html><head><title>' + 'AAA'*10000 + '</title></head></html>')
assert metadata['title'].endswith('…') and len(metadata['title']) == 10000
assert extract_metadata('<html><head><meta otherkey="example" content="Unknown text"/></head></html>') is not None
assert extract_metadata('<html><head><title></title><title></title><title></title></head></html>') is not None


def test_catstags():
Expand Down
40 changes: 32 additions & 8 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def test_txttocsv():
# test json
result = extract(mystring, output_format='json', config=ZERO_CONFIG)
assert result.endswith('}') and '"fingerprint":' in result
assert extract(mystring, output_format='json', include_comments=False, config=ZERO_CONFIG).endswith('}')
# bare extraction for python
result = bare_extraction(mystring, config=ZERO_CONFIG)
assert isinstance(result, dict) and len(result) == 14
Expand Down Expand Up @@ -238,7 +239,12 @@ def test_lrucache():

def test_formatting():
'''Test HTML formatting conversion and extraction'''
# simple
# trailing <lb>
my_document = html.fromstring('<html><body><p>This here is the text.</p><br/></body></html>')
my_result = extract(my_document, output_format='xml', config=ZERO_CONFIG)
assert 'lb' not in my_result

# simple formatting
my_document = html.fromstring('<html><body><p><b>This here is in bold font.</b></p></body></html>')
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
Expand Down Expand Up @@ -300,10 +306,9 @@ def test_formatting():

# double <p>-elems
# could be solved by keeping the elements instead of reconstructing them
#my_document = html.fromstring('<html><body><p>AAA, <p>BBB</p>, CCC.</p></body></html>')
#my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
#print(my_result)
#assert 1 == 0
my_document = html.fromstring('<html><body><p>AAA, <p>BBB</p>, CCC.</p></body></html>')
my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
assert 'AAA' in my_result and 'BBB' in my_result and 'CCC' in my_result

# line-break following formatting
my_document = html.fromstring('<html><body><article><p><strong>Staff Review of the Financial Situation</strong><br>Domestic financial conditions remained accommodative over the intermeeting period.</p></article></body></html>')
Expand Down Expand Up @@ -379,7 +384,7 @@ def test_filters():
# text + lang
my_p = '<p>In sleep a king, but waking no such matter.</p>'
assert extract(html.fromstring('<html lang="en-US"><body>' + my_p*50 + '</body></html>'), target_language='en') is not None
#assert extract(html.fromstring('<html lang="en-US"><body>' + my_p*50 + '</body></html>'), target_language='de') is None
assert extract(html.fromstring('<html lang="en-US"><body>' + my_p*50 + '</body></html>'), target_language='de') is None
assert check_html_lang(html.fromstring('<html lang="de_DE, en_US"><body></body></html>'), target_language='de') is True
assert check_html_lang(html.fromstring('<html lang="de_DE, en_US"><body></body></html>'), target_language='en') is True
assert check_html_lang(html.fromstring('<html lang="de_DE, en_US"><body></body></html>'), target_language='de', strict=True) is True
Expand Down Expand Up @@ -430,11 +435,14 @@ def test_external():

def test_images():
'''Test image extraction function'''
# file type
assert utils.is_image_file('test.jpg') is True
assert utils.is_image_file('test.txt') is False
# tag with attributes
assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
assert utils.is_image_file('test.jpg') is True
assert utils.is_image_file('test.txt') is False
# HTML conversion
assert handle_textelem(etree.Element('graphic'), [], False, DEFAULT_CONFIG) is None
with open(os.path.join(RESOURCES_DIR, 'http_sample.html')) as f:
teststring = f.read()
Expand Down Expand Up @@ -508,6 +516,9 @@ def test_tei():
docmeta['title'] = 'Title'
assert xml.write_fullheader(header, docmeta) is not None
docmeta['sitename'] = 'Site Name'
docmeta['date'] = '2021-01-01'
assert xml.write_fullheader(header, docmeta) is not None
docmeta['date'] = None
assert xml.write_fullheader(header, docmeta) is not None
docmeta['hostname'] = 'hostname'
assert xml.write_fullheader(header, docmeta) is not None
Expand Down Expand Up @@ -547,6 +558,16 @@ def test_htmlprocessing():
assert b'<p>A B tail C</p>' in etree.tostring(mydoc)


def test_extraction_options():
    '''Check how extract() reacts to metadata and language options on one small document.'''
    # the fixture markup (including the <div="article-body"> element) is kept verbatim
    document = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.</p></div></body></html>'

    # plain call: some output is expected
    plain_result = extract(document, config=ZERO_CONFIG)
    assert plain_result is not None

    # each of these option sets is expected to yield no output at all
    # (presumably: no usable metadata in the document, and its declared
    # language is EN rather than 'de' -- confirm against extract())
    rejected_options = (
        {'with_metadata': True, 'output_format': 'xml'},
        {'only_with_metadata': True, 'output_format': 'xml'},
        {'target_language': 'de'},
    )
    for options in rejected_options:
        assert extract(document, config=ZERO_CONFIG, **options) is None
    # disabled check, kept for reference:
    # assert extract(document) is None


def test_precision_recall():
'''test precision- and recall-oriented settings'''
# the test cases could be better
Expand All @@ -556,6 +577,8 @@ def test_precision_recall():
my_document = html.fromstring('<html><body><div class="article-body"><div class="teaser-content"><p>This here is a teaser text.</p></div><p>This here is the text.</p></div></body></html>')
assert 'teaser text' in extract(my_document, favor_recall=True, config=ZERO_CONFIG)
assert 'teaser text' not in extract(my_document, config=ZERO_CONFIG)
assert 'teaser text' not in extract(my_document, favor_precision=True, config=ZERO_CONFIG)



if __name__ == '__main__':
Expand All @@ -567,6 +590,7 @@ def test_precision_recall():
test_images()
test_links()
test_htmlprocessing()
test_extraction_options()
test_precision_recall()
test_filters()
test_baseline()
Expand Down
4 changes: 0 additions & 4 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,6 @@ def handle_paragraphs(element, potential_tags, dedupbool, config):
elif child.tag == 'ref':
if child.get('target') is not None:
newsub.set('target', child.get('target'))
# to be removed after thorough testing
elif child.get('href') is not None:
newsub.set('target', child.get('href'))
# del processed_child.attrib['href']
# handle line breaks
# elif processed_child.tag == 'lb':
# try:
Expand Down

0 comments on commit 040d443

Please sign in to comment.