
prepare new version: 1.2.0
adbar committed Mar 7, 2022
1 parent b877cac commit daf5d8d
Showing 10 changed files with 321 additions and 322 deletions.
HISTORY.md (6 changes: 6 additions & 0 deletions)
@@ -1,5 +1,11 @@
## History / Changelog

+### 1.2.0
+- efficiency: replaced module readability-lxml by a trimmed fork
+- bugs fixed (#179, #180, #183, #184)
+- improved baseline extraction
+- cleaner metadata (with @felipehertzer)
+

### 1.1.0
- encodings: better detection, output NFC-normalized Unicode
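The "improved baseline extraction" entry in the 1.2.0 changelog above refers to trafilatura's last-resort extraction pass, which pulls text directly from paragraph-like nodes when the main algorithm yields nothing. A minimal sketch, not part of this commit, assuming the baseline() helper is exposed at the package level as in the documented API and returns a (body element, text, length) tuple:

import trafilatura

# a toy document; any HTML string should work here
html = "<html><body><article><p>Here is the main text of the page.</p></article></body></html>"
body_elem, text, text_len = trafilatura.baseline(html)
print(text, text_len)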
MANIFEST.in (15 changes: 7 additions & 8 deletions)
@@ -1,17 +1,16 @@
include CITATION.cff CONTRIBUTING.md HISTORY.md README.rst LICENSE pytest.ini
graft trafilatura/data/
include trafilatura/settings.cfg

include tests/__init__.py
include tests/*test*.py
include tests/eval-requirements.txt tests/README.rst
graft tests/cache/
graft tests/resources/
exclude tests/realworld_tests.py
recursive-exclude tests/cache/

recursive-exclude * __pycache__
recursive-exclude * *.py[co]

recursive-include conf.py Makefile make.bat *.jpg *.png

recursive-include docs/ conf.py Makefile make.bat *.rst *.gif *.jpg *.png
include docs/requirements.txt
recursive-include docs *.rst *.gif *.jpg *.png
recursive-include docs/_build/ *.gif *.jpg *.png

recursive-exclude * __pycache__
recursive-exclude * *.py[co]
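Among other things, the manifest above swaps the grafted tests/cache/ directory for tests/resources/. A quick way to sanity-check the packaged source distribution, as a sketch only (it assumes an sdist such as dist/trafilatura-1.2.0.tar.gz has already been built with the standard build tooling):

import tarfile

# hypothetical path to a freshly built source distribution
with tarfile.open("dist/trafilatura-1.2.0.tar.gz") as sdist:
    names = sdist.getnames()

# the grafted test resources should be packaged, the old cache directory should not
assert any("tests/resources/" in name for name in names)
assert not any("tests/cache/" in name for name in names)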
docs/index.rst (2 changes: 1 addition & 1 deletion)
@@ -39,7 +39,7 @@ Description

Distinguishing between a whole page and the page's essential parts can help to alleviate many quality problems related to web text processing, by dealing with the noise caused by recurring elements (headers and footers, ads, links/blogroll, etc.).

-The extractor aims to be precise enough in order not to miss texts or discard valid documents. In addition, it must be robust and reasonably fast. With these objectives in mind, it is designed to run in production on millions of web documents. It is based on `lxml <http://lxml.de/>`_ as well as `readability <https://github.com/buriy/python-readability>`_ and `jusText <http://corpus.tools/wiki/Justext>`_ used as fallback.
+The extractor aims to be precise enough in order not to miss texts or discard valid documents. In addition, it must be robust and reasonably fast. With these objectives in mind, it is designed to run in production on millions of web documents. It is based on `lxml <http://lxml.de/>`_ and on generic algorithms used as fallback (`jusText <http://corpus.tools/wiki/Justext>`_ and a fork of readability-lxml).

The intended audience encompasses disciplines where collecting web pages represents an important step for data collection, notably linguistics, natural language processing and social sciences. In general, it is relevant for anyone interested in gathering texts from the Web, e.g. web crawling and scraping-intensive fields like information security and search engine optimization.

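As a concrete illustration of the description above (not part of this diff), a minimal usage sketch assuming the documented top-level helpers fetch_url() and extract(); the no_fallback flag is assumed here to switch off the generic fallback algorithms:

import trafilatura

# hypothetical URL; any article-like page works
downloaded = trafilatura.fetch_url("https://example.org/article")
if downloaded is not None:
    # default run: main extractor, with jusText and the readability fork as fallbacks
    text = trafilatura.extract(downloaded)
    # skipping the fallbacks trades some accuracy for speed
    fast_text = trafilatura.extract(downloaded, no_fallback=True)
    print(text)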
tests/eval-requirements.txt (4 changes: 2 additions & 2 deletions)
@@ -1,7 +1,7 @@
-trafilatura==1.0.0
+trafilatura==1.2.0

# alternatives
-boilerpy3==1.0.5
+boilerpy3==1.0.6
dragnet==2.0.4
goose3==3.1.11
html2text==2020.1.16
tests/metadata_tests.py (305 changes: 0 additions & 305 deletions)

Large diffs are not rendered by default.

tests/realworld_tests.py (303 changes: 301 additions & 2 deletions)

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
tests/unit_tests.py (6 changes: 3 additions & 3 deletions)
@@ -54,12 +54,12 @@
def load_mock_page(url, xml_flag=False, langcheck=None, tei_output=False):
    '''load mock page from samples'''
    try:
-        with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]), 'r') as inputf:
+        with open(os.path.join(TEST_DIR, 'resources', MOCK_PAGES[url]), 'r') as inputf:
            htmlstring = inputf.read()
    # encoding/windows fix for the tests
    except UnicodeDecodeError:
        # read as binary
-        with open(os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url]), 'rb') as inputf:
+        with open(os.path.join(TEST_DIR, 'resources', MOCK_PAGES[url]), 'rb') as inputf:
            htmlbinary = inputf.read()
        guessed_encoding = detect(htmlbinary)['encoding']
        if guessed_encoding is not None:
@@ -151,7 +151,7 @@ def test_exotic_tags(xmloutput=False):
    # cover some edge cases with a specially crafted file
    result = load_mock_page('http://exotic_tags', xml_flag=xmloutput, tei_output=True)
    assert 'Teletype text' in result and 'My new car is silver.' in result
-    filepath = os.path.join(TEST_DIR, 'cache', 'exotic_tags_tei.html')
+    filepath = os.path.join(TEST_DIR, 'resources', 'exotic_tags_tei.html')
    with open(filepath) as f:
        content = etree.fromstring(f.read())
    res = xml.check_tei(content, 'http://dummy')
trafilatura/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@
__author__ = 'Adrien Barbaresi and contributors'
__license__ = 'GNU GPL v3+'
__copyright__ = 'Copyright 2019-2022, Adrien Barbaresi'
-__version__ = '1.1.0'
+__version__ = '1.2.0'


import logging
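For completeness, the bumped version string can be checked from an installed copy; a trivial sanity check assuming this release is installed:

import trafilatura

# should print 1.2.0 for this release
print(trafilatura.__version__)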
