CLI: review code, add types and tests (#677)
* CLI: review code and add types

* fix download tests

* lint and add test

* typing and formatting

* add tests

* add verbosity test

* redo test
adbar authored Aug 16, 2024
1 parent b538002 commit 49fbc15
Showing 6 changed files with 242 additions and 164 deletions.
19 changes: 13 additions & 6 deletions tests/cli_tests.py
@@ -12,20 +12,22 @@
 from contextlib import redirect_stdout
 from datetime import datetime
 from os import path
+from tempfile import gettempdir
 from unittest.mock import patch

 import pytest

 from courlan import UrlStore

-from trafilatura import cli, cli_utils, spider  # settings
+from trafilatura import cli, cli_utils, spider, settings
 from trafilatura.downloads import add_to_compressed_dict, fetch_url
-from trafilatura.settings import args_to_extractor
 from trafilatura.utils import LANGID_FLAG

 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 RESOURCES_DIR = path.join(path.abspath(path.dirname(__file__)), "resources")

+settings.MAX_FILES_PER_DIRECTORY = 1
+

 def test_parser():
     """test argument parsing for the command-line interface"""
@@ -84,6 +86,7 @@ def test_parser():
         "--url-filter",
         "test1",
         "test2",
+        "-vvv",
     ]
     with patch.object(sys, "argv", testargs):
         args = cli.parse_args(testargs)
@@ -181,6 +184,7 @@ def test_input_type():
     with open(testfile, "rb") as f:
         teststring = f.read(1024)
     assert cli.examine(teststring, args) is None
+    assert cli.examine([1, 2, 3], args) is None
     testfile = "docs/usage.rst"
     with open(testfile, "r", encoding="utf-8") as f:
         teststring = f.read()
@@ -232,10 +236,13 @@ def test_sysoutput():
         args = cli.parse_args(testargs)
     result = "DADIDA"
     cli_utils.write_result(result, args)
+    args.output_dir = gettempdir()
+    args.backup_dir = None
+    cli_utils.write_result(result, args)
     # process with backup directory and no counter
-    options = args_to_extractor(args)
+    options = settings.args_to_extractor(args)
     assert options.format == "markdown" and options.formatting is True
-    assert cli_utils.process_result("DADIDA", args, None, options) is None
+    assert cli_utils.process_result("DADIDA", args, -1, options) == -1
     # test keeping dir structure
     testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
     with patch.object(sys, "argv", testargs):
@@ -404,7 +411,7 @@ def test_file_processing():
     # test manually
     for f in cli_utils.generate_filelist(args.input_dir):
         cli_utils.file_processing(f, args)
-    options = args_to_extractor(args)
+    options = settings.args_to_extractor(args)
     args.output_dir = "/dev/null"
     for f in cli_utils.generate_filelist(args.input_dir):
         cli_utils.file_processing(f, args, options=options)
@@ -420,7 +427,7 @@ def test_cli_config_file():
     ) as f:
         teststring = f.read()
     args.config_file = path.join(RESOURCES_DIR, args.config_file)
-    options = args_to_extractor(args)
+    options = settings.args_to_extractor(args)
     assert cli.examine(teststring, args, options=options) is None


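The updated assertions in test_sysoutput above pin down a convention change: process_result now passes the file counter through and returns it instead of None, with -1 acting as the "no counter" value. A minimal standalone sketch of that convention (the function below is illustrative, with a simplified signature, not trafilatura's actual implementation):

    def process_result(result: str, counter: int) -> int:
        """Sketch: handle one result, return the updated counter (-1 means no counter)."""
        if counter >= 0:
            counter += 1  # a real counter advances with each processed document
        print(f"{counter}: {result}")  # stand-in for the actual file write
        return counter

    assert process_result("DADIDA", -1) == -1  # mirrors the new test assertion
    assert process_result("DADIDA", 0) == 1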
4 changes: 2 additions & 2 deletions tests/downloads_tests.py
@@ -233,8 +233,8 @@ def test_queue():
     args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
     options = args_to_extractor(args)
     options.config['DEFAULT']['SLEEP_TIME'] = '0.2'
-    results = download_queue_processing(url_store, args, None, options)
-    assert len(results[0]) == 5 and results[1] is None
+    results = download_queue_processing(url_store, args, -1, options)
+    assert len(results[0]) == 5 and results[1] == -1


 if __name__ == '__main__':
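The queue test above throttles batch downloads by overriding SLEEP_TIME in the parsed configuration. A rough sketch of that pattern using only the standard library (the throttled_fetch helper is hypothetical, not trafilatura's download code):

    from configparser import ConfigParser
    from time import sleep

    config = ConfigParser()
    config["DEFAULT"]["SLEEP_TIME"] = "0.2"  # same key the test overrides

    def throttled_fetch(urls):
        """Fetch URLs one by one, pausing SLEEP_TIME seconds between requests."""
        pause = config.getfloat("DEFAULT", "SLEEP_TIME")
        for url in urls:
            print("fetching", url)  # stand-in for the real download call
            sleep(pause)

    throttled_fetch(["https://example.org/1", "https://example.org/2"])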
28 changes: 15 additions & 13 deletions trafilatura/cli.py
@@ -12,6 +12,7 @@
 from importlib_metadata import version

 from platform import python_version
+from typing import Any

 from .cli_utils import (cli_crawler, cli_discovery, examine,
                         file_processing_pipeline, load_blacklist,
@@ -34,7 +35,7 @@
 sys.stderr = codecs.getwriter('utf-8')(sys.stderr.buffer, 'strict')


-def add_args(parser):
+def add_args(parser: Any) -> Any:
     "Add argument groups and arguments to parser."

     group1 = parser.add_argument_group('Input', 'URLs, files or directories to process')
@@ -202,15 +203,15 @@ def add_args(parser):
     return parser


-def parse_args(args):
+def parse_args(args: Any) -> Any:
     """Define parser for command-line arguments"""
     parser = argparse.ArgumentParser(description='Command-line interface for Trafilatura')
     parser = add_args(parser)
     # wrap in mapping to prevent invalid input
     return map_args(parser.parse_args())


-def map_args(args):
+def map_args(args: Any) -> Any:
     '''Map existing options to format and output choices.'''
     # formats
     for otype in ("csv", "html", "json", "markdown", "xml", "xmltei"):
@@ -249,32 +250,28 @@ def map_args(args):
     return args


-def main():
+def main() -> None:
     """ Run as a command-line utility. """
     args = parse_args(sys.argv[1:])
     process_args(args)


-def process_args(args):
+def process_args(args: Any) -> None:
     """Perform the actual processing according to the arguments"""
-    # init
     error_caught = False
-    # verbosity
+
     if args.verbose == 1:
         logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
     elif args.verbose >= 2:
         logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

     if args.blacklist:
         args.blacklist = load_blacklist(args.blacklist)

     # processing according to mutually exclusive options
-    # read url list from input file
-    if args.input_file and all([not args.crawl, not args.explore, not args.feed, not args.probe, not args.sitemap]):
-        url_store = load_input_dict(args)
-        error_caught = url_processing_pipeline(args, url_store)
+
     # fetch urls from a feed or a sitemap
-    elif args.explore or args.feed or args.sitemap:
+    if args.explore or args.feed or args.sitemap:
         cli_discovery(args)

     # activate crawler/spider
@@ -289,6 +286,11 @@ def process_args(args):
     elif args.input_dir:
         file_processing_pipeline(args)

+    # read url list from input file
+    elif args.input_file:
+        url_store = load_input_dict(args)
+        error_caught = url_processing_pipeline(args, url_store)
+
     # process input URL
     elif args.URL:
         url_store = load_input_dict(args)
@@ -300,7 +302,7 @@
         write_result(result, args)

     # change exit code if there are errors
-    if error_caught is True:
+    if error_caught:
         sys.exit(1)


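The verbosity handling in process_args maps stacked -v flags onto logging levels: a single -v raises logging to WARNING, two or more (such as the -vvv exercised by the new test) enable DEBUG. A self-contained sketch of the same pattern, assuming the flag is defined with argparse's count action, which is what the stacked -vvv implies:

    import argparse
    import logging
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args(["-vvv"])  # stacked flags, as in the new CLI test

    if args.verbose == 1:
        logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    elif args.verbose >= 2:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)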
(3 more changed files not shown)
