basic JSON & JUNIT reporters #22

Open · wants to merge 13 commits into master
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
*.pyc
*.json
*.xml
pylinkvalidator.egg-info/
dist/
build/
.idea/
2 changes: 1 addition & 1 deletion README.rst
@@ -171,7 +171,7 @@ usage examples.
These options change the output of the crawler.

  -f FORMAT, --format=FORMAT
                        Format of the report: plain (default)
                        Format of the report: plain (default), json, junit
  -o OUTPUT, --output=OUTPUT
                        Path of the file where the report will be printed.
  -W WHEN, --when=WHEN
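
Once merged, the new formats are selected with the existing -f/--format and -o/--output flags. A hypothetical invocation (pylinkvalidate.py is the project's console script; the URL and output paths are placeholders):

    pylinkvalidate.py -f json -o report.json http://example.com/
    pylinkvalidate.py -f junit -o report.xml http://example.com/
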
6 changes: 4 additions & 2 deletions pylinkvalidator/crawler.py
@@ -525,6 +525,7 @@ def _get_links(self, elements, attribute, base_url_split,
        for element in elements:
            if attribute in element.attrs:
                url = element[attribute]
                target = element.attrs.get('target', None)

                if not self.worker_config.strict_mode:
                    url = url.strip()
@@ -540,7 +541,7 @@
                link = Link(
                    type=unicode(element.name), url_split=abs_url_split,
                    original_url_split=original_url_split,
                    source_str=unicode(element))
                    source_str=unicode(element), target=target)
                links.append(link)

        return links
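
As a sanity check on the new target extraction, a minimal BeautifulSoup 4 sketch (bs4 is already a dependency; the HTML snippet is made up):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        "<a href='/about' target='_blank'>About</a>", "html.parser")
    element = soup.find("a")
    print(element["href"])                    # /about
    print(element.attrs.get('target', None))  # _blank; None when absent
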
@@ -658,7 +659,8 @@ def process_links(self, page_crawl):
                continue

            page_status = self.page_statuses.get(url_split, None)
            page_source = PageSource(source_url_split, link.source_str)
            page_source = PageSource(
                source_url_split, link.source_str, link.target)

            if not page_status:
                # We never encountered this url before
10 changes: 6 additions & 4 deletions pylinkvalidator/models.py
@@ -81,6 +81,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
FORMAT_PLAIN = "plain"
FORMAT_HTML = "html"
FORMAT_JSON = "json"
FORMAT_JUNIT = "junit"


WHEN_ALWAYS = "always"
@@ -133,7 +134,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):

Link = namedtuple_with_defaults(
    "Link",
    ["type", "url_split", "original_url_split", "source_str"])
    ["type", "url_split", "original_url_split", "source_str", "target"])


PageCrawl = namedtuple_with_defaults(
@@ -149,7 +150,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):


PageSource = namedtuple_with_defaults(
    "PageSource", ["origin", "origin_str"])
    "PageSource", ["origin", "origin_str", "target"])


ContentCheck = namedtuple_with_defaults(
@@ -582,8 +583,9 @@ def _build_parser(self):

        output_group.add_option(
            "-f", "--format", dest="format", action="store",
            default=FORMAT_PLAIN, choices=[FORMAT_PLAIN],
            help="Format of the report: plain")
            default=FORMAT_PLAIN,
            choices=[FORMAT_PLAIN, FORMAT_JSON, FORMAT_JUNIT],
            help="Format of the report: plain (default), json, junit")
        output_group.add_option(
            "-o", "--output", dest="output", action="store",
            default=None,
163 changes: 159 additions & 4 deletions pylinkvalidator/reporter.py
@@ -4,15 +4,23 @@
from __future__ import unicode_literals, absolute_import, print_function

import codecs
import json
import re
import smtplib
import sys

from email.mime.text import MIMEText

from junit_xml import TestSuite, TestCase

from pylinkvalidator.compat import StringIO
from pylinkvalidator.models import (
    REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN)
    FORMAT_JSON,
    FORMAT_JUNIT,
    FORMAT_PLAIN,
    REPORT_TYPE_ALL,
    REPORT_TYPE_ERRORS,
)


PLAIN_TEXT = "text/plain"
@@ -54,6 +62,10 @@ def report(site, config, total_time, logger=None):
    try:
        if config.options.format == FORMAT_PLAIN:
            _write_plain_text_report(site, config, output_files, total_time)
        elif config.options.format == FORMAT_JSON:
            _write_json_report(site, config, output_file, total_time)
        elif config.options.format == FORMAT_JUNIT:
            _write_junit_report(site, config, output_file, total_time)
    except Exception:
        if logger:
            logger.exception("An exception occurred while writing the report")
@@ -72,6 +84,113 @@ def _write_plain_text_report(site, config, output_files, total_time):
        _write_plain_text_report_single(site, config, output_files, total_time)


def _write_junit_report(site, config, output_file, total_time):
    pages = site.pages
    test_cases = []

    for results, resource in pages.items():
        origins = [source.origin.geturl() for source in resource.sources]
        if resource.status == 200:
            test_case = TestCase(
                name=resource.url_split.geturl(),
                classname=results.hostname,
                elapsed_sec=resource.response_time,
                stdout=resource.status,
                status="passed"
            )
        else:
            stderr_message = "Link found on:\n{}".format("\n".join(origins))
            test_case = TestCase(
                name=resource.url_split.geturl(),
                classname=results.hostname,
                elapsed_sec=resource.response_time,
                stderr=stderr_message,
                status="failed"
            )
            if resource.exception:
                message = str(resource.exception)
            else:
                message = "Expected 200 OK but got {}".format(resource.status)
            test_case.add_failure_info(
                message=message, failure_type="UnexpectedStatusCode")
        test_cases.append(test_case)

    test_suite = TestSuite("pylinkvalidator test suite", test_cases)
    output_file.write(TestSuite.to_xml_string([test_suite]))
    print_summary(site, config, total_time)
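
For reference, a standalone sketch of the junit-xml calls used above (junit-xml>=1.8 per requirements.txt; the URL and timing are invented):

    from junit_xml import TestCase, TestSuite

    case = TestCase(name="http://example.com/missing", classname="example.com",
                    elapsed_sec=0.2, status="failed")
    case.add_failure_info(message="Expected 200 OK but got 404",
                          failure_type="UnexpectedStatusCode")
    suite = TestSuite("pylinkvalidator test suite", [case])
    print(TestSuite.to_xml_string([suite]))  # emits the <testsuites> document
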


def _write_json_report(site, config, output_file, total_time):
    start_urls = ",".join(start_url_split.geturl() for start_url_split in
                          site.start_url_splits)

    total_urls = len(site.pages)
    total_errors = len(site.error_pages)

    if not site.is_ok:
        global_status = "ERROR"
        error_summary = "with {0} error(s) ".format(total_errors)
    else:
        global_status = "SUCCESS"
        error_summary = ""

    meta = {
        "total_urls": total_urls,
        "total_errors": total_errors,
        "total_time": total_time,
        "start_urls": start_urls,
        "global_status": global_status,
        "error_summary": error_summary
    }
    try:
        meta["avg_response_time"] = site.get_average_response_time()
        meta["avg_process_time"] = site.get_average_process_time()
    except Exception:
        from traceback import print_exc
        print_exc()

    pages = {}

    if config.options.report_type == REPORT_TYPE_ERRORS:
        pages = site.error_pages
    elif config.options.report_type == REPORT_TYPE_ALL:
        pages = site.pages

    res_pages = []

    for results, resource in pages.items():
        details = {
            'link': resource.url_split.geturl(),
            'fragment': results.fragment,
            'hostname': results.hostname,
            'netloc': results.netloc,
            'is_local': resource.is_local,
            'is_html': resource.is_html,
            'is_ok': resource.is_ok,
            'is_timeout': resource.is_timeout,
            'process_time': resource.process_time,
            'response_time': resource.response_time,
            'status': resource.status,
            'path': results.path,
            'port': results.port,
            'query': results.query,
            'scheme': results.scheme,
            'origins': [source.origin.geturl() for source in resource.sources],
            'sources': [source.origin_str for source in resource.sources],
            'targets': [source.target for source in resource.sources]
        }
        res_pages.append(details)

    res = {
        "meta": meta,
        "pages": res_pages
    }
    output_file.write(
        json.dumps(res, sort_keys=True, indent=4, separators=(',', ': ')))
    print_summary(site, config, total_time)
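
A sketch of how a consumer might read the resulting report; the key names mirror the meta and details dicts above, while the report path is an assumption:

    import json

    with open("report.json") as report_file:
        report = json.load(report_file)

    print(report["meta"]["global_status"],
          report["meta"]["total_errors"], "error(s)")
    for page in report["pages"]:
        if not page["is_ok"]:
            print(page["status"], page["link"],
                  "linked from:", ", ".join(page["origins"]))
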


def _write_plain_text_report_multi(site, config, output_files, total_time):
    total_urls = len(site.pages)
    total_errors = len(site.error_pages)
@@ -163,6 +282,42 @@ def _write_plain_text_report_single(site, config, output_files, total_time):
    _print_details(pages.values(), output_files, config)


def print_summary(site, config, total_time, indent=2):
    total_urls = len(site.pages)
    total_errors = len(site.error_pages)

    if not site.is_ok:
        global_status = "ERROR"
        error_summary = "with {0} error(s) ".format(total_errors)
    else:
        global_status = "SUCCESS"
        error_summary = ""

    print("{0} Crawled {1} urls {2}in {3:.2f} seconds".format(
        global_status, total_urls, error_summary, total_time))

    pages = {}

    if config.options.report_type == REPORT_TYPE_ERRORS:
        pages = site.error_pages
    elif config.options.report_type == REPORT_TYPE_ALL:
        pages = site.pages

    initial_indent = " " * indent
    for page in pages.values():
        print("\n{2}{0}: {1}".format(
            page.get_status_message(), page.url_split.geturl(),
            initial_indent))
        for content_message in page.get_content_messages():
            print("{1} {0}".format(content_message, initial_indent))
        for source in page.sources:
            print("{1} from {0} target={2}".format(
                source.origin.geturl(), initial_indent, source.target))
            if config.options.show_source:
                print("{1} {0}".format(
                    source.origin_str, initial_indent))
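
Given the format strings above, the printed summary should look roughly like this (URLs, counts, and the status message are invented for illustration):

    ERROR Crawled 12 urls with 1 error(s) in 2.50 seconds

      not found (404): http://example.com/missing
       from http://example.com/ target=_blank
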


def _print_details(page_iterator, output_files, config, indent=2):
    initial_indent = " " * indent
    for page in page_iterator:
@@ -174,11 +329,11 @@ def _print_details(page_iterator, output_files, config, indent=2):
oprint("{1} {0}".format(content_message, initial_indent),
files=output_files)
for source in page.sources:
oprint("{1} from {0}".format(
source.origin.geturl(), initial_indent), files=output_files)
oprint("{1} from {0} target={2}".format(
source.origin.geturl(), initial_indent, source.target), files=output_files)
if config.options.show_source:
oprint("{1} {0}".format(
truncate(source.origin_str), initial_indent),
source.origin_str, initial_indent),
files=output_files)


3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
beautifulsoup4>=4.2.0
beautifulsoup4>=4.2.0
junit-xml>=1.8