basic JSON & JUNIT reporters #22

Open · wants to merge 13 commits into master
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
*.pyc
*.json
*.xml
pylinkvalidator.egg-info/
dist/
build/
.idea/
2 changes: 1 addition & 1 deletion README.rst
@@ -171,7 +171,7 @@ usage examples.
These options change the output of the crawler.

  -f FORMAT, --format=FORMAT
                        Format of the report: plain (default)
                        Format of the report: plain (default), json, junit
  -o OUTPUT, --output=OUTPUT
                        Path of the file where the report will be printed.
  -W WHEN, --when=WHEN
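
Once merged, the new formats are selected with the existing -f/--format and -o/--output flags. A hypothetical invocation (pylinkvalidate.py is the project's console script; the URL and output paths are placeholders):

    pylinkvalidate.py -f json -o report.json http://example.com/
    pylinkvalidate.py -f junit -o report.xml http://example.com/
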
6 changes: 4 additions & 2 deletions pylinkvalidator/crawler.py
@@ -525,6 +525,7 @@ def _get_links(self, elements, attribute, base_url_split,
        for element in elements:
            if attribute in element.attrs:
                url = element[attribute]
                target = element.attrs.get('target', None)

                if not self.worker_config.strict_mode:
                    url = url.strip()
@@ -540,7 +541,7 @@
                link = Link(
                    type=unicode(element.name), url_split=abs_url_split,
                    original_url_split=original_url_split,
                    source_str=unicode(element))
                    source_str=unicode(element), target=target)
                links.append(link)

        return links
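
As a sanity check on the new target extraction, a minimal BeautifulSoup 4 sketch (bs4 is already a dependency; the HTML snippet is made up):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        "<a href='/about' target='_blank'>About</a>", "html.parser")
    element = soup.find("a")
    print(element["href"])                    # /about
    print(element.attrs.get('target', None))  # _blank; None when absent
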
@@ -658,7 +659,8 @@ def process_links(self, page_crawl):
                continue

            page_status = self.page_statuses.get(url_split, None)
            page_source = PageSource(source_url_split, link.source_str)
            page_source = PageSource(
                source_url_split, link.source_str, link.target)

            if not page_status:
                # We never encountered this url before
10 changes: 6 additions & 4 deletions pylinkvalidator/models.py
@@ -81,6 +81,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
FORMAT_PLAIN = "plain"
FORMAT_HTML = "html"
FORMAT_JSON = "json"
FORMAT_JUNIT = "junit"


WHEN_ALWAYS = "always"
@@ -133,7 +134,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):

Link = namedtuple_with_defaults(
    "Link",
    ["type", "url_split", "original_url_split", "source_str"])
    ["type", "url_split", "original_url_split", "source_str", "target"])


PageCrawl = namedtuple_with_defaults(
@@ -149,7 +150,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):


PageSource = namedtuple_with_defaults(
    "PageSource", ["origin", "origin_str"])
    "PageSource", ["origin", "origin_str", "target"])


ContentCheck = namedtuple_with_defaults(
@@ -582,8 +583,9 @@ def _build_parser(self):

        output_group.add_option(
            "-f", "--format", dest="format", action="store",
            default=FORMAT_PLAIN, choices=[FORMAT_PLAIN],
            help="Format of the report: plain")
            default=FORMAT_PLAIN,
            choices=[FORMAT_PLAIN, FORMAT_JSON, FORMAT_JUNIT],
            help="Format of the report: plain (default), json, junit")
        output_group.add_option(
            "-o", "--output", dest="output", action="store",
            default=None,
163 changes: 159 additions & 4 deletions pylinkvalidator/reporter.py
@@ -4,15 +4,23 @@
from __future__ import unicode_literals, absolute_import, print_function

import codecs
import json
import re
import smtplib
import sys

from email.mime.text import MIMEText

from junit_xml import TestSuite, TestCase

from pylinkvalidator.compat import StringIO
from pylinkvalidator.models import (
    REPORT_TYPE_ERRORS, REPORT_TYPE_ALL, FORMAT_PLAIN)
    FORMAT_JSON,
    FORMAT_JUNIT,
    FORMAT_PLAIN,
    REPORT_TYPE_ALL,
    REPORT_TYPE_ERRORS,
)


PLAIN_TEXT = "text/plain"
@@ -54,6 +62,10 @@ def report(site, config, total_time, logger=None):
    try:
        if config.options.format == FORMAT_PLAIN:
            _write_plain_text_report(site, config, output_files, total_time)
        elif config.options.format == FORMAT_JSON:
            _write_json_report(site, config, output_file, total_time)
        elif config.options.format == FORMAT_JUNIT:
            _write_junit_report(site, config, output_file, total_time)
    except Exception:
        if logger:
            logger.exception("An exception occurred while writing the report")
@@ -72,6 +84,113 @@ def _write_plain_text_report(site, config, output_files, total_time):
        _write_plain_text_report_single(site, config, output_files, total_time)


def _write_junit_report(site, config, output_file, total_time):
    pages = site.pages
    test_cases = []

    for results, resource in pages.items():
        origins = [source.origin.geturl() for source in resource.sources]
        if resource.status == 200:
            test_case = TestCase(
                name=resource.url_split.geturl(),
                classname=results.hostname,
                elapsed_sec=resource.response_time,
                stdout=resource.status,
                status="passed"
            )
        else:
            stderr_message = "Link found on:\n{}".format("\n".join(origins))
            test_case = TestCase(
                name=resource.url_split.geturl(),
                classname=results.hostname,
                elapsed_sec=resource.response_time,
                stderr=stderr_message,
                status="failed"
            )
            if resource.exception:
                message = str(resource.exception)
            else:
                message = "Expected 200 OK but got {}".format(resource.status)
            test_case.add_failure_info(
                message=message, failure_type="UnexpectedStatusCode")
        test_cases.append(test_case)

    test_suite = TestSuite("pylinkvalidator test suite", test_cases)
    output_file.write(TestSuite.to_xml_string([test_suite]))
    print_summary(site, config, total_time)
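
For reference, a standalone sketch of the junit-xml calls used above (junit-xml>=1.8 per requirements.txt; the URL and timing are invented):

    from junit_xml import TestCase, TestSuite

    case = TestCase(name="http://example.com/missing", classname="example.com",
                    elapsed_sec=0.2, status="failed")
    case.add_failure_info(message="Expected 200 OK but got 404",
                          failure_type="UnexpectedStatusCode")
    suite = TestSuite("pylinkvalidator test suite", [case])
    print(TestSuite.to_xml_string([suite]))  # emits the <testsuites> document
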


def _write_json_report(site, config, output_file, total_time):
    start_urls = ",".join(start_url_split.geturl() for start_url_split in
                          site.start_url_splits)

    total_urls = len(site.pages)
    total_errors = len(site.error_pages)

    if not site.is_ok:
        global_status = "ERROR"
        error_summary = "with {0} error(s) ".format(total_errors)
    else:
        global_status = "SUCCESS"
        error_summary = ""

    meta = {
        "total_urls": total_urls,
        "total_errors": total_errors,
        "total_time": total_time,
        "start_urls": start_urls,
        "global_status": global_status,
        "error_summary": error_summary
    }
    try:
        meta["avg_response_time"] = site.get_average_response_time()
        meta["avg_process_time"] = site.get_average_process_time()
    except Exception:
        from traceback import print_exc
        print_exc()

    pages = {}

    if config.options.report_type == REPORT_TYPE_ERRORS:
        pages = site.error_pages
    elif config.options.report_type == REPORT_TYPE_ALL:
        pages = site.pages

    res_pages = []

    for results, resource in pages.items():
        details = {
            'link': resource.url_split.geturl(),
            'fragment': results.fragment,
            'hostname': results.hostname,
            'netloc': results.netloc,
            'is_local': resource.is_local,
            'is_html': resource.is_html,
            'is_ok': resource.is_ok,
            'is_timeout': resource.is_timeout,
            'process_time': resource.process_time,
            'response_time': resource.response_time,
            'status': resource.status,
            'path': results.path,
            'port': results.port,
            'query': results.query,
            'scheme': results.scheme,
            'origins': [source.origin.geturl() for source in resource.sources],
            'sources': [source.origin_str for source in resource.sources],
            'targets': [source.target for source in resource.sources]
        }
        res_pages.append(details)

    res = {
        "meta": meta,
        "pages": res_pages
    }
    output_file.write(
        json.dumps(res, sort_keys=True, indent=4, separators=(',', ': ')))
    print_summary(site, config, total_time)
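
A sketch of how a consumer might read the resulting report; the key names mirror the meta and details dicts above, while the report path is an assumption:

    import json

    with open("report.json") as report_file:
        report = json.load(report_file)

    print(report["meta"]["global_status"],
          report["meta"]["total_errors"], "error(s)")
    for page in report["pages"]:
        if not page["is_ok"]:
            print(page["status"], page["link"],
                  "linked from:", ", ".join(page["origins"]))
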


def _write_plain_text_report_multi(site, config, output_files, total_time):
    total_urls = len(site.pages)
    total_errors = len(site.error_pages)
@@ -163,6 +282,42 @@ def _write_plain_text_report_single(site, config, output_files, total_time):
    _print_details(pages.values(), output_files, config)


def print_summary(site, config, total_time, indent=2):
    total_urls = len(site.pages)
    total_errors = len(site.error_pages)

    if not site.is_ok:
        global_status = "ERROR"
        error_summary = "with {0} error(s) ".format(total_errors)
    else:
        global_status = "SUCCESS"
        error_summary = ""

    print("{0} Crawled {1} urls {2}in {3:.2f} seconds".format(
        global_status, total_urls, error_summary, total_time))

    pages = {}

    if config.options.report_type == REPORT_TYPE_ERRORS:
        pages = site.error_pages
    elif config.options.report_type == REPORT_TYPE_ALL:
        pages = site.pages

    initial_indent = " " * indent
    for page in pages.values():
        print("\n{2}{0}: {1}".format(
            page.get_status_message(), page.url_split.geturl(),
            initial_indent))
        for content_message in page.get_content_messages():
            print("{1} {0}".format(content_message, initial_indent))
        for source in page.sources:
            print("{1} from {0} target={2}".format(
                source.origin.geturl(), initial_indent, source.target))
            if config.options.show_source:
                print("{1} {0}".format(
                    source.origin_str, initial_indent))
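
Given the format strings above, the printed summary should look roughly like this (URLs, counts, and the status message are invented for illustration):

    ERROR Crawled 12 urls with 1 error(s) in 2.50 seconds

      not found (404): http://example.com/missing
       from http://example.com/ target=_blank
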


def _print_details(page_iterator, output_files, config, indent=2):
    initial_indent = " " * indent
    for page in page_iterator:
@@ -174,11 +329,11 @@ def _print_details(page_iterator, output_files, config, indent=2):
oprint("{1} {0}".format(content_message, initial_indent),
files=output_files)
for source in page.sources:
oprint("{1} from {0}".format(
source.origin.geturl(), initial_indent), files=output_files)
oprint("{1} from {0} target={2}".format(
source.origin.geturl(), initial_indent, source.target), files=output_files)
if config.options.show_source:
oprint("{1} {0}".format(
truncate(source.origin_str), initial_indent),
source.origin_str, initial_indent),
files=output_files)


3 changes: 2 additions & 1 deletion requirements.txt
@@ -1 +1,2 @@
beautifulsoup4>=4.2.0
beautifulsoup4>=4.2.0
junit-xml>=1.8