235 changes: 156 additions & 79 deletions htmlark.py
@@ -2,6 +2,9 @@
"""Embed images, CSS, and JavaScript into an HTML file, using data URIs."""
__version__ = "1.0.0"

import os
import string
import pathlib
import argparse
import base64
from datetime import datetime
@@ -13,6 +16,7 @@
from urllib.parse import urlparse

import bs4

# Import requests if available, dummy it if not
try:
from requests import get as requests_get
@@ -22,9 +26,11 @@

class RequestException(Exception): # NOQA make flake8 shut up
"""Dummy exception for when Requests is not installed."""

pass

PARSERS = ['lxml', 'html5lib', 'html.parser']

PARSERS = ["lxml", "html5lib", "html.parser"]


def get_available_parsers():
@@ -53,23 +59,24 @@ def _get_resource(resource_url: str) -> (str, bytes):
ValueError: If ``resource_url``'s protocol is invalid.
"""
url_parsed = urlparse(resource_url)
if url_parsed.scheme in ['http', 'https']:
if url_parsed.scheme in ["http", "https"]:
# Requests might not be installed
if requests_get is not None:
request = requests_get(resource_url)
data = request.content
if 'Content-Type' in request.headers:
mimetype = request.headers['Content-Type']
if "Content-Type" in request.headers:
mimetype = request.headers["Content-Type"]
else:
mimetype = mimetypes.guess_type(resource_url)
else:
raise NameError("HTTP URL found but requests not available")
elif url_parsed.scheme == '':
elif url_parsed.scheme in string.ascii_letters:
# '' is local file
with open(resource_url, 'rb') as f:
# "" (a schemeless local path) matches because the empty string is a
# substring of any string; so do single-letter Windows drive schemes like "c"
with open(resource_url, "rb") as f:
data = f.read()
mimetype, _ = mimetypes.guess_type(resource_url)
elif url_parsed.scheme == 'data':
elif url_parsed.scheme == "data":
raise ValueError("Resource path is a data URI", url_parsed.scheme)
else:
raise ValueError("Not local path or HTTP/HTTPS URL", url_parsed.scheme)
@@ -88,20 +95,26 @@ def make_data_uri(mimetype: str, data: bytes) -> str:
Returns:
str: Input data encoded into a data URI.
"""
mimetype = '' if mimetype is None else mimetype
if mimetype in ['', 'text/css', 'application/javascript']:
mimetype = "" if mimetype is None else mimetype
if mimetype in ["", "text/css", "application/javascript"]:
# Text data can simply be URL-encoded
encoded_data = quote(data.decode())
else:
mimetype = mimetype + ';base64'
mimetype = mimetype + ";base64"
encoded_data = base64.b64encode(data).decode()
return "data:{},{}".format(mimetype, encoded_data)


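A short usage sketch of make_data_uri with illustrative inputs; the outputs follow from the two encoding branches above:

make_data_uri("text/css", b"body {color: red}")
# -> 'data:text/css,body%20%7Bcolor%3A%20red%7D'  (URL-encoded text)
make_data_uri("image/png", b"\x89PNG")
# -> 'data:image/png;base64,iVBORw=='  (base64-encoded binary)
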
def convert_page(page_path: str, parser: str='auto',
callback: Callable[[str, str, str], None]=lambda *_: None,
ignore_errors: bool=False, ignore_images: bool=False,
ignore_css: bool=False, ignore_js: bool=False) -> str:
def convert_page(
page_path: str,
parser: str = "auto",
callback: Callable[[str, str, str], None] = lambda *_: None,
ignore_errors: bool = False,
ignore_images: bool = False,
ignore_css: bool = False,
ignore_js: bool = False,
charset: str = "utf-8",
) -> str:
"""Take an HTML file or URL and outputs new HTML with resources as data URIs.

Parameters:
@@ -165,7 +178,7 @@ def convert_page(page_path: str, parser: str='auto',
"""
# Check features
if requests_get is None:
callback('INFO', 'feature', "Requests not available, web downloading disabled")
callback("INFO", "feature", "Requests not available, web downloading disabled")

# Get page HTML, whether from a server, a local file, or stdin
if page_path is None:
Expand All @@ -176,109 +189,165 @@ def convert_page(page_path: str, parser: str='auto',

# Not all parsers are equal - it can be specified on the command line
# so the user can try another when one fails
if parser == 'auto':
if parser == "auto":
parser = get_available_parsers()[0]
soup = bs4.BeautifulSoup(page_text, parser)
callback('INFO', 'parser', "Using parser " + parser)
soup = bs4.BeautifulSoup(page_text.decode(charset), parser)
callback("INFO", "parser", "Using parser " + parser)

tags = []

# Gather all the relevant tags together
if not ignore_images:
tags += soup('img')
tags += soup("img")
if not ignore_css:
csstags = soup('link')
csstags = soup("link")
for css in csstags:
if 'stylesheet' in css['rel']:
if "stylesheet" in css["rel"]:
tags.append(css)
if not ignore_js:
scripttags = soup('script')
scripttags = soup("script")
for script in scripttags:
if 'src' in script.attrs:
if "src" in script.attrs:
tags.append(script)

# Convert the linked resources
for tag in tags:
tag_url = tag['href'] if tag.name == 'link' else tag['src']
tag_url = tag["href"] if tag.name == "link" else tag["src"]
try:
# BUG: doesn't work if using relative remote URLs in a local file
fullpath = urljoin(page_path, tag_url)
if not bool(urlparse(page_path).netloc):
dir_path = pathlib.Path(page_path).parent.absolute()
fullpath = os.path.join(dir_path, tag_url)
else:
fullpath = urljoin(page_path, tag_url)
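# Worked example (hypothetical paths): a local page_path of
# "/home/user/site/index.html" with tag_url "img/logo.png" has no netloc,
# so fullpath becomes "/home/user/site/img/logo.png" via os.path.join;
# a remote page_path of "https://example.com/index.html" has netloc
# "example.com", so urljoin yields "https://example.com/img/logo.png".
# Note: an absolute tag_url such as "https://cdn.example.com/a.js" inside
# a local page would still be joined onto dir_path by this branch.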
tag_mime, tag_data = _get_resource(fullpath)
except RequestException:
callback('ERROR', tag.name, "Can't access URL " + fullpath)
callback("ERROR", tag.name, "Can't access URL " + fullpath)
if not ignore_errors:
raise
except OSError as e:
callback('ERROR', tag.name, "Error reading '{}': {}".format(e.filename, e.strerror))
callback(
"ERROR",
tag.name,
"Error reading '{}': {}".format(e.filename, e.strerror),
)
if not ignore_errors:
raise
except ValueError as e:
# Raised when a problem with the URL is found
scheme = e.args[1]
# Don't need to process things that are already data URIs
if scheme == 'data':
callback('INFO', tag.name, "Already data URI")
if scheme == "data":
callback("INFO", tag.name, "Already data URI")
else:
# htmlark can only get from http/https and local files
callback('ERROR', tag.name, "Unknown protocol in URL: " + tag_url)
callback("ERROR", tag.name, "Unknown protocol in URL: " + tag_url)
if not ignore_errors:
raise
except NameError as e:
# Requests module is not available
callback('ERROR', tag.name, str(e))
callback("ERROR", tag.name, str(e))
if not ignore_errors:
raise
else:
encoded_resource = make_data_uri(tag_mime, tag_data)
if tag.name == 'link':
tag['href'] = encoded_resource
if tag.name == "link":
tag["href"] = encoded_resource
else:
tag['src'] = encoded_resource
callback('INFO', tag.name, tag_url)
tag["src"] = encoded_resource
callback("INFO", tag.name, tag_url)
# Record the original URL so the original HTML can be recovered
tag.insert_after(bs4.Comment("URL:" + tag_url))

soup.html.insert_after(bs4.Comment(
"Generated by HTMLArk {}. Original URL {}".format(datetime.now(),
page_path)))
soup.html.insert_after(
bs4.Comment(
"Generated by HTMLArk {}. Original URL {}".format(datetime.now(), page_path)
)
)
return str(soup)

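For context, a minimal library-usage sketch of convert_page as it stands after this change; the file names are hypothetical and charset is the parameter added above:

from htmlark import convert_page

def log(severity, tag_name, message):
    print("[{}] {}: {}".format(severity, tag_name, message))

packed = convert_page("example.html", parser="html.parser",
                      ignore_errors=True, charset="utf-8", callback=log)
with open("example-packed.html", "w", encoding="utf-8") as f:
    f.write(packed)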

def _get_options():
"""Parse command line options."""
parser = argparse.ArgumentParser(description="""
parser = argparse.ArgumentParser(
description="""
Converts a webpage including external resources into a single HTML
file. Note that resources loaded with JavaScript will not be handled
by this program; it will only work properly with static pages.""")
by this program; it will only work properly with static pages."""
)
# Can't make this an argparse.FileType, because it could be a local path
# or an URL, and convert_page needs the path
parser.add_argument('webpage', nargs='?', default=None,
help="""URL or path of webpage to convert. If not
specified, read from STDIN.""")
parser.add_argument('-o', '--output', default=sys.stdout,
type=argparse.FileType('w', encoding='UTF-8'),
help="File to write output. Defaults to STDOUT.")
parser.add_argument('-E', '--ignore-errors', action='store_true', default=False,
help="Ignores unreadable resources")
parser.add_argument('-I', '--ignore-images', action='store_true', default=False,
help="Ignores images during conversion")
parser.add_argument('-C', '--ignore-css', action='store_true', default=False,
help="Ignores stylesheets during conversion")
parser.add_argument('-J', '--ignore-js', action='store_true', default=False,
help="Ignores external JavaScript during conversion")
parser.add_argument('-p', '--parser', default='auto',
choices=['html.parser', 'lxml', 'html5lib', 'auto'],
help="""Select HTML parser. Defaults to auto, which
parser.add_argument(
"webpage",
nargs="?",
default=None,
help="""URL or path of webpage to convert. If not
specified, read from STDIN.""",
)
parser.add_argument(
"-o",
"--output",
default=sys.stdout,
type=argparse.FileType("w", encoding="UTF-8"),
help="File to write output. Defaults to STDOUT.",
)
parser.add_argument(
"-E",
"--ignore-errors",
action="store_true",
default=False,
help="Ignores unreadable resources",
)
parser.add_argument(
"-I",
"--ignore-images",
action="store_true",
default=False,
help="Ignores images during conversion",
)
parser.add_argument(
"-C",
"--ignore-css",
action="store_true",
default=False,
help="Ignores stylesheets during conversion",
)
parser.add_argument(
"-J",
"--ignore-js",
action="store_true",
default=False,
help="Ignores external JavaScript during conversion",
)
parser.add_argument(
"-p",
"--parser",
default="auto",
choices=["html.parser", "lxml", "html5lib", "auto"],
help="""Select HTML parser. Defaults to auto, which
tries to use lxml, html5lib, and html.parser
in that order. See documentation for more
information.""")
parser.add_argument('--list-parsers', action='store_true', default=False,
help="Lists installed parsers available to HTMLArk")
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help="Prints information during conversion")
parser.add_argument('-V', '--version', action='version',
version="HTMLArk v{}".format(__version__),
help="Displays version information")
information.""",
)
parser.add_argument(
"--list-parsers",
action="store_true",
default=False,
help="Lists installed parsers available to HTMLArk",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
help="Prints information during conversion",
)
parser.add_argument(
"-V",
"--version",
action="version",
version="HTMLArk v{}".format(__version__),
help="Displays version information",
)
parsed = parser.parse_args()

if parsed.list_parsers:
@@ -296,31 +365,37 @@ def _main():
# All further messages should use print_verbose() or print_error()
def print_error(m):
print(m, file=sys.stderr)

# print_error = lambda m: print(m, file=sys.stderr)
if options.verbose:
print_verbose = print_error
else:

def print_verbose(_):
pass

def info_callback(severity, message_type, message_data):
"""Display progress information during conversion."""
if message_type == 'img':
if message_type == "img":
tagtype = "Image"
elif message_type == 'link':
elif message_type == "link":
tagtype = "CSS"
elif message_type == 'script':
elif message_type == "script":
tagtype = "JS"
else:
tagtype = message_type
# Only display info messages if -v/--verbose flag is set
if severity == 'INFO':
if severity == "INFO":
if options.verbose:
print_verbose("{}: {}".format(tagtype, message_data))
elif severity == 'ERROR':
elif severity == "ERROR":
print_error("{}: {}".format(tagtype, message_data))
else:
print_error("Unknown message level {}, please tell the author of the program".format(severity))
print_error(
"Unknown message level {}, please tell the author of the program".format(
severity
)
)
print_error("{}: {}".format(tagtype, message_data))

# Convert page
Expand All @@ -330,13 +405,15 @@ def info_callback(severity, message_type, message_data):
print_verbose("Processing {}".format(options.webpage))

try:
newhtml = convert_page(options.webpage,
parser=options.parser,
ignore_errors=options.ignore_errors,
ignore_images=options.ignore_images,
ignore_css=options.ignore_css,
ignore_js=options.ignore_js,
callback=info_callback)
newhtml = convert_page(
options.webpage,
parser=options.parser,
ignore_errors=options.ignore_errors,
ignore_images=options.ignore_images,
ignore_css=options.ignore_css,
ignore_js=options.ignore_js,
callback=info_callback,
)
except (OSError, RequestException, ValueError) as e:
sys.exit("Unable to convert webpage: {}".format(e))
except NameError: