wpull/document/html_test.py (15 changes: 4 additions & 11 deletions)
@@ -129,10 +129,7 @@ def test_html_encoding(self):
elements = tuple(reader.iter_elements(data, encoding=name))

html_element = elements[0]
if isinstance(html_parser, LxmlHTMLParser):
self.assertEqual('html', html_element.tag)
else:
self.assertEqual('img', html_element.tag)
self.assertEqual('html', html_element.tag)

def test_html_layout(self):
html_parser = self.get_html_parser()
@@ -160,13 +157,9 @@ def test_html_layout(self):
self.assertEqual('body', elements[5].tag)
self.assertEqual('img', elements[6].tag)

if isinstance(html_parser, LxmlHTMLParser):
self.assertEqual('img', elements[7].tag)
self.assertEqual('body', elements[8].tag)
self.assertEqual('html', elements[9].tag)
else:
self.assertEqual('body', elements[7].tag)
self.assertEqual('html', elements[8].tag)
self.assertEqual('img', elements[7].tag)
self.assertEqual('body', elements[8].tag)
self.assertEqual('html', elements[9].tag)

def test_html_early_html(self):
reader = HTMLReader(self.get_html_parser())
wpull/document/htmlparse/html5lib_.py (54 changes: 28 additions & 26 deletions)
@@ -1,21 +1,21 @@
'''Parsing using html5lib python.'''
import html5lib.constants
import html5lib.tokenizer
from html5lib.treewalkers.dom import TreeWalker
import html5lib
import io
import os.path

from wpull.document.htmlparse.base import BaseParser
from wpull.document.htmlparse.element import Comment, Doctype, Element


DOCTYPE = html5lib.constants.tokenTypes['Doctype']
CHARACTERS = html5lib.constants.tokenTypes['Characters']
SPACE_CHARACTERS = html5lib.constants.tokenTypes['SpaceCharacters']
START_TAG = html5lib.constants.tokenTypes['StartTag']
END_TAG = html5lib.constants.tokenTypes['EndTag']
EMPTY_TAG = html5lib.constants.tokenTypes['EmptyTag']
COMMENT = html5lib.constants.tokenTypes['Comment']
PARSE_ERROR = html5lib.constants.tokenTypes['ParseError']
class TreeWalkerAdapter(TreeWalker):
""" Simple adapter for TreeWalker. Splits up EmptyTag into start/end tag,
so the fragile logic of HTMLParser does not break """
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield self.startTag(namespace, name, attrs)
if hasChildren:
yield self.error("Void element has children")
yield self.endTag(namespace, name)


class HTMLParser(BaseParser):
@@ -24,11 +24,10 @@ def parser_error(self):
return ValueError

def parse(self, file, encoding=None):
tokenizer = html5lib.tokenizer.HTMLTokenizer(
file, encoding=encoding,
useChardet=False if encoding else True,
parseMeta=False if encoding else True,
)
tokenizer = TreeWalkerAdapter(html5lib.parse(
file, treebuilder='dom',
override_encoding=encoding,
))

tag = None
attrib = None
@@ -38,7 +37,7 @@ def parse(self, file, encoding=None):
for token in tokenizer:
token_type = token['type']

if token_type == START_TAG:
if token_type == 'StartTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -48,19 +47,22 @@ def parse(self, file, encoding=None):
tail_buffer = None

tag = token['name']
attrib = dict(token['data'])
# html5lib returns node names as ((namespace, name), value),
# but we expect just (name, value) pairs
attrib = dict(map(lambda x: (x[0][1], x[1]), token['data'].items()))
buffer = io.StringIO()

if token['name'] == 'script':
tokenizer.state = tokenizer.scriptDataState
# XXX: ?
#if token['name'] == 'script':
# tokenizer.state = tokenizer.scriptDataState

elif token_type in (CHARACTERS, SPACE_CHARACTERS):
elif token_type in ('Characters', 'SpaceCharacters'):
if buffer:
buffer.write(token['data'])
if tail_buffer:
tail_buffer.write(token['data'])

elif token_type == END_TAG:
elif token_type == 'EndTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -72,12 +74,12 @@ def parse(self, file, encoding=None):
tail_buffer = io.StringIO()
tag = token['name']

elif token_type == COMMENT:
elif token_type == 'Comment':
yield Comment(token['data'])
elif token_type == DOCTYPE:
elif token_type == 'Doctype':
yield Doctype('{} {} {}'.format(
token['name'], token['publicId'], token['systemId']))
elif token_type == PARSE_ERROR:
elif token_type == 'SerializeError':
pass
else:
raise ValueError('Unhandled token {}'.format(token))
@@ -90,17 +92,17 @@ def parse(self, file, encoding=None):
yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
tail_buffer = None


if __name__ == '__main__':
path = os.path.join(
os.path.dirname(__file__), '..', '..',
'testing', 'samples', 'xkcd_1.html'
)
with open(path, 'rb') as in_file:
tokenizer = html5lib.tokenizer.HTMLTokenizer(in_file)
tokenizer = TreeWalkerAdapter(html5lib.parse(in_file, treebuilder='dom'))

for token in tokenizer:
print(token)
html_parser = HTMLParser()
for element in html_parser.parse(in_file):
print(element)
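
For reference, a minimal sketch (assuming only that html5lib is installed; the sample markup and variable names are illustrative, not part of this change) of the token stream the rewritten parse() loop consumes. html5lib.parse() builds a DOM tree, and the DOM TreeWalker replays it as dicts keyed by 'type' ('Doctype', 'StartTag', 'Characters', 'SpaceCharacters', 'EndTag', 'Comment', ...):

    import html5lib
    from html5lib.treewalkers.dom import TreeWalker

    markup = b'<!DOCTYPE html><p class="x">hi<br>there</p>'  # illustrative input
    tree = html5lib.parse(markup, treebuilder='dom')

    for token in TreeWalker(tree):
        if token['type'] == 'StartTag':
            # Attribute keys are (namespace, name) tuples; flatten them to plain
            # names, mirroring the dict(map(...)) conversion in parse() above.
            attrib = {name: value for (ns, name), value in token['data'].items()}
            print('StartTag', token['name'], attrib)
        else:
            print(token['type'], token.get('name', token.get('data')))

The plain walker reports a void element such as <br> as a single 'EmptyTag' token; the TreeWalkerAdapter introduced above splits it into a StartTag/EndTag pair so the existing start/end-tag logic in HTMLParser keeps working.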