wpull/document/html_test.py (15 changes: 4 additions & 11 deletions)
@@ -129,10 +129,7 @@ def test_html_encoding(self):
elements = tuple(reader.iter_elements(data, encoding=name))

html_element = elements[0]
if isinstance(html_parser, LxmlHTMLParser):
self.assertEqual('html', html_element.tag)
else:
self.assertEqual('img', html_element.tag)
self.assertEqual('html', html_element.tag)

def test_html_layout(self):
html_parser = self.get_html_parser()
@@ -160,13 +157,9 @@ def test_html_layout(self):
self.assertEqual('body', elements[5].tag)
self.assertEqual('img', elements[6].tag)

if isinstance(html_parser, LxmlHTMLParser):
self.assertEqual('img', elements[7].tag)
self.assertEqual('body', elements[8].tag)
self.assertEqual('html', elements[9].tag)
else:
self.assertEqual('body', elements[7].tag)
self.assertEqual('html', elements[8].tag)
self.assertEqual('img', elements[7].tag)
self.assertEqual('body', elements[8].tag)
self.assertEqual('html', elements[9].tag)

def test_html_early_html(self):
reader = HTMLReader(self.get_html_parser())
wpull/document/htmlparse/html5lib_.py (54 changes: 28 additions & 26 deletions)
@@ -1,21 +1,21 @@
'''Parsing using html5lib python.'''
import html5lib.constants
import html5lib.tokenizer
from html5lib.treewalkers.dom import TreeWalker
import html5lib
import io
import os.path

from wpull.document.htmlparse.base import BaseParser
from wpull.document.htmlparse.element import Comment, Doctype, Element


DOCTYPE = html5lib.constants.tokenTypes['Doctype']
CHARACTERS = html5lib.constants.tokenTypes['Characters']
SPACE_CHARACTERS = html5lib.constants.tokenTypes['SpaceCharacters']
START_TAG = html5lib.constants.tokenTypes['StartTag']
END_TAG = html5lib.constants.tokenTypes['EndTag']
EMPTY_TAG = html5lib.constants.tokenTypes['EmptyTag']
COMMENT = html5lib.constants.tokenTypes['Comment']
PARSE_ERROR = html5lib.constants.tokenTypes['ParseError']
class TreeWalkerAdapter(TreeWalker):
""" Simple adapter for TreeWalker. Splits up EmptyTag into start/end tag,
so the fragile logic of HTMLParser does not break """
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield self.startTag(namespace, name, attrs)
if hasChildren:
yield self.error("Void element has children")
yield self.endTag(namespace, name)


class HTMLParser(BaseParser):
@@ -24,11 +24,10 @@ def parser_error(self):
return ValueError

def parse(self, file, encoding=None):
tokenizer = html5lib.tokenizer.HTMLTokenizer(
file, encoding=encoding,
useChardet=False if encoding else True,
parseMeta=False if encoding else True,
)
tokenizer = TreeWalkerAdapter(html5lib.parse(
file, treebuilder='dom',
override_encoding=encoding,
))

tag = None
attrib = None
@@ -38,7 +37,7 @@ def parse(self, file, encoding=None):
for token in tokenizer:
token_type = token['type']

if token_type == START_TAG:
if token_type == 'StartTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -48,19 +47,22 @@ def parse(self, file, encoding=None):
tail_buffer = None

tag = token['name']
attrib = dict(token['data'])
# html5lib returns node names as ((namespace, name), value),
# but we expect just (name, value) pairs
attrib = dict(map(lambda x: (x[0][1], x[1]), token['data'].items()))
buffer = io.StringIO()

if token['name'] == 'script':
tokenizer.state = tokenizer.scriptDataState
# XXX: ?
#if token['name'] == 'script':
# tokenizer.state = tokenizer.scriptDataState

elif token_type in (CHARACTERS, SPACE_CHARACTERS):
elif token_type in ('Characters', 'SpaceCharacters'):
if buffer:
buffer.write(token['data'])
if tail_buffer:
tail_buffer.write(token['data'])

elif token_type == END_TAG:
elif token_type == 'EndTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -72,12 +74,12 @@ def parse(self, file, encoding=None):
tail_buffer = io.StringIO()
tag = token['name']

elif token_type == COMMENT:
elif token_type == 'Comment':
yield Comment(token['data'])
elif token_type == DOCTYPE:
elif token_type == 'Doctype':
yield Doctype('{} {} {}'.format(
token['name'], token['publicId'], token['systemId']))
elif token_type == PARSE_ERROR:
elif token_type == 'SerializeError':
pass
else:
raise ValueError('Unhandled token {}'.format(token))
@@ -90,17 +92,17 @@ def parse(self, file, encoding=None):
yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
tail_buffer = None


if __name__ == '__main__':
path = os.path.join(
os.path.dirname(__file__), '..', '..',
'testing', 'samples', 'xkcd_1.html'
)
with open(path, 'rb') as in_file:
tokenizer = html5lib.tokenizer.HTMLTokenizer(in_file)
tokenizer = TreeWalkerAdapter(html5lib.parse(in_file, treebuilder='dom'))

for token in tokenizer:
print(token)
html_parser = HTMLParser()
for element in html_parser.parse(in_file):
print(element)
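
For reference, a minimal sketch (assuming only that html5lib is installed; the sample markup and variable names are illustrative, not part of this change) of the token stream the rewritten parse() loop consumes. html5lib.parse() builds a DOM tree, and the DOM TreeWalker replays it as dicts keyed by 'type' ('Doctype', 'StartTag', 'Characters', 'SpaceCharacters', 'EndTag', 'Comment', ...):

    import html5lib
    from html5lib.treewalkers.dom import TreeWalker

    markup = b'<!DOCTYPE html><p class="x">hi<br>there</p>'  # illustrative input
    tree = html5lib.parse(markup, treebuilder='dom')

    for token in TreeWalker(tree):
        if token['type'] == 'StartTag':
            # Attribute keys are (namespace, name) tuples; flatten them to plain
            # names, mirroring the dict(map(...)) conversion in parse() above.
            attrib = {name: value for (ns, name), value in token['data'].items()}
            print('StartTag', token['name'], attrib)
        else:
            print(token['type'], token.get('name', token.get('data')))

The plain walker reports a void element such as <br> as a single 'EmptyTag' token; the TreeWalkerAdapter introduced above splits it into a StartTag/EndTag pair so the existing start/end-tag logic in HTMLParser keeps working.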