Skip to content
This repository was archived by the owner on Oct 1, 2024. It is now read-only.

Commit 1823a4f

Browse files
committed
Simple HTMLTokenizer done, innerHTML and outerHTML done. With tests.
1 parent bd5b830 commit 1823a4f

File tree

2 files changed

+264
-24
lines changed

2 files changed

+264
-24
lines changed

polyplug.py

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -184,16 +184,22 @@ def get_attrs(self):
184184

185185
def get_text(self, until="<"):
    """
    Return textual content until the start of a new Node ("<") or until
    "until" matches.

    Characters are consumed one at a time until the collected text ends
    with "until" or the input is exhausted (self.char is falsy). On a match
    the delimiter is excluded from the returned text and the position is
    rewound so the delimiter is still to be read. At end of input
    everything collected is returned unchanged.
    """
    result = ""
    until_len = len(until)
    while not result.endswith(until) and self.char:
        result += self.get_char()
    if not result.endswith(until):
        # EOF: the delimiter never appeared, so there is nothing to strip
        # and nothing to rewind.
        return result
    # Step back over the delimiter so the caller sees it next.
    self.pos = self.pos - until_len
    if self.char:
        # NOTE(review): one extra step back when not at EOF; presumably
        # pos runs one ahead of self.char. Confirm against next_char().
        self.pos = self.pos - 1
    # Reload self.char from the rewound position.
    self.next_char()
    return result[:-until_len]
197203

198204
def tokenize(self, parent=None):
199205
"""
@@ -205,29 +211,25 @@ def tokenize(self, parent=None):
205211
"""
206212
if not isinstance(parent, ElementNode):
207213
raise ValueError("Parent must be an ElementNode")
208-
current_children = []
209214
current_node = None
210215
current_parent = parent
211216
while self.char:
212217
if self.match("<"):
213218
# Tag opens.
214219
if self.match("/"):
215220
# End tag. Close and check depth of tree.
216-
if not current_node:
217-
# Can't close an un-opened current node.
218-
raise ValueError("Missing opening tag.")
219221
# Get the name of the closing tag.
220222
name = self.get_name()
221-
if name == current_node.tagName:
223+
if current_node and name == current_node.tagName:
222224
# Close current node and continue at current depth.
223-
current_children.append(current_node)
225+
current_parent.add_child(current_node)
224226
current_node = None
225227
elif name == current_parent.tagName:
226228
# Step back up the tree to the parent context.
227-
for child in current_children:
228-
current_parent.add_child(child)
229229
current_node = current_parent
230230
current_parent = current_node.parent
231+
current_parent.add_child(current_node)
232+
current_node = None
231233
else:
232234
# Unexpected close tag.
233235
raise ValueError("Unexpected close tag.", name)
@@ -246,31 +248,41 @@ def tokenize(self, parent=None):
246248
value += self.get_char()
247249
if value[-3:] == "-->":
248250
break
249-
comment = CommentNode(nodeValue=value)
250-
current_children.append(comment)
251+
comment = CommentNode(nodeValue=value[:-3])
252+
current_parent.add_child(comment)
251253
else:
252254
# ElementNode
253255
tagName = self.get_name()
254256
attrs = self.get_attrs()
257+
if current_node:
258+
current_parent = current_node
255259
if tagName == "textarea":
260+
self.expect(">")
256261
value = self.get_text(until="</textarea>")
257-
current_node = ElementNode(
258-
tagName=tagName, attributes=attrs, value=value
262+
textarea_node = ElementNode(
263+
tagName=tagName,
264+
attributes=attrs,
265+
value=value,
266+
parent=current_parent,
259267
)
260-
expect("</textarea>")
268+
for c in "</textarea>":
269+
self.expect(c)
270+
current_parent.add_child(textarea_node)
261271
else:
262272
current_node = ElementNode(
263-
tagName=tagName, attributes=attrs
273+
tagName=tagName,
274+
attributes=attrs,
275+
parent=current_parent,
264276
)
265-
current_children = []
277+
self.expect(">")
266278
else:
267279
# TextNode
268280
value = self.get_text()
269281
text = TextNode(nodeValue=value)
270-
current_children.append(text)
271-
# Append "root" children to the parent node.
272-
for child in current_children:
273-
parent.add_child(child)
282+
if current_node:
283+
current_node.add_child(text)
284+
else:
285+
current_parent.add_child(text)
274286

275287

276288
class Node:
@@ -333,6 +345,46 @@ def add_child(self, child):
333345
children.append(node_dict)
334346
self._node["childNodes"] = children
335347

348+
@property
def outerHTML(self):
    """
    Get a string representation of the element's outer HTML.

    The result is the opening tag (each attribute rendered as
    name="value"), then the element's content, then the closing tag. A
    textarea contributes its "value" as content; every other element
    contributes its innerHTML.
    """
    parts = ["<", self.tagName]
    for attr, val in self.attributes.items():
        # A literal double quote would end the attribute early and yield
        # malformed markup, so emit it as a character reference. Other
        # characters are passed through untouched.
        parts.append(" " + attr + '="' + val.replace('"', "&quot;") + '"')
    parts.append(">")
    if self.tagName == "textarea":
        # Textareas keep their text in "value" rather than in children.
        parts.append(self.value)
    else:
        parts.append(self.innerHTML)
    parts.append("</" + self.tagName + ">")
    return "".join(parts)
363+
364+
@property
def innerHTML(self):
    """
    Get a string representation of the element's inner HTML.

    Child elements are rendered through their outerHTML, text nodes as
    their raw nodeValue and comments as <!--nodeValue-->. Children of any
    other type contribute nothing.
    """
    fragments = []
    for child in self.childNodes:
        if isinstance(child, ElementNode):
            fragments.append(child.outerHTML)
        elif isinstance(child, TextNode):
            fragments.append(child.nodeValue)
        elif isinstance(child, CommentNode):
            fragments.append("<!--" + child.nodeValue + "-->")
    return "".join(fragments)

@innerHTML.setter
def innerHTML(self, raw):
    """
    Use the raw innerHTML to create children.

    Existing children are discarded, then "raw" is tokenized with this
    element as the parent.
    """
    # NOTE: children are cleared before parsing, so if tokenize() raises
    # the previous children are not restored.
    self._node["childNodes"] = []
    tok = HTMLTokenizer(raw)
    tok.tokenize(self)
387+
336388
@property
337389
def childNodes(self):
338390
if self.tagName == "textarea":

tests/test_polyplug.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,49 @@ def test_element_node_add_child_as_dict():
323323
assert new_child.nodeValue == "Hello"
324324

325325

326+
def test_element_node_get_outer_html():
    """
    Get a string representation of the node's complete structure: its own
    tag and attributes wrapped around its children.
    """
    n = polyplug.ElementNode(tagName="div", attributes={"foo": "bar"})
    n.add_child(polyplug.TextNode(nodeValue="Hello"))
    assert n.outerHTML == '<div foo="bar">Hello</div>'
333+
334+
335+
def test_element_node_get_inner_html_empty():
    """
    Get a string representation of the node's inner structure. A node with
    no children has an empty innerHTML.
    """
    n = polyplug.ElementNode(tagName="div", attributes={"foo": "bar"})
    assert n.innerHTML == ""
341+
342+
343+
def test_element_node_get_set_inner_html_complex():
    """
    Setting innerHTML from markup containing a comment and an element, then
    reading it back, yields the same markup.
    """
    n = polyplug.ElementNode(tagName="div", attributes={"foo": "bar"})
    n.innerHTML = "<!-- comment --><p>Hello</p>"
    assert n.innerHTML == "<!-- comment --><p>Hello</p>"
350+
351+
def test_element_node_set_inner_html_empty():
    """
    Set the innerHTML of the node as empty. Any children created by an
    earlier assignment are removed.
    """
    n = polyplug.ElementNode(tagName="div", attributes={"foo": "bar"})
    n.innerHTML = "<!-- comment --><p>Hello</p>"
    n.innerHTML = ""
    assert n.innerHTML == ""
359+
360+
361+
def test_element_node_set_inner_html_textarea():
    """
    Set the innerHTML of the node as a textarea. Markup-like text inside
    the textarea is read back verbatim.
    """
    n = polyplug.ElementNode(tagName="div", attributes={"foo": "bar"})
    n.innerHTML = "<textarea>Test <fake html></textarea>"
    assert n.innerHTML == "<textarea>Test <fake html></textarea>"
368+
326369
def test_text_node():
327370
"""
328371
The TextNode instantiates as expected.
@@ -511,6 +554,17 @@ def test_htmltokenizer_get_attrs_no_attributes():
511554

512555

513556
def test_htmltokenizer_get_text():
    """
    Get the textual content of a TextNode (i.e. everything until encountering
    "<"). Afterwards the tokenizer's current character is that "<".
    """
    raw = "Hello, world!<div>"
    tok = polyplug.HTMLTokenizer(raw)
    assert tok.get_text() == "Hello, world!"
    assert tok.char == "<"
565+
566+
567+
def test_htmltokenizer_get_text_bound_check():
514568
"""
515569
Get the textual content of a TextNode (i.e. everything until encountering
516570
"<").
@@ -521,6 +575,17 @@ def test_htmltokenizer_get_text():
521575
assert tok.char == "<"
522576

523577

578+
def test_htmltokenizer_get_text_eof():
    """
    When the input ends without encountering "<", all the remaining text is
    returned and the tokenizer's current character is the empty string.
    """
    raw = "Hello, world!"
    tok = polyplug.HTMLTokenizer(raw)
    assert tok.get_text() == "Hello, world!"
    assert tok.char == ""
587+
588+
524589
def test_htmltokenizer_get_text_until():
525590
"""
526591
Get the textual content of a TextNode (i.e. everything until encountering
@@ -571,3 +636,126 @@ def test_htmltokenizer_tokenize_comment():
571636
tok = polyplug.HTMLTokenizer(raw)
572637
tok.tokenize(parent)
573638
assert len(parent.childNodes) == 1
639+
assert isinstance(parent.childNodes[0], polyplug.CommentNode)
640+
assert parent.childNodes[0].nodeValue == " Test comment. "
641+
642+
643+
def test_htmltokenizer_tokenize_prolog():
    """
    Given a valid parent, an XML prolog node is ignored. <? foo ?>
    """
    parent = polyplug.ElementNode(tagName="div")
    raw = "<? xml ?>"
    tok = polyplug.HTMLTokenizer(raw)
    tok.tokenize(parent)
    assert len(parent.childNodes) == 0
652+
653+
654+
def test_htmltokenizer_tokenize_text():
    """
    Textual content becomes a child TextNode.
    """
    parent = polyplug.ElementNode(tagName="div")
    raw = "Test text."
    tok = polyplug.HTMLTokenizer(raw)
    tok.tokenize(parent)
    assert len(parent.childNodes) == 1
    assert isinstance(parent.childNodes[0], polyplug.TextNode)
    assert parent.childNodes[0].nodeValue == "Test text."
665+
666+
667+
def test_htmltokenizer_tokenize_element():
    """
    An HTML element becomes a child ElementNode, and the text it encloses
    becomes that element's own child TextNode.
    """
    parent = polyplug.ElementNode(tagName="div")
    raw = "<p>Hello</p>"
    tok = polyplug.HTMLTokenizer(raw)
    tok.tokenize(parent)
    assert len(parent.childNodes) == 1
    p_node = parent.childNodes[0]
    assert isinstance(p_node, polyplug.ElementNode)
    assert len(p_node.childNodes) == 1
    assert isinstance(p_node.childNodes[0], polyplug.TextNode)
    assert p_node.childNodes[0].nodeValue == "Hello"
681+
682+
683+
def test_htmltokenizer_tokenize_element_textarea():
    """
    An HTML element becomes a child ElementNode. But textarea nodes will only
    have a value (no children) containing the text within the tags.
    """
    parent = polyplug.ElementNode(tagName="div")
    raw = "<textarea>Hello <fake-html></textarea>"
    tok = polyplug.HTMLTokenizer(raw)
    tok.tokenize(parent)
    assert len(parent.childNodes) == 1
    textarea = parent.childNodes[0]
    assert isinstance(textarea, polyplug.ElementNode)
    assert len(textarea.childNodes) == 0
    assert textarea.value == "Hello <fake-html>"
697+
698+
699+
def test_htmltokenizer_tokenize_element_unexpected():
    """
    An unexpected close tag results in a ValueError.
    """
    parent = polyplug.ElementNode(tagName="div")
    # The close tag matches neither an open node nor the parent's tag.
    raw = "</textarea>"
    tok = polyplug.HTMLTokenizer(raw)
    with pytest.raises(ValueError):
        tok.tokenize(parent)
708+
709+
710+
def test_htmltokenizer_tokenize_complex_tree():
    """
    A more complex tree with several branches and node types: a comment
    followed by a div that holds two paragraphs of text.
    """
    parent = polyplug.ElementNode(tagName="div")
    raw = "<!-- comment --><div id='myId'><p>Hello</p><p>world</p></div>"
    expected = {
        "childNodes": [
            {
                "childNodes": [],
                "nodeName": "#comment",
                "nodeType": 8,
                "nodeValue": " comment ",
            },
            {
                "attributes": {"id": "myId"},
                "childNodes": [
                    {
                        "childNodes": [
                            {
                                "childNodes": [],
                                "nodeName": "#text",
                                "nodeType": 3,
                                "nodeValue": "Hello",
                            }
                        ],
                        "nodeType": 1,
                        "tagName": "p",
                    },
                    {
                        "childNodes": [
                            {
                                "childNodes": [],
                                "nodeName": "#text",
                                "nodeType": 3,
                                "nodeValue": "world",
                            }
                        ],
                        "nodeType": 1,
                        "tagName": "p",
                    },
                ],
                "nodeType": 1,
                "tagName": "div",
            },
        ],
        "nodeType": 1,
        "tagName": "div",
    }
    tok = polyplug.HTMLTokenizer(raw)
    tok.tokenize(parent)
    assert parent.as_dict == expected

0 commit comments

Comments
 (0)