Skip to content
This repository was archived by the owner on Oct 1, 2024. It is now read-only.

Commit bd5b830

Browse files
committed
Work in progress on HTMLTokenizer class (almost done)... to allow setting of innerHTML as raw HTML string.
1 parent 2d701c4 commit bd5b830

File tree

2 files changed

+470
-17
lines changed

2 files changed

+470
-17
lines changed

polyplug.py

Lines changed: 231 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22

3+
34
def output(content):
    """
    Print the given content to stdout, wrapped in a JSON encoded message
    whose "type" is "stdout".
    """
    message = {"type": "stdout", "content": content}
    print(json.dumps(message))
56

@@ -11,7 +12,7 @@ class Query:
1112
* id - a unique element id (e.g. "myElementId").
1213
* tag - a tag name (e.g. "p").
1314
* classname - a class (e.g. "my-css-class").
14-
* css - a css selector (e.g "p.className").
15+
* css - a css selector (e.g "p.className").
1516
1617
These types of query relate to the four ways in which elements in an HTML
1718
document may be identified.
@@ -28,7 +29,6 @@ class Query:
2829
CSS = "css"
2930
QUERY_TYPES = (ID, TAG, CLASSNAME, CSS)
3031

31-
3232
def __init__(self, **kwargs):
3333
"""
3434
Raise a ValueError if it's not the case that one, and only one of the
@@ -39,16 +39,16 @@ def __init__(self, **kwargs):
3939
self.query_type = query_type[0]
4040
setattr(self, self.query_type, kwargs[self.query_type])
4141
else:
42-
raise ValueError(
43-
"Bad query specification."
44-
)
42+
raise ValueError("Bad query specification.")
4543

4644
@property
4745
def as_dict(self):
4846
"""
4947
Return a dict to be JSON encoded.
5048
"""
51-
return {self.query_type: getattr(self, self.query_type), }
49+
return {
50+
self.query_type: getattr(self, self.query_type),
51+
}
5252

5353

5454
class DomEvent:
@@ -64,6 +64,215 @@ def __init__(self, event_type, target):
6464
self.target = target
6565

6666

67+
class HTMLTokenizer:
    """
    Turns a string into a structured representation of the DOM.

    MUST USE XHTML (i.e. open and closing tags).

    The tokenizer keeps a single character of look-ahead: ``char`` is the
    character currently under the cursor (an empty string once the input is
    exhausted) and ``pos`` is the index of the character to be read next.
    """

    QUOTES = "\"'"

    def __init__(self, raw):
        """
        Store the raw HTML and load its first character as the look-ahead.
        """
        self.raw = raw
        self.len = len(raw)
        self.pos = 0
        self.char = self.next_char()

    def _offset(self):
        """
        Return the index within the raw input of the current character (or
        the length of the input when at EOF).
        """
        return self.pos - 1 if self.char else self.len

    def _seek(self, offset):
        """
        Move the cursor so the character at the given index becomes the
        current character.
        """
        self.pos = offset
        self.next_char()

    def _consume(self, literal):
        """
        Consume the exact literal string found at the cursor. Raise a
        ValueError if the input at the cursor is anything else.
        """
        start = self._offset()
        if not self.raw.startswith(literal, start):
            raise ValueError("Bad HTML syntax.")
        self._seek(start + len(literal))

    @staticmethod
    def _is_name_char(c):
        """
        Return True if c may appear in a tag or attribute name. The empty
        string (EOF) is never a name character.
        """
        return bool(c) and (c.isalpha() or c.isdigit() or c in "_-.")

    def next_char(self):
        """
        Get the next character, or return an empty string for EOF or no raw
        input.
        """
        if self.pos < self.len:
            self.char = self.raw[self.pos]
            self.pos += 1
        else:
            self.char = ""
        return self.char

    def get_char(self):
        """
        Get the current character, and step to the next one.
        """
        result = self.char
        self.next_char()
        return result

    def skip_ws(self):
        """
        Skip over whitespace.
        """
        while self.char.isspace():
            self.next_char()

    def match(self, expected):
        """
        Return a boolean indication if the next character[s] is the expected
        character. Leading whitespace is skipped first, and a matched
        character is consumed.

        Expected could be a multi-character match (mainly used for single and
        double quotes). E.g. expected = "\"'"

        Nothing matches at EOF.
        """
        self.skip_ws()
        # The explicit truthiness check matters: "" is "in" every string, so
        # without it EOF would match anything.
        if self.char and self.char in expected:
            self.next_char()
            return True
        return False

    def expect(self, expected):
        """
        Raise an exception if the expected character[s] is not matched.
        """
        if not self.match(expected):
            raise ValueError("Bad HTML syntax.")

    def get_name(self):
        """
        Get the name of a tag or attribute. Returns an empty string if there
        is no name at the cursor (including at EOF).

        E.g. used to extract "div" and "id" from this fragment:

        <div id="foo">
        """
        self.skip_ws()
        chars = []
        while self._is_name_char(self.char):
            chars.append(self.get_char())
        return "".join(chars)

    def get_value(self):
        """
        Get the value associated with an attribute.

        E.g. used to extract the "foo" value (without quotes) from this
        fragment:

        <div id="foo">

        The value is everything between the opening quote and the next
        occurrence of the same quote character, so it may contain spaces,
        punctuation and the other kind of quote.

        Returns an empty string if the attribute has no value (i.e. there is
        no "=" at the cursor). Raises a ValueError if an "=" is not followed
        by a properly quoted value.
        """
        if not self.match("="):
            # Attribute without a value, e.g. <input disabled>.
            return ""
        self.skip_ws()
        quote = self.char
        if not quote or quote not in self.QUOTES:
            raise ValueError("Bad HTML syntax.")
        start = self._offset() + 1
        end = self.raw.find(quote, start)
        if end == -1:
            # Unterminated attribute value.
            raise ValueError("Bad HTML syntax.")
        # Step past the closing quote.
        self._seek(end + 1)
        return self.raw[start:end]

    def get_attrs(self):
        """
        Return an Attributes instance representing any attributes attached to
        an ElementTag.
        """
        attrs = Attributes()
        name = self.get_name()
        while name:
            attrs[name] = self.get_value()
            name = self.get_name()
        return attrs

    def get_text(self, until="<"):
        """
        Return textual content until the start of a new Node ("<") or
        until matches.

        The until string is not included in the result and is not consumed:
        afterwards the cursor sits on its first character. If until never
        occurs, the remainder of the input is returned and the cursor is left
        at EOF.
        """
        start = self._offset()
        end = self.raw.find(until, start)
        if end == -1:
            end = self.len
        self._seek(end)
        return self.raw[start:end]

    def _read_comment(self):
        """
        Read a comment whose leading "<!" has already been consumed and
        return it as a CommentNode. The nodeValue excludes the comment
        delimiters. Raise a ValueError if the comment is malformed or never
        closed.
        """
        self.expect("-")
        self.expect("-")
        value = self.get_text(until="-->")
        self._consume("-->")
        return CommentNode(nodeValue=value)

    def _open_element(self, open_elements):
        """
        Read an opening tag whose leading "<" has already been consumed.

        Self-closing tags (<br/>) and textareas (whose content is plain text
        rather than child nodes) are complete as soon as they are read, so
        they are added to the children of the innermost open element. Any
        other element is pushed onto the stack of open elements to await its
        closing tag.
        """
        tag_name = self.get_name()
        if not tag_name:
            raise ValueError("Bad HTML syntax.")
        attrs = self.get_attrs()
        siblings = open_elements[-1][1]
        if self.match("/"):
            self.expect(">")
            siblings.append(ElementNode(tagName=tag_name, attributes=attrs))
            return
        self.expect(">")
        if tag_name == "textarea":
            value = self.get_text(until="</textarea>")
            self._consume("</textarea>")
            siblings.append(
                ElementNode(tagName=tag_name, attributes=attrs, value=value)
            )
        else:
            open_elements.append(
                (ElementNode(tagName=tag_name, attributes=attrs), [])
            )

    def _close_element(self, open_elements):
        """
        Read a closing tag whose leading "</" has already been consumed, and
        close the innermost open element.

        The element's children are attached to it before the element itself
        is handed to its own parent, so the tree is assembled bottom-up.
        """
        if len(open_elements) < 2:
            # Only the root parent remains: there is nothing to close.
            raise ValueError("Missing opening tag.")
        name = self.get_name()
        element, children = open_elements[-1]
        if name != element.tagName:
            raise ValueError("Unexpected close tag.", name)
        self.expect(">")
        open_elements.pop()
        for child in children:
            element.add_child(child)
        open_elements[-1][1].append(element)

    def tokenize(self, parent=None):
        """
        Tokenize the raw HTML input and add the resulting DOM representation
        (built from the Node and Attributes classes defined below) to the
        parent as its children.

        The parent ElementNode is given since we're always parsing its
        innerHTML.

        Whitespace immediately before a tag or at the start of a run of text
        is skipped. An XML prolog (<? ... ?>) is consumed and ignored.

        Raises a ValueError if the parent is not an ElementNode or if the
        input is malformed (bad syntax, mismatched or missing tags). The
        parent is only modified once the whole input has been successfully
        read.
        """
        if not isinstance(parent, ElementNode):
            raise ValueError("Parent must be an ElementNode")
        # Stack of (element, children found so far) pairs, innermost last.
        # The first entry is the parent whose innerHTML is being parsed.
        open_elements = [(parent, [])]
        while self.char:
            if not self.match("<"):
                # TextNode
                value = self.get_text()
                # Only trailing whitespace before EOF yields empty text.
                if value:
                    open_elements[-1][1].append(TextNode(nodeValue=value))
            elif self.match("/"):
                # End tag.
                self._close_element(open_elements)
            elif self.match("?"):
                # XML prolog - consume and ignore.
                self.get_attrs()
                self.expect("?")
                self.expect(">")
            elif self.match("!"):
                # CommentNode
                open_elements[-1][1].append(self._read_comment())
            else:
                # ElementNode
                self._open_element(open_elements)
        if len(open_elements) > 1:
            raise ValueError(
                "Missing closing tag.", open_elements[-1][0].tagName
            )
        # Append "root" children to the parent node.
        for child in open_elements[0][1]:
            parent.add_child(child)
274+
275+
67276
class Node:
68277
"""
69278
Represents a node in the DOM.
@@ -112,6 +321,18 @@ def __init__(self, **kwargs):
112321
# The textarea doesn't have children. Only a text value.
113322
self.value = kwargs.get("value", "")
114323

324+
def add_child(self, child):
    """
    Append a child to this node's "childNodes".

    A Node instance is stored as its as_dict representation; anything else
    is stored as given. The stored entry has its "parent" key set to this
    node.
    """
    entry = child.as_dict if isinstance(child, Node) else child
    entry["parent"] = self
    siblings = self._node.get("childNodes", [])
    siblings.append(entry)
    self._node["childNodes"] = siblings
335+
115336
@property
116337
def childNodes(self):
117338
if self.tagName == "textarea":
@@ -142,7 +363,7 @@ def as_dict(self):
142363
result = {
143364
"nodeType": 1,
144365
"tagName": self.tagName,
145-
"childNodes": [child.as_dict for child in self.childNodes]
366+
"childNodes": [child.as_dict for child in self.childNodes],
146367
}
147368
if self.attributes:
148369
result["attributes"] = self.attributes
@@ -166,7 +387,7 @@ def as_dict(self):
166387
"nodeType": 3,
167388
"nodeName": "#text",
168389
"nodeValue": self.nodeValue,
169-
"childNodes": []
390+
"childNodes": [],
170391
}
171392

172393

@@ -185,7 +406,7 @@ def as_dict(self):
185406
"nodeType": 8,
186407
"nodeName": "#comment",
187408
"nodeValue": self.nodeValue,
188-
"childNodes": []
409+
"childNodes": [],
189410
}
190411

191412

@@ -199,10 +420,7 @@ def __init__(self, **kwargs):
199420

200421
@property
201422
def as_dict(self):
202-
return {
203-
"nodeType": 11,
204-
"childNodes": []
205-
}
423+
return {"nodeType": 11, "childNodes": []}
206424

207425

208426
def plug(query, event_type):

0 commit comments

Comments
 (0)