1
1
import json
2
2
3
+
3
4
def output(content):
    """
    Print the given content to stdout, wrapped in a JSON object whose
    "type" is "stdout".
    """
    message = {"type": "stdout", "content": content}
    print(json.dumps(message))
5
6
@@ -11,7 +12,7 @@ class Query:
11
12
* id - a unique element id (e.g. "myElementId").
12
13
* tag - a tag name (e.g. "p").
13
14
* classname - a class (e.g. "my-css-class").
14
- * css - a css selector (e.g "p.className").
15
+ * css - a css selector (e.g. "p.className").
15
16
16
17
These types of query relate to the four ways in which elements in an HTML
17
18
document may be identified.
@@ -28,7 +29,6 @@ class Query:
28
29
CSS = "css"
29
30
QUERY_TYPES = (ID , TAG , CLASSNAME , CSS )
30
31
31
-
32
32
def __init__ (self , ** kwargs ):
33
33
"""
34
34
Raise a ValueError if it's not the case that one, and only one of the
@@ -39,16 +39,16 @@ def __init__(self, **kwargs):
39
39
self .query_type = query_type [0 ]
40
40
setattr (self , self .query_type , kwargs [self .query_type ])
41
41
else :
42
- raise ValueError (
43
- "Bad query specification."
44
- )
42
+ raise ValueError ("Bad query specification." )
45
43
46
44
@property
def as_dict(self):
    """
    Return the single-entry dict (query type -> query value) that is to be
    JSON encoded.
    """
    key = self.query_type
    return {key: getattr(self, key)}
52
52
53
53
54
54
class DomEvent :
@@ -64,6 +64,215 @@ def __init__(self, event_type, target):
64
64
self .target = target
65
65
66
66
67
class HTMLTokenizer:
    """
    Turns a string into a structured representation of the DOM.

    MUST USE XHTML (i.e. open and closing tags).

    Scanner state: ``char`` is the current, not yet consumed, character (an
    empty string once the input is exhausted) and ``pos`` is the index in
    ``raw`` of the next character to be read.
    """

    QUOTES = "\" '"

    def __init__(self, raw):
        """
        Prime the scanner on the first character of the raw input string.
        """
        self.raw = raw
        self.len = len(raw)
        self.pos = 0
        self.char = self.next_char()

    def next_char(self):
        """
        Get the next character, or return an empty string for EOF or no raw
        input.
        """
        if self.pos < self.len:
            self.char = self.raw[self.pos]
            self.pos += 1
        else:
            self.char = ""
        return self.char

    def get_char(self):
        """
        Get the current character, and step to the next one.
        """
        result = self.char
        self.next_char()
        return result

    def skip_ws(self):
        """
        Skip over whitespace.
        """
        while self.char.isspace():
            self.next_char()

    def match(self, expected):
        """
        Skip whitespace, then return a boolean indicating if the current
        character is one of the expected characters. The character is
        consumed only when it matches.

        Expected could contain several alternatives (mainly used for single
        and double quotes). E.g. expected = "\\" '"
        """
        self.skip_ws()
        # At EOF self.char is "" and `"" in expected` is always True, so the
        # emptiness check is needed to avoid "matching" the end of input.
        if self.char and self.char in expected:
            self.next_char()
            return True
        return False

    def expect(self, expected):
        """
        Raise a ValueError if the expected character[s] is not matched.
        """
        if not self.match(expected):
            raise ValueError("Bad HTML syntax.")

    @staticmethod
    def _is_name_char(c):
        """
        Return True if c may appear in a tag or attribute name.
        """
        # bool(c) guards against EOF: "" is "in" every string.
        return bool(c) and (c.isalpha() or c.isdigit() or c in "_-.")

    def _expect_literal(self, literal):
        """
        Consume the exact character sequence literal (no whitespace is
        skipped), raising a ValueError if the input does not contain it at
        the current position.
        """
        for wanted in literal:
            if self.char != wanted:
                raise ValueError("Bad HTML syntax.")
            self.next_char()

    def get_name(self):
        """
        Get the name of a tag or attribute. Returns an empty string if there
        is no name at the current position (including at EOF).

        E.g. used to extract "div" and "id" from this fragment:

        <div id="foo">
        """
        self.skip_ws()
        chars = []
        while self._is_name_char(self.char):
            chars.append(self.get_char())
        return "".join(chars)

    def get_value(self):
        """
        Get the value associated with an attribute.

        E.g. used to extract the "foo" value (without quotes) from this
        fragment:

        <div id="foo">

        The value may contain any characters except the quote character that
        opened it. An empty string is returned (never an exception) when the
        attribute has no "=value" part, or when the value is not properly
        quoted.
        """
        if not self.match("="):
            # Attribute without a value, e.g. <input disabled>.
            return ""
        self.skip_ws()
        quote = self.char
        if not quote or quote not in self.QUOTES:
            return ""
        self.next_char()
        value = self.get_text(until=quote)
        if self.char != quote:
            # Unterminated value: the input ran out before the closing quote.
            return ""
        self.next_char()
        return value

    def get_attrs(self):
        """
        Return an Attributes instance representing any attributes attached to
        an ElementTag.
        """
        attrs = Attributes()
        name = self.get_name()
        while name:
            attrs[name] = self.get_value()
            name = self.get_name()
        return attrs

    def get_text(self, until="<"):
        """
        Return textual content from the current character up to (but not
        including) the next occurrence of until - by default the start of a
        new Node ("<").

        Afterwards the current character is the first character of until, so
        the caller can go on to consume the terminator. If until never
        occurs, all remaining input is returned and the tokenizer is left at
        EOF.
        """
        if not self.char:
            return ""
        # self.pos always points one past the current character.
        start = self.pos - 1
        end = self.raw.find(until, start)
        if end == -1:
            end = self.len
        self.pos = end
        self.next_char()
        return self.raw[start:end]

    def tokenize(self, parent=None):
        """
        Tokenize the raw HTML input and add the resulting nodes (built with
        the Node and Attributes classes defined below) to parent.

        The parent ElementNode is given since we're always parsing its
        innerHTML.

        Raises a ValueError if parent is not an ElementNode or if the input
        is not well formed (unknown syntax, mismatched, missing or unexpected
        tags, unterminated comment or textarea).
        """
        if not isinstance(parent, ElementNode):
            raise ValueError("Parent must be an ElementNode")
        # Top level nodes are only attached to parent once the whole input
        # has been parsed successfully.
        root_children = []
        # Stack of elements whose closing tag has not been seen yet.
        open_nodes = []

        def attach(node):
            # Add a finished node to the innermost open element, or to the
            # pending top level children if no element is open.
            if open_nodes:
                open_nodes[-1].add_child(node)
            else:
                root_children.append(node)

        while self.char:
            # NOTE: match() skips leading whitespace, so whitespace between
            # tags (and at the start of a run of text) is discarded.
            if not self.match("<"):
                # TextNode
                value = self.get_text()
                if value:
                    attach(TextNode(nodeValue=value))
                continue
            if self.match("/"):
                # End tag: it must close the innermost open element.
                if not open_nodes:
                    raise ValueError("Missing opening tag.")
                name = self.get_name()
                if name != open_nodes[-1].tagName:
                    raise ValueError("Unexpected close tag.", name)
                self.expect(">")
                # The element is attached only now, once all of its own
                # children have been added to it.
                closed = open_nodes.pop()
                attach(closed)
            elif self.match("?"):
                # XML prolog - consume and ignore.
                self.get_attrs()
                self.expect("?")
                self.expect(">")
            elif self.match("!"):
                # CommentNode - nodeValue is the text between the "<!--" and
                # "-->" delimiters.
                self.expect("-")
                self.expect("-")
                value = self.get_text(until="-->")
                # Raises if the comment is never terminated.
                self._expect_literal("-->")
                attach(CommentNode(nodeValue=value))
            else:
                # ElementNode
                tagName = self.get_name()
                if not tagName:
                    raise ValueError("Bad HTML syntax.")
                attrs = self.get_attrs()
                if self.match("/"):
                    # Self-closing element, e.g. <br />.
                    self.expect(">")
                    attach(ElementNode(tagName=tagName, attributes=attrs))
                    continue
                self.expect(">")
                if tagName == "textarea":
                    # A textarea has no child nodes, only a raw text value.
                    value = self.get_text(until="</textarea>")
                    self._expect_literal("</textarea>")
                    attach(
                        ElementNode(
                            tagName=tagName, attributes=attrs, value=value
                        )
                    )
                else:
                    open_nodes.append(
                        ElementNode(tagName=tagName, attributes=attrs)
                    )
        if open_nodes:
            raise ValueError("Missing closing tag.", open_nodes[-1].tagName)
        # Append "root" children to the parent node.
        for child in root_children:
            parent.add_child(child)
274
+
275
+
67
276
class Node :
68
277
"""
69
278
Represents a node in the DOM.
@@ -112,6 +321,18 @@ def __init__(self, **kwargs):
112
321
# The textarea doesn't have children. Only a text value.
113
322
self .value = kwargs .get ("value" , "" )
114
323
324
def add_child(self, child):
    """
    Append a child to this node's "childNodes" list.

    A Node instance is stored as its as_dict representation; any other
    value is stored as given. The stored entry gets a "parent" key that
    refers back to this node.
    """
    siblings = self._node.get("childNodes", [])
    entry = child.as_dict if isinstance(child, Node) else child
    # NOTE(review): indentation was lost in this view of the file; the parent
    # link is assumed to be set for Node and non-Node children alike - confirm.
    entry["parent"] = self
    siblings.append(entry)
    self._node["childNodes"] = siblings
335
+
115
336
@property
116
337
def childNodes (self ):
117
338
if self .tagName == "textarea" :
@@ -142,7 +363,7 @@ def as_dict(self):
142
363
result = {
143
364
"nodeType" : 1 ,
144
365
"tagName" : self .tagName ,
145
- "childNodes" : [child .as_dict for child in self .childNodes ]
366
+ "childNodes" : [child .as_dict for child in self .childNodes ],
146
367
}
147
368
if self .attributes :
148
369
result ["attributes" ] = self .attributes
@@ -166,7 +387,7 @@ def as_dict(self):
166
387
"nodeType" : 3 ,
167
388
"nodeName" : "#text" ,
168
389
"nodeValue" : self .nodeValue ,
169
- "childNodes" : []
390
+ "childNodes" : [],
170
391
}
171
392
172
393
@@ -185,7 +406,7 @@ def as_dict(self):
185
406
"nodeType" : 8 ,
186
407
"nodeName" : "#comment" ,
187
408
"nodeValue" : self .nodeValue ,
188
- "childNodes" : []
409
+ "childNodes" : [],
189
410
}
190
411
191
412
@@ -199,10 +420,7 @@ def __init__(self, **kwargs):
199
420
200
421
@property
201
422
def as_dict (self ):
202
- return {
203
- "nodeType" : 11 ,
204
- "childNodes" : []
205
- }
423
+ return {"nodeType" : 11 , "childNodes" : []}
206
424
207
425
208
426
def plug (query , event_type ):
0 commit comments