30
30
# may be bound to those statements (embedded in thin_impl.pyx).
31
31
# ------------------------------------------------------------------------------
32
32
33
- # Rules for named binds:
34
- # 1. Quoted and non-quoted bind names are allowed.
35
- # 2. Quoted binds can contain any characters.
36
- # 3. Non-quoted binds must begin with an alphabet character.
37
- # 4. Non-quoted binds can only contain alphanumeric characters, the underscore,
38
- # the dollar sign and the pound sign.
39
- # 5. Non-quoted binds cannot be Oracle Database Reserved Names (Server handles
40
- # this case and returns an appropriate error)
41
- BIND_PATTERN = r ' :\s * ( (?: ". *? ") | (?: [^ \W\d _ ][\w \$ # ]* ) | \d + ) '
42
-
43
- # pattern used for detecting a DML returning clause; bind variables in the
44
- # SQL prior to the INTO keyword are input variables; bind varibles in the SQL
45
- # after the INTO keyword are output variables
46
- DML_RETURNING_PATTERN = r ' (?si) (?<= \b RETURNING\b ) ( . *? ) (?= \b INTO\b ) '
47
-
48
- # patterns for identifying comments and quoted strings
49
- SINGLE_LINE_COMMENT_PATTERN = r ' --. * '
50
- MULTI_LINE_COMMENT_PATTERN = r ' (?s) /\* . *? \* /'
51
- CONSTANT_STRING_PATTERN = r " (?s) '. *? '"
52
- QUOTED_NAME_PATTERN = r ' ( :\s * ) ? ( ". *? ") '
53
-
54
33
cdef class BindInfo:
55
34
56
35
cdef:
@@ -75,6 +54,244 @@ cdef class BindInfo:
75
54
return BindInfo(self ._bind_name, self ._is_return_bind)
76
55
77
56
57
+ cdef class Parser:
58
+
59
+ cdef:
60
+ bint returning_keyword_found
61
+ ssize_t pos, max_pos
62
+ void * sql_data
63
+ int sql_kind
64
+
65
+ cdef int _parse_bind_name(self , Statement stmt) except - 1 :
66
+ """
67
+ Bind variables are identified as follows:
68
+ - Quoted and non-quoted bind names are allowed.
69
+ - Quoted bind names can contain any characters.
70
+ - Non-quoted bind names must begin with an alphabetic character.
71
+ - Non-quoted bind names can only contain alphanumeric characters, the
72
+ underscore, the dollar sign and the pound sign.
73
+ - Non-quoted bind names cannot be Oracle Database Reserved Names (this
74
+ is left to the server to detect and return an appropriate error).
75
+ """
76
+ cdef:
77
+ bint quoted_name = False , in_bind = False , digits_only = False
78
+ ssize_t start_pos = 0 , pos = self .pos + 1
79
+ str bind_name
80
+ Py_UCS4 ch
81
+ while pos <= self .max_pos:
82
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, pos)
83
+ if not in_bind:
84
+ if cpython.Py_UNICODE_ISSPACE(ch):
85
+ pos += 1
86
+ continue
87
+ elif ch == ' "' :
88
+ quoted_name = True
89
+ elif cpython.Py_UNICODE_ISDIGIT(ch):
90
+ digits_only = True
91
+ elif not cpython.Py_UNICODE_ISALPHA(ch):
92
+ break
93
+ in_bind = True
94
+ start_pos = pos
95
+ elif digits_only and not cpython.Py_UNICODE_ISDIGIT(ch):
96
+ self .pos = pos - 1
97
+ break
98
+ elif quoted_name and ch == ' "' :
99
+ self .pos = pos
100
+ break
101
+ elif not digits_only and not quoted_name \
102
+ and not cpython.Py_UNICODE_ISALNUM(ch) \
103
+ and ch not in (' _' , ' $' , ' #' ):
104
+ self .pos = pos - 1
105
+ break
106
+ pos += 1
107
+ if in_bind:
108
+ if quoted_name:
109
+ bind_name = stmt._sql[start_pos + 1 :pos]
110
+ elif digits_only:
111
+ bind_name = stmt._sql[start_pos:pos]
112
+ else :
113
+ bind_name = stmt._sql[start_pos:pos].upper()
114
+ stmt._add_bind(bind_name)
115
+
116
+ cdef int _parse_multiple_line_comment(self ) except - 1 :
117
+ """
118
+ Multiple line comments consist of the characters /* followed by all
119
+ characters up until */. This method is called when the first slash is
120
+ detected and checks for the subsequent asterisk. If found, the comment
121
+ is traversed and the current position is updated; otherwise, the
122
+ current position is left untouched.
123
+ """
124
+ cdef:
125
+ bint in_comment = False , exiting_comment = False
126
+ ssize_t pos = self .pos + 1
127
+ Py_UCS4 ch
128
+ while pos <= self .max_pos:
129
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, pos)
130
+ if not in_comment:
131
+ if ch != ' *' :
132
+ break
133
+ in_comment = True
134
+ elif not exiting_comment and ch == ' *' :
135
+ exiting_comment = True
136
+ elif exiting_comment:
137
+ if ch == ' /' :
138
+ self .pos = pos
139
+ break
140
+ exiting_comment = False
141
+ pos += 1
142
+
143
+ cdef int _parse_qstring(self ) except - 1 :
144
+ """
145
+ Parses a q-string which consists of the characters "q" and a single
146
+ quote followed by a start separator, any text that does not contain the
147
+ end separator and the end separator and ending quote. The following are
148
+ examples that demonstrate this:
149
+ - q'[...]'
150
+ - q'{...}'
151
+ - q'<...>'
152
+ - q'(...)'
153
+ - q'?...?' (where ? is any character)
154
+ """
155
+ cdef:
156
+ bint exiting_qstring = False , in_qstring = False
157
+ Py_UCS4 ch, sep = 0
158
+ self .pos += 1
159
+ while self .pos <= self .max_pos:
160
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, self .pos)
161
+ if not in_qstring:
162
+ if ch == ' [' :
163
+ sep = ' ]'
164
+ elif ch == ' {' :
165
+ sep = ' }'
166
+ elif ch == ' <' :
167
+ sep = ' >'
168
+ elif ch == ' (' :
169
+ sep = ' )'
170
+ else :
171
+ sep = ch
172
+ in_qstring = True
173
+ elif not exiting_qstring and ch == sep:
174
+ exiting_qstring = True
175
+ elif exiting_qstring:
176
+ if ch == " '" :
177
+ break
178
+ elif ch != sep:
179
+ exiting_qstring = False
180
+ self .pos += 1
181
+
182
+ cdef int _parse_quoted_string(self , Py_UCS4 sep) except - 1 :
183
+ """
184
+ Parses a quoted string with the given separator. All characters until
185
+ the separator is detected are discarded.
186
+ """
187
+ cdef Py_UCS4 ch
188
+ self .pos += 1
189
+ while self .pos <= self .max_pos:
190
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, self .pos)
191
+ if ch == sep:
192
+ break
193
+ self .pos += 1
194
+
195
+ cdef int _parse_single_line_comment(self ) except - 1 :
196
+ """
197
+ Single line comments consist of two dashes and all characters up to the
198
+ next line break. This method is called when the first dash is detected
199
+ and checks for the subsequent dash. If found, the single line comment
200
+ is traversed and the current position is updated; otherwise, the
201
+ current position is left untouched.
202
+ """
203
+ cdef:
204
+ ssize_t pos = self .pos + 1
205
+ bint in_comment = False
206
+ Py_UCS4 ch
207
+ while pos <= self .max_pos:
208
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, pos)
209
+ if not in_comment:
210
+ if ch != ' -' :
211
+ break
212
+ in_comment = True
213
+ elif cpython.Py_UNICODE_ISLINEBREAK(ch):
214
+ self .pos = pos
215
+ break
216
+ pos += 1
217
+
218
+ cdef int parse(self , Statement stmt) except - 1 :
219
+ """
220
+ Parses the SQL stored in the statement in order to determine the
221
+ keyword that identifies the type of SQL being executed as well as a
222
+ list of bind variable names. A check is also made for DML returning
223
+ statements since the bind variables following the "INTO" keyword are
224
+ treated differently from other bind variables.
225
+ """
226
+ cdef:
227
+ bint initial_keyword_found = False , last_was_string = False
228
+ Py_UCS4 ch, last_ch = 0 , alpha_start_ch = 0
229
+ ssize_t alpha_start_pos = 0 , alpha_len
230
+ bint last_was_alpha = False , is_alpha
231
+ str keyword
232
+
233
+ # initialization
234
+ self .pos = 0
235
+ self .max_pos = cpython.PyUnicode_GET_LENGTH(stmt._sql) - 1
236
+ self .sql_kind = cpython.PyUnicode_KIND(stmt._sql)
237
+ self .sql_data = cpython.PyUnicode_DATA(stmt._sql)
238
+
239
+ # scan all characters in the string
240
+ while self .pos <= self .max_pos:
241
+ ch = cpython.PyUnicode_READ(self .sql_kind, self .sql_data, self .pos)
242
+
243
+ # look for certain keywords (initial keyword and the ones for
244
+ # detecting DML returning statements)
245
+ is_alpha = cpython.Py_UNICODE_ISALPHA(ch)
246
+ if is_alpha and not last_was_alpha:
247
+ alpha_start_pos = self .pos
248
+ alpha_start_ch = ch
249
+ elif not is_alpha and last_was_alpha:
250
+ alpha_len = self .pos - alpha_start_pos
251
+ if not initial_keyword_found:
252
+ keyword = stmt._sql[alpha_start_pos:self .pos].upper()
253
+ stmt._determine_statement_type(keyword)
254
+ if stmt._is_ddl:
255
+ break
256
+ initial_keyword_found = True
257
+ elif stmt._is_dml and not self .returning_keyword_found \
258
+ and alpha_len == 9 and alpha_start_ch in (' r' , ' R' ):
259
+ keyword = stmt._sql[alpha_start_pos:self .pos].upper()
260
+ if keyword == " RETURNING" :
261
+ self .returning_keyword_found = True
262
+ elif self .returning_keyword_found and alpha_len == 4 \
263
+ and alpha_start_ch in (' i' , ' I' ):
264
+ keyword = stmt._sql[alpha_start_pos:self .pos].upper()
265
+ if keyword == " INTO" :
266
+ stmt._is_returning = True
267
+
268
+ # need to keep track of whether the last token parsed was a string
269
+ # (excluding whitespace) as if the last token parsed was a string
270
+ # a following colon is not a bind variable but a part of the JSON
271
+ # constant syntax
272
+ if ch == " '" :
273
+ last_was_string = True
274
+ if last_ch in (' q' , ' Q' ):
275
+ self ._parse_qstring()
276
+ else :
277
+ self ._parse_quoted_string(ch)
278
+ elif not cpython.Py_UNICODE_ISSPACE(ch):
279
+ if ch == ' -' :
280
+ self ._parse_single_line_comment()
281
+ elif ch == ' /' :
282
+ self ._parse_multiple_line_comment()
283
+ elif ch == ' "' :
284
+ self ._parse_quoted_string(ch)
285
+ elif ch == ' :' and not last_was_string:
286
+ self ._parse_bind_name(stmt)
287
+ last_was_string = False
288
+
289
+ # advance to next character and track previous character
290
+ self .pos += 1
291
+ last_was_alpha = is_alpha
292
+ last_ch = ch
293
+
294
+
78
295
cdef class Statement:
79
296
80
297
cdef:
@@ -126,94 +343,53 @@ cdef class Statement:
126
343
copied_statement._return_to_cache = False
127
344
return copied_statement
128
345
129
- cdef int _add_binds (self , str sql, bint is_return_bind ) except - 1 :
346
+ cdef int _add_bind (self , str name ) except - 1 :
130
347
"""
131
348
Add bind information to the statement for the passed bind variable
132
349
name.
133
350
"""
134
- cdef:
135
- BindInfo info
136
- str name
137
- for name in re.findall(BIND_PATTERN, sql):
138
- if name.startswith(' "' ) and name.endswith(' "' ):
139
- name = name[1 :- 1 ]
140
- else :
141
- name = name.upper()
142
- if self ._is_plsql and name in self ._bind_info_dict:
143
- continue
144
- info = BindInfo(name, is_return_bind)
351
+ cdef BindInfo info
352
+ if not self ._is_plsql or name not in self ._bind_info_dict:
353
+ info = BindInfo(name, self ._is_returning)
145
354
self ._bind_info_list.append(info)
146
355
if info._bind_name in self ._bind_info_dict:
147
356
self ._bind_info_dict[info._bind_name].append(info)
148
357
else :
149
358
self ._bind_info_dict[info._bind_name] = [info]
150
359
151
- cdef _determine_statement_type(self , str sql ):
360
+ cdef _determine_statement_type(self , str sql_keyword ):
152
361
"""
153
362
Determine the type of the SQL statement by examining the first keyword
154
363
found in the statement.
155
364
"""
156
- tokens = sql.strip().lstrip(" (" )[:10 ].split()
157
- if tokens:
158
- sql_keyword = tokens[0 ].upper()
159
- if sql_keyword in (" DECLARE" , " BEGIN" , " CALL" ):
160
- self ._is_plsql = True
161
- elif sql_keyword in (" SELECT " , " WITH" ):
162
- self ._is_query = True
163
- elif sql_keyword in (" INSERT" , " UPDATE " , " DELETE " , " MERGE" ):
164
- self ._is_dml = True
165
- elif sql_keyword in (" CREATE" , " ALTER" , " DROP" , " TRUNCATE" ):
166
- self ._is_ddl = True
365
+ if sql_keyword in (" DECLARE" , " BEGIN" , " CALL" ):
366
+ self ._is_plsql = True
367
+ elif sql_keyword in (" SELECT " , " WITH" ):
368
+ self ._is_query = True
369
+ elif sql_keyword in (" INSERT" , " UPDATE " , " DELETE " , " MERGE" ):
370
+ self ._is_dml = True
371
+ elif sql_keyword in (" CREATE" , " ALTER" , " DROP" , " GRANT" , " REVOKE" ,
372
+ " ANALYZE" , " AUDIT" , " COMMENT" , " TRUNCATE" ):
373
+ self ._is_ddl = True
167
374
168
375
cdef int _prepare(self , str sql) except - 1 :
169
376
"""
170
377
Prepare the SQL for execution by determining the list of bind names
171
378
that are found within it. The length of the SQL text is also calculated
172
- at this time. If the character sets of the client and server are
173
- identical, the length is calculated in bytes; otherwise, the length is
174
- calculated in characters.
379
+ at this time.
175
380
"""
176
- cdef:
177
- str input_sql, returning_sql = None
178
- object match
381
+ cdef Parser parser = Parser.__new__ (Parser)
179
382
180
383
# retain normalized SQL (as string and bytes) as well as the length
181
384
self ._sql = sql
182
385
self ._sql_bytes = self ._sql.encode()
183
386
self ._sql_length = < uint32_t> len (self ._sql_bytes)
184
387
185
- # create empty list (bind by position) and dict (bind by name)
388
+ # parse SQL and populate bind variable list (bind by position) and dict
389
+ # (bind by name)
186
390
self ._bind_info_dict = collections.OrderedDict()
187
391
self ._bind_info_list = []
188
-
189
- # Strip single/multiline comments and replace constant strings and
190
- # quoted names with single characters in order to facilitate detection
191
- # of bind variables; note that bind variables can be quoted so a check
192
- # must be made to ensure that a quoted string doesn't refer to a bind
193
- # variable first before it can be replaced
194
- sql = re.sub(MULTI_LINE_COMMENT_PATTERN, " " , sql)
195
- sql = re.sub(SINGLE_LINE_COMMENT_PATTERN, " " , sql)
196
- sql = re.sub(CONSTANT_STRING_PATTERN, " S" , sql)
197
- sql = re.sub(QUOTED_NAME_PATTERN,
198
- lambda m : m.group(0 ) if sql[m.start(0 )] == " :" else " Q" ,
199
- sql)
200
-
201
- # determine statement type
202
- self ._determine_statement_type(sql)
203
-
204
- # bind variables can only be found in queries, DML and PL/SQL
205
- if self ._is_query or self ._is_dml or self ._is_plsql:
206
- input_sql = sql
207
- if self ._is_dml:
208
- match = re.search(DML_RETURNING_PATTERN, sql)
209
- if match is not None :
210
- pos = match.end()
211
- input_sql = sql[:pos]
212
- returning_sql = sql[pos + 4 :]
213
- self ._add_binds(input_sql, is_return_bind = False )
214
- if returning_sql is not None :
215
- self ._is_returning = True
216
- self ._add_binds(returning_sql, is_return_bind = True )
392
+ parser.parse(self )
217
393
218
394
cdef int _set_var(self , BindInfo bind_info, ThinVarImpl var_impl,
219
395
ThinCursorImpl cursor_impl) except - 1 :
0 commit comments