5
5
6
6
'use strict'
7
7
8
- const logger = require ( '../../logger' ) . child ( { component : 'sql_query_parser' } )
9
- const StatementMatcher = require ( '../statement-matcher' )
8
+ const defaultLogger = require ( '../../logger' ) . child ( { component : 'sql_query_parser' } )
10
9
const stringify = require ( 'json-stringify-safe' )
11
10
12
- const OPERATIONS = [
13
- new StatementMatcher (
14
- 'select' ,
15
- / ^ [ ^ \S ] * ?s e l e c t \b [ \s \S ] + ?\b f r o m [ \s \n \r \[ \( ] + ( [ ^ \] \s \n \r , ) ( ; ] * ) / gim
16
- ) ,
17
- new StatementMatcher ( 'update' , / ^ [ ^ \S ] * ?u p d a t e [ ^ \S ] + ?( [ ^ \s \n \r , ; ] + ) / gim) ,
18
- new StatementMatcher (
19
- 'insert' ,
20
- / ^ [ ^ \S ] * ?i n s e r t (?: [ ^ \S ] + i g n o r e ) ? [ ^ \S ] + i n t o [ ^ \S ] + ( [ ^ \s \n \r ( , ; ] + ) / gim
21
- ) ,
22
- new StatementMatcher ( 'delete' , / ^ [ ^ \S ] * ?d e l e t e [ ^ \S ] + ?f r o m [ ^ \S ] + ( [ ^ \s \n \r , ( ; ] + ) / gim)
23
- ]
24
- const COMMENT_PATTERN = / \/ \\ * .* ?\\ * \/ / g
25
-
26
- // This must be called synchronously after the initial db call for backtraces to
27
- // work correctly
28
-
29
- module . exports = function parseSql ( sql ) {
11
+ /**
12
+ * In a query like `select * from (select * from foo)`, extract the subquery
13
+ * as the statement to retrieve the target identifier from.
14
+ *
15
+ * @type {RegExp }
16
+ */
17
+ const selectSubquery = / f r o m \s * ?\( (?< subquery > s e l e c t .* ?) \) \s * ?/ i
18
+
19
+ /**
20
+ * Matches queries with leading common table expressions and assigns the
21
+ * actual query to a match group named `query`.
22
+ *
23
+ * @type {RegExp }
24
+ */
25
+ const cteMatcher = / ^ \s * ?w i t h [ \w \W ] * ?\) \s * ?(?< query > (?: i n s e r t | u p d a t e | d e l e t e | s e l e c t ) [ \w \W ] * ) / i
26
+
27
+ /**
28
+ * Parses a SQL statement into the parts we want to report as metadata in
29
+ * database transactions.
30
+ *
31
+ * @param {string } sql The statement to parse.
32
+ * @param {object } [deps] A set of optional dependencies.
33
+ * @param {object } [deps.logger] A logger instance.
34
+ *
35
+ * @returns {{query: string, collection: null|string, operation: string} } Parsed
36
+ * metadata.
37
+ */
38
+ module . exports = function parseSql ( sql , { logger = defaultLogger } = { } ) {
30
39
// Sometimes we get an object here from MySQL. We have been unable to
31
40
// reproduce it, so we'll just log what that object is and return a statement
32
41
// type of `other`.
@@ -36,9 +45,9 @@ module.exports = function parseSql(sql) {
36
45
if ( typeof sql !== 'string' ) {
37
46
if ( logger . traceEnabled ( ) ) {
38
47
try {
39
- logger . trace ( 'parseSQL got an a non-string sql that looks like: %s' , stringify ( sql ) )
48
+ logger . trace ( 'parseSQL got a non-string sql that looks like: %s' , stringify ( sql ) )
40
49
} catch ( err ) {
41
- logger . debug ( err , 'Unabler to stringify SQL' )
50
+ logger . debug ( err , 'Unable to stringify SQL' )
42
51
}
43
52
}
44
53
return {
@@ -48,24 +57,287 @@ module.exports = function parseSql(sql) {
48
57
}
49
58
}
50
59
51
- sql = sql . replace ( COMMENT_PATTERN , '' ) . trim ( )
60
+ sql = removeMultiLineComments ( sql ) . trim ( )
61
+ sql = removeSingleLineComments ( sql ) . trim ( )
62
+ let result = {
63
+ operation : 'other' ,
64
+ collection : null ,
65
+ query : sql
66
+ }
67
+
68
+ // We want to remove the CTE _after_ assigning the statement to the result's
69
+ // `query` property. Otherwise, the actual query will not be recorded in
70
+ // the trace.
71
+ sql = removeCte ( sql )
72
+
73
+ // After all of our normalizing of the overall query, if it doesn't actually
74
+ // look like an SQL statement, short-circuit the parsing routine.
75
+ if ( looksLikeValidSql ( sql ) === false ) {
76
+ return result
77
+ }
78
+
79
+ const lines = sql . split ( '\n' )
80
+ result = { ...result , ...parseLines ( lines ) }
81
+ result . query = sql . trim ( )
82
+
83
+ return result
84
+ }
52
85
53
- let parsedStatement
86
/**
 * Scans the lines of an SQL statement for the first line that begins with a
 * supported operation keyword (`select`, `update`, `insert`, or `delete`) and
 * returns the metadata found from that line onward.
 *
 * The keyword must be a whole word: a line that starts with `selected_at` or
 * `deleted_at` is not treated as the start of a statement.
 *
 * This lives outside of `parseSql` because inlining it there would violate a
 * code complexity linting rule.
 *
 * @param {string[]} lines Set of SQL statement lines.
 *
 * @returns {{collection: null|string, operation: string}} SQL statement
 * metadata. When no line starts with a supported keyword the operation is
 * `other` and the collection is `null`.
 */
function parseLines(lines) {
  // Anchored and non-global, so it carries no `lastIndex` state between lines.
  const operationMatcher = /^(select|update|insert|delete)\b/

  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i].toLowerCase().trim()
    const match = operationMatcher.exec(line)
    if (match === null) {
      continue
    }

    const operation = match[1]
    // The statement may continue on the following lines, so everything from
    // the matching line onward is handed to the statement parser as one
    // space-joined string.
    const statement = lines.slice(i).join(' ')
    return { operation, collection: null, ...parseStatement(statement, operation) }
  }

  return { operation: 'other', collection: null }
}
138
+
139
/**
 * Removes all multi-line (`/* ... *\/`) comments from the provided string.
 *
 * Each removed comment is replaced by a single space, and the text on either
 * side of it is trimmed. A comment that is opened but never closed is treated
 * as running to the end of the input and is dropped along with everything
 * after it.
 *
 * @param {string} input The string to parse.
 *
 * @returns {string} Cleaned up string.
 */
function removeMultiLineComments(input) {
  let result = input
  let startPos = result.indexOf('/*')

  while (startPos !== -1) {
    const before = result.slice(0, startPos).trim()
    const endPos = result.indexOf('*/', startPos + 2)
    if (endPos === -1) {
      // Unterminated comment: there is nothing after it to keep. Bailing out
      // here also guarantees termination, since otherwise the opening marker
      // would never be consumed.
      return before
    }

    const after = result.slice(endPos + 2).trim()
    result = `${before} ${after}`
    // `before` cannot contain an opening marker (we stopped at the first
    // one), and the joining space prevents one from forming across the seam,
    // so the search can resume just past `before`.
    startPos = result.indexOf('/*', before.length)
  }

  return result
}
158
+
159
/**
 * Removes all single line, and trailing, comments from the input query.
 * These are comments that start with `--` or `#`.
 *
 * A line that begins with a comment marker is dropped entirely. Otherwise the
 * line is cut at the earliest ` --` or ` #` (marker preceded by a space), so
 * a line containing both kinds of marker loses everything from the first one
 * onward.
 *
 * @param {string} input The query that might contain comments.
 * @returns {string} The query without any comments.
 */
function removeSingleLineComments(input) {
  const resultLines = []

  for (const line of input.split('\n')) {
    if (/^(--|#)/.test(line) === true) {
      continue
    }

    // Collect the positions of whichever trailing markers are present and cut
    // at the smallest one; checking the markers in a fixed order would leave
    // an earlier comment of the other kind in place.
    const markerPositions = [line.indexOf(' --'), line.indexOf(' #')].filter((pos) => pos > -1)
    if (markerPositions.length === 0) {
      resultLines.push(line)
    } else {
      resultLines.push(line.slice(0, Math.min(...markerPositions)))
    }
  }

  return resultLines.join('\n')
}
54
191
55
- for ( let i = 0 , l = OPERATIONS . length ; i < l ; i ++ ) {
56
- parsedStatement = OPERATIONS [ i ] . getParsedStatement ( sql )
57
- if ( parsedStatement ) {
192
/**
 * Strips a leading common table expression (CTE) from the statement, leaving
 * only the query that targets the CTE. The metadata we want to report lives
 * in that trailing query, not in the CTE itself.
 *
 * @param {string} statement The SQL statement that might have a CTE.
 * @returns {string} The trailing query when a leading CTE is recognized;
 * otherwise the statement exactly as it was given.
 */
function removeCte(statement) {
  const found = cteMatcher.exec(statement)
  return found === null ? statement : found.groups.query
}
207
+
208
/**
 * Tests the start of the statement to determine if it looks like an SQL
 * statement we know how to parse, i.e. one that begins (after optional
 * whitespace) with the whole word `with`, `select`, `insert`, `update`, or
 * `delete`, in any letter case.
 *
 * @param {string} sql SQL statement with any comments stripped.
 *
 * @returns {boolean} True if the statement looks good. Otherwise, false.
 */
function looksLikeValidSql(sql) {
  // The `i` flag already makes the match case-insensitive, so the input does
  // not need to be lower-cased first. The trailing `\b` rejects words that
  // merely start with a keyword, such as `selection` or `withdraw`.
  return /^\s*(?:with|select|insert|update|delete)\b/i.test(sql)
}
219
+
220
/**
 * Finds the identifier an SQL statement targets and breaks it down into
 * collection, database, and table information.
 *
 * The statement is split on a keyword chosen by `kind` (`into` for inserts,
 * `from` for deletes and selects, `update` for updates); the first
 * whitespace-delimited token after the last occurrence of that keyword is
 * taken as the target identifier.
 *
 * @param {string} statement The SQL statement to parse.
 * @param {string} [kind] The type of SQL statement being parsed. This
 * dictates which keyword is used to locate the target identifier.
 * Valid values are: `insert`, `delete`, `select`, and `update`.
 *
 * @returns {{database: string, collection, table}} The found information.
 * For a `select` with no `from` clause only `collection` and `table` are
 * present, both set to `unknown`.
 */
function parseStatement(statement, kind = 'insert') {
  let target = statement
  let splitter

  if (kind === 'insert') {
    splitter = /\s*?\binto\b\s*?/i
  } else if (kind === 'delete') {
    splitter = /\s*?\bfrom\b\s*?/i
  } else if (kind === 'select') {
    // When selecting from a subquery, the subquery is the statement that
    // names the real target.
    const nested = selectSubquery.exec(target)
    if (nested !== null) {
      target = nested.groups.subquery
    }

    // No table is named at all, e.g. `select 1 + 1 as added`, so there is
    // nothing further to extract.
    const hasFromClause = /\bfrom\b/i.test(target)
    if (hasFromClause === false) {
      return { collection: 'unknown', table: 'unknown' }
    }

    splitter = /\s*?\bfrom\b\s*?/i
  } else if (kind === 'update') {
    splitter = /\s*?\bupdate\b\s*?/i
  }

  const segments = target.split(splitter)
  const afterKeyword = segments.pop().trim()
  const [targetIdentifier] = afterKeyword.split(/\s/)
  return parseTableIdentifier(targetIdentifier)
}
269
+
270
/**
 * Breaks a table identifier, optionally qualified as `database.table` and
 * optionally wrapped in quote characters, into its parts.
 *
 * @param {string} identifier The raw identifier token taken from a statement.
 *
 * @returns {{collection: string, database: string|undefined, table: string}}
 * `collection` is `database.table` when a database qualifier is present and
 * just the table name otherwise.
 */
function parseTableIdentifier(identifier) {
  // Removes at most one quote character (backtick, single, or double quote)
  // from each end of the value.
  const stripQuotes = (value) => value.replace(/^[`'"]/, '').replace(/[`'"]$/, '')

  let database
  let table

  // A separator at position 0 is not a database qualifier, hence `> 0`.
  if (identifier.indexOf('.') > 0) {
    const parts = identifier.split('.', 2)
    database = stripQuotes(parts[0])
    table = parts[1]
  } else {
    // Normalize once while the quotes are still in place: this trims any
    // trailing `(...)` or `,...` so that a closing quote becomes the last
    // character and can be stripped below.
    table = normalizeTableName(identifier)
  }

  table = normalizeTableName(stripQuotes(table))

  const collection = database === undefined ? table : `${database}.${table}`
  return { collection, database, table }
}
301
+
302
/**
 * Cleans up something that is supposed to be a table name. Our
 * cross-application tests include table names that are not valid syntax in
 * any known SQL engine, yet we still need to support them, so this function
 * inspects the identifier and tries to return the correct thing.
 *
 * @param {string} tableIdentifier Something that _should_ represent a table
 * name.
 *
 * @returns {string} The normalized table name.
 */
function normalizeTableName(tableIdentifier) {
  // Some tests add non-standard characters to table names and expect them to
  // be stripped.
  const cleaned = tableIdentifier.replace(/[;]/g, '')

  if (cleaned.startsWith('(')) {
    // Possibly a subquery. A single word between the parentheses is returned
    // as the table name (even though that is not valid SQL). Anything else
    // falls through to the checks below.
    const words = cleaned.replace(/[()]/g, '').split(/\s/)
    if (words.length === 1) {
      return words[0]
    }
  }

  const openParen = cleaned.indexOf('(')
  const comma = cleaned.indexOf(',')

  let end = cleaned.length
  if (openParen > 0) {
    // `into foo(x,y)` is accepted, with "foo" taken as the table name.
    end = openParen
  } else if (comma !== -1) {
    // `from foo,bar` is accepted, with "foo" taken as the table name.
    end = comma
  }

  return cleaned.slice(0, end)
}
0 commit comments