Skip to content

Commit 2d46a58

Browse files
committed
chore: Refactored sql parser
1 parent 9b6de68 commit 2d46a58

File tree

3 files changed

+511
-87
lines changed

3 files changed

+511
-87
lines changed

lib/db/query-parsers/sql.js

+305-33
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,37 @@
55

66
'use strict'
77

8-
const logger = require('../../logger').child({ component: 'sql_query_parser' })
9-
const StatementMatcher = require('../statement-matcher')
8+
const defaultLogger = require('../../logger').child({ component: 'sql_query_parser' })
109
const stringify = require('json-stringify-safe')
1110

12-
const OPERATIONS = [
13-
new StatementMatcher(
14-
'select',
15-
/^[^\S]*?select\b[\s\S]+?\bfrom[\s\n\r\[\(]+([^\]\s\n\r,)(;]*)/gim
16-
),
17-
new StatementMatcher('update', /^[^\S]*?update[^\S]+?([^\s\n\r,;]+)/gim),
18-
new StatementMatcher(
19-
'insert',
20-
/^[^\S]*?insert(?:[^\S]+ignore)?[^\S]+into[^\S]+([^\s\n\r(,;]+)/gim
21-
),
22-
new StatementMatcher('delete', /^[^\S]*?delete[^\S]+?from[^\S]+([^\s\n\r,(;]+)/gim)
23-
]
24-
const COMMENT_PATTERN = /\/\\*.*?\\*\//g
25-
26-
// This must be called synchronously after the initial db call for backtraces to
27-
// work correctly
28-
29-
module.exports = function parseSql(sql) {
11+
/**
12+
* In a query like `select * from (select * from foo)`, extract the subquery
13+
* as the statement to retrieve the target identifier from.
14+
*
* The expression has no `g` flag, so only the first `from (select ...`
* occurrence is reported. The capture is lazy and therefore ends at the first
* closing parenthesis after `select`. Since `.` does not match line
* terminators in this expression, the subquery must sit on a single line for
* the match to succeed.
*
15+
* @type {RegExp}
16+
*/
17+
const selectSubquery = /from\s*?\((?<subquery>select.*?)\)\s*?/i
18+
19+
/**
20+
* Matches queries with leading common table expressions and assigns the
21+
* actual query to a match group named `query`.
22+
*
* The `with ...` prefix is consumed lazily, so it ends at the first `)` that
* is followed (after optional whitespace) by `insert`, `update`, `delete`, or
* `select`. Everything from that keyword to the end of the input, including
* line breaks, becomes the `query` group.
*
23+
* @type {RegExp}
24+
*/
25+
const cteMatcher = /^\s*?with[\w\W]*?\)\s*?(?<query>(?:insert|update|delete|select)[\w\W]*)/i
26+
27+
/**
28+
* Parses a SQL statement into the parts we want to report as metadata in
29+
* database transactions.
30+
*
31+
* @param {string} sql The statement to parse.
32+
* @param {object} [deps] A set of optional dependencies.
33+
* @param {object} [deps.logger] A logger instance.
34+
*
35+
* @returns {{query: string, collection: null|string, operation: string}} Parsed
36+
* metadata.
37+
*/
38+
module.exports = function parseSql(sql, { logger = defaultLogger } = {}) {
3039
// Sometimes we get an object here from MySQL. We have been unable to
3140
// reproduce it, so we'll just log what that object is and return a statement
3241
// type of `other`.
@@ -36,9 +45,9 @@ module.exports = function parseSql(sql) {
3645
if (typeof sql !== 'string') {
3746
if (logger.traceEnabled()) {
3847
try {
39-
logger.trace('parseSQL got an a non-string sql that looks like: %s', stringify(sql))
48+
logger.trace('parseSQL got a non-string sql that looks like: %s', stringify(sql))
4049
} catch (err) {
41-
logger.debug(err, 'Unabler to stringify SQL')
50+
logger.debug(err, 'Unable to stringify SQL')
4251
}
4352
}
4453
return {
@@ -48,24 +57,287 @@ module.exports = function parseSql(sql) {
4857
}
4958
}
5059

51-
sql = sql.replace(COMMENT_PATTERN, '').trim()
60+
sql = removeMultiLineComments(sql).trim()
61+
sql = removeSingleLineComments(sql).trim()
62+
let result = {
63+
operation: 'other',
64+
collection: null,
65+
query: sql
66+
}
67+
68+
// We want to remove the CTE _after_ assigning the statement to the result's
69+
// `query` property. Otherwise, the actual query will not be recorded in
70+
// the trace.
71+
sql = removeCte(sql)
72+
73+
// After all of our normalizing of the overall query, if it doesn't actually
74+
// look like an SQL statement, short-circuit the parsing routine.
75+
if (looksLikeValidSql(sql) === false) {
76+
return result
77+
}
78+
79+
const lines = sql.split('\n')
80+
result = { ...result, ...parseLines(lines) }
81+
result.query = sql.trim()
82+
83+
return result
84+
}
5285

53-
let parsedStatement
86+
/**
 * Scans the lines of an SQL statement for the first one that begins with a
 * recognized operation keyword, then extracts the statement metadata from
 * that line onward.
 *
 * This lives outside of `parseSql` because inlining it there would trip a
 * code complexity linting rule.
 *
 * @param {string[]} lines Set of SQL statement lines.
 *
 * @returns {{collection: null|string, operation: string}} SQL statement
 * metadata. When no line starts with a known keyword the operation is
 * `other` and the collection is `null`.
 */
function parseLines(lines) {
  const keywords = ['select', 'update', 'insert', 'delete']

  for (const [index, rawLine] of lines.entries()) {
    const normalized = rawLine.toLowerCase().trim()
    const operation = keywords.find((keyword) => normalized.startsWith(keyword))
    if (operation === undefined) {
      continue
    }

    // The statement proper is everything from the matching line to the end,
    // flattened onto a single line.
    const statement = lines.slice(index).join(' ')
    return { operation, collection: null, ...parseStatement(statement, operation) }
  }

  return { operation: 'other', collection: null }
}
138+
139+
/**
 * Strips every multi-line (slash-star ... star-slash) comment out of the
 * provided string.
 *
 * Each removed comment is replaced by a single space, and the text on either
 * side of it is trimmed before the two halves are rejoined. A comment that is
 * opened but never closed is treated as running to the end of the input, so
 * the comment and everything after it is dropped.
 *
 * @param {string} input The string to parse.
 *
 * @returns {string} Cleaned up string.
 */
function removeMultiLineComments(input) {
  let cleaned = input
  let startPos = cleaned.indexOf('/*')

  // Iterate instead of recursing so that a statement containing many comments
  // cannot exhaust the call stack.
  while (startPos !== -1) {
    const before = cleaned.slice(0, startPos).trim()
    const endPos = cleaned.indexOf('*/', startPos + 2)
    if (endPos === -1) {
      // Unterminated comment. Slicing with the -1 sentinel would re-create
      // the same input and never terminate, so stop here and keep only the
      // text that precedes the comment.
      return before
    }

    const after = cleaned.slice(endPos + 2).trim()
    cleaned = `${before} ${after}`
    startPos = cleaned.indexOf('/*')
  }

  return cleaned
}
158+
159+
/**
 * Removes all single line, and trailing, comments from the input query.
 * These are comments that start with `--` or `#`.
 *
 * A line that begins with a comment marker is dropped entirely. On any other
 * line, the text is cut at the earliest comment marker that is preceded by
 * whitespace, regardless of which of the two markers it is.
 *
 * The scan is purely textual: a marker that appears inside a quoted string
 * literal is treated as the start of a comment as well.
 *
 * @param {string} input The query that might contain comments.
 * @returns {string} The query without any comments.
 */
function removeSingleLineComments(input) {
  const wholeLineComment = /^(?:--|#)/
  const trailingComment = /\s(?:--|#)/
  const resultLines = []

  for (const line of input.split('\n')) {
    if (wholeLineComment.test(line)) {
      continue
    }

    // `exec` reports the left-most match, so when a line contains both
    // markers the cut happens at whichever one comes first.
    const found = trailingComment.exec(line)
    resultLines.push(found === null ? line : line.slice(0, found.index))
  }

  return resultLines.join('\n')
}
54191

55-
for (let i = 0, l = OPERATIONS.length; i < l; i++) {
56-
parsedStatement = OPERATIONS[i].getParsedStatement(sql)
57-
if (parsedStatement) {
192+
/**
 * Strips a leading common table expression (CTE) from the statement and
 * hands back the query that targets it. The metadata of interest lives in
 * that trailing query rather than in the CTE itself.
 *
 * @param {string} statement The SQL statement that might have a CTE.
 * @returns {string} The SQL statement without a leading CTE. The input is
 * returned untouched when it does not match the CTE pattern.
 */
function removeCte(statement) {
  const found = cteMatcher.exec(statement)
  return found === null ? statement : found.groups.query
}
207+
208+
/**
 * Tests the start of the statement to determine if it looks like a valid
 * SQL statement, i.e. whether, after optional leading whitespace, it begins
 * with `with`, `select`, `insert`, `update`, or `delete` in any letter case.
 *
 * @param {string} sql SQL statement with any comments stripped.
 *
 * @returns {boolean} True if the statement looks good. Otherwise, false.
 */
function looksLikeValidSql(sql) {
  // The `i` flag already makes the match case-insensitive, so there is no
  // need to build a lower-cased copy of the whole statement first.
  return /^\s*(?:with|select|insert|update|delete)/i.test(sql)
}
219+
220+
/**
221+
* Extracts the collection, database, and table information from an SQL
222+
* statement.
223+
*
224+
* @param {string} statement The SQL statement to parse.
225+
* @param {string} [kind] The type of SQL statement being parsed. This
226+
* dictates how the algorithm will determine where the desired fields are.
227+
* Valid values are: `insert`, `delete`, `select`, and `update`.
228+
*
229+
* @returns {{database: string, collection, table}} The found information.
230+
*/
231+
function parseStatement(statement, kind = 'insert') {
232+
// Keyword after which the target identifier is expected. It stays
// `undefined` when `kind` is not one of the four cases below.
let splitter
233+
switch (kind) {
234+
case 'insert': {
235+
splitter = /\s*?\binto\b\s*?/i
236+
break
237+
}
238+
239+
case 'delete': {
240+
splitter = /\s*?\bfrom\b\s*?/i
241+
break
242+
}
243+
244+
case 'select': {
245+
// When the statement selects from a parenthesized subquery, look for the
// target inside that subquery instead of the outer statement.
const subqueryMatch = selectSubquery.exec(statement)
246+
if (subqueryMatch !== null) {
247+
statement = subqueryMatch.groups.subquery
248+
}
249+
250+
if (/\bfrom\b/i.test(statement) === false) {
251+
// Statement does not specify a table. We don't need further processing.
252+
// E.g., we have a statement like `select 1 + 1 as added`.
253+
return { collection: 'unknown', table: 'unknown' }
254+
}
255+
256+
splitter = /\s*?\bfrom\b\s*?/i
257+
break
258+
}
259+
260+
case 'update': {
261+
splitter = /\s*?\bupdate\b\s*?/i
58262
break
59263
}
60264
}
61265

62-
if (parsedStatement) {
63-
return parsedStatement
266+
// Take the text after the last occurrence of the keyword (`pop`), trim it,
// and keep the first whitespace-delimited token as the target identifier.
// NOTE(review): because the last segment is used, a statement that repeats
// the keyword (e.g. a second `from` in a `where` clause) yields the token
// after the final occurrence -- confirm this is intended.
const targetIdentifier = statement.split(splitter).pop().trim().split(/\s/).shift()
267+
return parseTableIdentifier(targetIdentifier)
268+
}
269+
270+
/**
 * Breaks a table identifier into its database and table parts and derives
 * the collection name from them.
 *
 * When the identifier contains a `.` after its first character, the text
 * before the first `.` is the database and the next `.`-separated segment is
 * the table; any further segments are discarded. Otherwise the whole
 * identifier is treated as the table. One leading and one trailing quote
 * character (backtick, single quote, or double quote) is stripped from each
 * part, and the table is passed through `normalizeTableName`.
 *
 * @param {string} identifier The raw identifier token.
 *
 * @returns {{collection: string, database: (string|undefined), table: string}}
 * `collection` is `database.table` when a database was found, otherwise it
 * is the same as `table`.
 */
function parseTableIdentifier(identifier) {
271+
const leadingChars = /^[`'"]/
272+
const trailingChars = /[`'"]$/
273+
let collection
274+
let database
275+
let table
276+
277+
const separatorPos = identifier.indexOf('.')
278+
if (separatorPos > 0) {
279+
// The limit of 2 keeps only the first two `.`-separated segments.
const parts = identifier.split('.', 2)
280+
database = parts[0]
281+
table = parts[1]
282+
} else {
283+
// NOTE(review): this assignment is overwritten by the next line before it
// is ever read.
table = identifier.replace(leadingChars, '').replace(trailingChars, '')
284+
table = normalizeTableName(identifier)
64285
}
65286

66-
return {
67-
operation: 'other',
68-
collection: null,
69-
query: sql
287+
if (table !== undefined) {
288+
// Strip quotes and normalize (again, for the branch without a database).
table = table.replace(leadingChars, '').replace(trailingChars, '')
289+
table = normalizeTableName(table)
70290
}
291+
if (database !== undefined) {
292+
database = database.replace(leadingChars, '').replace(trailingChars, '')
293+
collection = `${database}.${table}`
294+
}
295+
// No database part: the collection is just the table name.
if (collection === undefined) {
296+
collection = table
297+
}
298+
299+
return { collection, database, table }
300+
}
301+
302+
/**
 * Cleans up something that is supposed to be a table name. The
 * cross-application tests include identifiers that are not valid table names
 * in any known SQL engine, yet they must be supported, so this function
 * applies a handful of lenient rules to recover a name from them.
 *
 * @param {string} tableIdentifier Something that _should_ represent a table
 * name.
 *
 * @returns {string} The normalized table name.
 */
function normalizeTableName(tableIdentifier) {
  // Some tests append non-standard characters to table names and expect
  // them to be stripped.
  const name = tableIdentifier.replace(/[;]/g, '')

  if (name.startsWith('(')) {
    // Possibly a subquery. A single word between the parentheses is reported
    // as the table name (even though that is not valid SQL); anything else
    // falls through to the rules below.
    const words = name.replace(/[()]/g, '').split(/\s/)
    if (words.length === 1) {
      return words[0]
    }
  }

  // `into foo(x,y)` is accepted and reported as "foo"; failing that,
  // `from foo,bar` is accepted and reported as "foo". A parenthesis after the
  // first character takes precedence over a comma.
  const parenPos = name.indexOf('(')
  const cutPos = parenPos > 0 ? parenPos : name.indexOf(',')
  return cutPos > -1 ? name.slice(0, cutPos) : name
}

0 commit comments

Comments
 (0)