Fix/query complexity (#15)

* Fix operators calculation
* Fix spaghetti query thresholds
* Fix testcase
* Linting
* Fix test case fail
leonardomathon · May 17, 2022 · d1ec895 · d1ec895
1 parent c1fb479
commit d1ec895
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 25 deletions.
diff --git a/sqleyes/detector/antipatterns/spaghetti_query.py b/sqleyes/detector/antipatterns/spaghetti_query.py
@@ -15,22 +15,22 @@ def __init__(self, query):
         super().__init__(query)
 
     def check(self):
-        LOW_THRESHOLD = 60
-        MEDIUM_THRESHOLD = 75
-        HIGH_THRESHOLD = 90
+        LOW_THRESHOLD = 2.5
+        MEDIUM_THRESHOLD = 4
+        HIGH_THRESHOLD = 5.5
 
         query_complexity = get_query_complexity(self.query)
 
         if query_complexity < LOW_THRESHOLD:
             return None
 
-        if LOW_THRESHOLD < query_complexity < MEDIUM_THRESHOLD:
+        if LOW_THRESHOLD <= query_complexity < MEDIUM_THRESHOLD:
             certainty = "low"
 
-        if MEDIUM_THRESHOLD < query_complexity < HIGH_THRESHOLD:
+        if MEDIUM_THRESHOLD <= query_complexity < HIGH_THRESHOLD:
             certainty = "medium"
 
-        if HIGH_THRESHOLD < query_complexity:
+        if HIGH_THRESHOLD <= query_complexity:
             certainty = "high"
 
         return DetectorOutput(query=self.query,

diff --git a/sqleyes/utils/code_complexity_metrics.py b/sqleyes/utils/code_complexity_metrics.py
@@ -29,7 +29,7 @@ def halstead_metrics(n1: int, n2: int, N1: int, N2: int):
     V = N * math.log2(n)
 
     # Difficulty
-    D = n1/2 * N2/n2
+    D = (n1/2) * (N2/n2)
 
     # Effort
     E = D * V

diff --git a/sqleyes/utils/query_functions.py b/sqleyes/utils/query_functions.py
@@ -8,14 +8,15 @@
 from sqleyes.utils.query_keywords import SQL_FUNCTIONS
 
 
-OPERATORS = ["+", "-", "*", "**", "/", "%", "&", "|", "||", "^", "=", ">", "<",
-             ">=", "<=", "!<", "!>", "<>", "+=", "-=", "/=", "/=", "%=", "&=",
-             "^-=", "|*=", "ALL", "AND", "&&", "ANY", "BETWEEN", "EXISTS",
-             "IN", "LIKE", "NOT", "OR", "SOME", "IS NULL", "IS NOT NULL",
-             "UNIQUE"]
+OPERATORS = ["+", "-", "*", "**", "/", "%", "&", "|", "||", "^", "=", "!=",
+             ">", "<", ">=", "<=", "!<", "!>", "<>", "+=", "-=", "/=", "/=",
+             "%=", "&=", "^-=", "|*=", "ALL", "AND", "&&", "ANY", "BETWEEN",
+             "EXISTS", "IN", "LIKE", "NOT", "OR", "SOME", "IS NULL",
+             "IS NOT NULL", "UNIQUE"]
 
 EXPRESSIONS = ["CASE", "DECODE", "IF", "NULLIF", "COALESCE", "GREATEST",
-               "GREATER", "LEAST", "LESSER", "CAST"]
+               "GREATER", "LEAST", "LESSER", "CAST", "JOIN", "GROUP BY",
+               "WHERE", "HAVING", "ORDER BY", "UNION", "EXCEPT"]
 
 
 def get_subqueries(parsed_query: sqlparse.sql.Statement) -> Tuple[str, List[str]]:
@@ -290,16 +291,56 @@ def get_query_ops_and_expr(query: str) -> List[str]:
     """
     result = []
 
-    for operator in OPERATORS:
-        count = query.count(operator)
-        if count != 0:
-            result.extend([operator] * count)
-
-    # Get all expressions used in the query
-    for expression in EXPRESSIONS:
-        count = query.count(expression)
-        if count != 0:
-            result.extend([expression] * count)
+    # Format the query
+    query = sqlparse.format(query, keyword_case="upper")
+
+    # Split the query
+    query_tokens = query.split()
+
+    # Repair the whitespace split: re-join adjacent tokens that form one unit
+    # Merges ["SELECT", "*"] into ["SELECT *"]
+    # Merges ["IS", "NULL"] into ["IS NULL"]
+    # Merges ["ORDER", "BY"] into ["ORDER BY"]
+    # Merges ["GROUP", "BY"] into ["GROUP BY"]
+    i, query_len = 0, len(query_tokens)
+    while i < query_len - 1:
+        merge_current_cel = False
+        if "SELECT" in query_tokens[i] and query_tokens[i + 1] == "*":
+            merge_current_cel = True
+        elif query_tokens[i] == "IS" and query_tokens[i + 1] == "NULL":
+            merge_current_cel = True
+        elif query_tokens[i] == "ORDER" and query_tokens[i + 1] == "BY":
+            merge_current_cel = True
+        elif query_tokens[i] == "GROUP" and query_tokens[i + 1] == "BY":
+            merge_current_cel = True
+
+        # Merge two cells into one
+        # Adjust overall query length since we have one less cell
+        if merge_current_cel:
+            query_tokens[i:i + 2] = [' '.join(query_tokens[i:i + 2])]
+            query_len -= 1
+
+        i += 1
+
+    # Second repair pass: re-join the three-token operator
+    # Merges ["IS", "NOT", "NULL"] into ["IS NOT NULL"]
+    i, query_len = 0, len(query_tokens)
+    while i < query_len - 1:
+        if (query_tokens[i] == "IS" and query_tokens[i + 1] == "NOT" and
+                query_tokens[i + 2] == "NULL"):
+            query_tokens[i:i + 3] = [' '.join(query_tokens[i: i + 3])]
+            query_len -= 2
+        i += 1
+
+    # Check whether each element is an operator or an expression
+    for elem in query_tokens:
+        for operator in OPERATORS:
+            if elem == operator:
+                result.append(operator)
+
+        for expression in EXPRESSIONS:
+            if elem == expression:
+                result.append(expression)
 
     return result
 
@@ -332,7 +373,9 @@ def get_query_complexity(query: str) -> float:
     N1, N2 = len(operators), len(operands)
     n1, n2 = len(set(operators)), len(set(operands))
 
-    return float(halstead_metrics(n1, n2, N1, N2)[4])
+    complexity = float(halstead_metrics(n1, n2, N1, N2)[3])
+
+    return complexity
 
 
 def check_single_value_rule(columns: List[str]) -> bool:

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -131,7 +131,7 @@ def test_main_fear_of_the_unknown(test_input, expected):
            FROM product
            WHERE price > 9.99
            GROUP BY pSupplier
-           HAVING count(pId) > 10""",
+        """,
         [[DEFINITIONS["anti_patterns"]["ambiguous_groups"]["title"],
          DEFINITIONS["anti_patterns"]["ambiguous_groups"]["type"]]]
     ),