Merge pull request #170 from datafold/oracle_tests

erezsh · web-flow · commit c9ab57fb5836 · 2022-07-22T10:55:17.000+02:00
Tests now cover Oracle, Redshift, Snowflake and BigQuery; various fixes to said drivers.
diff --git a/data_diff/__init__.py b/data_diff/__init__.py
@@ -13,7 +13,11 @@
 
 
 def connect_to_table(
-    db_info: Union[str, dict], table_name: Union[DbPath, str], key_column: str = "id", thread_count: Optional[int] = 1, **kwargs
+    db_info: Union[str, dict],
+    table_name: Union[DbPath, str],
+    key_column: str = "id",
+    thread_count: Optional[int] = 1,
+    **kwargs,
 ):
     """Connects to the given database, and creates a TableSegment instance
 
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -26,10 +26,11 @@
     "-": "red",
 }
 
+
 def _remove_passwords_in_dict(d: dict):
     for k, v in d.items():
-        if k == 'password':
-            d[k] = '*' * len(v)
+        if k == "password":
+            d[k] = "*" * len(v)
         elif isinstance(v, dict):
             _remove_passwords_in_dict(v)
 
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -20,6 +20,7 @@
     TemporalType,
     UnknownColType,
     Text,
+    DbTime,
 )
 from data_diff.sql import DbPath, SqlOrStr, Compiler, Explain, Select, TableName
 
@@ -151,9 +152,10 @@ def _parse_type(
 
         elif issubclass(cls, Decimal):
             if numeric_scale is None:
-                raise ValueError(
-                    f"{self.name}: Unexpected numeric_scale is NULL, for column {'.'.join(table_path)}.{col_name} of type {type_repr}."
-                )
+                numeric_scale = 0  # Needed for Oracle.
+                # raise ValueError(
+                #     f"{self.name}: Unexpected numeric_scale is NULL, for column {'.'.join(table_path)}.{col_name} of type {type_repr}."
+                # )
             return cls(precision=numeric_scale)
 
         elif issubclass(cls, Float):
@@ -242,6 +244,13 @@ def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None
 
         return f"LIMIT {limit}"
 
+    def concat(self, l: List[str]) -> str:
+        joined_exprs = ", ".join(l)
+        return f"concat({joined_exprs})"
+
+    def timestamp_value(self, t: DbTime) -> str:
+        return "'%s'" % t.isoformat()
+
     def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
         if isinstance(coltype, String_UUID):
             return f"TRIM({value})"
diff --git a/data_diff/databases/connect.py b/data_diff/databases/connect.py
@@ -80,7 +80,9 @@ def match_path(self, dsn):
     "presto": MatchUriPath(Presto, ["catalog", "schema"], help_str="presto://<user>@<host>/<catalog>/<schema>"),
     "bigquery": MatchUriPath(BigQuery, ["dataset"], help_str="bigquery://<project>/<dataset>"),
     "databricks": MatchUriPath(
-        Databricks, ["catalog", "schema"], help_str="databricks://:access_token@server_name/http_path",
+        Databricks,
+        ["catalog", "schema"],
+        help_str="databricks://:access_token@server_name/http_path",
     ),
     "trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
 }
@@ -125,9 +127,9 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
     if scheme == "databricks":
         assert not dsn.user
         kw = {}
-        kw['access_token'] = dsn.password
-        kw['http_path'] = dsn.path
-        kw['server_hostname'] = dsn.host
+        kw["access_token"] = dsn.password
+        kw["http_path"] = dsn.path
+        kw["server_hostname"] = dsn.host
         kw.update(dsn.query)
     else:
         kw = matcher.match_path(dsn)
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -1,6 +1,6 @@
 import decimal
 from abc import ABC, abstractmethod
-from typing import Sequence, Optional, Tuple, Union, Dict, Any
+from typing import Sequence, Optional, Tuple, Union, Dict, List
 from datetime import datetime
 
 from runtype import dataclass
@@ -120,13 +120,24 @@ def to_string(self, s: str) -> str:
         "Provide SQL for casting a column to string"
         ...
 
+    @abstractmethod
+    def concat(self, s: List[str]) -> str:
+        "Provide SQL for concatenating a bunch of column into a string"
+        ...
+
+    @abstractmethod
+    def timestamp_value(self, t: DbTime) -> str:
+        "Provide SQL for the given timestamp value"
+        ...
+
     @abstractmethod
     def md5_to_int(self, s: str) -> str:
         "Provide SQL for computing md5 and returning an int"
         ...
 
     @abstractmethod
     def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None):
+        "Provide SQL fragment for limit and offset inside a select"
         ...
 
     @abstractmethod
diff --git a/data_diff/databases/oracle.py b/data_diff/databases/oracle.py
@@ -4,7 +4,8 @@
 from .base import ThreadedDatabase, import_helper, ConnectError, QueryError
 from .base import DEFAULT_DATETIME_PRECISION, DEFAULT_NUMERIC_PRECISION
 
-SESSION_TIME_ZONE = None    # Changed by the tests
+SESSION_TIME_ZONE = None  # Changed by the tests
+
 
 @import_helper("oracle")
 def import_oracle():
@@ -89,6 +90,7 @@ def _parse_type(
         regexps = {
             r"TIMESTAMP\((\d)\) WITH LOCAL TIME ZONE": Timestamp,
             r"TIMESTAMP\((\d)\) WITH TIME ZONE": TimestampTZ,
+            r"TIMESTAMP\((\d)\)": Timestamp,
         }
         for regexp, t_cls in regexps.items():
             m = re.match(regexp + "$", type_repr)
@@ -99,14 +101,23 @@ def _parse_type(
                     rounds=self.ROUNDS_ON_PREC_LOSS,
                 )
 
-        return super()._parse_type(type_repr, col_name, type_repr, datetime_precision, numeric_precision, numeric_scale)
+        return super()._parse_type(
+            table_name, col_name, type_repr, datetime_precision, numeric_precision, numeric_scale
+        )
 
     def offset_limit(self, offset: Optional[int] = None, limit: Optional[int] = None):
         if offset:
             raise NotImplementedError("No support for OFFSET in query")
 
         return f"FETCH NEXT {limit} ROWS ONLY"
 
+    def concat(self, l: List[str]) -> str:
+        joined_exprs = " || ".join(l)
+        return f"({joined_exprs})"
+
+    def timestamp_value(self, t: DbTime) -> str:
+        return "timestamp '%s'" % t.isoformat(" ")
+
     def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
         # Cast is necessary for correct MD5 (trimming not enough)
         return f"CAST(TRIM({value}) AS VARCHAR(36))"
diff --git a/data_diff/databases/postgresql.py b/data_diff/databases/postgresql.py
@@ -2,7 +2,8 @@
 from .base import ThreadedDatabase, import_helper, ConnectError
 from .base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, _CHECKSUM_BITSIZE, TIMESTAMP_PRECISION_POS
 
-SESSION_TIME_ZONE = None    # Changed by the tests
+SESSION_TIME_ZONE = None  # Changed by the tests
+
 
 @import_helper("postgresql")
 def import_postgresql():
@@ -49,7 +50,7 @@ def _convert_db_precision_to_digits(self, p: int) -> int:
 
     def create_connection(self):
         if not self._args:
-            self._args['host'] = None   # psycopg2 requires 1+ arguments
+            self._args["host"] = None  # psycopg2 requires 1+ arguments
 
         pg = import_postgresql()
         try:
diff --git a/data_diff/databases/redshift.py b/data_diff/databases/redshift.py
@@ -35,6 +35,10 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"{value}::decimal(38,{coltype.precision})")
 
+    def concat(self, l: List[str]) -> str:
+        joined_exprs = " || ".join(l)
+        return f"({joined_exprs})"
+
     def select_table_schema(self, path: DbPath) -> str:
         schema, table = self._normalize_table_path(path)
 
diff --git a/data_diff/databases/trino.py b/data_diff/databases/trino.py
@@ -66,7 +66,9 @@ def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
         else:
             s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
 
-        return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
+        return (
+            f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS + coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS + 6}, '0')"
+        )
 
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
@@ -96,9 +98,7 @@ def _parse_type(
             if m:
                 datetime_precision = int(m.group(1))
                 return t_cls(
-                    precision=datetime_precision
-                    if datetime_precision is not None
-                    else DEFAULT_DATETIME_PRECISION,
+                    precision=datetime_precision if datetime_precision is not None else DEFAULT_DATETIME_PRECISION,
                     rounds=self.ROUNDS_ON_PREC_LOSS,
                 )
 
@@ -115,9 +115,7 @@ def _parse_type(
             if m:
                 return n_cls()
 
-        return super()._parse_type(
-            table_path, col_name, type_repr, datetime_precision, numeric_precision
-        )
+        return super()._parse_type(table_path, col_name, type_repr, datetime_precision, numeric_precision)
 
     def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
         return f"TRIM({value})"
diff --git a/data_diff/sql.py b/data_diff/sql.py
@@ -121,8 +121,8 @@ class Checksum(Sql):
 
     def compile(self, c: Compiler):
         if len(self.exprs) > 1:
-            compiled_exprs = ", ".join(f"coalesce({c.compile(expr)}, '<null>')" for expr in self.exprs)
-            expr = f"concat({compiled_exprs})"
+            compiled_exprs = [f"coalesce({c.compile(expr)}, '<null>')" for expr in self.exprs]
+            expr = c.database.concat(compiled_exprs)
         else:
             # No need to coalesce - safe to assume that key cannot be null
             (expr,) = self.exprs
@@ -180,10 +180,9 @@ def compile(self, c: Compiler):
 @dataclass
 class Time(Sql):
     time: datetime
-    column: Optional[SqlOrStr] = None
 
     def compile(self, c: Compiler):
-        return "'%s'" % self.time.isoformat()
+        return c.database.timestamp_value(self.time)
 
 
 @dataclass
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,7 @@ toml = "^0.10.2"
 [tool.poetry.dev-dependencies]
 parameterized = "*"
 unittest-parallel = "*"
-preql = "^0.2.16"
+preql = "^0.2.17"
 mysql-connector-python = "*"
 databricks-sql-connector = "*"
 snowflake-connector-python = "*"
diff --git a/tests/common.py b/tests/common.py
@@ -98,4 +98,3 @@ def _drop_table_if_exists(conn, table):
             conn.query(f"DROP TABLE IF EXISTS {table}", None)
             if not isinstance(conn, (db.BigQuery, db.Databricks)):
                 conn.query("COMMIT", None)
-
diff --git a/tests/test_database_types.py b/tests/test_database_types.py
@@ -14,13 +14,21 @@
 from data_diff.databases import postgresql, oracle
 from data_diff.utils import number_to_human
 from data_diff.diff_tables import TableDiffer, TableSegment, DEFAULT_BISECTION_THRESHOLD
-from .common import CONN_STRINGS, N_SAMPLES, N_THREADS, BENCHMARK, GIT_REVISION, random_table_suffix, _drop_table_if_exists
+from .common import (
+    CONN_STRINGS,
+    N_SAMPLES,
+    N_THREADS,
+    BENCHMARK,
+    GIT_REVISION,
+    random_table_suffix,
+    _drop_table_if_exists,
+)
 
 
 CONNS = {k: db.connect_to_uri(v, N_THREADS) for k, v in CONN_STRINGS.items()}
 
 CONNS[db.MySQL].query("SET @@session.time_zone='+00:00'", None)
-oracle.SESSION_TIME_ZONE = postgresql.SESSION_TIME_ZONE = 'UTC'
+oracle.SESSION_TIME_ZONE = postgresql.SESSION_TIME_ZONE = "UTC"
 
 DATABASE_TYPES = {
     db.PostgreSQL: {
@@ -196,12 +204,10 @@
             "INT",
             "BIGINT",
         ],
-
         # https://docs.databricks.com/spark/latest/spark-sql/language-manual/data-types/timestamp-type.html
         "datetime": [
             "TIMESTAMP",
         ],
-
         # https://docs.databricks.com/spark/latest/spark-sql/language-manual/data-types/float-type.html
         # https://docs.databricks.com/spark/latest/spark-sql/language-manual/data-types/double-type.html
         # https://docs.databricks.com/spark/latest/spark-sql/language-manual/data-types/decimal-type.html
@@ -210,10 +216,9 @@
             "DOUBLE",
             "DECIMAL(6, 2)",
         ],
-
         "uuid": [
             "STRING",
-        ]
+        ],
     },
     db.Trino: {
         "int": [
@@ -406,7 +411,7 @@ def __iter__(self):
         ) in source_type_categories.items():  # int, datetime, ..
             for source_type in source_types:
                 for target_type in target_type_categories[type_category]:
-                    if (CONNS.get(source_db, False) and CONNS.get(target_db, False)):
+                    if CONNS.get(source_db, False) and CONNS.get(target_db, False):
                         type_pairs.append(
                             (
                                 source_db,
@@ -480,7 +485,7 @@ def _insert_to_table(conn, table, values, type):
             value = str(sample)
         elif isinstance(sample, datetime) and isinstance(conn, (db.Presto, db.Oracle, db.Trino)):
             value = f"timestamp '{sample}'"
-        elif isinstance(sample, datetime) and isinstance(conn, db.BigQuery) and type == 'datetime':
+        elif isinstance(sample, datetime) and isinstance(conn, db.BigQuery) and type == "datetime":
             value = f"cast(timestamp '{sample}' as datetime)"
         elif isinstance(sample, bytearray):
             value = f"'{sample.decode()}'"
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py