Skip to content

DRAFT: Add provision sqlalchemy #243

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -148,16 +148,14 @@ The suites marked `[not documented]` require additional configuration which will

SQLAlchemy provides reusable tests for testing dialect implementations.

To run these tests, assuming the environment variables needed for e2e tests are set, do the following:

```
cd src/databricks/sqlalchemy
poetry run python -m pytest test/sqlalchemy_dialect_compliance.py --dburi \
poetry shell
cd src/databricks/sqlalchemy/test
python -m pytest test_suite.py --dburi \
"databricks://token:$access_token@$host?http_path=$http_path&catalog=$catalog&schema=$schema"
```

Some of these of these tests fail currently. We're working on getting
relavent tests passing and others skipped.
Some of these tests currently fail. We're working on getting the relevant tests passing and skipping the others.

### Code formatting

16 changes: 14 additions & 2 deletions src/databricks/sqlalchemy/__init__.py
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@
from databricks import sql

# This import is required to process our @compiles decorators
import databricks.sqlalchemy.types
import databricks.sqlalchemy._types as dialect_type_impl


from databricks.sqlalchemy.base import (
@@ -48,6 +48,12 @@ class DatabricksDialect(default.DefaultDialect):
non_native_boolean_check_constraint: bool = False
paramstyle: str = "named"

colspecs = {
sqlalchemy.types.DateTime: dialect_type_impl.DatabricksDateTimeNoTimezoneType,
sqlalchemy.types.Time: dialect_type_impl.DatabricksTimeType,
sqlalchemy.types.String: dialect_type_impl.DatabricksStringType,
}

@classmethod
def dbapi(cls):
return sql
@@ -130,7 +136,6 @@ def get_columns(self, connection, table_name, schema=None, **kwargs):
columns = []

for col in resp:

# Taken from PyHive. This removes added type info from decimals and maps
_col_type = re.search(r"^\w+", col.TYPE_NAME).group(0)
this_column = {
@@ -277,6 +282,13 @@ def get_schema_names(self, connection, **kw):

# TODO: replace with call to cursor.schemas() once its performance matches raw SQL
return [row[0] for row in connection.execute("SHOW SCHEMAS")]

@classmethod
def load_provisioning(cls):
    """Register this dialect's SQLAlchemy test-provisioning hooks, if present.

    Importing ``databricks.sqlalchemy.provision`` runs its module-level
    ``@create_db.for_db`` / ``@drop_db.for_db`` decorators, which attach the
    schema create/drop implementations to SQLAlchemy's testing registry.
    """
    try:
        __import__("databricks.sqlalchemy.provision")
    except ImportError:
        # Deliberate best-effort: provisioning is only needed when running
        # the SQLAlchemy compliance test suite, so a missing module is fine.
        pass


@event.listens_for(Engine, "do_connect")
213 changes: 213 additions & 0 deletions src/databricks/sqlalchemy/_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
from datetime import datetime, time
from typing import Union

import sqlalchemy
from sqlalchemy.ext.compiler import compiles

from databricks.sql.utils import ParamEscaper


@compiles(sqlalchemy.types.Enum, "databricks")
@compiles(sqlalchemy.types.String, "databricks")
@compiles(sqlalchemy.types.Text, "databricks")
@compiles(sqlalchemy.types.Time, "databricks")
@compiles(sqlalchemy.types.Unicode, "databricks")
@compiles(sqlalchemy.types.UnicodeText, "databricks")
@compiles(sqlalchemy.types.Uuid, "databricks")
def compile_string_databricks(type_, compiler, **kw):
    """Render all string-like generic types (plus Time and Uuid) as STRING.

    SQLAlchemy's stock DDL for these types is incompatible or abnormal for
    Databricks SQL:

        Enum        -> VARCHAR
        String      -> VARCHAR[LENGTH]
        Text        -> VARCHAR[LENGTH]
        Time        -> TIME
        Unicode     -> VARCHAR[LENGTH]
        UnicodeText -> TEXT
        Uuid        -> CHAR[32]

    Databricks SQL compiles every one of them to its single STRING type.
    """
    return "STRING"


@compiles(sqlalchemy.types.Integer, "databricks")
def compile_integer_databricks(type_, compiler, **kw):
    """Render the generic Integer type as "INT".

    Databricks SQL uses "INT" where SQLAlchemy's default rendering is
    "INTEGER".
    """
    return "INT"


@compiles(sqlalchemy.types.LargeBinary, "databricks")
def compile_binary_databricks(type_, compiler, **kw):
    """Render the generic LargeBinary type as "BINARY".

    Databricks SQL uses "BINARY" where SQLAlchemy's default rendering is
    "BLOB".
    """
    return "BINARY"


@compiles(sqlalchemy.types.Numeric, "databricks")
def compile_numeric_databricks(type_, compiler, **kw):
    """Render the generic Numeric type as DECIMAL, the Databricks SQL name.

    Delegating to the compiler's built-in ``visit_DECIMAL`` keeps its handling
    of precision and scale, so Numeric() columns compile exactly like
    SQLAlchemy's own Decimal rendering.
    """
    return compiler.visit_DECIMAL(type_, **kw)


@compiles(sqlalchemy.types.DateTime, "databricks")
def compile_datetime_databricks(type_, compiler, **kw):
    """
    We override the default DateTime compilation rendering because Databricks
    has no "DATETIME" type. We emit "TIMESTAMP_NTZ" (timestamp without time
    zone) rather than plain "TIMESTAMP", matching the dialect's treatment of
    DateTime values as timezone-naive.
    """
    return "TIMESTAMP_NTZ"


@compiles(sqlalchemy.types.ARRAY, "databricks")
def compile_array_databricks(type_, compiler, **kw):
    """Render ARRAY(item_type) as Databricks SQL's ARRAY<item> syntax.

    SQLAlchemy's generic ARRAY cannot compile by default because it is only
    implemented for PostgreSQL; the PostgreSQL approach works for Databricks
    SQL, so the same strategy is reproduced here.

    ``type_`` is a sqlalchemy.types.ARRAY instance, which always carries an
    ``item_type`` attribute (itself a TypeEngine):
    https://docs.sqlalchemy.org/en/20/core/type_basics.html#sqlalchemy.types.ARRAY
    """
    element_ddl = compiler.process(type_.item_type, **kw)
    return f"ARRAY<{element_ddl}>"


class DatabricksDateTimeNoTimezoneType(sqlalchemy.types.TypeDecorator):
    """Strip the timezone from TIMESTAMP_NTZ values read back from pysql.

    The datetime that pysql produces for the contents of a TIMESTAMP_NTZ
    column carries an 'Etc/UTC' timezone, but SQLAlchemy's test suite assumes
    sqlalchemy.types.DateTime yields datetime.datetime objects with *no*
    timezone set. Whether DBR sends the timezone or pysql adds it is unclear;
    it could be a bug upstream.
    """

    impl = sqlalchemy.types.DateTime
    cache_ok = True

    def process_result_value(self, value: Union[None, datetime], dialect):
        # Drop tzinfo; None (SQL NULL) passes through untouched.
        return None if value is None else value.replace(tzinfo=None)


class DatabricksTimeType(sqlalchemy.types.TypeDecorator):
    """Databricks has no native TIME type. So we store it as a string.

    Bound values are rendered as "%H:%M:%S.%f" strings; result values are
    parsed back into datetime.time objects (with or without microseconds).

    NOTE: the annotations previously read ``datetime.time`` — but because this
    module imports the ``datetime`` *class*, that expression names the bound
    method ``datetime.time()``, not the ``datetime.time`` type. They now use
    the real ``time`` type imported from the datetime module.
    """

    impl = sqlalchemy.types.Time
    cache_ok = True

    TIME_WITH_MICROSECONDS_FMT = "%H:%M:%S.%f"
    TIME_NO_MICROSECONDS_FMT = "%H:%M:%S"

    def process_bind_param(self, value: Union[time, None], dialect) -> Union[str, None]:
        """Values sent to the database are converted to %H:%M:%S.%f strings.

        Returns None when the value is None (SQL NULL), so the return
        annotation is Union[str, None] rather than the previous bare str.
        """
        if value is None:
            return None
        return value.strftime(self.TIME_WITH_MICROSECONDS_FMT)

    def process_literal_param(self, value, dialect) -> time:
        """It's not clear to me why this is necessary. Without it, SQLAlchemy's Timetest:test_literal fails
        because the string literal renderer receives a str() object and calls .isoformat() on it.
        Whereas this method receives a datetime.time() object which is subsequently passed to that
        same renderer. And that works.

        UPDATE: After coping with the literal_processor override in DatabricksStringType, I suspect a similar
        mechanism is at play. Two different processors are called in sequence. This is likely a byproduct
        of Databricks not having a true TIME type. I think the string representation of Time() types is
        somehow affecting the literal rendering process. But as long as this passes the tests, I'm not
        worried about it.
        """
        return value

    def process_result_value(
        self, value: Union[None, str], dialect
    ) -> Union[time, None]:
        """Values received from the database are parsed into datetime.time() objects."""
        if value is None:
            return None

        try:
            _parsed = datetime.strptime(value, self.TIME_WITH_MICROSECONDS_FMT)
        except ValueError:
            # If the string doesn't have microseconds, try parsing it without them
            _parsed = datetime.strptime(value, self.TIME_NO_MICROSECONDS_FMT)

        return _parsed.time()


class DatabricksStringType(sqlalchemy.types.TypeDecorator):
    """Custom String() implementation with Databricks-compatible escaping.

    SQLAlchemy's default String escapes a single-quote by doubling it, but
    Databricks escapes literal strings with a backslash — the default escaping
    therefore breaks Databricks SQL.
    """

    impl = sqlalchemy.types.String
    cache_ok = True
    pe = ParamEscaper()

    def process_literal_param(self, value, dialect) -> str:
        """Escape the literal with ParamEscaper, matching our legacy inline
        escaping logic, instead of SQLAlchemy's backslash-unaware default.
        """
        return self.pe.escape_string(value)

    def literal_processor(self, dialect):
        """Manual override that stops any processing beyond what
        process_literal_param() already did.

        The SQLAlchemy docs _specifically_ say to not override this method.

        Any processing from TypeEngine.process_literal_param happens _before_
        and _in addition to_ whatever the class's impl.literal_processor()
        method does. String.literal_processor() performs a replacement that
        doubles any single-quote in the contained string — a syntax error in
        Databricks, and redundant since ParamEscaper() implements all the
        escaping we need. We should consider opening an issue on the
        SQLAlchemy project to see if I'm using it wrong.

        See type_api.py::TypeEngine.literal_processor:

        ```python
        def process(value: Any) -> str:
            return fixed_impl_processor(
                fixed_process_literal_param(value, dialect)
            )
        ```

        That call to fixed_impl_processor wraps the result of
        fixed_process_literal_param (which is the process_literal_param
        defined in our Databricks dialect).

        https://docs.sqlalchemy.org/en/20/core/custom_types.html#sqlalchemy.types.TypeDecorator.literal_processor
        """

        def process(value):
            # Copy of the default String.literal_processor() with its
            # double-escaping of single-quotes stripped away.
            escaped = self.process_literal_param(value, dialect="databricks")
            if dialect.identifier_preparer._double_percents:
                escaped = escaped.replace("%", "%%")
            return "%s" % escaped

        return process
12 changes: 12 additions & 0 deletions src/databricks/sqlalchemy/provision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from sqlalchemy.testing.provision import create_db, drop_db

@create_db.for_db("databricks")
def _databricks_create_db(cfg, eng, ident):
    """Provisioning hook: create the scratch schema `main`.`<ident>` used by
    SQLAlchemy's test harness."""
    ddl = "CREATE SCHEMA `main`.`%s`" % ident
    with eng.begin() as connection:
        connection.exec_driver_sql(ddl)

@drop_db.for_db("databricks")
def _databricks_drop_db(cfg, eng, ident):
    """Provisioning hook: drop the scratch schema `main`.`<ident>` created by
    _databricks_create_db."""
    ddl = "DROP SCHEMA `main`.`%s`" % ident
    with eng.begin() as connection:
        connection.exec_driver_sql(ddl)
121 changes: 100 additions & 21 deletions src/databricks/sqlalchemy/requirements.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,4 @@
"""
This module is supposedly used by the compliance tests to control which tests are run based on database capabilities.
However, based on some experimentation that does not appear to be consistently the case. Until we better understand
when these requirements are and are not implemented, we prefer to manually capture the exact nature of the failures
and errors.
Once we better understand how to use requirements.py, an example exclusion will look like this:
import sqlalchemy.testing.requirements
import sqlalchemy.testing.exclusions
class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
@property
def __some_example_requirement(self):
return sqlalchemy.testing.exclusions.closed
The complete list of requirements is provided by SQLAlchemy here:
https://github.com/sqlalchemy/sqlalchemy/blob/main/lib/sqlalchemy/testing/requirements.py
@@ -23,12 +7,107 @@ def __some_example_requirement(self):
import sqlalchemy.testing.requirements
import sqlalchemy.testing.exclusions

import logging
class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
@property
def date_historic(self):
"""target dialect supports representation of Python
datetime.datetime() objects with historic (pre 1970) values."""

return sqlalchemy.testing.exclusions.open()

logger = logging.getLogger(__name__)
@property
def datetime_historic(self):
"""target dialect supports representation of Python
datetime.datetime() objects with historic (pre 1970) values."""

logger.warning("requirements.py is not currently employed by Databricks dialect")
return sqlalchemy.testing.exclusions.open()

@property
def datetime_literals(self):
"""target dialect supports rendering of a date, time, or datetime as a
literal string, e.g. via the TypeEngine.literal_processor() method.
class Requirements(sqlalchemy.testing.requirements.SuiteRequirements):
pass
"""

return sqlalchemy.testing.exclusions.open()

@property
def timestamp_microseconds(self):
"""target dialect supports representation of Python
datetime.datetime() with microsecond objects but only
if TIMESTAMP is used."""

return sqlalchemy.testing.exclusions.open()

@property
def time_microseconds(self):
"""target dialect supports representation of Python
datetime.time() with microsecond objects.
This requirement declaration isn't needed but I've included it here for completeness.
Since Databricks doesn't have a TIME type, SQLAlchemy will compile Time() columns
as STRING Databricks data types. And we use a custom time type to render those strings
between str() and time.time() representations. Therefore we can store _any_ precision
that SQLAlchemy needs. The time_microseconds requirement defaults to ON for all dialects
except mssql, mysql, mariadb, and oracle.
"""

return sqlalchemy.testing.exclusions.open()

@property
def precision_generic_float_type(self):
"""target backend will return native floating point numbers with at
least seven decimal places when using the generic Float type.
Databricks sometimes only returns six digits of precision for the generic Float type
"""
return sqlalchemy.testing.exclusions.closed()

@property
def literal_float_coercion(self):
"""target backend will return the exact float value 15.7563
with only four significant digits from this statement:
SELECT :param
where :param is the Python float 15.7563
i.e. it does not return 15.75629997253418
Without additional work, Databricks returns 15.75629997253418
This is a potential area where we could override the Float literal processor.
Will leave to a PM to decide if we should do so.
"""
return sqlalchemy.testing.exclusions.closed()

@property
def precision_numerics_enotation_large(self):
"""target backend supports Decimal() objects using E notation
to represent very large values.
Databricks supports E notation for FLOAT data types but not for DECIMAL types,
which is the underlying data type SQLAlchemy uses for Numeric() types.
"""
return sqlalchemy.testing.exclusions.closed()

@property
def infinity_floats(self):
"""The Float type can persist and load float('inf'), float('-inf')."""

return sqlalchemy.testing.exclusions.open()

@property
def precision_numerics_retains_significant_digits(self):
"""A precision numeric type will return empty significant digits,
i.e. a value such as 10.000 will come back in Decimal form with
the .000 maintained."""

return sqlalchemy.testing.exclusions.open()

@property
def array_type(self):
"""While Databricks does support ARRAY types, pysql cannot bind them. So
we cannot use them with SQLAlchemy"""

return sqlalchemy.testing.exclusions.closed()
3 changes: 3 additions & 0 deletions src/databricks/sqlalchemy/setup.cfg
Original file line number Diff line number Diff line change
@@ -2,3 +2,6 @@
[sqla_testing]
requirement_cls=databricks.sqlalchemy.requirements:Requirements
profile_file=profiles.txt

[db]
databricks=<enter valid connection string here>
481 changes: 2 additions & 479 deletions src/databricks/sqlalchemy/test/test_suite.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/databricks/sqlalchemy/test_local/test_types.py
Original file line number Diff line number Diff line change
@@ -36,12 +36,12 @@ class DatabricksDataType(enum.Enum):
sqlalchemy.types.LargeBinary: DatabricksDataType.BINARY,
sqlalchemy.types.Boolean: DatabricksDataType.BOOLEAN,
sqlalchemy.types.Date: DatabricksDataType.DATE,
sqlalchemy.types.DateTime: DatabricksDataType.TIMESTAMP,
sqlalchemy.types.DateTime: DatabricksDataType.TIMESTAMP_NTZ,
sqlalchemy.types.Double: DatabricksDataType.DOUBLE,
sqlalchemy.types.Enum: DatabricksDataType.STRING,
sqlalchemy.types.Float: DatabricksDataType.FLOAT,
sqlalchemy.types.Integer: DatabricksDataType.INT,
sqlalchemy.types.Interval: DatabricksDataType.TIMESTAMP,
sqlalchemy.types.Interval: DatabricksDataType.TIMESTAMP_NTZ,
sqlalchemy.types.Numeric: DatabricksDataType.DECIMAL,
sqlalchemy.types.PickleType: DatabricksDataType.BINARY,
sqlalchemy.types.SmallInteger: DatabricksDataType.SMALLINT,
80 changes: 0 additions & 80 deletions src/databricks/sqlalchemy/types.py

This file was deleted.