 """
-databricks-sql-connector includes a SQLAlchemy dialect compatible with Databricks SQL.
-It aims to be a drop-in replacement for the crflynn/sqlalchemy-databricks project that implements
-more of the Databricks API, particularly around table reflection, Alembic usage, and data
-ingestion with pandas.
-
-Expected URI format is: databricks+thrift://token:dapi***@***.cloud.databricks.com?http_path=/sql/***
-
-Because of the extent of SQLAlchemy's capabilities it isn't feasible to provide examples of every
-usage in a single script, so we only provide a basic one here. More examples are found in our test
-suite at tests/e2e/sqlalchemy/test_basic.py and in the PR that implements this change:
-
-https://github.com/databricks/databricks-sql-python/pull/57
-
-# What's already supported
-
-Most of the functionality is demonstrated in the e2e tests mentioned above. The list below is
-derived from those test method names:
-
- - Create and drop tables with SQLAlchemy Core
- - Create and drop tables with SQLAlchemy ORM
- - Read created tables via reflection
- - Modify column nullability
- - Insert records manually
- - Insert records with pandas.to_sql (note that this does not work for DataFrames with indexes)
-
-This connector also aims to support Alembic for programmatic Delta table schema maintenance. This
-behaviour is not yet backed by integration tests, which will follow in a subsequent PR as we learn
-more about customer use cases there. That said, the following behaviours have been tested manually:
-
- - Autogenerate revisions with `alembic revision --autogenerate`
- - Upgrade and downgrade between revisions with `alembic upgrade <revision hash>` and
-   `alembic downgrade <revision hash>`
-
-# Known Gaps
- - MAP, ARRAY, and STRUCT types: this dialect can read these types out as strings, but you cannot
-   define a SQLAlchemy model with databricks.sqlalchemy.types.DatabricksMap (e.g.) because
-   we haven't implemented them yet.
- - Constraints: with the addition of information_schema to Unity Catalog, Databricks SQL supports
-   foreign key and primary key constraints. This dialect can write these constraints, but the
-   ability for Alembic to reflect and modify them programmatically has not been tested.
+databricks-sql-connector includes a SQLAlchemy 2.0 dialect compatible with Databricks SQL. To install
+its dependencies you can run `pip install databricks-sql-connector[sqlalchemy]`.
+
+The expected connection string format, which you can pass to create_engine(), is:
+
+databricks://token:dapi***@***.cloud.databricks.com?http_path=/sql/***&catalog=**&schema=**
+
+Our dialect implements the majority of SQLAlchemy 2.0's API. Because of the extent of SQLAlchemy's
+capabilities it isn't feasible to provide examples of every usage in a single script, so we only
+provide a basic one here. Learn more about usage in README.sqlalchemy.md in this repo.
 """
 
-import os
-import sqlalchemy
-from sqlalchemy.orm import Session
-from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select
+# fmt: off
 
-try:
-    from sqlalchemy.orm import declarative_base
-except ImportError:
-    from sqlalchemy.ext.declarative import declarative_base
+import os
+from datetime import date, datetime, time, timedelta, timezone
+from decimal import Decimal
+from uuid import UUID
+
+# By convention, backend-specific SQLA types are defined in uppercase.
+# This dialect exposes Databricks SQL's TIMESTAMP and TINYINT types,
+# as these are not covered by the generic CamelCase types shown below.
+from databricks.sqlalchemy import TIMESTAMP, TINYINT
+
+# Beside each of the CamelCase types imported below, a line comment notes
+# the underlying Databricks SQL / Delta table type.
+from sqlalchemy import (
+    BigInteger,  # BIGINT
+    Boolean,  # BOOLEAN
+    Column,
+    Date,  # DATE
+    DateTime,  # TIMESTAMP_NTZ
+    Integer,  # INTEGER
+    Numeric,  # DECIMAL
+    String,  # STRING
+    Time,  # STRING
+    Uuid,  # STRING
+    create_engine,
+    select,
+)
+from sqlalchemy.orm import DeclarativeBase, Session
 
 host = os.getenv("DATABRICKS_SERVER_HOSTNAME")
 http_path = os.getenv("DATABRICKS_HTTP_PATH")
 access_token = os.getenv("DATABRICKS_TOKEN")
 catalog = os.getenv("DATABRICKS_CATALOG")
 schema = os.getenv("DATABRICKS_SCHEMA")
 
 
-# Extra arguments are passed untouched to the driver
-# See thrift_backend.py for complete list
+# Extra arguments are passed untouched to databricks-sql-connector
+# See src/databricks/sql/thrift_backend.py for a complete list
 extra_connect_args = {
     "_tls_verify_hostname": True,
     "_user_agent_entry": "PySQL Example Script",
 }
 
-if sqlalchemy.__version__.startswith("1.3"):
-    # SQLAlchemy 1.3.x fails to parse the http_path, catalog, and schema from our connection string
-    # Pass these in as connect_args instead
-
-    conn_string = f"databricks://token:{access_token}@{host}"
-    connect_args = dict(catalog=catalog, schema=schema, http_path=http_path)
-    all_connect_args = {**extra_connect_args, **connect_args}
-    engine = create_engine(conn_string, connect_args=all_connect_args)
-else:
-    engine = create_engine(
-        f"databricks://token:{access_token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}",
-        connect_args=extra_connect_args,
-    )
-
-session = Session(bind=engine)
-base = declarative_base(bind=engine)
-
-
-class SampleObject(base):
 
-    __tablename__ = "mySampleTable"
-
-    name = Column(String(255), primary_key=True)
-    episodes = Column(Integer)
-    some_bool = Column(BOOLEAN)
-
-
-base.metadata.create_all()
-
-sample_object_1 = SampleObject(name="Bim Adewunmi", episodes=6, some_bool=True)
-sample_object_2 = SampleObject(name="Miki Meek", episodes=12, some_bool=False)
-
-session.add(sample_object_1)
-session.add(sample_object_2)
+engine = create_engine(
+    f"databricks://token:{access_token}@{host}?http_path={http_path}&catalog={catalog}&schema={schema}",
+    connect_args=extra_connect_args,
+    echo=True,
+)
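+
+# As a quick sanity check (a sketch using standard SQLAlchemy APIs; not
+# required by the dialect), the engine can be smoke-tested with a trivial
+# query before any models are defined:
+#
+#   from sqlalchemy import text
+#   with engine.connect() as conn:
+#       print(conn.execute(text("SELECT 1")).scalar())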
+
+
+class Base(DeclarativeBase):
+    pass
+
+
+# This object gives a usage example for each supported type
+# for more details on these, see README.sqlalchemy.md
+class SampleObject(Base):
+    __tablename__ = "pysql_sqlalchemy_example_table"
+
+    bigint_col = Column(BigInteger, primary_key=True)
+    string_col = Column(String)
+    tinyint_col = Column(TINYINT)
+    int_col = Column(Integer)
+    numeric_col = Column(Numeric(10, 2))
+    boolean_col = Column(Boolean)
+    date_col = Column(Date)
+    datetime_col = Column(TIMESTAMP)
+    datetime_col_ntz = Column(DateTime)
+    time_col = Column(Time)
+    uuid_col = Column(Uuid)
+
+# This generates a CREATE TABLE statement against the catalog and schema
+# specified in the connection string
+Base.metadata.create_all(engine)
+
+# Output SQL is:
+# CREATE TABLE pysql_sqlalchemy_example_table (
+#   bigint_col BIGINT NOT NULL,
+#   string_col STRING,
+#   tinyint_col SMALLINT,
+#   int_col INT,
+#   numeric_col DECIMAL(10, 2),
+#   boolean_col BOOLEAN,
+#   date_col DATE,
+#   datetime_col TIMESTAMP,
+#   datetime_col_ntz TIMESTAMP_NTZ,
+#   time_col STRING,
+#   uuid_col STRING,
+#   PRIMARY KEY (bigint_col)
+# ) USING DELTA
+
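+# A note on re-running: by default create_all() checks whether each table
+# already exists before emitting CREATE TABLE, so running this script twice
+# should not fail here. A sketch of the opt-out, which skips that check:
+#
+#   Base.metadata.create_all(engine, checkfirst=False)
+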
+# The code that follows INSERTs a record containing these values via the
+# SQLAlchemy ORM and then SELECTs it back out. The output is compared to the
+# input to demonstrate that all type information is preserved.
+sample_object = {
+    "bigint_col": 1234567890123456789,
+    "string_col": "foo",
+    "tinyint_col": -100,
+    "int_col": 5280,
+    "numeric_col": Decimal("525600.01"),
+    "boolean_col": True,
+    "date_col": date(2020, 12, 25),
+    "datetime_col": datetime(
+        1991, 8, 3, 21, 30, 5, tzinfo=timezone(timedelta(hours=-8))
+    ),
+    "datetime_col_ntz": datetime(1990, 12, 4, 6, 33, 41),
+    "time_col": time(23, 59, 59),
+    "uuid_col": UUID(int=255),
+}
+sa_obj = SampleObject(**sample_object)
 
+session = Session(engine)
+session.add(sa_obj)
 session.commit()
 
-# SQLAlchemy 1.3 has slightly different methods
-if sqlalchemy.__version__.startswith("1.3"):
-    stmt = select([SampleObject]).where(SampleObject.name.in_(["Bim Adewunmi", "Miki Meek"]))
-    output = [i for i in session.execute(stmt)]
-else:
-    stmt = select(SampleObject).where(SampleObject.name.in_(["Bim Adewunmi", "Miki Meek"]))
-    output = [i for i in session.scalars(stmt)]
-
-assert len(output) == 2
-
-base.metadata.drop_all()
+# Output SQL is:
+# INSERT INTO
+#   pysql_sqlalchemy_example_table (
+#     bigint_col,
+#     string_col,
+#     tinyint_col,
+#     int_col,
+#     numeric_col,
+#     boolean_col,
+#     date_col,
+#     datetime_col,
+#     datetime_col_ntz,
+#     time_col,
+#     uuid_col
+#   )
+# VALUES
+#   (
+#     :bigint_col,
+#     :string_col,
+#     :tinyint_col,
+#     :int_col,
+#     :numeric_col,
+#     :boolean_col,
+#     :date_col,
+#     :datetime_col,
+#     :datetime_col_ntz,
+#     :time_col,
+#     :uuid_col
+#   )
+
+# Here we build a SELECT query using the ORM
+stmt = select(SampleObject).where(SampleObject.int_col == 5280)
+
+# Then fetch one result with session.scalar()
+result = session.scalar(stmt)
+
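+# If the query matched several rows, session.scalars(stmt).all() would return
+# them all as a list (a sketch; this example inserts only one row):
+#
+#   results = session.scalars(stmt).all()
+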
+# Finally, we read the row back out and compare it to the original input
+compare = {key: getattr(result, key) for key in sample_object.keys()}
+assert compare == sample_object
+
+# Then we drop the demonstration table
+Base.metadata.drop_all(engine)
+
+# Output SQL is:
+# DROP TABLE pysql_sqlalchemy_example_table
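+
+# Cleanup is optional in a short script, but closing the session and disposing
+# of the engine's connection pool is good practice in longer-lived code
+# (a sketch using standard SQLAlchemy APIs):
+#
+#   session.close()
+#   engine.dispose()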