Commit 9ee5cd4

SQLAlchemy: Add documentation and tests for usage with Dask [WIP]
Parent: fe626c2

File tree: 4 files changed (+57 -3 lines)

- CHANGES.txt (+1)
- docs/by-example/sqlalchemy/dataframe.rst (+12 -3)
- setup.py (+1)
- src/crate/client/sqlalchemy/tests/bulk_test.py (+43)

CHANGES.txt (+1)

@@ -7,6 +7,7 @@ Unreleased
 
 - SQLAlchemy: Added ``insert_bulk`` fast-path ``INSERT`` method for pandas, in
   order to support efficient batch inserts using CrateDB's bulk operations endpoint.
+- SQLAlchemy: Add documentation and software tests for usage with Dask [WIP]
 
 
 2023/04/18 0.31.1
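For context, here is a minimal sketch (not part of this commit) of the pandas fast path the first changelog entry refers to: ``insert_bulk`` is handed to ``DataFrame.to_sql()`` via its ``method=`` parameter. The connection URL, table name, and demo frame are placeholders chosen for illustration.

import sqlalchemy as sa
from pandas._testing import makeTimeDataFrame

from crate.client.sqlalchemy.support import insert_bulk

# Hypothetical connection target and table name, for illustration only.
engine = sa.create_engine("crate://localhost:4200")
df = makeTimeDataFrame(nper=42, freq="S")

# pandas slices the frame into chunks of `chunksize` rows and invokes
# `insert_bulk` once per chunk, which forwards each batch to CrateDB's
# bulk operations endpoint.
df.to_sql(
    name="testdrive_pandas",
    con=engine,
    if_exists="replace",
    index=False,
    chunksize=8,
    method=insert_bulk,
)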

docs/by-example/sqlalchemy/dataframe.rst (+12 -3)

@@ -5,6 +5,12 @@
 SQLAlchemy: DataFrame operations
 ================================
 
+.. rubric:: Table of Contents
+
+.. contents::
+   :local:
+
+
 About
 =====
 
@@ -29,10 +35,13 @@ The :ref:`pandas I/O subsystem <pandas:api.io>` for `relational databases`_
 using `SQL`_ is based on `SQLAlchemy`_.
 
 
-.. rubric:: Table of Contents
+Compatibility notes
+===================
 
-.. contents::
-   :local:
+.. NOTE::
+
+    Please note that DataFrame support for pandas and Dask is only validated
+    with Python 3.8 and higher, and SQLAlchemy 1.4 and higher.
 
 
 Efficient ``INSERT`` operations with pandas
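The new compatibility note and the ``skipIf`` guards added in bulk_test.py below enforce the same constraints. As a rough illustration (not from this commit, using the ``packaging`` library), the check amounts to:

import sys

import sqlalchemy as sa
from packaging.version import Version

def dataframe_support_available() -> bool:
    """Return True when pandas/Dask DataFrame support is expected to work."""
    # Mirrors the documented requirements: Python 3.8+ and SQLAlchemy 1.4+.
    return sys.version_info >= (3, 8) and Version(sa.__version__) >= Version("1.4")

if not dataframe_support_available():
    print("DataFrame support requires Python 3.8+ and SQLAlchemy 1.4+")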

setup.py (+1)

@@ -68,6 +68,7 @@ def read(path):
         'zope.testrunner>=5,<7',
         'zc.customdoctests>=1.0.1,<2',
         'createcoverage>=1,<2',
+        'dask',
         'stopit>=1.1.2,<2',
         'flake8>=4,<7',
         'pandas',

src/crate/client/sqlalchemy/tests/bulk_test.py (+43)

@@ -207,3 +207,46 @@ def test_bulk_save_pandas(self, mock_cursor):
 
         # Verify number of batches.
         self.assertEqual(effective_op_count, OPCOUNT)
+
+    @skipIf(sys.version_info < (3, 8), "SQLAlchemy/Dask is not supported on Python <3.8")
+    @skipIf(SA_VERSION < SA_1_4, "SQLAlchemy 1.3 is not supported by pandas")
+    @patch('crate.client.connection.Cursor', mock_cursor=FakeCursor)
+    def test_bulk_save_dask(self, mock_cursor):
+        """
+        Verify bulk INSERT with Dask.
+        """
+        import dask.dataframe as dd
+        from pandas._testing import makeTimeDataFrame
+        from crate.client.sqlalchemy.support import insert_bulk
+
+        # 42 records / 4 partitions means each partition has a size of 10.5 elements.
+        # Because the chunk size 8 is slightly smaller than 10, the partition will not
+        # fit into it, so two batches will be emitted to the database for each data
+        # partition. 4 partitions * 2 batches = 8 insert operations will be emitted.
+        INSERT_RECORDS = 42
+        NPARTITIONS = 4
+        CHUNK_SIZE = 8
+        OPCOUNT = math.ceil(INSERT_RECORDS / NPARTITIONS / CHUNK_SIZE) * NPARTITIONS
+
+        # Create a DataFrame to feed into the database.
+        df = makeTimeDataFrame(nper=INSERT_RECORDS, freq="S")
+        ddf = dd.from_pandas(df, npartitions=NPARTITIONS)
+
+        dburi = "crate://localhost:4200"
+        retval = ddf.to_sql(
+            name="test-testdrive",
+            uri=dburi,
+            if_exists="replace",
+            index=False,
+            chunksize=CHUNK_SIZE,
+            method=insert_bulk,
+        )
+        self.assertIsNone(retval)
+
+        # Each of the insert operations incurs another call to the cursor object. This is probably
+        # the initial connection from the DB-API driver, to inquire the database version.
+        # This compensation formula has been determined empirically / by educated guessing.
+        effective_op_count = (mock_cursor.call_count - 2 * NPARTITIONS) - 2
+
+        # Verify number of batches.
+        self.assertEqual(effective_op_count, OPCOUNT)
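As a standalone illustration of the batch arithmetic described in the test comments (numbers taken directly from the test), the expected operation count works out as follows:

import math

INSERT_RECORDS = 42
NPARTITIONS = 4
CHUNK_SIZE = 8

# 42 rows over 4 partitions is 10.5 rows per partition on average; with a
# chunk size of 8, every partition needs two flushes to the database.
batches_per_partition = math.ceil(INSERT_RECORDS / NPARTITIONS / CHUNK_SIZE)  # ceil(1.3125) == 2
OPCOUNT = batches_per_partition * NPARTITIONS

assert batches_per_partition == 2
assert OPCOUNT == 8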
