Skip to content

Commit 85e9693

Browse files
committed
refactor: amended process pool to use executor supplied to duckdb pipeline/data contract rather than always instantiating new pool
1 parent 2e2f236 commit 85e9693

File tree

5 files changed

+37
-15
lines changed

5 files changed

+37
-15
lines changed

src/dve/core_engine/backends/implementations/duckdb/contract.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
# pylint: disable=R0903
44
import logging
55
from collections.abc import Iterator
6+
from concurrent.futures import Future, ProcessPoolExecutor, as_completed
67
from functools import partial
7-
from multiprocessing import Pool, cpu_count
8+
from multiprocessing import cpu_count
89
from typing import Any, Optional
910
from uuid import uuid4
1011

@@ -70,10 +71,12 @@ def __init__(
7071
connection: DuckDBPyConnection,
7172
logger: Optional[logging.Logger] = None,
7273
debug: bool = False,
74+
executor: Optional[ProcessPoolExecutor] = None,
7375
**kwargs: Any,
7476
):
7577
self.debug = debug
7678
self._connection = connection
79+
self._executor = ProcessPoolExecutor(cpu_count() - 1) if not executor else executor
7780
"""A bool indicating whether to enable debug logging."""
7881

7982
super().__init__(logger, **kwargs)
@@ -164,11 +167,13 @@ def apply_data_contract(
164167

165168
batches = pq.ParquetFile(entity_locations[entity_name]).iter_batches(10000)
166169
msg_count = 0
167-
with Pool(cpu_count() - 1) as pool:
168-
for msgs in pool.imap_unordered(row_validator_helper, batches):
169-
if msgs:
170-
msg_writer.write_queue.put(msgs)
171-
msg_count += len(msgs)
170+
futures: list[Future] = [
171+
self._executor.submit(row_validator_helper, batch) for batch in batches
172+
]
173+
for future in as_completed(futures):
174+
if msgs := future.result():
175+
msg_writer.write_queue.put(msgs)
176+
msg_count += len(msgs)
172177

173178
self.logger.info(f"Data contract found {msg_count} issues in {entity_name}")
174179

src/dve/pipeline/duckdb_pipeline.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""DuckDB implementation for `Pipeline` object."""
22

33
import logging
4+
from concurrent.futures import ProcessPoolExecutor
45
from typing import Optional
56

67
from duckdb import DuckDBPyConnection, DuckDBPyRelation
@@ -33,12 +34,13 @@ def __init__(
3334
reference_data_loader: Optional[type[BaseRefDataLoader]] = None,
3435
job_run_id: Optional[int] = None,
3536
logger: Optional[logging.Logger] = None,
37+
executor: Optional[ProcessPoolExecutor] = None,
3638
):
3739
self._connection = connection
3840
super().__init__(
3941
processed_files_path,
4042
audit_tables,
41-
DuckDBDataContract(connection=self._connection),
43+
DuckDBDataContract(connection=self._connection, executor=executor),
4244
DuckDBStepImplementations.register_udfs(connection=self._connection),
4345
rules_path,
4446
submitted_files_path,

tests/features/environment.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from concurrent.futures import ProcessPoolExecutor
2+
from multiprocessing import cpu_count
13
import shutil
24
import tempfile
35
from pathlib import Path
@@ -27,6 +29,7 @@ def before_all(context: Context):
2729
temp_dir = Path(context.dbfs_root.__enter__())
2830
dbfs_impl = DBFSFilesystemImplementation(temp_dir)
2931
add_implementation(dbfs_impl)
32+
context.process_pool = ProcessPoolExecutor(cpu_count() - 1)
3033

3134

3235
def before_scenario(context: Context, scenario: Scenario):
@@ -78,3 +81,4 @@ def after_all(context: Context):
7881

7982
context.connection.close()
8083
shutil.rmtree(context.ddb_db_file.parent)
84+
context.process_pool.shutdown(wait=True, cancel_futures=True)

tests/features/steps/steps_pipeline.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
77
"""
88
# pylint: disable=no-name-in-module
9-
from concurrent.futures import ThreadPoolExecutor
9+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
1010
from functools import partial, reduce
1111
from itertools import chain
1212
import operator
@@ -75,6 +75,7 @@ def setup_duckdb_pipeline(
7575
dataset_id: str,
7676
processing_path: Path,
7777
schema_file_name: Optional[str] = None,
78+
executor: Optional[ProcessPoolExecutor] = None
7879
):
7980

8081
schema_file_name = f"{dataset_id}.dischema.json" if not schema_file_name else schema_file_name
@@ -97,6 +98,7 @@ def setup_duckdb_pipeline(
9798
rules_path=rules_path,
9899
submitted_files_path=processing_path.as_posix(),
99100
reference_data_loader=DuckDBRefDataLoader,
101+
executor=executor
100102
)
101103

102104

@@ -204,7 +206,7 @@ def add_pipeline_to_ctx(
204206
context: Context, implementation: str, schema_file_name: Optional[str] = None
205207
):
206208
pipeline_map: Dict[str, Callable] = {
207-
"duckdb": partial(setup_duckdb_pipeline, connection=context.connection),
209+
"duckdb": partial(setup_duckdb_pipeline, connection=context.connection, executor=context.process_pool),
208210
"spark": partial(setup_spark_pipeline, spark=context.spark_session),
209211
}
210212
if not implementation in pipeline_map:

tests/test_core_engine/test_backends/test_implementations/test_duckdb/test_data_contract.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
from concurrent.futures import ProcessPoolExecutor
12
import json
3+
from multiprocessing import cpu_count
24
from pathlib import Path
35
from typing import Any, Dict, List, Tuple
46

@@ -30,7 +32,13 @@
3032
temp_xml_file,
3133
)
3234

33-
def test_duckdb_data_contract_csv(temp_csv_file):
35+
@pytest.fixture(scope="module")
36+
def temp_process_pool_executor():
37+
with ProcessPoolExecutor(cpu_count() - 1) as pool:
38+
yield pool
39+
40+
41+
def test_duckdb_data_contract_csv(temp_csv_file, temp_process_pool_executor):
3442
uri, _, _, mdl = temp_csv_file
3543
connection = default_connection
3644

@@ -89,7 +97,7 @@ def test_duckdb_data_contract_csv(temp_csv_file):
8997
}
9098
entity_locations: Dict[str, URI] = {"test_ds": str(uri)}
9199

92-
data_contract: DuckDBDataContract = DuckDBDataContract(connection)
100+
data_contract: DuckDBDataContract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
93101
entities, feedback_errors_uri, stage_successful = data_contract.apply_data_contract(get_parent(uri.as_posix()), entities, entity_locations, dc_meta)
94102
rel: DuckDBPyRelation = entities.get("test_ds")
95103
assert dict(zip(rel.columns, rel.dtypes)) == {
@@ -100,7 +108,7 @@ def test_duckdb_data_contract_csv(temp_csv_file):
100108
assert stage_successful
101109

102110

103-
def test_duckdb_data_contract_xml(temp_xml_file):
111+
def test_duckdb_data_contract_xml(temp_xml_file, temp_process_pool_executor):
104112
uri, header_model, header_data, class_model, class_data = temp_xml_file
105113
connection = default_connection
106114
contract_meta = json.dumps(
@@ -187,7 +195,7 @@ def test_duckdb_data_contract_xml(temp_xml_file):
187195
reporting_fields={"test_header": ["school"], "test_class_info": ["year"]},
188196
)
189197

190-
data_contract: DuckDBDataContract = DuckDBDataContract(connection)
198+
data_contract: DuckDBDataContract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
191199
entities, feedback_errors_uri, stage_successful = data_contract.apply_data_contract(get_parent(uri.as_posix()), entities, entity_locations, dc_meta)
192200
header_rel: DuckDBPyRelation = entities.get("test_header")
193201
header_expected_schema: Dict[str, DuckDBPyType] = {
@@ -327,10 +335,11 @@ def test_ddb_data_contract_read_nested_parquet(nested_all_string_parquet):
327335
}
328336

329337
def test_duckdb_data_contract_custom_error_details(nested_all_string_parquet_w_errors,
330-
nested_parquet_custom_dc_err_details):
338+
nested_parquet_custom_dc_err_details,
339+
temp_process_pool_executor):
331340
parquet_uri, contract_meta, _ = nested_all_string_parquet_w_errors
332341
connection = default_connection
333-
data_contract = DuckDBDataContract(connection)
342+
data_contract = DuckDBDataContract(connection, executor=temp_process_pool_executor)
334343

335344
entity = data_contract.read_parquet(path=parquet_uri)
336345
assert entity.count("*").fetchone()[0] == 2

0 commit comments

Comments (0)