
Commit 94df54f

Pipeline Executor for Profiling (#1453)
Introduces a config-driven ETL framework, currently orchestrated using SQL files as pipeline steps.
1 parent 2984f0d commit 94df54f
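
To make the shape of the new framework concrete, here is a minimal usage sketch assembled from the modules this commit adds; the config path and connection details below are illustrative placeholders, not values taken from the change:

from databricks.labs.remorph.assessments.pipeline import PipelineClass
from databricks.labs.remorph.connections.database_manager import DatabaseManager

# Load a pipeline definition (name, version, extract_folder, steps) from YAML.
config = PipelineClass.load_config_from_yaml("resources/assessments/pipeline_config.yml")

# Hypothetical MSSQL connection details; in the tests these come from the credential manager.
connect_config = {
    "user": "profiler",
    "password": "...",
    "server": "mssql.example.com",
    "database": "master",
    "driver": "ODBC Driver 18 for SQL Server",
}
executor = DatabaseManager("mssql", connect_config)

# Runs every step flagged "active": reads the step's SQL file, executes it against the
# source, and lands the rows in <extract_folder>/profiler_extract.db (DuckDB).
PipelineClass(config=config, executor=executor).execute()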

13 files changed: +257 -48 lines


.github/workflows/acceptance.yml (+3 -1)

@@ -22,7 +22,9 @@ jobs:
   integration:
     if: github.event_name == 'pull_request' && github.event.pull_request.draft == false
     environment: tool
-    runs-on: larger
+    runs-on:
+      group: databrickslabs-protected-runner-group
+      labels: linux-ubuntu-latest
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4

.github/workflows/release.yml (+3 -1)

@@ -7,7 +7,9 @@ on:
 
 jobs:
   publish:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: databrickslabs-protected-runner-group
+      labels: linux-ubuntu-latest
     environment: release
     permissions:
       # Used to authenticate to PyPI via OIDC and sign the release's artifacts with sigstore-python.

pyproject.toml (+4 -3)

@@ -21,6 +21,7 @@ dependencies = [
     "pyodbc",
     "SQLAlchemy",
     "pygls>=2.0.0a2",
+    "duckdb",
 ]
 
 [project.urls]
@@ -52,7 +53,7 @@ dependencies = [
     "pytest",
     "pytest-cov>=5.0.0,<6.0.0",
     "pytest-asyncio>=0.24.0",
-    "pytest-xdist~=3.5.0",
+    "pytest-xdist~=3.5.0",
     "black>=23.1.0",
     "ruff>=0.0.243",
     "databricks-connect==15.1",
@@ -68,8 +69,8 @@ reconcile = "databricks.labs.remorph.reconcile.execute:main"
 
 [tool.hatch.envs.default.scripts]
 test = "pytest --cov src --cov-report=xml tests/unit"
-coverage = "pytest --cov src tests --cov-report=html --ignore=tests/integration/connections"
-integration = "pytest --cov src tests/integration --durations 20 --ignore=tests/integration/connections"
+coverage = "pytest --cov src tests --cov-report=html --ignore=tests/integration/connections --ignore=tests/integration/assessments"
+integration = "pytest --cov src tests/integration/reconcile --durations 20"
 fmt = ["black .",
     "ruff check . --fix",
     "mypy --disable-error-code 'annotation-unchecked' .",

src/databricks/labs/remorph/assessments/pipeline.py (new file, +79)

from pathlib import Path
import logging
import yaml
import duckdb

from databricks.labs.remorph.assessments.profiler_config import PipelineConfig, Step
from databricks.labs.remorph.connections.database_manager import DatabaseManager

logger = logging.getLogger(__name__)
logger.setLevel("INFO")

DB_NAME = "profiler_extract.db"


class PipelineClass:
    def __init__(self, config: PipelineConfig, executor: DatabaseManager):
        self.config = config
        self.executor = executor
        self.db_path_prefix = Path(config.extract_folder)

    def execute(self):
        logging.info(f"Pipeline initialized with config: {self.config.name}, version: {self.config.version}")
        for step in self.config.steps:
            if step.flag == "active":
                logging.debug(f"Executing step: {step.name}")
                self._execute_step(step)
        logging.info("Pipeline execution completed")

    def _execute_step(self, step: Step):
        logging.debug(f"Reading query from file: {step.extract_query}")
        with open(step.extract_query, 'r', encoding='utf-8') as file:
            query = file.read()

        # Execute the query using the database manager
        logging.info(f"Executing query: {query}")
        result = self.executor.execute_query(query)

        # Save the result to duckdb
        self._save_to_db(result, step.name, str(step.mode))

    def _save_to_db(self, result, step_name: str, mode: str, batch_size: int = 1000):
        self._create_dir(self.db_path_prefix)
        conn = duckdb.connect(str(self.db_path_prefix) + '/' + DB_NAME)
        columns = result.keys()
        # TODO: Add support for figuring out data types from SQLALCHEMY result object result.cursor.description is not reliable
        schema = ' STRING, '.join(columns) + ' STRING'

        # Handle write modes
        if mode == 'overwrite':
            conn.execute(f"CREATE OR REPLACE TABLE {step_name} ({schema})")
        elif mode == 'append' and step_name not in conn.get_table_names(""):
            conn.execute(f"CREATE TABLE {step_name} ({schema})")

        # Batch insert using prepared statements
        placeholders = ', '.join(['?' for _ in columns])
        insert_query = f"INSERT INTO {step_name} VALUES ({placeholders})"

        # Fetch and insert rows in batches
        while True:
            rows = result.fetchmany(batch_size)
            if not rows:
                break
            conn.executemany(insert_query, rows)

        conn.close()

    @staticmethod
    def _create_dir(dir_path: Path):
        if not Path(dir_path).exists():
            dir_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def load_config_from_yaml(file_path: str) -> PipelineConfig:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
        steps = [Step(**step) for step in data['steps']]
        return PipelineConfig(
            name=data['name'], version=data['version'], extract_folder=data['extract_folder'], steps=steps
        )

src/databricks/labs/remorph/assessments/profiler_config.py (new file, +27)

from dataclasses import dataclass, field


@dataclass
class Step:
    name: str
    type: str | None
    extract_query: str
    mode: str | None
    frequency: str | None
    flag: str | None

    def __post_init__(self):
        if self.frequency is None:
            self.frequency = "once"
        if self.flag is None:
            self.flag = "active"
        if self.mode is None:
            self.mode = "append"


@dataclass
class PipelineConfig:
    name: str
    version: str
    extract_folder: str
    steps: list[Step] = field(default_factory=list)
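
The __post_init__ hook above means mode, frequency, and flag may be passed as None and still come back with usable defaults (append, once, active). A small illustrative check, with made-up step values, assuming the module path used in this commit:

from databricks.labs.remorph.assessments.profiler_config import Step

# Optional fields left as None fall back to the defaults set in __post_init__.
step = Step(
    name="databases",
    type=None,
    extract_query="resources/assessments/inventory.sql",
    mode=None,
    frequency=None,
    flag=None,
)
assert (step.mode, step.frequency, step.flag) == ("append", "once", "active")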

tests/integration/assessments/__init__.py

Whitespace-only changes.

tests/integration/assessments/test_pipeline.py (new file, +49)

from pathlib import Path
import duckdb
import pytest

from databricks.labs.remorph.assessments.pipeline import PipelineClass, DB_NAME
from ..connections.helpers import get_db_manager


@pytest.fixture()
def extractor(mock_credentials):
    return get_db_manager("remorph", "mssql")


@pytest.fixture(scope="module")
def pipeline_config():
    prefix = Path(__file__).parent
    config_path = f"{prefix}/../../resources/assessments/pipeline_config.yml"
    config = PipelineClass.load_config_from_yaml(config_path)

    for step in config.steps:
        step.extract_query = f"{prefix}/../../{step.extract_query}"
    return config


def test_run_pipeline(extractor, pipeline_config, get_logger):
    pipeline = PipelineClass(config=pipeline_config, executor=extractor)
    pipeline.execute()
    assert verify_output(get_logger, pipeline_config.extract_folder)


def verify_output(get_logger, path):
    conn = duckdb.connect(str(Path(path)) + "/" + DB_NAME)

    expected_tables = ["usage", "inventory"]
    logger = get_logger
    for table in expected_tables:
        try:
            result = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
            logger.info(f"Count for {table}: {result[0]}")
            if result[0] == 0:
                logger.debug(f"Table {table} is empty")
                return False
        except duckdb.CatalogException:
            logger.debug(f"Table {table} does not exist")
            return False

    conn.close()
    logger.info("All expected tables exist and are not empty")
    return True

tests/integration/conftest.py (+29 -2)

@@ -1,10 +1,12 @@
 import os
 import logging
+from unittest.mock import patch
+
 import pytest
 from pyspark.sql import SparkSession
-
 from databricks.labs.remorph.__about__ import __version__
 
+
 logging.getLogger("tests").setLevel("DEBUG")
 logging.getLogger("databricks.labs.remorph").setLevel("DEBUG")
 
@@ -21,12 +23,18 @@ def product_info():
     return "remorph", __version__
 
 
+@pytest.fixture
+def get_logger():
+    return logger
+
+
 def pytest_collection_modifyitems(config, items):
     if os.getenv('TEST_ENV') == 'ACCEPTANCE':
         selected_items = []
         deselected_items = []
+        # Added only specific tests to run from acceptance.yml
         for item in items:
-            if 'tests/integration/connections' in str(item.fspath):
+            if 'tests/integration/reconcile' not in str(item.fspath) and 'tests/unit/' not in str(item.fspath):
                 selected_items.append(item)
             else:
                 deselected_items.append(item)
@@ -41,3 +49,22 @@ def mock_spark() -> SparkSession:
     :return: returns the spark session
     """
     return SparkSession.builder.appName("Remorph Reconcile Test").remote("sc://localhost").getOrCreate()
+
+
+@pytest.fixture(scope="session")
+def mock_credentials():
+    with patch(
+        'databricks.labs.remorph.connections.credential_manager._load_credentials',
+        return_value={
+            'secret_vault_type': 'env',
+            'secret_vault_name': '',
+            'mssql': {
+                'user': 'TEST_TSQL_USER',
+                'password': 'TEST_TSQL_PASS',
+                'server': 'TEST_TSQL_JDBC',
+                'database': 'TEST_TSQL_JDBC',
+                'driver': 'ODBC Driver 18 for SQL Server',
+            },
+        },
+    ):
+        yield

tests/integration/connections/helpers.py (new file, +21)

from urllib.parse import urlparse
from databricks.labs.remorph.connections.credential_manager import create_credential_manager
from databricks.labs.remorph.connections.database_manager import DatabaseManager
from .debug_envgetter import TestEnvGetter


def get_db_manager(product_name: str, source: str) -> DatabaseManager:
    env = TestEnvGetter(True)
    config = create_credential_manager(product_name, env).get_credentials(source)

    # since the kv has only URL so added explicit parse rules
    base_url, params = config['server'].replace("jdbc:", "", 1).split(";", 1)

    url_parts = urlparse(base_url)
    server = url_parts.hostname
    query_params = dict(param.split("=", 1) for param in params.split(";") if "=" in param)
    database = query_params.get("database", "")
    config['server'] = server
    config['database'] = database

    return DatabaseManager(source, config)
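
To make those parse rules concrete, here is roughly what the helper does to a hypothetical key-vault value; the JDBC URL below is invented for illustration and is not part of the change:

from urllib.parse import urlparse

# Hypothetical value as it might be stored in the key vault.
jdbc_url = "jdbc:sqlserver://mssql.example.com:1433;database=profiler_db;encrypt=true"

base_url, params = jdbc_url.replace("jdbc:", "", 1).split(";", 1)
server = urlparse(base_url).hostname          # "mssql.example.com"
query_params = dict(p.split("=", 1) for p in params.split(";") if "=" in p)
database = query_params.get("database", "")   # "profiler_db"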

tests/integration/connections/test_mssql_connector.py (+5 -41)

@@ -1,48 +1,12 @@
-from unittest.mock import patch
-from urllib.parse import urlparse
-
 import pytest
 
-from databricks.labs.remorph.connections.credential_manager import create_credential_manager
-from databricks.labs.remorph.connections.database_manager import DatabaseManager, MSSQLConnector
-from .debug_envgetter import TestEnvGetter
-
-
-@pytest.fixture(scope="module")
-def mock_credentials():
-    with patch(
-        'databricks.labs.remorph.connections.credential_manager._load_credentials',
-        return_value={
-            'secret_vault_type': 'env',
-            'secret_vault_name': '',
-            'mssql': {
-                'user': 'TEST_TSQL_USER',
-                'password': 'TEST_TSQL_PASS',
-                'server': 'TEST_TSQL_JDBC',
-                'database': 'TEST_TSQL_JDBC',
-                'driver': 'ODBC Driver 18 for SQL Server',
-            },
-        },
-    ):
-        yield
-
-
-@pytest.fixture(scope="module")
-def db_manager(mock_credentials):
-    env = TestEnvGetter(True)
-    config = create_credential_manager("remorph", env).get_credentials("mssql")
-
-    # since the kv has only URL so added explicit parse rules
-    base_url, params = config['server'].replace("jdbc:", "", 1).split(";", 1)
+from databricks.labs.remorph.connections.database_manager import MSSQLConnector
+from .helpers import get_db_manager
 
-    url_parts = urlparse(base_url)
-    server = url_parts.hostname
-    query_params = dict(param.split("=", 1) for param in params.split(";") if "=" in param)
-    database = query_params.get("database", "" "")
-    config['server'] = server
-    config['database'] = database
 
-    return DatabaseManager("mssql", config)
+@pytest.fixture()
+def db_manager(mock_credentials):
+    return get_db_manager("remorph", "mssql")
 
 
 def test_mssql_connector_connection(db_manager):

tests/resources/assessments/inventory.sql (new file, +6)

select DB_ID(name) as db_id,
    name,
    collation_name,
    create_date,
    SYSDATETIME() as extract_ts
from SYS.DATABASES

tests/resources/assessments/pipeline_config.yml (new file, +22)

name: ExamplePipeline
version: "1.0"
extract_folder: /tmp/extracts/
steps:
  - name: inventory
    type: inventory
    extract_query: resources/assessments/inventory.sql
    mode: overwrite
    frequency: daily
    flag: active
  - name: usage
    type: usage
    extract_query: resources/assessments/usage.sql
    mode: overwrite
    frequency: weekly
    flag: active
  - name: usage_2
    type: usage_2
    extract_query: resources/assessments/usage.sql
    mode: overwrite
    frequency: daily
    flag: inactive

tests/resources/assessments/usage.sql (new file, +9)

SELECT
    CONVERT(VARCHAR(64), HASHBYTES('SHA2_256', qs.sql_handle), 1) as sql_handle,
    qs.creation_time,
    qs.last_execution_time,
    qs.execution_count,
    qs.total_worker_time,
    qs.total_elapsed_time,
    qs.total_rows
FROM SYS.DM_EXEC_QUERY_STATS as qs
