
Commit 94df54f

Pipeline Executor for Profiling (#1453)
Introduces a config-driven ETL framework, currently orchestrated using SQL files as pipeline steps.
1 parent 2984f0d commit 94df54f
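
To make the shape of the new framework concrete, here is a minimal usage sketch assembled from the modules this commit adds; the config path and connection details below are illustrative placeholders, not values taken from the change:

from databricks.labs.remorph.assessments.pipeline import PipelineClass
from databricks.labs.remorph.connections.database_manager import DatabaseManager

# Load a pipeline definition (name, version, extract_folder, steps) from YAML.
config = PipelineClass.load_config_from_yaml("resources/assessments/pipeline_config.yml")

# Hypothetical MSSQL connection details; in the tests these come from the credential manager.
connect_config = {
    "user": "profiler",
    "password": "...",
    "server": "mssql.example.com",
    "database": "master",
    "driver": "ODBC Driver 18 for SQL Server",
}
executor = DatabaseManager("mssql", connect_config)

# Runs every step flagged "active": reads the step's SQL file, executes it against the
# source, and lands the rows in <extract_folder>/profiler_extract.db (DuckDB).
PipelineClass(config=config, executor=executor).execute()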

13 files changed: +257 -48 lines


.github/workflows/acceptance.yml (+3 -1)

@@ -22,7 +22,9 @@ jobs:
   integration:
     if: github.event_name == 'pull_request' && github.event.pull_request.draft == false
     environment: tool
-    runs-on: larger
+    runs-on:
+      group: databrickslabs-protected-runner-group
+      labels: linux-ubuntu-latest
     steps:
       - name: Checkout Code
         uses: actions/checkout@v4

.github/workflows/release.yml (+3 -1)

@@ -7,7 +7,9 @@ on:
 
 jobs:
   publish:
-    runs-on: ubuntu-latest
+    runs-on:
+      group: databrickslabs-protected-runner-group
+      labels: linux-ubuntu-latest
     environment: release
     permissions:
       # Used to authenticate to PyPI via OIDC and sign the release's artifacts with sigstore-python.

pyproject.toml (+4 -3)

@@ -21,6 +21,7 @@ dependencies = [
     "pyodbc",
     "SQLAlchemy",
     "pygls>=2.0.0a2",
+    "duckdb",
 ]
 
 [project.urls]
@@ -52,7 +53,7 @@ dependencies = [
     "pytest",
     "pytest-cov>=5.0.0,<6.0.0",
     "pytest-asyncio>=0.24.0",
-    "pytest-xdist~=3.5.0",
+    "pytest-xdist~=3.5.0",
     "black>=23.1.0",
     "ruff>=0.0.243",
     "databricks-connect==15.1",
@@ -68,8 +69,8 @@ reconcile = "databricks.labs.remorph.reconcile.execute:main"
 
 [tool.hatch.envs.default.scripts]
 test = "pytest --cov src --cov-report=xml tests/unit"
-coverage = "pytest --cov src tests --cov-report=html --ignore=tests/integration/connections"
-integration = "pytest --cov src tests/integration --durations 20 --ignore=tests/integration/connections"
+coverage = "pytest --cov src tests --cov-report=html --ignore=tests/integration/connections --ignore=tests/integration/assessments"
+integration = "pytest --cov src tests/integration/reconcile --durations 20"
 fmt = ["black .",
     "ruff check . --fix",
     "mypy --disable-error-code 'annotation-unchecked' .",

src/databricks/labs/remorph/assessments/pipeline.py (new file, +79)

from pathlib import Path
import logging
import yaml
import duckdb

from databricks.labs.remorph.assessments.profiler_config import PipelineConfig, Step
from databricks.labs.remorph.connections.database_manager import DatabaseManager

logger = logging.getLogger(__name__)
logger.setLevel("INFO")

DB_NAME = "profiler_extract.db"


class PipelineClass:
    def __init__(self, config: PipelineConfig, executor: DatabaseManager):
        self.config = config
        self.executor = executor
        self.db_path_prefix = Path(config.extract_folder)

    def execute(self):
        logging.info(f"Pipeline initialized with config: {self.config.name}, version: {self.config.version}")
        for step in self.config.steps:
            if step.flag == "active":
                logging.debug(f"Executing step: {step.name}")
                self._execute_step(step)
        logging.info("Pipeline execution completed")

    def _execute_step(self, step: Step):
        logging.debug(f"Reading query from file: {step.extract_query}")
        with open(step.extract_query, 'r', encoding='utf-8') as file:
            query = file.read()

        # Execute the query using the database manager
        logging.info(f"Executing query: {query}")
        result = self.executor.execute_query(query)

        # Save the result to duckdb
        self._save_to_db(result, step.name, str(step.mode))

    def _save_to_db(self, result, step_name: str, mode: str, batch_size: int = 1000):
        self._create_dir(self.db_path_prefix)
        conn = duckdb.connect(str(self.db_path_prefix) + '/' + DB_NAME)
        columns = result.keys()
        # TODO: Add support for figuring out data types from SQLALCHEMY result object result.cursor.description is not reliable
        schema = ' STRING, '.join(columns) + ' STRING'

        # Handle write modes
        if mode == 'overwrite':
            conn.execute(f"CREATE OR REPLACE TABLE {step_name} ({schema})")
        elif mode == 'append' and step_name not in conn.get_table_names(""):
            conn.execute(f"CREATE TABLE {step_name} ({schema})")

        # Batch insert using prepared statements
        placeholders = ', '.join(['?' for _ in columns])
        insert_query = f"INSERT INTO {step_name} VALUES ({placeholders})"

        # Fetch and insert rows in batches
        while True:
            rows = result.fetchmany(batch_size)
            if not rows:
                break
            conn.executemany(insert_query, rows)

        conn.close()

    @staticmethod
    def _create_dir(dir_path: Path):
        if not Path(dir_path).exists():
            dir_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def load_config_from_yaml(file_path: str) -> PipelineConfig:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)
        steps = [Step(**step) for step in data['steps']]
        return PipelineConfig(
            name=data['name'], version=data['version'], extract_folder=data['extract_folder'], steps=steps
        )

src/databricks/labs/remorph/assessments/profiler_config.py (new file, +27)

from dataclasses import dataclass, field


@dataclass
class Step:
    name: str
    type: str | None
    extract_query: str
    mode: str | None
    frequency: str | None
    flag: str | None

    def __post_init__(self):
        if self.frequency is None:
            self.frequency = "once"
        if self.flag is None:
            self.flag = "active"
        if self.mode is None:
            self.mode = "append"


@dataclass
class PipelineConfig:
    name: str
    version: str
    extract_folder: str
    steps: list[Step] = field(default_factory=list)
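
The __post_init__ hook above means mode, frequency, and flag may be passed as None and still come back with usable defaults (append, once, active). A small illustrative check, with made-up step values, assuming the module path used in this commit:

from databricks.labs.remorph.assessments.profiler_config import Step

# Optional fields left as None fall back to the defaults set in __post_init__.
step = Step(
    name="databases",
    type=None,
    extract_query="resources/assessments/inventory.sql",
    mode=None,
    frequency=None,
    flag=None,
)
assert (step.mode, step.frequency, step.flag) == ("append", "once", "active")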

tests/integration/assessments/__init__.py

Whitespace-only changes.

tests/integration/assessments/test_pipeline.py (new file, +49)

from pathlib import Path
import duckdb
import pytest

from databricks.labs.remorph.assessments.pipeline import PipelineClass, DB_NAME
from ..connections.helpers import get_db_manager


@pytest.fixture()
def extractor(mock_credentials):
    return get_db_manager("remorph", "mssql")


@pytest.fixture(scope="module")
def pipeline_config():
    prefix = Path(__file__).parent
    config_path = f"{prefix}/../../resources/assessments/pipeline_config.yml"
    config = PipelineClass.load_config_from_yaml(config_path)

    for step in config.steps:
        step.extract_query = f"{prefix}/../../{step.extract_query}"
    return config


def test_run_pipeline(extractor, pipeline_config, get_logger):
    pipeline = PipelineClass(config=pipeline_config, executor=extractor)
    pipeline.execute()
    assert verify_output(get_logger, pipeline_config.extract_folder)


def verify_output(get_logger, path):
    conn = duckdb.connect(str(Path(path)) + "/" + DB_NAME)

    expected_tables = ["usage", "inventory"]
    logger = get_logger
    for table in expected_tables:
        try:
            result = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
            logger.info(f"Count for {table}: {result[0]}")
            if result[0] == 0:
                logger.debug(f"Table {table} is empty")
                return False
        except duckdb.CatalogException:
            logger.debug(f"Table {table} does not exist")
            return False

    conn.close()
    logger.info("All expected tables exist and are not empty")
    return True

tests/integration/conftest.py (+29 -2)

@@ -1,10 +1,12 @@
 import os
 import logging
+from unittest.mock import patch
+
 import pytest
 from pyspark.sql import SparkSession
-
 from databricks.labs.remorph.__about__ import __version__
 
+
 logging.getLogger("tests").setLevel("DEBUG")
 logging.getLogger("databricks.labs.remorph").setLevel("DEBUG")
 
@@ -21,12 +23,18 @@ def product_info():
     return "remorph", __version__
 
 
+@pytest.fixture
+def get_logger():
+    return logger
+
+
 def pytest_collection_modifyitems(config, items):
     if os.getenv('TEST_ENV') == 'ACCEPTANCE':
         selected_items = []
         deselected_items = []
+        # Added only specific tests to run from acceptance.yml
         for item in items:
-            if 'tests/integration/connections' in str(item.fspath):
+            if 'tests/integration/reconcile' not in str(item.fspath) and 'tests/unit/' not in str(item.fspath):
                 selected_items.append(item)
             else:
                 deselected_items.append(item)
@@ -41,3 +49,22 @@ def mock_spark() -> SparkSession:
     :return: returns the spark session
     """
     return SparkSession.builder.appName("Remorph Reconcile Test").remote("sc://localhost").getOrCreate()
+
+
+@pytest.fixture(scope="session")
+def mock_credentials():
+    with patch(
+        'databricks.labs.remorph.connections.credential_manager._load_credentials',
+        return_value={
+            'secret_vault_type': 'env',
+            'secret_vault_name': '',
+            'mssql': {
+                'user': 'TEST_TSQL_USER',
+                'password': 'TEST_TSQL_PASS',
+                'server': 'TEST_TSQL_JDBC',
+                'database': 'TEST_TSQL_JDBC',
+                'driver': 'ODBC Driver 18 for SQL Server',
+            },
+        },
+    ):
+        yield

tests/integration/connections/helpers.py (new file, +21)

from urllib.parse import urlparse
from databricks.labs.remorph.connections.credential_manager import create_credential_manager
from databricks.labs.remorph.connections.database_manager import DatabaseManager
from .debug_envgetter import TestEnvGetter


def get_db_manager(product_name: str, source: str) -> DatabaseManager:
    env = TestEnvGetter(True)
    config = create_credential_manager(product_name, env).get_credentials(source)

    # since the kv has only URL so added explicit parse rules
    base_url, params = config['server'].replace("jdbc:", "", 1).split(";", 1)

    url_parts = urlparse(base_url)
    server = url_parts.hostname
    query_params = dict(param.split("=", 1) for param in params.split(";") if "=" in param)
    database = query_params.get("database", "")
    config['server'] = server
    config['database'] = database

    return DatabaseManager(source, config)
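
To make those parse rules concrete, here is roughly what the helper does to a hypothetical key-vault value; the JDBC URL below is invented for illustration and is not part of the change:

from urllib.parse import urlparse

# Hypothetical value as it might be stored in the key vault.
jdbc_url = "jdbc:sqlserver://mssql.example.com:1433;database=profiler_db;encrypt=true"

base_url, params = jdbc_url.replace("jdbc:", "", 1).split(";", 1)
server = urlparse(base_url).hostname          # "mssql.example.com"
query_params = dict(p.split("=", 1) for p in params.split(";") if "=" in p)
database = query_params.get("database", "")   # "profiler_db"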

tests/integration/connections/test_mssql_connector.py (+5 -41)

@@ -1,48 +1,12 @@
-from unittest.mock import patch
-from urllib.parse import urlparse
-
 import pytest
 
-from databricks.labs.remorph.connections.credential_manager import create_credential_manager
-from databricks.labs.remorph.connections.database_manager import DatabaseManager, MSSQLConnector
-from .debug_envgetter import TestEnvGetter
-
-
-@pytest.fixture(scope="module")
-def mock_credentials():
-    with patch(
-        'databricks.labs.remorph.connections.credential_manager._load_credentials',
-        return_value={
-            'secret_vault_type': 'env',
-            'secret_vault_name': '',
-            'mssql': {
-                'user': 'TEST_TSQL_USER',
-                'password': 'TEST_TSQL_PASS',
-                'server': 'TEST_TSQL_JDBC',
-                'database': 'TEST_TSQL_JDBC',
-                'driver': 'ODBC Driver 18 for SQL Server',
-            },
-        },
-    ):
-        yield
-
-
-@pytest.fixture(scope="module")
-def db_manager(mock_credentials):
-    env = TestEnvGetter(True)
-    config = create_credential_manager("remorph", env).get_credentials("mssql")
-
-    # since the kv has only URL so added explicit parse rules
-    base_url, params = config['server'].replace("jdbc:", "", 1).split(";", 1)
+from databricks.labs.remorph.connections.database_manager import MSSQLConnector
+from .helpers import get_db_manager
 
-    url_parts = urlparse(base_url)
-    server = url_parts.hostname
-    query_params = dict(param.split("=", 1) for param in params.split(";") if "=" in param)
-    database = query_params.get("database", "" "")
-    config['server'] = server
-    config['database'] = database
 
-    return DatabaseManager("mssql", config)
+@pytest.fixture()
+def db_manager(mock_credentials):
+    return get_db_manager("remorph", "mssql")
 
 
 def test_mssql_connector_connection(db_manager):

tests/resources/assessments/inventory.sql (new file, +6)

select DB_ID(name) as db_id,
    name,
    collation_name,
    create_date,
    SYSDATETIME() as extract_ts
from SYS.DATABASES

tests/resources/assessments/pipeline_config.yml (new file, +22)

name: ExamplePipeline
version: "1.0"
extract_folder: /tmp/extracts/
steps:
  - name: inventory
    type: inventory
    extract_query: resources/assessments/inventory.sql
    mode: overwrite
    frequency: daily
    flag: active
  - name: usage
    type: usage
    extract_query: resources/assessments/usage.sql
    mode: overwrite
    frequency: weekly
    flag: active
  - name: usage_2
    type: usage_2
    extract_query: resources/assessments/usage.sql
    mode: overwrite
    frequency: daily
    flag: inactive

tests/resources/assessments/usage.sql (new file, +9)

SELECT
    CONVERT(VARCHAR(64), HASHBYTES('SHA2_256', qs.sql_handle), 1) as sql_handle,
    qs.creation_time,
    qs.last_execution_time,
    qs.execution_count,
    qs.total_worker_time,
    qs.total_elapsed_time,
    qs.total_rows
FROM SYS.DM_EXEC_QUERY_STATS as qs
