
Commit e0c4d1a

Enhance profiler config to execute Python scripts (#1465)
Added support for executing Python scripts in the profiler config.
1 parent 94df54f commit e0c4d1a
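
In short: the profiler pipeline now dispatches on each step's type field. Steps of type sql keep the existing behavior of reading a query file and saving the result to DuckDB, while steps of type python run a standalone script in a subprocess, passing it the DuckDB path and the credentials file and expecting a JSON status object on stdout. The Step field extract_query is renamed to extract_source to cover both kinds of source file.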

6 files changed: +141 -11 lines changed

pyproject.toml

+6 -0

@@ -62,6 +62,7 @@ dependencies = [
     "databricks-labs-pylint~=0.4.0",
     "mypy~=1.10.0",
     "numpy==1.26.4",
+    "pandas==1.4.1",
 ]

 [project.entry-points.databricks]
@@ -94,6 +95,8 @@ cache_dir = ".venv/pytest-cache"
 asyncio_mode = "auto"
 asyncio_default_fixture_loop_scope="function"

+[tool.mypy]
+exclude = ["tests/resources/.*"]

 [tool.black]
 target-version = ["py310"]
@@ -182,6 +185,9 @@ fail-under = 10.0
 # file locks
 ignore-patterns = ["^\\.#"]

+# Ignore files under tests/resources
+ignore-paths = ["tests/resources"]
+
 # List of module names for which member attributes should not be checked (useful
 # for modules/projects where namespaces are manipulated during runtime and thus
 # existing member attributes cannot be deduced by static analysis). It supports

src/databricks/labs/remorph/assessments/pipeline.py

+42 -2

@@ -1,8 +1,13 @@
 from pathlib import Path
+
+import json
 import logging
+import subprocess
 import yaml
 import duckdb

+from databricks.labs.remorph.connections.credential_manager import cred_file
+
 from databricks.labs.remorph.assessments.profiler_config import PipelineConfig, Step
 from databricks.labs.remorph.connections.database_manager import DatabaseManager

@@ -27,8 +32,16 @@ def execute(self):
         logging.info("Pipeline execution completed")

     def _execute_step(self, step: Step):
-        logging.debug(f"Reading query from file: {step.extract_query}")
-        with open(step.extract_query, 'r', encoding='utf-8') as file:
+        if step.type == "sql":
+            self._execute_sql_step(step)
+        elif step.type == "python":
+            self._execute_python_step(step)
+        else:
+            logging.error(f"Unsupported step type: {step.type}")
+
+    def _execute_sql_step(self, step: Step):
+        logging.debug(f"Reading query from file: {step.extract_source}")
+        with open(step.extract_source, 'r', encoding='utf-8') as file:
             query = file.read()

         # Execute the query using the database manager
@@ -38,6 +51,33 @@ def _execute_step(self, step: Step):
         # Save the result to duckdb
         self._save_to_db(result, step.name, str(step.mode))

+    def _execute_python_step(self, step: Step):
+        logging.debug(f"Executing Python script: {step.extract_source}")
+        db_path = str(self.db_path_prefix / DB_NAME)
+        credential_config = str(cred_file("remorph"))
+
+        try:
+            result = subprocess.run(
+                ["python", step.extract_source, "--db-path", db_path, "--credential-config-path", credential_config],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+
+            try:
+                output = json.loads(result.stdout)
+                if output["status"] == "success":
+                    logging.info(f"Python script completed: {output['message']}")
+                else:
+                    raise RuntimeError(f"Script reported error: {output['message']}")
+            except json.JSONDecodeError:
+                logging.info(f"Python script output: {result.stdout}")
+
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr
+            logging.error(f"Python script failed: {error_msg}")
+            raise RuntimeError(f"Script execution failed: {error_msg}") from e
+
     def _save_to_db(self, result, step_name: str, mode: str, batch_size: int = 1000):
         self._create_dir(self.db_path_prefix)
         conn = duckdb.connect(str(self.db_path_prefix) + '/' + DB_NAME)
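
The subprocess call above implies a small contract for step scripts: accept --db-path and --credential-config-path arguments, write into the given DuckDB file, and print a JSON object with status and message keys on stdout (errors go to stderr with a non-zero exit code). A minimal sketch of a conforming script follows; the table name my_table is a hypothetical example, not part of this commit:

import argparse
import json
import sys

import duckdb


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--db-path", required=True)
    parser.add_argument("--credential-config-path", required=True)
    args = parser.parse_args()

    try:
        conn = duckdb.connect(args.db_path)
        # my_table is an illustrative target table, not part of the commit.
        conn.execute("CREATE OR REPLACE TABLE my_table AS SELECT 1 AS id")
        conn.close()
        # Success goes to stdout as JSON, matching what _execute_python_step parses.
        print(json.dumps({"status": "success", "message": "Data loaded successfully"}))
    except Exception as e:
        # Errors go to stderr; the non-zero exit triggers CalledProcessError upstream.
        print(json.dumps({"status": "error", "message": str(e)}), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

Note that stdout which is not valid JSON is logged verbatim rather than failing the step, per the JSONDecodeError fallback above.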

src/databricks/labs/remorph/assessments/profiler_config.py

+1 -1

@@ -5,7 +5,7 @@
 class Step:
     name: str
     type: str | None
-    extract_query: str
+    extract_source: str
     mode: str | None
     frequency: str | None
     flag: str | None
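
Assuming Step is a dataclass (the bare field annotations suggest it, though the decorator is outside this diff), a python step after the rename would be built roughly like this sketch; the values mirror the YAML config later in this commit:

from databricks.labs.remorph.assessments.profiler_config import Step

step = Step(
    name="random_data",
    type="python",
    extract_source="resources/assessments/db_extract.py",  # formerly extract_query
    mode="overwrite",
    frequency="daily",
    flag="active",
)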

tests/integration/assessments/test_pipeline.py

+2 -2

@@ -18,7 +18,7 @@ def pipeline_config():
     config = PipelineClass.load_config_from_yaml(config_path)

     for step in config.steps:
-        step.extract_query = f"{prefix}/../../{step.extract_query}"
+        step.extract_source = f"{prefix}/../../{step.extract_source}"
     return config


@@ -31,7 +31,7 @@ def test_run_pipeline(extractor, pipeline_config, get_logger):
 def verify_output(get_logger, path):
     conn = duckdb.connect(str(Path(path)) + "/" + DB_NAME)

-    expected_tables = ["usage", "inventory"]
+    expected_tables = ["usage", "inventory", "random_data"]
     logger = get_logger
     for table in expected_tables:
         try:
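
The new random_data entry works because the Python step writes its table into the same DuckDB file the SQL steps use. A quick manual check might look like the sketch below; the database filename is a placeholder, since the real one comes from DB_NAME in pipeline.py:

import duckdb

# /tmp/extracts matches extract_folder in the test config; the filename is illustrative.
conn = duckdb.connect("/tmp/extracts/profiler_extract.db")
print(conn.execute("SELECT COUNT(*) FROM random_data").fetchone())
conn.close()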
tests/resources/assessments/db_extract.py

+78 -0 (new file)

@@ -0,0 +1,78 @@
+import pandas as pd
+import duckdb
+import argparse
+import json
+import sys
+import numpy as np
+import logging
+from datetime import datetime, timedelta
+
+
+def generate_random_dataset(size=10):
+    # Generate dates for the last 30 days
+    end_date = datetime.now()
+    start_date = end_date - timedelta(days=30)
+    dates = pd.date_range(start=start_date, end=end_date, periods=size)
+
+    data = {
+        'id': range(1, size + 1),
+        'date': dates,
+        'category': np.random.choice(['Low', 'Medium', 'High'], size),
+        'department': np.random.choice(['Sales', 'Marketing', 'Engineering', 'Support'], size),
+        'is_active': np.random.choice([True, False], size, p=[0.8, 0.2]),
+        'score': np.random.uniform(0, 100, size).round(2),
+    }
+
+    return pd.DataFrame(data)
+
+
+def execute():
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+    logger = logging.getLogger(__name__)
+
+    parser = argparse.ArgumentParser(description='Generate and store random dataset in DuckDB')
+    parser.add_argument('--db-path', type=str, required=True, help='Path to DuckDB database file')
+    parser.add_argument(
+        '--credential-config-path', type=str, required=True, help='Path string containing credential configuration'
+    )
+    args = parser.parse_args()
+    credential_file = args.credential_config_path
+
+    if not credential_file.endswith('credentials.yml'):
+        msg = "Credential config file must have 'credentials.yml' extension"
+        # This is the output format expected by pipeline.py, which orchestrates this script
+        print(json.dumps({"status": "error", "message": msg}), file=sys.stderr)
+        raise ValueError(msg)
+
+    try:
+        df = generate_random_dataset()
+        logger.info(f'DataFrame columns: {df.columns}')
+        # Connect to DuckDB
+        conn = duckdb.connect(args.db_path)
+
+        # Create table with appropriate schema
+        conn.execute(
+            """
+            CREATE OR REPLACE TABLE random_data (
+                id INTEGER,
+                date TIMESTAMP,
+                category VARCHAR,
+                department VARCHAR,
+                is_active BOOLEAN,
+                score DOUBLE
+            )
+            """
+        )
+
+        conn.execute("INSERT INTO random_data SELECT * FROM df")
+        conn.close()
+        # This is the output format expected by pipeline.py, which orchestrates this script
+        print(json.dumps({"status": "success", "message": "Data loaded successfully"}))
+
+    except Exception as e:
+        print(json.dumps({"status": "error", "message": str(e)}), file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    execute()
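
The script can also be exercised outside the pipeline, mirroring the subprocess.run call in pipeline.py. Both paths below are placeholders; the credentials path just has to end in credentials.yml to pass the script's check:

import subprocess

result = subprocess.run(
    [
        "python",
        "tests/resources/assessments/db_extract.py",
        "--db-path", "/tmp/extracts/profiler_extract.db",  # placeholder path
        "--credential-config-path", "/tmp/credentials.yml",  # placeholder path
    ],
    check=True,
    capture_output=True,
    text=True,
)
print(result.stdout)  # expect {"status": "success", "message": "Data loaded successfully"}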

tests/resources/assessments/pipeline_config.yml

+12 -6

@@ -3,20 +3,26 @@ version: "1.0"
 extract_folder: /tmp/extracts/
 steps:
   - name: inventory
-    type: inventory
-    extract_query: resources/assessments/inventory.sql
+    type: sql
+    extract_source: resources/assessments/inventory.sql
     mode: overwrite
     frequency: daily
     flag: active
   - name: usage
-    type: usage
-    extract_query: resources/assessments/usage.sql
+    type: sql
+    extract_source: resources/assessments/usage.sql
     mode: overwrite
     frequency: weekly
     flag: active
   - name: usage_2
-    type: usage_2
-    extract_query: resources/assessments/usage.sql
+    type: sql
+    extract_source: resources/assessments/usage.sql
     mode: overwrite
     frequency: daily
     flag: inactive
+  - name: random_data
+    type: python
+    extract_source: resources/assessments/db_extract.py
+    mode: overwrite
+    frequency: daily
+    flag: active
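
Loading this config goes through PipelineClass.load_config_from_yaml, as the integration test above shows. A minimal sketch; only the loader call is confirmed by this commit, and the import path assumes PipelineClass lives in the pipeline module shown earlier:

from databricks.labs.remorph.assessments.pipeline import PipelineClass

config = PipelineClass.load_config_from_yaml("tests/resources/assessments/pipeline_config.yml")
for step in config.steps:
    # Each step now carries extract_source instead of extract_query.
    print(step.name, step.type, step.extract_source)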
