
Commit aa30abe

Author: Skylar Pape

Attempts to submit job with dataproc python sdk
Parent: e5a1847

File tree

9 files changed (+185, -52)


devops/pyspark-requirements.txt

Lines changed: 0 additions & 2 deletions
This file was deleted.
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+click == 8.2.1
+pyspark == 3.5.6
+# Potential pyspark dependency conflict: https://github.com/GoogleCloudPlatform/dataproc-templates/blob/main/python/requirements.txt#L1
+google-dataproc-templates >= 2.0.0b0
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+google-cloud-dataproc >= 5.23.0
+google-cloud-storage >= 3.5.0
+# For consistency's sake, this is the same version installed on cluster
+# and the same version defined in cluster-requirements.txt
+click == 8.2.1

src/pyspark/job.py

Lines changed: 31 additions & 40 deletions
@@ -1,54 +1,45 @@
+from typing import Optional
 import click
-import os
 from pyspark.sql import SparkSession
-
-from src_config import SrcConfig
-from src_db_type import SrcDbType
-
-SOURCE_DB_DRIVERS: dict[SrcDbType, SrcConfig] = {
-    SrcDbType.POSTGRESQL: SrcConfig(
-        driver="org.postgresql.Driver",
-        secret_id="wtr-read-replica",
-    ),
-}
+from utilities.args import jdbc_to_gbq_options

 @click.command()
-@click.option('--src-db-type', required=True, type=click.Choice(SrcDbType, case_sensitive=False), help='Type of source database (e.g. mysql, postgres)')
-@click.option('--src-protocol-action', default=None, help='Action specified in JDBC protocol for source database, if any')
-@click.option('--src-host', required=True, help='Host of source database')
-@click.option('--src-port', required=True, help='Port for source database, if any')
-@click.option('--src-db-name', required=True, help='Database name of source database')
-@click.option('--src-params', default=None, help='Connection parameters for source database, if any')
-@click.option('--src-user', required=True, help='User to connect to source database with')
+@jdbc_to_gbq_options
 def main(
-    src_db_type: SrcDbType,
-    src_protocol_action: str,
-    src_host: str,
-    src_port: int,
-    src_db_name: str,
-    src_params: str,
-    src_user: str,
+    input_url_secret: str,
+    input_driver: str,
+    input_table: Optional[str] = None,
+    input_partition_column: Optional[str] = None,
+    input_lower_bound: Optional[str] = None,
+    input_upper_bound: Optional[str] = None,
+    input_fetch_size: Optional[str] = None,
+    input_session_init_statement: Optional[str] = None,
+    num_partitions: Optional[int] = None,
+    output_mode: Optional[str] = None,
 ):
     """
-    PySpark job that tests DB replication to GBQ via JDBC
-    """
-    src_jdbc_action = f":{src_protocol_action}" if src_protocol_action else ""
-    src_jdbc_params = f"?{src_params}" if src_params else ""
-    src_jdbc_url = f"jdbc:{src_db_type.value}{src_jdbc_action}://{src_host}:{src_port}/{src_db_name}{src_jdbc_params}"
-
-    src_config = SOURCE_DB_DRIVERS[src_db_type]
-    src_properties = {
-        "user": src_user,
-        "password": os.environ['WTR_READ_REPLICA_PASSWORD'],
-        "driver": src_config.driver,
-    }
+    PySpark job that replicates a JDBC-connected database to GBQ.

+    Simple wrapper around this template:
+    https://github.com/GoogleCloudPlatform/dataproc-templates/tree/main/python/dataproc_templates/jdbc#arguments-2
+    """
     spark = SparkSession.builder \
-        .appName("DbReplicationTest") \
-        .master("spark://localhost:46411") \
+        .appName("JdbcToGbq") \
         .getOrCreate()

-    data = ["Hello", "World", "PySpark", "Job"]
+    data = [
+        input_url_secret,
+        input_driver,
+        input_table,
+        input_partition_column,
+        input_lower_bound,
+        input_upper_bound,
+        input_fetch_size,
+        input_session_init_statement,
+        num_partitions,
+        output_mode
+    ]
+    print(data)
     rdd = spark.sparkContext.parallelize(data)
     print(f"Number of elements in RDD: {rdd.count()}")

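The job body above only echoes its arguments through a throwaway RDD; the real work is delegated to the JDBCTOBIGQUERY template linked in the docstring. As a rough, hypothetical sketch (not part of this commit), several of the new options correspond directly to Spark's built-in JDBC reader options, assuming the secret has already been resolved to a full JDBC url:

# Hypothetical sketch only -- illustrates what the CLI options configure,
# not code from this commit.
from pyspark.sql import SparkSession

def read_jdbc_sketch(jdbc_url, driver, table, partition_column,
                     lower_bound, upper_bound, num_partitions, fetch_size):
    spark = SparkSession.builder.appName("JdbcToGbq").getOrCreate()
    return (
        spark.read.format("jdbc")
        .option("url", jdbc_url)                      # resolved from --input-url-secret
        .option("driver", driver)                     # --input-driver
        .option("dbtable", table)                     # --input-table
        .option("partitionColumn", partition_column)  # --input-partition-column
        .option("lowerBound", lower_bound)            # --input-lower-bound
        .option("upperBound", upper_bound)            # --input-upper-bound
        .option("numPartitions", num_partitions)      # --num-partitions
        .option("fetchsize", fetch_size)              # --input-fetch-size
        .load()
    )
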
src/pyspark/src_config.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

src/pyspark/src_db_type.py

Lines changed: 0 additions & 4 deletions
This file was deleted.

src/pyspark/submit_job.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+from typing import Optional
+import click
+import re
+from google.cloud import dataproc_v1
+from google.cloud import storage
+from utilities.args import jdbc_to_gbq_options
+
+# TODO: Make this configurable in CI
+GCP_PROJECT = "tmc-data-transfer"
+DATAPROC_CLUSTER_NAME = "data-transfer-cluster"
+DATAPROC_CLUSTER_REGION = "us-central1"
+GCS_PARENT_FOLDER = "gs://dataproc-staging-us-central1-386874222317-aaiovycl/pyspark"
+
+@click.command()
+@jdbc_to_gbq_options
+def main(**kwargs):
+    """
+    Submits a PySpark job to Dataproc to replicate a JDBC-connected database to GBQ.
+
+    Docs & Code Referenced:
+    - https://docs.cloud.google.com/dataproc/docs/samples/dataproc-submit-job
+    - https://github.com/googleapis/google-cloud-python/blob/2feb74032fd9c5cc7eaf6072ab03e9e8397bd434/packages/google-cloud-dataproc/google/cloud/dataproc_v1/types/jobs.py#L305
+    """
+    # TODO: Make this configurable in CI
+    job_client = dataproc_v1.JobControllerClient(
+        client_options={
+            "api_endpoint": f"{DATAPROC_CLUSTER_REGION}-dataproc.googleapis.com:443",
+        }
+    )
+
+    gcs_utilities_folder = f"{GCS_PARENT_FOLDER}/utilities"
+    args = []
+    for key, value in kwargs.items():
+        if value is not None:
+            args.append(f"--{key.replace('_', '-')}")
+            args.append(value)
+    job_config = {
+        "placement": {
+            "cluster_name": DATAPROC_CLUSTER_NAME,
+        },
+        "pyspark_job": {
+            "main_python_file_uri": f"{GCS_PARENT_FOLDER}/cluster/job.py",
+            "python_file_uris": [
+                f"{gcs_utilities_folder}/args.py",
+                f"{gcs_utilities_folder}/driver.py"
+            ],
+            "args": args,
+        },
+    }
+
+    operation = job_client.submit_job_as_operation(
+        request={
+            "project_id": GCP_PROJECT,
+            "region": DATAPROC_CLUSTER_REGION,
+            "job": job_config
+        }
+    )
+    response = operation.result()
+
+    # Dataproc job output is saved to the Cloud Storage bucket
+    # allocated to the job. Use regex to obtain the bucket and blob info.
+    matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
+    output = (
+        storage.Client()
+        .get_bucket(matches.group(1))
+        .blob(f"{matches.group(2)}.000000000")
+        .download_as_bytes()
+        .decode("utf-8")
+    )
+    print(f"Job finished successfully: {output}\r\n")
+
+if __name__ == '__main__':
+    main()
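
A quick illustration of the kwargs-to-args loop in main(): parsed click options are re-serialized into the argv handed to the Dataproc job, and options left unset are dropped. The secret and table names below are placeholders, not values from this commit:

# Self-contained illustration of the conversion used in main() above.
# The input values are placeholders for illustration only.
def kwargs_to_args(kwargs: dict) -> list:
    args = []
    for key, value in kwargs.items():
        if value is not None:
            args.append(f"--{key.replace('_', '-')}")  # snake_case -> --kebab-case flag
            args.append(value)
    return args

print(kwargs_to_args({
    "input_url_secret": "wtr-read-replica",  # placeholder secret name
    "input_driver": "POSTGRESQL",
    "input_table": "public.users",           # placeholder table
    "input_partition_column": None,          # unset options are omitted
}))
# ['--input-url-secret', 'wtr-read-replica', '--input-driver', 'POSTGRESQL',
#  '--input-table', 'public.users']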

src/pyspark/utilities/args.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+import click
+import functools
+
+from utilities.driver import Driver
+
+def jdbc_to_gbq_options(f: callable) -> callable:
+    """
+    Click args for the JDBC to GBQ PySpark template:
+    https://github.com/GoogleCloudPlatform/dataproc-templates/tree/main/python/dataproc_templates/jdbc#arguments-2
+    """
+    @click.option(
+        '--input-url-secret',
+        required=True,
+        help='Name of the Google secret whose value is a JDBC url that includes a password to connect to the input database',
+    )
+    @click.option(
+        '--input-driver',
+        required=True,
+        type=click.Choice(Driver),
+        help='Enum value for JDBC input driver name',
+    )
+    @click.option(
+        '--input-table',
+        required=True,
+        help='JDBC input table name',
+    )
+    @click.option(
+        '--input-partition-column',
+        required=False,
+        help='JDBC input table partition column name',
+    )
+    @click.option(
+        '--input-lower-bound',
+        required=False,
+        help='JDBC input table partition column lower bound which is used to decide the partition stride',
+    )
+    @click.option(
+        '--input-upper-bound',
+        required=False,
+        help='JDBC input table partition column upper bound which is used to decide the partition stride',
+    )
+    @click.option(
+        '--input-fetch-size',
+        required=False,
+        help='Determines how many rows to fetch per round trip',
+    )
+    @click.option(
+        '--input-session-init-statement',
+        required=False,
+        help='Custom SQL statement to execute in each reader database session',
+    )
+    @click.option(
+        '--num-partitions',
+        required=False,
+        help='The maximum number of partitions that can be used for parallelism in table reading and writing. Same value will be used for both input and output jdbc connection. Default set to 10',
+    )
+    @click.option(
+        '--output-mode',
+        required=False,
+        help='Output write mode (one of: append,overwrite,ignore,errorifexists) (Defaults to append)',
+    )
+    @functools.wraps(f)
+    def wrapper(**kwargs):
+        return f(**kwargs)
+    return wrapper
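
A hypothetical smoke test for the shared decorator (not part of this commit), using click's CliRunner and assuming click 8.2's enum-aware Choice resolves --input-driver by member name:

# Hypothetical test sketch; option values are placeholders.
import click
from click.testing import CliRunner
from utilities.args import jdbc_to_gbq_options

@click.command()
@jdbc_to_gbq_options
def echo_options(**kwargs):
    # Echo only the options that were actually supplied.
    click.echo(sorted(k for k, v in kwargs.items() if v is not None))

result = CliRunner().invoke(echo_options, [
    "--input-url-secret", "some-secret",  # placeholder
    "--input-driver", "POSTGRESQL",       # resolved to Driver.POSTGRESQL
    "--input-table", "public.users",      # placeholder
])
print(result.output)  # ['input_driver', 'input_table', 'input_url_secret']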

src/pyspark/utilities/driver.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+from enum import Enum
+
+class Driver(Enum):
+    MYSQL = "com.mysql.cj.jdbc.Driver"
+    POSTGRESQL = "org.postgresql.Driver"
+    SQL_SERVER = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
+    ORACLE = "oracle.jdbc.driver.OracleDriver"
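
The member names are what the CLI accepts, while the values are the JDBC driver class names the template needs; a two-line illustration of that lookup:

# Plain enum behaviour the --input-driver option relies on (illustration only).
from utilities.driver import Driver

assert Driver["POSTGRESQL"] is Driver.POSTGRESQL           # lookup by member name
assert Driver.POSTGRESQL.value == "org.postgresql.Driver"  # JDBC class name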
