
Commit aaf25ba

test: Add Spark write and Embucket read verification test (#1443)
* test: Add Spark write and Embucket read verification test
* refactor: update test_read_spark with pytest logic, add pytest configuration and Spark session fixtures for testing
* refactor: remove logging and AWS configuration from Spark tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 0367285 commit aaf25ba

3 files changed, +176 -0 lines changed

test/integration_tests/conftest.py

Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
import pytest
import os
from pyspark.sql import SparkSession

CATALOG_URL = "http://localhost:8080"
WAREHOUSE_ID = "test_db"
AWS_REGION = os.getenv("AWS_REGION")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")


@pytest.fixture()
def rest_spark_session():
    # Spark session backed by the Iceberg REST catalog with HadoopFileIO
    spark_session = spark_session_factory(config_type='file_catalog', app_name='FileCatalogTest')
    yield spark_session
    spark_session.stop()


@pytest.fixture()
def s3_spark_session():
    # Spark session with direct S3 access through the S3A filesystem
    spark_session = spark_session_factory(config_type='s3_catalog', app_name='S3CatalogTest')
    yield spark_session
    spark_session.stop()


def spark_session_factory(
    config_type: str,
    app_name: str,
    config_overrides: dict = None
):
    """
    Build a SparkSession configured for the requested catalog type.

    Args:
        config_type (str): The type of configuration to use.
            Expected values: 'file_catalog' or 's3_catalog'.
        app_name (str): The name for the Spark application.
        config_overrides (dict): A dictionary of Spark configs to add to or
            override in the base configuration.

    Returns:
        SparkSession: An initialized SparkSession object.
    """
    print(f"\nBuilding Spark session with config_type='{config_type}'...")
    builder = SparkSession.builder.appName(app_name)

    # --- Configuration Type 1: REST Client with SimpleAWSCredentialsProvider ---
    if config_type == 'file_catalog':
        builder.config("spark.driver.memory", "15g") \
            .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1") \
            .config("spark.driver.extraJavaOptions", "-Dlog4j.configurationFile=log4j2.properties") \
            .config("spark.hadoop.fs.s3a.path.style.access", "true") \
            .config("spark.hadoop.fs.s3a.change.detection.mode", "error") \
            .config("spark.hadoop.fs.s3a.change.detection.version.required", "false") \
            .config("spark.hadoop.fs.s3a.multiobjectdelete.enable", "true") \
            .config("spark.hadoop.fs.s3a.impl", "org.apache.iceberg.hadoop.HadoopFileIO") \
            .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
            .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
            .config("spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog") \
            .config("spark.sql.catalog.rest.catalog-impl", "org.apache.iceberg.rest.RESTCatalog") \
            .config("spark.sql.catalog.rest.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO") \
            .config("spark.sql.catalog.rest.uri", CATALOG_URL) \
            .config("spark.sql.catalog.rest.warehouse", WAREHOUSE_ID) \
            .config("spark.sql.defaultCatalog", "rest")

    # --- Configuration Type 2: Direct S3 Access with Explicit Keys ---
    elif config_type == 's3_catalog':
        builder.config("spark.driver.memory", "15g") \
            .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1,org.apache.hadoop:hadoop-aws:3.3.4") \
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
            .config("spark.hadoop.fs.s3a.path.style.access", "true") \
            .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
            .config("spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog") \
            .config("spark.sql.catalog.rest.catalog-impl", "org.apache.iceberg.rest.RESTCatalog") \
            .config("spark.sql.catalog.rest.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO") \
            .config("spark.sql.catalog.rest.uri", CATALOG_URL) \
            .config("spark.sql.catalog.rest.warehouse", WAREHOUSE_ID) \
            .config("spark.sql.defaultCatalog", "rest")
    else:
        raise ValueError(f"Unknown config_type: '{config_type}'. Expected 'file_catalog' or 's3_catalog'.")

    # Apply any specific overrides for the test
    if config_overrides:
        for key, value in config_overrides.items():
            builder.config(key, value)

    spark_session = builder.getOrCreate()
    return spark_session
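
Since spark_session_factory already accepts a config_overrides dict, a test that needs different Spark settings could wrap it in its own fixture. A minimal sketch, assuming such a fixture were wanted; the fixture name and override values below are illustrative and not part of this commit:

# Hypothetical fixture sketch: reuses spark_session_factory with overrides.
# The specific key/value shown is an example, not a setting from this commit.
@pytest.fixture()
def low_memory_spark_session():
    spark_session = spark_session_factory(
        config_type='file_catalog',
        app_name='LowMemoryTest',
        config_overrides={"spark.driver.memory": "4g"},  # example override
    )
    yield spark_session
    spark_session.stop()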

test/integration_tests/pytest.ini

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
[pytest]
log_cli = true
log_cli_level = INFO
log_cli_format = %(asctime)s %(levelname)s %(message)s
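
With log_cli enabled at INFO level, pytest streams log records to the terminal while tests run, so tests can report progress through the standard logging module. A minimal sketch of how that surfaces; this example test is illustrative and not part of the commit:

import logging

logger = logging.getLogger(__name__)

def test_logging_example():
    # Shown live in the pytest console because log_cli = true and the level is INFO
    logger.info("starting example test")
    assert True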
test/integration_tests/test_read_spark.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
import pandas as pd
from clients import EmbucketClient

DB = "test_db"
SCHEMA = "public"
TABLE = "spark_embucket"


def get_embucket_client():
    emb = EmbucketClient()
    emb.volume()
    emb.sql(f"CREATE DATABASE IF NOT EXISTS {DB} EXTERNAL_VOLUME = 'test'")
    emb.sql(f"CREATE SCHEMA IF NOT EXISTS {DB}.{SCHEMA}")
    return emb


def perform_spark_operations(spark):
    # create table
    spark.sql(
        f"""CREATE TABLE IF NOT EXISTS {SCHEMA}.{TABLE} (
            id INT,
            page_name STRING,
            category STRING
        )"""
    )

    # clear
    spark.sql(f"DELETE FROM {SCHEMA}.{TABLE} WHERE TRUE")

    # insert rows
    spark.sql(
        f"""
        INSERT INTO {SCHEMA}.{TABLE} VALUES
        (1, 'page_1', 'category_1'),
        (2, 'is_object', 'category_2'),
        (3, 'page_3', 'Conditional_expression')
        """
    )

    # update
    spark.sql(
        f"UPDATE {SCHEMA}.{TABLE} SET category='updated_category' WHERE page_name='is_object'"
    )

    # delete
    spark.sql(
        f"DELETE FROM {SCHEMA}.{TABLE} WHERE category='Conditional_expression'"
    )


def read_and_validate_from_embucket(emb: EmbucketClient) -> pd.DataFrame:
    res = emb.sql(f"SELECT id, page_name, category FROM {DB}.{SCHEMA}.{TABLE} ORDER BY id")
    result = res.get("result", {})
    rows = result.get("rows", [])
    cols = [c["name"] for c in result.get("columns", [])]
    df = pd.DataFrame(rows, columns=cols)
    return df


# --- test ---
def test_spark_embucket_sync(rest_spark_session):
    """
    Write via Spark (create/insert/update/delete) and verify Embucket sees those changes.
    """
    spark = rest_spark_session

    # Perform Spark-side mutations
    perform_spark_operations(spark)

    # Read back through Embucket
    embucket_client = get_embucket_client()
    df = read_and_validate_from_embucket(embucket_client)

    # Validate update happened
    updated_mask = (df["page_name"] == "is_object") & (df["category"] == "updated_category")
    assert updated_mask.any(), "Expected row with page_name='is_object' to have category='updated_category'"

    # Validate delete happened
    assert not (df["category"] == "Conditional_expression").any(), (
        "Expected no rows with category='Conditional_expression' after delete"
    )

    # Check that unaffected row is still there
    assert ((df["page_name"] == "page_1") & (df["category"] == "category_1")).any(), (
        "Expected original row ('page_1','category_1') to persist"
    )
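
The conftest also defines an s3_spark_session fixture that this test does not exercise. A hypothetical companion test, not included in this commit, could reuse the same helpers to drive the write/verify cycle through the direct S3A configuration:

# Hypothetical companion test (not part of this commit): same write/verify
# flow, but using the s3_catalog Spark configuration from conftest.py.
def test_spark_embucket_sync_s3(s3_spark_session):
    perform_spark_operations(s3_spark_session)
    df = read_and_validate_from_embucket(get_embucket_client())
    assert ((df["page_name"] == "is_object") & (df["category"] == "updated_category")).any()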
