Commit

Merge pull request #550 from rtdip/develop
v0.8.6
GBBBAS authored Oct 19, 2023
2 parents 60f2fc1 + 10e18af commit 48bdc16
Showing 12 changed files with 110 additions and 31 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/sonarcloud_reusable.yml
@@ -48,8 +48,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.11"]
-        pyspark: ["3.4.1"]
-        delta-spark: ["2.4.0"]
+        pyspark: ["3.5.0"]
+        delta-spark: ["3.0.0"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v3
10 changes: 8 additions & 2 deletions .github/workflows/test.yml
@@ -29,6 +29,12 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
         pyspark: ["3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1"]
         exclude:
+          - pyspark: "3.4.1"
+            python-version: "3.8"
+          - pyspark: "3.4.1"
+            python-version: "3.9"
+          - pyspark: "3.4.1"
+            python-version: "3.10"
           - pyspark: "3.4.0"
             python-version: "3.8"
           - pyspark: "3.4.0"
@@ -95,8 +101,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.11"]
-        pyspark: ["3.4.1"]
-        delta-spark: ["2.4.0"]
+        pyspark: ["3.5.0"]
+        delta-spark: ["3.0.0"]
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v3
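The new exclude entries keep pyspark 3.4.x off the older Python versions. A hedged sketch of how the effective combinations can be computed; note the full list of 3.4.0 excludes is truncated in this view, so the set below is partial by construction:

    # Hedged sketch: compute the effective test matrix after the excludes above.
    from itertools import product

    python_versions = ["3.8", "3.9", "3.10", "3.11"]
    pyspark_versions = ["3.3.0", "3.3.1", "3.3.2", "3.4.0", "3.4.1"]

    # Only the first 3.4.0 exclusion is visible in the diff above,
    # so this set is intentionally incomplete.
    excluded = {
        ("3.8", "3.4.1"),
        ("3.9", "3.4.1"),
        ("3.10", "3.4.1"),
        ("3.8", "3.4.0"),
    }

    effective_matrix = [
        (py, spark)
        for py, spark in product(python_versions, pyspark_versions)
        if (py, spark) not in excluded
    ]
    print(len(effective_matrix), "combinations")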
2 changes: 1 addition & 1 deletion environment.yml
@@ -39,7 +39,7 @@ dependencies:
       - pyodbc==4.0.39
       - fastapi==0.103.2
       - httpx==0.24.1
-      - pyspark>=3.3.0,<3.5.0
+      - pyspark>=3.3.0,<3.6.0
       - delta-spark>=2.2.0,<3.1.0
       - grpcio>=1.48.1
       - grpcio-status>=1.48.1
4 changes: 2 additions & 2 deletions setup.py
@@ -44,8 +44,8 @@
 ]

 PYSPARK_PACKAGES = [
-    "pyspark>=3.3.0,<3.5.0",
-    "delta-spark>=2.2.0,<2.5.0",
+    "pyspark>=3.3.0,<3.6.0",
+    "delta-spark>=2.2.0,<3.1.0",
 ]

 PIPELINE_PACKAGES = [
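The widened pins in setup.py now match environment.yml. A minimal runtime sanity check of the new ranges, sketched under the assumption that the packaging library is available (it is not part of this diff):

    # Hedged sketch: assert the installed versions fall inside the widened pins.
    # `packaging` is an assumed helper here, not a dependency added by this commit.
    from importlib.metadata import version
    from packaging.version import Version

    assert Version("3.3.0") <= Version(version("pyspark")) < Version("3.6.0")
    assert Version("2.2.0") <= Version(version("delta-spark")) < Version("3.1.0")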
26 changes: 16 additions & 10 deletions src/api/Dockerfile
@@ -14,29 +14,35 @@

 FROM mcr.microsoft.com/azure-functions/python:4-python3.10

+RUN adduser --system --group app
+
 ENV AzureWebJobsScriptRoot=/home/site/wwwroot \
     AzureFunctionsJobHost__Logging__Console__IsEnabled=true

+COPY src/api/requirements.txt /
+
 RUN rm -rf /var/lib/apt/lists/partial \
     apt-get clean \
     && apt-get update -o Acquire::CompressionTypes::Order::=gz \
-    && apt-get install -y odbcinst1debian2 libodbc1 odbcinst unixodbc unixodbc-dev libsasl2-dev libsasl2-modules-gssapi-mit libboost-all-dev \
-    && apt-get install -y ca-certificates curl python3-pip python3-dev python3-setuptools python3-wheel \
-    && apt-get install -y zip wget \
-    && wget https://databricks-bi-artifacts.s3.us-east-2.amazonaws.com/simbaspark-drivers/odbc/2.6.29/SimbaSparkODBC-2.6.29.1049-Debian-64bit.zip -P /odbc/ \
-    && unzip /odbc/SimbaSparkODBC-2.6.29.1049-Debian-64bit.zip -d /odbc \
-    && dpkg -i /odbc/simbaspark_2.6.29.1049-2_amd64.deb \
+    && apt-get --no-install-recommends install -y odbcinst1debian2 libodbc1 odbcinst unixodbc unixodbc-dev libsasl2-dev libsasl2-modules-gssapi-mit libboost-all-dev \
+    && apt-get --no-install-recommends install -y ca-certificates curl python3-pip python3-dev python3-setuptools python3-wheel gcc g++ \
+    && apt-get --no-install-recommends install -y zip unzip wget \
+    && wget --secure-protocol=TLSv1_2 --max-redirect=0 https://databricks-bi-artifacts.s3.us-east-2.amazonaws.com/simbaspark-drivers/odbc/2.7.5/SimbaSparkODBC-2.7.5.1012-Debian-64bit.zip -P /odbc/ \
+    && unzip /odbc/SimbaSparkODBC-2.7.5.1012-Debian-64bit.zip -d /odbc \
+    && dpkg -i /odbc/simbaspark_2.7.5.1012-2_amd64.deb \
+    && pip install --no-cache-dir pyarrow==12.0.0 \
+    && python -c "import pyarrow; pyarrow.create_library_symlinks()" \
+    && CFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" pip install --no-cache-dir -r /requirements.txt \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* \
     && rm /odbc -r

 RUN echo '[ODBC Drivers]' > /etc/odbcinst.ini \
     && echo 'Simba Spark ODBC Driver = Installed' >> /etc/odbcinst.ini \
     && echo '[Simba Spark ODBC Driver]' >> /etc/odbcinst.ini \
     && echo 'Driver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so' >> /etc/odbcinst.ini

-COPY src/api/requirements.txt /
-RUN pip install --no-cache-dir pyarrow==12.0.0 \
-    && python -c "import pyarrow; pyarrow.create_library_symlinks()" \
-    && CFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" pip install --no-cache-dir -r /requirements.txt
+USER app

 COPY src/api/ /home/site/wwwroot
 COPY src /home/site/wwwroot/src
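A quick way to verify the driver registration written to /etc/odbcinst.ini, sketched under the assumption that pyodbc from requirements.txt is installed in the image:

    # Hedged sketch: run inside the built container to confirm the Simba Spark
    # ODBC driver registered above is visible to pyodbc.
    import pyodbc

    assert "Simba Spark ODBC Driver" in pyodbc.drivers(), pyodbc.drivers()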

3 changes: 0 additions & 3 deletions src/sdk/python/rtdip_sdk/pipelines/_pipeline_utils/spark.py
@@ -74,9 +74,6 @@ def get_spark_session(self) -> SparkSession:
                 maven_package.to_string()
                 for maven_package in self.spark_libraries.maven_libraries
             )
-            temp_spark_configuration[
-                "spark.jars.repositories"
-            ] = "https://oss.sonatype.org/content/repositories/iodelta-1080"

         for configuration in temp_spark_configuration.items():
             spark = spark.config(configuration[0], configuration[1])
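With delta-spark 3.0.0 published to Maven Central, the staging repository override above becomes redundant. A hedged sketch of the builder-loop pattern the surrounding method uses; the package coordinate is illustrative, and the real method derives temp_spark_configuration from the instance's configuration and Maven libraries:

    # Hedged sketch of the builder loop shown above; the coordinate resolves
    # from Maven Central, so no spark.jars.repositories entry is required.
    from pyspark.sql import SparkSession

    temp_spark_configuration = {
        "spark.jars.packages": "io.delta:delta-spark_2.12:3.0.0",
    }

    spark = SparkSession.builder.appName("rtdip-example")
    for configuration in temp_spark_configuration.items():
        spark = spark.config(configuration[0], configuration[1])
    session = spark.getOrCreate()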
@@ -15,8 +15,12 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 import pytest
-from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import _get_package_version
+from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_package_version,
+    _get_python_package_version,
+)
 from src.sdk.python.rtdip_sdk.pipelines.destinations.spark.delta import (
     SparkDeltaDestination,
 )
@@ -38,11 +42,19 @@ def test_spark_delta_write_setup():
         None, {}, "test_delta_destination_setup", "overwrite"
     )
     assert delta_destination.system_type().value == 2
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert delta_destination.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
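The new version check exists because Delta Lake 3.0 renamed the Maven artifact from delta-core to delta-spark. A hedged sketch of what the imported _get_python_package_version helper plausibly looks like; its real body lives in _sdk_utils/compare_versions.py and is not shown in this diff:

    # Hedged sketch only; assumes the helper wraps importlib_metadata and semver,
    # mirroring the imports the tests already use.
    from importlib_metadata import version
    from semver.version import Version

    def _get_python_package_version(package_name: str) -> Version:
        # Parse the installed distribution's version string into a semver Version,
        # so callers can use Version.compare as the tests above do.
        return Version.parse(version(package_name))

    # Usage, mirroring the tests: pick the artifact id for the installed delta-spark.
    delta_spark_artifact_id = (
        "delta-spark_2.12"
        if _get_python_package_version("delta-spark") >= Version.parse("3.0.0")
        else "delta-core_2.12"
    )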
@@ -15,9 +15,11 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 import pytest
 from importlib_metadata import version
 from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_python_package_version,
     _package_version_meets_minimum,
     _get_package_version,
 )
@@ -51,11 +53,19 @@ def test_spark_delta_merge_write_setup(spark_session: SparkSession):
         spark_session, None, "test_delta_merge_destination_setup", {}, "1=2"
     )
     assert delta_merge_destination.system_type().value == 2
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert delta_merge_destination.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
@@ -15,8 +15,12 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 import pytest
-from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import _get_package_version
+from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_package_version,
+    _get_python_package_version,
+)
 from src.sdk.python.rtdip_sdk.pipelines.destinations import (
     SparkPCDMLatestToDeltaDestination,
 )
@@ -73,11 +77,19 @@ def test_spark_pcdm_latest_to_delta_write_setup(spark_session: SparkSession):
         "test_delta_latest_destination_setup",
     )
     assert pcdm_latest_o_delta_destination.system_type().value == 2
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert pcdm_latest_o_delta_destination.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
@@ -15,8 +15,12 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 import pytest
-from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import _get_package_version
+from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_package_version,
+    _get_python_package_version,
+)
 from src.sdk.python.rtdip_sdk.pipelines.destinations.spark.delta import (
     SparkDeltaDestination,
 )
@@ -81,11 +85,19 @@ def test_spark_pcdm_to_delta_write_setup(spark_session: SparkSession):
         "append",
     )
     assert pcdm_to_delta_destination.system_type().value == 2
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert pcdm_to_delta_destination.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
@@ -15,9 +15,13 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 import pytest
 from pytest_mock import MockerFixture
-from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import _get_package_version
+from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_package_version,
+    _get_python_package_version,
+)
 from src.sdk.python.rtdip_sdk.pipelines.sources.spark.autoloader import (
     DataBricksAutoLoaderSource,
 )
@@ -33,11 +37,19 @@
 def test_databricks_autoloader_setup(spark_session: SparkSession):
     autoloader_source = DataBricksAutoLoaderSource(spark_session, {}, path, "parquet")
     assert autoloader_source.system_type().value == 3
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert autoloader_source.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
16 changes: 14 additions & 2 deletions tests/sdk/python/rtdip_sdk/pipelines/sources/spark/test_delta.py
@@ -15,9 +15,13 @@
 import sys

 sys.path.insert(0, ".")
+from semver.version import Version
 from importlib_metadata import version
 import pytest
-from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import _get_package_version
+from src.sdk.python.rtdip_sdk._sdk_utils.compare_versions import (
+    _get_package_version,
+    _get_python_package_version,
+)
 from src.sdk.python.rtdip_sdk.pipelines.destinations.spark.delta import (
     SparkDeltaDestination,
 )
@@ -34,11 +38,19 @@
 def test_spark_delta_read_setup(spark_session: SparkSession):
     delta_source = SparkDeltaSource(spark_session, {}, "test_spark_delta_read_setup")
     assert delta_source.system_type().value == 2
+    delta_spark_artifact_id = "delta-core_2.12"
+    if (
+        Version.compare(
+            _get_python_package_version("delta-spark"), Version.parse("3.0.0")
+        )
+        >= 0
+    ):
+        delta_spark_artifact_id = "delta-spark_2.12"
     assert delta_source.libraries() == Libraries(
         maven_libraries=[
             MavenLibrary(
                 group_id="io.delta",
-                artifact_id="delta-core_2.12",
+                artifact_id=delta_spark_artifact_id,
                 version=_get_package_version("delta-spark"),
             )
         ],
