Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ on:
- main
paths:
- '.github/workflows/docker_image.yml'
- '.github/workflows/util/install-resources.sh'
- '.github/workflows/util/install-spark-deps.sh'
- '.github/workflows/util/install-spark-resources.sh'
- 'dev/docker/*'
- 'dev/docker/cudf/*'
- 'dev/docker/ubuntu/*'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/flink.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
source /opt/rh/gcc-toolset-11/enable
sudo dnf install -y patchelf
sudo yum install https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/tzdata-2025a-1.el9.noarch.rpm -y
sudo .github/workflows/util/install-flink-resources.sh
sudo .github/workflows/util/install-flink-deps.sh
git clone -b gluten-0530 https://github.com/bigo-sg/velox4j.git
cd velox4j && git reset --hard 889bafcf2fa04e8c31a30edbdf40fe203ef58484
git apply $GITHUB_WORKSPACE/gluten-flink/patches/fix-velox4j.patch
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
# for spark.test.home in mvn test.
#
# This file can be:
# 1. Executed directly: ./install-resources.sh <spark-version> [install-dir]
# 2. Sourced to use functions: source install-resources.sh; install_hadoop; setup_hdfs
# Sourced to use functions: source install-spark-deps.sh; install_hadoop; setup_hdfs

set -e

Expand Down Expand Up @@ -152,112 +151,3 @@ function setup_minio {
mc alias set s3local http://localhost:9100 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
mc mb -p s3local/gluten-it || true
}

# Installs Spark binary and source releases with:
# 1 - spark version
# 2 - hadoop version
# 3 - scala version
# Downloads and installs a Spark binary + source release.
# Globals:
#   INSTALL_DIR (read) - installation root; caller must set it. Downloads
#     land in the current working directory, so callers should cd into a
#     scratch/installation directory first.
# Arguments:
#   1 - spark version (e.g. "3.5.5")
#   2 - hadoop version (e.g. "3")
#   3 - scala version ("2.12" or "2.13")
# Exits with status 2 on a checksum mismatch.
function install_spark() {
  local spark_version="$1"
  local hadoop_version="$2"
  local scala_version="$3"
  # e.g. "3.5.5" -> "35"; used in the shims directory name.
  local spark_version_short
  spark_version_short=$(echo "${spark_version}" | cut -d '.' -f 1,2 | tr -d '.')
  local scala_suffix=''
  local scala_suffix_short=''
  if [ "${scala_version}" == '2.13' ]; then
    scala_suffix='-scala-2.13'        # directory-name suffix
    scala_suffix_short='-scala2.13'   # release-artifact suffix
  fi
  local mirror_host='https://www.apache.org/dyn/closer.lua/'
  # Fallback mirror due to closer.lua slowness.
  local mirror_host2='https://mirror.lyrahosting.com/apache/'
  local url_query='?action=download'
  local checksum_suffix='sha512'
  local url_path="spark/spark-${spark_version}/"
  local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz"
  local local_binary_checksum="${local_binary}.${checksum_suffix}"
  local local_source="spark-${spark_version}.tgz"
  local local_source_checksum="${local_source}.${checksum_suffix}"
  local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}"
  local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}"
  local remote_source="${mirror_host2}${url_path}${local_source}${url_query}"
  local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}"
  local wget_opts="--no-verbose --no-check-certificate"

  wget ${wget_opts} -O "${local_binary}" "${remote_binary}"
  wget ${wget_opts} -O "${local_source}" "${remote_source}"

  # Checksum may not have been specified; don't check if shasum is unavailable.
  if [ "$(command -v shasum)" ]; then
    wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}"
    if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null; then
      echo "Bad checksum from ${remote_binary_checksum}" 1>&2
      rm -f "${local_binary_checksum}"
      exit 2
    fi
    rm -f "${local_binary_checksum}"

    wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}"
    if ! shasum -a 512 -c "${local_source_checksum}" > /dev/null; then
      echo "Bad checksum from ${remote_source_checksum}" 1>&2
      rm -f "${local_source_checksum}"
      exit 2
    fi
    rm -f "${local_source_checksum}"
  else
    echo "Skipping checksum because shasum is not installed." 1>&2
  fi

  local release_dir="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}"
  local shim_home="${INSTALL_DIR}/shims/spark${spark_version_short}${scala_suffix}/spark_home"

  # From the binary release, only jars, python and bin are needed.
  tar --strip-components=1 -xf "${local_binary}" \
    "${release_dir}/jars/" \
    "${release_dir}/python" \
    "${release_dir}/bin"
  mkdir -p "${shim_home}/assembly/target/scala-${scala_version}"
  mv jars "${shim_home}/assembly/target/scala-${scala_version}"
  mv python "${shim_home}"
  mv bin "${shim_home}"

  # From the source release, only the SQL test resources are needed.
  # BUGFIX: anchor under ${INSTALL_DIR} like the binary paths above; the
  # original used a cwd-relative "shims/..." path, which only worked because
  # every caller happened to cd into ${INSTALL_DIR} beforehand.
  tar --strip-components=1 -xf "${local_source}" "spark-${spark_version}/sql/core/src/test/resources/"
  mkdir -p "${shim_home}/"
  mv sql "${shim_home}/"

  rm -rf "${local_binary}"
  rm -rf "${local_source}"
}

# Only run install_spark when script is executed directly (not sourced)
# Only run install_spark when the script is executed directly (not sourced).
# Usage: <spark-version-tag> [install-dir]; install-dir defaults to /opt/.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  INSTALL_DIR=${2:-/opt/}
  mkdir -p "${INSTALL_DIR}"

  # Map the requested version tag to a concrete (release, scala) pair, then
  # install once below instead of repeating cd+install in every arm.
  case "$1" in
    3.3)
      spark_release="3.3.1"; scala_version="2.12" ;;
    3.4)
      spark_release="3.4.4"; scala_version="2.12" ;;
    3.5)
      spark_release="3.5.5"; scala_version="2.12" ;;
    3.5-scala2.13)
      spark_release="3.5.5"; scala_version="2.13" ;;
    4.0)
      # Using scala 2.12 as a hack, as the 4.0 release has no 2.13 suffix.
      spark_release="4.0.1"; scala_version="2.12" ;;
    4.1)
      # Using scala 2.12 as a hack, as the 4.1 release has no 2.13 suffix.
      spark_release="4.1.1"; scala_version="2.12" ;;
    *)
      echo "Spark version is expected to be specified." 1>&2
      exit 1 ;;
  esac

  cd "${INSTALL_DIR}" && install_spark "${spark_release}" "3" "${scala_version}"
fi
132 changes: 132 additions & 0 deletions .github/workflows/util/install-spark-resources.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Download Spark resources, required by some Spark UTs. The resource path should be set
# for spark.test.home in mvn test.
#
# This file can be:
# Executed directly: ./install-spark-resources.sh <spark-version> [install-dir]

set -e

# Installs Spark binary and source releases with:
# 1 - spark version
# 2 - hadoop version
# 3 - scala version
# Downloads and installs a Spark binary + source release.
# Globals:
#   INSTALL_DIR (read) - installation root; caller must set it. Downloads
#     land in the current working directory, so callers should cd into a
#     scratch/installation directory first.
# Arguments:
#   1 - spark version (e.g. "3.5.5")
#   2 - hadoop version (e.g. "3")
#   3 - scala version ("2.12" or "2.13")
# Exits with status 2 on a checksum mismatch.
function install_spark() {
  local spark_version="$1"
  local hadoop_version="$2"
  local scala_version="$3"
  # e.g. "3.5.5" -> "35"; used in the shims directory name.
  local spark_version_short
  spark_version_short=$(echo "${spark_version}" | cut -d '.' -f 1,2 | tr -d '.')
  local scala_suffix=''
  local scala_suffix_short=''
  if [ "${scala_version}" == '2.13' ]; then
    scala_suffix='-scala-2.13'        # directory-name suffix
    scala_suffix_short='-scala2.13'   # release-artifact suffix
  fi
  local mirror_host='https://www.apache.org/dyn/closer.lua/'
  # Fallback mirror due to closer.lua slowness.
  local mirror_host2='https://mirror.lyrahosting.com/apache/'
  local url_query='?action=download'
  local checksum_suffix='sha512'
  local url_path="spark/spark-${spark_version}/"
  local local_binary="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}.tgz"
  local local_binary_checksum="${local_binary}.${checksum_suffix}"
  local local_source="spark-${spark_version}.tgz"
  local local_source_checksum="${local_source}.${checksum_suffix}"
  local remote_binary="${mirror_host2}${url_path}${local_binary}${url_query}"
  local remote_binary_checksum="${mirror_host}${url_path}${local_binary_checksum}${url_query}"
  local remote_source="${mirror_host2}${url_path}${local_source}${url_query}"
  local remote_source_checksum="${mirror_host}${url_path}${local_source_checksum}${url_query}"
  local wget_opts="--no-verbose --no-check-certificate"

  wget ${wget_opts} -O "${local_binary}" "${remote_binary}"
  wget ${wget_opts} -O "${local_source}" "${remote_source}"

  # Checksum may not have been specified; don't check if shasum is unavailable.
  if [ "$(command -v shasum)" ]; then
    wget ${wget_opts} -O "${local_binary_checksum}" "${remote_binary_checksum}"
    if ! shasum -a 512 -c "${local_binary_checksum}" > /dev/null; then
      echo "Bad checksum from ${remote_binary_checksum}" 1>&2
      rm -f "${local_binary_checksum}"
      exit 2
    fi
    rm -f "${local_binary_checksum}"

    wget ${wget_opts} -O "${local_source_checksum}" "${remote_source_checksum}"
    if ! shasum -a 512 -c "${local_source_checksum}" > /dev/null; then
      echo "Bad checksum from ${remote_source_checksum}" 1>&2
      rm -f "${local_source_checksum}"
      exit 2
    fi
    rm -f "${local_source_checksum}"
  else
    echo "Skipping checksum because shasum is not installed." 1>&2
  fi

  local release_dir="spark-${spark_version}-bin-hadoop${hadoop_version}${scala_suffix_short}"
  local shim_home="${INSTALL_DIR}/shims/spark${spark_version_short}${scala_suffix}/spark_home"

  # From the binary release, only jars, python and bin are needed.
  tar --strip-components=1 -xf "${local_binary}" \
    "${release_dir}/jars/" \
    "${release_dir}/python" \
    "${release_dir}/bin"
  mkdir -p "${shim_home}/assembly/target/scala-${scala_version}"
  mv jars "${shim_home}/assembly/target/scala-${scala_version}"
  mv python "${shim_home}"
  mv bin "${shim_home}"

  # From the source release, only the SQL test resources are needed.
  # BUGFIX: anchor under ${INSTALL_DIR} like the binary paths above; the
  # original used a cwd-relative "shims/..." path, which only worked because
  # every caller happened to cd into ${INSTALL_DIR} beforehand.
  tar --strip-components=1 -xf "${local_source}" "spark-${spark_version}/sql/core/src/test/resources/"
  mkdir -p "${shim_home}/"
  mv sql "${shim_home}/"

  rm -rf "${local_binary}"
  rm -rf "${local_source}"
}

# Only run install_spark when script is executed directly (not sourced)
# Only run install_spark when the script is executed directly (not sourced).
# Usage: <spark-version-tag> [install-dir]; install-dir defaults to /opt/.
# NOTE: this span also strips non-shell review-UI text that was interleaved
# into the captured source; it is not part of the script.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  INSTALL_DIR=${2:-/opt/}
  mkdir -p "${INSTALL_DIR}"

  # Map the requested version tag to a concrete (release, scala) pair, then
  # install once below instead of repeating cd+install in every arm.
  case "$1" in
    3.3)
      spark_release="3.3.1"; scala_version="2.12" ;;
    3.4)
      spark_release="3.4.4"; scala_version="2.12" ;;
    3.5)
      spark_release="3.5.5"; scala_version="2.12" ;;
    3.5-scala2.13)
      spark_release="3.5.5"; scala_version="2.13" ;;
    4.0)
      # Using scala 2.12 as a hack, as the 4.0 release has no 2.13 suffix.
      # NOTE(review): a reviewer suggested also renaming the extracted
      # assembly/target/scala-2.12 dir to scala-2.13 here; deliberately not
      # done, since existing CI jobs still perform that mv themselves and
      # would fail if the directory were already moved.
      spark_release="4.0.1"; scala_version="2.12" ;;
    4.1)
      # Using scala 2.12 as a hack, as the 4.1 release has no 2.13 suffix.
      spark_release="4.1.1"; scala_version="2.12" ;;
    *)
      echo "Spark version is expected to be specified." 1>&2
      exit 1 ;;
  esac

  cd "${INSTALL_DIR}" && install_spark "${spark_release}" "3" "${scala_version}"
fi
4 changes: 2 additions & 2 deletions .github/workflows/velox_backend_enhanced.yml
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ jobs:
- name: Prepare Spark Resources for Spark 3.5.5
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you help remove this section? It should be unnecessary now.

run: |
rm -rf /opt/shims/spark35
bash .github/workflows/util/install-resources.sh 3.5
bash .github/workflows/util/install-spark-resources.sh 3.5
- name: Build and Run unit test for Spark 3.5.5 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
Expand Down Expand Up @@ -286,7 +286,7 @@ jobs:
- name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This installation may be unnecessary also. Maybe, only need to keep the following command for test with scala-2.13.

mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13

run: |
rm -rf /opt/shims/spark40
bash .github/workflows/util/install-resources.sh 4.0
bash .github/workflows/util/install-spark-resources.sh 4.0
mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
- name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests)
run: |
Expand Down
17 changes: 9 additions & 8 deletions .github/workflows/velox_backend_x86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ on:
pull_request:
paths:
- '.github/workflows/velox_backend_x86.yml'
- '.github/workflows/util/install-resources.sh' #TODO remove after image update
- '.github/workflows/util/install-spark-deps.sh' #TODO remove after image update
- '.github/workflows/util/install-spark-resources.sh' #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps, we can remove the above two lines now.

- 'pom.xml'
- 'backends-velox/**'
- 'gluten-uniffle/**'
Expand Down Expand Up @@ -180,15 +181,15 @@ jobs:
shell: bash
run: |
export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
source .github/workflows/util/install-resources.sh
source .github/workflows/util/install-spark-deps.sh
install_hadoop
setup_hdfs
- name: Install MinIO
if: matrix.os == 'ubuntu:22.04' && matrix.spark == 'spark-3.5' && matrix.java == 'java-8'
shell: bash
run: |
export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
source .github/workflows/util/install-resources.sh
source .github/workflows/util/install-spark-deps.sh
install_minio
- name: Build and run TPC-H / TPC-DS
shell: bash
Expand All @@ -210,7 +211,7 @@ jobs:
if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
[ "${{ matrix.spark }}" = "spark-3.5" ] && \
[ "${{ matrix.java }}" = "java-8" ]; then
source $GITHUB_WORKSPACE/.github/workflows/util/install-resources.sh
source $GITHUB_WORKSPACE/.github/workflows/util/install-spark-deps.sh
SPARK_VERSION=$(echo "${{ matrix.spark }}" | sed 's/spark-//')
setup_minio "$SPARK_VERSION"
fi
Expand Down Expand Up @@ -1259,7 +1260,7 @@ jobs:
- name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto.

run: |
rm -rf /opt/shims/spark40
bash .github/workflows/util/install-resources.sh 4.0
bash .github/workflows/util/install-spark-resources.sh 4.0
mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
- name: Build and Run unit test for Spark 4.0.0 with scala-2.13 (other tests)
run: |
Expand Down Expand Up @@ -1309,7 +1310,7 @@ jobs:
- name: Prepare Spark Resources for Spark 4.0.1 #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto.

run: |
rm -rf /opt/shims/spark40
bash .github/workflows/util/install-resources.sh 4.0
bash .github/workflows/util/install-spark-resources.sh 4.0
mv /opt/shims/spark40/spark_home/assembly/target/scala-2.12 /opt/shims/spark40/spark_home/assembly/target/scala-2.13
- name: Build and Run unit test for Spark 4.0 (slow tests)
run: |
Expand Down Expand Up @@ -1367,7 +1368,7 @@ jobs:
- name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto

run: |
rm -rf /opt/shims/spark41
bash .github/workflows/util/install-resources.sh 4.1
bash .github/workflows/util/install-spark-resources.sh 4.1
mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13
- name: Build and Run unit test for Spark 4.1.0 with scala-2.13 (other tests)
run: |
Expand Down Expand Up @@ -1417,7 +1418,7 @@ jobs:
- name: Prepare Spark Resources for Spark 4.1.0 #TODO remove after image update
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto.

run: |
rm -rf /opt/shims/spark41
bash .github/workflows/util/install-resources.sh 4.1
bash .github/workflows/util/install-spark-resources.sh 4.1
mv /opt/shims/spark41/spark_home/assembly/target/scala-2.12 /opt/shims/spark41/spark_home/assembly/target/scala-2.13
- name: Build and Run unit test for Spark 4.0 (slow tests)
run: |
Expand Down
10 changes: 5 additions & 5 deletions dev/docker/Dockerfile.centos8-dynamic-build
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ RUN set -ex; \
wget -nv ${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \
git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \
cd /opt/gluten/.github/workflows/util/; \
./install-resources.sh 3.3; \
./install-resources.sh 3.4; \
./install-resources.sh 3.5; \
./install-resources.sh 3.5-scala2.13; \
./install-resources.sh 4.0; \
./install-spark-resources.sh 3.3; \
./install-spark-resources.sh 3.4; \
./install-spark-resources.sh 3.5; \
./install-spark-resources.sh 3.5-scala2.13; \
./install-spark-resources.sh 4.0; \
if [ "$(uname -m)" = "aarch64" ]; then \
export CPU_TARGET="aarch64"; \
fi; \
Expand Down
10 changes: 5 additions & 5 deletions dev/docker/Dockerfile.centos9-dynamic-build
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ RUN set -ex; \
wget -nv ${mirror_host}/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz?action=download -O /opt/hadoop-2.8.5.tar.gz; \
git clone --depth=1 https://github.com/apache/gluten /opt/gluten; \
cd /opt/gluten/.github/workflows/util/; \
./install-resources.sh 3.3; \
./install-resources.sh 3.4; \
./install-resources.sh 3.5; \
./install-resources.sh 3.5-scala2.13; \
./install-resources.sh 4.0; \
./install-spark-resources.sh 3.3; \
./install-spark-resources.sh 3.4; \
./install-spark-resources.sh 3.5; \
./install-spark-resources.sh 3.5-scala2.13; \
./install-spark-resources.sh 4.0; \
if [ "$(uname -m)" = "aarch64" ]; then \
export CPU_TARGET="aarch64"; \
fi; \
Expand Down
Loading