apache · andygrove · Apr 14, 2025 · Apr 14, 2025 · Apr 14, 2025 · Apr 15, 2025
diff --git a/dev/benchmarks/README.md b/dev/benchmarks/README.md
@@ -0,0 +1,75 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Benchmarks
+
+[WIP] The purpose of this document and folder is to automate running the 100 GB benchmark in AWS so that anyone
+can run it as part of the release process and update the charts that we use in the repo.
+
+This is separate from the general benchmarking advice that we have in the contributor guide but does duplicate much of it.
+
+The documentation assumes that we are using instance type `m6id.2xlarge` (subject to change). 
+
+For now I am using 1024 GB EBS but this can probably be much smaller, maybe as low as 100 GB.
+
+Connect to the instance and clone this repo (or the fork/branch to be used for benchmarking).
+
+```shell
+git clone https://github.com/apache/datafusion-comet
+```
+
+## Install Prerequisites
+
+This is not fully tested yet. I am copying and pasting from the script for testing. 
+
+```shell
+cd dev/benchmarks
+./setup.sh
+```
+
+## Generate data locally
+
+TODO this is using the new tpchgen-rs project, which is much more convenient than the previous approach, but it
+only generates a single Parquet file per table by default. I have not looked into any performance impact that this may
+have on the benchmark.
+
+```shell
+cargo install tpchgen-cli
+tpchgen-cli -s 100 --format parquet
+```
+
+## Spark Benchmark
+
+TODO clear page cache before run
+
+```shell
+./spark-tpch.sh
+```
+
+## Comet Benchmark
+
+TODO clear page cache before run
+
+```shell
+./comet-tpch.sh
+```
+
+## Produce Charts
+
+TBD
diff --git a/dev/benchmarks/comet-tpch.sh b/dev/benchmarks/comet-tpch.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Run the TPC-H queries through tpcbench.py with Comet enabled. Requires SPARK_HOME; SPARK_MASTER defaults to spark://localhost:7077.
+export COMET_JAR="$(pwd)/../../spark/target/comet-spark-spark3.5_2.12-0.8.0-SNAPSHOT.jar"
+"${SPARK_HOME:?SPARK_HOME must be set}"/bin/spark-submit \
+    --master "${SPARK_MASTER:-spark://localhost:7077}" \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.instances=1 \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=8 \
+    --conf spark.executor.memory=8g \
+    --conf spark.memory.offHeap.enabled=true \
+    --conf spark.memory.offHeap.size=8g \
+    --conf spark.local.dir=/home/ec2-user/tmp \
+    --conf spark.driver.extraJavaOptions="-Djava.io.tmpdir=/home/ec2-user/tmp" \
+    --conf spark.executor.extraJavaOptions="-Djava.io.tmpdir=/home/ec2-user/tmp" \
+    --jars "$COMET_JAR" \
+    --driver-class-path "$COMET_JAR" \
+    --conf spark.driver.extraClassPath="$COMET_JAR" \
+    --conf spark.executor.extraClassPath="$COMET_JAR" \
+    --conf spark.plugins=org.apache.spark.CometPlugin \
+    --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \
+    --conf spark.comet.enabled=true \
+    --conf spark.comet.exec.shuffle.enableFastEncoding=true \
+    --conf spark.comet.exec.shuffle.fallbackToColumnar=true \
+    --conf spark.comet.cast.allowIncompatible=true \
+    --conf spark.comet.exec.replaceSortMergeJoin=true \
+    tpcbench.py \
+    --name comet \
+    --benchmark tpch \
+    --data /home/ec2-user/ \
+    --queries /home/ec2-user/datafusion-benchmarks/tpch/queries \
+    --output . \
+    --iterations 1
diff --git a/dev/benchmarks/setup.sh b/dev/benchmarks/setup.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+sudo yum install -y wget
+
+# Install Rust
+sudo yum groupinstall -y "Development Tools"
+curl https://sh.rustup.rs -sSf | sh -s -- -y
+. "$HOME/.cargo/env"
+
+# Install Java
+sudo yum install -y java-17-amazon-corretto-headless java-17-amazon-corretto-devel
+export JAVA_HOME=/usr/lib/jvm/java-17-amazon-corretto
+
+# Build Comet
+make release PROFILES="-Pspark-3.5"
+
+# Clone datafusion-benchmarks repo, which has the queries
+git clone https://github.com/apache/datafusion-benchmarks.git
+
+# Install Spark
+wget https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
+if [ ! -d "spark-3.5.5-bin-hadoop3" ]; then
+  wget https://dlcdn.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
+  tar xzf spark-3.5.5-bin-hadoop3.tgz
+fi
+
+# Create a temp folder on the primary EBS volume
+mkdir /home/ec2-user/tmp
+
+# Start Spark
+export SPARK_HOME=/home/ec2-user/spark-3.5.5-bin-hadoop3
+export SPARK_MASTER=spark://localhost:7077
+$SPARK_HOME/sbin/start-master.sh --host localhost
+$SPARK_HOME/sbin/start-worker.sh $SPARK_MASTER
diff --git a/dev/benchmarks/spark-env.sh b/dev/benchmarks/spark-env.sh
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+SPARK_LOCAL_DIRS=/home/ec2-user/tmp  # same directory that setup.sh creates and that the *-tpch.sh scripts pass as spark.local.dir
diff --git a/dev/benchmarks/spark-tpch.sh b/dev/benchmarks/spark-tpch.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# Run the TPC-H queries through tpcbench.py on Spark without Comet. Requires SPARK_HOME; SPARK_MASTER defaults to spark://localhost:7077.
+export SPARK_MASTER="${SPARK_MASTER:-spark://localhost:7077}"
+"${SPARK_HOME:?SPARK_HOME must be set}"/bin/spark-submit \
+    --master "$SPARK_MASTER" \
+    --conf spark.driver.memory=8G \
+    --conf spark.executor.instances=1 \
+    --conf spark.executor.cores=8 \
+    --conf spark.cores.max=8 \
+    --conf spark.executor.memory=8g \
+    --conf spark.memory.offHeap.enabled=true \
+    --conf spark.memory.offHeap.size=8g \
+    --conf spark.local.dir=/home/ec2-user/tmp \
+    --conf spark.driver.extraJavaOptions="-Djava.io.tmpdir=/home/ec2-user/tmp" \
+    --conf spark.executor.extraJavaOptions="-Djava.io.tmpdir=/home/ec2-user/tmp" \
+    tpcbench.py \
+    --name spark \
+    --benchmark tpch \
+    --data /home/ec2-user/ \
+    --queries /home/ec2-user/datafusion-benchmarks/tpch/queries \
+    --output . \
+    --iterations 1
diff --git a/dev/benchmarks/tpcbench.py b/dev/benchmarks/tpcbench.py
@@ -0,0 +1,126 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+from datetime import datetime
+import json
+from pyspark.sql import SparkSession
+import time
+
+def main(benchmark: str, data_path: str, query_path: str, iterations: int, output: str, name: str):
+
+    # Initialize a SparkSession
+    spark = SparkSession.builder \
+        .appName(f"{name} benchmark derived from {benchmark}") \
+        .getOrCreate()
+
+    # Register the tables
+    if benchmark == "tpch":
+        num_queries = 22
+        table_names = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"]
+    elif benchmark == "tpcds":
+        num_queries = 99
+        table_names = ["call_center", "catalog_page", "catalog_returns", "catalog_sales", "customer",
+           "customer_address", "customer_demographics", "date_dim", "time_dim", "household_demographics",
+           "income_band", "inventory", "item", "promotion", "reason", "ship_mode", "store", "store_returns",
+           "store_sales", "warehouse", "web_page", "web_returns", "web_sales", "web_site"]
+    else:
+        raise "invalid benchmark"
+
+    for table in table_names:
+        path = f"{data_path}/{table}.parquet"
+        print(f"Registering table {table} using path {path}")
+        df = spark.read.parquet(path)
+        df.createOrReplaceTempView(table)
+
+    conf_dict = {k: v for k, v in spark.sparkContext.getConf().getAll()}
+
+    results = {
+        'engine': 'datafusion-comet',
+        'benchmark': benchmark,
+        'data_path': data_path,
+        'query_path': query_path,
+        'spark_conf': conf_dict,
+    }
+
+    for iteration in range(0, iterations):
+        print(f"Starting iteration {iteration} of {iterations}")
+        iter_start_time = time.time()
+
+        for query in range(1, num_queries+1):
+            spark.sparkContext.setJobDescription(f"{benchmark} q{query}")
+
+#            if query != 13:
+#                continue
+
+            # read text file
+            path = f"{query_path}/q{query}.sql"
+
+#            if query == 72:
+#                # use version with sensible join order
+#                path = f"{query_path}/q{query}_optimized.sql"
+
+            print(f"Reading query {query} using path {path}")
+            with open(path, "r") as f:
+                text = f.read()
+                # each file can contain multiple queries
+                queries = text.split(";")
+
+                start_time = time.time()
+                for sql in queries:
+                    sql = sql.strip().replace("create view", "create temp view")
+                    if len(sql) > 0:
+                        print(f"Executing: {sql}")
+                        df = spark.sql(sql)
+                        df.explain()
+                        rows = df.collect()
+
+                        print(f"Query {query} returned {len(rows)} rows")
+
+                end_time = time.time()
+                print(f"Query {query} took {end_time - start_time} seconds")
+
+                # store timings in list and later add option to run > 1 iterations
+                query_timings = results.setdefault(query, [])
+                query_timings.append(end_time - start_time)
+
+        iter_end_time = time.time()
+        print(f"Iteration {iteration} took {round(iter_end_time - iter_start_time,2)} seconds")
+
+    str = json.dumps(results, indent=4)
+    current_time_millis = int(datetime.now().timestamp() * 1000)
+    results_path = f"{output}/{name}-{benchmark}-{current_time_millis}.json"
+    print(f"Writing results to {results_path}")
+    with open(results_path, "w") as f:
+        f.write(str)
+
+    # Stop the SparkSession
+    spark.stop()
+
+    #print(str)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="DataFusion benchmark derived from TPC-H / TPC-DS")
+    parser.add_argument("--benchmark", required=True, help="Benchmark to run (tpch or tpcds)")
+    parser.add_argument("--data", required=True, help="Path to data files")
+    parser.add_argument("--queries", required=True, help="Path to query files")
+    parser.add_argument("--iterations", required=False, default="1", help="How many iterations to run")
+    parser.add_argument("--output", required=True, help="Path to write output")
+    parser.add_argument("--name", required=True, help="Prefix for result file e.g. spark/comet/gluten")
+    args = parser.parse_args()
+
+    main(args.benchmark, args.data, args.queries, int(args.iterations), args.output, args.name)