From 95cfdc694d3e0b68979cd06b78b52e107aa58a9f Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 5 Oct 2022 18:01:55 -0700 Subject: [PATCH] [SPARK-40669][SQL][TESTS] Parameterize `rowsNum` in `InMemoryColumnarBenchmark` ### What changes were proposed in this pull request? This PR aims to parameterize `InMemoryColumnarBenchmark` to accept `rowsNum`. ### Why are the changes needed? This enables us to benchmark more flexibly. ``` build/sbt "sql/test:runMain org.apache.spark.sql.execution.columnar.InMemoryColumnarBenchmark 1000000" ... [info] Running benchmark: Int In-Memory scan [info] Running case: columnar deserialization + columnar-to-row [info] Stopped after 3 iterations, 444 ms [info] Running case: row-based deserialization [info] Stopped after 3 iterations, 462 ms [info] OpenJDK 64-Bit Server VM 17.0.4+8-LTS on Mac OS X 12.6 [info] Apple M1 Max [info] Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] -------------------------------------------------------------------------------------------------------------------------- [info] columnar deserialization + columnar-to-row 119 148 26 8.4 118.5 1.0X [info] row-based deserialization 119 154 32 8.4 119.5 1.0X ``` ``` $ build/sbt "sql/test:runMain org.apache.spark.sql.execution.columnar.InMemoryColumnarBenchmark 10000000" ... 
[info] Running benchmark: Int In-Memory scan [info] Running case: columnar deserialization + columnar-to-row [info] Stopped after 3 iterations, 3855 ms [info] Running case: row-based deserialization [info] Stopped after 3 iterations, 4250 ms [info] OpenJDK 64-Bit Server VM 17.0.4+8-LTS on Mac OS X 12.6 [info] Apple M1 Max [info] Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] -------------------------------------------------------------------------------------------------------------------------- [info] columnar deserialization + columnar-to-row 1082 1285 199 9.2 108.2 1.0X [info] row-based deserialization 1057 1417 335 9.5 105.7 1.0X ``` ``` $ build/sbt "sql/test:runMain org.apache.spark.sql.execution.columnar.InMemoryColumnarBenchmark 20000000" [info] Running benchmark: Int In-Memory scan [info] Running case: columnar deserialization + columnar-to-row [info] Stopped after 3 iterations, 8482 ms [info] Running case: row-based deserialization [info] Stopped after 3 iterations, 7534 ms [info] OpenJDK 64-Bit Server VM 17.0.4+8-LTS on Mac OS X 12.6 [info] Apple M1 Max [info] Int In-Memory scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] -------------------------------------------------------------------------------------------------------------------------- [info] columnar deserialization + columnar-to-row 2261 2828 555 8.8 113.1 1.0X [info] row-based deserialization 1788 2511 1187 11.2 89.4 1.3X ``` ### Does this PR introduce _any_ user-facing change? No. This is a benchmark test code. ### How was this patch tested? Manually. Closes #38114 from dongjoon-hyun/SPARK-40669. 
Authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org> --- .../columnar/InMemoryColumnarBenchmark.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarBenchmark.scala index d8f154bfb1e05..55d9fb2731799 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarBenchmark.scala @@ -26,14 +26,15 @@ import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark * {{{ * 1. without sbt: * bin/spark-submit --class <this class> - * --jars <spark core test jar> - * 2. build/sbt "sql/Test/runMain <this class>" - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * --jars <spark core test jar>,<spark catalyst test jar> + * 2. build/sbt "sql/Test/runMain <this class> <rowsNum>" + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain + * <this class> <rowsNum>" * Results will be written to "benchmarks/InMemoryColumnarBenchmark-results.txt". * }}} */ object InMemoryColumnarBenchmark extends SqlBasedBenchmark { - def intCache(rowsNum: Int, numIters: Int): Unit = { + def intCache(rowsNum: Long, numIters: Int): Unit = { val data = spark.range(0, rowsNum, 1, 1).toDF("i").cache() val inMemoryScan = data.queryExecution.executedPlan.collect { @@ -59,8 +60,9 @@ object InMemoryColumnarBenchmark extends SqlBasedBenchmark { } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { - runBenchmark("Int In-memory") { - intCache(rowsNum = 1000000, numIters = 3) + val rowsNum = if (mainArgs.length > 0) mainArgs(0).toLong else 1000000 + runBenchmark(s"Int In-memory with $rowsNum rows") { + intCache(rowsNum = rowsNum, numIters = 3) } } }