From f35e28fea5605de4b28630eb643a821ecd7c8523 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 24 Nov 2020 13:30:06 +0900 Subject: [PATCH] [SPARK-33523][SQL][TEST] Add predicate related benchmark to SubExprEliminationBenchmark ### What changes were proposed in this pull request? This patch adds predicate related benchmark to `SubExprEliminationBenchmark`. ### Why are the changes needed? We should have a benchmark for subexpression elimination of predicate. ### Does this PR introduce _any_ user-facing change? No, dev only. ### How was this patch tested? Run benchmark locally. Closes #30476 from viirya/SPARK-33523. Authored-by: Liang-Chi Hsieh Signed-off-by: HyukjinKwon --- ...ExprEliminationBenchmark-jdk11-results.txt | 22 +++- .../SubExprEliminationBenchmark-results.txt | 22 +++- .../SubExprEliminationBenchmark.scala | 106 ++++++++++-------- 3 files changed, 90 insertions(+), 60 deletions(-) diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt index 3d2b2e5c8edba..1eb7b534d2194 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt @@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz -from_json as subExpr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -subexpressionElimination off, codegen on 25932 26908 916 0.0 259320042.3 1.0X -subexpressionElimination off, codegen off 26085 26159 65 0.0 260848905.0 1.0X -subexpressionElimination on, codegen on 2860 2939 72 0.0 28603312.9 9.1X -subexpressionElimination on, codegen off 2517 2617 93 0.0 25165157.7 10.3X +from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +subExprElimination false, codegen: true 26447 27127 605 0.0 264467933.4 1.0X +subExprElimination false, codegen: false 25673 26035 546 0.0 256732419.1 1.0X +subExprElimination true, codegen: true 1384 1448 102 0.0 13842910.3 19.1X +subExprElimination true, codegen: false 1244 1347 123 0.0 12442389.3 21.3X + +Preparing data for benchmarking ... 
+OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6 +Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz +from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +subexpressionElimination off, codegen on 34631 35449 833 0.0 346309884.0 1.0X +subexpressionElimination off, codegen on 34480 34851 353 0.0 344798490.4 1.0X +subexpressionElimination off, codegen on 16618 16811 291 0.0 166176642.6 2.1X +subexpressionElimination off, codegen on 34316 34667 310 0.0 343157094.7 1.0X diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt index ca2a9c6497500..801f519ca76a1 100644 --- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt +++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt @@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination Preparing data for benchmarking ... 
OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz -from_json as subExpr: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -subexpressionElimination off, codegen on 26503 27622 1937 0.0 265033362.4 1.0X -subexpressionElimination off, codegen off 24920 25376 430 0.0 249196978.2 1.1X -subexpressionElimination on, codegen on 2421 2466 39 0.0 24213606.1 10.9X -subexpressionElimination on, codegen off 2360 2435 87 0.0 23604320.7 11.2X +from_json as subExpr in Project: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +subExprElimination false, codegen: true 22767 23240 424 0.0 227665316.7 1.0X +subExprElimination false, codegen: false 22869 23351 465 0.0 228693464.1 1.0X +subExprElimination true, codegen: true 1328 1340 10 0.0 13280056.2 17.1X +subExprElimination true, codegen: false 1248 1276 31 0.0 12476135.1 18.2X + +Preparing data for benchmarking ... 
+OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6 +Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz +from_json as subExpr in Filter: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +subexpressionElimination off, codegen on 37691 38846 1004 0.0 376913767.9 1.0X +subexpressionElimination off, codegen on 37852 39124 1103 0.0 378517745.5 1.0X +subexpressionElimination off, codegen on 22900 23085 202 0.0 229000242.5 1.6X +subexpressionElimination off, codegen on 38298 38598 374 0.0 382978731.3 1.0X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala index 34b4a70d05a25..e26acbcb3cd21 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Or} import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -39,7 +41,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark { import spark.implicits._ def withFromJson(rowsNum: Int, numIters: Int): Unit = { - val benchmark = new Benchmark("from_json as subExpr", rowsNum, output = output) + val benchmark = new Benchmark("from_json as subExpr in Project", rowsNum, output = output) withTempPath { path => prepareDataInfo(benchmark) @@ -50,57 +52,65 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark { from_json('value, schema).getField(s"col$idx") } - // We only benchmark subexpression performance 
under codegen/non-codegen, so disabling - // json optimization. - benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ => - withSQLConf( - SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false", - SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", - SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY", - SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") { - val df = spark.read - .text(path.getAbsolutePath) - .select(cols: _*) - df.collect() + Seq( + ("false", "true", "CODEGEN_ONLY"), + ("false", "false", "NO_CODEGEN"), + ("true", "true", "CODEGEN_ONLY"), + ("true", "false", "NO_CODEGEN") + ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) => + // We only benchmark subexpression performance under codegen/non-codegen, so disabling + // json optimization. + val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled" + benchmark.addCase(caseName, numIters) { _ => + withSQLConf( + SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled, + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled, + SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory, + SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") { + val df = spark.read + .text(path.getAbsolutePath) + .select(cols: _*) + df.write.mode("overwrite").format("noop").save() + } } } - benchmark.addCase("subexpressionElimination off, codegen off", numIters) { _ => - withSQLConf( - SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false", - SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", - SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN", - SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") { - val df = spark.read - .text(path.getAbsolutePath) - .select(cols: _*) - df.collect() - } - } + benchmark.run() + } + } - benchmark.addCase("subexpressionElimination on, codegen on", numIters) { _ => - withSQLConf( - SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true", - SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true", - 
SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY", - SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") { - val df = spark.read - .text(path.getAbsolutePath) - .select(cols: _*) - df.collect() - } - } + def withFilter(rowsNum: Int, numIters: Int): Unit = { + val benchmark = new Benchmark("from_json as subExpr in Filter", rowsNum, output = output) - benchmark.addCase("subexpressionElimination on, codegen off", numIters) { _ => - withSQLConf( - SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true", - SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", - SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN", - SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") { - val df = spark.read - .text(path.getAbsolutePath) - .select(cols: _*) - df.collect() + withTempPath { path => + prepareDataInfo(benchmark) + val numCols = 1000 + val schema = writeWideRow(path.getAbsolutePath, rowsNum, numCols) + + val predicate = (0 until numCols).map { idx => + (from_json('value, schema).getField(s"col$idx") >= Literal(100000)).expr + }.asInstanceOf[Seq[Expression]].reduce(Or) + + Seq( + ("false", "true", "CODEGEN_ONLY"), + ("false", "false", "NO_CODEGEN"), + ("true", "true", "CODEGEN_ONLY"), + ("true", "false", "NO_CODEGEN") + ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) => + // We only benchmark subexpression performance under codegen/non-codegen, so disabling + // json optimization. 
+        val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled"
+        benchmark.addCase(caseName, numIters) { _ =>
+          withSQLConf(
+            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled,
+            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled,
+            SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory,
+            SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+            val df = spark.read
+              .text(path.getAbsolutePath)
+              .where(Column(predicate))
+            df.write.mode("overwrite").format("noop").save()
+          }
         }
       }
 
@@ -108,11 +118,11 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
     }
   }
 
-
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     val numIters = 3
     runBenchmark("Benchmark for performance of subexpression elimination") {
       withFromJson(100, numIters)
+      withFilter(100, numIters)
     }
   }
 }