From f35e28fea5605de4b28630eb643a821ecd7c8523 Mon Sep 17 00:00:00 2001
From: Liang-Chi Hsieh <viirya@gmail.com>
Date: Tue, 24 Nov 2020 13:30:06 +0900
Subject: [PATCH] [SPARK-33523][SQL][TEST] Add predicate related benchmark to
 SubExprEliminationBenchmark

### What changes were proposed in this pull request?

This patch adds a predicate-related benchmark to `SubExprEliminationBenchmark`.

### Why are the changes needed?

We should have a benchmark for subexpression elimination in predicates.

### Does this PR introduce _any_ user-facing change?

No, dev only.

### How was this patch tested?

Run benchmark locally.

Closes #30476 from viirya/SPARK-33523.

Authored-by: Liang-Chi Hsieh <viirya@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
---
 ...ExprEliminationBenchmark-jdk11-results.txt |  22 +++-
 .../SubExprEliminationBenchmark-results.txt   |  22 +++-
 .../SubExprEliminationBenchmark.scala         | 106 ++++++++++--------
 3 files changed, 90 insertions(+), 60 deletions(-)

diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt
index 3d2b2e5c8edba..1eb7b534d2194 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt
@@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           25932          26908         916          0.0   259320042.3       1.0X
-subexpressionElimination off, codegen off          26085          26159          65          0.0   260848905.0       1.0X
-subexpressionElimination on, codegen on             2860           2939          72          0.0    28603312.9       9.1X
-subexpressionElimination on, codegen off            2517           2617          93          0.0    25165157.7      10.3X
+from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           26447          27127         605          0.0   264467933.4       1.0X
+subExprElimination false, codegen: false          25673          26035         546          0.0   256732419.1       1.0X
+subExprElimination true, codegen: true             1384           1448         102          0.0    13842910.3      19.1X
+subExprElimination true, codegen: false            1244           1347         123          0.0    12442389.3      21.3X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
+Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
+from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           34631          35449         833          0.0   346309884.0       1.0X
+subExprElimination false, codegen: false          34480          34851         353          0.0   344798490.4       1.0X
+subExprElimination true, codegen: true            16618          16811         291          0.0   166176642.6       2.1X
+subExprElimination true, codegen: false           34316          34667         310          0.0   343157094.7       1.0X
 
 
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
index ca2a9c6497500..801f519ca76a1 100644
--- a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
+++ b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
@@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           26503          27622        1937          0.0   265033362.4       1.0X
-subexpressionElimination off, codegen off          24920          25376         430          0.0   249196978.2       1.1X
-subexpressionElimination on, codegen on             2421           2466          39          0.0    24213606.1      10.9X
-subexpressionElimination on, codegen off            2360           2435          87          0.0    23604320.7      11.2X
+from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           22767          23240         424          0.0   227665316.7       1.0X
+subExprElimination false, codegen: false          22869          23351         465          0.0   228693464.1       1.0X
+subExprElimination true, codegen: true             1328           1340          10          0.0    13280056.2      17.1X
+subExprElimination true, codegen: false            1248           1276          31          0.0    12476135.1      18.2X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
+Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
+from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           37691          38846        1004          0.0   376913767.9       1.0X
+subExprElimination false, codegen: false          37852          39124        1103          0.0   378517745.5       1.0X
+subExprElimination true, codegen: true            22900          23085         202          0.0   229000242.5       1.6X
+subExprElimination true, codegen: false           38298          38598         374          0.0   382978731.3       1.0X
 
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala
index 34b4a70d05a25..e26acbcb3cd21 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala
@@ -17,6 +17,8 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Or}
 import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -39,7 +41,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
   import spark.implicits._
 
   def withFromJson(rowsNum: Int, numIters: Int): Unit = {
-    val benchmark = new Benchmark("from_json as subExpr", rowsNum, output = output)
+    val benchmark = new Benchmark("from_json as subExpr in Project", rowsNum, output = output)
 
     withTempPath { path =>
       prepareDataInfo(benchmark)
@@ -50,57 +52,65 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
         from_json('value, schema).getField(s"col$idx")
       }
 
-      // We only benchmark subexpression performance under codegen/non-codegen, so disabling
-      // json optimization.
-      benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.collect()
+      Seq(
+        ("false", "true", "CODEGEN_ONLY"),
+        ("false", "false", "NO_CODEGEN"),
+        ("true", "true", "CODEGEN_ONLY"),
+        ("true", "false", "NO_CODEGEN")
+      ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) =>
+        // We only benchmark subexpression performance under codegen/non-codegen, so disabling
+        // json optimization.
+        val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled"
+        benchmark.addCase(caseName, numIters) { _ =>
+          withSQLConf(
+            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled,
+            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled,
+            SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory,
+            SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+            val df = spark.read
+              .text(path.getAbsolutePath)
+              .select(cols: _*)
+            df.write.mode("overwrite").format("noop").save()
+          }
         }
       }
 
-      benchmark.addCase("subexpressionElimination off, codegen off", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.collect()
-        }
-      }
+      benchmark.run()
+    }
+  }
 
-      benchmark.addCase("subexpressionElimination on, codegen on", numIters) { _ =>
-        withSQLConf(
-            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
-            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-            SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
-            SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.collect()
-        }
-      }
+  def withFilter(rowsNum: Int, numIters: Int): Unit = {
+    val benchmark = new Benchmark("from_json as subExpr in Filter", rowsNum, output = output)
 
+    withTempPath { path =>
+      prepareDataInfo(benchmark)
+      val numCols = 1000
+      val schema = writeWideRow(path.getAbsolutePath, rowsNum, numCols)
+
+      val predicate = (0 until numCols).map { idx =>
+        (from_json('value, schema).getField(s"col$idx") >= Literal(100000)).expr
+      }.asInstanceOf[Seq[Expression]].reduce(Or)
+
+      Seq(
+        ("false", "true", "CODEGEN_ONLY"),
+        ("false", "false", "NO_CODEGEN"),
+        ("true", "true", "CODEGEN_ONLY"),
+        ("true", "false", "NO_CODEGEN")
+      ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) =>
+        // Disable JSON expression optimization so that only subexpression elimination is
+        // measured, with and without codegen. Each case is labeled with its own settings.
+        val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled"
+        benchmark.addCase(caseName, numIters) { _ =>
+          withSQLConf(
+            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled,
+            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled,
+            SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory,
+            SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+            val df = spark.read
+              .text(path.getAbsolutePath)
+              .where(Column(predicate))
+            df.write.mode("overwrite").format("noop").save()
+          }
         }
       }
 
@@ -108,11 +118,11 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
     }
   }
 
-
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     val numIters = 3
     runBenchmark("Benchmark for performance of subexpression elimination") {
       withFromJson(100, numIters)
+      withFilter(100, numIters)
     }
   }
 }