apache · viirya · Nov 22, 2020 · Nov 24, 2020 · Nov 24, 2020 · dongjoon-hyun
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt
@@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+from_json as subExpr in Project:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 -------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           25932          26908         916          0.0   259320042.3       1.0X
-subexpressionElimination off, codegen off          26085          26159          65          0.0   260848905.0       1.0X
-subexpressionElimination on, codegen on             2860           2939          72          0.0    28603312.9       9.1X
-subexpressionElimination on, codegen off            2517           2617          93          0.0    25165157.7      10.3X
+subexpressionElimination off, codegen on           22605          22935         291          0.0   226047196.5       1.0X
+subexpressionElimination off, codegen off          21811          22151         303          0.0   218105716.6       1.0X
+subexpressionElimination on, codegen on             1353           1385          36          0.0    13531011.3      16.7X
+subexpressionElimination on, codegen off            1237           1260          20          0.0    12368657.3      18.3X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
+Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
+from_json as subExpr in Filter:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+-------------------------------------------------------------------------------------------------------------------------
+subexpressionElimination off, codegen on           32792          33101         282          0.0   327922763.5       1.0X
+subexpressionElimination off, codegen off          32809          33433         550          0.0   328088662.6       1.0X
+subexpressionElimination on, codegen on            18173          18828         869          0.0   181734709.5       1.8X
+subexpressionElimination on, codegen off           33695          33951         287          0.0   336950807.7       1.0X
 
 
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
@@ -5,11 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+from_json as subExpr in Project:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 -------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           26503          27622        1937          0.0   265033362.4       1.0X
-subexpressionElimination off, codegen off          24920          25376         430          0.0   249196978.2       1.1X
-subexpressionElimination on, codegen on             2421           2466          39          0.0    24213606.1      10.9X
-subexpressionElimination on, codegen off            2360           2435          87          0.0    23604320.7      11.2X
+subexpressionElimination off, codegen on           25887          26105         326          0.0   258868246.6       1.0X
+subexpressionElimination off, codegen off          25131          25454         522          0.0   251309329.7       1.0X
+subexpressionElimination on, codegen on             2230           2340         106          0.0    22302959.3      11.6X
+subexpressionElimination on, codegen off            2185           2254          64          0.0    21852694.0      11.8X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
+Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
+from_json as subExpr in Filter:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+-------------------------------------------------------------------------------------------------------------------------
+subexpressionElimination off, codegen on           42687          42805         111          0.0   426873372.5       1.0X
+subexpressionElimination off, codegen off          43606          45108        1613          0.0   436055236.3       1.0X
+subexpressionElimination on, codegen on            29761          30563         704          0.0   297614324.4       1.4X
+subexpressionElimination on, codegen off           41676          42598         955          0.0   416758112.3       1.0X
 
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala
@@ -17,6 +17,8 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, Or}
 import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -39,7 +41,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
   import spark.implicits._
 
   def withFromJson(rowsNum: Int, numIters: Int): Unit = {
-    val benchmark = new Benchmark("from_json as subExpr", rowsNum, output = output)
+    val benchmark = new Benchmark("from_json as subExpr in Project", rowsNum, output = output)
 
     withTempPath { path =>
       prepareDataInfo(benchmark)
@@ -61,7 +63,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
           val df = spark.read
             .text(path.getAbsolutePath)
             .select(cols: _*)
-          df.collect()
+          df.write.mode("overwrite").format("noop").save()
         }
       }
 
@@ -74,7 +76,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
           val df = spark.read
             .text(path.getAbsolutePath)
             .select(cols: _*)
-          df.collect()
+          df.write.mode("overwrite").format("noop").save()
         }
       }
 
@@ -87,7 +89,7 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
           val df = spark.read
             .text(path.getAbsolutePath)
             .select(cols: _*)
-          df.collect()
+          df.write.mode("overwrite").format("noop").save()
         }
       }
 
@@ -100,19 +102,89 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
           val df = spark.read
             .text(path.getAbsolutePath)
             .select(cols: _*)
-          df.collect()
+          df.write.mode("overwrite").format("noop").save()
         }
       }
 
       benchmark.run()
     }
   }
 
+  def withFilter(rowsNum: Int, numIters: Int): Unit = {
+    val benchmark = new Benchmark("from_json as subExpr in Filter", rowsNum, output = output)
+
+    withTempPath { path =>
+      prepareDataInfo(benchmark)
+      val numCols = 1000
+      val schema = writeWideRow(path.getAbsolutePath, rowsNum, numCols)
+
+      val predicate = (0 until numCols).map { idx =>
+        (from_json('value, schema).getField(s"col$idx") >= Literal(100000)).expr
+      }.asInstanceOf[Seq[Expression]].reduce(Or) // OR together one comparison per column
+
+      // Every case below disables the JSON expression optimization, so the cases differ only in
+      // the subexpression elimination and codegen settings being compared.
+      benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ =>
+        withSQLConf(
+          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
+          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
+          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
+          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+          val df = spark.read
+            .text(path.getAbsolutePath)
+            .where(Column(predicate))
+          df.write.mode("overwrite").format("noop").save()
+        }
+      }
+
+      benchmark.addCase("subexpressionElimination off, codegen off", numIters) { _ =>
+        withSQLConf(
+          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
+          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
+          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
+          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+          val df = spark.read
+            .text(path.getAbsolutePath)
+            .where(Column(predicate))
+          df.write.mode("overwrite").format("noop").save()
+        }
+      }
+
+      benchmark.addCase("subexpressionElimination on, codegen on", numIters) { _ =>
+        withSQLConf(
+          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
+          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
+          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
+          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+          val df = spark.read
+            .text(path.getAbsolutePath)
+            .where(Column(predicate))
+          df.write.mode("overwrite").format("noop").save()
+        }
+      }
+
+      benchmark.addCase("subexpressionElimination on, codegen off", numIters) { _ =>
+        withSQLConf(
+          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
+          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
+          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
+          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+          val df = spark.read
+            .text(path.getAbsolutePath)
+            .where(Column(predicate))
+          df.write.mode("overwrite").format("noop").save()
+        }
+      }
+
+      benchmark.run()
+    }
+  }
 
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     val numIters = 3
     runBenchmark("Benchmark for performance of subexpression elimination") {
       withFromJson(100, numIters) // "from_json as subExpr in Project" cases
+      withFilter(100, numIters) // "from_json as subExpr in Filter" cases
     }
   }
 }