Make Literals foldable, ensure Parquet predicates pushdown (#721)
* #343 - unpack to Literals
* #343 - add struct test showing difference between extension and experimental rules
* #343 - toString test to stop the patch complaint
* #343 - sample docs
* #343 - package rename and adding logging that the extension is injected
* Apply suggestions from code review
  Co-authored-by: Cédric Chantepie <cchantep@users.noreply.github.com>
* Refactor LitRule and LitRules tests by making them slightly more generic, adjust docs, add negative tests
* #343 - disable the rule, foldable and eval evals
* #343 - cleaned up
* #343 - true with link for 3.2 support
* #343 - bring back code gen with lazy to stop recompiles
* #343 - more compat and a foldable-only backport of SPARK-39106 and SPARK-40380
* #343 - option 3 - let 3.2 fail as per oss impl, separate tests

---------

Co-authored-by: Cédric Chantepie <cchantep@users.noreply.github.com>
Co-authored-by: Grigory Pomadchin <grigory.pomadchin@disneystreaming.com>
1 parent e257c4c · commit dec676b
Showing 10 changed files with 290 additions and 16 deletions.
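The core of the change is that frameless's literal expressions become foldable, so Catalyst's ConstantFolding rule can reduce them to plain Literals and the data-source strategy can translate the resulting comparisons into Parquet Filters. As a rough illustration of what "foldable" means for a Catalyst expression, here is a generic sketch; it is not the actual frameless.functions.Lit from this commit, which also carries codegen and display logic:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.LeafExpression
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.DataType

// Generic sketch only: a literal-like expression that Catalyst can constant-fold.
case class ConstantLike(value: Any, dataType: DataType) extends LeafExpression with CodegenFallback {
  override def nullable: Boolean = value == null
  // foldable = true tells the optimizer this expression is a pure constant, so
  // ConstantFolding can replace it with a plain Literal during optimization.
  override def foldable: Boolean = true
  // eval must work without codegen for the folding to happen at planning time.
  override def eval(input: InternalRow): Any = value
}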
@@ -0,0 +1,20 @@

package frameless

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.{And, Or}

package object sql {
  implicit class ExpressionOps(val self: Expression) extends AnyVal {
    def toList: List[Expression] = {
      def rec(expr: Expression, acc: List[Expression]): List[Expression] = {
        expr match {
          case And(left, right) => rec(left, rec(right, acc))
          case Or(left, right) => rec(left, rec(right, acc))
          case e => e +: acc
        }
      }

      rec(self, Nil)
    }
  }
}
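The toList extension flattens a nested And/Or tree into its leaf predicates, which the test suites below use to scan an optimized plan's filter condition. A minimal usage sketch with made-up expression values:

import frameless.sql._
import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, GreaterThan, IsNotNull, Literal}
import org.apache.spark.sql.types.IntegerType

object ExpressionOpsExample {
  // Build a nested And condition over a hypothetical column "a".
  val a = AttributeReference("a", IntegerType)()
  val condition = And(IsNotNull(a), GreaterThan(a, Literal(1)))

  // Flattens the And/Or tree into its leaves, preserving left-to-right order:
  // List(IsNotNull(a), GreaterThan(a, Literal(1)))
  val leaves = condition.toList
}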
74 changes: 74 additions & 0 deletions
dataset/src/test/scala/frameless/sql/rules/SQLRulesSuite.scala
@@ -0,0 +1,74 @@

package frameless.sql.rules

import frameless._
import frameless.sql._
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec
import org.scalatest.Assertion
import org.scalatest.matchers.should.Matchers

trait SQLRulesSuite extends TypedDatasetSuite with Matchers { self =>
  protected lazy val path: String = {
    val tmpDir = System.getProperty("java.io.tmpdir")
    s"$tmpDir/${self.getClass.getName}"
  }

  def withDataset[A: TypedEncoder: CatalystOrdered](payload: A)(f: TypedDataset[A] => Assertion): Assertion = {
    TypedDataset.create(Seq(payload)).write.mode("overwrite").parquet(path)
    f(TypedDataset.createUnsafe[A](session.read.parquet(path)))
  }

  def predicatePushDownTest[A: TypedEncoder: CatalystOrdered](
    expected: X1[A],
    expectedPushDownFilters: List[Filter],
    planShouldNotContain: PartialFunction[Expression, Expression],
    op: TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean]
  ): Assertion = {
    withDataset(expected) { dataset =>
      val ds = dataset.filter(op(dataset('a)))
      val actualPushDownFilters = pushDownFilters(ds)

      val optimizedPlan = ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList)

      // check the optimized plan
      optimizedPlan.collectFirst(planShouldNotContain) should be (empty)

      // compare filters
      actualPushDownFilters shouldBe expectedPushDownFilters

      val actual = ds.collect().run().toVector.headOption

      // ensure serialization is not broken
      actual should be(Some(expected))
    }
  }

  protected def pushDownFilters[T](ds: TypedDataset[T]): List[Filter] = {
    val sparkPlan = ds.queryExecution.executedPlan

    val initialPlan =
      if (sparkPlan.children.isEmpty) // assume it's AQE
        sparkPlan match {
          case aq: AdaptiveSparkPlanExec => aq.initialPlan
          case _ => sparkPlan
        }
      else
        sparkPlan

    initialPlan.collect {
      case fs: FileSourceScanExec =>
        import scala.reflect.runtime.{universe => ru}

        val runtimeMirror = ru.runtimeMirror(getClass.getClassLoader)
        val instanceMirror = runtimeMirror.reflect(fs)
        val getter = ru.typeOf[FileSourceScanExec].member(ru.TermName("pushedDownFilters")).asTerm.getter
        val m = instanceMirror.reflectMethod(getter.asMethod)
        val res = m.apply(fs).asInstanceOf[Seq[Filter]]

        res
    }.flatten.toList
  }
}
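For illustration, a concrete suite built on this trait could look like the following. This is a hypothetical Int example, not part of the commit; the real per-Spark-version suites live under dataset/src/test/spark-*, and the expected filters and class name here are assumptions:

package frameless.sql.rules

import frameless._
import frameless.functions.Lit
import org.apache.spark.sql.sources.{GreaterThan, IsNotNull}

// Hypothetical example: filtering on an Int column should push IsNotNull and
// GreaterThan down to the Parquet scan, and once the frameless literal is
// foldable it should no longer appear in the optimized plan.
class IntPushDownExample extends SQLRulesSuite {
  test("Int push-down") {
    predicatePushDownTest[Int](
      expected = X1(42),
      expectedPushDownFilters = List(IsNotNull("a"), GreaterThan("a", 1)),
      planShouldNotContain = { case lit: Lit[_] => lit },
      op = _ > 1
    )
  }
}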
7 changes: 7 additions & 0 deletions
dataset/src/test/scala/org/apache/hadoop/fs/local/StreamingFS.scala
@@ -0,0 +1,7 @@

package org.apache.hadoop.fs.local

import com.globalmentor.apache.hadoop.fs.BareLocalFileSystem
import org.apache.hadoop.fs.DelegateToFileSystem

class StreamingFS(uri: java.net.URI, conf: org.apache.hadoop.conf.Configuration) extends
  DelegateToFileSystem(uri, new BareLocalFileSystem(), conf, "file", false) {}
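This shim exposes com.globalmentor's BareLocalFileSystem (which avoids the winutils.exe requirement on Windows) through Hadoop's AbstractFileSystem API, the path Spark streaming checkpoints go through. A sketch of how such a class is typically wired up via the standard Hadoop config keys; the exact SparkSession setup is not shown in this diff and is an assumption:

import org.apache.spark.sql.SparkSession

// Assumed wiring, not part of this commit: map the "file" scheme to the
// winutils-free local FS implementations for both FileSystem and AbstractFileSystem.
val spark = SparkSession.builder()
  .master("local[*]")
  .config("spark.hadoop.fs.file.impl", classOf[com.globalmentor.apache.hadoop.fs.BareLocalFileSystem].getName)
  .config("spark.hadoop.fs.AbstractFileSystem.file.impl", classOf[org.apache.hadoop.fs.local.StreamingFS].getName)
  .getOrCreate()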
85 changes: 85 additions & 0 deletions
dataset/src/test/spark-3.2/frameless/sql/rules/FramelessLitPushDownTests.scala
@@ -0,0 +1,85 @@

package frameless.sql.rules

import frameless._
import frameless.sql._
import frameless.functions.Lit
import org.apache.spark.sql.catalyst.util.DateTimeUtils.{currentTimestamp, microsToInstant}
import org.apache.spark.sql.sources.{Filter, IsNotNull}
import org.apache.spark.sql.catalyst.expressions
import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericRowWithSchema}
import java.time.Instant

import org.apache.spark.sql.catalyst.plans.logical
import org.scalatest.Assertion

// Note: as InvokeLike and "ConditionalExpression" don't have SPARK-40380 and SPARK-39106, no predicate pushdowns can happen in 3.2.4
class FramelessLitPushDownTests extends SQLRulesSuite {
  private val now: Long = currentTimestamp()

  test("java.sql.Timestamp push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(SQLTimestamp(now))
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[SQLTimestamp](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("java.time.Instant push-down") {
    val expected = java.sql.Timestamp.from(microsToInstant(now))
    val expectedStructure = X1(microsToInstant(now))
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[Instant](
      expectedStructure,
      expectedPushDownFilters,
      { case e @ expressions.GreaterThanOrEqual(_, _: Lit[_]) => e },
      _ >= expectedStructure.a
    )
  }

  test("struct push-down") {
    type Payload = X4[Int, Int, Int, Int]
    val expectedStructure = X1(X4(1, 2, 3, 4))
    val expected = new GenericRowWithSchema(Array(1, 2, 3, 4), TypedExpressionEncoder[Payload].schema)
    val expectedPushDownFilters = List(IsNotNull("a"))

    predicatePushDownTest[Payload](
      expectedStructure,
      expectedPushDownFilters,
      // Cast, not Lit, because of SPARK-40380
      { case e @ expressions.EqualTo(_, _: Cast) => e },
      _ === expectedStructure.a
    )
  }

  override def predicatePushDownTest[A: TypedEncoder: CatalystOrdered](
    expected: X1[A],
    expectedPushDownFilters: List[Filter],
    planShouldContain: PartialFunction[Expression, Expression],
    op: TypedColumn[X1[A], A] => TypedColumn[X1[A], Boolean]
  ): Assertion = {
    withDataset(expected) { dataset =>
      val ds = dataset.filter(op(dataset('a)))
      val actualPushDownFilters = pushDownFilters(ds)

      val optimizedPlan = ds.queryExecution.optimizedPlan.collect { case logical.Filter(condition, _) => condition }.flatMap(_.toList)

      // check the optimized plan
      optimizedPlan.collectFirst(planShouldContain) should not be (empty)

      // compare filters
      actualPushDownFilters shouldBe expectedPushDownFilters

      val actual = ds.collect().run().toVector.headOption

      // ensure serialization is not broken
      actual should be(Some(expected))
    }
  }

}
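For local debugging outside these assertions, a small helper could be added inside any suite extending SQLRulesSuite to print the physical plan next to the extracted filters; this is a sketch and not part of the commit:

import frameless.TypedDataset
import org.apache.spark.sql.sources.Filter

// Hypothetical debugging helper (not in this commit). explain(true) prints the physical
// plan, whose FileSourceScan node lists "PushedFilters: [...]"; pushDownFilters from
// SQLRulesSuite extracts the same list programmatically via reflection.
protected def debugPushDown[T](ds: TypedDataset[T]): List[Filter] = {
  ds.dataset.explain(true)
  pushDownFilters(ds)
}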