addressed comments v0.1
brkyvz committed Apr 28, 2015
1 parent e98ebac commit 2384266
Showing 4 changed files with 19 additions and 16 deletions.
5 changes: 2 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -420,10 +420,9 @@ abstract class RDD[T: ClassTag](
    * @return A random sub-sample of the RDD without replacement.
    */
   private[spark] def randomSampleWithRange(lb: Double, ub: Double, seed: Long): RDD[T] = {
-    val random = new Random(seed)
-    this.mapPartitions { partition =>
+    this.mapPartitionsWithIndex { case (index, partition) =>
       val sampler = new BernoulliCellSampler[T](lb, ub)
-      sampler.setSeed(random.nextLong)
+      sampler.setSeed(seed + index)
       sampler.sample(partition)
     }
   }
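
Replacing the driver-side Random with mapPartitionsWithIndex makes the sample deterministic: each partition seeds its own BernoulliCellSampler with seed + index, so a recomputed partition (or a second evaluation of the RDD) draws exactly the same rows. A rough standalone sketch of the same idea, using scala.util.Random over plain Scala collections since BernoulliCellSampler is Spark-internal (the helper name and toy data below are made up for illustration):

import scala.util.Random

// Minimal sketch (not Spark code): filter each "partition" with its own generator
// seeded by (seed + partition index), mirroring the seed + index scheme above,
// so any recomputation of a partition keeps the same elements.
def sampleRange[T](partitions: Seq[Seq[T]], lb: Double, ub: Double, seed: Long): Seq[Seq[T]] =
  partitions.zipWithIndex.map { case (partition, index) =>
    val rng = new Random(seed + index)
    partition.filter { _ =>
      val u = rng.nextDouble()
      u >= lb && u < ub
    }
  }

// Toy data: the same call always yields the same sample, run after run.
val parts = Seq(Seq(1, 2, 3, 4, 5), Seq(6, 7, 8, 9, 10))
val firstHalf = sampleRange(parts, 0.0, 0.5, seed = 42L)
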
@@ -278,13 +278,6 @@ package object dsl {
     def sfilter[T1](arg1: Symbol)(udf: (T1) => Boolean): LogicalPlan =
       Filter(ScalaUdf(udf, BooleanType, Seq(UnresolvedAttribute(arg1.name))), logicalPlan)

-    def sample(
-        lb: Double,
-        ub: Double,
-        withReplacement: Boolean = true,
-        seed: Int = (math.random * 1000).toInt): LogicalPlan =
-      Sample(lb, ub, withReplacement, seed, logicalPlan)
-
     // TODO specify the output column names
     def generate(
         generator: Generator,
@@ -300,9 +300,19 @@ case class Subquery(alias: String, child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output.map(_.withQualifiers(alias :: Nil))
 }

+/**
+ * Sample the dataset.
+ *
+ * @param lowerBound Lower-bound of the sampling probability (usually 0.0)
+ * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
+ *                   will be ub - lb.
+ * @param withReplacement Whether to sample with replacement.
+ * @param seed the random seed
+ * @param child the LogicalPlan
+ */
 case class Sample(
-    lb: Double,
-    ub: Double,
+    lowerBound: Double,
+    upperBound: Double,
     withReplacement: Boolean,
     seed: Long,
     child: LogicalPlan) extends UnaryNode {
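
Carrying an explicit [lowerBound, upperBound) range, rather than a single fraction, lets several Sample operators that share a seed select disjoint slices of the same dataset. A rough sketch of turning relative split weights into such ranges (the helper name and weights are illustrative, not part of this change):

// Normalize relative weights into cumulative [lowerBound, upperBound) pairs over [0, 1).
// With a shared seed, each pair selects a disjoint slice of the data.
def weightsToBounds(weights: Seq[Double]): Seq[(Double, Double)] = {
  val total = weights.sum
  val cumulative = weights.scanLeft(0.0)((acc, w) => acc + w / total)
  cumulative.init.zip(cumulative.tail)
}

// Weights 6:2:2 become roughly (0.0, 0.6), (0.6, 0.8), (0.8, 1.0); each pair could
// feed a Sample(lowerBound, upperBound, withReplacement = false, seed, child) node.
val bounds = weightsToBounds(Seq(6.0, 2.0, 2.0))
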
@@ -64,16 +64,17 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
 /**
  * :: DeveloperApi ::
  * Sample the dataset.
- * @param lb Lower-bound of the sampling probability (usually 0.0)
- * @param ub Upper-bound of the sampling probability. The expected fraction sampled will be ub - lb.
+ * @param lowerBound Lower-bound of the sampling probability (usually 0.0)
+ * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
+ *                   will be ub - lb.
  * @param withReplacement Whether to sample with replacement.
  * @param seed the random seed
  * @param child the QueryPlan
  */
 @DeveloperApi
 case class Sample(
-    lb: Double,
-    ub: Double,
+    lowerBound: Double,
+    upperBound: Double,
     withReplacement: Boolean,
     seed: Long,
     child: SparkPlan)