addressed comments v0.1
brkyvz committed Apr 28, 2015
1 parent e98ebac commit 2384266
Showing 4 changed files with 19 additions and 16 deletions.
5 changes: 2 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -420,10 +420,9 @@ abstract class RDD[T: ClassTag](
    * @return A random sub-sample of the RDD without replacement.
    */
   private[spark] def randomSampleWithRange(lb: Double, ub: Double, seed: Long): RDD[T] = {
-    val random = new Random(seed)
-    this.mapPartitions { partition =>
+    this.mapPartitionsWithIndex { case (index, partition) =>
       val sampler = new BernoulliCellSampler[T](lb, ub)
-      sampler.setSeed(random.nextLong)
+      sampler.setSeed(seed + index)
       sampler.sample(partition)
     }
   }
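
Replacing the driver-side Random with mapPartitionsWithIndex makes the sample deterministic: each partition seeds its own BernoulliCellSampler with seed + index, so a recomputed partition (or a second evaluation of the RDD) draws exactly the same rows. A rough standalone sketch of the same idea, using scala.util.Random over plain Scala collections since BernoulliCellSampler is Spark-internal (the helper name and toy data below are made up for illustration):

import scala.util.Random

// Minimal sketch (not Spark code): filter each "partition" with its own generator
// seeded by (seed + partition index), mirroring the seed + index scheme above,
// so any recomputation of a partition keeps the same elements.
def sampleRange[T](partitions: Seq[Seq[T]], lb: Double, ub: Double, seed: Long): Seq[Seq[T]] =
  partitions.zipWithIndex.map { case (partition, index) =>
    val rng = new Random(seed + index)
    partition.filter { _ =>
      val u = rng.nextDouble()
      u >= lb && u < ub
    }
  }

// Toy data: the same call always yields the same sample, run after run.
val parts = Seq(Seq(1, 2, 3, 4, 5), Seq(6, 7, 8, 9, 10))
val firstHalf = sampleRange(parts, 0.0, 0.5, seed = 42L)
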
@@ -278,13 +278,6 @@ package object dsl {
     def sfilter[T1](arg1: Symbol)(udf: (T1) => Boolean): LogicalPlan =
       Filter(ScalaUdf(udf, BooleanType, Seq(UnresolvedAttribute(arg1.name))), logicalPlan)

-    def sample(
-        lb: Double,
-        ub: Double,
-        withReplacement: Boolean = true,
-        seed: Int = (math.random * 1000).toInt): LogicalPlan =
-      Sample(lb, ub, withReplacement, seed, logicalPlan)
-
     // TODO specify the output column names
     def generate(
         generator: Generator,
@@ -300,9 +300,19 @@ case class Subquery(alias: String, child: LogicalPlan) extends UnaryNode {
   override def output: Seq[Attribute] = child.output.map(_.withQualifiers(alias :: Nil))
 }

+/**
+ * Sample the dataset.
+ *
+ * @param lowerBound Lower-bound of the sampling probability (usually 0.0)
+ * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
+ *                   will be ub - lb.
+ * @param withReplacement Whether to sample with replacement.
+ * @param seed the random seed
+ * @param child the LogicalPlan
+ */
 case class Sample(
-    lb: Double,
-    ub: Double,
+    lowerBound: Double,
+    upperBound: Double,
     withReplacement: Boolean,
     seed: Long,
     child: LogicalPlan) extends UnaryNode {
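
Carrying an explicit [lowerBound, upperBound) range, rather than a single fraction, lets several Sample operators that share a seed select disjoint slices of the same dataset. A rough sketch of turning relative split weights into such ranges (the helper name and weights are illustrative, not part of this change):

// Normalize relative weights into cumulative [lowerBound, upperBound) pairs over [0, 1).
// With a shared seed, each pair selects a disjoint slice of the data.
def weightsToBounds(weights: Seq[Double]): Seq[(Double, Double)] = {
  val total = weights.sum
  val cumulative = weights.scanLeft(0.0)((acc, w) => acc + w / total)
  cumulative.init.zip(cumulative.tail)
}

// Weights 6:2:2 become roughly (0.0, 0.6), (0.6, 0.8), (0.8, 1.0); each pair could
// feed a Sample(lowerBound, upperBound, withReplacement = false, seed, child) node.
val bounds = weightsToBounds(Seq(6.0, 2.0, 2.0))
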
@@ -64,16 +64,17 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
 /**
  * :: DeveloperApi ::
  * Sample the dataset.
- * @param lb Lower-bound of the sampling probability (usually 0.0)
- * @param ub Upper-bound of the sampling probability. The expected fraction sampled will be ub - lb.
+ * @param lowerBound Lower-bound of the sampling probability (usually 0.0)
+ * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
+ *                   will be ub - lb.
  * @param withReplacement Whether to sample with replacement.
  * @param seed the random seed
  * @param child the QueryPlan
  */
 @DeveloperApi
 case class Sample(
-    lb: Double,
-    ub: Double,
+    lowerBound: Double,
+    upperBound: Double,
     withReplacement: Boolean,
     seed: Long,
     child: SparkPlan)