Skip to content

Commit

Permalink
fix scala 2.13
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengruifeng committed Nov 17, 2020
1 parent 4626614 commit 91ae454
Showing 1 changed file with 7 additions and 12 deletions.
19 changes: 7 additions & 12 deletions mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala
Original file line number Diff line number Diff line change
Expand Up @@ -176,23 +176,18 @@ class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String)
.map(_.headOption.getOrElse(Double.NaN))

case Imputer.mode =>
// Keep in line with sklearn.impute.SimpleImputer (using scipy.stats.mode).
// If there is more than one mode, choose the smallest one.
val modes = dataset.select(cols: _*).rdd.flatMap { row =>
Iterator.range(0, numCols).flatMap { i =>
// Ignore null.
if (row.isNullAt(i)) Iterator.empty else Iterator.single((i, row.getDouble(i)), 1L)
}
}.reduceByKey(_ + _).map { case ((i, v), c) => (i, (v, c))
}.reduceByKey { case ((v1, c1), (v2, c2)) =>
if (c1 > c2) {
(v1, c1)
} else if (c1 < c2) {
(v2, c2)
} else {
// Keep in line with sklearn.impute.SimpleImputer (using scipy.stats.mode).
// If there is more than one mode, choose the smallest one.
(math.min(v1, v2), c1)
}
}.mapValues(_._1).collectAsMap()
}.reduceByKey(_ + _).map { case ((i, v), c) =>
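// Negate the value so that the default (Long, Double) tuple ordering picks the largest
// count and, among equal counts, the smallest value; the sign is restored after the reduce.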
(i, (c, -v))
}.reduceByKey(Ordering.apply[(Long, Double)].max
).mapValues(-_._2).collectAsMap()
Array.tabulate(numCols)(i => modes.getOrElse(i, Double.NaN))
}

Expand Down

0 comments on commit 91ae454

Please sign in to comment.