Skip to content

Commit

Permalink
[SPARK-6756] [MLLIB] add toSparse, toDense, numActives, numNonzeros, …
Browse files Browse the repository at this point in the history
…and compressed to Vector

Add `compressed` to `Vector` with some other methods: `numActives`, `numNonzeros`, `toSparse`, and `toDense`. jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes apache#5756 from mengxr/SPARK-6756 and squashes the following commits:

8d4ecbd [Xiangrui Meng] address comment and add mima excludes
da54179 [Xiangrui Meng] add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
  • Loading branch information
mengxr authored and jeanlyn committed Jun 12, 2015
1 parent b32d281 commit f0ea634
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 0 deletions.
93 changes: 93 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,40 @@ sealed trait Vector extends Serializable {
* with type `Double`.
*/
private[spark] def foreachActive(f: (Int, Double) => Unit)

/**
 * Number of active entries. An "active entry" is an element which is explicitly stored,
 * regardless of its value (so an explicitly stored 0.0 still counts as active).
 * Note that inactive entries have value 0.
 *
 * @return the count of explicitly stored entries
 */
def numActives: Int

/**
 * Number of nonzero elements. This scans all active values and counts the nonzeros.
 *
 * @return the count of active entries whose value is not 0.0
 */
def numNonzeros: Int

/**
 * Converts this vector to a sparse vector with all explicit zeros removed.
 *
 * @return a [[SparseVector]] that stores only the nonzero entries of this vector
 */
def toSparse: SparseVector

/**
 * Converts this vector to a dense vector, built from the array returned by `toArray`.
 */
def toDense: DenseVector = {
  val denseValues = toArray
  new DenseVector(denseValues)
}

/**
 * Returns this vector in dense or sparse format, whichever of the two uses less storage.
 */
def compressed: Vector = {
  val nonzeroCount = numNonzeros
  // Storage estimate: a dense vector needs 8 * size + 8 bytes, while a sparse vector needs
  // 12 * nnz + 20 bytes. Sparse is smaller exactly when 1.5 * (nnz + 1.0) < size.
  val sparseIsSmaller = 1.5 * (nonzeroCount + 1.0) < size
  if (sparseIsSmaller) toSparse else toDense
}
}

/**
Expand Down Expand Up @@ -525,6 +559,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
result
}

// The active-entry count of a dense vector is reported as its `size`.
override def numActives: Int = size

override def numNonzeros: Int = {
  // Explicit counting loop; yields the same result as values.count(_ != 0.0).
  val total = values.length
  var count = 0
  var idx = 0
  while (idx < total) {
    if (values(idx) != 0.0) count += 1
    idx += 1
  }
  count
}

override def toSparse: SparseVector = {
  // Size the output arrays up front from the nonzero count, then fill them in one pass.
  val nonzeroCount = numNonzeros
  val nzIndices = new Array[Int](nonzeroCount)
  val nzValues = new Array[Double](nonzeroCount)
  var cursor = 0
  foreachActive { (index, value) =>
    // Zero entries are dropped; only nonzero entries are copied over.
    if (value != 0.0) {
      nzIndices(cursor) = index
      nzValues(cursor) = value
      cursor += 1
    }
  }
  new SparseVector(size, nzIndices, nzValues)
}
}

object DenseVector {
Expand Down Expand Up @@ -602,6 +664,37 @@ class SparseVector(
}
result
}

// The active-entry count of a sparse vector is the number of stored values,
// including any that are explicitly stored as 0.0.
override def numActives: Int = values.length

override def numNonzeros: Int = {
  // Count the stored values that are not 0.0; explicitly stored zeros are excluded.
  val total = values.length
  var count = 0
  var idx = 0
  while (idx < total) {
    if (values(idx) != 0.0) count += 1
    idx += 1
  }
  count
}

override def toSparse: SparseVector = {
  val nonzeroCount = numNonzeros
  if (nonzeroCount != numActives) {
    // Some stored entries are explicit zeros: build a new vector without them.
    val nzIndices = new Array[Int](nonzeroCount)
    val nzValues = new Array[Double](nonzeroCount)
    var cursor = 0
    foreachActive { (index, value) =>
      if (value != 0.0) {
        nzIndices(cursor) = index
        nzValues(cursor) = value
        cursor += 1
      }
    }
    new SparseVector(size, nzIndices, nzValues)
  } else {
    // Every stored entry is already nonzero, so this instance is returned as-is.
    this
  }
}
}

object SparseVector {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
}

test("Vector numActive and numNonzeros") {
  // Dense: all four slots are active, two of them hold nonzero values.
  val dense = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(dense.numActives === 4)
  assert(dense.numNonzeros === 2)

  // Sparse: three entries are stored, one of which is an explicit zero.
  val sparse = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparse.numActives === 3)
  assert(sparse.numNonzeros === 2)
}

test("Vector toSparse and toDense") {
  // Start from a dense vector that contains two zeros.
  val denseInput = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(denseInput.toDense === denseInput)
  val denseAsSparse = denseInput.toSparse
  assert(denseAsSparse.numActives === 2)
  assert(denseAsSparse === denseInput)

  // Start from a sparse vector that stores one explicit zero.
  val sparseInput = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparseInput.toDense === sparseInput)
  val sparseAsSparse = sparseInput.toSparse
  assert(sparseAsSparse.numActives === 2)
  assert(sparseAsSparse === sparseInput)
}

test("Vector.compressed") {
  // Dense input with three nonzeros out of four: expected to stay dense.
  val mostlyFullDense = Vectors.dense(1.0, 2.0, 3.0, 0.0)
  val mostlyFullDenseCompressed = mostlyFullDense.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullDenseCompressed === mostlyFullDense)

  // Dense input with a single nonzero out of four: expected to become sparse.
  val mostlyEmptyDense = Vectors.dense(0.0, 2.0, 0.0, 0.0)
  val mostlyEmptyDenseCompressed = mostlyEmptyDense.compressed.asInstanceOf[SparseVector]
  assert(mostlyEmptyDense === mostlyEmptyDenseCompressed)
  assert(mostlyEmptyDenseCompressed.numActives === 1)

  // Sparse input with an explicit zero: expected to stay sparse with the zero dropped.
  val sparseWithZero = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
  val sparseWithZeroCompressed = sparseWithZero.compressed.asInstanceOf[SparseVector]
  assert(sparseWithZero === sparseWithZeroCompressed)
  assert(sparseWithZeroCompressed.numActives === 1)

  // Sparse input with three nonzeros out of four: expected to become dense.
  val mostlyFullSparse = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
  val mostlyFullSparseCompressed = mostlyFullSparse.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullSparse === mostlyFullSparseCompressed)
}
}
12 changes: 12 additions & 0 deletions project/MimaExcludes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ object MimaExcludes {
// SPARK-7090 Introduce LDAOptimizer to LDA to further improve extensibility
ProblemFilters.exclude[MissingClassProblem](
"org.apache.spark.mllib.clustering.LDA$EMOptimizer")
) ++ Seq(
// SPARK-6756 add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.compressed"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.toDense"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.numNonzeros"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.toSparse"),
ProblemFilters.exclude[MissingMethodProblem](
"org.apache.spark.mllib.linalg.Vector.numActives")
)

case v if v.startsWith("1.3") =>
Expand Down

0 comments on commit f0ea634

Please sign in to comment.