Skip to content

Commit

Permalink
add toSparse, toDense, numActives, numNonzeros, and compressed to Vector
Browse files Browse the repository at this point in the history
  • Loading branch information
mengxr committed Apr 28, 2015
1 parent 52ccf1d commit da54179
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 0 deletions.
92 changes: 92 additions & 0 deletions mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,39 @@ sealed trait Vector extends Serializable {
* with type `Double`.
*/
private[spark] def foreachActive(f: (Int, Double) => Unit)

/**
 * Number of active entries. Inactive entries are all zeros, while active entries could be
 * zero, so this value can be larger than the number of nonzero elements.
 *
 * @return the count of active entries in this vector
 */
def numActives: Int

/**
 * Number of nonzero elements. This scans all active values and counts the nonzeros, so
 * active entries that hold an explicit zero are not included.
 *
 * @return the count of active values that are not equal to 0.0
 */
def numNonzeros: Int

/**
 * Converts this vector to a sparse vector with all explicit zeros removed.
 *
 * Implementations may return this instance itself when it is already a sparse vector that
 * stores no explicit zeros.
 *
 * @return a sparse vector of the same size whose active entries are the nonzero entries of
 *         this vector
 */
def toSparse: SparseVector

/**
 * Returns the dense representation of this vector.
 *
 * @return a [[DenseVector]] built from the array returned by `toArray`
 */
def toDense: DenseVector = {
  val denseValues = this.toArray
  new DenseVector(denseValues)
}

/**
 * Picks whichever of the dense and sparse formats uses less storage for this vector.
 *
 * @return the result of `toSparse` when the sparse format is estimated to be strictly
 *         smaller, otherwise the result of `toDense`
 */
def compressed: Vector = {
  val nonzeroCount = numNonzeros
  // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 * nnz + 20 bytes.
  // The inequality 12 * nnz + 20 < 8 * size + 8 simplifies to 1.5 * (nnz + 1) < size.
  val sparseIsSmaller = 1.5 * (nonzeroCount + 1.0) < size
  if (sparseIsSmaller) toSparse else toDense
}
}

/**
Expand Down Expand Up @@ -525,6 +558,34 @@ class DenseVector(val values: Array[Double]) extends Vector {
}
result
}

// Every entry of a dense vector is active, so the active count is simply `size`.
override def numActives: Int = size

override def numNonzeros: Int = {
  // Hand-written loop; gives the same result as values.count(_ != 0.0).
  val total = values.length
  var count = 0
  var idx = 0
  while (idx < total) {
    if (values(idx) != 0.0) {
      count += 1
    }
    idx += 1
  }
  count
}

override def toSparse: SparseVector = {
  // Count the nonzeros first so the output arrays can be allocated at their exact length.
  val nonzeroCount = numNonzeros
  val keptIndices = new Array[Int](nonzeroCount)
  val keptValues = new Array[Double](nonzeroCount)
  // Next free slot in the output arrays.
  var next = 0
  foreachActive { (index, value) =>
    if (value != 0.0) {
      keptIndices(next) = index
      keptValues(next) = value
      next += 1
    }
  }
  new SparseVector(size, keptIndices, keptValues)
}
}

object DenseVector {
Expand Down Expand Up @@ -602,6 +663,37 @@ class SparseVector(
}
result
}

// One active entry per element of `values`; explicitly stored zeros are counted too.
override def numActives: Int = values.length

override def numNonzeros: Int = {
  // Only the stored values are scanned; an explicit zero among them is not counted.
  val total = values.length
  var count = 0
  var idx = 0
  while (idx < total) {
    if (values(idx) != 0.0) {
      count += 1
    }
    idx += 1
  }
  count
}

override def toSparse: SparseVector = {
  val nonzeroCount = numNonzeros
  if (nonzeroCount == numActives) {
    // No explicit zeros are stored, so this vector is returned as is.
    this
  } else {
    // Copy only the nonzero entries into freshly allocated, exactly sized arrays.
    val keptIndices = new Array[Int](nonzeroCount)
    val keptValues = new Array[Double](nonzeroCount)
    // Next free slot in the output arrays.
    var next = 0
    foreachActive { (index, value) =>
      if (value != 0.0) {
        keptIndices(next) = index
        keptValues(next) = value
        next += 1
      }
    }
    new SparseVector(size, keptIndices, keptValues)
  }
}
}

object SparseVector {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -270,4 +270,48 @@ class VectorsSuite extends FunSuite {
assert(Vectors.norm(sv, 3.7) ~== math.pow(sv.toArray.foldLeft(0.0)((a, v) =>
a + math.pow(math.abs(v), 3.7)), 1.0 / 3.7) relTol 1E-8)
}

test("Vector numActive and numNonzeros") {
  // Dense: all four entries are active, but only two of them are nonzero.
  val dense = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(dense.numActives === 4)
  assert(dense.numNonzeros === 2)

  // Sparse: three entries are stored, one of which is an explicit zero.
  val sparse = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparse.numActives === 3)
  assert(sparse.numNonzeros === 2)
}

test("Vector toSparse and toDense") {
  // Starting from a dense vector: both conversions must compare equal to the source,
  // and the sparse form must keep only the two nonzero entries.
  val denseSource = Vectors.dense(0.0, 2.0, 3.0, 0.0)
  assert(denseSource.toDense === denseSource)
  val denseAsSparse = denseSource.toSparse
  assert(denseAsSparse.numActives === 2)
  assert(denseAsSparse === denseSource)

  // Starting from a sparse vector that stores an explicit zero at index 0:
  // toSparse must drop that zero while still comparing equal to the source.
  val sparseSource = Vectors.sparse(4, Array(0, 1, 2), Array(0.0, 2.0, 3.0))
  assert(sparseSource.toDense === sparseSource)
  val sparseAsSparse = sparseSource.toSparse
  assert(sparseAsSparse.numActives === 2)
  assert(sparseAsSparse === sparseSource)
}

test("Vector.compressed") {
  // Dense input with three nonzeros out of four: expected to stay dense.
  // The cast fails the test if another format is chosen.
  val mostlyFullDense = Vectors.dense(1.0, 2.0, 3.0, 0.0)
  val mostlyFullDenseCompressed = mostlyFullDense.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullDenseCompressed === mostlyFullDense)

  // Dense input with a single nonzero: expected to become sparse with one active entry.
  val mostlyEmptyDense = Vectors.dense(0.0, 2.0, 0.0, 0.0)
  val mostlyEmptyDenseCompressed = mostlyEmptyDense.compressed.asInstanceOf[SparseVector]
  assert(mostlyEmptyDense === mostlyEmptyDenseCompressed)
  assert(mostlyEmptyDenseCompressed.numActives === 1)

  // Sparse input storing an explicit zero: expected to stay sparse with the zero removed.
  val sparseWithZero = Vectors.sparse(4, Array(1, 2), Array(2.0, 0.0))
  val sparseWithZeroCompressed = sparseWithZero.compressed.asInstanceOf[SparseVector]
  assert(sparseWithZero === sparseWithZeroCompressed)
  assert(sparseWithZeroCompressed.numActives === 1)

  // Sparse input with three nonzeros out of four: expected to become dense.
  val mostlyFullSparse = Vectors.sparse(4, Array(0, 1, 2), Array(1.0, 2.0, 3.0))
  val mostlyFullSparseCompressed = mostlyFullSparse.compressed.asInstanceOf[DenseVector]
  assert(mostlyFullSparse === mostlyFullSparseCompressed)
}
}

0 comments on commit da54179

Please sign in to comment.