From aaa8f25a579d9c9aa191734377b503fb73299b78 Mon Sep 17 00:00:00 2001
From: Travis Galoppo <tjg2107@columbia.edu>
Date: Mon, 22 Dec 2014 15:20:47 -0500
Subject: [PATCH] MLUtils: changed visibility of EPSILON from private[util] to private[mllib]

GaussianMixtureEM: Renamed from GaussianMixtureModelEM; corrected formatting issues; replaced the private eps constant with MLUtils.EPSILON

GaussianMixtureModel: Renamed predictLabels() to predict(); replaced the private eps constant with MLUtils.EPSILON

Others: Updated call sites for the rename of GaussianMixtureModelEM to GaussianMixtureEM and of predictLabels() to predict()
---
 .../org/apache/spark/examples/mllib/DenseGmmEM.scala |  6 +++---
 ...nMixtureModelEM.scala => GaussianMixtureEM.scala} | 11 +++++------
 .../mllib/clustering/GaussianMixtureModel.scala      | 12 +++++-------
 .../scala/org/apache/spark/mllib/util/MLUtils.scala  |  2 +-
 .../clustering/GMMExpectationMaximizationSuite.scala |  4 ++--
 5 files changed, 16 insertions(+), 19 deletions(-)
 rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{GaussianMixtureModelEM.scala => GaussianMixtureEM.scala} (97%)

diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala
index 02d73b1af59bf..948c350953e27 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.examples.mllib
 
 import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.clustering.GaussianMixtureModelEM
+import org.apache.spark.mllib.clustering.GaussianMixtureEM
 import org.apache.spark.mllib.linalg.Vectors
 
 /**
@@ -46,7 +46,7 @@ object DenseGmmEM {
       Vectors.dense(line.trim.split(' ').map(_.toDouble))
     }.cache()
       
-    val clusters = new GaussianMixtureModelEM()
+    val clusters = new GaussianMixtureEM()
       .setK(k)
       .setConvergenceTol(convergenceTol)
       .setMaxIterations(maxIterations)
@@ -58,7 +58,7 @@ object DenseGmmEM {
     }
     
     println("Cluster labels (first <= 100):")
-    val clusterLabels = clusters.predictLabels(data)
+    val clusterLabels = clusters.predict(data)
     clusterLabels.take(100).foreach { x =>
       print(" " + x)
     }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
similarity index 97%
rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala
rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
index f985f3828952b..bdf984aee4dae 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala
@@ -23,6 +23,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix,
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
 import org.apache.spark.mllib.stat.impl.MultivariateGaussian
+import org.apache.spark.mllib.util.MLUtils
 
 /**
  * This class performs expectation maximization for multivariate Gaussian
@@ -41,7 +42,7 @@ import org.apache.spark.mllib.stat.impl.MultivariateGaussian
  * is considered to have occurred.
  * @param maxIterations The maximum number of iterations to perform
  */
-class GaussianMixtureModelEM private (
+class GaussianMixtureEM private (
     private var k: Int, 
     private var convergenceTol: Double, 
     private var maxIterations: Int) extends Serializable {
@@ -49,8 +50,6 @@ class GaussianMixtureModelEM private (
   /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */
   def this() = this(2, 0.01, 100)
   
-  
-  
   // number of samples per cluster to use when initializing Gaussians
   private val nSamples = 5
   
@@ -190,8 +189,6 @@ class GaussianMixtureModelEM private (
 
 // companion class to provide zero constructor for ExpectationSum
 private object ExpectationSum {
-  private val eps = math.pow(2.0, -52)
-  
   def zero(k: Int, d: Int): ExpectationSum = {
     new ExpectationSum(0.0, Array.fill(k)(0.0), 
       Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d)))
@@ -203,7 +200,9 @@ private object ExpectationSum {
       weights: Array[Double], 
       dists: Array[MultivariateGaussian])
       (sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = {
-    val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(x) }
+    val p = weights.zip(dists).map {
+      case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x)
+    }
     val pSum = p.sum
     sums.logLikelihood += math.log(pSum)
     val xxt = x * new Transpose(x)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 0285a847bd1b3..11a110db1f7ca 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -22,6 +22,7 @@ import breeze.linalg.{DenseVector => BreezeVector}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.linalg.{Matrix, Vector}
 import org.apache.spark.mllib.stat.impl.MultivariateGaussian
+import org.apache.spark.mllib.util.MLUtils
 
 /**
  * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points 
@@ -43,7 +44,7 @@ class GaussianMixtureModel(
   def k: Int = weight.length
 
   /** Maps given points to their cluster indices. */
-  def predictLabels(points: RDD[Vector]): RDD[Int] = {
+  def predict(points: RDD[Vector]): RDD[Int] = {
     val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k)
     responsibilityMatrix.map(r => r.indexOf(r.max))
   }
@@ -70,11 +71,6 @@ class GaussianMixtureModel(
     }
   }
   
-  // We use "eps" as the minimum likelihood density for any given point
-  // in every cluster; this prevents any divide by zero conditions for
-  // outlier points.
-  private val eps = math.pow(2.0, -52)
-  
   /**
    * Compute the partial assignments for each vector
    */
@@ -83,7 +79,9 @@ class GaussianMixtureModel(
       dists: Array[MultivariateGaussian],
       weights: Array[Double],
       k: Int): Array[Double] = {
-    val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(pt) }
+    val p = weights.zip(dists).map {
+      case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt)
+    }
     val pSum = p.sum 
     for (i <- 0 until k) {
       p(i) /= pSum
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 9353351af72a0..06e20e6451dd9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -38,7 +38,7 @@ import org.apache.spark.streaming.dstream.DStream
  */
 object MLUtils {
 
-  private[util] lazy val EPSILON = {
+  private[mllib] lazy val EPSILON = {
     var eps = 1.0
     while ((1.0 + (eps / 2.0)) != 1.0) {
       eps /= 2.0
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
index d19b23c7b1600..23feb82874b70 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala
@@ -36,7 +36,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex
     val Emu = Vectors.dense(5.0, 10.0)
     val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0))
     
-    val gmm = new GaussianMixtureModelEM().setK(1).run(data)
+    val gmm = new GaussianMixtureEM().setK(1).run(data)
                 
     assert(gmm.weight(0) ~== Ew absTol 1E-5)
     assert(gmm.mu(0) ~== Emu absTol 1E-5)
@@ -63,7 +63,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex
     val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604))
     val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644)))
     
-    val gmm = new GaussianMixtureModelEM()
+    val gmm = new GaussianMixtureEM()
       .setK(2)
       .setInitialModel(initialGmm)
       .run(data)