From aaa8f25a579d9c9aa191734377b503fb73299b78 Mon Sep 17 00:00:00 2001 From: Travis Galoppo Date: Mon, 22 Dec 2014 15:20:47 -0500 Subject: [PATCH] MLUtils: changed privacy of EPSILON from [util] to [mllib] GaussianMixtureEM: Renamed from GaussianMixtureModelEM; corrected formatting issues GaussianMixtureModel: Renamed predictLabels() to predict() Others: Modifications based on rename of GaussianMixtureEM --- .../org/apache/spark/examples/mllib/DenseGmmEM.scala | 6 +++--- ...nMixtureModelEM.scala => GaussianMixtureEM.scala} | 11 +++++------ .../mllib/clustering/GaussianMixtureModel.scala | 12 +++++------- .../scala/org/apache/spark/mllib/util/MLUtils.scala | 2 +- .../clustering/GMMExpectationMaximizationSuite.scala | 4 ++-- 5 files changed, 16 insertions(+), 19 deletions(-) rename mllib/src/main/scala/org/apache/spark/mllib/clustering/{GaussianMixtureModelEM.scala => GaussianMixtureEM.scala} (97%) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala index 02d73b1af59bf..948c350953e27 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGmmEM.scala @@ -18,7 +18,7 @@ package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.clustering.GaussianMixtureModelEM +import org.apache.spark.mllib.clustering.GaussianMixtureEM import org.apache.spark.mllib.linalg.Vectors /** @@ -46,7 +46,7 @@ object DenseGmmEM { Vectors.dense(line.trim.split(' ').map(_.toDouble)) }.cache() - val clusters = new GaussianMixtureModelEM() + val clusters = new GaussianMixtureEM() .setK(k) .setConvergenceTol(convergenceTol) .setMaxIterations(maxIterations) @@ -58,7 +58,7 @@ object DenseGmmEM { } println("Cluster labels (first <= 100):") - val clusterLabels = clusters.predictLabels(data) + val clusterLabels = clusters.predict(data) clusterLabels.take(100).foreach { x => print(" " + x) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala similarity index 97% rename from mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala rename to mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala index f985f3828952b..bdf984aee4dae 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModelEM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureEM.scala @@ -23,6 +23,7 @@ import breeze.linalg.{DenseVector => BreezeVector, DenseMatrix => BreezeMatrix, import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors} import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.util.MLUtils /** * This class performs expectation maximization for multivariate Gaussian @@ -41,7 +42,7 @@ import org.apache.spark.mllib.stat.impl.MultivariateGaussian * is considered to have occurred. * @param maxIterations The maximum number of iterations to perform */ -class GaussianMixtureModelEM private ( +class GaussianMixtureEM private ( private var k: Int, private var convergenceTol: Double, private var maxIterations: Int) extends Serializable { @@ -49,8 +50,6 @@ class GaussianMixtureModelEM private ( /** A default instance, 2 Gaussians, 100 iterations, 0.01 log-likelihood threshold */ def this() = this(2, 0.01, 100) - - // number of samples per cluster to use when initializing Gaussians private val nSamples = 5 @@ -190,8 +189,6 @@ class GaussianMixtureModelEM private ( // companion class to provide zero constructor for ExpectationSum private object ExpectationSum { - private val eps = math.pow(2.0, -52) - def zero(k: Int, d: Int): ExpectationSum = { new ExpectationSum(0.0, Array.fill(k)(0.0), Array.fill(k)(BreezeVector.zeros(d)), Array.fill(k)(BreezeMatrix.zeros(d,d))) @@ -203,7 +200,9 @@ private object ExpectationSum { weights: Array[Double], dists: Array[MultivariateGaussian]) (sums: ExpectationSum, x: BreezeVector[Double]): ExpectationSum = { - val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(x) } + val p = weights.zip(dists).map { + case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(x) + } val pSum = p.sum sums.logLikelihood += math.log(pSum) val xxt = x * new Transpose(x) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 0285a847bd1b3..11a110db1f7ca 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -22,6 +22,7 @@ import breeze.linalg.{DenseVector => BreezeVector} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.{Matrix, Vector} import org.apache.spark.mllib.stat.impl.MultivariateGaussian +import org.apache.spark.mllib.util.MLUtils /** * Multivariate Gaussian Mixture Model (GMM) consisting of k Gaussians, where points @@ -43,7 +44,7 @@ class GaussianMixtureModel( def k: Int = weight.length /** Maps given points to their cluster indices. */ - def predictLabels(points: RDD[Vector]): RDD[Int] = { + def predict(points: RDD[Vector]): RDD[Int] = { val responsibilityMatrix = predictMembership(points, mu, sigma, weight, k) responsibilityMatrix.map(r => r.indexOf(r.max)) } @@ -70,11 +71,6 @@ class GaussianMixtureModel( } } - // We use "eps" as the minimum likelihood density for any given point - // in every cluster; this prevents any divide by zero conditions for - // outlier points. - private val eps = math.pow(2.0, -52) - /** * Compute the partial assignments for each vector */ @@ -83,7 +79,9 @@ class GaussianMixtureModel( dists: Array[MultivariateGaussian], weights: Array[Double], k: Int): Array[Double] = { - val p = weights.zip(dists).map { case (weight, dist) => eps + weight * dist.pdf(pt) } + val p = weights.zip(dists).map { + case (weight, dist) => MLUtils.EPSILON + weight * dist.pdf(pt) + } val pSum = p.sum for (i <- 0 until k) { p(i) /= pSum diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 9353351af72a0..06e20e6451dd9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -38,7 +38,7 @@ import org.apache.spark.streaming.dstream.DStream */ object MLUtils { - private[util] lazy val EPSILON = { + private[mllib] lazy val EPSILON = { var eps = 1.0 while ((1.0 + (eps / 2.0)) != 1.0) { eps /= 2.0 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala index d19b23c7b1600..23feb82874b70 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GMMExpectationMaximizationSuite.scala @@ -36,7 +36,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex val Emu = Vectors.dense(5.0, 10.0) val Esigma = Matrices.dense(2, 2, Array(2.0 / 3.0, -2.0 / 3.0, -2.0 / 3.0, 2.0 / 3.0)) - val gmm = new GaussianMixtureModelEM().setK(1).run(data) + val gmm = new GaussianMixtureEM().setK(1).run(data) assert(gmm.weight(0) ~== Ew absTol 1E-5) assert(gmm.mu(0) ~== Emu absTol 1E-5) @@ -63,7 +63,7 @@ class GMMExpectationMaximizationSuite extends FunSuite with MLlibTestSparkContex val Emu = Array(Vectors.dense(-4.3673), Vectors.dense(5.1604)) val Esigma = Array(Matrices.dense(1, 1, Array(1.1098)), Matrices.dense(1, 1, Array(0.86644))) - val gmm = new GaussianMixtureModelEM() + val gmm = new GaussianMixtureEM() .setK(2) .setInitialModel(initialGmm) .run(data)