From ce73c63e8bac40b02ae0a8147c3b424783f6094a Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Fri, 16 Jan 2015 08:06:06 -0800 Subject: [PATCH 01/23] added Bernoulli option to niave bayes model in mllib, added optional model type parameter for training. When Bernoulli is given the Bernoulli smoothing is used for fitting and for prediction http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html --- .../mllib/classification/NaiveBayes.scala | 104 ++++++++++++++---- .../classification/NaiveBayesSuite.scala | 86 ++++++++++++--- 2 files changed, 158 insertions(+), 32 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index a967df857bed3..12b5a0f4178f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -17,7 +17,8 @@ package org.apache.spark.mllib.classification -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum} +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} +import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.{SparkException, Logging} import org.apache.spark.SparkContext._ @@ -25,6 +26,15 @@ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD + +/** + * + */ +object NaiveBayesModels extends Enumeration { + type NaiveBayesModels = Value + val Multinomial, Bernoulli = Value +} + /** * Model for Naive Bayes Classifiers. * @@ -32,28 +42,42 @@ import org.apache.spark.rdd.RDD * @param pi log of class priors, whose dimension is C, number of labels * @param theta log of class conditional probabilities, whose dimension is C-by-D, * where D is number of features + * @param model The type of NB model to fit from the enumeration NaiveBayesModels, can be + * Multinomial or Bernoulli */ + class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], - val theta: Array[Array[Double]]) extends ClassificationModel with Serializable { - - private val brzPi = new BDV[Double](pi) - private val brzTheta = new BDM[Double](theta.length, theta(0).length) + val theta: Array[Array[Double]], + val model: NaiveBayesModels) extends ClassificationModel with Serializable { - { - // Need to put an extra pair of braces to prevent Scala treating `i` as a member. + def populateMatrix(arrayIn: Array[Array[Double]], + matrixIn: BDM[Double], + transformation: (Double) => Double = (x) => x) = { var i = 0 - while (i < theta.length) { + while (i < arrayIn.length) { var j = 0 - while (j < theta(i).length) { - brzTheta(i, j) = theta(i)(j) + while (j < arrayIn(i).length) { + matrixIn(i, j) = transformation(theta(i)(j)) j += 1 } i += 1 } } + private val brzPi = new BDV[Double](pi) + private val brzTheta = new BDM[Double](theta.length, theta(0).length) + populateMatrix(theta, brzTheta) + + private val brzNegTheta: Option[BDM[Double]] = model match { + case NaiveBayesModels.Multinomial => None + case NaiveBayesModels.Bernoulli => + val negTheta = new BDM[Double](theta.length, theta(0).length) + populateMatrix(theta, negTheta, (x) => math.log(1.0 - math.exp(x))) + Option(negTheta) + } + override def predict(testData: RDD[Vector]): RDD[Double] = { val bcModel = testData.context.broadcast(this) testData.mapPartitions { iter => @@ -63,7 +87,14 @@ class NaiveBayesModel private[mllib] ( } override def predict(testData: Vector): Double = { - labels(brzArgmax(brzPi + brzTheta * testData.toBreeze)) + model match { + case NaiveBayesModels.Multinomial => + labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) + case NaiveBayesModels.Bernoulli => + labels (brzArgmax (brzPi + + (brzTheta - brzNegTheta.get) * testData.toBreeze + + brzSum(brzNegTheta.get, Axis._1))) + } } } @@ -75,9 +106,12 @@ class NaiveBayesModel private[mllib] ( * document classification. By making every vector a 0-1 vector, it can also be used as * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. */ -class NaiveBayes private (private var lambda: Double) extends Serializable with Logging { +class NaiveBayes private (private var lambda: Double, + var model: NaiveBayesModels) extends Serializable with Logging { - def this() = this(1.0) + def this(lambda: Double) = this(lambda, NaiveBayesModels.Multinomial) + + def this() = this(1.0, NaiveBayesModels.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ def setLambda(lambda: Double): NaiveBayes = { @@ -85,6 +119,13 @@ class NaiveBayes private (private var lambda: Double) extends Serializable with this } + /** Set the model type. Default: Multinomial. */ + def setModelType(model: NaiveBayesModels): NaiveBayes = { + this.model = model + this + } + + /** * Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries. * @@ -118,21 +159,27 @@ class NaiveBayes private (private var lambda: Double) extends Serializable with mergeCombiners = (c1: (Long, BDV[Double]), c2: (Long, BDV[Double])) => (c1._1 + c2._1, c1._2 += c2._2) ).collect() + val numLabels = aggregated.length var numDocuments = 0L aggregated.foreach { case (_, (n, _)) => numDocuments += n } val numFeatures = aggregated.head match { case (_, (_, v)) => v.size } + val labels = new Array[Double](numLabels) val pi = new Array[Double](numLabels) val theta = Array.fill(numLabels)(new Array[Double](numFeatures)) + val piLogDenom = math.log(numDocuments + numLabels * lambda) var i = 0 aggregated.foreach { case (label, (n, sumTermFreqs)) => labels(i) = label - val thetaLogDenom = math.log(brzSum(sumTermFreqs) + numFeatures * lambda) pi(i) = math.log(n + lambda) - piLogDenom + val thetaLogDenom = model match { + case NaiveBayesModels.Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) + case NaiveBayesModels.Bernoulli => math.log(n + 2.0 * lambda) + } var j = 0 while (j < numFeatures) { theta(i)(j) = math.log(sumTermFreqs(j) + lambda) - thetaLogDenom @@ -141,7 +188,7 @@ class NaiveBayes private (private var lambda: Double) extends Serializable with i += 1 } - new NaiveBayesModel(labels, pi, theta) + new NaiveBayesModel(labels, pi, theta, model) } } @@ -154,8 +201,7 @@ object NaiveBayes { * * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for - * document classification. By making every vector a 0-1 vector, it can also be used as - * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). + * document classification. * * This version of the method uses a default smoothing parameter of 1.0. * @@ -171,8 +217,7 @@ object NaiveBayes { * * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for - * document classification. By making every vector a 0-1 vector, it can also be used as - * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). + * document classification. * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. @@ -181,4 +226,25 @@ object NaiveBayes { def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { new NaiveBayes(lambda).run(input) } + + + /** + * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. + * + * This is by default the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle + * all kinds of discrete data. For example, by converting documents into TF-IDF vectors, + * it can be used for document classification. By making every vector a 0-1 vector and + * setting the model type to NaiveBayesModels.Bernoulli, it fits and predicts as + * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). + * + * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency + * vector or a count vector. + * @param lambda The smoothing parameter + * + * @param model The type of NB model to fit from the enumeration NaiveBayesModels, can be + * Multinomial or Bernoulli + */ + def train(input: RDD[LabeledPoint], lambda: Double, model: NaiveBayesModels): NaiveBayesModel = { + new NaiveBayes(lambda, model).run(input) + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index e68fe89d6ccea..44ba6118eb61d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -17,6 +17,10 @@ package org.apache.spark.mllib.classification +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} +import breeze.stats.distributions.Multinomial +import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels + import scala.util.Random import org.scalatest.FunSuite @@ -39,10 +43,12 @@ object NaiveBayesSuite { // Generate input of the form Y = (theta * x).argmax() def generateNaiveBayesInput( - pi: Array[Double], // 1XC - theta: Array[Array[Double]], // CXD - nPoints: Int, - seed: Int): Seq[LabeledPoint] = { + pi: Array[Double], // 1XC + theta: Array[Array[Double]], // CXD + nPoints: Int, + seed: Int, + dataModel: NaiveBayesModels = NaiveBayesModels.Multinomial, + sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) @@ -51,8 +57,17 @@ object NaiveBayesSuite { for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) - val xi = Array.tabulate[Double](D) { j => - if (rnd.nextDouble() < _theta(y)(j)) 1 else 0 + val xi = dataModel match { + case NaiveBayesModels.Bernoulli => Array.tabulate[Double] (D) {j => + if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0 + } + case NaiveBayesModels.Multinomial => + val mult = Multinomial(BDV(_theta(y))) + val emptyMap = (0 until D).map(x => (x, 0.0)).toMap + val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map { + case (index, reps) => (index, reps.size.toDouble) + } + counts.toArray.sortBy(_._1).map(_._2) } LabeledPoint(y, Vectors.dense(xi)) @@ -71,23 +86,68 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { assert(numOfPredictions < input.length / 5) } - test("Naive Bayes") { + def validateModelFit(piData: Array[Double], thetaData: Array[Array[Double]], model: NaiveBayesModel) = { + def closeFit(d1: Double, d2: Double, precision: Double): Boolean = { + (d1 - d2).abs <= precision + } + val modelIndex = (0 until piData.length).zip(model.labels.map(_.toInt)) + for (i <- modelIndex) { + assert(closeFit(math.exp(piData(i._2)), math.exp(model.pi(i._1)), 0.05)) + } + for (i <- modelIndex) { + val sortedData = thetaData(i._2).sorted + val sortedModel = model.theta(i._1).sorted + for (j <- 0 until sortedData.length) { + assert(closeFit(math.exp(sortedData(j)), math.exp(sortedModel(j)), 0.05)) + } + } + } + + test("Naive Bayes Multinomial") { + val nPoints = 1000 + + val pi = Array(0.5, 0.1, 0.4).map(math.log) + val theta = Array( + Array(0.70, 0.10, 0.10, 0.10), // label 0 + Array(0.10, 0.70, 0.10, 0.10), // label 1 + Array(0.10, 0.10, 0.70, 0.10) // label 2 + ).map(_.map(math.log)) + + val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42, NaiveBayesModels.Multinomial) + val testRDD = sc.parallelize(testData, 2) + testRDD.cache() + + val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Multinomial) + validateModelFit(pi, theta, model) + + val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17, NaiveBayesModels.Multinomial) + val validationRDD = sc.parallelize(validationData, 2) + + // Test prediction on RDD. + validatePrediction(model.predict(validationRDD.map(_.features)).collect(), validationData) + + // Test prediction on Array. + validatePrediction(validationData.map(row => model.predict(row.features)), validationData) + } + + test("Naive Bayes Bernoulli") { val nPoints = 10000 val pi = Array(0.5, 0.3, 0.2).map(math.log) val theta = Array( - Array(0.91, 0.03, 0.03, 0.03), // label 0 - Array(0.03, 0.91, 0.03, 0.03), // label 1 - Array(0.03, 0.03, 0.91, 0.03) // label 2 + Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0 + Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1 + Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2 ).map(_.map(math.log)) - val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42) + val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 45, NaiveBayesModels.Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD) + val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Bernoulli) ///!!! this gives same result on both models check the math + validateModelFit(pi, theta, model) - val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17) + val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20, NaiveBayesModels.Bernoulli) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. From 4a3676d8d7e8c30778f95e9f479d97b4b1651ce4 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Tue, 20 Jan 2015 16:19:14 -0800 Subject: [PATCH 02/23] Updated changes re-comments. Got rid of verbose populateMatrix method. Public api now has string instead of enumeration. Docs are updated." --- docs/mllib-naive-bayes.md | 17 ++++++----- .../mllib/classification/NaiveBayes.scala | 28 +++++-------------- .../classification/NaiveBayesSuite.scala | 5 ++-- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index d5b044d94fdd7..a71b93fe0daf4 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -13,12 +13,15 @@ compute the conditional probability distribution of label given an observation and use it for prediction. MLlib supports [multinomial naive -Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes), -which is typically used for [document -classification](http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html). +Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes) +and [Bernoulli naive Bayes] (http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html). +Which are typically used for [document classification] +(http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html). Within that context, each observation is a document and each -feature represents a term whose value is the frequency of the term. -Feature values must be nonnegative to represent term frequencies. +feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or +a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes). +Feature values must be nonnegative.The model type is selected with on optional parameter +"Multinomial" or "Bernoulli" with "Multinomial" as the default. [Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature vectors are usually sparse, and sparse vectors should be supplied as input to take advantage of @@ -32,7 +35,7 @@ sparsity. Since the training data is only used once, it is not necessary to cach [NaiveBayes](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayes$) implements multinomial naive Bayes. It takes an RDD of [LabeledPoint](api/scala/index.html#org.apache.spark.mllib.regression.LabeledPoint) and an optional -smoothing parameter `lambda` as input, and output a +smoothing parameter `lambda` as input, an optional model type parameter (default is Multinomial), and outputs a [NaiveBayesModel](api/scala/index.html#org.apache.spark.mllib.classification.NaiveBayesModel), which can be used for evaluation and prediction. @@ -51,7 +54,7 @@ val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0) val test = splits(1) -val model = NaiveBayes.train(training, lambda = 1.0) +val model = NaiveBayes.train(training, lambda = 1.0, model = "Multinomial") val predictionAndLabel = test.map(p => (model.predict(p.features), p.label)) val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 12b5a0f4178f6..cadfe85c76a19 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -18,12 +18,13 @@ package org.apache.spark.mllib.classification import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} -import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels +import breeze.numerics.{exp => brzExp, log => brzLog} import org.apache.spark.{SparkException, Logging} import org.apache.spark.SparkContext._ import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.rdd.RDD @@ -52,29 +53,14 @@ class NaiveBayesModel private[mllib] ( val theta: Array[Array[Double]], val model: NaiveBayesModels) extends ClassificationModel with Serializable { - def populateMatrix(arrayIn: Array[Array[Double]], - matrixIn: BDM[Double], - transformation: (Double) => Double = (x) => x) = { - var i = 0 - while (i < arrayIn.length) { - var j = 0 - while (j < arrayIn(i).length) { - matrixIn(i, j) = transformation(theta(i)(j)) - j += 1 - } - i += 1 - } - } - private val brzPi = new BDV[Double](pi) - private val brzTheta = new BDM[Double](theta.length, theta(0).length) - populateMatrix(theta, brzTheta) + private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t private val brzNegTheta: Option[BDM[Double]] = model match { case NaiveBayesModels.Multinomial => None case NaiveBayesModels.Bernoulli => - val negTheta = new BDM[Double](theta.length, theta(0).length) - populateMatrix(theta, negTheta, (x) => math.log(1.0 - math.exp(x))) + val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) + //((x) => math.log(1.0 - math.exp(x)) Option(negTheta) } @@ -244,7 +230,7 @@ object NaiveBayes { * @param model The type of NB model to fit from the enumeration NaiveBayesModels, can be * Multinomial or Bernoulli */ - def train(input: RDD[LabeledPoint], lambda: Double, model: NaiveBayesModels): NaiveBayesModel = { - new NaiveBayes(lambda, model).run(input) + def train(input: RDD[LabeledPoint], lambda: Double, model: String): NaiveBayesModel = { + new NaiveBayes(lambda, NaiveBayesModels.withName(model)).run(input) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 44ba6118eb61d..d269377c7c9d7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -117,7 +117,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Multinomial) + val model = NaiveBayes.train(testRDD, 1.0, "Multinomial") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17, NaiveBayesModels.Multinomial) @@ -140,11 +140,12 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2 ).map(_.map(math.log)) + val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 45, NaiveBayesModels.Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, NaiveBayesModels.Bernoulli) ///!!! this gives same result on both models check the math + val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") ///!!! this gives same result on both models check the math validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20, NaiveBayesModels.Bernoulli) From 0313c0cbf8d41b9bcfb0536df253f6af0f1398f7 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Wed, 21 Jan 2015 09:43:00 -0800 Subject: [PATCH 03/23] fixed style error in NaiveBayes.scala --- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index cadfe85c76a19..4ec4bdf9f18a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -59,8 +59,7 @@ class NaiveBayesModel private[mllib] ( private val brzNegTheta: Option[BDM[Double]] = model match { case NaiveBayesModels.Multinomial => None case NaiveBayesModels.Bernoulli => - val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) - //((x) => math.log(1.0 - math.exp(x)) + val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) Option(negTheta) } From 76e5b0f90e370e2cda20e1348bf40ff890f51782 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Mon, 26 Jan 2015 10:29:47 -0800 Subject: [PATCH 04/23] removed unnecessary sort from test --- .../apache/spark/mllib/classification/NaiveBayesSuite.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index d269377c7c9d7..853e4501d8685 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -95,10 +95,8 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { assert(closeFit(math.exp(piData(i._2)), math.exp(model.pi(i._1)), 0.05)) } for (i <- modelIndex) { - val sortedData = thetaData(i._2).sorted - val sortedModel = model.theta(i._1).sorted - for (j <- 0 until sortedData.length) { - assert(closeFit(math.exp(sortedData(j)), math.exp(sortedModel(j)), 0.05)) + for (j <- 0 until thetaData(i._2).length) { + assert(closeFit(math.exp(thetaData(i._2)(j)), math.exp(model.theta(i._1)(j)), 0.05)) } } } From d9477ed8450594de9f2da24af8f82c82def5ce24 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 26 Feb 2015 09:16:12 -0800 Subject: [PATCH 05/23] removed old inaccurate comment from test suite for mllib naive bayes --- .../org/apache/spark/mllib/classification/NaiveBayesSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 853e4501d8685..29871215f9deb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -143,7 +143,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") ///!!! this gives same result on both models check the math + val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20, NaiveBayesModels.Bernoulli) From 5a4a534d3636100546b5fa86d2d7ec2ed2051582 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Fri, 27 Feb 2015 08:56:24 -0800 Subject: [PATCH 06/23] fixed scala style error in NaiveBayes --- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 4269d80f028fa..fbdf53ec28963 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -111,7 +111,10 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { def thisClassName = "org.apache.spark.mllib.classification.NaiveBayesModel" /** Model data for model import/export */ - case class Data(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]], modelType: String) + case class Data(labels: Array[Double], + pi: Array[Double], + theta: Array[Array[Double]], + modelType: String) def save(sc: SparkContext, path: String, data: Data): Unit = { val sqlContext = new SQLContext(sc) From b61b5e2d91582689642fb045849df62a16ce111c Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Mon, 2 Mar 2015 10:50:18 -0800 Subject: [PATCH 07/23] added back compatable constructor to NaiveBayesModel to fix MIMA test failure --- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index fbdf53ec28963..8f9418deb045a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -62,6 +62,9 @@ class NaiveBayesModel private[mllib] ( val theta: Array[Array[Double]], val modelType: NaiveBayesModels) extends ClassificationModel with Serializable with Saveable { + def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = + this(labels, pi, theta, NaiveBayesModels.Multinomial) + private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t From 37305729334922c40804752598a30a2fb892c317 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 3 Mar 2015 15:22:20 -0800 Subject: [PATCH 08/23] modified NB model type to be more Java-friendly --- .../spark/examples/mllib/JavaNaiveBayes.java | 67 ++++++++++++++++ .../examples/mllib/SparseNaiveBayes.scala | 4 + .../mllib/classification/NaiveBayes.scala | 77 +++++++++++-------- 3 files changed, 117 insertions(+), 31 deletions(-) create mode 100644 examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayes.java diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayes.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayes.java new file mode 100644 index 0000000000000..952648f14da89 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayes.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.mllib; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.mllib.classification.NaiveBayes; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.mllib.regression.LabeledPoint; + +import java.util.regex.Pattern; + +public final class JavaNaiveBayes { + + static class ParsePoint implements Function { + private static final Pattern COMMA = Pattern.compile(","); + private static final Pattern SPACE = Pattern.compile(" "); + + @Override + public LabeledPoint call(String line) { + String[] parts = COMMA.split(line); + double y = Double.parseDouble(parts[0]); + String[] tok = SPACE.split(parts[1]); + double[] x = new double[tok.length]; + for (int i = 0; i < tok.length; ++i) { + x[i] = Double.parseDouble(tok[i]); + } + return new LabeledPoint(y, Vectors.dense(x)); + } + } + + public static void main(String[] args) { + if (args.length != 3) { + System.err.println("Usage: JavaLR "); + System.exit(1); + } + SparkConf sparkConf = new SparkConf().setAppName("JavaLR"); + JavaSparkContext sc = new JavaSparkContext(sparkConf); + JavaRDD lines = sc.textFile(args[0]); + JavaRDD points = lines.map(new ParsePoint()).cache(); + double stepSize = Double.parseDouble(args[1]); + int iterations = Integer.parseInt(args[2]); + + // Example which compiles. (Don't actually include!) + NaiveBayes nb = new NaiveBayes(); + nb.setModelType(NaiveBayes.Bernoulli()); + + sc.stop(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala index f1ff4e6911f5e..9c157a6719084 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SparseNaiveBayes.scala @@ -89,6 +89,10 @@ object SparseNaiveBayes { println(s"numTraining = $numTraining, numTest = $numTest.") + // Example which compiles. (Don't actually include!) + val nb = new NaiveBayes() + nb.setModelType(NaiveBayes.Bernoulli) + val model = new NaiveBayes().setLambda(params.lambda).run(training) val prediction = model.predict(test.map(_.features)) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 8f9418deb045a..0ebfd87889ea8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -27,24 +27,11 @@ import org.json4s.{DefaultFormats, JValue} import org.apache.spark.{Logging, SparkContext, SparkException} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} -/** - * - */ -object NaiveBayesModels extends Enumeration { - type NaiveBayesModels = Value - val Multinomial, Bernoulli = Value - - implicit def toString(model: NaiveBayesModels): String = { - model.toString - } -} - /** * Model for Naive Bayes Classifiers. * @@ -60,17 +47,18 @@ class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], val theta: Array[Array[Double]], - val modelType: NaiveBayesModels) extends ClassificationModel with Serializable with Saveable { + val modelType: NaiveBayes.ModelType) + extends ClassificationModel with Serializable with Saveable { def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = - this(labels, pi, theta, NaiveBayesModels.Multinomial) + this(labels, pi, theta, NaiveBayes.Multinomial) private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t private val brzNegTheta: Option[BDM[Double]] = modelType match { - case NaiveBayesModels.Multinomial => None - case NaiveBayesModels.Bernoulli => + case NaiveBayes.Multinomial => None + case NaiveBayes.Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) Option(negTheta) } @@ -85,9 +73,9 @@ class NaiveBayesModel private[mllib] ( override def predict(testData: Vector): Double = { modelType match { - case NaiveBayesModels.Multinomial => + case NaiveBayes.Multinomial => labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) - case NaiveBayesModels.Bernoulli => + case NaiveBayes.Bernoulli => labels (brzArgmax (brzPi + (brzTheta - brzNegTheta.get) * testData.toBreeze + brzSum(brzNegTheta.get, Axis._1))) @@ -95,7 +83,7 @@ class NaiveBayesModel private[mllib] ( } override def save(sc: SparkContext, path: String): Unit = { - val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType) + val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType.toString) NaiveBayesModel.SaveLoadV1_0.save(sc, path, data) } @@ -147,15 +135,15 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val labels = data.getAs[Seq[Double]](0).toArray val pi = data.getAs[Seq[Double]](1).toArray val theta = data.getAs[Seq[Seq[Double]]](2).map(_.toArray).toArray - val modelType: NaiveBayesModels = NaiveBayesModels.withName(data.getAs[String](3)) + val modelType = NaiveBayes.ModelType.fromString(data.getString(3)) new NaiveBayesModel(labels, pi, theta, modelType) } } override def load(sc: SparkContext, path: String): NaiveBayesModel = { - def getModelType(metadata: JValue): NaiveBayesModels = { + def getModelType(metadata: JValue): NaiveBayes.ModelType = { implicit val formats = DefaultFormats - NaiveBayesModels.withName((metadata \ "modelType").extract[String]) + NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]) } val (loadedClassName, version, metadata) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -191,12 +179,13 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { * document classification. By making every vector a 0-1 vector, it can also be used as * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. */ -class NaiveBayes private (private var lambda: Double, - var modelType: NaiveBayesModels) extends Serializable with Logging { +class NaiveBayes private ( + private var lambda: Double, + var modelType: NaiveBayes.ModelType) extends Serializable with Logging { - def this(lambda: Double) = this(lambda, NaiveBayesModels.Multinomial) + def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) - def this() = this(1.0, NaiveBayesModels.Multinomial) + def this() = this(1.0, NaiveBayes.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ def setLambda(lambda: Double): NaiveBayes = { @@ -205,7 +194,7 @@ class NaiveBayes private (private var lambda: Double, } /** Set the model type. Default: Multinomial. */ - def setModelType(model: NaiveBayesModels): NaiveBayes = { + def setModelType(model: NaiveBayes.ModelType): NaiveBayes = { this.modelType = model this } @@ -262,8 +251,8 @@ class NaiveBayes private (private var lambda: Double, labels(i) = label pi(i) = math.log(n + lambda) - piLogDenom val thetaLogDenom = modelType match { - case NaiveBayesModels.Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) - case NaiveBayesModels.Bernoulli => math.log(n + 2.0 * lambda) + case NaiveBayes.Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) + case NaiveBayes.Bernoulli => math.log(n + 2.0 * lambda) } var j = 0 while (j < numFeatures) { @@ -330,6 +319,32 @@ object NaiveBayes { * Multinomial or Bernoulli */ def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { - new NaiveBayes(lambda, NaiveBayesModels.withName(modelType)).run(input) + new NaiveBayes(lambda, Multinomial).run(input) } + + sealed abstract class ModelType + + object MODELTYPE { + final val MULTINOMIAL_STRING = "multinomial" + final val BERNOULLI_STRING = "bernoulli" + + def fromString(modelType: String): ModelType = modelType match { + case MULTINOMIAL_STRING => Multinomial + case BERNOULLI_STRING => Bernoulli + case _ => + throw new IllegalArgumentException(s"Cannot recognize NaiveBayes ModelType: $modelType") + } + } + + final val ModelType = MODELTYPE + + final val Multinomial: ModelType = new ModelType { + override def toString: String = ModelType.MULTINOMIAL_STRING + } + + final val Bernoulli: ModelType = new ModelType { + override def toString: String = ModelType.BERNOULLI_STRING + } + } + From 7622b0c002c12efd8fb2c6fa34a691c82c86edd8 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 5 Mar 2015 11:07:25 -0800 Subject: [PATCH 09/23] added comments and fixed style as per rb --- .../mllib/classification/NaiveBayes.scala | 47 +++++++++++-------- .../classification/NaiveBayesSuite.scala | 37 +++++++++++---- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 8f9418deb045a..cb864caca47af 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -25,16 +25,17 @@ import org.json4s.jackson.JsonMethods._ import org.json4s.{DefaultFormats, JValue} import org.apache.spark.{Logging, SparkContext, SparkException} +import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} /** - * + * Model types supported in Naive Bayes: + * multinomial and Bernoulli currently supported */ object NaiveBayesModels extends Enumeration { type NaiveBayesModels = Value @@ -45,6 +46,8 @@ object NaiveBayesModels extends Enumeration { } } + + /** * Model for Naive Bayes Classifiers. * @@ -55,7 +58,6 @@ object NaiveBayesModels extends Enumeration { * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be * Multinomial or Bernoulli */ - class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], @@ -68,11 +70,14 @@ class NaiveBayesModel private[mllib] ( private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t - private val brzNegTheta: Option[BDM[Double]] = modelType match { - case NaiveBayesModels.Multinomial => None + //Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0 + //precomputing log(1.0 - exp(theta)) and its sum for linear algebra application + //of this condition in predict function + private val (brzNegTheta, brzNegThetaSum) = modelType match { + case NaiveBayesModels.Multinomial => (None, None) case NaiveBayesModels.Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) - Option(negTheta) + (Option(negTheta), Option(brzSum(brzNegTheta, Axis._1))) } override def predict(testData: RDD[Vector]): RDD[Double] = { @@ -89,8 +94,7 @@ class NaiveBayesModel private[mllib] ( labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) case NaiveBayesModels.Bernoulli => labels (brzArgmax (brzPi + - (brzTheta - brzNegTheta.get) * testData.toBreeze + - brzSum(brzNegTheta.get, Axis._1))) + (brzTheta - brzNegTheta.get) * testData.toBreeze + brzNegThetaSum.get)) } } @@ -114,10 +118,11 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { def thisClassName = "org.apache.spark.mllib.classification.NaiveBayesModel" /** Model data for model import/export */ - case class Data(labels: Array[Double], - pi: Array[Double], - theta: Array[Array[Double]], - modelType: String) + case class Data( + labels: Array[Double], + pi: Array[Double], + theta: Array[Array[Double]], + modelType: String) def save(sc: SparkContext, path: String, data: Data): Unit = { val sqlContext = new SQLContext(sc) @@ -192,7 +197,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The input feature values must be nonnegative. */ class NaiveBayes private (private var lambda: Double, - var modelType: NaiveBayesModels) extends Serializable with Logging { + private var modelType: NaiveBayesModels) extends Serializable with Logging { def this(lambda: Double) = this(lambda, NaiveBayesModels.Multinomial) @@ -284,7 +289,7 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of + * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for * document classification. * @@ -300,7 +305,7 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of + * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for * document classification. * @@ -316,11 +321,13 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is by default the Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle - * all kinds of discrete data. For example, by converting documents into TF-IDF vectors, - * it can be used for document classification. By making every vector a 0-1 vector and - * setting the model type to NaiveBayesModels.Bernoulli, it fits and predicts as - * Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). + * The model type can be set to either Multinomial NB ([[http://tinyurl.com/lsdw6p]]) + * or Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The Multinomial NB can handle + * discrete count data and can be called by setting the model type to "Multinomial". + * For example, it can be used with word counts or TF_IDF vectors of documents. + * The Bernoulli model fits presence or absence (0-1) counts. By making every vector a + * 0-1 vector and setting the model type to "Bernoulli", the fits and predicts as + * Bernoulli NB. * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 93acb424dd5a4..eceea68a0284b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -19,13 +19,13 @@ package org.apache.spark.mllib.classification import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} import breeze.stats.distributions.Multinomial -import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import scala.util.Random import org.scalatest.FunSuite import org.apache.spark.SparkException +import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} @@ -49,7 +49,7 @@ object NaiveBayesSuite { theta: Array[Array[Double]], // CXD nPoints: Int, seed: Int, - dataModel: NaiveBayesModels = NaiveBayesModels.Multinomial, + dataModel: NaiveBayesModels= NaiveBayesModels.Multinomial, sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) @@ -92,7 +92,10 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { assert(numOfPredictions < input.length / 5) } - def validateModelFit(piData: Array[Double], thetaData: Array[Array[Double]], model: NaiveBayesModel) = { + def validateModelFit( + piData: Array[Double], + thetaData: Array[Array[Double]], + model: NaiveBayesModel) = { def closeFit(d1: Double, d2: Double, precision: Double): Boolean = { (d1 - d2).abs <= precision } @@ -117,14 +120,20 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { Array(0.10, 0.10, 0.70, 0.10) // label 2 ).map(_.map(math.log)) - val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 42, NaiveBayesModels.Multinomial) + val testData = NaiveBayesSuite.generateNaiveBayesInput( + pi, theta, nPoints, 42, NaiveBayesModels.Multinomial) val testRDD = sc.parallelize(testData, 2) testRDD.cache() val model = NaiveBayes.train(testRDD, 1.0, "Multinomial") validateModelFit(pi, theta, model) - val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 17, NaiveBayesModels.Multinomial) + val validationData = NaiveBayesSuite.generateNaiveBayesInput( + pi, + theta, + nPoints, + 17, + NaiveBayesModels.Multinomial) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -144,14 +153,24 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2 ).map(_.map(math.log)) - val testData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 45, NaiveBayesModels.Bernoulli) + val testData = NaiveBayesSuite.generateNaiveBayesInput( + pi, + theta, + nPoints, + 45, + NaiveBayesModels.Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") validateModelFit(pi, theta, model) - val validationData = NaiveBayesSuite.generateNaiveBayesInput(pi, theta, nPoints, 20, NaiveBayesModels.Bernoulli) + val validationData = NaiveBayesSuite.generateNaiveBayesInput( + pi, + theta, + nPoints, + 20, + NaiveBayesModels.Bernoulli) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -218,8 +237,8 @@ class NaiveBayesClusterSuite extends FunSuite with LocalClusterSparkContext { LabeledPoint(random.nextInt(2), Vectors.dense(Array.fill(n)(random.nextDouble()))) } } - // If we serialize data directly in the task closure, the size of the serialized task would be - // greater than 1MB and hence Spark would throw an error. + // If we serialize data directly in the task closure, the size of the serialized task + // would be greater than 1MB and hence Spark would throw an error. val model = NaiveBayes.train(examples) val predictions = model.predict(examples.map(_.features)) } From e01656978174f8ecbd75ef6a50211234a1babfc6 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 5 Mar 2015 11:28:05 -0800 Subject: [PATCH 10/23] updated test suite with model type fix --- .../mllib/classification/NaiveBayesSuite.scala | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index eceea68a0284b..0874bb0b90ce4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -25,7 +25,6 @@ import scala.util.Random import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.classification.NaiveBayesModels.NaiveBayesModels import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} @@ -49,7 +48,7 @@ object NaiveBayesSuite { theta: Array[Array[Double]], // CXD nPoints: Int, seed: Int, - dataModel: NaiveBayesModels= NaiveBayesModels.Multinomial, + dataModel: NaiveBayes.ModelType = NaiveBayes.Multinomial, sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) @@ -60,10 +59,10 @@ object NaiveBayesSuite { for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) val xi = dataModel match { - case NaiveBayesModels.Bernoulli => Array.tabulate[Double] (D) {j => + case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) {j => if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0 } - case NaiveBayesModels.Multinomial => + case NaiveBayes.Multinomial => val mult = Multinomial(BDV(_theta(y))) val emptyMap = (0 until D).map(x => (x, 0.0)).toMap val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map { @@ -78,7 +77,7 @@ object NaiveBayesSuite { /** Binary labels, 3 features */ private val binaryModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), - theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayesModels.Bernoulli) + theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli) } class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { @@ -121,7 +120,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 42, NaiveBayesModels.Multinomial) + pi, theta, nPoints, 42, NaiveBayes.Multinomial) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -133,7 +132,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { theta, nPoints, 17, - NaiveBayesModels.Multinomial) + NaiveBayes.Multinomial) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -158,7 +157,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { theta, nPoints, 45, - NaiveBayesModels.Bernoulli) + NaiveBayes.Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -170,7 +169,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { theta, nPoints, 20, - NaiveBayesModels.Bernoulli) + NaiveBayes.Bernoulli) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. From 900b5864c16cc0db93a46ec3a4591a787e5a21a0 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 5 Mar 2015 11:53:46 -0800 Subject: [PATCH 11/23] fixed model call so that uses type argument --- .../apache/spark/mllib/classification/NaiveBayes.scala | 8 ++++---- .../spark/mllib/classification/NaiveBayesSuite.scala | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 4e1c6a63fb01c..bcf5acdada671 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -310,10 +310,10 @@ object NaiveBayes { * * The model type can be set to either Multinomial NB ([[http://tinyurl.com/lsdw6p]]) * or Bernoulli NB ([[http://tinyurl.com/p7c96j6]]). The Multinomial NB can handle - * discrete count data and can be called by setting the model type to "Multinomial". + * discrete count data and can be called by setting the model type to "multinomial". * For example, it can be used with word counts or TF_IDF vectors of documents. * The Bernoulli model fits presence or absence (0-1) counts. By making every vector a - * 0-1 vector and setting the model type to "Bernoulli", the fits and predicts as + * 0-1 vector and setting the model type to "bernoulli", the fits and predicts as * Bernoulli NB. * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency @@ -321,10 +321,10 @@ object NaiveBayes { * @param lambda The smoothing parameter * * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be - * Multinomial or Bernoulli + * multinomial or bernoulli */ def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { - new NaiveBayes(lambda, Multinomial).run(input) + new NaiveBayes(lambda, MODELTYPE.fromString(modelType)).run(input) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 0874bb0b90ce4..f7310bef2bc9b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -124,7 +124,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, "Multinomial") + val model = NaiveBayes.train(testRDD, 1.0, "multinomial") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( @@ -161,7 +161,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") + val model = NaiveBayes.train(testRDD, 1.0, "bernoulli") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( From c298e78ba7d58bb4d7e9b54d56ce51fe6b6b10a9 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 5 Mar 2015 13:16:08 -0800 Subject: [PATCH 12/23] fixed scala style errors --- .../mllib/classification/NaiveBayes.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index bcf5acdada671..f6683051fad0e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -55,9 +55,9 @@ class NaiveBayesModel private[mllib] ( private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t - //Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0 - //this precomputes log(1.0 - exp(theta)) and its sum for linear algebra application - //of this condition in predict function + // Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0 + // this precomputes log(1.0 - exp(theta)) and its sum for linear algebra application + // of this condition in predict function private val (brzNegTheta, brzNegThetaSum) = modelType match { case NaiveBayes.Multinomial => (None, None) case NaiveBayes.Bernoulli => @@ -276,9 +276,9 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of - * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for - * document classification. + * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all + * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it + * can be used for document classification. * * This version of the method uses a default smoothing parameter of 1.0. * @@ -292,9 +292,9 @@ object NaiveBayes { /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * - * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all kinds of - * discrete data. For example, by converting documents into TF-IDF vectors, it can be used for - * document classification. + * This is the default Multinomial NB ([[http://tinyurl.com/lsdw6p]]) which can handle all + * kinds of discrete data. For example, by converting documents into TF-IDF vectors, it + * can be used for document classification. * * @param input RDD of `(label, array of features)` pairs. Every vector should be a frequency * vector or a count vector. From 2d0c1ba631841a0c55212fbc8dd7327285972ef8 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 5 Mar 2015 13:42:42 -0800 Subject: [PATCH 13/23] fixed typo in NaiveBayes --- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index f6683051fad0e..a2bc8c9aa64ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -62,7 +62,7 @@ class NaiveBayesModel private[mllib] ( case NaiveBayes.Multinomial => (None, None) case NaiveBayes.Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) - (Option(negTheta), Option(brzSum(brzNegTheta, Axis._1))) + (Option(negTheta), Option(brzSum(negTheta, Axis._1))) } override def predict(testData: RDD[Vector]): RDD[Double] = { From e2d925eb088f7cabb38024ecb7b0628557d261ba Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Fri, 6 Mar 2015 17:26:17 -0800 Subject: [PATCH 14/23] fixed nonserializable error that was causing naivebayes test failures --- .../mllib/classification/NaiveBayes.scala | 18 +++++++++--------- .../mllib/classification/NaiveBayesSuite.scala | 12 +++++++----- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index a2bc8c9aa64ab..61085225d01c8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -46,11 +46,11 @@ class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], val theta: Array[Array[Double]], - val modelType: NaiveBayes.ModelType) + val modelType: String) extends ClassificationModel with Serializable with Saveable { def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = - this(labels, pi, theta, NaiveBayes.Multinomial) + this(labels, pi, theta, NaiveBayes.Multinomial.toString) private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t @@ -58,7 +58,7 @@ class NaiveBayesModel private[mllib] ( // Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0 // this precomputes log(1.0 - exp(theta)) and its sum for linear algebra application // of this condition in predict function - private val (brzNegTheta, brzNegThetaSum) = modelType match { + private val (brzNegTheta, brzNegThetaSum) = NaiveBayes.ModelType.fromString(modelType) match { case NaiveBayes.Multinomial => (None, None) case NaiveBayes.Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) @@ -74,7 +74,7 @@ class NaiveBayesModel private[mllib] ( } override def predict(testData: Vector): Double = { - modelType match { + NaiveBayes.ModelType.fromString(modelType) match { case NaiveBayes.Multinomial => labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) case NaiveBayes.Bernoulli => @@ -84,7 +84,7 @@ class NaiveBayesModel private[mllib] ( } override def save(sc: SparkContext, path: String): Unit = { - val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType.toString) + val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType) NaiveBayesModel.SaveLoadV1_0.save(sc, path, data) } @@ -137,15 +137,15 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val labels = data.getAs[Seq[Double]](0).toArray val pi = data.getAs[Seq[Double]](1).toArray val theta = data.getAs[Seq[Seq[Double]]](2).map(_.toArray).toArray - val modelType = NaiveBayes.ModelType.fromString(data.getString(3)) + val modelType = NaiveBayes.ModelType.fromString(data.getString(3)).toString new NaiveBayesModel(labels, pi, theta, modelType) } } override def load(sc: SparkContext, path: String): NaiveBayesModel = { - def getModelType(metadata: JValue): NaiveBayes.ModelType = { + def getModelType(metadata: JValue): String = { implicit val formats = DefaultFormats - NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]) + NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]).toString } val (loadedClassName, version, metadata) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -265,7 +265,7 @@ class NaiveBayes private ( i += 1 } - new NaiveBayesModel(labels, pi, theta, modelType) + new NaiveBayesModel(labels, pi, theta, modelType.toString) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index f7310bef2bc9b..0d97a480bff5d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -52,7 +52,7 @@ object NaiveBayesSuite { sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) - +c val _pi = pi.map(math.pow(math.E, _)) val _theta = theta.map(row => row.map(math.pow(math.E, _))) @@ -77,7 +77,7 @@ object NaiveBayesSuite { /** Binary labels, 3 features */ private val binaryModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), - theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli) + theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli.toString) } class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { @@ -111,7 +111,6 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { test("Naive Bayes Multinomial") { val nPoints = 1000 - val pi = Array(0.5, 0.1, 0.4).map(math.log) val theta = Array( Array(0.70, 0.10, 0.10, 0.10), // label 0 @@ -120,7 +119,11 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 42, NaiveBayes.Multinomial) + pi, + theta, + nPoints, + 42, + NaiveBayes.Multinomial) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -144,7 +147,6 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { test("Naive Bayes Bernoulli") { val nPoints = 10000 - val pi = Array(0.5, 0.3, 0.2).map(math.log) val theta = Array( Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0 From fb0a5c70ce935cb8d9495152c809e06c8f350443 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Mon, 9 Mar 2015 13:36:36 -0700 Subject: [PATCH 15/23] removed typo --- .../org/apache/spark/mllib/classification/NaiveBayesSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 0d97a480bff5d..acc5b35c1bdb6 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -52,7 +52,6 @@ object NaiveBayesSuite { sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) -c val _pi = pi.map(math.pow(math.E, _)) val _theta = theta.map(row => row.map(math.pow(math.E, _))) From 01baad70f44fa12ad37a743d5d0fba861d89f149 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Wed, 11 Mar 2015 15:44:22 -0700 Subject: [PATCH 16/23] made fixes from code review --- docs/mllib-naive-bayes.md | 4 ++-- .../mllib/classification/NaiveBayes.scala | 22 ++++++++----------- .../classification/NaiveBayesSuite.scala | 14 +++--------- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md index 9e54fccad577a..d481eabe563bc 100644 --- a/docs/mllib-naive-bayes.md +++ b/docs/mllib-naive-bayes.md @@ -15,12 +15,12 @@ and use it for prediction. MLlib supports [multinomial naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes) and [Bernoulli naive Bayes] (http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html). -Which are typically used for [document classification] +These models are typically used for [document classification] (http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html). Within that context, each observation is a document and each feature represents a term whose value is the frequency of the term (in multinomial naive Bayes) or a zero or one indicating whether the term was found in the document (in Bernoulli naive Bayes). -Feature values must be nonnegative.The model type is selected with on optional parameter +Feature values must be nonnegative. The model type is selected with an optional parameter "Multinomial" or "Bernoulli" with "Multinomial" as the default. [Additive smoothing](http://en.wikipedia.org/wiki/Lidstone_smoothing) can be used by setting the parameter $\lambda$ (default to $1.0$). For document classification, the input feature diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 61085225d01c8..c7cb86fa30f0b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -49,15 +49,15 @@ class NaiveBayesModel private[mllib] ( val modelType: String) extends ClassificationModel with Serializable with Saveable { - def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = + private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = this(labels, pi, theta, NaiveBayes.Multinomial.toString) private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t - // Bernoulli scoring requires log(condprob) if 1 log(1-condprob) if 0 - // this precomputes log(1.0 - exp(theta)) and its sum for linear algebra application - // of this condition in predict function + // Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0. + // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra + // application of this condition (in predict function). private val (brzNegTheta, brzNegThetaSum) = NaiveBayes.ModelType.fromString(modelType) match { case NaiveBayes.Multinomial => (None, None) case NaiveBayes.Bernoulli => @@ -186,8 +186,6 @@ class NaiveBayes private ( private var lambda: Double, private var modelType: NaiveBayes.ModelType) extends Serializable with Logging { - def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) - def this() = this(1.0, NaiveBayes.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ @@ -202,6 +200,7 @@ class NaiveBayes private ( this } + def getModelType(): NaiveBayes.ModelType = this.modelType /** * Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries. @@ -301,10 +300,9 @@ object NaiveBayes { * @param lambda The smoothing parameter */ def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { - new NaiveBayes(lambda).run(input) + new NaiveBayes(lambda, NaiveBayes.Multinomial).run(input) } - /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * @@ -327,11 +325,7 @@ object NaiveBayes { new NaiveBayes(lambda, MODELTYPE.fromString(modelType)).run(input) } - - /** - * Model types supported in Naive Bayes: - * multinomial and Bernoulli currently supported - */ + /** Provides static methods for using ModelType. */ sealed abstract class ModelType object MODELTYPE { @@ -348,10 +342,12 @@ object NaiveBayes { final val ModelType = MODELTYPE + /** Constant for specifying ModelType parameter: multinomial model */ final val Multinomial: ModelType = new ModelType { override def toString: String = ModelType.MULTINOMIAL_STRING } + /** Constant for specifying ModelType parameter: bernoulli model */ final val Bernoulli: ModelType = new ModelType { override def toString: String = ModelType.BERNOULLI_STRING } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index acc5b35c1bdb6..7ce9be4e3cdd4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -58,7 +58,7 @@ object NaiveBayesSuite { for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) val xi = dataModel match { - case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) {j => + case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) { j => if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0 } case NaiveBayes.Multinomial => @@ -118,11 +118,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, - theta, - nPoints, - 42, - NaiveBayes.Multinomial) + pi, theta, nPoints, 42, NaiveBayes.Multinomial) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -130,11 +126,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, - theta, - nPoints, - 17, - NaiveBayes.Multinomial) + pi, theta, nPoints, 17, NaiveBayes.Multinomial) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. From bea62af37fdf389474474d80fdac3c94f6a8808f Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 12 Mar 2015 11:10:16 -0700 Subject: [PATCH 17/23] put back in constructor for NaiveBayes --- .../spark/mllib/classification/NaiveBayes.scala | 2 ++ .../spark/mllib/classification/NaiveBayesSuite.scala | 12 ++---------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index c7cb86fa30f0b..70c7ff305541d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -186,6 +186,8 @@ class NaiveBayes private ( private var lambda: Double, private var modelType: NaiveBayes.ModelType) extends Serializable with Logging { + private def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) + def this() = this(1.0, NaiveBayes.Multinomial) /** Set the smoothing parameter. Default: 1.0. */ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 7ce9be4e3cdd4..c296a4b7b4627 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -146,11 +146,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, - theta, - nPoints, - 45, - NaiveBayes.Bernoulli) + pi, theta, nPoints, 45, NaiveBayes.Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -158,11 +154,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, - theta, - nPoints, - 20, - NaiveBayes.Bernoulli) + pi, theta, nPoints, 20, NaiveBayes.Bernoulli) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. From 18f32199dd0e526ccab7d2a0259dd5503be35ff3 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Thu, 12 Mar 2015 12:42:45 -0700 Subject: [PATCH 18/23] removed private from naive bayes constructor for lambda only --- .../org/apache/spark/mllib/classification/NaiveBayes.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 70c7ff305541d..d2a6009c43edb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -186,7 +186,7 @@ class NaiveBayes private ( private var lambda: Double, private var modelType: NaiveBayes.ModelType) extends Serializable with Logging { - private def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) + def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) def this() = this(1.0, NaiveBayes.Multinomial) From a22d6703d4888574b379f5f1929de8292ce2aec2 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Mon, 16 Mar 2015 18:46:05 -0700 Subject: [PATCH 19/23] changed NaiveBayesModel modelType parameter back to NaiveBayes.ModelType, made NaiveBayes.ModelType serializable, fixed getter method in NavieBayes --- .../mllib/classification/NaiveBayes.scala | 24 +++++++++---------- .../classification/NaiveBayesSuite.scala | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index d2a6009c43edb..0997d0f81b4b8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -46,11 +46,11 @@ class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], val theta: Array[Array[Double]], - val modelType: String) + val modelType: NaiveBayes.ModelType) extends ClassificationModel with Serializable with Saveable { private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = - this(labels, pi, theta, NaiveBayes.Multinomial.toString) + this(labels, pi, theta, NaiveBayes.Multinomial) private val brzPi = new BDV[Double](pi) private val brzTheta = new BDM(theta(0).length, theta.length, theta.flatten).t @@ -58,7 +58,7 @@ class NaiveBayesModel private[mllib] ( // Bernoulli scoring requires log(condprob) if 1, log(1-condprob) if 0. // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra // application of this condition (in predict function). - private val (brzNegTheta, brzNegThetaSum) = NaiveBayes.ModelType.fromString(modelType) match { + private val (brzNegTheta, brzNegThetaSum) = modelType match { case NaiveBayes.Multinomial => (None, None) case NaiveBayes.Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) @@ -74,7 +74,7 @@ class NaiveBayesModel private[mllib] ( } override def predict(testData: Vector): Double = { - NaiveBayes.ModelType.fromString(modelType) match { + modelType match { case NaiveBayes.Multinomial => labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) case NaiveBayes.Bernoulli => @@ -84,7 +84,7 @@ class NaiveBayesModel private[mllib] ( } override def save(sc: SparkContext, path: String): Unit = { - val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType) + val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType.toString) NaiveBayesModel.SaveLoadV1_0.save(sc, path, data) } @@ -137,15 +137,15 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val labels = data.getAs[Seq[Double]](0).toArray val pi = data.getAs[Seq[Double]](1).toArray val theta = data.getAs[Seq[Seq[Double]]](2).map(_.toArray).toArray - val modelType = NaiveBayes.ModelType.fromString(data.getString(3)).toString + val modelType = NaiveBayes.ModelType.fromString(data.getString(3)) new NaiveBayesModel(labels, pi, theta, modelType) } } override def load(sc: SparkContext, path: String): NaiveBayesModel = { - def getModelType(metadata: JValue): String = { + def getModelType(metadata: JValue): NaiveBayes.ModelType = { implicit val formats = DefaultFormats - NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]).toString + NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]) } val (loadedClassName, version, metadata) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName @@ -202,7 +202,7 @@ class NaiveBayes private ( this } - def getModelType(): NaiveBayes.ModelType = this.modelType + def getModelType: NaiveBayes.ModelType = this.modelType /** * Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries. @@ -266,7 +266,7 @@ class NaiveBayes private ( i += 1 } - new NaiveBayesModel(labels, pi, theta, modelType.toString) + new NaiveBayesModel(labels, pi, theta, modelType) } } @@ -328,9 +328,9 @@ object NaiveBayes { } /** Provides static methods for using ModelType. */ - sealed abstract class ModelType + sealed abstract class ModelType extends Serializable - object MODELTYPE { + object MODELTYPE extends Serializable{ final val MULTINOMIAL_STRING = "multinomial" final val BERNOULLI_STRING = "bernoulli" diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index c296a4b7b4627..03470a2c4b278 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -76,7 +76,7 @@ object NaiveBayesSuite { /** Binary labels, 3 features */ private val binaryModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), - theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli.toString) + theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli) } class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { From 6a8f3830e25fdad24e280ae68fb83ba522d23c1b Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Sun, 22 Mar 2015 14:14:20 -0700 Subject: [PATCH 20/23] Added new model save/load format 2.0 for NaiveBayesModel after modelType parameter was added. Updated tests. Also updated ModelType enum-like type. --- .../mllib/classification/NaiveBayes.scala | 180 ++++++++++++++---- .../classification/JavaNaiveBayesSuite.java | 23 ++- .../classification/NaiveBayesSuite.scala | 70 +++++-- 3 files changed, 208 insertions(+), 65 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index b69af44a35b2a..0d2be054ac72a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -35,6 +35,8 @@ import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} +import NaiveBayes.ModelType.{Bernoulli, Multinomial} + /** * Model for Naive Bayes Classifiers. @@ -54,7 +56,7 @@ class NaiveBayesModel private[mllib] ( extends ClassificationModel with Serializable with Saveable { private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = - this(labels, pi, theta, NaiveBayes.Multinomial) + this(labels, pi, theta, Multinomial) /** A Java-friendly constructor that takes three Iterable parameters. */ private[mllib] def this( @@ -70,10 +72,13 @@ class NaiveBayesModel private[mllib] ( // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra // application of this condition (in predict function). private val (brzNegTheta, brzNegThetaSum) = modelType match { - case NaiveBayes.Multinomial => (None, None) - case NaiveBayes.Bernoulli => + case Multinomial => (None, None) + case Bernoulli => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) (Option(negTheta), Option(brzSum(negTheta, Axis._1))) + case _ => + // This should never happen. + throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType") } override def predict(testData: RDD[Vector]): RDD[Double] = { @@ -86,29 +91,32 @@ class NaiveBayesModel private[mllib] ( override def predict(testData: Vector): Double = { modelType match { - case NaiveBayes.Multinomial => + case Multinomial => labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) - case NaiveBayes.Bernoulli => + case Bernoulli => labels (brzArgmax (brzPi + (brzTheta - brzNegTheta.get) * testData.toBreeze + brzNegThetaSum.get)) + case _ => + // This should never happen. + throw new UnknownError(s"NaiveBayesModel was created with an unknown ModelType: $modelType") } } override def save(sc: SparkContext, path: String): Unit = { - val data = NaiveBayesModel.SaveLoadV1_0.Data(labels, pi, theta, modelType.toString) - NaiveBayesModel.SaveLoadV1_0.save(sc, path, data) + val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType.toString) + NaiveBayesModel.SaveLoadV2_0.save(sc, path, data) } - override protected def formatVersion: String = "1.0" + override protected def formatVersion: String = "2.0" } object NaiveBayesModel extends Loader[NaiveBayesModel] { import org.apache.spark.mllib.util.Loader._ - private object SaveLoadV1_0 { + private[mllib] object SaveLoadV2_0 { - def thisFormatVersion: String = "1.0" + def thisFormatVersion: String = "2.0" /** Hard-code class name string in case it changes in the future */ def thisClassName: String = "org.apache.spark.mllib.classification.NaiveBayesModel" @@ -127,8 +135,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { // Create JSON metadata. val metadata = compact(render( ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ - ("numFeatures" -> data.theta(0).length) ~ ("numClasses" -> data.pi.length) ~ - ("modelType" -> data.modelType))) + ("numFeatures" -> data.theta(0).length) ~ ("numClasses" -> data.pi.length))) sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) // Create Parquet data. @@ -151,36 +158,82 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val modelType = NaiveBayes.ModelType.fromString(data.getString(3)) new NaiveBayesModel(labels, pi, theta, modelType) } + } - override def load(sc: SparkContext, path: String): NaiveBayesModel = { - def getModelType(metadata: JValue): NaiveBayes.ModelType = { - implicit val formats = DefaultFormats - NaiveBayes.ModelType.fromString((metadata \ "modelType").extract[String]) + private[mllib] object SaveLoadV1_0 { + + def thisFormatVersion: String = "1.0" + + /** Hard-code class name string in case it changes in the future */ + def thisClassName: String = "org.apache.spark.mllib.classification.NaiveBayesModel" + + /** Model data for model import/export */ + case class Data( + labels: Array[Double], + pi: Array[Double], + theta: Array[Array[Double]]) + + def save(sc: SparkContext, path: String, data: Data): Unit = { + val sqlContext = new SQLContext(sc) + import sqlContext.implicits._ + + // Create JSON metadata. + val metadata = compact(render( + ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ + ("numFeatures" -> data.theta(0).length) ~ ("numClasses" -> data.pi.length))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(metadataPath(path)) + + // Create Parquet data. + val dataRDD: DataFrame = sc.parallelize(Seq(data), 1).toDF() + dataRDD.saveAsParquetFile(dataPath(path)) + } + + def load(sc: SparkContext, path: String): NaiveBayesModel = { + val sqlContext = new SQLContext(sc) + // Load Parquet data. + val dataRDD = sqlContext.parquetFile(dataPath(path)) + // Check schema explicitly since erasure makes it hard to use match-case for checking. + checkSchema[Data](dataRDD.schema) + val dataArray = dataRDD.select("labels", "pi", "theta").take(1) + assert(dataArray.size == 1, s"Unable to load NaiveBayesModel data from: ${dataPath(path)}") + val data = dataArray(0) + val labels = data.getAs[Seq[Double]](0).toArray + val pi = data.getAs[Seq[Double]](1).toArray + val theta = data.getAs[Seq[Seq[Double]]](2).map(_.toArray).toArray + new NaiveBayesModel(labels, pi, theta) } + } + + override def load(sc: SparkContext, path: String): NaiveBayesModel = { val (loadedClassName, version, metadata) = loadMetadata(sc, path) val classNameV1_0 = SaveLoadV1_0.thisClassName - (loadedClassName, version) match { + val classNameV2_0 = SaveLoadV2_0.thisClassName + val (model, numFeatures, numClasses) = (loadedClassName, version) match { case (className, "1.0") if className == classNameV1_0 => val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata) val model = SaveLoadV1_0.load(sc, path) - assert(model.pi.size == numClasses, - s"NaiveBayesModel.load expected $numClasses classes," + - s" but class priors vector pi had ${model.pi.size} elements") - assert(model.theta.size == numClasses, - s"NaiveBayesModel.load expected $numClasses classes," + - s" but class conditionals array theta had ${model.theta.size} elements") - assert(model.theta.forall(_.size == numFeatures), - s"NaiveBayesModel.load expected $numFeatures features," + - s" but class conditionals array theta had elements of size:" + - s" ${model.theta.map(_.size).mkString(",")}") - assert(model.modelType == getModelType(metadata)) - model + (model, numFeatures, numClasses) + case (className, "2.0") if className == classNameV2_0 => + val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata) + val model = SaveLoadV2_0.load(sc, path) + (model, numFeatures, numClasses) case _ => throw new Exception( s"NaiveBayesModel.load did not recognize model with (className, format version):" + s"($loadedClassName, $version). Supported:\n" + s" ($classNameV1_0, 1.0)") } + assert(model.pi.size == numClasses, + s"NaiveBayesModel.load expected $numClasses classes," + + s" but class priors vector pi had ${model.pi.size} elements") + assert(model.theta.size == numClasses, + s"NaiveBayesModel.load expected $numClasses classes," + + s" but class conditionals array theta had ${model.theta.size} elements") + assert(model.theta.forall(_.size == numFeatures), + s"NaiveBayesModel.load expected $numFeatures features," + + s" but class conditionals array theta had elements of size:" + + s" ${model.theta.map(_.size).mkString(",")}") + model } } @@ -197,9 +250,9 @@ class NaiveBayes private ( private var lambda: Double, private var modelType: NaiveBayes.ModelType) extends Serializable with Logging { - def this(lambda: Double) = this(lambda, NaiveBayes.Multinomial) + def this(lambda: Double) = this(lambda, Multinomial) - def this() = this(1.0, NaiveBayes.Multinomial) + def this() = this(1.0, Multinomial) /** Set the smoothing parameter. Default: 1.0. */ def setLambda(lambda: Double): NaiveBayes = { @@ -210,9 +263,22 @@ class NaiveBayes private ( /** Get the smoothing parameter. */ def getLambda: Double = lambda - /** Set the model type. Default: Multinomial. */ - def setModelType(model: NaiveBayes.ModelType): NaiveBayes = { - this.modelType = model + /** + * Set the model type using a string (case-insensitive). + * Supported options: "multinomial" and "bernoulli". + * (default: multinomial) + */ + def setModelType(modelType: String): NaiveBayes = { + setModelType(NaiveBayes.ModelType.fromString(modelType)) + } + + /** + * Set the model type. + * Supported options: [[NaiveBayes.ModelType.Bernoulli]], [[NaiveBayes.ModelType.Multinomial]] + * (default: Multinomial) + */ + def setModelType(modelType: NaiveBayes.ModelType): NaiveBayes = { + this.modelType = modelType this } @@ -270,8 +336,11 @@ class NaiveBayes private ( labels(i) = label pi(i) = math.log(n + lambda) - piLogDenom val thetaLogDenom = modelType match { - case NaiveBayes.Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) - case NaiveBayes.Bernoulli => math.log(n + 2.0 * lambda) + case Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) + case Bernoulli => math.log(n + 2.0 * lambda) + case _ => + // This should never happen. + throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType") } var j = 0 while (j < numFeatures) { @@ -317,7 +386,7 @@ object NaiveBayes { * @param lambda The smoothing parameter */ def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { - new NaiveBayes(lambda, NaiveBayes.Multinomial).run(input) + new NaiveBayes(lambda, NaiveBayes.ModelType.Multinomial).run(input) } /** @@ -339,12 +408,45 @@ object NaiveBayes { * multinomial or bernoulli */ def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { - new NaiveBayes(lambda, MODELTYPE.fromString(modelType)).run(input) + new NaiveBayes(lambda, ModelType.fromString(modelType)).run(input) } /** Provides static methods for using ModelType. */ sealed abstract class ModelType extends Serializable + object ModelType extends Serializable { + + /** + * Get the model type from a string. + * @param modelType Supported: "multinomial" or "bernoulli" (case-insensitive) + */ + def fromString(modelType: String): ModelType = modelType.toLowerCase match { + case "multinomial" => Multinomial + case "bernoulli" => Bernoulli + case _ => + throw new IllegalArgumentException( + s"NaiveBayes.ModelType.fromString did not recognize string: $modelType") + } + + final val Multinomial: ModelType = { + case object Multinomial extends ModelType with Serializable { + override def toString: String = "multinomial" + } + Multinomial + } + + final val Bernoulli: ModelType = { + case object Bernoulli extends ModelType with Serializable { + override def toString: String = "bernoulli" + } + Bernoulli + } + } + + /** Java-friendly accessor for supported ModelType options */ + final val modelTypes = ModelType + + /* object MODELTYPE extends Serializable{ final val MULTINOMIAL_STRING = "multinomial" final val BERNOULLI_STRING = "bernoulli" @@ -368,6 +470,6 @@ object NaiveBayes { final val Bernoulli: ModelType = new ModelType { override def toString: String = ModelType.BERNOULLI_STRING } - + */ } diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java index 1c90522a0714a..4d89c06b88c0e 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java @@ -17,20 +17,22 @@ package org.apache.spark.mllib.classification; +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.mllib.linalg.Vector; import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.mllib.regression.LabeledPoint; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; public class JavaNaiveBayesSuite implements Serializable { private transient JavaSparkContext sc; @@ -102,4 +104,11 @@ public Vector call(LabeledPoint v) throws Exception { // Should be able to get the first prediction. predictions.first(); } + + @Test + public void testModelTypeSetters() { + NaiveBayes nb = new NaiveBayes() + .setModelType(NaiveBayes.modelTypes().Bernoulli()) + .setModelType(NaiveBayes.modelTypes().Multinomial()); + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 8b795e411817c..2d87d6893250b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.mllib.classification -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} -import breeze.stats.distributions.Multinomial - import scala.util.Random +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argmax => brzArgmax, sum => brzSum, Axis} +import breeze.stats.distributions.{Multinomial => BrzMultinomial} + import org.scalatest.FunSuite import org.apache.spark.SparkException +import org.apache.spark.mllib.classification.NaiveBayes.ModelType.{Bernoulli, Multinomial} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} @@ -48,7 +49,7 @@ object NaiveBayesSuite { theta: Array[Array[Double]], // CXD nPoints: Int, seed: Int, - dataModel: NaiveBayes.ModelType = NaiveBayes.Multinomial, + modelType: NaiveBayes.ModelType = Multinomial, sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) @@ -57,26 +58,35 @@ object NaiveBayesSuite { for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) - val xi = dataModel match { - case NaiveBayes.Bernoulli => Array.tabulate[Double] (D) { j => + val xi = modelType match { + case Bernoulli => Array.tabulate[Double] (D) { j => if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0 } - case NaiveBayes.Multinomial => - val mult = Multinomial(BDV(_theta(y))) + case Multinomial => + val mult = BrzMultinomial(BDV(_theta(y))) val emptyMap = (0 until D).map(x => (x, 0.0)).toMap val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map { case (index, reps) => (index, reps.size.toDouble) } counts.toArray.sortBy(_._1).map(_._2) + case _ => + // This should never happen. + throw new UnknownError(s"NaiveBayesSuite found unknown ModelType: $modelType") } LabeledPoint(y, Vectors.dense(xi)) } } - /** Binary labels, 3 features */ - private val binaryModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), - theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), NaiveBayes.Bernoulli) + /** Bernoulli NaiveBayes with binary labels, 3 features */ + private val binaryBernoulliModel = new NaiveBayesModel(labels = Array(0.0, 1.0), + pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), + Bernoulli) + + /** Multinomial NaiveBayes with binary labels, 3 features */ + private val binaryMultinomialModel = new NaiveBayesModel(labels = Array(0.0, 1.0), + pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), + Multinomial) } class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { @@ -126,7 +136,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 42, NaiveBayes.Multinomial) + pi, theta, nPoints, 42, Multinomial) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -134,7 +144,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 17, NaiveBayes.Multinomial) + pi, theta, nPoints, 17, Multinomial) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -154,7 +164,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 45, NaiveBayes.Bernoulli) + pi, theta, nPoints, 45, Bernoulli) val testRDD = sc.parallelize(testData, 2) testRDD.cache() @@ -162,7 +172,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 20, NaiveBayes.Bernoulli) + pi, theta, nPoints, 20, Bernoulli) val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -199,19 +209,41 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { } } - test("model save/load") { - val model = NaiveBayesSuite.binaryModel + test("model save/load: 2.0 to 2.0") { + val tempDir = Utils.createTempDir() + val path = tempDir.toURI.toString + + Seq(NaiveBayesSuite.binaryBernoulliModel, NaiveBayesSuite.binaryMultinomialModel).map { + model => + // Save model, load it back, and compare. + try { + model.save(sc, path) + val sameModel = NaiveBayesModel.load(sc, path) + assert(model.labels === sameModel.labels) + assert(model.pi === sameModel.pi) + assert(model.theta === sameModel.theta) + assert(model.modelType === sameModel.modelType) + } finally { + Utils.deleteRecursively(tempDir) + } + } + } + + test("model save/load: 1.0 to 2.0") { + val model = NaiveBayesSuite.binaryMultinomialModel val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString - // Save model, load it back, and compare. + // Save model as version 1.0, load it back, and compare. try { - model.save(sc, path) + val data = NaiveBayesModel.SaveLoadV1_0.Data(model.labels, model.pi, model.theta) + NaiveBayesModel.SaveLoadV1_0.save(sc, path, data) val sameModel = NaiveBayesModel.load(sc, path) assert(model.labels === sameModel.labels) assert(model.pi === sameModel.pi) assert(model.theta === sameModel.theta) + assert(model.modelType === NaiveBayes.ModelType.Multinomial) } finally { Utils.deleteRecursively(tempDir) } From 9ad89ca860a7c653f46c5aad5535889a7ad4dcf4 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Sun, 22 Mar 2015 14:21:00 -0700 Subject: [PATCH 21/23] removed old code --- .../mllib/classification/NaiveBayes.scala | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 0d2be054ac72a..cacea5b1eced6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -446,30 +446,4 @@ object NaiveBayes { /** Java-friendly accessor for supported ModelType options */ final val modelTypes = ModelType - /* - object MODELTYPE extends Serializable{ - final val MULTINOMIAL_STRING = "multinomial" - final val BERNOULLI_STRING = "bernoulli" - - def fromString(modelType: String): ModelType = modelType match { - case MULTINOMIAL_STRING => Multinomial - case BERNOULLI_STRING => Bernoulli - case _ => - throw new IllegalArgumentException(s"Cannot recognize NaiveBayes ModelType: $modelType") - } - } - - final val ModelType = MODELTYPE - - /** Constant for specifying ModelType parameter: multinomial model */ - final val Multinomial: ModelType = new ModelType { - override def toString: String = ModelType.MULTINOMIAL_STRING - } - - /** Constant for specifying ModelType parameter: bernoulli model */ - final val Bernoulli: ModelType = new ModelType { - override def toString: String = ModelType.BERNOULLI_STRING - } - */ } - From acb69af01bd063349fc071d2a5d9c1547d71c331 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Fri, 27 Mar 2015 20:18:52 -0700 Subject: [PATCH 22/23] removed enum type and replaces all modelType parameters with strings --- .../mllib/classification/NaiveBayes.scala | 101 ++++++------------ .../classification/JavaNaiveBayesSuite.java | 4 +- .../classification/NaiveBayesSuite.scala | 25 +++-- 3 files changed, 47 insertions(+), 83 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index cacea5b1eced6..6fc88754e4a17 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -35,8 +35,6 @@ import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} -import NaiveBayes.ModelType.{Bernoulli, Multinomial} - /** * Model for Naive Bayes Classifiers. @@ -45,18 +43,17 @@ import NaiveBayes.ModelType.{Bernoulli, Multinomial} * @param pi log of class priors, whose dimension is C, number of labels * @param theta log of class conditional probabilities, whose dimension is C-by-D, * where D is number of features - * @param modelType The type of NB model to fit from the enumeration NaiveBayesModels, can be - * Multinomial or Bernoulli + * @param modelType The type of NB model to fit can be "Multinomial" or "Bernoulli" */ class NaiveBayesModel private[mllib] ( val labels: Array[Double], val pi: Array[Double], val theta: Array[Array[Double]], - val modelType: NaiveBayes.ModelType) + val modelType: String) extends ClassificationModel with Serializable with Saveable { private[mllib] def this(labels: Array[Double], pi: Array[Double], theta: Array[Array[Double]]) = - this(labels, pi, theta, Multinomial) + this(labels, pi, theta, "Multinomial") /** A Java-friendly constructor that takes three Iterable parameters. */ private[mllib] def this( @@ -72,8 +69,8 @@ class NaiveBayesModel private[mllib] ( // This precomputes log(1.0 - exp(theta)) and its sum which are used for the linear algebra // application of this condition (in predict function). private val (brzNegTheta, brzNegThetaSum) = modelType match { - case Multinomial => (None, None) - case Bernoulli => + case "Multinomial" => (None, None) + case "Bernoulli" => val negTheta = brzLog((brzExp(brzTheta.copy) :*= (-1.0)) :+= 1.0) // log(1.0 - exp(x)) (Option(negTheta), Option(brzSum(negTheta, Axis._1))) case _ => @@ -91,9 +88,9 @@ class NaiveBayesModel private[mllib] ( override def predict(testData: Vector): Double = { modelType match { - case Multinomial => + case "Multinomial" => labels (brzArgmax (brzPi + brzTheta * testData.toBreeze) ) - case Bernoulli => + case "Bernoulli" => labels (brzArgmax (brzPi + (brzTheta - brzNegTheta.get) * testData.toBreeze + brzNegThetaSum.get)) case _ => @@ -103,7 +100,7 @@ class NaiveBayesModel private[mllib] ( } override def save(sc: SparkContext, path: String): Unit = { - val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType.toString) + val data = NaiveBayesModel.SaveLoadV2_0.Data(labels, pi, theta, modelType) NaiveBayesModel.SaveLoadV2_0.save(sc, path, data) } @@ -155,7 +152,7 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { val labels = data.getAs[Seq[Double]](0).toArray val pi = data.getAs[Seq[Double]](1).toArray val theta = data.getAs[Seq[Seq[Double]]](2).map(_.toArray).toArray - val modelType = NaiveBayes.ModelType.fromString(data.getString(3)) + val modelType = data.getString(3) new NaiveBayesModel(labels, pi, theta, modelType) } @@ -248,11 +245,11 @@ object NaiveBayesModel extends Loader[NaiveBayesModel] { class NaiveBayes private ( private var lambda: Double, - private var modelType: NaiveBayes.ModelType) extends Serializable with Logging { + private var modelType: String) extends Serializable with Logging { - def this(lambda: Double) = this(lambda, Multinomial) + def this(lambda: Double) = this(lambda, "Multinomial") - def this() = this(1.0, Multinomial) + def this() = this(1.0, "Multinomial") /** Set the smoothing parameter. Default: 1.0. */ def setLambda(lambda: Double): NaiveBayes = { @@ -264,26 +261,21 @@ class NaiveBayes private ( def getLambda: Double = lambda /** - * Set the model type using a string (case-insensitive). - * Supported options: "multinomial" and "bernoulli". - * (default: multinomial) - */ - def setModelType(modelType: String): NaiveBayes = { - setModelType(NaiveBayes.ModelType.fromString(modelType)) - } - - /** - * Set the model type. - * Supported options: [[NaiveBayes.ModelType.Bernoulli]], [[NaiveBayes.ModelType.Multinomial]] + * Set the model type using a string (case-sensitive). + * Supported options: "Multinomial" and "Bernoulli". * (default: Multinomial) */ - def setModelType(modelType: NaiveBayes.ModelType): NaiveBayes = { - this.modelType = modelType - this + def setModelType(modelType:String): NaiveBayes = { + if (NaiveBayes.supportedModelTypes.contains(modelType)) { + this.modelType = modelType + this + } else { + throw new UnknownError(s"NaiveBayesModel does not support ModelType: $modelType") + } } /** Get the model type. */ - def getModelType: NaiveBayes.ModelType = this.modelType + def getModelType: String = this.modelType /** * Run the algorithm with the configured parameters on an input RDD of LabeledPoint entries. @@ -336,8 +328,8 @@ class NaiveBayes private ( labels(i) = label pi(i) = math.log(n + lambda) - piLogDenom val thetaLogDenom = modelType match { - case Multinomial => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) - case Bernoulli => math.log(n + 2.0 * lambda) + case "Multinomial" => math.log(brzSum(sumTermFreqs) + numFeatures * lambda) + case "Bernoulli" => math.log(n + 2.0 * lambda) case _ => // This should never happen. throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType") @@ -358,6 +350,10 @@ class NaiveBayes private ( * Top-level methods for calling naive Bayes. */ object NaiveBayes { + + /* Set of modelTypes that NaiveBayes supports */ + private[mllib] val supportedModelTypes = Set("Multinomial", "Bernoulli") + /** * Trains a Naive Bayes model given an RDD of `(label, features)` pairs. * @@ -386,7 +382,7 @@ object NaiveBayes { * @param lambda The smoothing parameter */ def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = { - new NaiveBayes(lambda, NaiveBayes.ModelType.Multinomial).run(input) + new NaiveBayes(lambda, "Multinomial").run(input) } /** @@ -408,42 +404,11 @@ object NaiveBayes { * multinomial or bernoulli */ def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { - new NaiveBayes(lambda, ModelType.fromString(modelType)).run(input) - } - - /** Provides static methods for using ModelType. */ - sealed abstract class ModelType extends Serializable - - object ModelType extends Serializable { - - /** - * Get the model type from a string. - * @param modelType Supported: "multinomial" or "bernoulli" (case-insensitive) - */ - def fromString(modelType: String): ModelType = modelType.toLowerCase match { - case "multinomial" => Multinomial - case "bernoulli" => Bernoulli - case _ => - throw new IllegalArgumentException( - s"NaiveBayes.ModelType.fromString did not recognize string: $modelType") - } - - final val Multinomial: ModelType = { - case object Multinomial extends ModelType with Serializable { - override def toString: String = "multinomial" - } - Multinomial - } - - final val Bernoulli: ModelType = { - case object Bernoulli extends ModelType with Serializable { - override def toString: String = "bernoulli" - } - Bernoulli + if (supportedModelTypes.contains(modelType)) { + new NaiveBayes(lambda, modelType).run(input) + } else { + throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType") } } - /** Java-friendly accessor for supported ModelType options */ - final val modelTypes = ModelType - } diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java index 4d89c06b88c0e..71fb7f13c39c2 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java @@ -108,7 +108,7 @@ public Vector call(LabeledPoint v) throws Exception { @Test public void testModelTypeSetters() { NaiveBayes nb = new NaiveBayes() - .setModelType(NaiveBayes.modelTypes().Bernoulli()) - .setModelType(NaiveBayes.modelTypes().Multinomial()); + .setModelType("Bernoulli") + .setModelType("Multinomial"); } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala index 2d87d6893250b..f9fe3e006ccb8 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/NaiveBayesSuite.scala @@ -25,7 +25,6 @@ import breeze.stats.distributions.{Multinomial => BrzMultinomial} import org.scalatest.FunSuite import org.apache.spark.SparkException -import org.apache.spark.mllib.classification.NaiveBayes.ModelType.{Bernoulli, Multinomial} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} @@ -49,7 +48,7 @@ object NaiveBayesSuite { theta: Array[Array[Double]], // CXD nPoints: Int, seed: Int, - modelType: NaiveBayes.ModelType = Multinomial, + modelType: String = "Multinomial", sample: Int = 10): Seq[LabeledPoint] = { val D = theta(0).length val rnd = new Random(seed) @@ -59,10 +58,10 @@ object NaiveBayesSuite { for (i <- 0 until nPoints) yield { val y = calcLabel(rnd.nextDouble(), _pi) val xi = modelType match { - case Bernoulli => Array.tabulate[Double] (D) { j => + case "Bernoulli" => Array.tabulate[Double] (D) { j => if (rnd.nextDouble () < _theta(y)(j) ) 1 else 0 } - case Multinomial => + case "Multinomial" => val mult = BrzMultinomial(BDV(_theta(y))) val emptyMap = (0 until D).map(x => (x, 0.0)).toMap val counts = emptyMap ++ mult.sample(sample).groupBy(x => x).map { @@ -81,12 +80,12 @@ object NaiveBayesSuite { /** Bernoulli NaiveBayes with binary labels, 3 features */ private val binaryBernoulliModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), - Bernoulli) + "Bernoulli") /** Multinomial NaiveBayes with binary labels, 3 features */ private val binaryMultinomialModel = new NaiveBayesModel(labels = Array(0.0, 1.0), pi = Array(0.2, 0.8), theta = Array(Array(0.1, 0.3, 0.6), Array(0.2, 0.4, 0.4)), - Multinomial) + "Multinomial") } class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { @@ -136,15 +135,15 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 42, Multinomial) + pi, theta, nPoints, 42, "Multinomial") val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, "multinomial") + val model = NaiveBayes.train(testRDD, 1.0, "Multinomial") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 17, Multinomial) + pi, theta, nPoints, 17, "Multinomial") val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -164,15 +163,15 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { ).map(_.map(math.log)) val testData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 45, Bernoulli) + pi, theta, nPoints, 45, "Bernoulli") val testRDD = sc.parallelize(testData, 2) testRDD.cache() - val model = NaiveBayes.train(testRDD, 1.0, "bernoulli") + val model = NaiveBayes.train(testRDD, 1.0, "Bernoulli") validateModelFit(pi, theta, model) val validationData = NaiveBayesSuite.generateNaiveBayesInput( - pi, theta, nPoints, 20, Bernoulli) + pi, theta, nPoints, 20, "Bernoulli") val validationRDD = sc.parallelize(validationData, 2) // Test prediction on RDD. @@ -243,7 +242,7 @@ class NaiveBayesSuite extends FunSuite with MLlibTestSparkContext { assert(model.labels === sameModel.labels) assert(model.pi === sameModel.pi) assert(model.theta === sameModel.theta) - assert(model.modelType === NaiveBayes.ModelType.Multinomial) + assert(model.modelType === "Multinomial") } finally { Utils.deleteRecursively(tempDir) } From f3c89944e6410103deed673fa0914947e2e8b0f9 Mon Sep 17 00:00:00 2001 From: leahmcguire Date: Mon, 30 Mar 2015 18:29:54 -0700 Subject: [PATCH 23/23] changed checks on model type to requires --- .../mllib/classification/NaiveBayes.scala | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala index 6fc88754e4a17..c9b3ff0172e2e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/NaiveBayes.scala @@ -266,12 +266,10 @@ class NaiveBayes private ( * (default: Multinomial) */ def setModelType(modelType:String): NaiveBayes = { - if (NaiveBayes.supportedModelTypes.contains(modelType)) { - this.modelType = modelType - this - } else { - throw new UnknownError(s"NaiveBayesModel does not support ModelType: $modelType") - } + require(NaiveBayes.supportedModelTypes.contains(modelType), + s"NaiveBayes was created with an unknown ModelType: $modelType") + this.modelType = modelType + this } /** Get the model type. */ @@ -404,11 +402,9 @@ object NaiveBayes { * multinomial or bernoulli */ def train(input: RDD[LabeledPoint], lambda: Double, modelType: String): NaiveBayesModel = { - if (supportedModelTypes.contains(modelType)) { - new NaiveBayes(lambda, modelType).run(input) - } else { - throw new UnknownError(s"NaiveBayes was created with an unknown ModelType: $modelType") - } + require(supportedModelTypes.contains(modelType), + s"NaiveBayes was created with an unknown ModelType: $modelType") + new NaiveBayes(lambda, modelType).run(input) } }