Add Scala Naive Bayes example, to use existing example data file (whose format needed a tweak)
pdeyhim · May 6, 2014 · 23c9ac3 · 23c9ac3
1 parent 8c81982
commit 23c9ac3
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 10 deletions.
diff --git a/docs/mllib-naive-bayes.md b/docs/mllib-naive-bayes.md
@@ -36,9 +36,18 @@ can be used for evaluation and prediction.
 
 {% highlight scala %}
 import org.apache.spark.mllib.classification.NaiveBayes
-
-val training: RDD[LabeledPoint] = ... // training set
-val test: RDD[LabeledPoint] = ... // test set
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.mllib.regression.LabeledPoint
+
+val data = sc.textFile("mllib/data/sample_naive_bayes_data.txt")
+val parsedData = data.map { line =>
+  val parts = line.split(',')
+  LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
+}
+// Split data into training (60%) and test (40%).
+val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
+val training = splits(0)
+val test = splits(1)
 
 val model = NaiveBayes.train(training, lambda = 1.0)
 val prediction = model.predict(test.map(_.features))
@@ -69,7 +78,7 @@ import scala.Tuple2;
 JavaRDD<LabeledPoint> training = ... // training set
 JavaRDD<LabeledPoint> test = ... // test set
 
-NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
+final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
 
 JavaRDD<Double> prediction =
   test.map(new Function<LabeledPoint, Double>() {

diff --git a/mllib/data/sample_naive_bayes_data.txt b/mllib/data/sample_naive_bayes_data.txt
@@ -1,6 +1,6 @@
-0, 1 0 0
-0, 2 0 0
-1, 0 1 0
-1, 0 2 0
-2, 0 0 1
-2, 0 0 2
+0,1 0 0
+0,2 0 0
+1,0 1 0
+1,0 2 0
+2,0 0 1
+2,0 0 2
-Original file line number
+Diff line change
@@ -1,6 +1,6 @@
-0, 1 0 0
-0, 2 0 0
-1, 0 1 0
-1, 0 2 0
-2, 0 0 1
-2, 0 0 2
+0,1 0 0
+0,2 0 0
+1,0 1 0
+1,0 2 0
+2,0 0 1
+2,0 0 2