From 1be9892cb03bff8150df07a08877a3b7f3879caa Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 30 Dec 2014 13:41:26 -0800 Subject: [PATCH] =?UTF-8?q?*=20Made=20ProbabilisticClassificationModel=20i?= =?UTF-8?q?nto=20a=20subclass=20of=20ClassificationModel.=20=20Also=20intr?= =?UTF-8?q?oduced=20ProbabilisticClassifier.=20=20*=20This=20was=20to=20su?= =?UTF-8?q?pport=20output=20column=20=E2=80=9CprobabilityCol=E2=80=9D=20in?= =?UTF-8?q?=20transform().?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * SPARK-4942 : ML Transformers should allow output cols to be turned on,off * Update validateAndTransformSchema * Update transform * Update based on design review * Make prediction API protected, but add output columns * Remove training API * LogisticRegression: * Changed output column “score” to “probability” in logreg. * I also implemented transform() to avoid repeated computation. This improves upon the default implementation in ProbabilisticClassificationModel. However, it’s a lot of code, so I would be fine with removing it. There is also a question of whether all algorithms should implement a method which would allow the ProbabilisticClassificationModel.transform implementation to avoid repeated computation: * protected def raw2prob(rawPredictions: Vector): Vector = // compute probabilities from raw predictions * trait Params: * Changed set() and get() from private[ml] to protected. This was needed for the example of defining a class from outside of the MLlib namespace. * VectorUDT: Changed from private[spark] to public. This is needed for outside users to write their own validateAndTransformSchema() methods using vectors. * Add example of defining class from outside of the MLlib namespace. * Scala --- .../ml/JavaCrossValidatorExample.java | 4 +- .../examples/ml/JavaSimpleParamsExample.java | 6 +- .../JavaSimpleTextClassificationPipeline.java | 4 +- .../examples/ml/CrossValidatorExample.scala | 7 +- .../examples/ml/DeveloperApiExample.scala | 197 ++++++++++++++++++ .../examples/ml/SimpleParamsExample.scala | 12 +- .../ml/SimpleTextClassificationPipeline.scala | 7 +- .../org/apache/spark/ml/LabeledPoint.scala | 52 ----- .../spark/ml/classification/Classifier.scala | 161 ++++++++++++-- .../classification/LogisticRegression.scala | 145 +++++++------ .../ProbabilisticClassifier.scala | 145 +++++++++++++ .../BinaryClassificationEvaluator.scala | 20 +- .../apache/spark/ml/feature/Tokenizer.scala | 4 +- .../spark/ml/impl/estimator/Predictor.scala | 182 ++++++++-------- .../ProbabilisticClassificationModel.scala | 46 ---- .../org/apache/spark/ml/param/params.scala | 45 +++- .../apache/spark/ml/param/sharedParams.scala | 26 ++- .../ml/regression/LinearRegression.scala | 48 ++--- .../spark/ml/regression/Regressor.scala | 32 ++- .../apache/spark/mllib/linalg/Vectors.scala | 9 +- 20 files changed, 811 insertions(+), 341 deletions(-) create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala create mode 100644 mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala delete mode 100644 mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java index f4b4f8d8c7b2f..65b393353b28b 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java @@ -117,9 +117,9 @@ public static void main(String[] args) { // Make predictions on test documents. cvModel uses the best model found (lrModel). cvModel.transform(test).registerAsTable("prediction"); - JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); + JavaSchemaRDD predictions = jsql.sql("SELECT id, text, probability, prediction FROM prediction"); for (Row r: predictions.collect()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) + System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } } diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java index e25b271777ed4..16ced6b911ac8 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java @@ -81,7 +81,7 @@ public static void main(String[] args) { // One can also combine ParamMaps. ParamMap paramMap2 = new ParamMap(); - paramMap2.put(lr.scoreCol().w("probability")); // Change output column name + paramMap2.put(lr.probabilityCol().w("myProbability")); // Change output column name ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2); // Now learn a new model using the paramMapCombined parameters. @@ -98,8 +98,8 @@ public static void main(String[] args) { // Make predictions on test documents using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'probability' column instead of the usual 'score' - // column since we renamed the lr.scoreCol parameter previously. + // Note that model2.transform() outputs a 'myProbability' column instead of the usual + // 'probability' column since we renamed the lr.probabilityCol parameter previously. model2.transform(test).registerAsTable("results"); JavaSchemaRDD results = jsql.sql("SELECT features, label, probability, prediction FROM results"); diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java index 54f18014e4b2f..c2496d9c57b15 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java @@ -84,9 +84,9 @@ public static void main(String[] args) { // Make predictions on test documents. 
model.transform(test).registerAsTable("prediction"); - JavaSchemaRDD predictions = jsql.sql("SELECT id, text, score, prediction FROM prediction"); + JavaSchemaRDD predictions = jsql.sql("SELECT id, text, probability, prediction FROM prediction"); for (Row r: predictions.collect()) { - System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> score=" + r.get(2) + System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2) + ", prediction=" + r.get(3)); } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala index ce6bc066bd70d..0db32835b8fdf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala @@ -24,6 +24,7 @@ import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} /** @@ -101,10 +102,10 @@ object CrossValidatorExample { // Make predictions on test documents. cvModel uses the best model found (lrModel). cvModel.transform(test) - .select('id, 'text, 'score, 'prediction) + .select('id, 'text, 'probability, 'prediction) .collect() - .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => - println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) + .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => + println("(" + id + ", " + text + ") --> prob=" + prob + ", prediction=" + prediction) } } } diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala new file mode 100644 index 0000000000000..2f1de5c58ed1e --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.examples.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.SparkContext._ +import org.apache.spark.ml.classification.{Classifier, ClassifierParams, ClassificationModel} +import org.apache.spark.ml.param.{Params, IntParam, ParamMap} +import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors, VectorUDT} +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.sql.{DataType, SchemaRDD, Row, SQLContext} + +/** + * A simple example demonstrating how to write your own learning algorithm using Estimator, + * Transformer, and other abstractions. + * This mimics [[org.apache.spark.ml.classification.LogisticRegression]]. + * Run with + * {{{ + * bin/run-example ml.DeveloperApiExample + * }}} + */ +object DeveloperApiExample { + + def main(args: Array[String]) { + val conf = new SparkConf().setAppName("DeveloperApiExample") + val sc = new SparkContext(conf) + val sqlContext = new SQLContext(sc) + import sqlContext._ + + // Prepare training data. + // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans + // into SchemaRDDs, where it uses the bean metadata to infer the schema. + val training = sparkContext.parallelize(Seq( + LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)), + LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)), + LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)), + LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))) + + // Create a LogisticRegression instance. This instance is an Estimator. + val lr = new MyLogisticRegression() + // Print out the parameters, documentation, and any default values. + println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n") + + // We may set parameters using setter methods. + lr.setMaxIter(10) + + // Learn a LogisticRegression model. This uses the parameters stored in lr. + val model = lr.fit(training) + + // Prepare test data. + val test = sparkContext.parallelize(Seq( + LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), + LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), + LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) + + // Make predictions on test data. + val sumPredictions: Double = model.transform(test) + .select('features, 'label, 'prediction) + .collect() + .map { case Row(features: Vector, label: Double, prediction: Double) => + prediction + }.sum + assert(sumPredictions == 0.0, + "MyLogisticRegression predicted something other than 0, even though all weights are 0!") + } +} + +/** + * Example of defining a parameter trait for a user-defined type of [[Classifier]]. + * + * NOTE: This is private since it is an example. In practice, you may not want it to be private. + */ +private trait MyLogisticRegressionParams extends ClassifierParams { + + /** param for max number of iterations */ + val maxIter: IntParam = new IntParam(this, "maxIter", "max number of iterations") + def getMaxIter: Int = get(maxIter) +} + +/** + * Example of defining a type of [[Classifier]]. + * + * NOTE: This is private since it is an example. In practice, you may not want it to be private. + */ +private class MyLogisticRegression + extends Classifier[Vector, MyLogisticRegression, MyLogisticRegressionModel] + with MyLogisticRegressionParams { + + setMaxIter(100) // Initialize + + def setMaxIter(value: Int): this.type = set(maxIter, value) + + override def fit(dataset: SchemaRDD, paramMap: ParamMap): MyLogisticRegressionModel = { + // Check schema (types). This allows early failure before running the algorithm. 
+ transformSchema(dataset.schema, paramMap, logging = true) + + // Extract columns from data using helper method. + val oldDataset = extractLabeledPoints(dataset, paramMap) + + // Combine given parameters with the embedded parameters, where the given paramMap overrides + // any embedded settings. + val map = this.paramMap ++ paramMap + + // Do learning to estimate the weight vector. + val numFeatures = oldDataset.take(1)(0).features.size + val weights = Vectors.zeros(numFeatures) // Learning would happen here. + + // Create a model to return. + val lrm = new MyLogisticRegressionModel(this, map, weights) + + // Copy model params. + // An Estimator stores the parameters for the Model it produces, and this copies any relevant + // parameters to the model. + Params.inheritValues(map, this, lrm) + + // Return the learned model. + lrm + } + + /** + * Returns the SQL DataType corresponding to the FeaturesType type parameter. + * This is used by [[ClassifierParams.validateAndTransformSchema()]] to check the input data. + */ + override protected def featuresDataType: DataType = new VectorUDT +} + +/** + * Example of defining a type of [[ClassificationModel]]. + * + * NOTE: This is private since it is an example. In practice, you may not want it to be private. + */ +private class MyLogisticRegressionModel( + override val parent: MyLogisticRegression, + override val fittingParamMap: ParamMap, + val weights: Vector) + extends ClassificationModel[Vector, MyLogisticRegressionModel] + with MyLogisticRegressionParams { + + // This uses the default implementation of transform(), which reads column "features" and outputs + // columns "prediction" and "rawPrediction." + + // This uses the default implementation of predict(), which chooses the label corresponding to + // the maximum value returned by [[predictRaw()]]. + + /** + * Raw prediction for each possible label. + * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives + * a measure of confidence in each possible label (where larger = more confident). + * This internal method is used to implement [[transform()]] and output [[rawPredictionCol]]. + * + * @return vector where element i is the raw prediction for label i. + * This raw prediction may be any real number, where a larger value indicates greater + * confidence for that label. + */ + override protected def predictRaw(features: Vector): Vector = { + val margin = BLAS.dot(features, weights) + // There are 2 classes (binary classification), so we return a length-2 vector, + // where index i corresponds to class i (i = 0, 1). + Vectors.dense(-margin, margin) + } + + /** Number of classes the label can take. 2 indicates binary classification. */ + override val numClasses: Int = 2 + + /** + * Create a copy of the model. + * The copy is shallow, except for the embedded paramMap, which gets a deep copy. + * + * This is used for the default implementation of [[transform()]]. + */ + override protected def copy(): MyLogisticRegressionModel = { + val m = new MyLogisticRegressionModel(parent, fittingParamMap, weights) + Params.inheritValues(this.paramMap, this, m) + m + } + + /** + * Returns the SQL DataType corresponding to the FeaturesType type parameter. + * This is used by [[ClassifierParams.validateAndTransformSchema()]] to check the input data. 
+ */ + override protected def featuresDataType: DataType = new VectorUDT +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala index 44d5b084c269a..1c0d4d2c647ca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala @@ -73,7 +73,7 @@ object SimpleParamsExample { paramMap.put(lr.regParam -> 0.1, lr.threshold -> 0.55) // Specify multiple Params. // One can also combine ParamMaps. - val paramMap2 = ParamMap(lr.scoreCol -> "probability") // Change output column name + val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name val paramMapCombined = paramMap ++ paramMap2 // Now learn a new model using the paramMapCombined parameters. @@ -81,18 +81,18 @@ object SimpleParamsExample { val model2 = lr.fit(training, paramMapCombined) println("Model 2 was fit using parameters: " + model2.fittingParamMap) - // Prepare test documents. + // Prepare test data. val test = sparkContext.parallelize(Seq( LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)), LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)), LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))) - // Make predictions on test documents using the Transformer.transform() method. + // Make predictions on test data using the Transformer.transform() method. // LogisticRegression.transform will only use the 'features' column. - // Note that model2.transform() outputs a 'probability' column instead of the usual 'score' - // column since we renamed the lr.scoreCol parameter previously. + // Note that model2.transform() outputs a 'myProbability' column instead of the usual + // 'probability' column since we renamed the lr.probabilityCol parameter previously. model2.transform(test) - .select('features, 'label, 'probability, 'prediction) + .select('features, 'label, 'myProbability, 'prediction) .collect() .foreach { case Row(features: Vector, label: Double, prob: Double, prediction: Double) => println("(" + features + ", " + label + ") -> prob=" + prob + ", prediction=" + prediction) diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala index 92895a05e479a..795852b9efc03 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala @@ -24,6 +24,7 @@ import org.apache.spark.SparkContext._ import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} @BeanInfo @@ -80,10 +81,10 @@ object SimpleTextClassificationPipeline { // Make predictions on test documents. 
model.transform(test) - .select('id, 'text, 'score, 'prediction) + .select('id, 'text, 'probability, 'prediction) .collect() - .foreach { case Row(id: Long, text: String, score: Double, prediction: Double) => - println("(" + id + ", " + text + ") --> score=" + score + ", prediction=" + prediction) + .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => + println("(" + id + ", " + text + ") --> prob=" + prob + ", prediction=" + prediction) } } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala b/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala deleted file mode 100644 index 8b6b2f3fa2756..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/LabeledPoint.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml - -import scala.beans.BeanInfo - -import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.mllib.linalg.Vector - -/** - * :: AlphaComponent :: - * Class that represents an instance (data point) for prediction tasks. 
- * - * @param label Label to predict - * @param features List of features describing this instance - * @param weight Instance weight - */ -@AlphaComponent -@BeanInfo -case class LabeledPoint(label: Double, features: Vector, weight: Double) { - - /** Constructor which sets instance weight to 1.0 */ - def this(label: Double, features: Vector) = this(label, features, 1.0) - - override def toString: String = { - "(%s,%s,%s)".format(label, features, weight) - } -} - -/** - * :: AlphaComponent :: - */ -@AlphaComponent -object LabeledPoint { - /** Constructor which sets instance weight to 1.0 */ - def apply(label: Double, features: Vector) = new LabeledPoint(label, features, 1.0) -} diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala index 2f31beb7303fb..243de234dffdf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/Classifier.scala @@ -17,27 +17,56 @@ package org.apache.spark.ml.classification -import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.api.java.JavaRDD +import scala.reflect.runtime.universe._ + +import org.apache.spark.annotation.{DeveloperApi, AlphaComponent} import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams} -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.rdd.RDD +import org.apache.spark.ml.param.{Params, ParamMap, HasRawPredictionCol} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.Star /** + * :: DeveloperApi :: * Params for classification. - * Currently empty, but may add functionality later. */ -private[classification] trait ClassifierParams extends PredictorParams +@DeveloperApi +trait ClassifierParams extends PredictorParams + with HasRawPredictionCol { + + override protected def validateAndTransformSchema( + schema: StructType, + paramMap: ParamMap, + fitting: Boolean, + featuresDataType: DataType): StructType = { + val parentSchema = super.validateAndTransformSchema(schema, paramMap, fitting, featuresDataType) + val map = this.paramMap ++ paramMap + addOutputColumn(parentSchema, map(rawPredictionCol), new VectorUDT) + } +} /** - * Single-label binary or multiclass classification + * :: AlphaComponent :: + * Single-label binary or multiclass classification. * Classes are indexed {0, 1, ..., numClasses - 1}. + * + * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam Learner Concrete Estimator type + * @tparam M Concrete Model type */ @AlphaComponent -abstract class Classifier[Learner <: Classifier[Learner, M], M <: ClassificationModel[M]] - extends Predictor[Learner, M] +abstract class Classifier[ + FeaturesType, + Learner <: Classifier[FeaturesType, Learner, M], + M <: ClassificationModel[FeaturesType, M]] + extends Predictor[FeaturesType, Learner, M] with ClassifierParams { + setRawPredictionCol("") // Do not output by default + + def setRawPredictionCol(value: String): Learner = + set(rawPredictionCol, value).asInstanceOf[Learner] + // TODO: defaultEvaluator (follow-up PR) } @@ -46,42 +75,130 @@ abstract class Classifier[Learner <: Classifier[Learner, M], M <: Classification * Model produced by a [[Classifier]]. * Classes are indexed {0, 1, ..., numClasses - 1}. * - * @tparam M Model type. + * @tparam FeaturesType Type of input features. 
E.g., [[Vector]] + * @tparam M Concrete Model type */ @AlphaComponent -abstract class ClassificationModel[M <: ClassificationModel[M]] - extends PredictionModel[M] with ClassifierParams { +abstract class ClassificationModel[FeaturesType, M <: ClassificationModel[FeaturesType, M]] + extends PredictionModel[FeaturesType, M] with ClassifierParams { + + setRawPredictionCol("") // Do not output by default + + def setRawPredictionCol(value: String): M = set(rawPredictionCol, value).asInstanceOf[M] /** Number of classes (values which the label can take). */ def numClasses: Int /** + * Transforms dataset by reading from [[featuresCol]], and appending new columns as specified by + * parameters: + * - predicted labels as [[predictionCol]] of type [[Double]] + * - raw predictions (confidences) as [[rawPredictionCol]] of type [[Vector]]. + * + * @param dataset input dataset + * @param paramMap additional parameters, overwrite embedded params + * @return transformed dataset + */ + override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + // This default implementation should be overridden as needed. + + // Check schema + transformSchema(dataset.schema, paramMap, logging = true) + val map = this.paramMap ++ paramMap + + // Prepare model + val tmpModel = if (paramMap.size != 0) { + val tmpModel = this.copy() + Params.inheritValues(paramMap, parent, tmpModel) + tmpModel + } else { + this + } + + val (numColsOutput, outputData) = + ClassificationModel.transformColumnsImpl[FeaturesType](dataset, tmpModel, map) + if (numColsOutput == 0) { + logWarning(s"$uid: ClassificationModel.transform() was called as NOOP" + + " since no output columns were set.") + } + outputData + } + + /** + * :: DeveloperApi :: + * * Predict label for the given features. + * This internal method is used to implement [[transform()]] and output [[predictionCol]]. + * * This default implementation for classification predicts the index of the maximum value * from [[predictRaw()]]. */ - override def predict(features: Vector): Double = { + @DeveloperApi + override protected def predict(features: FeaturesType): Double = { predictRaw(features).toArray.zipWithIndex.maxBy(_._1)._2 } /** + * :: DeveloperApi :: + * * Raw prediction for each possible label. * The meaning of a "raw" prediction may vary between algorithms, but it intuitively gives - * a magnitude of confidence in each possible label. + * a measure of confidence in each possible label (where larger = more confident). + * This internal method is used to implement [[transform()]] and output [[rawPredictionCol]]. + * * @return vector where element i is the raw prediction for label i. * This raw prediction may be any real number, where a larger value indicates greater * confidence for that label. */ - def predictRaw(features: Vector): Vector + @DeveloperApi + protected def predictRaw(features: FeaturesType): Vector + +} + +private[ml] object ClassificationModel { + + /** + * Added prediction column(s). This is separated from [[ClassificationModel.transform()]] + * since it is used by [[org.apache.spark.ml.classification.ProbabilisticClassificationModel]]. + * @param dataset Input dataset + * @param map Parameter map. This will NOT be merged with the embedded paramMap; the merge + * should already be done. 
+ * @return (number of columns added, transformed dataset) + */ + private[ml] def transformColumnsImpl[FeaturesType]( + dataset: SchemaRDD, + model: ClassificationModel[FeaturesType, _], + map: ParamMap): (Int, SchemaRDD) = { - /** Batch version of [[predictRaw]] */ - def predictRaw(dataset: RDD[Vector]): RDD[Vector] = dataset.map(predictRaw) + import org.apache.spark.sql.catalyst.dsl._ + import dataset.sqlContext._ - /** Java-friendly batch version of [[predictRaw]] */ - def predictRaw(dataset: JavaRDD[Vector]): JavaRDD[Vector] = { - dataset.rdd.map(predictRaw).toJavaRDD() + // Output selected columns only. + // This is a bit complicated since it tries to avoid repeated computation. + var tmpData = dataset + var numColsOutput = 0 + if (map(model.rawPredictionCol) != "") { + // output raw prediction + val features2raw: FeaturesType => Vector = model.predictRaw + tmpData = tmpData.select(Star(None), + features2raw.call(map(model.featuresCol).attr) as map(model.rawPredictionCol)) + numColsOutput += 1 + if (map(model.predictionCol) != "") { + val raw2pred: Vector => Double = (rawPred) => { + rawPred.toArray.zipWithIndex.maxBy(_._1)._2 + } + tmpData = tmpData.select(Star(None), + raw2pred.call(map(model.rawPredictionCol).attr) as map(model.predictionCol)) + numColsOutput += 1 + } + } else if (map(model.predictionCol) != "") { + // output prediction + val features2pred: FeaturesType => Double = model.predict + tmpData = tmpData.select(Star(None), + features2pred.call(map(model.featuresCol).attr) as map(model.predictionCol)) + numColsOutput += 1 + } + (numColsOutput, tmpData) } - // TODO: accuracy(dataset: RDD[LabeledPoint]): Double (follow-up PR) - } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f9e8a2277faf9..62b543ef7141c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -18,12 +18,9 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.AlphaComponent -import org.apache.spark.ml.LabeledPoint -import org.apache.spark.ml.impl.estimator.ProbabilisticClassificationModel import org.apache.spark.ml.param._ import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS -import org.apache.spark.mllib.linalg.{Vectors, BLAS, Vector} -import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{VectorUDT, Vectors, BLAS, Vector} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Star import org.apache.spark.sql.catalyst.dsl._ @@ -32,22 +29,8 @@ import org.apache.spark.storage.StorageLevel /** * Params for logistic regression. 
*/ -private[classification] trait LogisticRegressionParams extends ClassifierParams - with HasRegParam with HasMaxIter with HasThreshold with HasScoreCol { - - override protected def validateAndTransformSchema( - schema: StructType, - paramMap: ParamMap, - fitting: Boolean): StructType = { - val parentSchema = super.validateAndTransformSchema(schema, paramMap, fitting) - val map = this.paramMap ++ paramMap - val fieldNames = parentSchema.fieldNames - require(!fieldNames.contains(map(scoreCol)), s"Score column ${map(scoreCol)} already exists.") - val outputFields = parentSchema.fields ++ Seq( - StructField(map(scoreCol), DoubleType, nullable = false)) - StructType(outputFields) - } -} +private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams + with HasRegParam with HasMaxIter with HasThreshold /** @@ -56,7 +39,8 @@ private[classification] trait LogisticRegressionParams extends ClassifierParams * Currently, this class only supports binary classification. */ @AlphaComponent -class LogisticRegression extends Classifier[LogisticRegression, LogisticRegressionModel] +class LogisticRegression + extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel] with LogisticRegressionParams { setRegParam(0.1) @@ -66,44 +50,37 @@ class LogisticRegression extends Classifier[LogisticRegression, LogisticRegressi def setRegParam(value: Double): this.type = set(regParam, value) def setMaxIter(value: Int): this.type = set(maxIter, value) def setThreshold(value: Double): this.type = set(threshold, value) - def setScoreCol(value: String): this.type = set(scoreCol, value) - /** - * Same as [[fit()]], but using strong types. - * NOTE: This does NOT support instance weights. - * @param dataset Training data. Instance weights are ignored. - * @param paramMap Parameters for training. - * These values override any specified in this Estimator's embedded ParamMap. - */ - override def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): LogisticRegressionModel = { + override def fit(dataset: SchemaRDD, paramMap: ParamMap): LogisticRegressionModel = { + // Check schema + transformSchema(dataset.schema, paramMap, logging = true) + + // Extract columns from data. If dataset is persisted, do not persist oldDataset. + val oldDataset = extractLabeledPoints(dataset, paramMap) val map = this.paramMap ++ paramMap - val oldDataset = dataset.map { case LabeledPoint(label: Double, features: Vector, weight) => - org.apache.spark.mllib.regression.LabeledPoint(label, features) - } - // If dataset is persisted, do not persist oldDataset. val handlePersistence = dataset.getStorageLevel == StorageLevel.NONE if (handlePersistence) { oldDataset.persist(StorageLevel.MEMORY_AND_DISK) } + + // Train model val lr = new LogisticRegressionWithLBFGS lr.optimizer .setRegParam(map(regParam)) .setNumIterations(map(maxIter)) - val model = lr.run(oldDataset) - val lrm = new LogisticRegressionModel(this, map, model.weights, model.intercept) + val oldModel = lr.run(oldDataset) + val lrm = new LogisticRegressionModel(this, map, oldModel.weights, oldModel.intercept) + if (handlePersistence) { oldDataset.unpersist() } + + // copy model params + Params.inheritValues(map, this, lrm) lrm } - /** - * Same as [[fit()]], but using strong types. - * NOTE: This does NOT support instance weights. - * @param dataset Training data. Instance weights are ignored. 
- */ - override def train(dataset: RDD[LabeledPoint]): LogisticRegressionModel = - train(dataset, new ParamMap()) // Override documentation + override protected def featuresDataType: DataType = new VectorUDT } @@ -117,14 +94,12 @@ class LogisticRegressionModel private[ml] ( override val fittingParamMap: ParamMap, val weights: Vector, val intercept: Double) - extends ClassificationModel[LogisticRegressionModel] - with ProbabilisticClassificationModel + extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams { setThreshold(0.5) def setThreshold(value: Double): this.type = set(threshold, value) - def setScoreCol(value: String): this.type = set(scoreCol, value) private val margin: Vector => Double = (features) => { BLAS.dot(features, weights) + intercept @@ -136,42 +111,94 @@ class LogisticRegressionModel private[ml] ( } override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + // Check schema transformSchema(dataset.schema, paramMap, logging = true) + import dataset.sqlContext._ val map = this.paramMap ++ paramMap - val t = map(threshold) - val predict: Double => Double = (score) => { - if (score > t) 1.0 else 0.0 + + // Output selected columns only. + // This is a bit complicated since it tries to avoid repeated computation. + // rawPrediction (-margin, margin) + // probability (1.0-score, score) + // prediction (max margin) + var tmpData = dataset + var numColsOutput = 0 + if (map(rawPredictionCol) != "") { + val features2raw: Vector => Vector = predictRaw + tmpData = tmpData.select(Star(None), + features2raw.call(map(featuresCol).attr) as map(rawPredictionCol)) + numColsOutput += 1 } - dataset.select(Star(None), score.call(map(featuresCol).attr) as map(scoreCol)) - .select(Star(None), predict.call(map(scoreCol).attr) as map(predictionCol)) + if (map(probabilityCol) != "") { + if (map(rawPredictionCol) != "") { + val raw2prob: Vector => Vector = (rawPreds) => { + val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) + Vectors.dense(1.0 - prob1, prob1) + } + tmpData = tmpData.select(Star(None), + raw2prob.call(map(rawPredictionCol).attr) as map(probabilityCol)) + } else { + val features2prob: Vector => Vector = predictProbabilities + tmpData = tmpData.select(Star(None), + features2prob.call(map(featuresCol).attr) as map(probabilityCol)) + } + numColsOutput += 1 + } + if (map(predictionCol) != "") { + val t = map(threshold) + if (map(probabilityCol) != "") { + val predict: Vector => Double = (probs) => { + if (probs(1) > t) 1.0 else 0.0 + } + tmpData = tmpData.select(Star(None), + predict.call(map(probabilityCol).attr) as map(predictionCol)) + } else if (map(rawPredictionCol) != "") { + val predict: Vector => Double = (rawPreds) => { + val prob1 = 1.0 / (1.0 + math.exp(-rawPreds(1))) + if (prob1 > t) 1.0 else 0.0 + } + tmpData = tmpData.select(Star(None), + predict.call(map(rawPredictionCol).attr) as map(predictionCol)) + } else { + val predict: Vector => Double = this.predict + tmpData = tmpData.select(Star(None), + predict.call(map(featuresCol).attr) as map(predictionCol)) + } + numColsOutput += 1 + } + if (numColsOutput == 0) { + this.logWarning(s"$uid: LogisticRegressionModel.transform() was called as NOOP" + + " since no output columns were set.") + } + tmpData } override val numClasses: Int = 2 - // TODO: Override batch predict() for efficiency. - /** * Predict label for the given feature vector. * The behavior of this can be adjusted using [[threshold]]. 
*/ - override def predict(features: Vector): Double = { + override protected def predict(features: Vector): Double = { if (score(features) > paramMap(threshold)) 1 else 0 } - override def predictProbabilities(features: Vector): Vector = { + override protected def predictProbabilities(features: Vector): Vector = { val s = score(features) - Vectors.dense(Array(1.0 - s, s)) + Vectors.dense(1.0 - s, s) } - override def predictRaw(features: Vector): Vector = { + override protected def predictRaw(features: Vector): Vector = { val m = margin(features) - Vectors.dense(Array(-m, m)) + Vectors.dense(-m, m) } - private[ml] override def copy(): LogisticRegressionModel = { + override protected def copy(): LogisticRegressionModel = { val m = new LogisticRegressionModel(parent, fittingParamMap, weights, intercept) Params.inheritValues(this.paramMap, this, m) m } + + override protected def featuresDataType: DataType = new VectorUDT } diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala new file mode 100644 index 0000000000000..41f9b9601a00b --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.classification + +import scala.reflect.runtime.universe._ + +import org.apache.spark.annotation.{AlphaComponent, DeveloperApi} +import org.apache.spark.ml.param.{HasProbabilityCol, ParamMap, Params} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.Star + +/** + * Params for probabilistic classification. + */ +private[classification] trait ProbabilisticClassifierParams + extends ClassifierParams with HasProbabilityCol { + + override protected def validateAndTransformSchema( + schema: StructType, + paramMap: ParamMap, + fitting: Boolean, + featuresDataType: DataType): StructType = { + val parentSchema = super.validateAndTransformSchema(schema, paramMap, fitting, featuresDataType) + val map = this.paramMap ++ paramMap + addOutputColumn(parentSchema, map(probabilityCol), new VectorUDT) + } +} + +/** + * :: AlphaComponent :: + * Single-label binary or multiclass classifier which can output class conditional probabilities. + * + * @tparam FeaturesType Type of input features. 
E.g., [[Vector]] + * @tparam Learner Concrete Estimator type + * @tparam M Concrete Model type + */ +@AlphaComponent +abstract class ProbabilisticClassifier[ + FeaturesType, + Learner <: ProbabilisticClassifier[FeaturesType, Learner, M], + M <: ProbabilisticClassificationModel[FeaturesType, M]] + extends Classifier[FeaturesType, Learner, M] with ProbabilisticClassifierParams { + + setProbabilityCol("") // Do not output by default + + def setProbabilityCol(value: String): Learner = set(probabilityCol, value).asInstanceOf[Learner] +} + +/** + * :: AlphaComponent :: + * Model produced by a [[ProbabilisticClassifier]]. + * Classes are indexed {0, 1, ..., numClasses - 1}. + * + * @tparam FeaturesType Type of input features. E.g., [[Vector]] + * @tparam M Concrete Model type + */ +@AlphaComponent +abstract class ProbabilisticClassificationModel[ + FeaturesType, + M <: ProbabilisticClassificationModel[FeaturesType, M]] + extends ClassificationModel[FeaturesType, M] with ProbabilisticClassifierParams { + + setProbabilityCol("") // Do not output by default + + def setProbabilityCol(value: String): M = set(probabilityCol, value).asInstanceOf[M] + + /** + * Transforms dataset by reading from [[featuresCol]], and appending new columns as specified by + * parameters: + * - predicted labels as [[predictionCol]] of type [[Double]] + * - raw predictions (confidences) as [[rawPredictionCol]] of type [[Vector]] + * - probability of each class as [[probabilityCol]] of type [[Vector]]. + * + * @param dataset input dataset + * @param paramMap additional parameters, overwrite embedded params + * @return transformed dataset + */ + override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + // This default implementation should be overridden as needed. + import dataset.sqlContext._ + import org.apache.spark.sql.catalyst.dsl._ + + // Check schema + transformSchema(dataset.schema, paramMap, logging = true) + val map = this.paramMap ++ paramMap + + // Prepare model + val tmpModel = if (paramMap.size != 0) { + val tmpModel = this.copy() + Params.inheritValues(paramMap, parent, tmpModel) + tmpModel + } else { + this + } + + val (numColsOutput, outputData) = + ClassificationModel.transformColumnsImpl[FeaturesType](dataset, tmpModel, map) + + // Output selected columns only. + if (map(probabilityCol) != "") { + // output probabilities + val features2probs: FeaturesType => Vector = (features) => { + tmpModel.predictProbabilities(features) + } + outputData.select(Star(None), + features2probs.call(map(featuresCol).attr) as map(probabilityCol)) + } else { + if (numColsOutput == 0) { + this.logWarning(s"$uid: ProbabilisticClassificationModel.transform() was called as NOOP" + + " since no output columns were set.") + } + outputData + } + } + + /** + * :: DeveloperApi :: + * + * Predict the probability of each class given the features. + * These predictions are also called class conditional probabilities. + * + * WARNING: Not all models output well-calibrated probability estimates! These probabilities + * should be treated as confidences, not precise probabilities. + * + * This internal method is used to implement [[transform()]] and output [[probabilityCol]]. 
+ */ + @DeveloperApi + protected def predictProbabilities(features: FeaturesType): Vector +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 0b0504e036ec9..602d1fce1fc0f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -21,6 +21,7 @@ import org.apache.spark.annotation.AlphaComponent import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DoubleType, Row, SchemaRDD} /** @@ -29,7 +30,7 @@ import org.apache.spark.sql.{DoubleType, Row, SchemaRDD} */ @AlphaComponent class BinaryClassificationEvaluator extends Evaluator with Params - with HasScoreCol with HasLabelCol { + with HasRawPredictionCol with HasLabelCol { /** param for metric name in evaluation */ val metricName: Param[String] = new Param(this, "metricName", @@ -37,24 +38,21 @@ class BinaryClassificationEvaluator extends Evaluator with Params def getMetricName: String = get(metricName) def setMetricName(value: String): this.type = set(metricName, value) - def setScoreCol(value: String): this.type = set(scoreCol, value) + def setScoreCol(value: String): this.type = set(rawPredictionCol, value) def setLabelCol(value: String): this.type = set(labelCol, value) override def evaluate(dataset: SchemaRDD, paramMap: ParamMap): Double = { val map = this.paramMap ++ paramMap val schema = dataset.schema - val scoreType = schema(map(scoreCol)).dataType - require(scoreType == DoubleType, - s"Score column ${map(scoreCol)} must be double type but found $scoreType") - val labelType = schema(map(labelCol)).dataType - require(labelType == DoubleType, - s"Label column ${map(labelCol)} must be double type but found $labelType") + checkInputColumn(schema, map(rawPredictionCol), new VectorUDT) + checkInputColumn(schema, map(labelCol), DoubleType) import dataset.sqlContext._ - val scoreAndLabels = dataset.select(map(scoreCol).attr, map(labelCol).attr) - .map { case Row(score: Double, label: Double) => - (score, label) + // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. 
+ val scoreAndLabels = dataset.select(map(rawPredictionCol).attr, map(labelCol).attr) + .map { case Row(rawPrediction: Vector, label: Double) => + (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = map(metricName) match { diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 9352f40f372d3..caaca07a4b013 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -29,11 +29,11 @@ import org.apache.spark.sql.{DataType, StringType, ArrayType} @AlphaComponent class Tokenizer extends UnaryTransformer[String, Seq[String], Tokenizer] { - protected override def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { + override protected def createTransformFunc(paramMap: ParamMap): String => Seq[String] = { _.toLowerCase.split("\\s") } - protected override def validateInputType(inputType: DataType): Unit = { + override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala index 48cecfefd4c07..35ca5a0bcbe00 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/Predictor.scala @@ -17,15 +17,23 @@ package org.apache.spark.ml.impl.estimator -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.ml.{Estimator, LabeledPoint, Model} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Star -private[ml] trait PredictorParams extends Params + +/** + * :: DeveloperApi :: + * + * Trait for parameters for prediction (regression and classification). + */ +@DeveloperApi +trait PredictorParams extends Params with HasLabelCol with HasFeaturesCol with HasPredictionCol { /** @@ -33,33 +41,41 @@ private[ml] trait PredictorParams extends Params * @param schema input schema * @param paramMap additional parameters * @param fitting whether this is in fitting + * @param featuresDataType SQL DataType for FeaturesType. + * E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features. * @return output schema */ protected def validateAndTransformSchema( schema: StructType, paramMap: ParamMap, - fitting: Boolean): StructType = { + fitting: Boolean, + featuresDataType: DataType): StructType = { val map = this.paramMap ++ paramMap - val featuresType = schema(map(featuresCol)).dataType - // TODO: Support casting Array[Double] and Array[Float] to Vector. 
- require(featuresType.isInstanceOf[VectorUDT], - s"Features column ${map(featuresCol)} must be Vector types" + - s" but was actually $featuresType.") + // TODO: Support casting Array[Double] and Array[Float] to Vector when FeaturesType = Vector + checkInputColumn(schema, map(featuresCol), featuresDataType) if (fitting) { - val labelType = schema(map(labelCol)).dataType - require(labelType == DoubleType || labelType == IntegerType, - s"Cannot convert label column ${map(labelCol)} of type $labelType to a Double column.") + // TODO: Allow other numeric types + checkInputColumn(schema, map(labelCol), DoubleType) } - val fieldNames = schema.fieldNames - require(!fieldNames.contains(map(predictionCol)), - s"Prediction column ${map(predictionCol)} already exists.") - val outputFields = schema.fields ++ Seq( - StructField(map(predictionCol), DoubleType, nullable = false)) - StructType(outputFields) + addOutputColumn(schema, map(predictionCol), DoubleType) } } -private[ml] abstract class Predictor[Learner <: Predictor[Learner, M], M <: PredictionModel[M]] +/** + * Abstraction for prediction problems (regression and classification). + * + * @tparam FeaturesType Type of features. + * E.g., [[org.apache.spark.mllib.linalg.VectorUDT]] for vector features. + * @tparam Learner Specialization of this class. If you subclass this type, use this type + * parameter to specify the concrete type. + * @tparam M Specialization of [[PredictionModel]]. If you subclass this type, use this type + * parameter to specify the concrete type for the corresponding model. + */ +@DeveloperApi +abstract class Predictor[ + FeaturesType, + Learner <: Predictor[FeaturesType, Learner, M], + M <: PredictionModel[FeaturesType, M]] extends Estimator[M] with PredictorParams { // TODO: Eliminate asInstanceOf and see if that works. @@ -67,6 +83,8 @@ private[ml] abstract class Predictor[Learner <: Predictor[Learner, M], M <: Pred def setFeaturesCol(value: String): Learner = set(featuresCol, value).asInstanceOf[Learner] def setPredictionCol(value: String): Learner = set(predictionCol, value).asInstanceOf[Learner] + /* + // This will be useful for boosting. protected def selectLabelColumn(dataset: SchemaRDD, paramMap: ParamMap): RDD[Double] = { import dataset.sqlContext._ val map = this.paramMap ++ paramMap @@ -75,113 +93,109 @@ private[ml] abstract class Predictor[Learner <: Predictor[Learner, M], M <: Pred case Row(label: Int) => label.toDouble } } + */ + + /** + * :: DeveloperApi :: + * + * Returns the SQL DataType corresponding to the FeaturesType type parameter. + * + * This is used by [[validateAndTransformSchema()]]. + * This workaround is needed since SQL has different APIs for Scala and Java. + */ + @DeveloperApi + protected def featuresDataType: DataType private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { - validateAndTransformSchema(schema, paramMap, fitting = true) + validateAndTransformSchema(schema, paramMap, fitting = true, featuresDataType) } - override def fit(dataset: SchemaRDD, paramMap: ParamMap): M = { - transformSchema(dataset.schema, paramMap, logging = true) + /** + * Extract [[labelCol]] and [[featuresCol]] from the given dataset, + * and put it in an RDD with strong types. 
+ */ + protected def extractLabeledPoints(dataset: SchemaRDD, paramMap: ParamMap): RDD[LabeledPoint] = { import dataset.sqlContext._ val map = this.paramMap ++ paramMap - val instances = dataset.select(map(labelCol).attr, map(featuresCol).attr) + dataset.select(map(labelCol).attr, map(featuresCol).attr) .map { case Row(label: Double, features: Vector) => LabeledPoint(label, features) } - val model = train(instances, map) - // copy model params - Params.inheritValues(map, this, model) - model } - - /** - * Same as [[fit()]], but using strong types. - * - * @param dataset Training data - * @param paramMap Parameters for training. - * These values override any specified in this Estimator's embedded ParamMap. - */ - def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): M - - /** - * Same as [[fit()]], but using strong types. - * @param dataset Training data - */ - def train(dataset: RDD[LabeledPoint]): M = train(dataset, new ParamMap()) - - /** Java-friendly version of [[train()]]. */ - def train(dataset: JavaRDD[LabeledPoint], paramMap: ParamMap): M = train(dataset.rdd, paramMap) - - /** Java-friendly version of [[train()]]. */ - def train(dataset: JavaRDD[LabeledPoint]): M = train(dataset.rdd) } -private[ml] abstract class PredictionModel[M <: PredictionModel[M]] +private[ml] abstract class PredictionModel[FeaturesType, M <: PredictionModel[FeaturesType, M]] extends Model[M] with PredictorParams { def setFeaturesCol(value: String): M = set(featuresCol, value).asInstanceOf[M] def setPredictionCol(value: String): M = set(predictionCol, value).asInstanceOf[M] + /** + * :: DeveloperApi :: + * + * Returns the SQL DataType corresponding to the FeaturesType type parameter. + * + * This is used by [[validateAndTransformSchema()]]. + * This workaround is needed since SQL has different APIs for Scala and Java. + */ + @DeveloperApi + protected def featuresDataType: DataType + private[ml] override def transformSchema(schema: StructType, paramMap: ParamMap): StructType = { - validateAndTransformSchema(schema, paramMap, fitting = false) + validateAndTransformSchema(schema, paramMap, fitting = false, featuresDataType) } /** - * Transforms dataset by reading from [[featuresCol]], calling [[predict( )]], and storing + * Transforms dataset by reading from [[featuresCol]], calling [[predict()]], and storing * the predictions as a new column [[predictionCol]]. - * This default implementation should be overridden as needed. + * * @param dataset input dataset * @param paramMap additional parameters, overwrite embedded params * @return transformed dataset with [[predictionCol]] of type [[Double]] */ override def transform(dataset: SchemaRDD, paramMap: ParamMap): SchemaRDD = { + // This default implementation should be overridden as needed. import org.apache.spark.sql.catalyst.dsl._ import dataset.sqlContext._ + // Check schema transformSchema(dataset.schema, paramMap, logging = true) val map = this.paramMap ++ paramMap - val tmpModel = this.copy() - Params.inheritValues(paramMap, parent, tmpModel) - val pred: Vector => Double = (features) => { - tmpModel.predict(features) + + // Prepare model + val tmpModel = if (paramMap.size != 0) { + val tmpModel = this.copy() + Params.inheritValues(paramMap, parent, tmpModel) + tmpModel + } else { + this } - dataset.select(Star(None), pred.call(map(featuresCol).attr) as map(predictionCol)) - } - /** - * Strongly typed version of [[transform()]]. - * Default implementation using single-instance predict(). - * - * Developers should override this for efficiency. 
E.g., this does not broadcast the model. - */ - def predict(dataset: RDD[Vector], paramMap: ParamMap): RDD[Double] = { - val tmpModel = this.copy() - Params.inheritValues(paramMap, parent, tmpModel) - dataset.map(tmpModel.predict) + if (map(predictionCol) != "") { + val pred: FeaturesType => Double = (features) => { + tmpModel.predict(features) + } + dataset.select(Star(None), pred.call(map(featuresCol).attr) as map(predictionCol)) + } else { + this.logWarning(s"$uid: Predictor.transform() was called as NOOP" + + " since no output columns were set.") + dataset + } } - /** Strongly typed version of [[transform()]]. */ - def predict(dataset: RDD[Vector]): RDD[Double] = predict(dataset, new ParamMap) - /** + * :: DeveloperApi :: + * * Predict label for the given features. + * This internal method is used to implement [[transform()]] and output [[predictionCol]]. */ - def predict(features: Vector): Double - - /** Java-friendly version of [[predict()]]. */ - def predict(dataset: JavaRDD[Vector], paramMap: ParamMap): JavaRDD[java.lang.Double] = { - predict(dataset.rdd, paramMap).map(_.asInstanceOf[java.lang.Double]).toJavaRDD() - } - - /** Java-friendly version of [[predict()]]. */ - def predict(dataset: JavaRDD[Vector]): JavaRDD[java.lang.Double] = { - predict(dataset.rdd, new ParamMap).map(_.asInstanceOf[java.lang.Double]).toJavaRDD() - } + @DeveloperApi + protected def predict(features: FeaturesType): Double /** * Create a copy of the model. * The copy is shallow, except for the embedded paramMap, which gets a deep copy. */ - private[ml] def copy(): M + protected def copy(): M } diff --git a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala deleted file mode 100644 index e534a8c264bb3..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/ml/impl/estimator/ProbabilisticClassificationModel.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ml.impl.estimator - -import org.apache.spark.api.java.JavaRDD -import org.apache.spark.mllib.linalg.Vector -import org.apache.spark.rdd.RDD - -/** - * Trait for a [[org.apache.spark.ml.classification.ClassificationModel]] which can output - * class conditional probabilities. - */ -private[ml] trait ProbabilisticClassificationModel { - - /** - * Predict the probability of each class given the features. - * These predictions are also called class conditional probabilities. - * - * WARNING: Not all models output well-calibrated probability estimates! These probabilities - * should be treated as confidences, not precise probabilities. 
-   */
-  def predictProbabilities(features: Vector): Vector
-
-  /** Batch version of [[predictProbabilities()]] */
-  def predictProbabilities(features: RDD[Vector]): RDD[Vector] = features.map(predictProbabilities)
-
-  /** Java-friendly batch version of [[predictProbabilities()]] */
-  def predictProbabilities(features: JavaRDD[Vector]): JavaRDD[Vector] = {
-    features.rdd.map(predictProbabilities).toJavaRDD()
-  }
-}
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
index 4e18a5c9b7d08..e0e334c9b4f8a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala
@@ -19,11 +19,14 @@ package org.apache.spark.ml.param
 
 import scala.annotation.varargs
 import scala.collection.mutable
+import scala.reflect.runtime.universe._
 
 import java.lang.reflect.Modifier
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.Identifiable
+import org.apache.spark.sql._
+import org.apache.spark.sql.catalyst.ScalaReflection
 
 /**
  * :: AlphaComponent ::
@@ -158,7 +161,7 @@ trait Params extends Identifiable with Serializable {
   /**
    * Sets a parameter in the embedded param map.
    */
-  private[ml] def set[T](param: Param[T], value: T): this.type = {
+  protected def set[T](param: Param[T], value: T): this.type = {
     require(param.parent.eq(this))
     paramMap.put(param.asInstanceOf[Param[Any]], value)
     this
@@ -167,7 +170,7 @@ trait Params extends Identifiable with Serializable {
   /**
    * Gets the value of a parameter in the embedded param map.
    */
-  private[ml] def get[T](param: Param[T]): T = {
+  protected def get[T](param: Param[T]): T = {
     require(param.parent.eq(this))
     paramMap(param)
   }
@@ -176,9 +179,38 @@ trait Params extends Identifiable with Serializable {
    * Internal param map.
    */
   protected val paramMap: ParamMap = ParamMap.empty
+
+  /**
+   * Check whether the given schema contains an input column.
+   * @param colName Parameter name for the input column.
+   * @param dataType SQL DataType of the input column.
+   */
+  protected def checkInputColumn(schema: StructType, colName: String, dataType: DataType): Unit = {
+    val actualDataType = schema(colName).dataType
+    require(actualDataType.equals(dataType),
+      s"Input column $colName must be of type $dataType" +
+      s" but was actually $actualDataType. Column param description: ${getParam(colName)}")
+  }
+
+  protected def addOutputColumn(
+      schema: StructType,
+      colName: String,
+      dataType: DataType): StructType = {
+    if (colName.length == 0) return schema
+    val fieldNames = schema.fieldNames
+    require(!fieldNames.contains(colName), s"Prediction column $colName already exists.")
+    val outputFields = schema.fields ++ Seq(StructField(colName, dataType, nullable = false))
+    StructType(outputFields)
+  }
 }
 
-private[ml] object Params {
+/**
+ * :: DeveloperApi ::
+ *
+ * Helper functionality for developers.
+ */
+@DeveloperApi
+object Params {
 
   /**
    * Copies parameter values from the parent estimator to the child model it produced.
@@ -304,6 +336,11 @@ class ParamMap private[ml] (private val map: mutable.Map[Param[Any], Any]) exten
       ParamPair(param, value)
     }
   }
+
+  /**
+   * Number of param pairs in this set.
+   */
+  def size: Int = map.size
 }
 
 object ParamMap {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala
index ef141d3eb2b06..bf336f3f7173b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/param/sharedParams.scala
@@ -17,6 +17,10 @@
 
 package org.apache.spark.ml.param
 
+/* NOTE TO DEVELOPERS:
+ * If you add these parameter traits into your algorithm, you need to add a setter method as well.
+ */
+
 private[ml] trait HasRegParam extends Params {
   /** param for regularization parameter */
   val regParam: DoubleParam = new DoubleParam(this, "regParam", "regularization parameter")
@@ -42,12 +46,6 @@ private[ml] trait HasLabelCol extends Params {
   def getLabelCol: String = get(labelCol)
 }
 
-private[ml] trait HasScoreCol extends Params {
-  /** param for score column name */
-  val scoreCol: Param[String] = new Param(this, "scoreCol", "score column name", Some("score"))
-  def getScoreCol: String = get(scoreCol)
-}
-
 private[ml] trait HasPredictionCol extends Params {
   /** param for prediction column name */
   val predictionCol: Param[String] =
@@ -55,6 +53,22 @@ private[ml] trait HasPredictionCol extends Params {
   def getPredictionCol: String = get(predictionCol)
 }
 
+private[ml] trait HasRawPredictionCol extends Params {
+  /** param for raw prediction column name */
+  val rawPredictionCol: Param[String] =
+    new Param(this, "rawPredictionCol", "raw prediction (a.k.a. confidence) column name",
+      Some("rawPrediction"))
+  def getRawPredictionCol: String = get(rawPredictionCol)
+}
+
+private[ml] trait HasProbabilityCol extends Params {
+  /** param for predicted class conditional probabilities column name */
+  val probabilityCol: Param[String] =
+    new Param(this, "probabilityCol", "column name for predicted class conditional probabilities",
+      Some("probability"))
+  def getProbabilityCol: String = get(probabilityCol)
+}
+
 private[ml] trait HasThreshold extends Params {
   /** param for threshold in (binary) prediction */
   val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold in prediction")
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index e6abe9b404808..3ff7107221763 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
@@ -18,11 +18,10 @@
 package org.apache.spark.ml.regression
 
 import org.apache.spark.annotation.AlphaComponent
-import org.apache.spark.ml.LabeledPoint
 import org.apache.spark.ml.param.{Params, ParamMap, HasMaxIter, HasRegParam}
-import org.apache.spark.mllib.linalg.{BLAS, Vector}
+import org.apache.spark.mllib.linalg.{VectorUDT, BLAS, Vector}
 import org.apache.spark.mllib.regression.LinearRegressionWithSGD
-import org.apache.spark.rdd.RDD
+import org.apache.spark.sql._
 import org.apache.spark.storage.StorageLevel
 
 /**
@@ -36,7 +35,7 @@ private[regression] trait LinearRegressionParams extends RegressorParams
  * Logistic regression.
  */
 @AlphaComponent
-class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel]
+class LinearRegression extends Regressor[Vector, LinearRegression, LinearRegressionModel]
   with LinearRegressionParams {
 
   setRegParam(0.1)
@@ -45,41 +44,36 @@ class LinearRegression extends Regressor[LinearRegression, LinearRegressionModel]
   def setRegParam(value: Double): this.type = set(regParam, value)
   def setMaxIter(value: Int): this.type = set(maxIter, value)
 
-  /**
-   * Same as [[fit()]], but using strong types.
-   * NOTE: This does NOT support instance weights.
-   * @param dataset Training data. Instance weights are ignored.
-   * @param paramMap Parameters for training.
-   *                 These values override any specified in this Estimator's embedded ParamMap.
-   */
-  override def train(dataset: RDD[LabeledPoint], paramMap: ParamMap): LinearRegressionModel = {
+  override def fit(dataset: SchemaRDD, paramMap: ParamMap): LinearRegressionModel = {
+    // Check schema
+    transformSchema(dataset.schema, paramMap, logging = true)
+
+    // Extract columns from data. If dataset is persisted, do not persist oldDataset.
+    val oldDataset = extractLabeledPoints(dataset, paramMap)
     val map = this.paramMap ++ paramMap
-    val oldDataset = dataset.map { case LabeledPoint(label: Double, features: Vector, weight) =>
-      org.apache.spark.mllib.regression.LabeledPoint(label, features)
-    }
-    val handlePersistence = oldDataset.getStorageLevel == StorageLevel.NONE
+    val handlePersistence = dataset.getStorageLevel == StorageLevel.NONE
     if (handlePersistence) {
       oldDataset.persist(StorageLevel.MEMORY_AND_DISK)
    }
+
+    // Train model
     val lr = new LinearRegressionWithSGD()
     lr.optimizer
       .setRegParam(map(regParam))
       .setNumIterations(map(maxIter))
     val model = lr.run(oldDataset)
     val lrm = new LinearRegressionModel(this, map, model.weights, model.intercept)
+
     if (handlePersistence) {
       oldDataset.unpersist()
     }
+
+    // copy model params
+    Params.inheritValues(map, this, lrm)
     lrm
   }
 
-  /**
-   * Same as [[fit()]], but using strong types.
-   * NOTE: This does NOT support instance weights.
-   * @param dataset Training data. Instance weights are ignored.
-   */
-  override def train(dataset: RDD[LabeledPoint]): LinearRegressionModel =
-    train(dataset, new ParamMap()) // Override documentation
+
+  override protected def featuresDataType: DataType = new VectorUDT
 }
 
 /**
@@ -92,16 +86,18 @@ class LinearRegressionModel private[ml] (
     override val fittingParamMap: ParamMap,
     val weights: Vector,
     val intercept: Double)
-  extends RegressionModel[LinearRegressionModel]
+  extends RegressionModel[Vector, LinearRegressionModel]
   with LinearRegressionParams {
 
-  override def predict(features: Vector): Double = {
+  override protected def predict(features: Vector): Double = {
     BLAS.dot(features, weights) + intercept
   }
 
-  private[ml] override def copy(): LinearRegressionModel = {
+  override protected def copy(): LinearRegressionModel = {
     val m = new LinearRegressionModel(parent, fittingParamMap, weights, intercept)
     Params.inheritValues(this.paramMap, this, m)
     m
   }
+
+  override protected def featuresDataType: DataType = new VectorUDT
 }
diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
index 78086fe16fd60..5f10344456a10 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/regression/Regressor.scala
@@ -17,23 +17,31 @@
 
 package org.apache.spark.ml.regression
 
-import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.annotation.{DeveloperApi, AlphaComponent}
 import org.apache.spark.ml.impl.estimator.{PredictionModel, Predictor, PredictorParams}
-import org.apache.spark.mllib.linalg.Vector
 
 /**
+ * :: DeveloperApi ::
  * Params for regression.
 * Currently empty, but may add functionality later.
 */
-private[regression] trait RegressorParams extends PredictorParams
+@DeveloperApi
+trait RegressorParams extends PredictorParams
 
 /**
 * :: AlphaComponent ::
 * Single-label regression
+ *
+ * @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]]
+ * @tparam Learner Concrete Estimator type
+ * @tparam M Concrete Model type
 */
 @AlphaComponent
-abstract class Regressor[Learner <: Regressor[Learner, M], M <: RegressionModel[M]]
-  extends Predictor[Learner, M]
+abstract class Regressor[
+    FeaturesType,
+    Learner <: Regressor[FeaturesType, Learner, M],
+    M <: RegressionModel[FeaturesType, M]]
+  extends Predictor[FeaturesType, Learner, M]
   with RegressorParams {
 
   // TODO: defaultEvaluator (follow-up PR)
@@ -42,15 +50,21 @@ abstract class Regressor[Learner <: Regressor[Learner, M], M <: RegressionModel[
 /**
 * :: AlphaComponent ::
 * Model produced by a [[Regressor]].
- * @tparam M Model type.
+ *
+ * @tparam FeaturesType Type of input features. E.g., [[org.apache.spark.mllib.linalg.Vector]]
+ * @tparam M Concrete Model type.
 */
 @AlphaComponent
-abstract class RegressionModel[M <: RegressionModel[M]]
-  extends PredictionModel[M] with RegressorParams {
+abstract class RegressionModel[FeaturesType, M <: RegressionModel[FeaturesType, M]]
+  extends PredictionModel[FeaturesType, M] with RegressorParams {
 
   /**
+   * :: DeveloperApi ::
+   *
    * Predict real-valued label for the given features.
+   * This internal method is used to implement [[transform()]] and output [[predictionCol]].
    */
-  def predict(features: Vector): Double
+  @DeveloperApi
+  protected def predict(features: FeaturesType): Double
 }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
index 01f3f90577142..7bc18711e9cbb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala
@@ -91,7 +91,7 @@ sealed trait Vector extends Serializable {
 * User-defined type for [[Vector]] which allows easy interaction with SQL
 * via [[org.apache.spark.sql.SchemaRDD]].
 */
-private[spark] class VectorUDT extends UserDefinedType[Vector] {
+class VectorUDT extends UserDefinedType[Vector] {
 
   override def sqlType: StructType = {
     // type: 0 = sparse, 1 = dense
@@ -147,6 +147,13 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] {
   override def pyUDT: String = "pyspark.mllib.linalg.VectorUDT"
 
   override def userClass: Class[Vector] = classOf[Vector]
+
+  override def equals(o: Any): Boolean = {
+    o match {
+      case v: VectorUDT => true
+      case _ => false
+    }
+  }
 }
 
 /**
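
For context, a minimal sketch (not from this patch) of how code outside the MLlib namespace might combine the now-public VectorUDT with the protected checkInputColumn/addOutputColumn helpers added to Params above. The trait and parameter names are hypothetical, and it assumes the Spark 1.2-era org.apache.spark.sql package aliases for StructType and DoubleType:

    import org.apache.spark.ml.param.Params
    import org.apache.spark.mllib.linalg.VectorUDT
    import org.apache.spark.sql._

    // Hypothetical params trait defined outside of the MLlib namespace.
    trait MyPredictorParams extends Params {
      // Validate that the features column holds Vectors and append a Double
      // prediction column, reusing the protected Params helpers and VectorUDT.
      protected def validateAndTransformSchema(
          schema: StructType,
          featuresColName: String,
          predictionColName: String): StructType = {
        checkInputColumn(schema, featuresColName, new VectorUDT)
        addOutputColumn(schema, predictionColName, DoubleType)
      }
    }

A concrete Estimator/Model pair could call this helper from its transformSchema, which is the pattern the protected featuresDataType and predict() hooks above are meant to support.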