From 60050f42885582a699fc7a6fa0529964162bb8a3 Mon Sep 17 00:00:00 2001 From: Aaron Staple Date: Mon, 15 Sep 2014 19:28:17 -0700 Subject: [PATCH 01/24] [SPARK-1087] Move python traceback utilities into new traceback_utils.py file. Also made some cosmetic cleanups. Author: Aaron Staple Closes #2385 from staple/SPARK-1087 and squashes the following commits: 7b3bb13 [Aaron Staple] Address review comments, cosmetic cleanups. 10ba6e1 [Aaron Staple] [SPARK-1087] Move python traceback utilities into new traceback_utils.py file. --- python/pyspark/context.py | 8 +--- python/pyspark/rdd.py | 58 ++--------------------- python/pyspark/traceback_utils.py | 78 +++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 61 deletions(-) create mode 100644 python/pyspark/traceback_utils.py diff --git a/python/pyspark/context.py b/python/pyspark/context.py index ea28e8cd8c89f..a33aae87f65e8 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -20,7 +20,6 @@ import sys from threading import Lock from tempfile import NamedTemporaryFile -from collections import namedtuple from pyspark import accumulators from pyspark.accumulators import Accumulator @@ -33,6 +32,7 @@ from pyspark.storagelevel import StorageLevel from pyspark import rdd from pyspark.rdd import RDD +from pyspark.traceback_utils import CallSite, first_spark_call from py4j.java_collections import ListConverter @@ -99,11 +99,7 @@ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, ... ValueError:... """ - if rdd._extract_concise_traceback() is not None: - self._callsite = rdd._extract_concise_traceback() - else: - tempNamedTuple = namedtuple("Callsite", "function file linenum") - self._callsite = tempNamedTuple(function=None, file=None, linenum=None) + self._callsite = first_spark_call() or CallSite(None, None, None) SparkContext._ensure_initialized(self, gateway=gateway) try: self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer, diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 6ad5ab2a2d1ae..21f182b0ff137 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -18,13 +18,11 @@ from base64 import standard_b64encode as b64enc import copy from collections import defaultdict -from collections import namedtuple from itertools import chain, ifilter, imap import operator import os import sys import shlex -import traceback from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile from threading import Thread @@ -45,6 +43,7 @@ from pyspark.resultiterable import ResultIterable from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, \ get_used_memory, ExternalSorter +from pyspark.traceback_utils import SCCallSiteSync from py4j.java_collections import ListConverter, MapConverter @@ -81,57 +80,6 @@ def portable_hash(x): return hash(x) -def _extract_concise_traceback(): - """ - This function returns the traceback info for a callsite, returns a dict - with function name, file name and line number - """ - tb = traceback.extract_stack() - callsite = namedtuple("Callsite", "function file linenum") - if len(tb) == 0: - return None - file, line, module, what = tb[len(tb) - 1] - sparkpath = os.path.dirname(file) - first_spark_frame = len(tb) - 1 - for i in range(0, len(tb)): - file, line, fun, what = tb[i] - if file.startswith(sparkpath): - first_spark_frame = i - break - if first_spark_frame == 0: - file, line, fun, what = tb[0] - return callsite(function=fun, file=file, linenum=line) - sfile, sline, sfun, swhat = 
tb[first_spark_frame] - ufile, uline, ufun, uwhat = tb[first_spark_frame - 1] - return callsite(function=sfun, file=ufile, linenum=uline) - -_spark_stack_depth = 0 - - -class _JavaStackTrace(object): - - def __init__(self, sc): - tb = _extract_concise_traceback() - if tb is not None: - self._traceback = "%s at %s:%s" % ( - tb.function, tb.file, tb.linenum) - else: - self._traceback = "Error! Could not extract traceback info" - self._context = sc - - def __enter__(self): - global _spark_stack_depth - if _spark_stack_depth == 0: - self._context._jsc.setCallSite(self._traceback) - _spark_stack_depth += 1 - - def __exit__(self, type, value, tb): - global _spark_stack_depth - _spark_stack_depth -= 1 - if _spark_stack_depth == 0: - self._context._jsc.setCallSite(None) - - class BoundedFloat(float): """ Bounded value is generated by approximate job, with confidence and low @@ -704,7 +652,7 @@ def collect(self): """ Return a list that contains all of the elements in this RDD. """ - with _JavaStackTrace(self.context) as st: + with SCCallSiteSync(self.context) as css: bytesInJava = self._jrdd.collect().iterator() return list(self._collect_iterator_through_file(bytesInJava)) @@ -1515,7 +1463,7 @@ def add_shuffle_key(split, iterator): keyed = self.mapPartitionsWithIndex(add_shuffle_key) keyed._bypass_serializer = True - with _JavaStackTrace(self.context) as st: + with SCCallSiteSync(self.context) as css: pairRDD = self.ctx._jvm.PairwiseRDD( keyed._jrdd.rdd()).asJavaPairRDD() partitioner = self.ctx._jvm.PythonPartitioner(numPartitions, diff --git a/python/pyspark/traceback_utils.py b/python/pyspark/traceback_utils.py new file mode 100644 index 0000000000000..bb8646df2b0bf --- /dev/null +++ b/python/pyspark/traceback_utils.py @@ -0,0 +1,78 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from collections import namedtuple +import os +import traceback + + +CallSite = namedtuple("CallSite", "function file linenum") + + +def first_spark_call(): + """ + Return a CallSite representing the first Spark call in the current call stack. + """ + tb = traceback.extract_stack() + if len(tb) == 0: + return None + file, line, module, what = tb[len(tb) - 1] + sparkpath = os.path.dirname(file) + first_spark_frame = len(tb) - 1 + for i in range(0, len(tb)): + file, line, fun, what = tb[i] + if file.startswith(sparkpath): + first_spark_frame = i + break + if first_spark_frame == 0: + file, line, fun, what = tb[0] + return CallSite(function=fun, file=file, linenum=line) + sfile, sline, sfun, swhat = tb[first_spark_frame] + ufile, uline, ufun, uwhat = tb[first_spark_frame - 1] + return CallSite(function=sfun, file=ufile, linenum=uline) + + +class SCCallSiteSync(object): + """ + Helper for setting the spark context call site. 
+ + Example usage: + from pyspark.context import SCCallSiteSync + with SCCallSiteSync() as css: + + """ + + _spark_stack_depth = 0 + + def __init__(self, sc): + call_site = first_spark_call() + if call_site is not None: + self._call_site = "%s at %s:%s" % ( + call_site.function, call_site.file, call_site.linenum) + else: + self._call_site = "Error! Could not extract traceback info" + self._context = sc + + def __enter__(self): + if SCCallSiteSync._spark_stack_depth == 0: + self._context._jsc.setCallSite(self._call_site) + SCCallSiteSync._spark_stack_depth += 1 + + def __exit__(self, type, value, tb): + SCCallSiteSync._spark_stack_depth -= 1 + if SCCallSiteSync._spark_stack_depth == 0: + self._context._jsc.setCallSite(None) From d428ac6a221d2dce19c43442abf197f2ade6658f Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Mon, 15 Sep 2014 21:09:58 -0700 Subject: [PATCH 02/24] [SPARK-3540] Add reboot-slaves functionality to the ec2 script Tested on a real cluster. Author: Reynold Xin Closes #2404 from rxin/ec2-reboot-slaves and squashes the following commits: 00a2dbd [Reynold Xin] Allow rebooting slaves. --- ec2/spark_ec2.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index bfd07593b92ed..5682e96aa8770 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -52,7 +52,7 @@ class UsageError(Exception): def parse_args(): parser = OptionParser( usage="spark-ec2 [options] " - + "\n\n can be: launch, destroy, login, stop, start, get-master", + + "\n\n can be: launch, destroy, login, stop, start, get-master, reboot-slaves", add_help_option=False) parser.add_option( "-h", "--help", action="help", @@ -950,6 +950,20 @@ def real_main(): subprocess.check_call( ssh_command(opts) + proxy_opt + ['-t', '-t', "%s@%s" % (opts.user, master)]) + elif action == "reboot-slaves": + response = raw_input( + "Are you sure you want to reboot the cluster " + + cluster_name + " slaves?\n" + + "Reboot cluster slaves " + cluster_name + " (y/N): ") + if response == "y": + (master_nodes, slave_nodes) = get_existing_cluster( + conn, opts, cluster_name, die_on_error=False) + print "Rebooting slaves..." + for inst in slave_nodes: + if inst.state not in ["shutting-down", "terminated"]: + print "Rebooting " + inst.id + inst.reboot() + elif action == "get-master": (master_nodes, slave_nodes) = get_existing_cluster(conn, opts, cluster_name) print master_nodes[0].public_dns_name From ecf0c02935815f0d4018c0e30ec4c784e60a5db0 Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Mon, 15 Sep 2014 21:14:00 -0700 Subject: [PATCH 03/24] [SPARK-3433][BUILD] Fix for Mima false-positives with @DeveloperAPI and @Experimental annotations. Actually false positive reported was due to mima generator not picking up the new jars in presence of old jars(theoretically this should not have happened.). So as a workaround, ran them both separately and just append them together. Author: Prashant Sharma Author: Prashant Sharma Closes #2285 from ScrapCodes/mima-fix and squashes the following commits: 093c76f [Prashant Sharma] Update mima 59012a8 [Prashant Sharma] Update mima 35b6c71 [Prashant Sharma] SPARK-3433 Fix for Mima false-positives with @DeveloperAPI and @Experimental annotations. 
--- dev/mima | 8 ++++++++ project/MimaBuild.scala | 6 ++++++ project/MimaExcludes.scala | 8 +------- project/SparkBuild.scala | 2 +- .../apache/spark/tools/GenerateMIMAIgnore.scala | 14 ++++++++++---- 5 files changed, 26 insertions(+), 12 deletions(-) diff --git a/dev/mima b/dev/mima index f9b9b03538f15..40603166c21ae 100755 --- a/dev/mima +++ b/dev/mima @@ -25,11 +25,19 @@ FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" echo -e "q\n" | sbt/sbt oldDeps/update +rm -f .generated-mima* + +# Generate Mima Ignore is called twice, first with latest built jars +# on the classpath and then again with previous version jars on the classpath. +# Because of a bug in GenerateMIMAIgnore that when old jars are ahead on classpath +# it did not process the new classes (which are in assembly jar). +./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore export SPARK_CLASSPATH="`find lib_managed \( -name '*spark*jar' -a -type f \) | tr "\\n" ":"`" echo "SPARK_CLASSPATH=$SPARK_CLASSPATH" ./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore + echo -e "q\n" | sbt/sbt mima-report-binary-issues | grep -v -e "info.*Resolving" ret_val=$? diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala index 0f5d71afcf616..39f8ba4745737 100644 --- a/project/MimaBuild.scala +++ b/project/MimaBuild.scala @@ -30,6 +30,12 @@ object MimaBuild { def excludeMember(fullName: String) = Seq( ProblemFilters.exclude[MissingMethodProblem](fullName), + // Sometimes excluded methods have default arguments and + // they are translated into public methods/fields($default$) in generated + // bytecode. It is not possible to exhustively list everything. + // But this should be okay. + ProblemFilters.exclude[MissingMethodProblem](fullName+"$default$2"), + ProblemFilters.exclude[MissingMethodProblem](fullName+"$default$1"), ProblemFilters.exclude[MissingFieldProblem](fullName), ProblemFilters.exclude[IncompatibleResultTypeProblem](fullName), ProblemFilters.exclude[IncompatibleMethTypeProblem](fullName), diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 46b78bd5c7061..2f1e05dfcc7b1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -37,14 +37,8 @@ object MimaExcludes { Seq( MimaBuild.excludeSparkPackage("deploy"), MimaBuild.excludeSparkPackage("graphx") - ) ++ - // This is @DeveloperAPI, but Mima still gives false-positives: - MimaBuild.excludeSparkClass("scheduler.SparkListenerApplicationStart") ++ - Seq( - // This is @Experimental, but Mima still gives false-positives: - ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.api.java.JavaRDDLike.foreachAsync") ) + case v if v.startsWith("1.1") => Seq( MimaBuild.excludeSparkPackage("deploy"), diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index c07ea313f1228..ab9f8ba120e83 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -187,7 +187,7 @@ object OldDeps { Some("org.apache.spark" % fullId % "1.1.0") } - def oldDepsSettings() = Defaults.defaultSettings ++ Seq( + def oldDepsSettings() = Defaults.coreDefaultSettings ++ Seq( name := "old-deps", scalaVersion := "2.10.4", retrieveManaged := true, diff --git a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala index bcf6d43ab34eb..595ded6ae67fa 100644 --- a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala +++ b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala @@ -24,6 +24,7 @@ import scala.collection.mutable 
import scala.collection.JavaConversions._ import scala.reflect.runtime.universe.runtimeMirror import scala.reflect.runtime.{universe => unv} +import scala.util.Try /** * A tool for generating classes to be excluded during binary checking with MIMA. It is expected @@ -121,12 +122,17 @@ object GenerateMIMAIgnore { } def main(args: Array[String]) { + import scala.tools.nsc.io.File val (privateClasses, privateMembers) = privateWithin("org.apache.spark") - scala.tools.nsc.io.File(".generated-mima-class-excludes"). - writeAll(privateClasses.mkString("\n")) + val previousContents = Try(File(".generated-mima-class-excludes").lines()). + getOrElse(Iterator.empty).mkString("\n") + File(".generated-mima-class-excludes") + .writeAll(previousContents + privateClasses.mkString("\n")) println("Created : .generated-mima-class-excludes in current directory.") - scala.tools.nsc.io.File(".generated-mima-member-excludes"). - writeAll(privateMembers.mkString("\n")) + val previousMembersContents = Try(File(".generated-mima-member-excludes").lines) + .getOrElse(Iterator.empty).mkString("\n") + File(".generated-mima-member-excludes").writeAll(previousMembersContents + + privateMembers.mkString("\n")) println("Created : .generated-mima-member-excludes in current directory.") } From febafefa5aaee3b3eda5e1b45a75bc6d8e7fb13f Mon Sep 17 00:00:00 2001 From: Ye Xianjin Date: Mon, 15 Sep 2014 21:53:38 -0700 Subject: [PATCH 04/24] [SPARK-3040] pick up a more proper local ip address for Utils.findLocalIpAddress method Short version: NetworkInterface.getNetworkInterfaces returns ifs in reverse order compared to ifconfig output. It may pick up ip address associated with tun0 or virtual network interface. See [SPARK_3040](https://issues.apache.org/jira/browse/SPARK-3040) for more detail Author: Ye Xianjin Closes #1946 from advancedxy/SPARK-3040 and squashes the following commits: f33f6b2 [Ye Xianjin] add windows support 087a785 [Ye Xianjin] reverse the Networkinterface.getNetworkInterfaces output order to get a more proper local ip address. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 79943766d0f0f..c76b7af18481d 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -530,7 +530,12 @@ private[spark] object Utils extends Logging { if (address.isLoopbackAddress) { // Address resolves to something like 127.0.1.1, which happens on Debian; try to find // a better address using the local network interfaces - for (ni <- NetworkInterface.getNetworkInterfaces) { + // getNetworkInterfaces returns ifs in reverse order compared to ifconfig output order + // on unix-like system. On windows, it returns in index order. + // It's more proper to pick ip address following system output order. + val activeNetworkIFs = NetworkInterface.getNetworkInterfaces.toList + val reOrderedNetworkIFs = if (isWindows) activeNetworkIFs else activeNetworkIFs.reverse + for (ni <- reOrderedNetworkIFs) { for (addr <- ni.getInetAddresses if !addr.isLinkLocalAddress && !addr.isLoopbackAddress && addr.isInstanceOf[Inet4Address]) { // We've found an address that looks reasonable! 
From 61e21fe7f478e7b06b72851f26b87d99cbbdf117 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 16 Sep 2014 09:18:03 -0700 Subject: [PATCH 05/24] SPARK-3069 [DOCS] Build instructions in README are outdated Here's my crack at Bertrand's suggestion. The Github `README.md` contains build info that's outdated. It should just point to the current online docs, and reflect that Maven is the primary build now. (Incidentally, the stanza at the end about contributions of original work should go in https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark too. It won't hurt to be crystal clear about the agreement to license, given that ICLAs are not required of anyone here.) Author: Sean Owen Closes #2014 from srowen/SPARK-3069 and squashes the following commits: 501507e [Sean Owen] Note that Zinc is for Maven builds too db2bd97 [Sean Owen] sbt -> sbt/sbt and add note about zinc be82027 [Sean Owen] Fix additional occurrences of building-with-maven -> building-spark 91c921f [Sean Owen] Move building-with-maven to building-spark and create a redirect. Update doc links to building-spark.html Add jekyll-redirect-from plugin and make associated config changes (including fixing pygments deprecation). Add example of SBT to README.md 999544e [Sean Owen] Change "Building Spark with Maven" title to "Building Spark"; reinstate tl;dr info about dev/run-tests in README.md; add brief note about building with SBT c18d140 [Sean Owen] Optionally, remove the copy of contributing text from main README.md 8e83934 [Sean Owen] Add CONTRIBUTING.md to trigger notice on new pull request page b1c04a1 [Sean Owen] Refer to current online documentation for building, and remove slightly outdated copy in README.md --- CONTRIBUTING.md | 12 +++ README.md | 78 ++++--------------- docs/README.md | 5 +- docs/_config.yml | 4 +- docs/_layouts/global.html | 2 +- ...ilding-with-maven.md => building-spark.md} | 20 ++++- docs/hadoop-third-party-distributions.md | 2 +- docs/index.md | 4 +- docs/running-on-yarn.md | 2 +- docs/streaming-kinesis-integration.md | 2 +- make-distribution.sh | 2 +- 11 files changed, 60 insertions(+), 73 deletions(-) create mode 100644 CONTRIBUTING.md rename docs/{building-with-maven.md => building-spark.md} (87%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000..c6b4aa5344757 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +## Contributing to Spark + +Contributions via GitHub pull requests are gladly accepted from their original +author. Along with any pull requests, please state that the contribution is +your original work and that you license the work to the project under the +project's open source license. Whether or not you state this explicitly, by +submitting any copyrighted material via pull request, email, or other means +you agree to license the material under the project's open source license and +warrant that you have the legal authority to do so. + +Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark) +for more information. diff --git a/README.md b/README.md index 5b09ad86849e7..b05bbfb5a594c 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,19 @@ and Spark Streaming for stream processing. ## Online Documentation You can find the latest Spark documentation, including a programming -guide, on the project webpage at . +guide, on the [project web page](http://spark.apache.org/documentation.html). This README file only contains basic setup instructions. ## Building Spark -Spark is built on Scala 2.10. 
To build Spark and its example programs, run: +Spark is built using [Apache Maven](http://maven.apache.org/). +To build Spark and its example programs, run: - ./sbt/sbt assembly + mvn -DskipTests clean package (You do not need to do this if you downloaded a pre-built package.) +More detailed documentation is available from the project site, at +["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html). ## Interactive Scala Shell @@ -71,73 +74,24 @@ can be run using: ./dev/run-tests +Please see the guidance on how to +[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting) + ## A Note About Hadoop Versions Spark uses the Hadoop core library to talk to HDFS and other Hadoop-supported storage systems. Because the protocols have changed in different versions of Hadoop, you must build Spark against the same version that your cluster runs. -You can change the version by setting `-Dhadoop.version` when building Spark. - -For Apache Hadoop versions 1.x, Cloudera CDH MRv1, and other Hadoop -versions without YARN, use: - - # Apache Hadoop 1.2.1 - $ sbt/sbt -Dhadoop.version=1.2.1 assembly - - # Cloudera CDH 4.2.0 with MapReduce v1 - $ sbt/sbt -Dhadoop.version=2.0.0-mr1-cdh4.2.0 assembly - -For Apache Hadoop 2.2.X, 2.1.X, 2.0.X, 0.23.x, Cloudera CDH MRv2, and other Hadoop versions -with YARN, also set `-Pyarn`: - - # Apache Hadoop 2.0.5-alpha - $ sbt/sbt -Dhadoop.version=2.0.5-alpha -Pyarn assembly - - # Cloudera CDH 4.2.0 with MapReduce v2 - $ sbt/sbt -Dhadoop.version=2.0.0-cdh4.2.0 -Pyarn assembly - - # Apache Hadoop 2.2.X and newer - $ sbt/sbt -Dhadoop.version=2.2.0 -Pyarn assembly - -When developing a Spark application, specify the Hadoop version by adding the -"hadoop-client" artifact to your project's dependencies. For example, if you're -using Hadoop 1.2.1 and build your application using SBT, add this entry to -`libraryDependencies`: - - "org.apache.hadoop" % "hadoop-client" % "1.2.1" -If your project is built with Maven, add this to your POM file's `` section: - - - org.apache.hadoop - hadoop-client - 1.2.1 - - - -## A Note About Thrift JDBC server and CLI for Spark SQL - -Spark SQL supports Thrift JDBC server and CLI. -See sql-programming-guide.md for more information about using the JDBC server and CLI. -You can use those features by setting `-Phive` when building Spark as follows. - - $ sbt/sbt -Phive assembly +Please refer to the build documentation at +["Specifying the Hadoop Version"](http://spark.apache.org/docs/latest/building-spark.html#specifying-the-hadoop-version) +for detailed guidance on building for a particular distribution of Hadoop, including +building for particular Hive and Hive Thriftserver distributions. See also +["Third Party Hadoop Distributions"](http://spark.apache.org/docs/latest/hadoop-third-party-distributions.html) +for guidance on building a Spark application that works with a particular +distribution. ## Configuration Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. - - -## Contributing to Spark - -Contributions via GitHub pull requests are gladly accepted from their original -author. Along with any pull requests, please state that the contribution is -your original work and that you license the work to the project under the -project's open source license. 
Whether or not you state this explicitly, by -submitting any copyrighted material via pull request, email, or other means -you agree to license the material under the project's open source license and -warrant that you have the legal authority to do so. - -Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark) -for more information. diff --git a/docs/README.md b/docs/README.md index 0a0126c5747d1..fdc89d2eb767a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -23,8 +23,9 @@ The markdown code can be compiled to HTML using the [Jekyll tool](http://jekyllr To use the `jekyll` command, you will need to have Jekyll installed. The easiest way to do this is via a Ruby Gem, see the [jekyll installation instructions](http://jekyllrb.com/docs/installation). -If not already installed, you need to install `kramdown` with `sudo gem install kramdown`. -Execute `jekyll` from the `docs/` directory. Compiling the site with Jekyll will create a directory +If not already installed, you need to install `kramdown` and `jekyll-redirect-from` Gems +with `sudo gem install kramdown jekyll-redirect-from`. +Execute `jekyll build` from the `docs/` directory. Compiling the site with Jekyll will create a directory called `_site` containing index.html as well as the rest of the compiled files. You can modify the default Jekyll build as follows: diff --git a/docs/_config.yml b/docs/_config.yml index 45b78fe724a50..d3ea2625c7448 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,5 +1,7 @@ -pygments: true +highlighter: pygments markdown: kramdown +gems: + - jekyll-redirect-from # These allow the documentation to be updated with nerw releases # of Spark, Scala, and Mesos. diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index b30ab1e5218c0..a53e8a775b71f 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -109,7 +109,7 @@
     Hardware Provisioning
     3rd-Party Hadoop Distros
-    Building Spark with Maven
+    Building Spark
     Contributing to Spark
  • diff --git a/docs/building-with-maven.md b/docs/building-spark.md similarity index 87% rename from docs/building-with-maven.md rename to docs/building-spark.md index bce7412c7d4c9..2378092d4a1a8 100644 --- a/docs/building-with-maven.md +++ b/docs/building-spark.md @@ -1,6 +1,7 @@ --- layout: global -title: Building Spark with Maven +title: Building Spark +redirect_from: "building-with-maven.html" --- * This will become a table of contents (this text will be scraped). @@ -159,4 +160,21 @@ then ship it over to the cluster. We are investigating the exact cause for this. The assembly jar produced by `mvn package` will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The `hadoop-provided` profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself. +# Building with SBT +Maven is the official recommendation for packaging Spark, and is the "build of reference". +But SBT is supported for day-to-day development since it can provide much faster iterative +compilation. More advanced developers may wish to use SBT. + +The SBT build is derived from the Maven POM files, and so the same Maven profiles and variables +can be set to control the SBT build. For example: + + sbt/sbt -Pyarn -Phadoop-2.3 compile + +# Speeding up Compilation with Zinc + +[Zinc](https://github.com/typesafehub/zinc) is a long-running server version of SBT's incremental +compiler. When run locally as a background process, it speeds up builds of Scala-based projects +like Spark. Developers who regularly recompile Spark with Maven will be the most interested in +Zinc. The project site gives instructions for building and running `zinc`; OS X users can +install it using `brew install zinc`. \ No newline at end of file diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md index ab1023b8f1842..dd73e9dc54440 100644 --- a/docs/hadoop-third-party-distributions.md +++ b/docs/hadoop-third-party-distributions.md @@ -11,7 +11,7 @@ with these distributions: When compiling Spark, you'll need to specify the Hadoop version by defining the `hadoop.version` property. For certain versions, you will need to specify additional profiles. For more detail, -see the guide on [building with maven](building-with-maven.html#specifying-the-hadoop-version): +see the guide on [building with maven](building-spark.html#specifying-the-hadoop-version): mvn -Dhadoop.version=1.0.4 -DskipTests clean package mvn -Phadoop-2.2 -Dhadoop.version=2.2.0 -DskipTests clean package diff --git a/docs/index.md b/docs/index.md index 7fe6b43d32af7..e8ebadbd4e427 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,7 +12,7 @@ It also supports a rich set of higher-level tools including [Spark SQL](sql-prog Get Spark from the [downloads page](http://spark.apache.org/downloads.html) of the project website. This documentation is for Spark version {{site.SPARK_VERSION}}. The downloads page contains Spark packages for many popular HDFS versions. If you'd like to build Spark from -scratch, visit [building Spark with Maven](building-with-maven.html). +scratch, visit [Building Spark](building-spark.html). Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). 
It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, @@ -105,7 +105,7 @@ options for deployment: * [3rd Party Hadoop Distributions](hadoop-third-party-distributions.html): using common Hadoop distributions * Integration with other storage systems: * [OpenStack Swift](storage-openstack-swift.html) -* [Building Spark with Maven](building-with-maven.html): build Spark using the Maven system +* [Building Spark](building-spark.html): build Spark using the Maven system * [Contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) **External Resources:** diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 212248bcce1c1..74bcc2eeb65f6 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -11,7 +11,7 @@ was added to Spark in version 0.6.0, and improved in subsequent releases. Running Spark-on-YARN requires a binary distribution of Spark which is built with YARN support. Binary distributions can be downloaded from the Spark project website. -To build Spark yourself, refer to the [building with Maven guide](building-with-maven.html). +To build Spark yourself, refer to [Building Spark](building-spark.html). # Configuration diff --git a/docs/streaming-kinesis-integration.md b/docs/streaming-kinesis-integration.md index c6090d9ec30c7..379eb513d521e 100644 --- a/docs/streaming-kinesis-integration.md +++ b/docs/streaming-kinesis-integration.md @@ -108,7 +108,7 @@ A Kinesis stream can be set up at one of the valid Kinesis endpoints with 1 or m #### Running the Example To run the example, -- Download Spark source and follow the [instructions](building-with-maven.html) to build Spark with profile *-Pkinesis-asl*. +- Download Spark source and follow the [instructions](building-spark.html) to build Spark with profile *-Pkinesis-asl*. mvn -Pkinesis-asl -DskipTests clean package diff --git a/make-distribution.sh b/make-distribution.sh index 9b012b9222db4..884659954a491 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -40,7 +40,7 @@ function exit_with_usage { echo "" echo "usage:" echo "./make-distribution.sh [--name] [--tgz] [--with-tachyon] " - echo "See Spark's \"Building with Maven\" doc for correct Maven options." + echo "See Spark's \"Building Spark\" doc for correct Maven options." echo "" exit 1 } From 7b8008f5a4d413b61aa88fbc60959e98e59f17dd Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Tue, 16 Sep 2014 09:21:03 -0700 Subject: [PATCH 06/24] [SPARK-2182] Scalastyle rule blocking non ascii characters. ...erators. Author: Prashant Sharma Closes #2358 from ScrapCodes/scalastyle-unicode and squashes the following commits: 12a20f2 [Prashant Sharma] [SPARK-2182] Scalastyle rule blocking (non keyboard typeable) unicode operators. --- .../scalastyle/NonASCIICharacterChecker.scala | 39 +++++++++++++++++++ scalastyle-config.xml | 1 + 2 files changed, 40 insertions(+) create mode 100644 project/spark-style/src/main/scala/org/apache/spark/scalastyle/NonASCIICharacterChecker.scala diff --git a/project/spark-style/src/main/scala/org/apache/spark/scalastyle/NonASCIICharacterChecker.scala b/project/spark-style/src/main/scala/org/apache/spark/scalastyle/NonASCIICharacterChecker.scala new file mode 100644 index 0000000000000..3d43c35299555 --- /dev/null +++ b/project/spark-style/src/main/scala/org/apache/spark/scalastyle/NonASCIICharacterChecker.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.spark.scalastyle + +import java.util.regex.Pattern + +import org.scalastyle.{PositionError, ScalariformChecker, ScalastyleError} + +import scalariform.lexer.Token +import scalariform.parser.CompilationUnit + +class NonASCIICharacterChecker extends ScalariformChecker { + val errorKey: String = "non.ascii.character.disallowed" + + override def verify(ast: CompilationUnit): List[ScalastyleError] = { + ast.tokens.filter(hasNonAsciiChars).map(x => PositionError(x.offset)).toList + } + + private def hasNonAsciiChars(x: Token) = + x.rawText.trim.nonEmpty && !Pattern.compile( """\p{ASCII}+""", Pattern.DOTALL) + .matcher(x.text.trim).matches() + +} diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 76ba1ecca33ab..c54f8b72ebf42 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -140,5 +140,6 @@ + From 86d253ec4e2ed94c68687d575f9e2dfbb44463e1 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Tue, 16 Sep 2014 11:21:30 -0700 Subject: [PATCH 07/24] [SPARK-3527] [SQL] Strip the string message Author: Cheng Hao Closes #2392 from chenghao-intel/trim and squashes the following commits: e52024f [Cheng Hao] trim the string message --- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index c551c7c9877e8..7dbaf7faff0c0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -414,7 +414,7 @@ class SQLContext(@transient val sparkContext: SparkContext) def simpleString: String = s"""== Physical Plan == |${stringOrError(executedPlan)} - """ + """.stripMargin.trim override def toString: String = // TODO previously will output RDD details by run (${stringOrError(toRdd.toDebugString)}) From 9d5fa763d8559ac412a18d7a2f43c4368a0af897 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Tue, 16 Sep 2014 11:39:57 -0700 Subject: [PATCH 08/24] [SPARK-3519] add distinct(n) to PySpark Added missing rdd.distinct(numPartitions) and associated tests Author: Matthew Farrellee Closes #2383 from mattf/SPARK-3519 and squashes the following commits: 30b837a [Matthew Farrellee] Combine test cases to save on JVM startups 6bc4a2c [Matthew Farrellee] [SPARK-3519] add distinct(n) to SchemaRDD in PySpark 7a17f2b [Matthew Farrellee] [SPARK-3519] add distinct(n) to PySpark --- python/pyspark/rdd.py | 4 ++-- python/pyspark/sql.py | 7 +++++-- python/pyspark/tests.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 21f182b0ff137..cb09c191bed71 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -301,7 +301,7 @@ def 
func(iterator): return ifilter(f, iterator) return self.mapPartitions(func, True) - def distinct(self): + def distinct(self, numPartitions=None): """ Return a new RDD containing the distinct elements in this RDD. @@ -309,7 +309,7 @@ def distinct(self): [1, 2, 3] """ return self.map(lambda x: (x, None)) \ - .reduceByKey(lambda x, _: x) \ + .reduceByKey(lambda x, _: x, numPartitions) \ .map(lambda (x, _): x) def sample(self, withReplacement, fraction, seed=None): diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index fc9310fef318c..eac55cbe15193 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -1694,8 +1694,11 @@ def coalesce(self, numPartitions, shuffle=False): rdd = self._jschema_rdd.coalesce(numPartitions, shuffle) return SchemaRDD(rdd, self.sql_ctx) - def distinct(self): - rdd = self._jschema_rdd.distinct() + def distinct(self, numPartitions=None): + if numPartitions is None: + rdd = self._jschema_rdd.distinct() + else: + rdd = self._jschema_rdd.distinct(numPartitions) return SchemaRDD(rdd, self.sql_ctx) def intersection(self, other): diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index f255b44359fec..0b3854347ad2e 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -587,6 +587,14 @@ def test_repartitionAndSortWithinPartitions(self): self.assertEquals(partitions[0], [(0, 5), (0, 8), (2, 6)]) self.assertEquals(partitions[1], [(1, 3), (3, 8), (3, 8)]) + def test_distinct(self): + rdd = self.sc.parallelize((1, 2, 3)*10, 10) + self.assertEquals(rdd.getNumPartitions(), 10) + self.assertEquals(rdd.distinct().count(), 3) + result = rdd.distinct(5) + self.assertEquals(result.getNumPartitions(), 5) + self.assertEquals(result.count(), 3) + class TestSQL(PySparkTestCase): @@ -636,6 +644,15 @@ def test_basic_functions(self): srdd.count() srdd.collect() + def test_distinct(self): + rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10, 10) + srdd = self.sqlCtx.jsonRDD(rdd) + self.assertEquals(srdd.getNumPartitions(), 10) + self.assertEquals(srdd.distinct().count(), 3) + result = srdd.distinct(5) + self.assertEquals(result.getNumPartitions(), 5) + self.assertEquals(result.count(), 3) + class TestIO(PySparkTestCase): From 7583699873fb4f252c6ce65db1096783ef438731 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 16 Sep 2014 11:40:28 -0700 Subject: [PATCH 09/24] [SPARK-3308][SQL] Ability to read JSON Arrays as tables This PR aims to support reading top level JSON arrays and take every element in such an array as a row (an empty array will not generate a row). JIRA: https://issues.apache.org/jira/browse/SPARK-3308 Author: Yin Huai Closes #2400 from yhuai/SPARK-3308 and squashes the following commits: 990077a [Yin Huai] Handle top level JSON arrays. --- .../org/apache/spark/sql/json/JsonRDD.scala | 10 +++++++--- .../org/apache/spark/sql/json/JsonSuite.scala | 17 +++++++++++++++++ .../apache/spark/sql/json/TestJsonData.scala | 7 +++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index 873221835daf8..0f27fd13e7379 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -287,9 +287,13 @@ private[sql] object JsonRDD extends Logging { // the ObjectMapper will take the last value associated with this duplicate key. // For example: for {"key": 1, "key":2}, we will get "key"->2. 
val mapper = new ObjectMapper() - iter.map { record => - val parsed = scalafy(mapper.readValue(record, classOf[java.util.Map[String, Any]])) - parsed.asInstanceOf[Map[String, Any]] + iter.flatMap { record => + val parsed = mapper.readValue(record, classOf[Object]) match { + case map: java.util.Map[_, _] => scalafy(map).asInstanceOf[Map[String, Any]] :: Nil + case list: java.util.List[_] => scalafy(list).asInstanceOf[Seq[Map[String, Any]]] + } + + parsed } }) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index b50d93855405a..685e788207725 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -622,4 +622,21 @@ class JsonSuite extends QueryTest { ("str1", Nil, "str4", 2) :: Nil ) } + + test("SPARK-3308 Read top level JSON arrays") { + val jsonSchemaRDD = jsonRDD(jsonArray) + jsonSchemaRDD.registerTempTable("jsonTable") + + checkAnswer( + sql( + """ + |select a, b, c + |from jsonTable + """.stripMargin), + ("str_a_1", null, null) :: + ("str_a_2", null, null) :: + (null, "str_b_3", null) :: + ("str_a_4", "str_b_4", "str_c_4") ::Nil + ) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala index 5f0b3959a63ad..fc833b8b54e4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala @@ -136,4 +136,11 @@ object TestJsonData { ] ]] }""" :: Nil) + + val jsonArray = + TestSQLContext.sparkContext.parallelize( + """[{"a":"str_a_1"}]""" :: + """[{"a":"str_a_2"}, {"b":"str_b_3"}]""" :: + """{"b":"str_b_4", "a":"str_a_4", "c":"str_c_4"}""" :: + """[]""" :: Nil) } From 30f288ae34a67307aa45b7aecbd0d02a0a14fe69 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 16 Sep 2014 11:42:26 -0700 Subject: [PATCH 10/24] [SPARK-2890][SQL] Allow reading of data when case insensitive resolution could cause possible ambiguity. Throwing an error in the constructor makes it possible to run queries, even when there is no actual ambiguity. Remove this check in favor of throwing an error in analysis when they query is actually is ambiguous. Also took the opportunity to add test cases that would have caught a subtle bug in my first attempt at fixing this and refactor some other test code. Author: Michael Armbrust Closes #2209 from marmbrus/sameNameStruct and squashes the following commits: 729cca4 [Michael Armbrust] Better tests. a003aeb [Michael Armbrust] Remove error (it'll be caught in analysis). 
--- .../spark/sql/catalyst/types/dataTypes.scala | 4 -- .../sql/hive/execution/HiveUdfSuite.scala | 67 ++++++++++++------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala index 70c6d06cf2534..49520b7678e90 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/dataTypes.scala @@ -308,13 +308,9 @@ case class StructField(name: String, dataType: DataType, nullable: Boolean) { object StructType { protected[sql] def fromAttributes(attributes: Seq[Attribute]): StructType = StructType(attributes.map(a => StructField(a.name, a.dataType, a.nullable))) - - private def validateFields(fields: Seq[StructField]): Boolean = - fields.map(field => field.name).distinct.size == fields.size } case class StructType(fields: Seq[StructField]) extends DataType { - require(StructType.validateFields(fields), "Found fields with the same name.") /** * Returns all field names in a [[Seq]]. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala index b6b8592344ef5..cc125d539c3c2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUdfSuite.scala @@ -17,47 +17,68 @@ package org.apache.spark.sql.hive.execution -import org.apache.spark.sql.hive.test.TestHive -import org.apache.hadoop.conf.Configuration -import org.apache.spark.SparkContext._ +import java.io.{DataOutput, DataInput} import java.util -import org.apache.hadoop.fs.{FileSystem, Path} +import java.util.Properties + +import org.apache.spark.util.Utils + +import scala.collection.JavaConversions._ + +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.serde2.{SerDeStats, AbstractSerDe} -import org.apache.hadoop.io.{NullWritable, Writable} +import org.apache.hadoop.io.Writable import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorFactory, ObjectInspector} -import java.util.Properties + import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory -import scala.collection.JavaConversions._ -import java.io.{DataOutput, DataInput} import org.apache.hadoop.hive.ql.udf.generic.GenericUDF import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject +import org.apache.spark.sql.Row +import org.apache.spark.sql.hive.test.TestHive +import org.apache.spark.sql.hive.test.TestHive._ + +case class Fields(f1: Int, f2: Int, f3: Int, f4: Int, f5: Int) + /** * A test suite for Hive custom UDFs. 
*/ class HiveUdfSuite extends HiveComparisonTest { - TestHive.sql( - """ + test("spark sql udf test that returns a struct") { + registerFunction("getStruct", (_: Int) => Fields(1, 2, 3, 4, 5)) + assert(sql( + """ + |SELECT getStruct(1).f1, + | getStruct(1).f2, + | getStruct(1).f3, + | getStruct(1).f4, + | getStruct(1).f5 FROM src LIMIT 1 + """.stripMargin).first() === Row(1, 2, 3, 4, 5)) + } + + test("hive struct udf") { + sql( + """ |CREATE EXTERNAL TABLE hiveUdfTestTable ( | pair STRUCT |) |PARTITIONED BY (partition STRING) |ROW FORMAT SERDE '%s' |STORED AS SEQUENCEFILE - """.stripMargin.format(classOf[PairSerDe].getName) - ) - - TestHive.sql( - "ALTER TABLE hiveUdfTestTable ADD IF NOT EXISTS PARTITION(partition='testUdf') LOCATION '%s'" - .format(this.getClass.getClassLoader.getResource("data/files/testUdf").getFile) - ) - - TestHive.sql("CREATE TEMPORARY FUNCTION testUdf AS '%s'".format(classOf[PairUdf].getName)) - - TestHive.sql("SELECT testUdf(pair) FROM hiveUdfTestTable") - - TestHive.sql("DROP TEMPORARY FUNCTION IF EXISTS testUdf") + """. + stripMargin.format(classOf[PairSerDe].getName)) + + val location = Utils.getSparkClassLoader.getResource("data/files/testUdf").getFile + sql(s""" + ALTER TABLE hiveUdfTestTable + ADD IF NOT EXISTS PARTITION(partition='testUdf') + LOCATION '$location'""") + + sql(s"CREATE TEMPORARY FUNCTION testUdf AS '${classOf[PairUdf].getName}'") + sql("SELECT testUdf(pair) FROM hiveUdfTestTable") + sql("DROP TEMPORARY FUNCTION IF EXISTS testUdf") + } } class TestPair(x: Int, y: Int) extends Writable with Serializable { From 8e7ae477ba40a064d27cf149aa211ff6108fe239 Mon Sep 17 00:00:00 2001 From: Aaron Staple Date: Tue, 16 Sep 2014 11:45:35 -0700 Subject: [PATCH 11/24] [SPARK-2314][SQL] Override collect and take in python library, and count in java library, with optimized versions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SchemaRDD overrides RDD functions, including collect, count, and take, with optimized versions making use of the query optimizer. The java and python interface classes wrapping SchemaRDD need to ensure the optimized versions are called as well. This patch overrides relevant calls in the python and java interfaces with optimized versions. Adds a new Row serialization pathway between python and java, based on JList[Array[Byte]] versus the existing RDD[Array[Byte]]. I wasn’t overjoyed about doing this, but I noticed that some QueryPlans implement optimizations in executeCollect(), which outputs an Array[Row] rather than the typical RDD[Row] that can be shipped to python using the existing serialization code. To me it made sense to ship the Array[Row] over to python directly instead of converting it back to an RDD[Row] just for the purpose of sending the Rows to python using the existing serialization code. Author: Aaron Staple Closes #1592 from staple/SPARK-2314 and squashes the following commits: 89ff550 [Aaron Staple] Merge with master. 6bb7b6c [Aaron Staple] Fix typo. b56d0ac [Aaron Staple] [SPARK-2314][SQL] Override count in JavaSchemaRDD, forwarding to SchemaRDD's count. 0fc9d40 [Aaron Staple] Fix comment typos. f03cdfa [Aaron Staple] [SPARK-2314][SQL] Override collect and take in sql.py, forwarding to SchemaRDD's collect. 
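A minimal PySpark usage sketch of the code paths this change optimizes (not part of the patch; it assumes a local master and a build that includes this commit, and the sample rows and app name are illustrative only):

    from pyspark import SparkContext
    from pyspark.sql import SQLContext, Row

    sc = SparkContext("local", "schemardd-take-collect-sketch")
    sqlCtx = SQLContext(sc)
    srdd = sqlCtx.inferSchema(
        sc.parallelize([Row(field1=i, field2="row%d" % i) for i in range(1, 4)]))

    # take() now forwards to limit(2).collect(), so only two rows are computed
    # and shipped to Python instead of materializing the whole result first.
    print(srdd.take(2))

    # collect() now goes through SchemaRDD.collectToPython(), pickling the
    # optimizer's Array[Row] result for Python directly.
    print(srdd.collect())

    sc.stop()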
--- .../apache/spark/api/python/PythonRDD.scala | 2 +- python/pyspark/sql.py | 47 +++++++++++++++++-- .../org/apache/spark/sql/SchemaRDD.scala | 37 ++++++++++----- .../spark/sql/api/java/JavaSchemaRDD.scala | 2 + 4 files changed, 71 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index d5002fa02992b..12b345a8fa7c3 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -776,7 +776,7 @@ private[spark] object PythonRDD extends Logging { } /** - * Convert and RDD of Java objects to and RDD of serialized Python objects, that is usable by + * Convert an RDD of Java objects to an RDD of serialized Python objects, that is usable by * PySpark. */ def javaToPython(jRDD: JavaRDD[Any]): JavaRDD[Array[Byte]] = { diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index eac55cbe15193..621a556ec6356 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -30,6 +30,7 @@ from pyspark.rdd import RDD, PipelinedRDD from pyspark.serializers import BatchedSerializer, PickleSerializer, CloudPickleSerializer from pyspark.storagelevel import StorageLevel +from pyspark.traceback_utils import SCCallSiteSync from itertools import chain, ifilter, imap @@ -1550,6 +1551,18 @@ def id(self): self._id = self._jrdd.id() return self._id + def limit(self, num): + """Limit the result count to the number specified. + + >>> srdd = sqlCtx.inferSchema(rdd) + >>> srdd.limit(2).collect() + [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')] + >>> srdd.limit(0).collect() + [] + """ + rdd = self._jschema_rdd.baseSchemaRDD().limit(num).toJavaSchemaRDD() + return SchemaRDD(rdd, self.sql_ctx) + def saveAsParquetFile(self, path): """Save the contents as a Parquet file, preserving the schema. @@ -1626,15 +1639,39 @@ def count(self): return self._jschema_rdd.count() def collect(self): - """ - Return a list that contains all of the rows in this RDD. + """Return a list that contains all of the rows in this RDD. - Each object in the list is on Row, the fields can be accessed as + Each object in the list is a Row, the fields can be accessed as attributes. + + Unlike the base RDD implementation of collect, this implementation + leverages the query optimizer to perform a collect on the SchemaRDD, + which supports features such as filter pushdown. + + >>> srdd = sqlCtx.inferSchema(rdd) + >>> srdd.collect() + [Row(field1=1, field2=u'row1'), ..., Row(field1=3, field2=u'row3')] """ - rows = RDD.collect(self) + with SCCallSiteSync(self.context) as css: + bytesInJava = self._jschema_rdd.baseSchemaRDD().collectToPython().iterator() cls = _create_cls(self.schema()) - return map(cls, rows) + return map(cls, self._collect_iterator_through_file(bytesInJava)) + + def take(self, num): + """Take the first num rows of the RDD. + + Each object in the list is a Row, the fields can be accessed as + attributes. + + Unlike the base RDD implementation of take, this implementation + leverages the query optimizer to perform a collect on a SchemaRDD, + which supports features such as filter pushdown. + + >>> srdd = sqlCtx.inferSchema(rdd) + >>> srdd.take(2) + [Row(field1=1, field2=u'row1'), Row(field1=2, field2=u'row2')] + """ + return self.limit(num).collect() # Convert each object in the RDD to a Row with the right class # for this SchemaRDD, so that fields can be accessed as attributes. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index d2ceb4a2b0b25..3bc5dce095511 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -377,15 +377,15 @@ class SchemaRDD( def toJavaSchemaRDD: JavaSchemaRDD = new JavaSchemaRDD(sqlContext, logicalPlan) /** - * Converts a JavaRDD to a PythonRDD. It is used by pyspark. + * Helper for converting a Row to a simple Array suitable for pyspark serialization. */ - private[sql] def javaToPython: JavaRDD[Array[Byte]] = { + private def rowToJArray(row: Row, structType: StructType): Array[Any] = { import scala.collection.Map def toJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { case (null, _) => null - case (obj: Row, struct: StructType) => rowToArray(obj, struct) + case (obj: Row, struct: StructType) => rowToJArray(obj, struct) case (seq: Seq[Any], array: ArrayType) => seq.map(x => toJava(x, array.elementType)).asJava @@ -402,22 +402,37 @@ class SchemaRDD( case (other, _) => other } - def rowToArray(row: Row, structType: StructType): Array[Any] = { - val fields = structType.fields.map(field => field.dataType) - row.zip(fields).map { - case (obj, dataType) => toJava(obj, dataType) - }.toArray - } + val fields = structType.fields.map(field => field.dataType) + row.zip(fields).map { + case (obj, dataType) => toJava(obj, dataType) + }.toArray + } + /** + * Converts a JavaRDD to a PythonRDD. It is used by pyspark. + */ + private[sql] def javaToPython: JavaRDD[Array[Byte]] = { val rowSchema = StructType.fromAttributes(this.queryExecution.analyzed.output) this.mapPartitions { iter => val pickle = new Pickler iter.map { row => - rowToArray(row, rowSchema) + rowToJArray(row, rowSchema) }.grouped(100).map(batched => pickle.dumps(batched.toArray)) } } + /** + * Serializes the Array[Row] returned by SchemaRDD's optimized collect(), using the same + * format as javaToPython. It is used by pyspark. + */ + private[sql] def collectToPython: JList[Array[Byte]] = { + val rowSchema = StructType.fromAttributes(this.queryExecution.analyzed.output) + val pickle = new Pickler + new java.util.ArrayList(collect().map { row => + rowToJArray(row, rowSchema) + }.grouped(100).map(batched => pickle.dumps(batched.toArray)).toIterable) + } + /** * Creates SchemaRDD by applying own schema to derived RDD. Typically used to wrap return value * of base RDD functions that do not change schema. 
@@ -433,7 +448,7 @@ class SchemaRDD( } // ======================================================================= - // Overriden RDD actions + // Overridden RDD actions // ======================================================================= override def collect(): Array[Row] = queryExecution.executedPlan.executeCollect() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala index 4d799b4038fdd..e7faba0c7f620 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSchemaRDD.scala @@ -112,6 +112,8 @@ class JavaSchemaRDD( new java.util.ArrayList(arr) } + override def count(): Long = baseSchemaRDD.count + override def take(num: Int): JList[Row] = { import scala.collection.JavaConversions._ val arr: java.util.Collection[Row] = baseSchemaRDD.take(num).toSeq.map(new Row(_)) From df90e81fd383c0d89dee6db16d5520def9190c56 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Tue, 16 Sep 2014 11:48:20 -0700 Subject: [PATCH 12/24] [Docs] minor punctuation fix Author: Nicholas Chammas Closes #2414 from nchammas/patch-1 and squashes the following commits: 14664bf [Nicholas Chammas] [Docs] minor punctuation fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b05bbfb5a594c..8dd8b70696aa2 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ can be run using: ./dev/run-tests Please see the guidance on how to -[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting) +[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting). ## A Note About Hadoop Versions From 84073eb1172dc959936149265378f6e24d303685 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 16 Sep 2014 11:51:46 -0700 Subject: [PATCH 13/24] [SQL][DOCS] Improve section on thrift-server Taken from liancheng's updates. Merged conflicts with #2316. Author: Michael Armbrust Closes #2384 from marmbrus/sqlDocUpdate and squashes the following commits: 2db6319 [Michael Armbrust] @liancheng's updates --- docs/sql-programming-guide.md | 58 ++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 8d41fdec699e9..c498b41c43380 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -128,7 +128,7 @@ feature parity with a HiveContext. -The specific variant of SQL that is used to parse queries can also be selected using the +The specific variant of SQL that is used to parse queries can also be selected using the `spark.sql.dialect` option. This parameter can be changed using either the `setConf` method on a SQLContext or by using a `SET key=value` command in SQL. For a SQLContext, the only dialect available is "sql" which uses a simple SQL parser provided by Spark SQL. In a HiveContext, the @@ -139,7 +139,7 @@ default is "hiveql", though "sql" is also available. Since the HiveQL parser is Spark SQL supports operating on a variety of data sources through the `SchemaRDD` interface. A SchemaRDD can be operated on as normal RDDs and can also be registered as a temporary table. -Registering a SchemaRDD as a table allows you to run SQL queries over its data. 
This section +Registering a SchemaRDD as a table allows you to run SQL queries over its data. This section describes the various methods for loading data into a SchemaRDD. ## RDDs @@ -152,7 +152,7 @@ while writing your Spark application. The second method for creating SchemaRDDs is through a programmatic interface that allows you to construct a schema and then apply it to an existing RDD. While this method is more verbose, it allows you to construct SchemaRDDs when the columns and their types are not known until runtime. - + ### Inferring the Schema Using Reflection
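As a quick illustration of the reflection-based path described above, here is a minimal PySpark sketch. It assumes an existing SparkContext `sc`; the sample data and table name are made up, and the `inferSchema` / `registerRDDAsTable` calls follow the pyspark.sql API referenced later in this patch series, so treat it as a sketch rather than copied documentation.

```
# Minimal sketch of reflection-based schema inference in PySpark.
# Assumes an existing SparkContext `sc`; data and table name are illustrative.
from pyspark.sql import SQLContext, Row

sqlContext = SQLContext(sc)

# Spark SQL peeks at the first Row to infer field names and types.
people = sc.parallelize([Row(name="Alice", age=13), Row(name="Bob", age=25)])
schemaPeople = sqlContext.inferSchema(people)

# Registering the SchemaRDD as a table lets us run SQL over its data.
sqlContext.registerRDDAsTable(schemaPeople, "people")
teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
print(teenagers.collect())
```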
    @@ -193,7 +193,7 @@ teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
    Spark SQL supports automatically converting an RDD of [JavaBeans](http://stackoverflow.com/questions/3295496/what-is-a-javabean-exactly) -into a Schema RDD. The BeanInfo, obtained using reflection, defines the schema of the table. +into a Schema RDD. The BeanInfo, obtained using reflection, defines the schema of the table. Currently, Spark SQL does not support JavaBeans that contain nested or contain complex types such as Lists or Arrays. You can create a JavaBean by creating a class that implements Serializable and has getters and setters for all of its fields. @@ -480,7 +480,7 @@ for name in names.collect(): [Parquet](http://parquet.io) is a columnar format that is supported by many other data processing systems. Spark SQL provides support for both reading and writing Parquet files that automatically preserves the schema -of the original data. +of the original data. ### Loading Data Programmatically @@ -562,7 +562,7 @@ for teenName in teenNames.collect():
    -
    + ### Configuration @@ -808,7 +808,7 @@ memory usage and GC pressure. You can call `uncacheTable("tableName")` to remove Note that if you call `cache` rather than `cacheTable`, tables will _not_ be cached using the in-memory columnar format, and therefore `cacheTable` is strongly recommended for this use case. -Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running +Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running `SET key=value` commands using SQL. @@ -881,10 +881,32 @@ To start the JDBC server, run the following in the Spark directory: ./sbin/start-thriftserver.sh -The default port the server listens on is 10000. To listen on customized host and port, please set -the `HIVE_SERVER2_THRIFT_PORT` and `HIVE_SERVER2_THRIFT_BIND_HOST` environment variables. You may -run `./sbin/start-thriftserver.sh --help` for a complete list of all available options. Now you can -use beeline to test the Thrift JDBC server: +This script accepts all `bin/spark-submit` command line options, plus a `--hiveconf` option to +specify Hive properties. You may run `./sbin/start-thriftserver.sh --help` for a complete list of +all available options. By default, the server listens on localhost:10000. You may override this +bahaviour via either environment variables, i.e.: + +{% highlight bash %} +export HIVE_SERVER2_THRIFT_PORT= +export HIVE_SERVER2_THRIFT_BIND_HOST= +./sbin/start-thriftserver.sh \ + --master \ + ... +``` +{% endhighlight %} + +or system properties: + +{% highlight bash %} +./sbin/start-thriftserver.sh \ + --hiveconf hive.server2.thrift.port= \ + --hiveconf hive.server2.thrift.bind.host= \ + --master + ... +``` +{% endhighlight %} + +Now you can use beeline to test the Thrift JDBC server: ./bin/beeline @@ -930,7 +952,7 @@ SQL deprecates this property in favor of `spark.sql.shuffle.partitions`, whose d is 200. Users may customize this property via `SET`: SET spark.sql.shuffle.partitions=10; - SELECT page, count(*) c + SELECT page, count(*) c FROM logs_last_month_cached GROUP BY page ORDER BY c DESC LIMIT 10; @@ -1139,7 +1161,7 @@ evaluated by the SQL execution engine. A full list of the functions supported c
    All data types of Spark SQL are located in the package `org.apache.spark.sql`. -You can access them by doing +You can access them by doing {% highlight scala %} import org.apache.spark.sql._ {% endhighlight %} @@ -1245,7 +1267,7 @@ import org.apache.spark.sql._
    -
    StructType org.apache.spark.sql.Row + StructType(fields)
    Note: fields is a Seq of StructFields. Also, two fields with the same name are not allowed. @@ -1267,7 +1289,7 @@ import org.apache.spark.sql._ All data types of Spark SQL are located in the package of `org.apache.spark.sql.api.java`. To access or create a data type, -please use factory methods provided in +please use factory methods provided in `org.apache.spark.sql.api.java.DataType`. @@ -1373,7 +1395,7 @@ please use factory methods provided in - - - + From 7d1a37239c50394025d9f16acf5dcd05cfbe7250 Mon Sep 17 00:00:00 2001 From: chesterxgchen Date: Wed, 17 Sep 2014 10:25:52 -0500 Subject: [PATCH 21/24] SPARK-3177 (on Master Branch) The JIRA and PR was original created for branch-1.1, and move to master branch now. Chester The Issue is due to that yarn-alpha and yarn have different APIs for certain class fields. In this particular case, the ClientBase using reflection to to address this issue, and we need to different way to test the ClientBase's method. Original ClientBaseSuite using getFieldValue() method to do this. But it doesn't work for yarn-alpha as the API returns an array of String instead of just String (which is the case for Yarn-stable API). To fix the test, I add a new method def getFieldValue2[A: ClassTag, A1: ClassTag, B](clazz: Class[_], field: String, defaults: => B) (mapTo: A => B)(mapTo1: A1 => B) : B = Try(clazz.getField(field)).map(_.get(null)).map { case v: A => mapTo(v) case v1: A1 => mapTo1(v1) case _ => defaults }.toOption.getOrElse(defaults) to handle the cases where the field type can be either type A or A1. In this new method the type A or A1 is pattern matched and corresponding mapTo function (mapTo or mapTo1) is used. Author: chesterxgchen Closes #2204 from chesterxgchen/SPARK-3177-master and squashes the following commits: e72a6ea [chesterxgchen] The Issue is due to that yarn-alpha and yarn have different APIs for certain class fields. In this particular case, the ClientBase using reflection to to address this issue, and we need to different way to test the ClientBase's method. Original ClientBaseSuite using getFieldValue() method to do this. But it doesn't work for yarn-alpha as the API returns an array of String instead of just String (which is the case for Yarn-stable API). 
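The intent of the helper described above is easier to see with a simplified illustration. The following is not the Scala helper from this patch, only the same normalization idea sketched in Python: look a field up reflectively, then map it to a sequence whether it arrives as a single comma-separated string or as an array, falling back to a default when the field is missing. `FakeMRJobConfig` and `get_field_value` are hypothetical names used only for illustration; the real Scala helper additionally relies on ClassTags so the runtime type match works despite erasure.

```
# Python illustration only: mirrors the control flow of getFieldValue2, not the
# actual Scala helper. FakeMRJobConfig is a hypothetical stand-in class.
def get_field_value(cls, field, default):
    value = getattr(cls, field, None)        # reflective field lookup
    if isinstance(value, str):               # yarn-stable style: a single string
        return value.split(",")
    if isinstance(value, (list, tuple)):     # yarn-alpha style: an array of strings
        return list(value)
    return default                           # field missing or unexpected type

class FakeMRJobConfig(object):
    DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH = "/known/path/a,/known/path/b"

print(get_field_value(FakeMRJobConfig,
                      "DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH",
                      []))  # -> ['/known/path/a', '/known/path/b']
```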
--- .../spark/deploy/yarn/ClientBaseSuite.scala | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/yarn/common/src/test/scala/org/apache/spark/deploy/yarn/ClientBaseSuite.scala b/yarn/common/src/test/scala/org/apache/spark/deploy/yarn/ClientBaseSuite.scala index 5480eca7c832c..c3b7a2c8f02e5 100644 --- a/yarn/common/src/test/scala/org/apache/spark/deploy/yarn/ClientBaseSuite.scala +++ b/yarn/common/src/test/scala/org/apache/spark/deploy/yarn/ClientBaseSuite.scala @@ -38,6 +38,7 @@ import org.scalatest.Matchers import scala.collection.JavaConversions._ import scala.collection.mutable.{ HashMap => MutableHashMap } +import scala.reflect.ClassTag import scala.util.Try import org.apache.spark.{SparkException, SparkConf} @@ -200,9 +201,10 @@ class ClientBaseSuite extends FunSuite with Matchers { val knownDefMRAppCP: Seq[String] = - getFieldValue[String, Seq[String]](classOf[MRJobConfig], - "DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH", - Seq[String]())(a => a.split(",")) + getFieldValue2[String, Array[String], Seq[String]]( + classOf[MRJobConfig], + "DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH", + Seq[String]())(a => a.split(","))(a => a.toSeq) val knownYARNAppCP = Some(Seq("/known/yarn/path")) @@ -232,6 +234,17 @@ class ClientBaseSuite extends FunSuite with Matchers { def getFieldValue[A, B](clazz: Class[_], field: String, defaults: => B)(mapTo: A => B): B = Try(clazz.getField(field)).map(_.get(null).asInstanceOf[A]).toOption.map(mapTo).getOrElse(defaults) + def getFieldValue2[A: ClassTag, A1: ClassTag, B]( + clazz: Class[_], + field: String, + defaults: => B)(mapTo: A => B)(mapTo1: A1 => B) : B = { + Try(clazz.getField(field)).map(_.get(null)).map { + case v: A => mapTo(v) + case v1: A1 => mapTo1(v1) + case _ => defaults + }.toOption.getOrElse(defaults) + } + private class DummyClient( val args: ClientArguments, val conf: Configuration, From 8fbd5f4a90f92e064aa057adbd3f8c58dd0087fa Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 17 Sep 2014 12:33:09 -0700 Subject: [PATCH 22/24] [Docs] minor grammar fix Author: Nicholas Chammas Closes #2430 from nchammas/patch-2 and squashes the following commits: d476bfb [Nicholas Chammas] [Docs] minor grammar fix --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c6b4aa5344757..b6c6b050fa331 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,5 +8,5 @@ submitting any copyrighted material via pull request, email, or other means you agree to license the material under the project's open source license and warrant that you have the legal authority to do so. -Please see [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark) +Please see the [Contributing to Spark wiki page](https://cwiki.apache.org/SPARK/Contributing+to+Spark) for more information. 
From cbf983bb4a550ff26756ed7308fb03db42cffcff Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Wed, 17 Sep 2014 12:41:49 -0700 Subject: [PATCH 23/24] [SQL][DOCS] Improve table caching section Author: Michael Armbrust Closes #2434 from marmbrus/patch-1 and squashes the following commits: 67215be [Michael Armbrust] [SQL][DOCS] Improve table caching section --- docs/sql-programming-guide.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index c498b41c43380..5212e19c41349 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -801,12 +801,12 @@ turning on some experimental options. ## Caching Data In Memory -Spark SQL can cache tables using an in-memory columnar format by calling `cacheTable("tableName")`. +Spark SQL can cache tables using an in-memory columnar format by calling `sqlContext.cacheTable("tableName")`. Then Spark SQL will scan only required columns and will automatically tune compression to minimize -memory usage and GC pressure. You can call `uncacheTable("tableName")` to remove the table from memory. +memory usage and GC pressure. You can call `sqlContext.uncacheTable("tableName")` to remove the table from memory. -Note that if you call `cache` rather than `cacheTable`, tables will _not_ be cached using -the in-memory columnar format, and therefore `cacheTable` is strongly recommended for this use case. +Note that if you call `schemaRDD.cache()` rather than `sqlContext.cacheTable(...)`, tables will _not_ be cached using +the in-memory columnar format, and therefore `sqlContext.cacheTable(...)` is strongly recommended for this use case. Configuration of in-memory caching can be done using the `setConf` method on SQLContext or by running `SET key=value` commands using SQL. From 5044e4953a1744593d83fe90628fb4893e5463f1 Mon Sep 17 00:00:00 2001 From: Nicholas Chammas Date: Wed, 17 Sep 2014 12:44:44 -0700 Subject: [PATCH 24/24] [SPARK-1455] [SPARK-3534] [Build] When possible, run SQL tests only. If the only files changed are related to SQL, then only run the SQL tests. This patch includes some cosmetic/maintainability refactoring. I would be more than happy to undo some of these changes if they are inappropriate. We can accept this patch mostly as-is and address the immediate need documented in [SPARK-3534](https://issues.apache.org/jira/browse/SPARK-3534), or we can keep it open until a satisfactory solution along the lines [discussed here](https://issues.apache.org/jira/browse/SPARK-1455?focusedCommentId=14136424&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-14136424) is reached. Note: I had to hack this patch up to test it locally, so what I'm submitting here and what I tested are technically different. Author: Nicholas Chammas Closes #2420 from nchammas/selective-testing and squashes the following commits: db3fa2d [Nicholas Chammas] diff against master! 
f9e23f6 [Nicholas Chammas] when possible, run SQL tests only --- dev/run-tests | 156 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 106 insertions(+), 50 deletions(-) diff --git a/dev/run-tests b/dev/run-tests index 79401213a7fa2..53148d23f385f 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -21,44 +21,73 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then - if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Dhadoop.version=2.2.0" - elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" +# Remove work directory +rm -rf ./work + +# Build against the right verison of Hadoop. +{ + if [ -n "$AMPLAB_JENKINS_BUILD_PROFILE" ]; then + if [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop1.0" ]; then + export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=1.0.4" + elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.0" ]; then + export SBT_MAVEN_PROFILES_ARGS="-Dhadoop.version=2.0.0-mr1-cdh4.1.1" + elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.2" ]; then + export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Dhadoop.version=2.2.0" + elif [ "$AMPLAB_JENKINS_BUILD_PROFILE" = "hadoop2.3" ]; then + export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" + fi fi -fi -if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then - export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" -fi + if [ -z "$SBT_MAVEN_PROFILES_ARGS" ]; then + export SBT_MAVEN_PROFILES_ARGS="-Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0" + fi +} export SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Pkinesis-asl" -echo "SBT_MAVEN_PROFILES_ARGS=\"$SBT_MAVEN_PROFILES_ARGS\"" - -# Remove work directory -rm -rf ./work - -if test -x "$JAVA_HOME/bin/java"; then - declare java_cmd="$JAVA_HOME/bin/java" -else - declare java_cmd=java -fi -JAVA_VERSION=$($java_cmd -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q') -[ "$JAVA_VERSION" -ge 18 ] && echo "" || echo "[Warn] Java 8 tests will not run because JDK version is < 1.8." +# Determine Java path and version. +{ + if test -x "$JAVA_HOME/bin/java"; then + declare java_cmd="$JAVA_HOME/bin/java" + else + declare java_cmd=java + fi + + # We can't use sed -r -e due to OS X / BSD compatibility; hence, all the parentheses. + JAVA_VERSION=$( + $java_cmd -version 2>&1 \ + | grep -e "^java version" --max-count=1 \ + | sed "s/java version \"\(.*\)\.\(.*\)\.\(.*\)\"/\1\2/" + ) + + if [ "$JAVA_VERSION" -lt 18 ]; then + echo "[warn] Java 8 tests will not run because JDK version is < 1.8." + fi +} -# Partial solution for SPARK-1455. Only run Hive tests if there are sql changes. +# Only run Hive tests if there are sql changes. +# Partial solution for SPARK-1455. if [ -n "$AMPLAB_JENKINS" ]; then git fetch origin master:master - diffs=`git diff --name-only master | grep "^\(sql/\)\|\(bin/spark-sql\)\|\(sbin/start-thriftserver.sh\)"` - if [ -n "$diffs" ]; then - echo "Detected changes in SQL. Will run Hive test suite." 
+ + sql_diffs=$( + git diff --name-only master \ + | grep -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" + ) + + non_sql_diffs=$( + git diff --name-only master \ + | grep -v -e "^sql/" -e "^bin/spark-sql" -e "^sbin/start-thriftserver.sh" + ) + + if [ -n "$sql_diffs" ]; then + echo "[info] Detected changes in SQL. Will run Hive test suite." _RUN_SQL_TESTS=true + + if [ -z "$non_sql_diffs" ]; then + echo "[info] Detected no changes except in SQL. Will only run SQL tests." + _SQL_TESTS_ONLY=true + fi fi fi @@ -70,42 +99,69 @@ echo "" echo "=========================================================================" echo "Running Apache RAT checks" echo "=========================================================================" -dev/check-license +./dev/check-license echo "" echo "=========================================================================" echo "Running Scala style checks" echo "=========================================================================" -dev/lint-scala +./dev/lint-scala echo "" echo "=========================================================================" echo "Running Python style checks" echo "=========================================================================" -dev/lint-python +./dev/lint-python + +echo "" +echo "=========================================================================" +echo "Building Spark" +echo "=========================================================================" + +{ + # We always build with Hive because the PySpark Spark SQL tests need it. + BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive" + + echo "[info] Building Spark with these arguments: $BUILD_MVN_PROFILE_ARGS" + + # NOTE: echo "q" is needed because sbt on encountering a build file with failure + #+ (either resolution or compilation) prompts the user for input either q, r, etc + #+ to quit or retry. This echo is there to make it not block. + # QUESTION: Why doesn't 'yes "q"' work? + # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? + echo -e "q\n" \ + | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean package assembly/assembly \ + | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" +} echo "" echo "=========================================================================" echo "Running Spark unit tests" echo "=========================================================================" -# Build Spark; we always build with Hive because the PySpark Spark SQL tests need it. -# echo "q" is needed because sbt on encountering a build file with failure -# (either resolution or compilation) prompts the user for input either q, r, -# etc to quit or retry. This echo is there to make it not block. -BUILD_MVN_PROFILE_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive " -echo -e "q\n" | sbt/sbt $BUILD_MVN_PROFILE_ARGS clean package assembly/assembly | \ - grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" - -# If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled: -if [ -n "$_RUN_SQL_TESTS" ]; then - SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive" -fi -# echo "q" is needed because sbt on encountering a build file with failure -# (either resolution or compilation) prompts the user for input either q, r, -# etc to quit or retry. This echo is there to make it not block. -echo -e "q\n" | sbt/sbt $SBT_MAVEN_PROFILES_ARGS test | \ - grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" +{ + # If the Spark SQL tests are enabled, run the tests with the Hive profiles enabled. 
+ if [ -n "$_RUN_SQL_TESTS" ]; then + SBT_MAVEN_PROFILES_ARGS="$SBT_MAVEN_PROFILES_ARGS -Phive" + fi + + if [ -n "$_SQL_TESTS_ONLY" ]; then + SBT_MAVEN_TEST_ARGS="catalyst/test sql/test hive/test" + else + SBT_MAVEN_TEST_ARGS="test" + fi + + echo "[info] Running Spark tests with these arguments: $SBT_MAVEN_PROFILES_ARGS $SBT_MAVEN_TEST_ARGS" + + # NOTE: echo "q" is needed because sbt on encountering a build file with failure + #+ (either resolution or compilation) prompts the user for input either q, r, etc + #+ to quit or retry. This echo is there to make it not block. + # QUESTION: Why doesn't 'yes "q"' work? + # QUESTION: Why doesn't 'grep -v -e "^\[info\] Resolving"' work? + echo -e "q\n" \ + | sbt/sbt "$SBT_MAVEN_PROFILES_ARGS" "$SBT_MAVEN_TEST_ARGS" \ + | grep -v -e "info.*Resolving" -e "warn.*Merging" -e "info.*Including" +} echo "" echo "=========================================================================" @@ -117,4 +173,4 @@ echo "" echo "=========================================================================" echo "Detecting binary incompatibilites with MiMa" echo "=========================================================================" -dev/mima +./dev/mima
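For readers who want to trace the new detection logic outside of the shell script, the same classification can be sketched in Python. This is only an illustration of what the grep patterns above compute; it assumes `git` is on the PATH and that `master` has been fetched, exactly as the script requires.

```
# Illustrative Python version of the SQL-only change detection in dev/run-tests.
# Assumes `git` is available and the `master` branch has been fetched.
import subprocess

SQL_PREFIXES = ("sql/", "bin/spark-sql", "sbin/start-thriftserver.sh")

def changed_files(base="master"):
    out = subprocess.check_output(["git", "diff", "--name-only", base])
    if isinstance(out, bytes):
        out = out.decode("utf-8")
    return [line for line in out.splitlines() if line.strip()]

files = changed_files()
sql_changes = [f for f in files if f.startswith(SQL_PREFIXES)]
non_sql_changes = [f for f in files if not f.startswith(SQL_PREFIXES)]

run_sql_tests = bool(sql_changes)                        # corresponds to _RUN_SQL_TESTS
sql_tests_only = run_sql_tests and not non_sql_changes   # corresponds to _SQL_TESTS_ONLY
print(run_sql_tests, sql_tests_only)
```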
    StructType org.apache.spark.sql.api.java + DataType.createStructType(fields)
    Note: fields is a List or an array of StructFields. Also, two fields with the same name are not allowed. @@ -1394,7 +1416,7 @@ please use factory methods provided in
    All data types of Spark SQL are located in the package of `pyspark.sql`. -You can access them by doing +You can access them by doing {% highlight python %} from pyspark.sql import * {% endhighlight %} @@ -1518,7 +1540,7 @@ from pyspark.sql import *
    StructType list or tuple + StructType(fields)
    Note: fields is a Seq of StructFields. Also, two fields with the same name are not allowed. From a9e910430fb6bb4ef1f6ae20761c43b96bb018df Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Tue, 16 Sep 2014 12:41:45 -0700 Subject: [PATCH 14/24] [SPARK-3546] InputStream of ManagedBuffer is not closed and causes running out of file descriptor Author: Kousuke Saruta Closes #2408 from sarutak/resolve-resource-leak-issue and squashes the following commits: 074781d [Kousuke Saruta] Modified SuffleBlockFetcherIterator 5f63f67 [Kousuke Saruta] Move metrics increment logic and debug logging outside try block b37231a [Kousuke Saruta] Modified FileSegmentManagedBuffer#nioByteBuffer to check null or not before invoking channel.close bf29d4a [Kousuke Saruta] Modified FileSegment to close channel --- .../org/apache/spark/network/ManagedBuffer.scala | 12 ++++++++++-- .../spark/storage/ShuffleBlockFetcherIterator.scala | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala b/core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala index dcecb6beeea9b..e990c1da6730f 100644 --- a/core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala +++ b/core/src/main/scala/org/apache/spark/network/ManagedBuffer.scala @@ -19,6 +19,7 @@ package org.apache.spark.network import java.io.{FileInputStream, RandomAccessFile, File, InputStream} import java.nio.ByteBuffer +import java.nio.channels.FileChannel import java.nio.channels.FileChannel.MapMode import com.google.common.io.ByteStreams @@ -66,8 +67,15 @@ final class FileSegmentManagedBuffer(val file: File, val offset: Long, val lengt override def size: Long = length override def nioByteBuffer(): ByteBuffer = { - val channel = new RandomAccessFile(file, "r").getChannel - channel.map(MapMode.READ_ONLY, offset, length) + var channel: FileChannel = null + try { + channel = new RandomAccessFile(file, "r").getChannel + channel.map(MapMode.READ_ONLY, offset, length) + } finally { + if (channel != null) { + channel.close() + } + } } override def inputStream(): InputStream = { diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index c8e708aa6b1bc..d868758a7f549 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashSet import scala.collection.mutable.Queue -import org.apache.spark.{TaskContext, Logging, SparkException} +import org.apache.spark.{TaskContext, Logging} import org.apache.spark.network.{ManagedBuffer, BlockFetchingListener, BlockTransferService} import org.apache.spark.serializer.Serializer import org.apache.spark.util.Utils From ec1adecbb72d291d7ef122fb0505bae53116e0e6 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 16 Sep 2014 12:51:58 -0700 Subject: [PATCH 15/24] [SPARK-3430] [PySpark] [Doc] generate PySpark API docs using Sphinx Using Sphinx to generate API docs for PySpark. requirement: Sphinx ``` $ cd python/docs/ $ make html ``` The generated API docs will be located at python/docs/_build/html/index.html It can co-exists with those generated by Epydoc. This is the first working version, after merging in, then we can continue to improve it and replace the epydoc finally. 
Author: Davies Liu Closes #2292 from davies/sphinx and squashes the following commits: 425a3b1 [Davies Liu] cleanup 1573298 [Davies Liu] move docs to python/docs/ 5fe3903 [Davies Liu] Merge branch 'master' into sphinx 9468ab0 [Davies Liu] fix makefile b408f38 [Davies Liu] address all comments e2ccb1b [Davies Liu] update name and version 9081ead [Davies Liu] generate PySpark API docs using Sphinx --- python/docs/Makefile | 179 ++++++++++++++++++ python/docs/conf.py | 332 ++++++++++++++++++++++++++++++++++ python/docs/epytext.py | 27 +++ python/docs/index.rst | 37 ++++ python/docs/make.bat | 242 +++++++++++++++++++++++++ python/docs/modules.rst | 7 + python/docs/pyspark.mllib.rst | 77 ++++++++ python/docs/pyspark.rst | 18 ++ python/docs/pyspark.sql.rst | 10 + python/pyspark/broadcast.py | 3 + python/pyspark/context.py | 2 +- python/pyspark/serializers.py | 3 + python/pyspark/sql.py | 12 +- 13 files changed, 944 insertions(+), 5 deletions(-) create mode 100644 python/docs/Makefile create mode 100644 python/docs/conf.py create mode 100644 python/docs/epytext.py create mode 100644 python/docs/index.rst create mode 100644 python/docs/make.bat create mode 100644 python/docs/modules.rst create mode 100644 python/docs/pyspark.mllib.rst create mode 100644 python/docs/pyspark.rst create mode 100644 python/docs/pyspark.sql.rst diff --git a/python/docs/Makefile b/python/docs/Makefile new file mode 100644 index 0000000000000..8a1324eecd325 --- /dev/null +++ b/python/docs/Makefile @@ -0,0 +1,179 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.8.2.1-src.zip) + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyspark.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyspark.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pyspark" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyspark" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/python/docs/conf.py b/python/docs/conf.py new file mode 100644 index 0000000000000..c368cf81a003b --- /dev/null +++ b/python/docs/conf.py @@ -0,0 +1,332 @@ +# -*- coding: utf-8 -*- +# +# pyspark documentation build configuration file, created by +# sphinx-quickstart on Thu Aug 28 15:17:47 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'epytext', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'PySpark' +copyright = u'2014, Author' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.1' +# The full version, including alpha/beta/rc tags. +release = '' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. 
These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pysparkdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'pyspark.tex', u'pyspark Documentation', + u'Author', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'pyspark', u'pyspark Documentation', + [u'Author'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'pyspark', u'pyspark Documentation', + u'Author', 'pyspark', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. 
+#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + + +# -- Options for Epub output ---------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = u'pyspark' +epub_author = u'Author' +epub_publisher = u'Author' +epub_copyright = u'2014, Author' + +# The basename for the epub file. It defaults to the project name. +#epub_basename = u'pyspark' + +# The HTML theme for the epub output. Since the default themes are not optimized +# for small screen space, using the same theme for HTML and epub output is +# usually not wise. This defaults to 'epub', a theme designed to save visual +# space. +#epub_theme = 'epub' + +# The language of the text. It defaults to the language option +# or en if the language is not set. +#epub_language = '' + +# The scheme of the identifier. Typical schemes are ISBN or URL. +#epub_scheme = '' + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +#epub_identifier = '' + +# A unique identification for the text. +#epub_uid = '' + +# A tuple containing the cover image and cover page html template filenames. +#epub_cover = () + +# A sequence of (type, uri, title) tuples for the guide element of content.opf. +#epub_guide = () + +# HTML files that should be inserted before the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_pre_files = [] + +# HTML files shat should be inserted after the pages created by sphinx. +# The format is a list of tuples containing the path and title. +#epub_post_files = [] + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# The depth of the table of contents in toc.ncx. +#epub_tocdepth = 3 + +# Allow duplicate toc entries. +#epub_tocdup = True + +# Choose between 'default' and 'includehidden'. +#epub_tocscope = 'default' + +# Fix unsupported image types using the PIL. +#epub_fix_images = False + +# Scale large images. +#epub_max_image_width = 0 + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#epub_show_urls = 'inline' + +# If false, no index is generated. +#epub_use_index = True diff --git a/python/docs/epytext.py b/python/docs/epytext.py new file mode 100644 index 0000000000000..61d731bff570d --- /dev/null +++ b/python/docs/epytext.py @@ -0,0 +1,27 @@ +import re + +RULES = ( + (r"<[\w.]+>", r""), + (r"L{([\w.()]+)}", r":class:`\1`"), + (r"[LC]{(\w+\.\w+)\(\)}", r":func:`\1`"), + (r"C{([\w.()]+)}", r":class:`\1`"), + (r"[IBCM]{(.+)}", r"`\1`"), + ('pyspark.rdd.RDD', 'RDD'), +) + +def _convert_epytext(line): + """ + >>> _convert_epytext("L{A}") + :class:`A` + """ + line = line.replace('@', ':') + for p, sub in RULES: + line = re.sub(p, sub, line) + return line + +def _process_docstring(app, what, name, obj, options, lines): + for i in range(len(lines)): + lines[i] = _convert_epytext(lines[i]) + +def setup(app): + app.connect("autodoc-process-docstring", _process_docstring) diff --git a/python/docs/index.rst b/python/docs/index.rst new file mode 100644 index 0000000000000..25b3f9bd93e63 --- /dev/null +++ b/python/docs/index.rst @@ -0,0 +1,37 @@ +.. pyspark documentation master file, created by + sphinx-quickstart on Thu Aug 28 15:17:47 2014. 
+ You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PySpark API reference! +=================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + pyspark + pyspark.sql + pyspark.mllib + + +Core classes: +--------------- + + :class:`pyspark.SparkContext` + + Main entry point for Spark functionality. + + :class:`pyspark.RDD` + + A Resilient Distributed Dataset (RDD), the basic abstraction in Spark. + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/python/docs/make.bat b/python/docs/make.bat new file mode 100644 index 0000000000000..adad44fd7536a --- /dev/null +++ b/python/docs/make.bat @@ -0,0 +1,242 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyspark.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyspark.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. 
The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/python/docs/modules.rst b/python/docs/modules.rst new file mode 100644 index 0000000000000..183564659fbcf --- /dev/null +++ b/python/docs/modules.rst @@ -0,0 +1,7 @@ +. += + +.. toctree:: + :maxdepth: 4 + + pyspark diff --git a/python/docs/pyspark.mllib.rst b/python/docs/pyspark.mllib.rst new file mode 100644 index 0000000000000..e95d19e97f151 --- /dev/null +++ b/python/docs/pyspark.mllib.rst @@ -0,0 +1,77 @@ +pyspark.mllib package +===================== + +Submodules +---------- + +pyspark.mllib.classification module +----------------------------------- + +.. automodule:: pyspark.mllib.classification + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.clustering module +------------------------------- + +.. automodule:: pyspark.mllib.clustering + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.linalg module +--------------------------- + +.. automodule:: pyspark.mllib.linalg + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.random module +--------------------------- + +.. automodule:: pyspark.mllib.random + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.recommendation module +----------------------------------- + +.. automodule:: pyspark.mllib.recommendation + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.regression module +------------------------------- + +.. automodule:: pyspark.mllib.regression + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.stat module +------------------------- + +.. automodule:: pyspark.mllib.stat + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.tree module +------------------------- + +.. automodule:: pyspark.mllib.tree + :members: + :undoc-members: + :show-inheritance: + +pyspark.mllib.util module +------------------------- + +.. automodule:: pyspark.mllib.util + :members: + :undoc-members: + :show-inheritance: diff --git a/python/docs/pyspark.rst b/python/docs/pyspark.rst new file mode 100644 index 0000000000000..a68bd62433085 --- /dev/null +++ b/python/docs/pyspark.rst @@ -0,0 +1,18 @@ +pyspark package +=============== + +Subpackages +----------- + +.. toctree:: + :maxdepth: 1 + + pyspark.mllib + pyspark.sql + +Contents +-------- + +.. automodule:: pyspark + :members: + :undoc-members: diff --git a/python/docs/pyspark.sql.rst b/python/docs/pyspark.sql.rst new file mode 100644 index 0000000000000..65b3650ae10ab --- /dev/null +++ b/python/docs/pyspark.sql.rst @@ -0,0 +1,10 @@ +pyspark.sql module +================== + +Module contents +--------------- + +.. automodule:: pyspark.sql + :members: + :undoc-members: + :show-inheritance: diff --git a/python/pyspark/broadcast.py b/python/pyspark/broadcast.py index 5c7c9cc161dff..f124dc6c07575 100644 --- a/python/pyspark/broadcast.py +++ b/python/pyspark/broadcast.py @@ -78,6 +78,9 @@ def value(self): return self._value def unpersist(self, blocking=False): + """ + Delete cached copies of this broadcast on the executors. 
+ """ self._jbroadcast.unpersist(blocking) os.unlink(self.path) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index a33aae87f65e8..a17f2c1203d36 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -53,7 +53,7 @@ class SparkContext(object): """ Main entry point for Spark functionality. A SparkContext represents the - connection to a Spark cluster, and can be used to create L{RDD}s and + connection to a Spark cluster, and can be used to create L{RDD} and broadcast variables on that cluster. """ diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index ec3c6f055441d..44ac5642836e0 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -110,6 +110,9 @@ def __eq__(self, other): def __ne__(self, other): return not self.__eq__(other) + def __repr__(self): + return "<%s object>" % self.__class__.__name__ + class FramedSerializer(Serializer): diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 621a556ec6356..8f6dbab240c7b 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -289,7 +289,7 @@ class StructType(DataType): """Spark SQL StructType The data type representing rows. - A StructType object comprises a list of L{StructField}s. + A StructType object comprises a list of L{StructField}. """ @@ -904,7 +904,7 @@ class SQLContext(object): """Main entry point for Spark SQL functionality. - A SQLContext can be used create L{SchemaRDD}s, register L{SchemaRDD}s as + A SQLContext can be used create L{SchemaRDD}, register L{SchemaRDD} as tables, execute SQL over tables, cache tables, and read parquet files. """ @@ -994,7 +994,7 @@ def registerFunction(self, name, f, returnType=StringType()): str(returnType)) def inferSchema(self, rdd): - """Infer and apply a schema to an RDD of L{Row}s. + """Infer and apply a schema to an RDD of L{Row}. We peek at the first row of the RDD to determine the fields' names and types. Nested collections are supported, which include array, @@ -1047,7 +1047,7 @@ def inferSchema(self, rdd): def applySchema(self, rdd, schema): """ - Applies the given schema to the given RDD of L{tuple} or L{list}s. + Applies the given schema to the given RDD of L{tuple} or L{list}. These tuples or lists can contain complex nested structures like lists, maps or nested rows. @@ -1183,6 +1183,7 @@ def jsonFile(self, path, schema=None): Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> srdd3 = sqlCtx.jsonFile(jsonFile, srdd1.schema()) >>> sqlCtx.registerRDDAsTable(srdd3, "table2") >>> srdd4 = sqlCtx.sql( @@ -1193,6 +1194,7 @@ def jsonFile(self, path, schema=None): Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22,..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> schema = StructType([ ... StructField("field2", StringType(), True), ... 
StructField("field3", @@ -1233,6 +1235,7 @@ def jsonRDD(self, rdd, schema=None): Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> srdd3 = sqlCtx.jsonRDD(json, srdd1.schema()) >>> sqlCtx.registerRDDAsTable(srdd3, "table2") >>> srdd4 = sqlCtx.sql( @@ -1243,6 +1246,7 @@ def jsonRDD(self, rdd, schema=None): Row(f1=1, f2=u'row1', f3=Row(field4=11, field5=None), f4=None) Row(f1=2, f2=None, f3=Row(field4=22..., f4=[Row(field7=u'row2')]) Row(f1=None, f2=u'row3', f3=Row(field4=33, field5=[]), f4=None) + >>> schema = StructType([ ... StructField("field2", StringType(), True), ... StructField("field3", From b20171267d610715d5b0a86b474c903e9bc3a1a3 Mon Sep 17 00:00:00 2001 From: Dan Osipov Date: Tue, 16 Sep 2014 13:40:16 -0700 Subject: [PATCH 16/24] [SPARK-787] Add S3 configuration parameters to the EC2 deploy scripts When deploying to AWS, there is additional configuration that is required to read S3 files. EMR creates it automatically, there is no reason that the Spark EC2 script shouldn't. This PR requires a corresponding PR to the mesos/spark-ec2 to be merged, as it gets cloned in the process of setting up machines: https://github.com/mesos/spark-ec2/pull/58 Author: Dan Osipov Closes #1120 from danosipov/s3_credentials and squashes the following commits: 758da8b [Dan Osipov] Modify documentation to include the new parameter 71fab14 [Dan Osipov] Use a parameter --copy-aws-credentials to enable S3 credential deployment 7e0da26 [Dan Osipov] Get AWS credentials out of boto connection instance 39bdf30 [Dan Osipov] Add S3 configuration parameters to the EC2 deploy scripts --- docs/ec2-scripts.md | 2 +- ec2/deploy.generic/root/spark-ec2/ec2-variables.sh | 2 ++ ec2/spark_ec2.py | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md index f5ac6d894e1eb..b2ca6a9b48f32 100644 --- a/docs/ec2-scripts.md +++ b/docs/ec2-scripts.md @@ -156,6 +156,6 @@ If you have a patch or suggestion for one of these limitations, feel free to # Accessing Data in S3 -Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. You will also need to set your Amazon security credentials, either by setting the environment variables `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` before your program or through `SparkContext.hadoopConfiguration`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). +Spark's file interface allows it to process data in Amazon S3 using the same URI formats that are supported for Hadoop. You can specify a path in S3 as input through a URI of the form `s3n:///path`. To provide AWS credentials for S3 access, launch the Spark cluster with the option `--copy-aws-credentials`. Full instructions on S3 access using the Hadoop input libraries can be found on the [Hadoop S3 page](http://wiki.apache.org/hadoop/AmazonS3). In addition to using a single input file, you can also use a directory of files as input by simply giving the path to the directory. 
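As an illustration of the workflow this new option enables, here is a minimal PySpark sketch. The key pair, cluster name, bucket, and path are placeholders, and it assumes the cluster was launched with `--copy-aws-credentials` so that the deploy scripts have propagated `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` into the cluster's Hadoop configuration.

```python
# Assumes the cluster was launched roughly like this (names are placeholders):
#   ./spark-ec2 -k my-key-pair -i my-key-pair.pem --copy-aws-credentials launch my-spark-cluster
from pyspark import SparkContext

sc = SparkContext(appName="S3ReadExample")

# With the AWS credentials propagated into the cluster's Hadoop configuration,
# an s3n:// path can be read directly, without embedding keys in the URI.
lines = sc.textFile("s3n://my-bucket/path/to/input.txt")  # hypothetical bucket and path
print(lines.count())

sc.stop()
```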
diff --git a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh index 3570891be804e..740c267fd9866 100644 --- a/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/ec2/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -30,3 +30,5 @@ export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" +export AWS_ACCESS_KEY_ID="{{aws_access_key_id}}" +export AWS_SECRET_ACCESS_KEY="{{aws_secret_access_key}}" \ No newline at end of file diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 5682e96aa8770..abac71eaca595 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -158,6 +158,9 @@ def parse_args(): parser.add_option( "--additional-security-group", type="string", default="", help="Additional security group to place the machines in") + parser.add_option( + "--copy-aws-credentials", action="store_true", default=False, + help="Add AWS credentials to hadoop configuration to allow Spark to access S3") (opts, args) = parser.parse_args() if len(args) != 2: @@ -714,6 +717,13 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "spark_master_opts": opts.master_opts } + if opts.copy_aws_credentials: + template_vars["aws_access_key_id"] = conn.aws_access_key_id + template_vars["aws_secret_access_key"] = conn.aws_secret_access_key + else: + template_vars["aws_access_key_id"] = "" + template_vars["aws_secret_access_key"] = "" + # Create a temp directory in which we will place all the files to be # deployed after we substitue template parameters in them tmp_dir = tempfile.mkdtemp() From a6e1712f1e9c36deb24c5073aa8edcfc047d76eb Mon Sep 17 00:00:00 2001 From: Evan Chan Date: Tue, 16 Sep 2014 13:46:06 -0700 Subject: [PATCH 17/24] Add a Community Projects page This adds a new page to the docs listing community projects -- those created outside of Apache Spark that are of interest to the community of Spark users. Anybody can add to it just by submitting a PR. There was a discussion thread about alternatives: * Creating a Github organization for Spark projects - we could not find any sponsors for this, and it would be difficult to organize since many folks just create repos in their company organization or personal accounts * Apache has some place for storing community projects, but it was deemed difficult to work with, and again would be some permissions issues -- not everyone could update it. Author: Evan Chan Closes #2219 from velvia/community-projects-page and squashes the following commits: 7316822 [Evan Chan] Point to Spark wiki: supplemental projects page 613b021 [Evan Chan] Add a few more projects a85eaaf [Evan Chan] Add a Community Projects page --- docs/_layouts/global.html | 3 ++- docs/index.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index a53e8a775b71f..627ed37de4a9c 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -111,6 +111,7 @@
                                 <li><a href="building-spark.html">Building Spark</a></li>
                                 <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark">Contributing to Spark</a></li>
+                                <li><a href="https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects">Supplemental Projects</a></li>
@@ -151,7 +152,7 @@

    {{ page.title }}

    MathJax.Hub.Config({ tex2jax: { inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], - displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], processEscapes: true, skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] } diff --git a/docs/index.md b/docs/index.md index e8ebadbd4e427..edd622ec90f64 100644 --- a/docs/index.md +++ b/docs/index.md @@ -107,6 +107,7 @@ options for deployment: * [OpenStack Swift](storage-openstack-swift.html) * [Building Spark](building-spark.html): build Spark using the Maven system * [Contributing to Spark](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) +* [Supplemental Projects](https://cwiki.apache.org/confluence/display/SPARK/Supplemental+Spark+Projects): related third party Spark projects **External Resources:** From 0a7091e689a4c8b1e7b61e9f0873e6557f40d952 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 16 Sep 2014 16:03:20 -0700 Subject: [PATCH 18/24] [SPARK-3555] Fix UISuite race condition The test "jetty selects different port under contention" is flaky. If another process binds to 4040 before the test starts, then the first server we start there will fail, and the subsequent servers we start thereafter may successfully bind to 4040 if it was released between the servers starting. Instead, we should just let Java find a random free port for us and hold onto it for the duration of the test. Author: Andrew Or Closes #2418 from andrewor14/fix-port-contention and squashes the following commits: 0cd4974 [Andrew Or] Stop them servers a7071fe [Andrew Or] Pick random port instead of 4040 --- .../test/scala/org/apache/spark/ui/UISuite.scala | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 48790b59e7fbd..92a21f82f3c21 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -23,7 +23,6 @@ import javax.servlet.http.HttpServletRequest import scala.io.Source import scala.util.{Failure, Success, Try} -import org.eclipse.jetty.server.Server import org.eclipse.jetty.servlet.ServletContextHandler import org.scalatest.FunSuite import org.scalatest.concurrent.Eventually._ @@ -108,14 +107,8 @@ class UISuite extends FunSuite { } test("jetty selects different port under contention") { - val startPort = 4040 - val server = new Server(startPort) - - Try { server.start() } match { - case Success(s) => - case Failure(e) => - // Either case server port is busy hence setup for test complete - } + val server = new ServerSocket(0) + val startPort = server.getLocalPort val serverInfo1 = JettyUtils.startJettyServer( "0.0.0.0", startPort, Seq[ServletContextHandler](), new SparkConf) val serverInfo2 = JettyUtils.startJettyServer( @@ -126,6 +119,9 @@ class UISuite extends FunSuite { assert(boundPort1 != startPort) assert(boundPort2 != startPort) assert(boundPort1 != boundPort2) + serverInfo1.server.stop() + serverInfo2.server.stop() + server.close() } test("jetty binds to port 0 correctly") { From 008a5ed4808d1467b47c1d6fa4d950cc6c4976b7 Mon Sep 17 00:00:00 2001 From: wangfei Date: Tue, 16 Sep 2014 21:57:33 -0700 Subject: [PATCH 19/24] [Minor]ignore all config files in conf Some config files in ```conf``` should ignore, such as conf/fairscheduler.xml conf/hive-log4j.properties conf/metrics.properties ... 
So ignore all ```sh```/```properties```/```conf```/```xml``` files Author: wangfei Closes #2395 from scwf/patch-2 and squashes the following commits: 3dc53f2 [wangfei] duplicate ```conf/*.conf``` 3c2986f [wangfei] ignore all config files --- .gitignore | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 7ec8d45e12c6b..a31bf7e0091f4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,11 +15,10 @@ out/ third_party/libmesos.so third_party/libmesos.dylib conf/java-opts -conf/spark-env.sh -conf/streaming-env.sh -conf/log4j.properties -conf/spark-defaults.conf -conf/hive-site.xml +conf/*.sh +conf/*.properties +conf/*.conf +conf/*.xml docs/_site docs/api target/ @@ -50,7 +49,6 @@ unit-tests.log /lib/ rat-results.txt scalastyle.txt -conf/*.conf scalastyle-output.xml # For Hive From 983609a4dd83e25598455bfce93fa1c1fa9f2c51 Mon Sep 17 00:00:00 2001 From: viper-kun Date: Wed, 17 Sep 2014 00:09:57 -0700 Subject: [PATCH 20/24] [Docs] Correct spark.files.fetchTimeout default value change the value of spark.files.fetchTimeout Author: viper-kun Closes #2406 from viper-kun/master and squashes the following commits: ecb0d46 [viper-kun] [Docs] Correct spark.files.fetchTimeout default value 7cf4c7a [viper-kun] Update configuration.md --- docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index af16489a44281..99faf51c6f3db 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -520,10 +520,10 @@ Apart from these, the following properties are also available, and may be useful
 <tr>
   <td><code>spark.files.fetchTimeout</code></td>
-  <td>false</td>
+  <td>60</td>
   <td>
     Communication timeout to use when fetching files added through SparkContext.addFile() from
-    the driver.
+    the driver, in seconds.
   </td>
 </tr>
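A minimal PySpark sketch of the setting this row documents: the timeout value, file path, and app name below are arbitrary placeholders, and `spark.files.fetchTimeout` is specified in seconds, matching the corrected description above.

```python
from pyspark import SparkConf, SparkContext, SparkFiles

# Allow up to 120 seconds for executors to fetch driver-added files
# (placeholder value; the documented default is 60 seconds).
conf = (SparkConf()
        .setAppName("AddFileExample")
        .set("spark.files.fetchTimeout", "120"))
sc = SparkContext(conf=conf)

sc.addFile("/tmp/lookup.txt")  # hypothetical local file distributed to executors

def count_bytes(_):
    # SparkFiles.get resolves the fetched copy of the file on each executor
    with open(SparkFiles.get("lookup.txt")) as f:
        return len(f.read())

print(sc.parallelize(range(4), 2).map(count_bytes).sum())
sc.stop()
```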